From 978e672c5ecc4385336216ef0fd485b4566af224 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 14 Apr 2025 16:58:17 +0200 Subject: [PATCH 01/49] add h5json package --- hsds/async_lib.py | 5 +- hsds/attr_dn.py | 7 +- hsds/attr_sn.py | 9 +- hsds/chunk_crawl.py | 7 +- hsds/chunk_dn.py | 5 +- hsds/chunk_sn.py | 8 +- hsds/chunklocator.py | 4 +- hsds/ctype_sn.py | 4 +- hsds/datanode_lib.py | 6 +- hsds/dset_lib.py | 6 +- hsds/dset_sn.py | 6 +- hsds/link_dn.py | 1 - hsds/servicenode_lib.py | 3 +- hsds/util/arrayUtil.py | 731 ---------------------------- hsds/util/hdf5dtype.py | 876 ---------------------------------- pyproject.toml | 1 + testall.py | 2 +- tests/integ/attr_test.py | 1 + tests/integ/vlen_test.py | 10 +- tests/unit/array_util_test.py | 12 +- tests/unit/hdf5_dtype_test.py | 717 ---------------------------- 21 files changed, 61 insertions(+), 2360 deletions(-) delete mode 100644 hsds/util/arrayUtil.py delete mode 100644 hsds/util/hdf5dtype.py delete mode 100755 tests/unit/hdf5_dtype_test.py diff --git a/hsds/async_lib.py b/hsds/async_lib.py index e749e8a2..15d67f5f 100755 --- a/hsds/async_lib.py +++ b/hsds/async_lib.py @@ -15,11 +15,12 @@ from aiohttp.client_exceptions import ClientError from aiohttp.web_exceptions import HTTPNotFound, HTTPInternalServerError from aiohttp.web_exceptions import HTTPForbidden +from h5json.hdf5dtype import getItemSize +from h5json.hdf5dtype import createDataType +from h5json.array_util import getNumElements, bytesToArray from .util.idUtil import isValidUuid, isSchema2Id, getS3Key, isS3ObjKey from .util.idUtil import getObjId, isValidChunkId, getCollectionForId from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator -from .util.hdf5dtype import getItemSize, createDataType -from .util.arrayUtil import getNumElements, bytesToArray from .util.dsetUtil import getHyperslabSelection, getFilterOps, getChunkDims, getFilters from .util.dsetUtil import getDatasetLayoutClass, getDatasetLayout, getShapeDims from .util.storUtil import getStorKeys, putStorJSONObj, getStorJSONObj diff --git a/hsds/attr_dn.py b/hsds/attr_dn.py index 456e9854..cb002623 100755 --- a/hsds/attr_dn.py +++ b/hsds/attr_dn.py @@ -19,12 +19,13 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response +from h5json.hdf5dtype import getItemSize, createDataType +from h5json.array_util import arrayToBytes, jsonToArray, decodeData +from h5json.array_util import bytesToArray, bytesArrayToList, getNumElements + from .util.attrUtil import validateAttributeName, isEqualAttr -from .util.hdf5dtype import getItemSize, createDataType from .util.globparser import globmatch from .util.dsetUtil import getShapeDims -from .util.arrayUtil import arrayToBytes, jsonToArray, decodeData -from .util.arrayUtil import bytesToArray, bytesArrayToList, getNumElements from .util.domainUtil import isValidBucketName from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj from . 
import hsds_logger as log diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index b7ecdce4..a735c5c6 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -18,6 +18,11 @@ from aiohttp.web import StreamResponse from json import JSONDecodeError +from h5json.hdf5dtype import validateTypeItem, getBaseTypeJson +from h5json.hdf5dtype import createDataType, getItemSize +from h5json.array_util import jsonToArray, getNumElements, bytesArrayToList +from h5json.array_util import bytesToArray, arrayToBytes, decodeData, encodeData + from .util.httpUtil import getAcceptType, jsonResponse, getHref, getBooleanParam from .util.globparser import globmatch from .util.idUtil import isValidUuid, getRootObjId @@ -25,10 +30,6 @@ from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot from .util.attrUtil import validateAttributeName, getRequestCollectionName -from .util.hdf5dtype import validateTypeItem, getBaseTypeJson -from .util.hdf5dtype import createDataType, getItemSize -from .util.arrayUtil import jsonToArray, getNumElements, bytesArrayToList -from .util.arrayUtil import bytesToArray, arrayToBytes, decodeData, encodeData from .util.dsetUtil import getShapeDims from .servicenode_lib import getDomainJson, getObjectJson, validateAction diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index 847f0933..a153bfe8 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -24,16 +24,17 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.client_exceptions import ClientError +from h5json.hdf5dtype import createDataType +from h5json.array_util import jsonToArray, getNumpyValue +from h5json.array_util import getNumElements, arrayToBytes, bytesToArray + from .util.httpUtil import http_get, http_put, http_post, get_http_client from .util.httpUtil import isUnixDomainUrl from .util.idUtil import getDataNodeUrl, getNodeCount -from .util.hdf5dtype import createDataType from .util.dsetUtil import getSliceQueryParam, getShapeDims from .util.dsetUtil import getSelectionShape, getChunkLayout from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getChunkIdForPartition, getQueryDtype -from .util.arrayUtil import jsonToArray, getNumpyValue -from .util.arrayUtil import getNumElements, arrayToBytes, bytesToArray from . import config from . 
import hsds_logger as log diff --git a/hsds/chunk_dn.py b/hsds/chunk_dn.py index e2671b61..eeeed88d 100644 --- a/hsds/chunk_dn.py +++ b/hsds/chunk_dn.py @@ -20,11 +20,12 @@ from aiohttp.web_exceptions import HTTPNotFound, HTTPServiceUnavailable from aiohttp.web import json_response, StreamResponse +from h5json.hdf5dtype import createDataType, getSubType +from h5json.array_util import bytesToArray, arrayToBytes, getBroadcastShape + from .util.httpUtil import request_read, getContentType -from .util.arrayUtil import bytesToArray, arrayToBytes, getBroadcastShape from .util.idUtil import getS3Key, validateInPartition, isValidUuid from .util.storUtil import isStorObj, deleteStorObj -from .util.hdf5dtype import createDataType, getSubType from .util.dsetUtil import getSelectionList, getChunkLayout, getShapeDims from .util.dsetUtil import getSelectionShape, getChunkInitializer from .util.chunkUtil import getChunkIndex, getDatasetId, chunkQuery diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 68575007..921feaf0 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -25,19 +25,19 @@ from aiohttp.web_exceptions import HTTPConflict, HTTPInternalServerError from aiohttp.web import StreamResponse +from h5json.hdf5dtype import getItemSize, getDtypeItemSize, getSubType, createDataType +from h5json.array_util import bytesArrayToList, jsonToArray, getNumElements, arrayToBytes +from h5json.array_util import bytesToArray, squeezeArray, getBroadcastShape + from .util.httpUtil import getHref, getAcceptType, getContentType from .util.httpUtil import request_read, jsonResponse, isAWSLambda from .util.idUtil import isValidUuid from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain -from .util.hdf5dtype import getItemSize, getDtypeItemSize, getSubType, createDataType from .util.dsetUtil import isNullSpace, isScalarSpace, get_slices, getShapeDims from .util.dsetUtil import isExtensible, getSelectionPagination from .util.dsetUtil import getSelectionShape, getDsetMaxDims, getChunkLayout from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId -from .util.arrayUtil import bytesArrayToList, jsonToArray -from .util.arrayUtil import getNumElements, arrayToBytes, bytesToArray -from .util.arrayUtil import squeezeArray, getBroadcastShape from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .servicenode_lib import getDsetJson, validateAction from .dset_lib import getSelectionData, getParser, extendShape diff --git a/hsds/chunklocator.py b/hsds/chunklocator.py index 6727de9e..2f8bfbaf 100644 --- a/hsds/chunklocator.py +++ b/hsds/chunklocator.py @@ -5,7 +5,9 @@ import numpy as np from . import config from . 
import hsds_logger as log -from .util.arrayUtil import bytesArrayToList, getNumElements + +from h5json.array_util import bytesArrayToList, getNumElements + from .util.dsetUtil import getSelectionList, getSelectionShape diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index 84cdd17f..59faccd1 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -16,6 +16,9 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPGone from json import JSONDecodeError + +from h5json.hdf5dtype import validateTypeItem, getBaseTypeJson + from .util.httpUtil import getHref, respJsonAssemble, getBooleanParam from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid @@ -24,7 +27,6 @@ from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot -from .util.hdf5dtype import validateTypeItem, getBaseTypeJson from .servicenode_lib import getDomainJson, getObjectJson, validateAction from .servicenode_lib import getObjectIdByPath, getPathForObjectId from .servicenode_lib import createObject, createObjectByPath, deleteObject diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py index 08ecc52a..1c6c3b6c 100644 --- a/hsds/datanode_lib.py +++ b/hsds/datanode_lib.py @@ -19,6 +19,10 @@ from aiohttp.web_exceptions import HTTPGone, HTTPInternalServerError from aiohttp.web_exceptions import HTTPNotFound, HTTPForbidden from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPBadRequest + +from h5json.hdf5dtype import createDataType +from h5json.array_util import arrayToBytes, bytesToArray, jsonToArray + from .util.idUtil import validateInPartition, getS3Key, isValidUuid from .util.idUtil import isValidChunkId, getDataNodeUrl, isSchema2Id from .util.idUtil import getRootObjId, isRootObjId @@ -31,8 +35,6 @@ from .util.dsetUtil import getChunkLayout, getFilterOps, getShapeDims from .util.dsetUtil import getChunkInitializer, getSliceQueryParam, getFilters from .util.chunkUtil import getDatasetId, getChunkSelection, getChunkIndex -from .util.arrayUtil import arrayToBytes, bytesToArray, jsonToArray -from .util.hdf5dtype import createDataType from .util.rangegetUtil import ChunkLocation, chunkMunge, getHyperChunkIndex, getHyperChunkFactors from .util.timeUtil import getNow from . 
import config diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 1fe89b3e..5b729afb 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -16,7 +16,10 @@ from aiohttp.client_exceptions import ClientError from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict, HTTPInternalServerError -from .util.arrayUtil import getNumpyValue + +from h5json.hdf5dtype import createDataType, getItemSize +from h5json.array_util import getNumpyValue + from .util.boolparser import BooleanParser from .util.dsetUtil import isNullSpace, getDatasetLayout, getDatasetLayoutClass, get_slices from .util.dsetUtil import getChunkLayout, getSelectionShape, getShapeDims @@ -24,7 +27,6 @@ from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getQueryDtype, get_chunktable_dims -from .util.hdf5dtype import createDataType, getItemSize from .util.httpUtil import http_delete, http_put from .util.idUtil import getDataNodeUrl, isSchema2Id, getS3Key, getObjId from .util.rangegetUtil import getHyperChunkFactors diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 38e1156a..721970fb 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -18,11 +18,13 @@ from json import JSONDecodeError from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound +from h5json.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson, getItemSize +from h5json.array_util import getNumElements, getNumpyValue + from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse, getBooleanParam from .util.idUtil import isValidUuid, isSchema2Id from .util.dsetUtil import getPreviewQuery, getFilterItem, getShapeDims -from .util.arrayUtil import getNumElements, getNumpyValue from .util.chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk from .util.chunkUtil import getContiguousLayout from .util.authUtil import getUserPasswordFromRequest, aclCheck @@ -30,8 +32,6 @@ from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot from .util.storUtil import getSupportedFilters -from .util.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson -from .util.hdf5dtype import getItemSize from .util.linkUtil import validateLinkName from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo diff --git a/hsds/link_dn.py b/hsds/link_dn.py index f7ec5956..e53984ed 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -378,7 +378,6 @@ async def PUT_Links(request): if new_links: # update the group lastModified group_json["lastModified"] = create_time - log.debug(f"tbd: group_json: {group_json}") # write back to S3, save to metadata cache await save_metadata_obj(app, group_id, group_json, bucket=bucket) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index c8c84f75..3d65e619 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -21,8 +21,9 @@ from aiohttp.client_exceptions import ClientOSError, ClientError from aiohttp import ClientResponseError +from h5json.array_util import encodeData + from .util.authUtil import getAclKeys -from .util.arrayUtil import encodeData from .util.idUtil import getDataNodeUrl, getCollectionForId, createObjId, getRootObjId from .util.idUtil import isSchema2Id, getS3Key, isValidUuid from .util.linkUtil import h5Join, validateLinkName, getLinkClass diff --git 
a/hsds/util/arrayUtil.py b/hsds/util/arrayUtil.py deleted file mode 100644 index 67c847c3..00000000 --- a/hsds/util/arrayUtil.py +++ /dev/null @@ -1,731 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## - -import math -import base64 -import binascii -import numpy as np - -MAX_VLEN_ELEMENT = 1_000_000 # restrict largest vlen element to one million - - -def bytesArrayToList(data): - """ - Convert list that may contain bytes type elements to list of string elements - - TBD: Need to deal with non-string byte data (hexencode?) - """ - if type(data) in (bytes, str): - is_list = False - elif isinstance(data, (np.ndarray, np.generic)): - if len(data.shape) == 0: - is_list = False - data = data.tolist() # tolist will return a scalar in this case - if type(data) in (list, tuple): - is_list = True - else: - is_list = False - else: - is_list = True - elif type(data) in (list, tuple): - is_list = True - else: - is_list = False - - if is_list: - out = [] - for item in data: - try: - rec_item = bytesArrayToList(item) # recursive call - out.append(rec_item) - except ValueError as err: - raise err - elif type(data) is bytes: - try: - out = data.decode("utf-8") - except UnicodeDecodeError as err: - raise ValueError(err) - else: - out = data - - return out - - -def toTuple(rank, data): - """ - Convert a list to a tuple, recursively. - Example. [[1,2],[3,4]] -> ((1,2),(3,4)) - """ - if type(data) in (list, tuple): - if rank > 0: - return list(toTuple(rank - 1, x) for x in data) - else: - return tuple(toTuple(rank - 1, x) for x in data) - else: - if isinstance(data, str): - data = data.encode("utf8") - return data - - -def getArraySize(arr): - """ - Get size in bytes of a numpy array. - """ - nbytes = arr.dtype.itemsize - for n in arr.shape: - nbytes *= n - return nbytes - - -def getNumElements(dims): - """ - Get num elements defined by a shape - """ - num_elements = 0 - if isinstance(dims, int): - num_elements = dims - elif isinstance(dims, (list, tuple)): - num_elements = 1 - for dim in dims: - num_elements *= dim - else: - raise ValueError("Unexpected argument") - return num_elements - - -def isVlen(dt): - """ - Return True if the type contains variable length elements - """ - is_vlen = False - if len(dt) > 1: - names = dt.names - for name in names: - if isVlen(dt[name]): - is_vlen = True - break - else: - if dt.metadata and "vlen" in dt.metadata: - is_vlen = True - return is_vlen - - -def jsonToArray(data_shape, data_dtype, data_json): - """ - Return numpy array from the given json array. 
- """ - def fillVlenArray(rank, data, arr, index): - for i in range(len(data)): - if rank > 1: - index = fillVlenArray(rank - 1, data[i], arr, index) - else: - arr[index] = data[i] - index += 1 - return index - - if data_json is None: - return np.array([]).astype(data_dtype) - - if isinstance(data_json, (list, tuple)): - if None in data_json: - return np.array([]).astype(data_dtype) - - # need some special conversion for compound types -- - # each element must be a tuple, but the JSON decoder - # gives us a list instead. - if len(data_dtype) > 1 and not isinstance(data_json, (list, tuple)): - raise TypeError("expected list data for compound data type") - npoints = getNumElements(data_shape) - np_shape_rank = len(data_shape) - - if type(data_json) in (list, tuple): - converted_data = [] - if npoints == 1 and len(data_json) == len(data_dtype): - converted_data.append(toTuple(0, data_json)) - else: - converted_data = toTuple(np_shape_rank, data_json) - data_json = converted_data - else: - if isinstance(data_json, str): - data_json = data_json.encode("utf8") - data_json = [data_json,] # listify - - if isVlen(data_dtype): - arr = np.zeros((npoints,), dtype=data_dtype) - fillVlenArray(np_shape_rank, data_json, arr, 0) - else: - try: - arr = np.array(data_json, dtype=data_dtype) - except UnicodeEncodeError as ude: - msg = "Unable to encode data" - raise ValueError(msg) from ude - # raise an exception of the array shape doesn't match the selection shape - # allow if the array is a scalar and the selection shape is one element, - # numpy is ok with this - if arr.size != npoints: - msg = "Input data doesn't match selection number of elements" - msg += f" Expected {npoints}, but received: {arr.size}" - raise ValueError(msg) - if arr.shape != data_shape: - arr = arr.reshape(data_shape) # reshape to match selection - - return arr - - -def getElementSize(e, dt): - """ - Get number of byte needed to given element as a bytestream - """ - # print(f"getElementSize - e: {e} dt: {dt} metadata: {dt.metadata}") - if len(dt) > 1: - count = 0 - for name in dt.names: - field_dt = dt[name] - field_val = e[name] - count += getElementSize(field_val, field_dt) - elif not dt.metadata or "vlen" not in dt.metadata: - count = dt.itemsize # fixed size element - else: - # variable length element - vlen = dt.metadata["vlen"] - if isinstance(e, int): - if e == 0: - count = 4 # non-initialized element - else: - raise ValueError("Unexpected value: {}".format(e)) - elif isinstance(e, bytes): - count = len(e) + 4 - elif isinstance(e, str): - count = len(e.encode("utf-8")) + 4 - elif isinstance(e, np.ndarray): - nElements = math.prod(e.shape) - if e.dtype.kind != "O": - count = e.dtype.itemsize * nElements - else: - arr1d = e.reshape((nElements,)) - count = 0 - for item in arr1d: - count += getElementSize(item, dt) - count += 4 # byte count - elif isinstance(e, list) or isinstance(e, tuple): - if not e: - # empty list, just add byte count - count = 4 - else: - # not sure how to deal with this - count = len(e) * vlen.itemsize + 4 # +4 for byte count - else: - raise TypeError("unexpected type: {}".format(type(e))) - return count - - -def getByteArraySize(arr): - """ - Get number of bytes needed to store given numpy array as a bytestream - """ - if not isVlen(arr.dtype): - return arr.itemsize * math.prod(arr.shape) - nElements = math.prod(arr.shape) - # reshape to 1d for easier iteration - arr1d = arr.reshape((nElements,)) - dt = arr1d.dtype - count = 0 - for e in arr1d: - count += getElementSize(e, dt) - return count - - -def 
copyBuffer(src, des, offset): - """ - Copy to buffer at given offset - """ - # print(f"copyBuffer - src: {src} offset: {offset}") - # TBD: just do: des[offset:] = src[:] ? - for i in range(len(src)): - des[i + offset] = src[i] - - # print("returning:", offset + len(src)) - return offset + len(src) - - -def copyElement(e, dt, buffer, offset): - """ - Copy element to bytearray - """ - # print(f"copyElement - dt: {dt} offset: {offset}") - if len(dt) > 1: - for name in dt.names: - field_dt = dt[name] - field_val = e[name] - offset = copyElement(field_val, field_dt, buffer, offset) - elif not dt.metadata or "vlen" not in dt.metadata: - # print(f"e vlen: {e} type: {type(e)} itemsize: {dt.itemsize}") - e_buf = e.tobytes() - # print("tobytes:", e_buf) - if len(e_buf) < dt.itemsize: - # extend the buffer for fixed size strings - # print("extending buffer") - e_buf_ex = bytearray(dt.itemsize) - for i in range(len(e_buf)): - e_buf_ex[i] = e_buf[i] - e_buf = bytes(e_buf_ex) - - # print("length:", len(e_buf)) - offset = copyBuffer(e_buf, buffer, offset) - else: - # variable length element - vlen = dt.metadata["vlen"] - # print("copyBuffer vlen:", vlen) - if isinstance(e, int): - # print("copyBuffer int") - if e == 0: - # write 4-byte integer 0 to buffer - offset = copyBuffer(b"\x00\x00\x00\x00", buffer, offset) - else: - raise ValueError("Unexpected value: {}".format(e)) - elif isinstance(e, bytes): - # print("copyBuffer bytes") - count = np.int32(len(e)) - if count > MAX_VLEN_ELEMENT: - raise ValueError("vlen element too large") - offset = copyBuffer(count.tobytes(), buffer, offset) - offset = copyBuffer(e, buffer, offset) - elif isinstance(e, str): - # print("copyBuffer, str") - text = e.encode("utf-8") - count = np.int32(len(text)) - if count > MAX_VLEN_ELEMENT: - raise ValueError("vlen element too large") - offset = copyBuffer(count.tobytes(), buffer, offset) - offset = copyBuffer(text, buffer, offset) - - elif isinstance(e, np.ndarray): - nElements = math.prod(e.shape) - # print("copyBuffer ndarray, nElements:", nElements) - - if e.dtype.kind != "O": - count = np.int32(e.dtype.itemsize * nElements) - # print("copyBuffeer got vlen count:", count) - # print("copyBuffer e:", e) - if count > MAX_VLEN_ELEMENT: - raise ValueError("vlen element too large") - offset = copyBuffer(count.tobytes(), buffer, offset) - # print("copyBuffer write new count, offset:", offset) - offset = copyBuffer(e.tobytes(), buffer, offset) - # print("copyBuffer write data, offset:", offset) - else: - arr1d = e.reshape((nElements,)) - for item in arr1d: - offset = copyElement(item, dt, buffer, offset) - - elif isinstance(e, list) or isinstance(e, tuple): - # print("cooyBuffer list/tuple vlen:", vlen, "e:", e) - count = np.int32(len(e) * vlen.itemsize) - offset = copyBuffer(count.tobytes(), buffer, offset) - if isinstance(e, np.ndarray): - arr = e - else: - arr = np.asarray(e, dtype=vlen) - offset = copyBuffer(arr.tobytes(), buffer, offset) - - else: - raise TypeError("unexpected type: {}".format(type(e))) - # print("buffer: {}".format(buffer)) - return offset - - -def getElementCount(buffer, offset=0): - """ - Get the count value from persisted vlen array - """ - - n = offset - m = offset + 4 - count_bytes = bytes(buffer[n:m]) - - try: - count = int(np.frombuffer(count_bytes, dtype=" MAX_VLEN_ELEMENT: - # expect variable length element to be between 0 and 1mb - raise ValueError("varlen element size expected to be less than 1MB") - return count - - -def readElement(buffer, offset, arr, index, dt): - """ - Read a single element 
from buffer into array. - - Parameters: - buffer (bytearray): Byte array to read an element from. - offset (int): Starting offset in the buffer. - arr (numpy.ndarray): Array to store the element. - index (int): Index in 'arr' at which to store the element. - dt (numpy.dtype): Numpy datatype of the element. - - Note: If the provided datatype is a variable-length sequence, - this function will read the byte count from the first 4 bytes - of the buffer, and then read the entire sequence. - - Returns: - int: The updated offset value after reading the element. - """ - if len(dt) > 1: - e = arr[index] - for name in dt.names: - field_dt = dt[name] - offset = readElement(buffer, offset, e, name, field_dt) - elif not dt.metadata or "vlen" not in dt.metadata: - count = dt.itemsize - n = offset - m = offset + count - e_buffer = buffer[n:m] - offset += count - try: - e = np.frombuffer(bytes(e_buffer), dtype=dt) - arr[index] = e[0] - except ValueError: - print(f"ERROR: ValueError setting {e_buffer} and dtype: {dt}") - raise - else: - # variable length element - vlenBaseType = dt.metadata["vlen"] - e = arr[index] - - if isinstance(e, np.ndarray): - nelements = math.prod(dt.shape) - e.reshape((nelements,)) - for i in range(nelements): - offset = readElement(buffer, offset, e, i, dt) - e.reshape(dt.shape) - else: - # total number of bytes in the vlen sequence/variable-length string - count = getElementCount(buffer, offset=offset) - offset += 4 - n = offset - m = offset + count - if count > 0: - e_buffer = buffer[n:m] - offset += count - - if vlenBaseType is bytes: - arr[index] = bytes(e_buffer) - elif vlenBaseType is str: - s = e_buffer.decode("utf-8") - arr[index] = s - else: - try: - e = np.frombuffer(bytes(e_buffer), dtype=vlenBaseType) - except ValueError: - msg = f"Failed to parse vlen data: {e_buffer} with dtype: {vlenBaseType}" - raise ValueError(msg) - arr[index] = e - return offset - - -def encodeData(data, encoding="base64"): - """ Encode given data """ - if encoding != "base64": - raise ValueError("only base64 encoding is supported") - try: - if isinstance(data, str): - data = data.encode("utf8") - except UnicodeEncodeError: - raise ValueError("can not encode string value") - if not isinstance(data, bytes): - msg = "Expected str or bytes type to encodeData, " - msg += f"but got: {type(data)}" - raise TypeError(msg) - try: - encoded_data = base64.b64encode(data) - except Exception as e: - # TBD: what exceptions can be raised? 
- raise ValueError(f"Unable to encode: {e}") - return encoded_data - - -def decodeData(data, encoding="base64"): - if encoding != "base64": - raise ValueError("only base64 decoding is supported") - try: - decoded_data = base64.b64decode(data) - except Exception as e: - # TBD: catch actual exception - raise ValueError(f"Unable to decode: {e}") - return decoded_data - - -def arrayToBytes(arr, encoding=None): - """ - Return byte representation of numpy array - """ - if isVlen(arr.dtype): - nSize = getByteArraySize(arr) - buffer = bytearray(nSize) - offset = 0 - nElements = math.prod(arr.shape) - arr1d = arr.reshape((nElements,)) - for e in arr1d: - # print("arrayToBytes:", e) - offset = copyElement(e, arr1d.dtype, buffer, offset) - data = bytes(buffer) - else: - # fixed length type - data = arr.tobytes() - - if encoding: - data = encodeData(data) - return data - - -def bytesToArray(data, dt, shape, encoding=None): - """ - Create numpy array based on byte representation - """ - if encoding: - # decode the data - # will raise ValueError if non-decodeable - data = decodeData(data) - if not isVlen(dt): - # regular numpy from string - arr = np.frombuffer(data, dtype=dt) - else: - nelements = getNumElements(shape) - - arr = np.zeros((nelements,), dtype=dt) - offset = 0 - for index in range(nelements): - offset = readElement(data, offset, arr, index, dt) - if shape is not None: - arr = arr.reshape(shape) - # check that we can update the array if needed - # Note: this seems to have been required starting with numpuy v 1.17 - # Setting the flag directly is not recommended. - # cf: https://github.com/numpy/numpy/issues/9440 - - if not arr.flags["WRITEABLE"]: - arr_copy = arr.copy() - arr = arr_copy - - return arr - - -def getNumpyValue(value, dt=None, encoding=None): - """ - Return value as numpy type for given dtype and encoding - Encoding is expected to be one of None or "base64" - """ - # create a scalar numpy array - arr = np.zeros((), dtype=dt) - - if encoding and not isinstance(value, str): - msg = "Expected value to be string to use encoding" - raise ValueError(msg) - - if encoding == "base64": - try: - data = base64.decodebytes(value.encode("utf-8")) - except binascii.Error: - msg = "Unable to decode base64 string: {value}" - # log.warn(msg) - raise ValueError(msg) - arr = bytesToArray(data, dt, dt.shape) - else: - if isinstance(value, list): - # convert to tuple - value = tuple(value) - elif dt.kind == "f" and isinstance(value, str) and value == "nan": - value = np.nan - else: - # use as is - pass - arr = np.asarray(value, dtype=dt.base) - return arr[()] - - -def squeezeArray(data): - """ - Reduce dimensions by removing any 1-extent dimensions. 
- Just return input if no 1-extent dimensions - - Note: only works with ndarrays (for now at least) - """ - if not isinstance(data, np.ndarray): - raise TypeError("expected ndarray") - if len(data.shape) <= 1: - return data - can_reduce = True - for extent in data.shape: - if extent == 1: - can_reduce = True - break - if can_reduce: - data = data.squeeze() - return data - - -class IndexIterator(object): - """ - Class to iterate through list of chunks of a given dataset - """ - - def __init__(self, shape, sel=None): - self._shape = shape - self._rank = len(self._shape) - self._stop = False - - if self._rank < 1: - raise ValueError("IndexIterator can not be used on arrays of zero rank") - - if sel is None: - # select over entire dataset - slices = [] - for dim in range(self._rank): - slices.append(slice(0, self._shape[dim])) - self._sel = tuple(slices) - else: - if isinstance(sel, slice): - self._sel = (sel,) - else: - self._sel = sel - if len(self._sel) != self._rank: - raise ValueError("Invalid selection - selection region must have same rank as shape") - self._index = [] - for dim in range(self._rank): - s = self._sel[dim] - if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start: - raise ValueError( - "Invalid selection - selection region must be within dataset space" - ) - self._index.append(s.start) - - def __iter__(self): - return self - - def __next__(self): - if self._stop: - raise StopIteration() - # bump up the last index and carry forward if we run outside the selection - dim = self._rank - 1 - ret_index = self._index.copy() - while True: - s = self._sel[dim] - if s.step: - step = s.step - else: - step = 1 - self._index[dim] += step - - if self._index[dim] < s.stop: - # we still have room to extend along this dimensions - break - - # reset to the start and continue iterating with higher dimension - self._index[dim] = s.start - dim -= 1 - if dim < 0: - # ran past last index, stop iteration on next run - self._stop = True - - return tuple(ret_index) - - -def ndarray_compare(arr1, arr2): - # compare two numpy arrays. - # return true if the same (exclusive of null vs. empty array) - # false otherwise - # TBD: this is slow for multi-megabyte vlen arrays, needs to be optimized - if not isinstance(arr1, np.ndarray) and not isinstance(arr2, np.ndarray): - if not isinstance(arr1, np.void) and not isinstance(arr2, np.void): - return arr1 == arr2 - if isinstance(arr1, np.void) and not isinstance(arr2, np.void): - if arr1.size == 0 and not arr2: - return True - else: - return False - if not isinstance(arr1, np.void) and isinstance(arr2, np.void): - if not arr1 and arr2.size == 0: - return True - else: - return False - # both np.voids - if arr1.size != arr2.size: - return False - - if len(arr1) != len(arr2): - return False - - for i in range(len(arr1)): - if not ndarray_compare(arr1[i], arr2[i]): - return False - return True - - if isinstance(arr1, np.ndarray) and not isinstance(arr2, np.ndarray): - # same only if arr1 is empty and arr2 is 0 - if arr1.size == 0 and not arr2: - return True - else: - return False - if not isinstance(arr1, np.ndarray) and isinstance(arr2, np.ndarray): - # same only if arr1 is empty and arr2 size is 0 - if not arr1 and arr2.size == 0: - return True - else: - return False - - # two ndarrays... 
- if arr1.shape != arr2.shape: - return False - if arr2.dtype != arr2.dtype: - return False - - if isVlen(arr1.dtype): - # need to compare element by element - - nElements = np.prod(arr1.shape) - arr1 = arr1.reshape((nElements,)) - arr2 = arr2.reshape((nElements,)) - for i in range(nElements): - if not ndarray_compare(arr1[i], arr2[i]): - return False - return True - else: - # can just us np array_compare - return np.array_equal(arr1, arr2) - - -def getBroadcastShape(mshape, element_count): - # if element_count is less than the number of elements - # defined by mshape, return a numpy compatible broadcast - # shape that contains element_count elements. - # If non exists return None - - if np.prod(mshape) == element_count: - return None - - if element_count == 1: - # this always works - return [1,] - - bcshape = [] - rank = len(mshape) - for n in range(rank - 1): - bcshape.insert(0, mshape[rank - n - 1]) - if element_count == np.prod(bcshape): - return bcshape # have a match - - return None # no broadcast found diff --git a/hsds/util/hdf5dtype.py b/hsds/util/hdf5dtype.py deleted file mode 100644 index 3d7d1d2f..00000000 --- a/hsds/util/hdf5dtype.py +++ /dev/null @@ -1,876 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## - -import weakref -import numpy as np - - -class Reference: - """ - Represents an HDF5 object reference - """ - - @property - def id(self): - """Low-level identifier appropriate for this object""" - return self._id - - @property - def objref(self): - """Weak reference to object""" - return self._objref # return weak ref to ref'd object - - def __init__(self, bind): - """Create a new reference by binding to - a group/dataset/committed type - """ - self._id = bind._id - self._objref = weakref.ref(bind) - - def __repr__(self): - # TBD: this is not consistent with hsds or h5py... 
- if not isinstance(self._id.id, str): - raise TypeError("Expected string id") - item = None - - collection_type = self._id.collection_type - item = f"{collection_type}/{self._id.id}" - return item - - def tolist(self): - if type(self._id.id) is not str: - raise TypeError("Expected string id") - if self._id.objtype_code == "d": - return [ - ("datasets/" + self._id.id), - ] - elif self._id.objtype_code == "g": - return [ - ("groups/" + self._id.id), - ] - elif self._id.objtype_code == "t": - return [ - ("datatypes/" + self._id.id), - ] - else: - raise TypeError("Unexpected id type") - - -class RegionReference: - """ - Represents an HDF5 region reference - """ - - @property - def id(self): - """Low-level identifier appropriate for this object""" - return self._id - - @property - def objref(self): - """Weak reference to object""" - return self._objref # return weak ref to ref'd object - - def __init__(self, bind): - """Create a new reference by binding to - a group/dataset/committed type - """ - self._id = bind._id - self._objref = weakref.ref(bind) - - def __repr__(self): - return "" - - -def special_dtype(**kwds): - """Create a new h5py "special" type. Only one keyword may be given. - - Legal keywords are: - - vlen = basetype - Base type for HDF5 variable-length datatype. This can be Python - str type or instance of np.dtype. - Example: special_dtype( vlen=str ) - - enum = (basetype, values_dict) - Create a NumPy representation of an HDF5 enumerated type. Provide - a 2-tuple containing an (integer) base dtype and a dict mapping - string names to integer values. - - ref = Reference | RegionReference - Create a NumPy representation of an HDF5 object or region reference - type.""" - - if len(kwds) != 1: - raise TypeError("Exactly one keyword may be provided") - - name, val = kwds.popitem() - - if name == "vlen": - - return np.dtype("O", metadata={"vlen": val}) - - if name == "enum": - - try: - dt, enum_vals = val - except TypeError: - msg = "Enums must be created from a 2-tuple " - msg += "(basetype, values_dict)" - raise TypeError(msg) - - dt = np.dtype(dt) - if dt.kind not in "iu": - raise TypeError("Only integer types can be used as enums") - - return np.dtype(dt, metadata={"enum": enum_vals}) - - if name == "ref": - dt = None - if val is Reference: - dt = np.dtype("S48", metadata={"ref": Reference}) - elif val is RegionReference: - dt = np.dtype("S48", metadata={"ref": RegionReference}) - else: - raise ValueError("Ref class must be Reference or RegionReference") - - return dt - - raise TypeError(f'Unknown special type "{name}"') - - -def check_dtype(**kwds): - """Check a dtype for h5py special type "hint" information. Only one - keyword may be given. - - vlen = dtype - If the dtype represents an HDF5 vlen, returns the Python base class. - Currently only builting string vlens (str) are supported. Returns - None if the dtype does not represent an HDF5 vlen. - - enum = dtype - If the dtype represents an HDF5 enumerated type, returns the dictionary - mapping string names to integer values. Returns None if the dtype does - not represent an HDF5 enumerated type. - - ref = dtype - If the dtype represents an HDF5 reference type, returns the reference - class (either Reference or RegionReference). Returns None if the dtype - does not represent an HDF5 reference type. 
- """ - - if len(kwds) != 1: - raise TypeError("Exactly one keyword may be provided") - - name, dt = kwds.popitem() - - if name not in ("vlen", "enum", "ref"): - raise TypeError('Unknown special type "%s"' % name) - - try: - return dt.metadata[name] - except TypeError: - return None - except KeyError: - return None - - -def getTypeResponse(typeItem): - """ - Convert the given type item to a predefined type string for - predefined integer and floating point types ("H5T_STD_I64LE", et. al). - For compound types, recursively iterate through the typeItem and do - same conversion for fields of the compound type.""" - response = None - if "uuid" in typeItem: - # committed type, just return uuid - response = "datatypes/" + typeItem["uuid"] - elif typeItem["class"] in ("H5T_INTEGER", "H5T_FLOAT"): - # just return the class and base for pre-defined types - response = {} - response["class"] = typeItem["class"] - response["base"] = typeItem["base"] - elif typeItem["class"] == "H5T_OPAQUE": - response = {} - response["class"] = "H5T_OPAQUE" - response["size"] = typeItem["size"] - elif typeItem["class"] == "H5T_REFERENCE": - response = {} - response["class"] = "H5T_REFERENCE" - response["base"] = typeItem["base"] - elif typeItem["class"] == "H5T_COMPOUND": - response = {} - response["class"] = "H5T_COMPOUND" - fieldList = [] - for field in typeItem["fields"]: - fieldItem = {} - fieldItem["name"] = field["name"] - fieldItem["type"] = getTypeResponse(field["type"]) # recurse call - fieldList.append(fieldItem) - response["fields"] = fieldList - else: - response = {} # otherwise, return full type - for k in typeItem.keys(): - if k == "base": - if isinstance(typeItem[k], dict): - response[k] = getTypeResponse(typeItem[k]) # recurse call - else: - response[k] = typeItem[k] # predefined type - elif k not in ("size", "base_size"): - response[k] = typeItem[k] - return response - - -def getTypeItem(dt, metadata=None): - """ - Return type info. 
- For primitive types, return string with typename - For compound types return array of dictionary items - """ - predefined_int_types = { - "int8": "H5T_STD_I8", - "uint8": "H5T_STD_U8", - "int16": "H5T_STD_I16", - "uint16": "H5T_STD_U16", - "int32": "H5T_STD_I32", - "uint32": "H5T_STD_U32", - "int64": "H5T_STD_I64", - "uint64": "H5T_STD_U64", - } - predefined_float_types = { - "float16": "H5T_IEEE_F16", - "float32": "H5T_IEEE_F32", - "float64": "H5T_IEEE_F64", - } - # print(">getTypeItem:", dt.str) - if not metadata and dt.metadata: - metadata = dt.metadata - # if metadata: - # print("> metadata:", metadata) - # if dt.shape: - # print("> shape:", dt.shape) - # if len(dt) > 1: - # print("> len:", len(dt)) - - type_info = {} - if len(dt) > 1: - # compound type - names = dt.names - type_info["class"] = "H5T_COMPOUND" - fields = [] - for name in names: - field = {"name": name} - field["type"] = getTypeItem(dt[name]) - fields.append(field) - type_info["fields"] = fields - elif dt.shape: - # array type - if dt.base == dt: - raise TypeError("Expected base type to be different than parent") - # array type - type_info["dims"] = dt.shape - type_info["class"] = "H5T_ARRAY" - # print("> array type, metadata:", metadata) - type_info["base"] = getTypeItem(dt.base, metadata=metadata) - elif dt.kind == "O": - # vlen string or data - # - # check for h5py variable length extension - - if metadata and "vlen" in metadata: - vlen_check = metadata["vlen"] - if vlen_check is not None and not isinstance(vlen_check, np.dtype): - vlen_check = np.dtype(vlen_check) - - if metadata and "ref" in metadata: - ref_check = metadata["ref"] - else: - ref_check = check_dtype(ref=dt.base) - if vlen_check == bytes: - type_info["class"] = "H5T_STRING" - type_info["length"] = "H5T_VARIABLE" - type_info["charSet"] = "H5T_CSET_ASCII" - type_info["strPad"] = "H5T_STR_NULLTERM" - elif vlen_check == str: - type_info["class"] = "H5T_STRING" - type_info["length"] = "H5T_VARIABLE" - type_info["charSet"] = "H5T_CSET_UTF8" - type_info["strPad"] = "H5T_STR_NULLTERM" - elif isinstance(vlen_check, np.dtype): - # vlen data - type_info["class"] = "H5T_VLEN" - type_info["size"] = "H5T_VARIABLE" - type_info["base"] = getTypeItem(vlen_check) - elif vlen_check is not None: - # unknown vlen type - raise TypeError("Unknown h5py vlen type: " + str(vlen_check)) - elif ref_check is not None: - # a reference type - type_info["class"] = "H5T_REFERENCE" - - if ref_check is Reference: - type_info["base"] = "H5T_STD_REF_OBJ" # objref - elif ref_check is RegionReference: - type_info["base"] = "H5T_STD_REF_DSETREG" # region ref - else: - raise TypeError("unexpected reference type") - else: - raise TypeError("unknown object type") - elif dt.kind == "V": - # void type - type_info["class"] = "H5T_OPAQUE" - type_info["size"] = dt.itemsize - type_info["tag"] = "" # todo - determine tag - elif dt.base.kind == "S": - # check for object reference - ref_check = check_dtype(ref=dt.base) - if ref_check is not None: - # a reference type - type_info["class"] = "H5T_REFERENCE" - - if ref_check is Reference: - type_info["base"] = "H5T_STD_REF_OBJ" # objref - elif ref_check is RegionReference: - type_info["base"] = "H5T_STD_REF_DSETREG" # region ref - else: - raise TypeError("unexpected reference type") - else: - # Fixed length string type - type_info["class"] = "H5T_STRING" - type_info["length"] = dt.itemsize - type_info["charSet"] = "H5T_CSET_ASCII" - type_info["strPad"] = "H5T_STR_NULLPAD" - elif dt.base.kind == "U": - # Fixed length unicode type - ref_check = 
check_dtype(ref=dt.base) - if ref_check is not None: - raise TypeError("unexpected reference type") - - # Fixed length string type with unicode support - type_info["class"] = "H5T_STRING" - - # this can be problematic if the encoding of the string is not valid, - # or reqires too many bytes. Use variable length strings to handle all - # UTF8 strings correctly - type_info["charSet"] = "H5T_CSET_UTF8" - # convert from UTF32 length to a fixed length - type_info["length"] = dt.itemsize - type_info["strPad"] = "H5T_STR_NULLPAD" - - elif dt.kind == "b": - # boolean type - h5py stores as enum - # assume LE unless the numpy byteorder is '>' - byteorder = "LE" - if dt.base.byteorder == ">": - byteorder = "BE" - # this mapping is an h5py convention for boolean support - mapping = {"FALSE": 0, "TRUE": 1} - type_info["class"] = "H5T_ENUM" - type_info["mapping"] = mapping - base_info = {"class": "H5T_INTEGER"} - base_info["base"] = "H5T_STD_I8" + byteorder - type_info["base"] = base_info - elif dt.kind == "f": - # floating point type - type_info["class"] = "H5T_FLOAT" - byteorder = "LE" - if dt.byteorder == ">": - byteorder = "BE" - if dt.name in predefined_float_types: - # maps to one of the HDF5 predefined types - float_type = predefined_float_types[dt.base.name] - type_info["base"] = float_type + byteorder - else: - raise TypeError("Unexpected floating point type: " + dt.name) - elif dt.kind == "i" or dt.kind == "u": - # integer type - - # assume LE unless the numpy byteorder is '>' - byteorder = "LE" - if dt.base.byteorder == ">": - byteorder = "BE" - - # numpy integer type - but check to see if this is the hypy - # enum extension - if metadata and "enum" in metadata: - # yes, this is an enum! - mapping = metadata["enum"] - type_info["class"] = "H5T_ENUM" - type_info["mapping"] = mapping - if dt.name not in predefined_int_types: - raise TypeError("Unexpected integer type: " + dt.name) - # maps to one of the HDF5 predefined types - base_info = {"class": "H5T_INTEGER"} - base_info["base"] = predefined_int_types[dt.name] + byteorder - type_info["base"] = base_info - else: - type_info["class"] = "H5T_INTEGER" - base_name = dt.name - - if dt.name not in predefined_int_types: - raise TypeError("Unexpected integer type: " + dt.name) - - type_info["base"] = predefined_int_types[base_name] + byteorder - - else: - # unexpected kind - raise TypeError(f"unexpected dtype kind: {dt.kind}") - - return type_info - - -def getItemSize(typeItem): - """ - Get size of an item in bytes. - For variable length types (e.g. 
variable length strings), - return the string "H5T_VARIABLE" - """ - # handle the case where we are passed a primitive type first - if isinstance(typeItem, str) or isinstance(typeItem, bytes): - for type_prefix in ("H5T_STD_I", "H5T_STD_U", "H5T_IEEE_F"): - if typeItem.startswith(type_prefix): - nlen = len(type_prefix) - num_bits = typeItem[nlen:] - if num_bits[-2:] in ("LE", "BE"): - num_bits = num_bits[:-2] - try: - return int(num_bits) // 8 - except ValueError: - raise TypeError("Invalid Type") - # none of the expect primative types mathched - raise TypeError("Invalid Type") - if not isinstance(typeItem, dict): - raise TypeError("invalid type") - - item_size = 0 - if "class" not in typeItem: - raise KeyError("'class' not provided") - typeClass = typeItem["class"] - - if typeClass == "H5T_INTEGER": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_FLOAT": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_STRING": - if "length" not in typeItem: - raise KeyError("'length' not provided") - item_size = typeItem["length"] - - elif typeClass == "H5T_VLEN": - item_size = "H5T_VARIABLE" - elif typeClass == "H5T_OPAQUE": - if "size" not in typeItem: - raise KeyError("'size' not provided") - item_size = int(typeItem["size"]) - - elif typeClass == "H5T_ARRAY": - if "dims" not in typeItem: - raise KeyError("'dims' must be provided for array types") - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_ENUM": - if "base" not in typeItem: - raise KeyError("'base' must be provided for enum types") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_REFERENCE": - if "length" in typeItem: - item_size = typeItem["length"] - elif "base" in typeItem and typeItem["base"] == "H5T_STD_REF_OBJ": - # obj ref values are in the form: "groups/" or - # "datasets/" or "datatypes/" - item_size = 48 - else: - raise KeyError("Unable to determine item size for reference type") - elif typeClass == "H5T_COMPOUND": - if "fields" not in typeItem: - raise KeyError("'fields' not provided for compound type") - fields = typeItem["fields"] - if not isinstance(fields, list): - raise TypeError("Type Error: expected list type for 'fields'") - if not fields: - raise KeyError("no 'field' elements provided") - # add up the size of each sub-field - for field in fields: - if not isinstance(field, dict): - raise TypeError("Expected dictionary type for field") - if "type" not in field: - raise KeyError("'type' missing from field") - subtype_size = getItemSize(field["type"]) # recursive call - if subtype_size == "H5T_VARIABLE": - item_size = "H5T_VARIABLE" - break # don't need to look at the rest - - item_size += subtype_size - else: - raise TypeError("Invalid type class") - - # calculate array type - if "dims" in typeItem and isinstance(item_size, int): - dims = typeItem["dims"] - for dim in dims: - item_size *= dim - - return item_size - - -def getDtypeItemSize(dtype): - """ Return size of dtype in bytes - For variable length types (e.g. 
variable length strings), - return the string "H5T_VARIABLE - """ - item_size = 0 - if len(dtype) > 0: - # compound dtype - for i in range(len(dtype)): - sub_dt = dtype[i] - sub_dt_size = getDtypeItemSize(sub_dt) - if sub_dt_size == "H5T_VARIABLE": - item_size = "H5T_VARIABLE" # return variable if any component is variable - break - item_size += sub_dt_size - else: - # primitive type - if dtype.metadata and "vlen" in dtype.metadata: - item_size = "H5T_VARIABLE" - else: - item_size = dtype.itemsize - return item_size - - -def getNumpyTypename(hdf5TypeName, typeClass=None): - predefined_int_types = { - "H5T_STD_I8": "i1", - "H5T_STD_U8": "u1", - "H5T_STD_I16": "i2", - "H5T_STD_U16": "u2", - "H5T_STD_I32": "i4", - "H5T_STD_U32": "u4", - "H5T_STD_I64": "i8", - "H5T_STD_U64": "u8", - } - predefined_float_types = { - "H5T_IEEE_F16": "f2", - "H5T_IEEE_F32": "f4", - "H5T_IEEE_F64": "f8", - } - - if len(hdf5TypeName) < 3: - raise Exception("Type Error: invalid typename: ") - endian = "<" # default endian - key = hdf5TypeName - if hdf5TypeName.endswith("LE"): - key = hdf5TypeName[:-2] - elif hdf5TypeName.endswith("BE"): - key = hdf5TypeName[:-2] - endian = ">" - - if key in predefined_int_types and ( - typeClass is None or typeClass == "H5T_INTEGER" - ): - return endian + predefined_int_types[key] - if key in predefined_float_types and ( - typeClass is None or typeClass == "H5T_FLOAT" - ): - return endian + predefined_float_types[key] - raise TypeError("Type Error: invalid type") - - -def createBaseDataType(typeItem): - dtRet = None - if isinstance(typeItem, str): - # should be one of the predefined types - dtName = getNumpyTypename(typeItem) - dtRet = np.dtype(dtName) - return dtRet # return predefined type - - if not isinstance(typeItem, dict): - raise TypeError("Type Error: invalid type") - - if "class" not in typeItem: - raise KeyError("'class' not provided") - typeClass = typeItem["class"] - - dims = "" - if "dims" in typeItem: - if typeClass != "H5T_ARRAY": - raise TypeError("'dims' only supported for integer types") - - dims = None - if isinstance(typeItem["dims"], int): - dims = typeItem["dims"] # make into a tuple - elif not isinstance(typeItem["dims"], list) and not isinstance( - typeItem["dims"], tuple - ): - raise TypeError("expected list or integer for dims") - else: - dims = typeItem["dims"] - dims = str(tuple(dims)) - - if typeClass == "H5T_INTEGER": - if "base" not in typeItem: - raise KeyError("'base' not provided") - baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_INTEGER") - dtRet = np.dtype(dims + baseType) - elif typeClass == "H5T_FLOAT": - if "base" not in typeItem: - raise KeyError("'base' not provided") - baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_FLOAT") - dtRet = np.dtype(dims + baseType) - elif typeClass == "H5T_STRING": - if "length" not in typeItem: - raise KeyError("'length' not provided") - if "charSet" not in typeItem: - raise KeyError("'charSet' not provided") - - if typeItem["length"] == "H5T_VARIABLE": - if dims: - msg = "ArrayType is not supported for variable len types" - raise TypeError(msg) - if typeItem["charSet"] == "H5T_CSET_ASCII": - dtRet = special_dtype(vlen=bytes) - elif typeItem["charSet"] == "H5T_CSET_UTF8": - dtRet = special_dtype(vlen=str) - else: - raise TypeError("unexpected 'charSet' value") - else: - nStrSize = typeItem["length"] - if not isinstance(nStrSize, int): - raise TypeError("expecting integer value for 'length'") - type_code = None - if typeItem["charSet"] == "H5T_CSET_ASCII": - type_code = "S" - elif 
typeItem["charSet"] == "H5T_CSET_UTF8": - # use the same type_code as ascii strings - # (othewise, numpy will reserve bytes for UTF32 representation) - type_code = "S" - else: - raise TypeError("unexpected 'charSet' value") - # a fixed size string - dtRet = np.dtype(dims + type_code + str(nStrSize)) - elif typeClass == "H5T_VLEN": - if dims: - msg = "ArrayType is not supported for variable len types" - raise TypeError(msg) - if "base" not in typeItem: - raise KeyError("'base' not provided") - baseType = createBaseDataType(typeItem["base"]) - dtRet = special_dtype(vlen=np.dtype(baseType)) - elif typeClass == "H5T_OPAQUE": - if dims: - msg = "Opaque Type is not supported for variable len types" - raise TypeError(msg) - if "size" not in typeItem: - raise KeyError("'size' not provided") - nSize = int(typeItem["size"]) - if nSize <= 0: - raise TypeError("'size' must be non-negative") - dtRet = np.dtype("V" + str(nSize)) - elif typeClass == "H5T_ARRAY": - if not dims: - raise KeyError("'dims' must be provided for array types") - if "base" not in typeItem: - raise KeyError("'base' not provided") - arrayBaseType = typeItem["base"] - if isinstance(arrayBaseType, dict): - if "class" not in arrayBaseType: - raise KeyError("'class' not provided for array base type") - type_classes = ("H5T_INTEGER", "H5T_FLOAT", "H5T_STRING", "H5T_ARRAY") - if arrayBaseType["class"] not in type_classes: - msg = "Array Type base type must be integer, float, string, or array" - raise TypeError(msg) - baseType = createDataType(arrayBaseType) - metadata = None - if baseType.metadata: - metadata = dict(baseType.metadata) - dtRet = np.dtype(dims + baseType.str, metadata=metadata) - else: - dtRet = np.dtype(dims + baseType.str) - return dtRet # return predefined type - elif typeClass == "H5T_REFERENCE": - if "base" not in typeItem: - raise KeyError("'base' not provided") - if typeItem["base"] == "H5T_STD_REF_OBJ": - dtRet = special_dtype(ref=Reference) - elif typeItem["base"] == "H5T_STD_REF_DSETREG": - dtRet = special_dtype(ref=RegionReference) - else: - raise TypeError("Invalid base type for reference type") - - elif typeClass == "H5T_ENUM": - if "base" not in typeItem: - raise KeyError("Expected 'base' to be provided for enum type") - base_json = typeItem["base"] - if "class" not in base_json: - raise KeyError("Expected class field in base type") - if base_json["class"] != "H5T_INTEGER": - msg = "Only integer base types can be used with enum type" - raise TypeError(msg) - if "mapping" not in typeItem: - raise KeyError("'mapping' not provided for enum type") - mapping = typeItem["mapping"] - if len(mapping) == 0: - raise KeyError("empty enum map") - - dt = createBaseDataType(base_json) - if all( - ( - dt.kind == "i", - dt.name == "int8", - len(mapping) == 2, - "TRUE" in mapping, - "FALSE" in mapping, - ) - ): - # convert to numpy boolean type - dtRet = np.dtype("bool") - else: - # not a boolean enum, use h5py special dtype - dtRet = special_dtype(enum=(dt, mapping)) - - else: - raise TypeError("Invalid type class") - - return dtRet - - -def createDataType(typeItem): - """ - Create a numpy datatype given a json type - """ - dtRet = None - if type(typeItem) in (str, bytes): - # should be one of the predefined types - dtName = getNumpyTypename(typeItem) - dtRet = np.dtype(dtName) - return dtRet # return predefined type - - if not isinstance(typeItem, dict): - raise TypeError("invalid type") - - if "class" not in typeItem: - raise KeyError("'class' not provided") - typeClass = typeItem["class"] - - if typeClass == 
"H5T_COMPOUND": - if "fields" not in typeItem: - raise KeyError("'fields' not provided for compound type") - fields = typeItem["fields"] - if type(fields) is not list: - raise TypeError("Type Error: expected list type for 'fields'") - if not fields: - raise KeyError("no 'field' elements provided") - subtypes = [] - for field in fields: - - if not isinstance(field, dict): - raise TypeError("Expected dictionary type for field") - if "name" not in field: - raise KeyError("'name' missing from field") - if "type" not in field: - raise KeyError("'type' missing from field") - field_name = field["name"] - if not isinstance(field_name, str): - raise TypeError("field names must be strings") - # verify the field name is ascii - try: - field_name.encode("ascii") - except UnicodeEncodeError: - raise TypeError("non-ascii field name not allowed") - - dt = createDataType(field["type"]) # recursive call - if dt is None: - raise Exception("unexpected error") - subtypes.append((field["name"], dt)) # append tuple - - dtRet = np.dtype(subtypes) - else: - dtRet = createBaseDataType(typeItem) # create non-compound dt - return dtRet - - -def validateTypeItem(typeItem): - """ - Validate a json type - call createDataType and if no exception, - it's valid - """ - createDataType(typeItem) - # throws KeyError, TypeError, or ValueError - - -def getBaseTypeJson(type_name): - """ - Return JSON representation of a predefined type string - """ - predefined_int_types = ( - "H5T_STD_I8", - "H5T_STD_U8", - "H5T_STD_I16", - "H5T_STD_U16", - "H5T_STD_I32", - "H5T_STD_U32", - "H5T_STD_I64", - "H5T_STD_U64", - ) - predefined_float_types = ("H5T_IEEE_F16", "H5T_IEEE_F32", "H5T_IEEE_F64") - type_json = {} - # predefined typenames start with 'H5T' and end with "LE" or "BE" - if all( - ( - type_name.startswith("H5T_"), - type_name[-1] == "E", - type_name[-2] in ("L", "B"), - ) - ): - # trime of the "BE/"LE" - type_prefix = type_name[:-2] - if type_prefix in predefined_int_types: - type_json["class"] = "H5T_INTEGER" - type_json["base"] = type_name - elif type_prefix in predefined_float_types: - type_json["class"] = "H5T_FLOAT" - type_json["base"] = type_name - else: - raise TypeError("Invalid type name") - else: - raise TypeError("Invalid type name") - return type_json - - -def getSubType(dt_parent, fields): - """ Return a dtype that is a compound type composed of - the fields given in the field_names list - """ - if len(dt_parent) == 0: - raise TypeError("getSubType - parent must be compound type") - if not fields: - raise TypeError("null field specification") - if isinstance(fields, str): - fields = [fields,] # convert to a list - - field_names = set(dt_parent.names) - dt_items = [] - for field in fields: - if field not in field_names: - raise TypeError(f"field: {field} is not defined in parent type") - dt_items.append((field, dt_parent[field])) - dt = np.dtype(dt_items) - - return dt diff --git a/pyproject.toml b/pyproject.toml index af575c13..9b733a85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ "bitshuffle >=0.5.2", "cryptography", "h5py >= 3.6.0", + "h5json", "importlib_resources", "numcodecs", "numpy >=2.0.0rc1; python_version>='3.9'", diff --git a/testall.py b/testall.py index 1e8ea348..5955553a 100755 --- a/testall.py +++ b/testall.py @@ -16,7 +16,7 @@ PYTHON_CMD = "python" # change to "python3" if "python" invokes python version 2.x unit_tests = ('array_util_test', 'chunk_util_test', 'compression_test', 'domain_util_test', - 'dset_util_test', 'hdf5_dtype_test', 'id_util_test', 
diff --git a/testall.py b/testall.py
index 1e8ea348..5955553a 100755
--- a/testall.py
+++ b/testall.py
@@ -16,7 +16,7 @@
 PYTHON_CMD = "python"  # change to "python3" if "python" invokes python version 2.x
 
 unit_tests = ('array_util_test', 'chunk_util_test', 'compression_test', 'domain_util_test',
-              'dset_util_test', 'hdf5_dtype_test', 'id_util_test', 'lru_cache_test',
+              'dset_util_test', 'id_util_test', 'lru_cache_test',
               'shuffle_test', 'rangeget_util_test')
 
 integ_tests = ('uptest', 'setup_test', 'domain_test', 'group_test',
diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py
index de54c5ea..b9f4dd7e 100644
--- a/tests/integ/attr_test.py
+++ b/tests/integ/attr_test.py
@@ -915,6 +915,7 @@ def testPutCommittedType(self):
             value.append(i * 0.5)
         payload = {"type": dtype_uuid, "shape": 10, "value": value}
         req = self.endpoint + "/groups/" + root_id + "/attributes/" + attr_name
+        print("req:", req)
         rsp = self.session.put(req, data=json.dumps(payload), headers=headers)
         self.assertEqual(rsp.status_code, 201)  # create attribute
diff --git a/tests/integ/vlen_test.py b/tests/integ/vlen_test.py
index e45504e6..38555c5d 100755
--- a/tests/integ/vlen_test.py
+++ b/tests/integ/vlen_test.py
@@ -15,9 +15,8 @@
 import numpy as np
 import sys
 
-sys.path.append("../..")
-from hsds.util.arrayUtil import arrayToBytes, bytesToArray
-from hsds.util.hdf5dtype import createDataType
+from h5json.hdf5dtype import createDataType
+from h5json.array_util import arrayToBytes, bytesToArray
 
 
 class VlenTest(unittest.TestCase):
@@ -646,7 +645,12 @@ def testPutVLenCompoundBinary(self):
 
         # write as binary data
         data = arrayToBytes(arr)
+        print("data:", data)
+        for i in range(len(data)):
+            print(f"{i:04d}: {data[i]}")
         self.assertEqual(len(data), 192)  # will vary based on count
+        arr_copy = bytesToArray(data, dt_compound, (count,))
+        print("arr_copy:", arr_copy)
         req = self.endpoint + "/datasets/" + dset_uuid + "/value"
         rsp = self.session.put(req, data=data, headers=headers_bin_req)
         self.assertEqual(rsp.status_code, 200)
diff --git a/tests/unit/array_util_test.py b/tests/unit/array_util_test.py
index 1a4f40e5..854e1314 100644
--- a/tests/unit/array_util_test.py
+++ b/tests/unit/array_util_test.py
@@ -16,6 +16,10 @@
 import sys
 import base64
 
+from h5json.hdf5dtype import special_dtype
+from h5json.hdf5dtype import check_dtype
+from h5json.hdf5dtype import createDataType
+
 sys.path.append("../..")
 from hsds.util.arrayUtil import (
     bytesArrayToList,
@@ -30,9 +34,6 @@
     getNumpyValue,
     getBroadcastShape
 )
-from hsds.util.hdf5dtype import special_dtype
-from hsds.util.hdf5dtype import check_dtype
-from hsds.util.hdf5dtype import createDataType
 
 
 class ArrayUtilTest(unittest.TestCase):
@@ -401,6 +402,11 @@ def testToBytes(self):
 
         # convert back to array
         arr_copy = bytesToArray(buffer, dt, (5,))
+        print("arr_copy bytes:", arrayToBytes(arr_copy))
+        print("arr_copy:", arr_copy)
+        print("arr_copy dt:", arr_copy.dtype)
+        print("arr_copy metadata:", arr_copy.dtype.metadata)
+        print("arr_copy kind:", arr_copy.dtype.kind)
         self.assertTrue(ndarray_compare(arr, arr_copy))
         # VLEN of bytes
         dt = np.dtype("O", metadata={"vlen": bytes})
diff --git a/tests/unit/hdf5_dtype_test.py b/tests/unit/hdf5_dtype_test.py
deleted file mode 100755
index e51913a6..00000000
--- a/tests/unit/hdf5_dtype_test.py
+++ /dev/null
@@ -1,717 +0,0 @@
-##############################################################################
-# Copyright by The HDF Group.                                                #
-# All rights reserved.                                                       #
-#                                                                            #
-# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and     #
-# Utilities.  The full HSDS copyright notice, including                     #
-# terms governing use, modification, and redistribution, is contained in    #
-# the file COPYING, which can be found at the root of the source code       #
-# distribution tree.  If you do not have access to this file, you may       #
-# request a copy from help@hdfgroup.org.
# -############################################################################## -import unittest -import logging -import numpy as np -import sys - -sys.path.append("../..") -from hsds.util import hdf5dtype -from hsds.util.hdf5dtype import special_dtype -from hsds.util.hdf5dtype import check_dtype -from hsds.util.hdf5dtype import Reference -from hsds.util.hdf5dtype import RegionReference - - -class Hdf5dtypeTest(unittest.TestCase): - def __init__(self, *args, **kwargs): - super(Hdf5dtypeTest, self).__init__(*args, **kwargs) - # main - self.logger = logging.getLogger() - self.logger.setLevel(logging.INFO) - - def testGetBaseTypeJson(self): - type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F64LE") - self.assertTrue("class" in type_json) - self.assertEqual(type_json["class"], "H5T_FLOAT") - self.assertTrue("base" in type_json) - self.assertEqual(type_json["base"], "H5T_IEEE_F64LE") - - type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F16LE") - self.assertTrue("class" in type_json) - self.assertEqual(type_json["class"], "H5T_FLOAT") - self.assertTrue("base" in type_json) - self.assertEqual(type_json["base"], "H5T_IEEE_F16LE") - - type_json = hdf5dtype.getBaseTypeJson("H5T_STD_I32LE") - self.assertTrue("class" in type_json) - self.assertEqual(type_json["class"], "H5T_INTEGER") - self.assertTrue("base" in type_json) - self.assertEqual(type_json["base"], "H5T_STD_I32LE") - - try: - hdf5dtype.getBaseTypeJson("foobar") - self.assertTrue(False) - except TypeError: - pass # expected - - def testBaseIntegerTypeItem(self): - dt = np.dtype("") - self.assertEqual(dt.kind, "u") - - dt = hdf5dtype.createDataType("H5T_STD_I16LE") - self.assertEqual(dt.name, "int16") - self.assertEqual(dt.kind, "i") - - dt = hdf5dtype.createDataType("H5T_IEEE_F64LE") - self.assertEqual(dt.name, "float64") - self.assertEqual(dt.kind, "f") - - dt = hdf5dtype.createDataType("H5T_IEEE_F32LE") - self.assertEqual(dt.name, "float32") - self.assertEqual(dt.kind, "f") - - typeItem = {"class": "H5T_INTEGER", "base": "H5T_STD_I32BE"} - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "int32") - self.assertEqual(dt.kind, "i") - self.assertEqual(typeSize, 4) - - def testCreateBaseStringType(self): - typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_ASCII", "length": 6} - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "bytes48") - self.assertEqual(dt.kind, "S") - self.assertEqual(typeSize, 6) - - def testCreateBaseUnicodeType(self): - typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_UTF8", "length": 6} - - dt = hdf5dtype.createDataType(typeItem) - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertTrue(dt is not None) - self.assertEqual(dt.name, "bytes48") - self.assertEqual(dt.kind, "S") # uses byte - self.assertEqual(typeSize, 6) - - def testCreateNullTermStringType(self): - typeItem = { - "class": "H5T_STRING", - "charSet": "H5T_CSET_ASCII", - "length": 6, - "strPad": "H5T_STR_NULLTERM", - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - - self.assertEqual(dt.name, "bytes48") - self.assertEqual(dt.kind, "S") - self.assertEqual(typeSize, 6) - - def testCreateVLenStringType(self): - typeItem = { - "class": "H5T_STRING", - "charSet": "H5T_CSET_ASCII", - "length": "H5T_VARIABLE", - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "object") - self.assertEqual(dt.kind, "O") - 
self.assertEqual(check_dtype(vlen=dt), bytes) - self.assertEqual(typeSize, "H5T_VARIABLE") - - def testCreateVLenUTF8Type(self): - typeItem = { - "class": "H5T_STRING", - "charSet": "H5T_CSET_UTF8", - "length": "H5T_VARIABLE", - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "object") - self.assertEqual(dt.kind, "O") - self.assertEqual(check_dtype(vlen=dt), str) - self.assertEqual(typeSize, "H5T_VARIABLE") - - def testCreateVLenDataType(self): - typeItem = {"class": "H5T_VLEN", "base": "H5T_STD_I32BE"} - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertEqual(typeSize, "H5T_VARIABLE") - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "object") - self.assertEqual(dt.kind, "O") - - def testCreateOpaqueType(self): - typeItem = {"class": "H5T_OPAQUE", "size": 200} - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void1600") - self.assertEqual(dt.kind, "V") - self.assertEqual(typeSize, 200) - - def testCreateEnumType(self): - typeItem = { - "class": "H5T_ENUM", - "base": {"base": "H5T_STD_I16LE", "class": "H5T_INTEGER"}, - "mapping": {"GAS": 2, "LIQUID": 1, "PLASMA": 3, "SOLID": 0}, - } - - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertEqual(typeSize, 2) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "int16") - self.assertEqual(dt.kind, "i") - mapping = check_dtype(enum=dt) - self.assertTrue(isinstance(mapping, dict)) - self.assertEqual(mapping["SOLID"], 0) - self.assertEqual(mapping["LIQUID"], 1) - self.assertEqual(mapping["GAS"], 2) - self.assertEqual(mapping["PLASMA"], 3) - - def testCreateBoolType(self): - typeItem = { - "class": "H5T_ENUM", - "base": {"base": "H5T_STD_I8LE", "class": "H5T_INTEGER"}, - "mapping": {"TRUE": 1, "FALSE": 0}, - } - - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertEqual(typeSize, 1) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "bool") - self.assertEqual(dt.kind, "b") - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - def testCreateCompoundType(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - {"name": "temp", "type": "H5T_IEEE_F32LE"}, - {"name": "pressure", "type": "H5T_IEEE_F32LE"}, - { - "name": "location", - "type": { - "length": "H5T_VARIABLE", - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - }, - }, - {"name": "wind", "type": "H5T_STD_I16LE"}, - ], - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void144") - self.assertEqual(dt.kind, "V") - self.assertEqual(len(dt.fields), 4) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - dtLocation = dt[2] - self.assertEqual(dtLocation.name, "object") - self.assertEqual(dtLocation.kind, "O") - self.assertEqual(check_dtype(vlen=dtLocation), bytes) - self.assertEqual(typeSize, "H5T_VARIABLE") - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dtLocation)) - - def testCreateCompoundInvalidFieldName(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - { - "name": "\u03b1", - "type": {"base": "H5T_STD_I32LE", "class": "H5T_INTEGER"}, - }, - { - "name": "\u03c9", - "type": {"base": "H5T_STD_I32LE", "class": "H5T_INTEGER"}, - }, - ], - } - try: - hdf5dtype.createDataType(typeItem) - self.assertTrue(False) - except TypeError: - pass # expected - - def testCreateCompoundOfCompoundType(self): - typeItem = { - "class": "H5T_COMPOUND", - 
"fields": [ - { - "name": "field1", - "type": { - "class": "H5T_COMPOUND", - "fields": [ - { - "name": "x", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - { - "name": "y", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - ], - }, - }, - { - "name": "field2", - "type": { - "class": "H5T_COMPOUND", - "fields": [ - { - "name": "a", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - { - "name": "b", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - { - "name": "c", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - ], - }, - }, - ], - } - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void160") - self.assertEqual(dt.kind, "V") - self.assertEqual(len(dt.fields), 2) - dt_field1 = dt[0] - self.assertEqual(dt_field1.name, "void64") - self.assertEqual(dt_field1.kind, "V") - self.assertEqual(len(dt_field1.fields), 2) - dt_field2 = dt[1] - self.assertEqual(dt_field2.name, "void96") - self.assertEqual(dt_field2.kind, "V") - self.assertEqual(len(dt_field2.fields), 3) - - def testCreateCompoundTypeUnicodeFields(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - {"name": u"temp", "type": "H5T_IEEE_F32LE"}, - {"name": u"pressure", "type": "H5T_IEEE_F32LE"}, - {"name": u"wind", "type": "H5T_STD_I16LE"}, - ], - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void80") - self.assertEqual(dt.kind, "V") - self.assertEqual(len(dt.fields), 3) - self.assertEqual(typeSize, 10) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - def testCreateArrayType(self): - typeItem = {"class": "H5T_ARRAY", "base": "H5T_STD_I64LE", "dims": (3, 5)} - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void960") - self.assertEqual(dt.kind, "V") - self.assertEqual(typeSize, 120) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - def testCreateArrayIntegerType(self): - typeItem = {"class": "H5T_INTEGER", "base": "H5T_STD_I64LE", "dims": (3, 5)} - - try: - hdf5dtype.createDataType(typeItem) - self.assertTrue(False) # expected exception - dims used with non-array type - except TypeError: - pass # should get exception - - def testCreateCompoundArrayType(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - {"type": {"base": "H5T_STD_I8LE", "class": "H5T_INTEGER"}, "name": "a"}, - { - "type": { - "dims": [10], - "base": { - "length": 1, - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLPAD", - }, - "class": "H5T_ARRAY", - }, - "name": "b", - }, - ], - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(len(dt.fields), 2) - self.assertTrue("a" in dt.fields.keys()) - self.assertTrue("b" in dt.fields.keys()) - self.assertEqual(typeSize, 11) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - def testCompoundArrayType(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - { - "type": {"class": "H5T_INTEGER", "base": "H5T_STD_U64BE"}, - "name": "VALUE1", - }, - { - "type": {"class": "H5T_FLOAT", "base": "H5T_IEEE_F64BE"}, - "name": "VALUE2", - }, - { - "type": { - "class": "H5T_ARRAY", - "dims": [2], - "base": { - "class": "H5T_STRING", - "charSet": "H5T_CSET_ASCII", - "strPad": "H5T_STR_NULLTERM", - "length": "H5T_VARIABLE", - }, - }, - "name": "VALUE3", - }, - ], - } - dt = 
hdf5dtype.createDataType(typeItem) - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertEqual(typeSize, "H5T_VARIABLE") - self.assertEqual(len(dt), 3) - self.assertTrue("VALUE1" in dt.fields.keys()) - self.assertTrue("VALUE2" in dt.fields.keys()) - self.assertTrue("VALUE3" in dt.fields.keys()) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - dt3 = dt["VALUE3"] - self.assertEqual(check_dtype(vlen=dt3), bytes) - - -if __name__ == "__main__": - # setup test files - - unittest.main() From abb5d0c2fd3aab428b0418a18f26982fd4cb0c8d Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 14 Apr 2025 17:59:27 +0200 Subject: [PATCH 02/49] temp use of github branch for h5json ref --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9b733a85..33ab5dd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ dependencies = [ "bitshuffle >=0.5.2", "cryptography", "h5py >= 3.6.0", - "h5json", + "h5json@git+https://github.com/HDFGroup/hdf5-json@abstract", "importlib_resources", "numcodecs", "numpy >=2.0.0rc1; python_version>='3.9'", From ed44afabfbb8aee373f144cafd9dfedbf3ac5c32 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 14 Apr 2025 18:03:13 +0200 Subject: [PATCH 03/49] remove array_util test --- testall.py | 2 +- tests/unit/array_util_test.py | 1031 --------------------------------- 2 files changed, 1 insertion(+), 1032 deletions(-) delete mode 100644 tests/unit/array_util_test.py diff --git a/testall.py b/testall.py index 5955553a..247d4a91 100755 --- a/testall.py +++ b/testall.py @@ -15,7 +15,7 @@ PYTHON_CMD = "python" # change to "python3" if "python" invokes python version 2.x -unit_tests = ('array_util_test', 'chunk_util_test', 'compression_test', 'domain_util_test', +unit_tests = ('chunk_util_test', 'compression_test', 'domain_util_test', 'dset_util_test', 'id_util_test', 'lru_cache_test', 'shuffle_test', 'rangeget_util_test') diff --git a/tests/unit/array_util_test.py b/tests/unit/array_util_test.py deleted file mode 100644 index 854e1314..00000000 --- a/tests/unit/array_util_test.py +++ /dev/null @@ -1,1031 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. 
# -############################################################################## -import unittest -import json -import numpy as np - -import sys -import base64 - -from h5json.hdf5dtype import special_dtype -from h5json.hdf5dtype import check_dtype -from h5json.hdf5dtype import createDataType - -sys.path.append("../..") -from hsds.util.arrayUtil import ( - bytesArrayToList, - toTuple, - getNumElements, - jsonToArray, - arrayToBytes, - bytesToArray, - getByteArraySize, - IndexIterator, - ndarray_compare, - getNumpyValue, - getBroadcastShape -) - - -class ArrayUtilTest(unittest.TestCase): - def __init__(self, *args, **kwargs): - super(ArrayUtilTest, self).__init__(*args, **kwargs) - # main - - def testByteArrayToList(self): - data_items = ( - 42, - "foo", - b"foo", - [1, 2, 3], - (1, 2, 3), - ["A", "B", "C"], - [b"A", b"B", b"C"], - [["A", "B"], [b"a", b"b", b"c"]], - ) - for data in data_items: - json_data = bytesArrayToList(data) - # will throw TypeError if not able to convert - json.dumps(json_data) - - def testToTuple(self): - data0d = 42 # scalar - data1d1 = [1] # one dimensional, one element list - data1d = [1, 2, 3, 4, 5] # list - data2d1 = [ - [1, 2], - ] # two dimensional, one element - data2d = [[1, 0.1], [2, 0.2], [3, 0.3], [4, 0.4]] # list of two-element lists - data3d = [[[0, 0.0], [1, 0.1]], [[2, 0.2], [3, 0.3]]] # list of list of lists - out = toTuple(0, data0d) - self.assertEqual(data0d, out) - out = toTuple(1, data1d1) - self.assertEqual(data1d1, out) - out = toTuple(1, data1d) - self.assertEqual(data1d, out) - out = toTuple(2, data2d) - self.assertEqual(data2d, out) - out = toTuple(1, data2d1) - self.assertEqual([(1, 2)], out) - out = toTuple(3, data3d) - self.assertEqual(data3d, out) - out = toTuple(1, data2d) # treat input as 1d array of two-field compound types - self.assertEqual([(1, 0.1), (2, 0.2), (3, 0.3), (4, 0.4)], out) - out = toTuple(2, data3d) # treat input as 2d array of two-field compound types - self.assertEqual([[(0, 0.0), (1, 0.1)], [(2, 0.2), (3, 0.3)]], out) - out = toTuple(1, data3d) # treat input a 1d array of compound type of compound types - self.assertEqual([((0, 0.0), (1, 0.1)), ((2, 0.2), (3, 0.3))], out) - - def testGetNumElements(self): - shape = (4,) - nelements = getNumElements(shape) - self.assertEqual(nelements, 4) - - shape = [10,] - nelements = getNumElements(shape) - self.assertEqual(nelements, 10) - - shape = (10, 8) - nelements = getNumElements(shape) - self.assertEqual(nelements, 80) - - def testJsonToArray(self): - dt = np.dtype("i4") - shape = [4, ] - data = [0, 2, 4, 6] - out = jsonToArray(shape, dt, data) - - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(out.shape, (4,)) - for i in range(4): - self.assertEqual(out[i], i * 2) - - # compound type - dt = np.dtype([("a", "i4"), ("b", "S5")]) - shape = [2, ] - data = [[4, "four"], [5, "five"]] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - - self.assertEqual(out.shape, (2,)) - self.assertTrue(isinstance(out[0], np.void)) - e0 = out[0].tolist() - self.assertEqual(e0, (4, b"four")) - self.assertTrue(isinstance(out[1], np.void)) - e1 = out[1].tolist() - self.assertEqual(e1, (5, b"five")) - - shape = [1, ] - data = [ - [6, "six"], - ] - out = jsonToArray(shape, dt, data) - e0 = out[0].tolist() - self.assertEqual(e0, (6, b"six")) - - data = [6, "six"] - out = jsonToArray(shape, dt, data) - e0 = out[0].tolist() - self.assertEqual(e0, (6, b"six")) - - # test ascii chars >127 - dt = np.dtype("S26") - data = "extended ascii char 241: " + 
chr(241) - out = jsonToArray(shape, dt, data) - self.assertEqual(out[0], b'extended ascii char 241: \xc3') - - dt = np.dtype("S12") - data = "eight: \u516b" - out = jsonToArray(shape, dt, data) - self.assertEqual(out[0], b'eight: \xe5\x85\xab') - - # VLEN ascii - dt = special_dtype(vlen=bytes) - data = [b"one", b"two", b"three", b"four", b"five"] - shape = [5, ] - out = jsonToArray(shape, dt, data) - self.assertTrue("vlen" in out.dtype.metadata) - self.assertEqual(out.dtype.metadata["vlen"], bytes) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out.shape, (5,)) - # TBD: code does not actually enforce use of bytes vs. str, - # probably not worth the effort to fix - self.assertEqual(out[2], b"three") - self.assertEqual(out[3], b"four") - - # VLEN str - dt = special_dtype(vlen=str) - data = [ - [b"part 1 - section A", b"part 1 - section B"], - [b"part 2 - section A", b"part 2 - section B"], - ] - shape = [2,] - out = jsonToArray(shape, dt, data) - self.assertTrue("vlen" in out.dtype.metadata) - self.assertEqual(out.dtype.metadata["vlen"], str) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out.shape, (2,)) - self.assertEqual(out[0], tuple(data[0])) - self.assertEqual(out[1], tuple(data[1])) - - # VLEN Scalar str - dt = special_dtype(vlen=str) - data = "I'm a string!" - shape = [1, ] - out = jsonToArray(shape, dt, data) - - # VLEN unicode - dt = special_dtype(vlen=bytes) - data = ["one", "two", "three", "four", "five"] - shape = [5, ] - out = jsonToArray(shape, dt, data) - self.assertTrue("vlen" in out.dtype.metadata) - self.assertEqual(out.dtype.metadata["vlen"], bytes) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out[2], b"three") - - # VLEN data - dt = special_dtype(vlen=np.dtype("int32")) - shape = [4, ] - data = [ - [1,], - [1, 2], - [1, 2, 3], - [1, 2, 3, 4], - ] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) - - self.assertEqual(out.shape, (4,)) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) - for i in range(4): - e = out[i] # .tolist() - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, tuple(range(1, i + 2))) - - # VLEN 2D data - dt = special_dtype(vlen=np.dtype("int32")) - shape = [2, 2] - data = [ - [ - [0,], - [1, 2], - ], - [ - [1,], - [2, 3], - ], - ] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) - - self.assertEqual(out.shape, (2, 2)) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) - for i in range(2): - for j in range(2): - e = out[i, j] # .tolist() - self.assertTrue(isinstance(e, tuple)) - - # create VLEN of obj ref's - ref_type = {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"} - vlen_type = {"class": "H5T_VLEN", "base": ref_type} - dt = createDataType(vlen_type) # np datatype - - id0 = b"g-a4f455b2-c8cf-11e7-8b73-0242ac110009" - id1 = b"g-a50af844-c8cf-11e7-8b73-0242ac110009" - id2 = b"g-a5236276-c8cf-11e7-8b73-0242ac110009" - - data = [ - [id0, ], - [id0, id1], - [id0, id1, id2], - ] - shape = [3, ] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - base_type = check_dtype(vlen=out.dtype) - self.assertEqual(base_type.kind, "S") - self.assertEqual(base_type.itemsize, 48) - - self.assertEqual(out.shape, (3,)) - self.assertEqual(out.dtype.kind, "O") - 
self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("S48")) - - e = out[0] - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, (id0,)) - e = out[1] - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, (id0, id1)) - e = out[2] - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, (id0, id1, id2)) - - # compound type with array field - dt = np.dtype([("a", ("i4", 3)), ("b", "S5")]) - shape = [2, ] - data = [[[4, 8, 12], "four"], [[5, 10, 15], "five"]] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - - self.assertEqual(out.shape, (2,)) - self.assertTrue(isinstance(out[0], np.void)) - e0 = out[0] - self.assertEqual(len(e0), 2) - e0a = e0[0] - self.assertTrue(isinstance(e0a, np.ndarray)) - self.assertEqual(e0a[0], 4) - self.assertEqual(e0a[1], 8) - self.assertEqual(e0a[2], 12) - e0b = e0[1] - self.assertEqual(e0b, b"four") - self.assertTrue(isinstance(out[1], np.void)) - e1 = out[1] - self.assertEqual(len(e1), 2) - e1a = e1[0] - self.assertTrue(isinstance(e1a, np.ndarray)) - self.assertEqual(e1a[0], 5) - self.assertEqual(e1a[1], 10) - self.assertEqual(e1a[2], 15) - e1b = e1[1] - self.assertEqual(e1b, b"five") - - def testToBytes(self): - # Simple array - dt = np.dtype(" expected_num_bytes) - - # convert buffer back to arr - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(np.array_equal(arr, arr_copy)) - - # fixed length string - dt = np.dtype("S8") - arr = np.asarray(("abcdefgh", "ABCDEFGH", "12345678"), dtype=dt) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (3,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # Compound non-vlen - dt = np.dtype([("x", "f8"), ("y", "i4")]) - arr = np.zeros((4,), dtype=dt) - arr[0] = (3.12, 42) - arr[3] = (1.28, 69) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # VLEN of int32's - dt = np.dtype("O", metadata={"vlen": np.dtype("int32")}) - arr = np.zeros((4,), dtype=dt) - arr[0] = np.int32([1, ]) - arr[1] = np.int32([1, 2]) - arr[2] = 0 # test un-intialized value - arr[3] = np.int32([1, 2, 3]) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # VLEN of strings - dt = np.dtype("O", metadata={"vlen": str}) - arr = np.zeros((5,), dtype=dt) - arr[0] = "one: \u4e00" - arr[1] = "two: \u4e8c" - arr[2] = "three: \u4e09" - arr[3] = "four: \u56db" - arr[4] = 0 - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (5,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - # VLEN of bytes - dt = np.dtype("O", metadata={"vlen": bytes}) - arr = np.zeros((5,), dtype=dt) - arr[0] = b"Parting" - arr[1] = b"is such" - arr[2] = b"sweet" - arr[3] = b"sorrow" - arr[4] = 0 - - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (5,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # - # Compound str vlen - # - dt_vstr = np.dtype("O", metadata={"vlen": str}) - dt = np.dtype([("x", "i4"), ("tag", dt_vstr), ("code", "S4")]) - arr = np.zeros((4,), dtype=dt) - arr[0] = (42, "Hello", "X1") - arr[3] = (84, "Bye", "XYZ") - count = getByteArraySize(arr) - 
buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # - # Compound int vlen - # - dt_vint = np.dtype("O", metadata={"vlen": "int32"}) - dt = np.dtype([("x", "int32"), ("tag", dt_vint)]) - arr = np.zeros((4,), dtype=dt) - arr[0] = (42, np.array((), dtype="int32")) - arr[3] = (84, np.array((1, 2, 3), dtype="int32")) - count = getByteArraySize(arr) - self.assertEqual(count, 44) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # - # VLEN utf string with array type - # - dt_arr_str = np.dtype("(2,)O", metadata={"vlen": str}) - dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) - arr = np.zeros((4,), dtype=dt) - dt_str = np.dtype("O", metadata={"vlen": str}) - arr[0] = (42, np.asarray(["hi", "bye"], dtype=dt_str)) - arr[3] = (84, np.asarray(["hi-hi", "bye-bye"], dtype=dt_str)) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - - self.assertEqual(arr.dtype, arr_copy.dtype) - self.assertEqual(arr.shape, arr_copy.shape) - for i in range(4): - e = arr[i] - e_copy = arr_copy[i] - self.assertTrue(np.array_equal(e, e_copy)) - # - # VLEN ascii with array type - # - dt_arr_str = np.dtype("(2,)O", metadata={"vlen": bytes}) - dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) - arr = np.zeros((4,), dtype=dt) - dt_str = np.dtype("O", metadata={"vlen": bytes}) - arr[0] = (42, np.asarray([b"hi", b"bye"], dtype=dt_str)) - arr[3] = (84, np.asarray([b"hi-hi", b"bye-bye"], dtype=dt_str)) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - def testArrayCompareInt(self): - # Simple array - dt = np.dtype(" Date: Mon, 14 Apr 2025 18:07:58 +0200 Subject: [PATCH 04/49] use h5json for ndarray_compare function --- hsds/util/chunkUtil.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hsds/util/chunkUtil.py b/hsds/util/chunkUtil.py index dc03cc89..9c984de6 100644 --- a/hsds/util/chunkUtil.py +++ b/hsds/util/chunkUtil.py @@ -1,6 +1,8 @@ import numpy as np + +from h5json.array_util import ndarray_compare + from .. 
import hsds_logger as log -from .arrayUtil import ndarray_compare CHUNK_BASE = 16 * 1024 # Multiplier by which chunks are adjusted CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) From 3904cf9eae97108f69d1ea7cb28d49c76d3ac957 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Apr 2025 16:22:06 +0200 Subject: [PATCH 05/49] use h5json objid funcs --- hsds/async_lib.py | 5 +- hsds/attr_sn.py | 2 +- hsds/basenode.py | 7 +- hsds/chunk_crawl.py | 2 +- hsds/chunk_dn.py | 3 +- hsds/chunk_sn.py | 2 +- hsds/ctype_dn.py | 3 +- hsds/ctype_sn.py | 2 +- hsds/datanode.py | 5 +- hsds/datanode_lib.py | 8 +- hsds/domain_crawl.py | 4 +- hsds/domain_dn.py | 2 +- hsds/domain_sn.py | 6 +- hsds/dset_dn.py | 2 +- hsds/dset_lib.py | 3 +- hsds/dset_sn.py | 2 +- hsds/folder_crawl.py | 3 +- hsds/group_dn.py | 3 +- hsds/group_sn.py | 3 +- hsds/headnode.py | 2 +- hsds/link_dn.py | 3 +- hsds/link_sn.py | 4 +- hsds/servicenode_lib.py | 5 +- hsds/util/httpUtil.py | 3 +- hsds/util/idUtil.py | 540 ------------------------------------- testall.py | 2 +- tests/integ/vlen_test.py | 5 - tests/unit/id_util_test.py | 212 --------------- 28 files changed, 54 insertions(+), 789 deletions(-) delete mode 100644 hsds/util/idUtil.py delete mode 100755 tests/unit/id_util_test.py diff --git a/hsds/async_lib.py b/hsds/async_lib.py index 15d67f5f..715e7985 100755 --- a/hsds/async_lib.py +++ b/hsds/async_lib.py @@ -18,8 +18,9 @@ from h5json.hdf5dtype import getItemSize from h5json.hdf5dtype import createDataType from h5json.array_util import getNumElements, bytesToArray -from .util.idUtil import isValidUuid, isSchema2Id, getS3Key, isS3ObjKey -from .util.idUtil import getObjId, isValidChunkId, getCollectionForId +from h5json.objid import isValidUuid, isSchema2Id, getS3Key, isS3ObjKey +from h5json.objid import getObjId, isValidChunkId, getCollectionForId + from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator from .util.dsetUtil import getHyperslabSelection, getFilterOps, getChunkDims, getFilters from .util.dsetUtil import getDatasetLayoutClass, getDatasetLayout, getShapeDims diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index a735c5c6..c5d76227 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -22,10 +22,10 @@ from h5json.hdf5dtype import createDataType, getItemSize from h5json.array_util import jsonToArray, getNumElements, bytesArrayToList from h5json.array_util import bytesToArray, arrayToBytes, decodeData, encodeData +from h5json.objid import isValidUuid, getRootObjId from .util.httpUtil import getAcceptType, jsonResponse, getHref, getBooleanParam from .util.globparser import globmatch -from .util.idUtil import isValidUuid, getRootObjId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot diff --git a/hsds/basenode.py b/hsds/basenode.py index f3356f34..6dd83b64 100644 --- a/hsds/basenode.py +++ b/hsds/basenode.py @@ -25,15 +25,18 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web_exceptions import HTTPServiceUnavailable + + from . import config from .util.httpUtil import http_get, http_post, jsonResponse -from .util.idUtil import createNodeId, getNodeNumber, getNodeCount from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.authUtil import isAdminUser from .util.k8sClient import getDnLabelSelector, getPodIps +from .util.nodeUtil import createNodeId, getNodeNumber, getNodeCount + from . 
import hsds_logger as log -HSDS_VERSION = "0.9.2" +HSDS_VERSION = "1.0.0" def getVersion(): diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index a153bfe8..960cdadf 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -28,9 +28,9 @@ from h5json.array_util import jsonToArray, getNumpyValue from h5json.array_util import getNumElements, arrayToBytes, bytesToArray +from .util.nodeUtil import getDataNodeUrl, getNodeCount from .util.httpUtil import http_get, http_put, http_post, get_http_client from .util.httpUtil import isUnixDomainUrl -from .util.idUtil import getDataNodeUrl, getNodeCount from .util.dsetUtil import getSliceQueryParam, getShapeDims from .util.dsetUtil import getSelectionShape, getChunkLayout from .util.chunkUtil import getChunkCoverage, getDataCoverage diff --git a/hsds/chunk_dn.py b/hsds/chunk_dn.py index eeeed88d..97e86f01 100644 --- a/hsds/chunk_dn.py +++ b/hsds/chunk_dn.py @@ -22,9 +22,9 @@ from h5json.hdf5dtype import createDataType, getSubType from h5json.array_util import bytesToArray, arrayToBytes, getBroadcastShape +from h5json.objid import getS3Key, isValidUuid from .util.httpUtil import request_read, getContentType -from .util.idUtil import getS3Key, validateInPartition, isValidUuid from .util.storUtil import isStorObj, deleteStorObj from .util.dsetUtil import getSelectionList, getChunkLayout, getShapeDims from .util.dsetUtil import getSelectionShape, getChunkInitializer @@ -33,6 +33,7 @@ from .util.chunkUtil import chunkWritePoints, chunkReadPoints from .util.domainUtil import isValidBucketName from .util.boolparser import BooleanParser +from .util.nodeUtil import validateInPartition from .datanode_lib import get_metadata_obj, get_chunk, save_chunk from . import hsds_logger as log diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 921feaf0..4bb084b3 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -28,10 +28,10 @@ from h5json.hdf5dtype import getItemSize, getDtypeItemSize, getSubType, createDataType from h5json.array_util import bytesArrayToList, jsonToArray, getNumElements, arrayToBytes from h5json.array_util import bytesToArray, squeezeArray, getBroadcastShape +from h5json.objid import isValidUuid from .util.httpUtil import getHref, getAcceptType, getContentType from .util.httpUtil import request_read, jsonResponse, isAWSLambda -from .util.idUtil import isValidUuid from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain from .util.dsetUtil import isNullSpace, isScalarSpace, get_slices, getShapeDims diff --git a/hsds/ctype_dn.py b/hsds/ctype_dn.py index f06b98b3..fe8a67a7 100755 --- a/hsds/ctype_dn.py +++ b/hsds/ctype_dn.py @@ -18,7 +18,8 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response -from .util.idUtil import isValidUuid, validateUuid +from h5json.objid import isValidUuid, validateUuid + from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj from .datanode_lib import delete_metadata_obj, check_metadata_obj from .util.domainUtil import isValidBucketName diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index 59faccd1..d85ffc07 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -18,10 +18,10 @@ from json import JSONDecodeError from h5json.hdf5dtype import validateTypeItem, getBaseTypeJson +from h5json.objid import isValidUuid from .util.httpUtil import getHref, respJsonAssemble, getBooleanParam from .util.httpUtil import jsonResponse -from .util.idUtil import isValidUuid from .util.linkUtil import 
validateLinkName from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword diff --git a/hsds/datanode.py b/hsds/datanode.py index b7c00b9d..cef44bd0 100644 --- a/hsds/datanode.py +++ b/hsds/datanode.py @@ -17,10 +17,11 @@ import traceback from aiohttp.web import run_app +from h5json.objid import isValidUuid, isSchema2Id, getCollectionForId +from h5json.objid import isRootObjId + from . import config from .util.lruCache import LruCache -from .util.idUtil import isValidUuid, isSchema2Id, getCollectionForId -from .util.idUtil import isRootObjId from .util.httpUtil import isUnixDomainUrl, bindToSocket, getPortFromUrl from .util.httpUtil import jsonResponse, release_http_client from .util.storUtil import setBloscThreads, getBloscThreads diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py index 1c6c3b6c..48843a25 100644 --- a/hsds/datanode_lib.py +++ b/hsds/datanode_lib.py @@ -22,10 +22,11 @@ from h5json.hdf5dtype import createDataType from h5json.array_util import arrayToBytes, bytesToArray, jsonToArray +from h5json.objid import getS3Key, isValidUuid +from h5json.objid import isValidChunkId, isSchema2Id +from h5json.objid import getRootObjId, isRootObjId -from .util.idUtil import validateInPartition, getS3Key, isValidUuid -from .util.idUtil import isValidChunkId, getDataNodeUrl, isSchema2Id -from .util.idUtil import getRootObjId, isRootObjId +from .util.nodeUtil import getDataNodeUrl from .util.storUtil import getStorJSONObj, putStorJSONObj, putStorBytes from .util.storUtil import getStorBytes, isStorObj, deleteStorObj, getHyperChunks from .util.storUtil import getBucketFromStorURI, getKeyFromStorURI, getURIFromKey @@ -35,6 +36,7 @@ from .util.dsetUtil import getChunkLayout, getFilterOps, getShapeDims from .util.dsetUtil import getChunkInitializer, getSliceQueryParam, getFilters from .util.chunkUtil import getDatasetId, getChunkSelection, getChunkIndex +from .util.nodeUtil import validateInPartition from .util.rangegetUtil import ChunkLocation, chunkMunge, getHyperChunkIndex, getHyperChunkFactors from .util.timeUtil import getNow from . import config diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index b8e0ba39..656b04e6 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -18,8 +18,10 @@ from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPConflict, HTTPBadRequest from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone +from h5json.objid import getCollectionForId + +from .util.nodeUtil import getDataNodeUrl from .util.httpUtil import isOK -from .util.idUtil import getCollectionForId, getDataNodeUrl from .util.globparser import globmatch from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks, putLinks from . 
import hsds_logger as log diff --git a/hsds/domain_dn.py b/hsds/domain_dn.py index 83932e5d..0fe0d01c 100755 --- a/hsds/domain_dn.py +++ b/hsds/domain_dn.py @@ -18,7 +18,7 @@ from .util.authUtil import getAclKeys from .util.domainUtil import isValidDomain, getBucketForDomain -from .util.idUtil import validateInPartition +from .util.nodeUtil import validateInPartition from .util.timeUtil import getNow from .datanode_lib import get_metadata_obj, save_metadata_obj from .datanode_lib import delete_metadata_obj, check_metadata_obj diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index 56d3611a..efbc31ab 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -22,11 +22,13 @@ from aiohttp.web_exceptions import HTTPConflict, HTTPServiceUnavailable from aiohttp.web import json_response +from h5json.objid import createObjId, getCollectionForId +from h5json.objid import isValidUuid, isSchema2Id + +from .util.nodeUtil import getNodeCount, getDataNodeUrl from .util.httpUtil import getObjectClass, http_post, http_put, http_delete from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse -from .util.idUtil import getDataNodeUrl, createObjId, getCollectionForId -from .util.idUtil import isValidUuid, isSchema2Id, getNodeCount from .util.authUtil import getUserPasswordFromRequest, aclCheck, isAdminUser from .util.authUtil import validateUserPassword, getAclKeys from .util.domainUtil import getParentDomain, getDomainFromRequest diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index 34a8ff6f..60d1037b 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -17,8 +17,8 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response +from h5json.objid import isValidUuid, validateUuid -from .util.idUtil import isValidUuid, validateUuid from .util.domainUtil import isValidBucketName from .util.timeUtil import getNow from .datanode_lib import get_obj_id, check_metadata_obj, get_metadata_obj diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 5b729afb..689c2c7e 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -19,7 +19,9 @@ from h5json.hdf5dtype import createDataType, getItemSize from h5json.array_util import getNumpyValue +from h5json.objid import isSchema2Id, getS3Key, getObjId +from .util.nodeUtil import getDataNodeUrl from .util.boolparser import BooleanParser from .util.dsetUtil import isNullSpace, getDatasetLayout, getDatasetLayoutClass, get_slices from .util.dsetUtil import getChunkLayout, getSelectionShape, getShapeDims @@ -28,7 +30,6 @@ from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getQueryDtype, get_chunktable_dims from .util.httpUtil import http_delete, http_put -from .util.idUtil import getDataNodeUrl, isSchema2Id, getS3Key, getObjId from .util.rangegetUtil import getHyperChunkFactors from .util.storUtil import getStorKeys diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 721970fb..77e85db0 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -20,10 +20,10 @@ from h5json.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson, getItemSize from h5json.array_util import getNumElements, getNumpyValue +from h5json.objid import isValidUuid, isSchema2Id from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse, getBooleanParam -from .util.idUtil import isValidUuid, isSchema2Id from .util.dsetUtil import getPreviewQuery, getFilterItem, getShapeDims from .util.chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk from 
.util.chunkUtil import getContiguousLayout diff --git a/hsds/folder_crawl.py b/hsds/folder_crawl.py index 48f37ce6..05048758 100644 --- a/hsds/folder_crawl.py +++ b/hsds/folder_crawl.py @@ -19,8 +19,9 @@ from aiohttp.web_exceptions import HTTPGone, HTTPInternalServerError from aiohttp.web_exceptions import HTTPServiceUnavailable -from .util.idUtil import getNodeCount from .servicenode_lib import getObjectJson, getDomainResponse, getDomainJson +from .util.nodeUtil import getNodeCount + from . import hsds_logger as log diff --git a/hsds/group_dn.py b/hsds/group_dn.py index 0a6bb937..d67f672e 100755 --- a/hsds/group_dn.py +++ b/hsds/group_dn.py @@ -19,7 +19,8 @@ from aiohttp.web_exceptions import HTTPNotFound, HTTPServiceUnavailable from aiohttp.web import json_response -from .util.idUtil import isValidUuid, isSchema2Id, isRootObjId, getRootObjId +from h5json.objid import isValidUuid, isSchema2Id, isRootObjId, getRootObjId + from .util.domainUtil import isValidBucketName from .util.timeUtil import getNow from .datanode_lib import get_obj_id, check_metadata_obj, get_metadata_obj diff --git a/hsds/group_sn.py b/hsds/group_sn.py index 2b573985..c857683e 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -16,8 +16,9 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound from json import JSONDecodeError +from h5json.objid import isValidUuid + from .util.httpUtil import getHref, jsonResponse, getBooleanParam -from .util.idUtil import isValidUuid from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain diff --git a/hsds/headnode.py b/hsds/headnode.py index 9b49517d..354a17bc 100755 --- a/hsds/headnode.py +++ b/hsds/headnode.py @@ -22,7 +22,7 @@ from . import config from .util.timeUtil import unixTimeToUTC, elapsedTime -from .util.idUtil import createNodeId +from .util.nodeUtil import createNodeId from . 
import hsds_logger as log from .util import query_marathon as marathonClient diff --git a/hsds/link_dn.py b/hsds/link_dn.py index e53984ed..09b3ac20 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -20,7 +20,8 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response -from .util.idUtil import isValidUuid +from h5json.objid import isValidUuid + from .util.globparser import globmatch from .util.linkUtil import validateLinkName, getLinkClass, isEqualLink from .util.domainUtil import isValidBucketName diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 71e39246..b7b36ef7 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -16,10 +16,12 @@ from aiohttp.web_exceptions import HTTPBadRequest from json import JSONDecodeError +from h5json.objid import isValidUuid, getCollectionForId + +from .util.nodeUtil import getDataNodeUrl from .util.httpUtil import getHref, getBooleanParam from .util.httpUtil import jsonResponse from .util.globparser import globmatch -from .util.idUtil import isValidUuid, getDataNodeUrl, getCollectionForId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain, verifyRoot from .util.domainUtil import getBucketForDomain diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 3d65e619..d2db9d4d 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -22,10 +22,11 @@ from aiohttp import ClientResponseError from h5json.array_util import encodeData +from h5json.objid import getCollectionForId, createObjId, getRootObjId +from h5json.objid import isSchema2Id, getS3Key, isValidUuid +from .util.nodeUtil import getDataNodeUrl from .util.authUtil import getAclKeys -from .util.idUtil import getDataNodeUrl, getCollectionForId, createObjId, getRootObjId -from .util.idUtil import isSchema2Id, getS3Key, isValidUuid from .util.linkUtil import h5Join, validateLinkName, getLinkClass from .util.storUtil import getStorJSONObj, isStorObj from .util.authUtil import aclCheck diff --git a/hsds/util/httpUtil.py b/hsds/util/httpUtil.py index 0d43ae4a..3ca19f19 100644 --- a/hsds/util/httpUtil.py +++ b/hsds/util/httpUtil.py @@ -25,7 +25,8 @@ from aiohttp.web_exceptions import HTTPRequestEntityTooLarge from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPBadRequest from aiohttp.client_exceptions import ClientError -from hsds.util.idUtil import isValidUuid + +from h5json.objid import isValidUuid from .. import hsds_logger as log from .. import config diff --git a/hsds/util/idUtil.py b/hsds/util/idUtil.py deleted file mode 100644 index fe21bbb0..00000000 --- a/hsds/util/idUtil.py +++ /dev/null @@ -1,540 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## -# -# idUtil: -# id (uuid) related functions -# - -import os.path -import hashlib -import uuid -from aiohttp.web_exceptions import HTTPServiceUnavailable -from .. 
import hsds_logger as log
-
-
-S3_URI = "s3://"
-FILE_URI = "file://"
-AZURE_URI = "blob.core.windows.net/"  # preceded with "https://"
-
-
-def _getStorageProtocol(uri):
-    """ returns 's3://', 'file://', or 'https://...net/' prefix if present.
-        If the prefix is in the form: https://myaccount.blob.core.windows.net/mycontainer
-        (references Azure blob storage), return: https://myaccount.blob.core.windows.net/
-        otherwise None """
-
-    if not uri:
-        protocol = None
-    elif uri.startswith(S3_URI):
-        protocol = S3_URI
-    elif uri.startswith(FILE_URI):
-        protocol = FILE_URI
-    elif uri.startswith("https://") and uri.find(AZURE_URI) > 0:
-        n = uri.find(AZURE_URI) + len(AZURE_URI)
-        protocol = uri[:n]
-    elif uri.find("://") >= 0:
-        raise ValueError(f"storage uri: {uri} not supported")
-    else:
-        protocol = None
-    return protocol
-
-
-def _getBaseName(uri):
-    """ Return the part of the URI after the storage protocol (if any) """
-
-    protocol = _getStorageProtocol(uri)
-    if not protocol:
-        return uri
-    else:
-        return uri[len(protocol):]
-
-
-def getIdHash(id):
-    """Return md5 prefix based on id value"""
-    m = hashlib.new("md5")
-    m.update(id.encode("utf8"))
-    hexdigest = m.hexdigest()
-    return hexdigest[:5]
-
-
-def isSchema2Id(id):
-    """return true if this is a v2 id"""
-    # v1 ids are in the standard UUID format: 8-4-4-4-12
-    # v2 ids are in the non-standard: 8-8-4-6-6
-    parts = id.split("-")
-    if len(parts) != 6:
-        raise ValueError(f"Unexpected id format for uuid: {id}")
-    if len(parts[2]) == 8:
-        return True
-    else:
-        return False
-
-
-def getIdHexChars(id):
-    """get the hex chars of the given id"""
-    if id[0] == "c":
-        # don't include chunk index
-        index = id.index("_")
-        parts = id[0:index].split("-")
-    else:
-        parts = id.split("-")
-    if len(parts) != 6:
-        raise ValueError(f"Unexpected id format for uuid: {id}")
-    return "".join(parts[1:])
-
-
-def hexRot(ch):
-    """rotate hex character by 8"""
-    return format((int(ch, base=16) + 8) % 16, "x")
-
-
-def isRootObjId(id):
-    """returns true if this is a root id (only for v2 schema)"""
-    if not isSchema2Id(id):
-        raise ValueError("isRootObjId can only be used with v2 ids")
-    validateUuid(id)  # will throw ValueError exception if not a objid
-    if id[0] != "g":
-        return False  # not a group
-    token = getIdHexChars(id)
-    # root ids will have last 16 chars rotated version of the first 16
-    is_root = True
-    for i in range(16):
-        if token[i] != hexRot(token[i + 16]):
-            is_root = False
-            break
-    return is_root
-
-
-def getRootObjId(id):
-    """returns root id for this objid if this is a root id
-    (only for v2 schema)
-    """
-    if isRootObjId(id):
-        return id  # this is the root id
-    token = list(getIdHexChars(id))
-    # root ids will have last 16 chars rotated version of the first 16
-    for i in range(16):
-        token[i + 16] = hexRot(token[i])
-    token = "".join(token)
-    root_id = "g-" + token[0:8] + "-" + token[8:16] + "-" + token[16:20]
-    root_id += "-" + token[20:26] + "-" + token[26:32]
-
-    return root_id
-
-
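The hex-rotation trick above is what ties every v2 id to its root. A small sketch of the invariants, using the functions that patch 05 re-imports from h5json.objid (assuming the package preserves the behavior of this deleted code):

    from h5json.objid import createObjId, getRootObjId, getS3Key, isRootObjId

    root_id = createObjId("roots")
    assert isRootObjId(root_id)

    # a dataset id embeds its root: the first 16 hex chars are shared,
    # so the root id can always be recovered from the object id
    dset_id = createObjId("datasets", rootid=root_id)
    assert getRootObjId(dset_id) == root_id

    # storage keys group objects under their root, e.g.
    # db/xxxxxxxx-xxxxxxxx/d/xxxx-xxxxxx-xxxxxx/.dataset.json
    print(getS3Key(dset_id))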
-def createObjId(obj_type, rootid=None):
-    if obj_type not in ("groups", "datasets", "datatypes", "chunks", "roots"):
-        raise ValueError("unexpected obj_type")
-
-    prefix = None
-    if obj_type == "datatypes":
-        prefix = "t"  # don't collide with datasets
-    elif obj_type == "roots":
-        prefix = "g"  # root obj is a group
-    else:
-        prefix = obj_type[0]
-    if not rootid and obj_type != "roots":
-        # v1 schema - folder
-        objid = prefix + "-" + str(uuid.uuid1())
-    elif rootid and not isSchema2Id(rootid):
-        # v1 schema - domain
-        objid = prefix + "-" + str(uuid.uuid1())
-    else:
-        # schema v2
-        salt = uuid.uuid4().hex
-        # take a hash to randomize the uuid
-        token = list(hashlib.sha256(salt.encode()).hexdigest())
-
-        if rootid:
-            # replace first 16 chars of token with first 16 chars of root id
-            root_hex = getIdHexChars(rootid)
-            token[0:16] = root_hex[0:16]
-        else:
-            # obj_type == "roots"
-            # use only 16 chars, but make it look a 32 char id
-            for i in range(16):
-                token[16 + i] = hexRot(token[i])
-        # format as a string
-        token = "".join(token)
-        objid = prefix + "-" + token[0:8] + "-" + token[8:16] + "-"
-        objid += token[16:20] + "-" + token[20:26] + "-" + token[26:32]
-
-    return objid
-
-
-def getS3Key(id):
-    """Return s3 key for given id.
-
-    For schema v1:
-        A md5 prefix is added to the front of the returned key to better
-        distribute S3 objects.
-    For schema v2:
-        The id is converted to the pattern: "db/{rootid[0:16]}" for rootids and
-        "db/id[0:16]/{prefix}/id[16-32]" for other ids
-        Chunk ids have the chunk index added after the slash:
-        "db/id[0:16]/d/id[16:32]/x_y_z"
-
-    For domain id's:
-        Return a key with the .domain suffix and no preceding slash.
-        For non-default buckets, use the format: <bucket_name>/s3_key
-        If the id has a storage specifier ("s3://", "file://", etc.)
-        include that along with the bucket name.  e.g.: "s3://mybucket/a_folder/a_file.h5"
-    """
-
-    base_id = _getBaseName(id)  # strip any s3://, etc.
-    if base_id.find("/") > 0:
-        # a domain id
-        domain_suffix = ".domain.json"
-        index = base_id.find("/") + 1
-        key = base_id[index:]
-        if not key.endswith(domain_suffix):
-            if key[-1] != "/":
-                key += "/"
-            key += domain_suffix
-    else:
-        if isSchema2Id(id):
-            # schema v2 id
-            hexid = getIdHexChars(id)
-            prefix = id[0]  # one of g, d, t, c
-            if prefix not in ("g", "d", "t", "c"):
-                raise ValueError(f"Unexpected id: {id}")
-
-            if isRootObjId(id):
-                key = f"db/{hexid[0:8]}-{hexid[8:16]}"
-            else:
-                partition = ""
-                if prefix == "c":
-                    # use 'd' so that chunks will show up under their dataset
-                    s3col = "d"
-                    n = id.find("-")
-                    if n > 1:
-                        # extract the partition index if present
-                        partition = "p" + id[1:n]
-                else:
-                    s3col = prefix
-                key = f"db/{hexid[0:8]}-{hexid[8:16]}/{s3col}/{hexid[16:20]}"
-                key += f"-{hexid[20:26]}-{hexid[26:32]}"
-            if prefix == "c":
-                if partition:
-                    key += "/"
-                    key += partition
-                # add the chunk coordinate
-                index = id.index("_")  # will raise ValueError if not found
-                n = index + 1
-                coord = id[n:]
-                key += "/"
-                key += coord
-            elif prefix == "g":
-                # add key suffix for group
-                key += "/.group.json"
-            elif prefix == "d":
-                # add key suffix for dataset
-                key += "/.dataset.json"
-            else:
-                # add key suffix for datatype
-                key += "/.datatype.json"
-        else:
-            # schema v1 id
-            idhash = getIdHash(id)
-            key = f"{idhash}-{id}"
-
-    return key
-
-
-def getObjId(s3key):
-    """Return object id given valid s3key"""
-    if all(
-        (
-            len(s3key) >= 44 and s3key[0:5].isalnum(),
-            len(s3key) >= 44 and s3key[5] == "-",
-            len(s3key) >= 44 and s3key[6] in ("g", "d", "c", "t"),
-        )
-    ):
-        # v1 obj keys
-        objid = s3key[6:]
-    elif s3key.endswith("/.domain.json"):
-        objid = "/" + s3key[: -(len("/.domain.json"))]
-    elif s3key.startswith("db/"):
-        # schema v2 object key
-        parts = s3key.split("/")
-        chunk_coord = ""  # used only for chunk ids
-        partition = ""  # likewise
-        token = []
-        for ch in parts[1]:
-            if ch != "-":
-                token.append(ch)
-
-        if len(parts) == 3:
-            # root id
-            # last part should be ".group.json"
-            if parts[2] != ".group.json":
-                raise ValueError(f"unexpected S3Key: {s3key}")
-            # add 16 more chars using rotated version of first 16
-            for i in 
range(16): - token.append(hexRot(token[i])) - prefix = "g" - elif len(parts) == 5: - # group, dataset, or datatype or chunk - for ch in parts[3]: - if ch != "-": - token.append(ch) - - if parts[2] == "g" and parts[4] == ".group.json": - prefix = "g" # group json - elif parts[2] == "t" and parts[4] == ".datatype.json": - prefix = "t" # datatype json - elif parts[2] == "d": - if parts[4] == ".dataset.json": - prefix = "d" # dataset json - else: - # chunk object - prefix = "c" - chunk_coord = "_" + parts[4] - else: - raise ValueError(f"unexpected S3Key: {s3key}") - elif len(parts) == 6: - # chunk key with partitioning - for ch in parts[3]: - if ch != "-": - token.append(ch) - if parts[2][0] != "d": - raise ValueError(f"unexpected S3Key: {s3key}") - prefix = "c" - partition = parts[4] - if partition[0] != "p": - raise ValueError(f"unexpected S3Key: {s3key}") - partition = partition[1:] # strip off the p - chunk_coord = "_" + parts[5] - else: - raise ValueError(f"unexpected S3Key: {s3key}") - - token = "".join(token) - objid = prefix + partition + "-" + token[0:8] + "-" + token[8:16] - objid += "-" + token[16:20] + "-" + token[20:26] + "-" - objid += token[26:32] + chunk_coord - else: - msg = f"unexpected S3Key: {s3key}" - log.warn(msg) - raise ValueError(msg) - return objid - - -def isS3ObjKey(s3key): - valid = False - try: - objid = getObjId(s3key) - if objid: - valid = True - except KeyError: - pass # ignore - except ValueError: - pass # ignore - return valid - - -def createNodeId(prefix, node_number=None): - """Create a random id used to identify nodes""" - node_id = "" # nothing too bad happens if this doesn't get set - if node_number is not None: - # just make an id based on the node_number - hash_key = f"{node_number + 1:03d}" - else: - # use the container id if we are running inside docker - hash_key = getIdHash(str(uuid.uuid1())) - proc_file = "/proc/self/cgroup" - if os.path.isfile(proc_file): - with open(proc_file) as f: - first_line = f.readline() - if first_line: - fields = first_line.split(":") - if len(fields) >= 3: - field = fields[2] - if field.startswith("/docker/"): - docker_len = len("/docker/") - - if len(field) > docker_len + 12: - n = docker_len - m = n + 12 - node_id = field[n:m] - - if node_id: - key = f"{prefix}-{node_id}-{hash_key}" - else: - key = f"{prefix}-{hash_key}" - return key - - -def getCollectionForId(obj_id): - """return groups/datasets/datatypes based on id""" - if not isinstance(obj_id, str): - raise ValueError("invalid object id") - collection = None - if obj_id.startswith("g-"): - collection = "groups" - elif obj_id.startswith("d-"): - collection = "datasets" - elif obj_id.startswith("t-"): - collection = "datatypes" - else: - raise ValueError("not a collection id") - return collection - - -def validateUuid(id, obj_class=None): - if not isinstance(id, str): - raise ValueError("Expected string type") - if len(id) < 38: - # id should be prefix (e.g. 
"g-") and uuid value - raise ValueError("Unexpected id length") - if id[0] not in ("g", "d", "t", "c"): - raise ValueError("Unexpected prefix") - if id[0] != "c" and id[1] != "-": - # chunk ids may have a partition index following the c - raise ValueError("Unexpected prefix") - if obj_class is not None: - obj_class = obj_class.lower() - prefix = obj_class[0] - if obj_class.startswith("datatype"): - prefix = "t" - if id[0] != prefix: - raise ValueError(f"Unexpected prefix for class: {obj_class}") - if id[0] == "c": - # trim the type char and any partition id - n = id.find("-") - if n == -1: - raise ValueError("Invalid chunk id") - - # trim the chunk index for chunk ids - m = id.find("_") - if m == -1: - raise ValueError("Invalid chunk id") - n += 1 - id = "c-" + id[n:m] - if len(id) != 38: - # id should be 36 now - raise ValueError("Unexpected id length") - - for ch in id: - if ch.isalnum(): - continue - if ch == "-": - continue - raise ValueError(f"Unexpected character in uuid: {ch}") - - -def isValidUuid(id, obj_class=None): - try: - validateUuid(id, obj_class) - return True - except ValueError: - return False - - -def isValidChunkId(id): - if not isValidUuid(id): - return False - if id[0] != "c": - return False - return True - - -def getClassForObjId(id): - """return domains/chunks/groups/datasets/datatypes based on id""" - if not isinstance(id, str): - raise ValueError("Expected string type") - if len(id) == 0: - raise ValueError("Empty string") - if id[0] == "/": - return "domains" - if isValidChunkId(id): - return "chunks" - else: - return getCollectionForId(id) - - -def isObjId(id): - """return true if uuid or domain""" - if not isinstance(id, str) or len(id) == 0: - return False - if id.find("/") > 0: - # domain id is any string in the form / - return True - return isValidUuid(id) - - -def getUuidFromId(id): - """strip off the type prefix ('g-' or 'd-', or 't-') - and return the uuid part""" - return id[2:] - - -def getObjPartition(id, count): - """Get the id of the dn node that should be handling the given obj id""" - hash_code = getIdHash(id) - hash_value = int(hash_code, 16) - number = hash_value % count - return number - - -def getNodeNumber(app): - if app["node_type"] == "sn": - log.error("node number if only for DN nodes") - raise ValueError() - - dn_ids = app["dn_ids"] - log.debug(f"getNodeNumber(from dn_ids: {dn_ids})") - for i in range(len(dn_ids)): - dn_id = dn_ids[i] - if dn_id == app["id"]: - log.debug(f"returning nodeNumber: {i}") - return i - log.error("getNodeNumber, no matching id") - return -1 - - -def getNodeCount(app): - dn_urls = app["dn_urls"] - log.debug(f"getNodeCount for dn_urls: {dn_urls}") - dn_node_count = len(dn_urls) - return dn_node_count - - -def validateInPartition(app, obj_id): - node_number = getNodeNumber(app) - node_count = getNodeCount(app) - msg = f"obj_id: {obj_id}, node_count: {node_count}, " - msg += f"node_number: {node_number}" - log.debug(msg) - partition_number = getObjPartition(obj_id, node_count) - if partition_number != node_number: - # The request shouldn't have come to this node' - msg = f"wrong node for 'id':{obj_id}, expected node {node_number} " - msg += f"got {partition_number}" - log.error(msg) - raise KeyError(msg) - - -def getDataNodeUrl(app, obj_id): - """Return host/port for datanode for given obj_id. 
- Throw exception if service is not ready""" - dn_urls = app["dn_urls"] - dn_node_count = getNodeCount(app) - node_state = app["node_state"] - if node_state != "READY" or dn_node_count <= 0: - msg = "Service not ready" - log.warn(msg) - raise HTTPServiceUnavailable() - dn_number = getObjPartition(obj_id, dn_node_count) - url = dn_urls[dn_number] - log.debug(f"got dn_url: {url} for obj_id: {obj_id}") - return url diff --git a/testall.py b/testall.py index 247d4a91..4123d87f 100755 --- a/testall.py +++ b/testall.py @@ -16,7 +16,7 @@ PYTHON_CMD = "python" # change to "python3" if "python" invokes python version 2.x unit_tests = ('chunk_util_test', 'compression_test', 'domain_util_test', - 'dset_util_test', 'id_util_test', 'lru_cache_test', + 'dset_util_test', 'lru_cache_test', 'shuffle_test', 'rangeget_util_test') integ_tests = ('uptest', 'setup_test', 'domain_test', 'group_test', diff --git a/tests/integ/vlen_test.py b/tests/integ/vlen_test.py index 38555c5d..d318c1e7 100755 --- a/tests/integ/vlen_test.py +++ b/tests/integ/vlen_test.py @@ -23,7 +23,6 @@ class VlenTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(VlenTest, self).__init__(*args, **kwargs) self.base_domain = helper.getTestDomainName(self.__class__.__name__) - print(self.base_domain) helper.setupDomain(self.base_domain) self.endpoint = helper.getEndpoint() @@ -645,12 +644,8 @@ def testPutVLenCompoundBinary(self): # write as binary data data = arrayToBytes(arr) - print("data:", data) - for i in range(len(data)): - print(f"{i:04d}: {data[i]}") self.assertEqual(len(data), 192) # will vary based on count arr_copy = bytesToArray(data, dt_compound, (count,)) - print("arr_copy:", arr_copy) req = self.endpoint + "/datasets/" + dset_uuid + "/value" rsp = self.session.put(req, data=data, headers=headers_bin_req) self.assertEqual(rsp.status_code, 200) diff --git a/tests/unit/id_util_test.py b/tests/unit/id_util_test.py deleted file mode 100755 index 06f974c4..00000000 --- a/tests/unit/id_util_test.py +++ /dev/null @@ -1,212 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. 
# -############################################################################## -import unittest -import sys - -sys.path.append("../..") -from hsds.util.idUtil import getObjPartition, isValidUuid, validateUuid -from hsds.util.idUtil import createObjId, getCollectionForId -from hsds.util.idUtil import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id -from hsds.util.idUtil import isRootObjId, getRootObjId - - -class IdUtilTest(unittest.TestCase): - def __init__(self, *args, **kwargs): - super(IdUtilTest, self).__init__(*args, **kwargs) - # main - - def testCreateObjId(self): - id_len = 38 # 36 for uuid plus two for prefix ("g-", "d-") - ids = set() - for obj_class in ("groups", "datasets", "datatypes", "chunks"): - for i in range(100): - id = createObjId(obj_class) - self.assertEqual(len(id), id_len) - self.assertTrue(id[0] in ("g", "d", "t", "c")) - self.assertEqual(id[1], "-") - ids.add(id) - - self.assertEqual(len(ids), 400) - try: - createObjId("bad_class") - self.assertTrue(False) # should throw exception - except ValueError: - pass # expected - - def testIsValidUuid(self): - group1_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" # orig schema - group2_id = "g-314d61b8-995411e6-a733-3c15c2-da029e" - root_id = "g-f9aaa28e-d42e10e5-7122-2a065c-a6986d" - dataset1_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" # orig schema - dataset2_id = "d-4c48f3ae-995411e6-a3cd-3c15c2-da029e" - ctype1_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" # orig schema - ctype2_id = "t-8c785f1c-995311e6-9bc2-0242ac-110005" - chunk1_id = "c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2" # orig schema - chunk2_id = "c-8c785f1c-995311e6-9bc2-0242ac-110005_7_2" - domain_id = "mybucket/bob/mydata.h5" - s3_domain_id = "s3://mybucket/bob/mydata.h5" - file_domain_id = "file://mybucket/bob/mydata.h5" - azure_domain_id = "https://myaccount.blob.core.windows.net/mybucket/bob/mydata.h5" - valid_id_map = { - group1_id: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", - group2_id: "db/314d61b8-995411e6/g/a733-3c15c2-da029e/.group.json", - dataset1_id: "26928-d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e", - dataset2_id: "db/4c48f3ae-995411e6/d/a3cd-3c15c2-da029e/.dataset.json", - ctype1_id: "5a9cf-t-8c785f1c-9953-11e6-9bc2-0242ac110005", - ctype2_id: "db/8c785f1c-995311e6/t/9bc2-0242ac-110005/.datatype.json", - chunk1_id: "dc4ce-c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2", - chunk2_id: "db/8c785f1c-995311e6/d/9bc2-0242ac-110005/7_2", - domain_id: "bob/mydata.h5/.domain.json", - s3_domain_id: "bob/mydata.h5/.domain.json", - file_domain_id: "bob/mydata.h5/.domain.json", - azure_domain_id: "bob/mydata.h5/.domain.json", } - - bad_ids = ("g-1e76d862", "/bob/mydata.h5") - - self.assertTrue(isValidUuid(group1_id)) - self.assertFalse(isSchema2Id(group1_id)) - self.assertTrue(isValidUuid(group1_id, obj_class="Group")) - self.assertTrue(isValidUuid(group1_id, obj_class="group")) - self.assertTrue(isValidUuid(group1_id, obj_class="groups")) - self.assertTrue(isSchema2Id(root_id)) - self.assertTrue(isValidUuid(root_id, obj_class="Group")) - self.assertTrue(isValidUuid(root_id, obj_class="group")) - self.assertTrue(isValidUuid(root_id, obj_class="groups")) - self.assertTrue(isRootObjId(root_id)) - self.assertTrue(isValidUuid(dataset1_id, obj_class="datasets")) - self.assertFalse(isSchema2Id(dataset1_id)) - self.assertTrue(isValidUuid(ctype1_id, obj_class="datatypes")) - self.assertFalse(isSchema2Id(ctype1_id)) - self.assertTrue(isValidUuid(chunk1_id, obj_class="chunks")) - self.assertFalse(isSchema2Id(chunk1_id)) - 
self.assertTrue(isValidUuid(group2_id)) - self.assertTrue(isSchema2Id(group2_id)) - self.assertTrue(isValidUuid(group2_id, obj_class="Group")) - self.assertTrue(isValidUuid(group2_id, obj_class="group")) - self.assertTrue(isValidUuid(group2_id, obj_class="groups")) - self.assertFalse(isRootObjId(group2_id)) - self.assertTrue(isValidUuid(dataset2_id, obj_class="datasets")) - self.assertTrue(isSchema2Id(dataset2_id)) - self.assertTrue(isValidUuid(ctype2_id, obj_class="datatypes")) - self.assertTrue(isSchema2Id(ctype2_id)) - self.assertTrue(isValidUuid(chunk2_id, obj_class="chunks")) - self.assertTrue(isSchema2Id(chunk2_id)) - validateUuid(group1_id) - try: - isRootObjId(group1_id) - self.assertTrue(False) - except ValueError: - # only works for v2 schema - pass # expected - - for item in valid_id_map: - self.assertTrue(isObjId(item)) - s3key = getS3Key(item) - self.assertTrue(s3key[0] != "/") - self.assertTrue(isS3ObjKey(s3key)) - expected = valid_id_map[item] - self.assertEqual(s3key, expected) - if item.find("/") > 0: - continue # bucket name gets lost when domain ids get converted to s3keys - objid = getObjId(s3key) - self.assertEqual(objid, item) - for item in bad_ids: - self.assertFalse(isValidUuid(item)) - self.assertFalse(isObjId(item)) - - def testGetObjPartition(self): - node_count = 12 - for obj_class in ("groups", "datasets", "datatypes", "chunks"): - for i in range(100): - id = createObjId(obj_class) - node_number = getObjPartition(id, node_count) - self.assertTrue(node_number >= 0) - self.assertTrue(node_number < node_count) - # try a domain partition - node_number = getObjPartition("/home/test_user1", node_count) - self.assertTrue(node_number >= 0) - self.assertTrue(node_number < node_count) - - def testGetCollection(self): - group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" - dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" - ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" - bad_id = "x-59647858-9954-11e6-95d2-3c15c2da029e" - self.assertEqual(getCollectionForId(group_id), "groups") - self.assertEqual(getCollectionForId(dataset_id), "datasets") - self.assertEqual(getCollectionForId(ctype_id), "datatypes") - try: - getCollectionForId(bad_id) - self.assertTrue(False) - except ValueError: - pass # expected - try: - getCollectionForId(None) - self.assertTrue(False) - except ValueError: - pass # expected - - def testSchema2Id(self): - root_id = createObjId("roots") - group_id = createObjId("groups", rootid=root_id) - dataset_id = createObjId("datasets", rootid=root_id) - ctype_id = createObjId("datatypes", rootid=root_id) - - self.assertEqual(getCollectionForId(root_id), "groups") - self.assertEqual(getCollectionForId(group_id), "groups") - self.assertEqual(getCollectionForId(dataset_id), "datasets") - self.assertEqual(getCollectionForId(ctype_id), "datatypes") - chunk_id = "c" + dataset_id[1:] + "_1_2" - print(chunk_id) - chunk_partition_id = "c42-" + dataset_id[2:] + "_1_2" - - for id in (chunk_id, chunk_partition_id): - try: - getCollectionForId(id) - self.assertTrue(False) - except ValueError: - pass # expected - valid_ids = ( - group_id, - dataset_id, - ctype_id, - chunk_id, - chunk_partition_id, - root_id, - ) - s3prefix = getS3Key(root_id) - self.assertTrue(s3prefix.endswith("/.group.json")) - s3prefix = s3prefix[: -(len(".group.json"))] - for oid in valid_ids: - print("oid:", oid) - self.assertTrue(len(oid) >= 38) - parts = oid.split("-") - self.assertEqual(len(parts), 6) - self.assertTrue(oid[0] in ("g", "d", "t", "c")) - self.assertTrue(isSchema2Id(oid)) - if 
oid == root_id: - self.assertTrue(isRootObjId(oid)) - else: - self.assertFalse(isRootObjId(oid)) - self.assertEqual(getRootObjId(oid), root_id) - - s3key = getS3Key(oid) - print(s3key) - self.assertTrue(s3key.startswith(s3prefix)) - self.assertEqual(getObjId(s3key), oid) - self.assertTrue(isS3ObjKey(s3key)) - - -if __name__ == "__main__": - # setup test files - - unittest.main() From e1926c06c536b5f76d3af45dfb58526268a5850f Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Apr 2025 18:30:16 +0200 Subject: [PATCH 06/49] add nodeUtil.py --- hsds/util/nodeUtil.py | 122 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 hsds/util/nodeUtil.py diff --git a/hsds/util/nodeUtil.py b/hsds/util/nodeUtil.py new file mode 100644 index 00000000..d39f158c --- /dev/null +++ b/hsds/util/nodeUtil.py @@ -0,0 +1,122 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# nodeUtil: +# node (SN/DN mapping) related functions +# +import hashlib +import os.path +import uuid + +from aiohttp.web_exceptions import HTTPServiceUnavailable + +from .. import hsds_logger as log + + +def _getIdHash(id): + """Return md5 prefix based on id value""" + m = hashlib.new("md5") + m.update(id.encode("utf8")) + hexdigest = m.hexdigest() + return hexdigest[:5] + +def createNodeId(prefix, node_number=None): + """Create a random id used to identify nodes""" + node_id = "" # nothing too bad happens if this doesn't get set + if node_number is not None: + # just make an id based on the node_number + hash_key = f"{node_number + 1:03d}" + else: + # use the container id if we are running inside docker + hash_key = _getIdHash(str(uuid.uuid1())) + proc_file = "/proc/self/cgroup" + if os.path.isfile(proc_file): + with open(proc_file) as f: + first_line = f.readline() + if first_line: + fields = first_line.split(":") + if len(fields) >= 3: + field = fields[2] + if field.startswith("/docker/"): + docker_len = len("/docker/") + + if len(field) > docker_len + 12: + n = docker_len + m = n + 12 + node_id = field[n:m] + + if node_id: + key = f"{prefix}-{node_id}-{hash_key}" + else: + key = f"{prefix}-{hash_key}" + return key + + +def getObjPartition(id, count): + """Get the number of the dn node that should be handling the given obj id""" + hash_code = _getIdHash(id) + hash_value = int(hash_code, 16) + number = hash_value % count + return number + + +def getNodeNumber(app): + if app["node_type"] == "sn": + log.error("node number is only for DN nodes") + raise ValueError() + + dn_ids = app["dn_ids"] + log.debug(f"getNodeNumber(from dn_ids: {dn_ids})") + for i in range(len(dn_ids)): + dn_id = dn_ids[i] + if dn_id == app["id"]: + log.debug(f"returning nodeNumber: {i}") + return i + log.error("getNodeNumber, no matching id") + return -1 + +def getNodeCount(app): + dn_urls = app["dn_urls"] + log.debug(f"getNodeCount for dn_urls: {dn_urls}") + dn_node_count = len(dn_urls) + return dn_node_count + + +def validateInPartition(app, obj_id): + 
node_number = getNodeNumber(app) + node_count = getNodeCount(app) + msg = f"obj_id: {obj_id}, node_count: {node_count}, " + msg += f"node_number: {node_number}" + log.debug(msg) + partition_number = getObjPartition(obj_id, node_count) + if partition_number != node_number: + # The request shouldn't have come to this node + msg = f"wrong node for 'id':{obj_id}, expected node {node_number} " + msg += f"got {partition_number}" + log.error(msg) + raise KeyError(msg) + + +def getDataNodeUrl(app, obj_id): + """Return host/port for datanode for given obj_id. + Throw exception if service is not ready""" + dn_urls = app["dn_urls"] + dn_node_count = getNodeCount(app) + node_state = app["node_state"] + if node_state != "READY" or dn_node_count <= 0: + msg = "Service not ready" + log.warn(msg) + raise HTTPServiceUnavailable() + dn_number = getObjPartition(obj_id, dn_node_count) + url = dn_urls[dn_number] + log.debug(f"got dn_url: {url} for obj_id: {obj_id}") + return url From ae4579ff98d9169acf5557144618212de5f32ca1 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Apr 2025 18:54:06 +0200 Subject: [PATCH 07/49] fix parameter for createObjId call --- hsds/servicenode_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index d2db9d4d..80835bcb 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -1076,7 +1076,7 @@ async def createObject(app, if creation_props: log.debug(f" cprops: {creation_props}") - obj_id = createObjId(collection, rootid=root_id) + obj_id = createObjId(collection, root_id=root_id) log.info(f"new obj id: {obj_id}") obj_json = {"id": obj_id, "root": root_id} if obj_type: From d6cad74320b18d491ec2424d8c267849026eb31d Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Apr 2025 19:00:30 +0200 Subject: [PATCH 08/49] fix collection name for use with h5json --- hsds/domain_sn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index efbc31ab..4436db37 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -985,7 +985,7 @@ async def PUT_Domain(request): if not is_folder and not linked_json: # create a root group for the new domain - root_id = createObjId("roots") + root_id = createObjId("groups") log.debug(f"new root group id: {root_id}") group_json = {"id": root_id, "root": root_id, "domain": domain} log.debug(f"create group for domain, body: {group_json}") From 6add48a9185cb4eb1bc0d2d9d20458fad8c4bd5f Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Apr 2025 19:38:21 +0200 Subject: [PATCH 09/49] use consistent collection name for isValidUuid --- hsds/ctype_dn.py | 4 ++-- hsds/ctype_sn.py | 8 ++++---- hsds/dset_dn.py | 8 ++++---- hsds/group_dn.py | 10 +++++----- hsds/link_dn.py | 8 ++++---- hsds/link_sn.py | 10 +++++----- hsds/servicenode_lib.py | 2 +- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/hsds/ctype_dn.py b/hsds/ctype_dn.py index fe8a67a7..0d0f83e3 100755 --- a/hsds/ctype_dn.py +++ b/hsds/ctype_dn.py @@ -34,7 +34,7 @@ async def GET_Datatype(request): params = request.rel_url.query ctype_id = get_obj_id(request) - if not isValidUuid(ctype_id, obj_class="type"): + if not isValidUuid(ctype_id, obj_class="datatypes"): log.error(f"Unexpected type_id: {ctype_id}") raise HTTPInternalServerError() @@ -91,7 +91,7 @@ async def POST_Datatype(request): raise HTTPBadRequest(reason=msg) ctype_id = get_obj_id(request, body=body) - if not isValidUuid(ctype_id, obj_class="datatype"): + if not isValidUuid(ctype_id, 
obj_class="datatypes"): log.error("Unexpected type_id: {ctype_id}") raise HTTPInternalServerError() diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index d85ffc07..2030b63b 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -51,7 +51,7 @@ async def GET_Datatype(request): include_attrs = True if ctype_id: - if not isValidUuid(ctype_id, "Type"): + if not isValidUuid(ctype_id, "datatypes"): msg = f"Invalid type id: {ctype_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -62,7 +62,7 @@ async def GET_Datatype(request): group_id = None if "grpid" in params: group_id = params["grpid"] - if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, "groups"): msg = f"Invalid parent group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -103,7 +103,7 @@ async def GET_Datatype(request): # throws 404 if not found kwargs = {"bucket": bucket, "domain": domain} ctype_id, domain, _ = await getObjectIdByPath(app, group_id, h5path, **kwargs) - if not isValidUuid(ctype_id, "Datatype"): + if not isValidUuid(ctype_id, "datatypes"): msg = f"No datatype exist with the path: {h5path}" log.warn(msg) raise HTTPGone() @@ -273,7 +273,7 @@ async def DELETE_Datatype(request): msg = "Missing committed type id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(ctype_id, "Type"): + if not isValidUuid(ctype_id, "datatypes"): msg = f"Invalid committed type id: {ctype_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index 60d1037b..dd761365 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -33,7 +33,7 @@ async def GET_Dataset(request): params = request.rel_url.query dset_id = get_obj_id(request) - if not isValidUuid(dset_id, obj_class="dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): log.error(f"Unexpected dataset_id: {dset_id}") raise HTTPInternalServerError() if "bucket" in params: @@ -94,7 +94,7 @@ async def POST_Dataset(request): raise HTTPBadRequest(reason=msg) dset_id = get_obj_id(request, body=body) - if not isValidUuid(dset_id, obj_class="dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): log.error(f"Unexpected dataset_id: {dset_id}") raise HTTPInternalServerError() @@ -176,7 +176,7 @@ async def DELETE_Dataset(request): dset_id = request.match_info.get("id") log.info(f"DELETE dataset: {dset_id}") - if not isValidUuid(dset_id, obj_class="dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): log.error(f"Unexpected dataset id: {dset_id}") raise HTTPInternalServerError() @@ -220,7 +220,7 @@ async def PUT_DatasetShape(request): params = request.rel_url.query dset_id = request.match_info.get("id") - if not isValidUuid(dset_id, obj_class="dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): log.error(f"Unexpected dset_id: {dset_id}") raise HTTPInternalServerError() diff --git a/hsds/group_dn.py b/hsds/group_dn.py index d67f672e..8a4f10f8 100755 --- a/hsds/group_dn.py +++ b/hsds/group_dn.py @@ -47,7 +47,7 @@ async def GET_Group(request): log.info(f"GET group: {group_id} bucket: {bucket}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() @@ -100,7 +100,7 @@ async def POST_Group(request): group_id = get_obj_id(request, body=body) log.info(f"POST group: {group_id} bucket: {bucket}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise 
HTTPInternalServerError() if "root" not in body: @@ -116,7 +116,7 @@ async def POST_Group(request): root_id = body["root"] - if not isValidUuid(root_id, obj_class="group"): + if not isValidUuid(root_id, obj_class="groups"): msg = "Invalid root_id: " + root_id log.error(msg) raise HTTPInternalServerError() @@ -179,7 +179,7 @@ async def PUT_Group(request): # don't really need bucket param since the dirty ids know which bucket # they should write too - if not isValidUuid(root_id, obj_class="group"): + if not isValidUuid(root_id, obj_class="groups"): log.error(f"Unexpected group_id: {root_id}") raise HTTPInternalServerError() @@ -248,7 +248,7 @@ async def DELETE_Group(request): params = request.rel_url.query group_id = get_obj_id(request) - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() diff --git a/hsds/link_dn.py b/hsds/link_dn.py index 09b3ac20..1ad6133e 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -75,7 +75,7 @@ async def GET_Links(request): log.debug(f"GET_Links params: {params}") group_id = get_obj_id(request) log.info(f"GET links: {group_id}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() @@ -171,7 +171,7 @@ async def POST_Links(request): group_id = get_obj_id(request) log.info(f"POST_Links: {group_id}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() @@ -286,7 +286,7 @@ async def PUT_Links(request): group_id = get_obj_id(request) log.info(f"PUT links: {group_id}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() @@ -405,7 +405,7 @@ async def DELETE_Links(request): group_id = get_obj_id(request) log.info(f"DELETE links: {group_id}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Unexpected group_id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/link_sn.py b/hsds/link_sn.py index b7b36ef7..dc80d9e4 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -45,7 +45,7 @@ async def GET_Links(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, obj_class="Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -177,7 +177,7 @@ async def GET_Link(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, obj_class="Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -495,7 +495,7 @@ async def DELETE_Links(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, obj_class="Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -642,7 +642,7 @@ async def POST_Links(request): # do a check that everything is as it should with the item list for group_id in items: - if not isValidUuid(group_id, obj_class="Group"): + if not 
isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) @@ -749,7 +749,7 @@ async def DELETE_Link(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, obj_class="Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 80835bcb..bb40620a 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -527,7 +527,7 @@ async def putLinks(app, group_id, items, bucket=None): """ create a new links. Return 201 if any item is a new link, or 200 if it's a duplicate of an existing link. """ - isValidUuid(group_id, obj_class="group") + isValidUuid(group_id, obj_class="groups") group_json = None # validate input From b13321cd97bb5e81efb63958dd5cbd2b8e4d826b Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Apr 2025 19:48:00 +0200 Subject: [PATCH 10/49] fix flake8 format errors --- hsds/basenode.py | 1 - hsds/util/nodeUtil.py | 2 ++ tests/integ/vlen_test.py | 2 -- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hsds/basenode.py b/hsds/basenode.py index 6dd83b64..1b1bbbd0 100644 --- a/hsds/basenode.py +++ b/hsds/basenode.py @@ -26,7 +26,6 @@ from aiohttp.web_exceptions import HTTPServiceUnavailable - from . import config from .util.httpUtil import http_get, http_post, jsonResponse from .util.authUtil import getUserPasswordFromRequest, validateUserPassword diff --git a/hsds/util/nodeUtil.py b/hsds/util/nodeUtil.py index d39f158c..8f67f400 100644 --- a/hsds/util/nodeUtil.py +++ b/hsds/util/nodeUtil.py @@ -29,6 +29,7 @@ def _getIdHash(id): hexdigest = m.hexdigest() return hexdigest[:5] + def createNodeId(prefix, node_number=None): """Create a random id used to identify nodes""" node_id = "" # nothing too bad happens if this doesn't get set @@ -84,6 +85,7 @@ def getNodeNumber(app): log.error("getNodeNumber, no matching id") return -1 + def getNodeCount(app): dn_urls = app["dn_urls"] log.debug(f"getNodeCount for dn_urls: {dn_urls}") diff --git a/tests/integ/vlen_test.py b/tests/integ/vlen_test.py index d318c1e7..d3d44ab5 100755 --- a/tests/integ/vlen_test.py +++ b/tests/integ/vlen_test.py @@ -13,7 +13,6 @@ import json import helper import numpy as np -import sys from h5json.hdf5dtype import createDataType from h5json.array_util import arrayToBytes, bytesToArray @@ -645,7 +644,6 @@ def testPutVLenCompoundBinary(self): # write as binary data data = arrayToBytes(arr) self.assertEqual(len(data), 192) # will vary based on count - arr_copy = bytesToArray(data, dt_compound, (count,)) req = self.endpoint + "/datasets/" + dset_uuid + "/value" rsp = self.session.put(req, data=data, headers=headers_bin_req) self.assertEqual(rsp.status_code, 200) From fee9390ee6fd0bc60275deac8c69ec5c31847ddc Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Apr 2025 19:59:40 +0200 Subject: [PATCH 11/49] fix flake8 error in testall --- testall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testall.py b/testall.py index 4123d87f..480ab693 100755 --- a/testall.py +++ b/testall.py @@ -16,7 +16,7 @@ PYTHON_CMD = "python" # change to "python3" if "python" invokes python version 2.x unit_tests = ('chunk_util_test', 'compression_test', 'domain_util_test', - 'dset_util_test', 'lru_cache_test', + 'dset_util_test', 'lru_cache_test', 'shuffle_test', 'rangeget_util_test') integ_tests = ('uptest', 'setup_test', 'domain_test', 'group_test', 
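The patches that follow switch object id generation over to the h5json package and, starting with PATCH 14/49, also let clients mint object ids themselves and pass them in on POST. A minimal sketch of the client-side workflow this enables; the endpoint URL and empty headers dict are illustrative placeholders (the integ tests build real auth/domain headers via helper.getRequestHeaders), but the id and payload pattern follows the new tests added in the later patches:

    import json

    import requests
    from h5json.objid import createObjId

    endpoint = "http://localhost:5101"  # illustrative HSDS endpoint
    headers = {}  # domain/auth headers, e.g. as built by tests/integ/helper.py

    # get the root group id of the target domain
    rsp = requests.get(endpoint + "/", headers=headers)
    root_uuid = rsp.json()["root"]

    # mint a group id client-side, tied to the domain's root id
    grp_id = createObjId("groups", root_id=root_uuid)

    # create the group under the client-generated id, linked from the root group
    payload = {"id": grp_id, "link": {"id": root_uuid, "name": "linked_group"}}
    rsp = requests.post(endpoint + "/groups", data=json.dumps(payload), headers=headers)
    assert rsp.status_code == 201
    assert rsp.json()["id"] == grp_id

    # re-posting the same id is rejected as a 400 (bad request) rather than a 500
    rsp = requests.post(endpoint + "/groups", data=json.dumps(payload), headers=headers)
    assert rsp.status_code == 400

An id minted for one domain can't be reused in another: createObject checks that the id's embedded root hash matches the domain's root id and rejects the request otherwise.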
From f1b1cabddad9a19e93f75c597e17bbc49eb824c8 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Apr 2025 20:17:52 +0200 Subject: [PATCH 12/49] use h5json for unit test id --- tests/unit/lru_cache_test.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/unit/lru_cache_test.py b/tests/unit/lru_cache_test.py index 5e747c92..002ca822 100755 --- a/tests/unit/lru_cache_test.py +++ b/tests/unit/lru_cache_test.py @@ -14,9 +14,16 @@ import sys import numpy as np +from h5json.objid import createObjId + sys.path.append("../..") from hsds.util.lruCache import LruCache -from hsds.util.idUtil import createObjId + + +def _createId(): + objid = createObjId("groups") + objid = 'c' + objid[1:] # fake a chunk id + return objid class LruCacheTest(unittest.TestCase): @@ -34,7 +41,7 @@ def testSimple(self): self.assertFalse("xyz" in cc) - id = createObjId("chunks") + id = _createId() try: # only dict objects can be added cc[id] = list(range(20)) @@ -42,7 +49,7 @@ def testSimple(self): except TypeError: pass # expected - rand_id = createObjId("chunks") + rand_id = _createId() np_arr = np.random.random((500, 500)) # smaller than our chunk cache size cc[rand_id] = np_arr # add to cache cc.consistencyCheck() @@ -104,7 +111,7 @@ def testLRU(self): ids = [] # add chunks to the cache for i in range(10): - id = createObjId("chunks") + id = _createId() ids.append(id) arr = np.empty((16, 16), dtype="i4") # 1024 bytes arr[...] = i @@ -165,7 +172,7 @@ def testClearCache(self): ids = [] # add chunks to the cache for i in range(10): - id = createObjId("chunks") + id = _createId() ids.append(id) arr = np.empty((16, 16), dtype="i4") # 1024 bytes arr[...] = i @@ -190,7 +197,7 @@ def testMemUtil(self): self.assertEqual(len(cc), 0) ids = set() for i in range(10): - id = createObjId("chunks") + id = _createId() ids.add(id) arr = np.empty((16, 16), dtype="i4") # 1024 bytes arr[...] = i @@ -208,7 +215,7 @@ def testMemUtil(self): # add 10 more chunks, but set dirty to true each time for i in range(10): - id = createObjId("chunks") + id = _createId() ids.add(id) arr = np.empty((16, 16), dtype="i4") # 1024 bytes arr[...] 
= i @@ -255,7 +262,7 @@ def testMetaDataCache(self): data = {"x": 123, "y": 456} - rand_id = createObjId("groups") + rand_id = _createId() data = {"foo": "bar"} cc[rand_id] = data # add to cache cc.consistencyCheck() From 5dc3f761adbf528913a9eec5499fc937f597cc1e Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 24 Apr 2025 21:16:55 +0200 Subject: [PATCH 13/49] restrict version on numcodecs --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 33ab5dd5..3f1dc4de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ dependencies = [ "h5py >= 3.6.0", "h5json@git+https://github.com/HDFGroup/hdf5-json@abstract", "importlib_resources", - "numcodecs", + "numcodecs <= 0.15.1", "numpy >=2.0.0rc1; python_version>='3.9'", "psutil", "pyjwt", From fb17e1052dcd6c7bae9b032851df56985984b223 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 30 Apr 2025 19:57:32 +0200 Subject: [PATCH 14/49] allow client to generate obj ids --- hsds/ctype_dn.py | 5 +- hsds/ctype_sn.py | 9 +++ hsds/dset_dn.py | 5 +- hsds/dset_sn.py | 10 ++- hsds/group_dn.py | 5 +- hsds/group_sn.py | 8 ++ hsds/servicenode_lib.py | 25 +++++- tests/integ/dataset_test.py | 107 ++++++++++++++++++++++++++ tests/integ/datatype_test.py | 51 ++++++++++++ tests/integ/group_test.py | 145 ++++++++++++++++++++++++++++++++++- 10 files changed, 358 insertions(+), 12 deletions(-) diff --git a/hsds/ctype_dn.py b/hsds/ctype_dn.py index 0d0f83e3..b63d0a4d 100755 --- a/hsds/ctype_dn.py +++ b/hsds/ctype_dn.py @@ -98,8 +98,9 @@ async def POST_Datatype(request): # verify the id doesn't already exist obj_found = await check_metadata_obj(app, ctype_id, bucket=bucket) if obj_found: - log.error(f"Post with existing type_id: {ctype_id}") - raise HTTPInternalServerError() + msg = f"Post with existing type_id: {ctype_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) root_id = None diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index 2030b63b..dfd026f1 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -211,7 +211,12 @@ async def POST_Datatype(request): parent_id = None link_title = None + obj_id = None h5path = None + if "id" in body: + obj_id = body["id"] + log.debug(f"POST datatype using client id: {obj_id}") + if "link" in body: if "h5path" in body: msg = "link can't be used with h5path" @@ -220,6 +225,7 @@ async def POST_Datatype(request): link_body = body["link"] if "id" in link_body: parent_id = link_body["id"] + if "name" in link_body: link_title = link_body["name"] try: @@ -243,6 +249,9 @@ async def POST_Datatype(request): # setup args to createObject kwargs = {"bucket": bucket, "obj_type": datatype} + if obj_id: + kwargs["obj_id"] = obj_id + # TBD: creation props for datatype obj? 
if parent_id: kwargs["parent_id"] = parent_id diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index dd761365..159d6b63 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -101,8 +101,9 @@ async def POST_Dataset(request): # verify the id doesn't already exist obj_found = await check_metadata_obj(app, dset_id, bucket=bucket) if obj_found: - log.error("Post with existing dset_id: {}".format(dset_id)) - raise HTTPInternalServerError() + msg = f"Post with existing dset_id: {dset_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) if "root" not in body: msg = "POST_Dataset with no root" diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 77e85db0..0c0f4619 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -1098,11 +1098,16 @@ async def POST_Dataset(request): log.debug(f"setting filters to: {f_out}") creationProperties["filters"] = f_out - log.debug(f"set dataset json creationPropries: {creationProperties}") + log.debug(f"set dataset json creationProperties: {creationProperties}") parent_id = None + obj_id = None link_title = None h5path = None + if "id" in body: + obj_id = body["id"] + log.debug(f"POST dataset using client id: {obj_id}") + if "link" in body: if "h5path" in body: msg = "link can't be used with h5path" @@ -1111,6 +1116,7 @@ async def POST_Dataset(request): link_body = body["link"] if "id" in link_body: parent_id = link_body["id"] + if "name" in link_body: link_title = link_body["name"] try: @@ -1134,6 +1140,8 @@ async def POST_Dataset(request): # setup args to createObject kwargs = {"bucket": bucket, "obj_type": datatype, "obj_shape": shape_json} + if obj_id: + kwargs["obj_id"] = obj_id if creationProperties: kwargs["creation_props"] = creationProperties if layout: diff --git a/hsds/group_dn.py b/hsds/group_dn.py index 8a4f10f8..db146a62 100755 --- a/hsds/group_dn.py +++ b/hsds/group_dn.py @@ -111,8 +111,9 @@ async def POST_Group(request): # verify the id doesn't already exist obj_found = await check_metadata_obj(app, group_id, bucket=bucket) if obj_found: - log.error(f"Post with existing group_id: {group_id}") - raise HTTPInternalServerError() + msg = f"Post with existing group_id: {group_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) root_id = body["root"] diff --git a/hsds/group_sn.py b/hsds/group_sn.py index c857683e..68f5fab3 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -185,6 +185,7 @@ async def POST_Group(request): implicit = getBooleanParam(params, "implicit") parent_id = None + obj_id = None h5path = None creation_props = None @@ -227,11 +228,16 @@ async def POST_Group(request): parent_id = root_id else: parent_id = body["parent_id"] + if "id" in body: + obj_id = body["id"] + log.debug(f"POST group using client id: {obj_id}") if "creationProperties" in body: creation_props = body["creationProperties"] if parent_id: kwargs = {"bucket": bucket, "parent_id": parent_id, "h5path": h5path} + if obj_id: + kwargs["obj_id"] = obj_id if creation_props: kwargs["creation_props"] = creation_props if implicit: @@ -240,6 +246,8 @@ async def POST_Group(request): else: # create an anonymous group kwargs = {"bucket": bucket, "root_id": root_id} + if obj_id: + kwargs["obj_id"] = obj_id if creation_props: kwargs["creation_props"] = creation_props group_json = await createObject(app, **kwargs) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index bb40620a..2bb3919e 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -1048,6 +1048,7 @@ async def deleteObject(app, obj_id, bucket=None): async def createObject(app, root_id=None, + 
obj_id=None, obj_type=None, obj_shape=None, layout=None, @@ -1076,7 +1077,18 @@ async def createObject(app, if creation_props: log.debug(f" cprops: {creation_props}") - obj_id = createObjId(collection, root_id=root_id) + if obj_id: + log.debug(f"using client supplied id: {obj_id}") + if not isValidUuid(obj_id, obj_class=collection): + msg = f"invalid id: {obj_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if getRootObjId(obj_id) != root_id: + msg = f"id: {obj_id} is not valid for root: {root_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + obj_id = createObjId(collection, root_id=root_id) log.info(f"new obj id: {obj_id}") obj_json = {"id": obj_id, "root": root_id} if obj_type: @@ -1098,6 +1110,7 @@ async def createObject(app, async def createObjectByPath(app, parent_id=None, + obj_id=None, h5path=None, implicit=False, obj_type=None, @@ -1118,6 +1131,8 @@ async def createObjectByPath(app, log.warn(msg) raise HTTPBadRequest(reason=msg) log.debug(f"createObjectByPath - parent_id: {parent_id}, h5path: {h5path}") + if obj_id: + log.debug(f"createObjectByPath using client id: {obj_id}") root_id = getRootObjId(parent_id) @@ -1196,11 +1211,13 @@ async def createObjectByPath(app, kwargs["layout"] = layout if creation_props: kwargs["creation_props"] = creation_props + if obj_id: + kwargs["obj_id"] = obj_id obj_json = await createObject(app, **kwargs) - obj_id = obj_json["id"] + tgt_id = obj_json["id"] # create a link to the new object - await putHardLink(app, parent_id, link_title, tgt_id=obj_id, bucket=bucket) - parent_id = obj_id # new parent + await putHardLink(app, parent_id, link_title, tgt_id=tgt_id, bucket=bucket) + parent_id = tgt_id # new parent log.info(f"createObjectByPath {h5path} done, returning obj_json") return obj_json diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index 280877cf..958b6552 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -13,6 +13,9 @@ import json import time import numpy as np + +from h5json.objid import createObjId + import helper import config @@ -190,6 +193,110 @@ def testScalarDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 410) + def testPostDatasetWithId(self): + # Test creation of a dataset obj with a client-supplied obj id + domain = self.base_domain + "/testPostDatasetWithId.h5" + helper.setupDomain(domain) + print("testPostDatasetWithId", domain) + headers = helper.getRequestHeaders(domain=domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # make a new dataset id + dset_id = createObjId("datasets", root_id=root_uuid) + + # create a dataset obj + data = {"id": dset_id, "type": "H5T_IEEE_F32LE", "shape": "H5S_SCALAR"} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(rspJson["id"], dset_id) + + # read back the obj + req = self.endpoint + "/datasets/" + dset_id + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + expected_keys = [ + "id", + "shape", + "hrefs", + "layout", + "creationProperties", + "attributeCount", + "created", + "lastModified", + "root", + "domain", + ] + + for name 
in expected_keys: + self.assertTrue(name in rspJson) + self.assertEqual(rspJson["id"], dset_id) + self.assertEqual(rspJson["root"], root_uuid) + self.assertEqual(rspJson["domain"], domain) + self.assertEqual(rspJson["attributeCount"], 0) + shape_json = rspJson["shape"] + self.assertTrue(shape_json["class"], "H5S_SCALAR") + self.assertTrue(rspJson["type"], "H5T_IEEE_F32LE") + + # Get the type + rsp = self.session.get(req + "/type", headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("type" in rspJson) + self.assertTrue(rspJson["type"], "H5T_IEEE_F32LE") + self.assertTrue("hrefs" in rspJson) + hrefs = rspJson["hrefs"] + self.assertEqual(len(hrefs), 3) + + # Get the shape + rsp = self.session.get(req + "/shape", headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("created" in rspJson) + self.assertTrue("lastModified" in rspJson) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("shape" in rspJson) + shape_json = rspJson["shape"] + self.assertTrue(shape_json["class"], "H5S_SCALAR") + + # try getting verbose info + params = {"verbose": 1} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + for name in expected_keys: + self.assertTrue(name in rspJson) + + # flush to storage and force an immediate rescan + domain_req = self.endpoint + "/" + domain_params = {"flush": 1, "rescan": 1} + rsp = self.session.put(domain_req, params=domain_params, headers=headers) + # should get a NO_CONTENT code, + self.assertEqual(rsp.status_code, 204) + + # do a get and verify the additional keys are present + expected_keys.append("num_chunks") + expected_keys.append("allocated_size") + + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + for name in expected_keys: + self.assertTrue(name in rspJson) + def testScalarEmptyDimsDataset(self): # Test creation/deletion of scalar dataset obj domain = self.base_domain + "/testScalarEmptyDimsDataset.h5" diff --git a/tests/integ/datatype_test.py b/tests/integ/datatype_test.py index f3f2d1a9..93c0b3d5 100755 --- a/tests/integ/datatype_test.py +++ b/tests/integ/datatype_test.py @@ -11,6 +11,9 @@ ############################################################################## import unittest import json + +from h5json.objid import createObjId + import helper import config @@ -120,6 +123,54 @@ def testCommittedType(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 410) + def testPostTypeWithId(self): + # Test creation of a datatype obj with a client-supplied id + + print("testPostTypeWithId", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create a datatype id + ctype_id = createObjId("datatypes", root_id=root_uuid) + + # create a committed type obj + data = {"id": ctype_id, "type": "H5T_IEEE_F32LE"} + req = self.endpoint + "/datatypes" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(rspJson["id"], ctype_id) + self.assertTrue("type" in rspJson) + 
type_json = rspJson["type"] + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertEqual(type_json["base"], "H5T_IEEE_F32LE") + + # read back the obj + req = self.endpoint + "/datatypes/" + ctype_id + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("id" in rspJson) + self.assertEqual(rspJson["id"], ctype_id) + self.assertTrue("root" in rspJson) + self.assertEqual(rspJson["root"], root_uuid) + self.assertTrue("created" in rspJson) + self.assertTrue("lastModified" in rspJson) + self.assertTrue("attributeCount" in rspJson) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertEqual(type_json["base"], "H5T_IEEE_F32LE") + def testPostTypes(self): # Test creation with all primitive types diff --git a/tests/integ/group_test.py b/tests/integ/group_test.py index 7a832271..6c154836 100755 --- a/tests/integ/group_test.py +++ b/tests/integ/group_test.py @@ -13,6 +13,9 @@ import time import json import uuid + +from h5json.objid import createObjId + import helper import config @@ -233,6 +236,39 @@ def testPost(self): rsp = self.session.post(req, headers=headers) self.assertEqual(rsp.status_code, 403) # forbidden + def testPostId(self): + # test POST group + print("testPostId", self.base_domain) + endpoint = helper.getEndpoint() + headers = helper.getRequestHeaders(domain=self.base_domain) + req = endpoint + "/groups" + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create a group id + grp_id = createObjId("groups", root_id=root_uuid) + + # create a new group using the grp_id + payload = {"id": grp_id} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(grp_id, rspJson["id"]) + + # try sending the same request again + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # bad request + def testPostWithLink(self): # test POST with link creation print("testPostWithLink", self.base_domain) @@ -310,6 +346,32 @@ def testPostWithLink(self): self.assertTrue("alias" in rspJson) self.assertEqual(rspJson["alias"], ["/linked_group",]) + def testPostIdWithLink(self): + # test POST with link creation + print("testPostIdWithLink", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create a group id + grp_id = createObjId("groups", root_id=root_uuid) + + # create new group + payload = {"id": grp_id, "link": {"id": root_uuid, "name": "linked_group"}} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + 
self.assertEqual(grp_id, rspJson["id"]) + def testPostWithPath(self): # test POST with implicit parent group creation print("testPostWithPath", self.base_domain) @@ -427,9 +489,90 @@ def testPostWithPath(self): rsp = self.session.get(req, headers=headers, params=params) self.assertEqual(rsp.status_code, 200) + def testPostIdWithPath(self): + # test POST with implicit parent group creation + print("testPostIdWithPath", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # get root group and verify link count is 0 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + + # create new group with link path: /g1 + g1_id = createObjId("groups", root_id=root_uuid) + payload = {"id": g1_id, "h5path": "g1"} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(rspJson["id"], g1_id) + + # get root group and verify link count is 1 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 1) + + # get the group at "g1" + req = helper.getEndpoint() + "/groups/" + params = {"h5path": "/g1"} + rsp = self.session.get(req, headers=headers, params=params) + self.assertEqual(rsp.status_code, 200) + + # try creating new group with link path: /g2/g2.1 + g21_id = createObjId("groups", root_id=root_uuid) + payload = {"id": g21_id, "h5path": "g2/g2.1"} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 404) # g2 not found + + # try again with implicit creation set + params = {"implicit": 1} + rsp = self.session.post(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 201) # g2 and g2.1 created + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(rspJson["id"], g21_id) + + # get root group and verify link count is 2 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 2) + + # get the group at "/g2" + req = helper.getEndpoint() + "/groups/" + params = {"h5path": "/g2"} + rsp = self.session.get(req, headers=headers, params=params) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 1) # group g2.1 + + # get the group at "/g2/g2.1" + req = helper.getEndpoint() + "/groups/" + params = {"h5path": "/g2/g2.1"} + rsp = self.session.get(req, headers=headers, params=params) + self.assertEqual(rsp.status_code, 200) + def testPostWithCreationProps(self): # test POST group with creation properties - print("testPost", self.base_domain) + 
print("testPostWithCreationProps", self.base_domain) endpoint = helper.getEndpoint() headers = helper.getRequestHeaders(domain=self.base_domain) req = endpoint + "/groups" From 3be18a08d691e614df06a3e3b2a0404f0029ef9a Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 7 May 2025 13:24:22 +0200 Subject: [PATCH 15/49] enable attributes to be included with POST req --- hsds/attr_sn.py | 262 ++--------------------------------- hsds/ctype_dn.py | 13 +- hsds/ctype_sn.py | 8 ++ hsds/dset_dn.py | 13 +- hsds/dset_sn.py | 8 ++ hsds/group_dn.py | 13 +- hsds/group_sn.py | 8 ++ hsds/servicenode_lib.py | 251 ++++++++++++++++++++++++++++++++- tests/integ/dataset_test.py | 38 +++++ tests/integ/datatype_test.py | 39 +++++- tests/integ/group_test.py | 40 +++++- tests/integ/link_test.py | 1 - 12 files changed, 429 insertions(+), 265 deletions(-) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index c5d76227..d3b05ca0 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -18,11 +18,10 @@ from aiohttp.web import StreamResponse from json import JSONDecodeError -from h5json.hdf5dtype import validateTypeItem, getBaseTypeJson from h5json.hdf5dtype import createDataType, getItemSize -from h5json.array_util import jsonToArray, getNumElements, bytesArrayToList +from h5json.array_util import jsonToArray, getNumElements from h5json.array_util import bytesToArray, arrayToBytes, decodeData, encodeData -from h5json.objid import isValidUuid, getRootObjId +from h5json.objid import isValidUuid from .util.httpUtil import getAcceptType, jsonResponse, getHref, getBooleanParam from .util.globparser import globmatch @@ -32,8 +31,8 @@ from .util.attrUtil import validateAttributeName, getRequestCollectionName from .util.dsetUtil import getShapeDims -from .servicenode_lib import getDomainJson, getObjectJson, validateAction -from .servicenode_lib import getAttributes, putAttributes, deleteAttributes +from .servicenode_lib import getDomainJson, getAttributeFromRequest, getAttributesFromRequest +from .servicenode_lib import getAttributes, putAttributes, deleteAttributes, validateAction from .domain_crawl import DomainCrawler from . import hsds_logger as log from . import config @@ -296,244 +295,6 @@ async def GET_Attribute(request): return resp -async def _getTypeFromRequest(app, body, obj_id=None, bucket=None): - """ return a type json from the request body """ - if "type" not in body: - msg = "PUT attribute with no type in body" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - datatype = body["type"] - - if isinstance(datatype, str) and datatype.startswith("t-"): - # Committed type - fetch type json from DN - ctype_id = datatype - log.debug(f"got ctypeid: {ctype_id}") - ctype_json = await getObjectJson(app, ctype_id, bucket=bucket) - log.debug(f"ctype {ctype_id}: {ctype_json}") - root_id = getRootObjId(obj_id) - if ctype_json["root"] != root_id: - msg = "Referenced committed datatype must belong in same domain" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - datatype = ctype_json["type"] - # add the ctype_id to the type - datatype["id"] = ctype_id - elif isinstance(datatype, str): - try: - # convert predefined type string (e.g. 
"H5T_STD_I32LE") to - # corresponding json representation - datatype = getBaseTypeJson(datatype) - except TypeError: - msg = "PUT attribute with invalid predefined type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - try: - validateTypeItem(datatype) - except KeyError as ke: - msg = f"KeyError creating type: {ke}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except TypeError as te: - msg = f"TypeError creating type: {te}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except ValueError as ve: - msg = f"ValueError creating type: {ve}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - return datatype - - -def _getShapeFromRequest(body): - """ get shape json from request body """ - shape_json = {} - if "shape" in body: - shape_body = body["shape"] - shape_class = None - if isinstance(shape_body, dict) and "class" in shape_body: - shape_class = shape_body["class"] - elif isinstance(shape_body, str): - shape_class = shape_body - if shape_class: - if shape_class == "H5S_NULL": - shape_json["class"] = "H5S_NULL" - if isinstance(shape_body, dict) and "dims" in shape_body: - msg = "can't include dims with null shape" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if isinstance(shape_body, dict) and "value" in body: - msg = "can't have H5S_NULL shape with value" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif shape_class == "H5S_SCALAR": - shape_json["class"] = "H5S_SCALAR" - dims = getShapeDims(shape_body) - if len(dims) != 1 or dims[0] != 1: - msg = "dimensions aren't valid for scalar attribute" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif shape_class == "H5S_SIMPLE": - shape_json["class"] = "H5S_SIMPLE" - dims = getShapeDims(shape_body) - shape_json["dims"] = dims - else: - msg = f"Unknown shape class: {shape_class}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - # no class, interpet shape value as dimensions and - # use H5S_SIMPLE as class - if isinstance(shape_body, list) and len(shape_body) == 0: - shape_json["class"] = "H5S_SCALAR" - else: - shape_json["class"] = "H5S_SIMPLE" - dims = getShapeDims(shape_body) - shape_json["dims"] = dims - else: - shape_json["class"] = "H5S_SCALAR" - - return shape_json - - -def _getValueFromRequest(body, data_type, data_shape): - """ Get attribute value from request json """ - dims = getShapeDims(data_shape) - if "value" in body: - if dims is None: - msg = "Bad Request: data can not be included with H5S_NULL space" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - value = body["value"] - # validate that the value agrees with type/shape - arr_dtype = createDataType(data_type) # np datatype - if len(dims) == 0: - np_dims = [1, ] - else: - np_dims = dims - - if body.get("encoding"): - item_size = getItemSize(data_type) - if item_size == "H5T_VARIABLE": - msg = "base64 encoding is not support for variable length attributes" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - try: - data = decodeData(value) - except ValueError: - msg = "unable to decode data" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - expected_numbytes = arr_dtype.itemsize * np.prod(dims) - if len(data) != expected_numbytes: - msg = f"expected: {expected_numbytes} but got: {len(data)}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - # check to see if this works with our shape and type - try: - arr = bytesToArray(data, arr_dtype, np_dims) - except ValueError as e: - log.debug(f"data: {data}") - log.debug(f"type: {arr_dtype}") - log.debug(f"np_dims: {np_dims}") - msg = f"Bad Request: encoded input data 
doesn't match shape and type: {e}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - value_json = None - # now try converting to JSON - list_data = arr.tolist() - try: - value_json = bytesArrayToList(list_data) - except ValueError as err: - msg = f"Cannot decode bytes to list: {err}, will store as encoded bytes" - log.warn(msg) - if value_json: - log.debug("will store base64 input as json") - if data_shape["class"] == "H5S_SCALAR": - # just use the scalar value - value = value_json[0] - else: - value = value_json # return this - else: - value = data # return bytes to signal that this needs to be encoded - else: - # verify that the input data matches the array shape and type - try: - jsonToArray(np_dims, arr_dtype, value) - except ValueError as e: - msg = f"Bad Request: input data doesn't match selection: {e}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - value = None - - return value - - -async def _getAttributeFromRequest(app, req_json, obj_id=None, bucket=None): - """ return attribute from given request json """ - attr_item = {} - attr_type = await _getTypeFromRequest(app, req_json, obj_id=obj_id, bucket=bucket) - attr_shape = _getShapeFromRequest(req_json) - attr_item = {"type": attr_type, "shape": attr_shape} - attr_value = _getValueFromRequest(req_json, attr_type, attr_shape) - if attr_value is not None: - if isinstance(attr_value, bytes): - attr_value = encodeData(attr_value) # store as base64 - attr_item["encoding"] = "base64" - else: - # just store the JSON dict or primitive value - attr_item["value"] = attr_value - else: - attr_item["value"] = None - - return attr_item - - -async def _getAttributesFromRequest(request, req_json, obj_id=None, bucket=None): - """ read the given JSON dictinary and return dict of attribute json """ - - app = request.app - attr_items = {} - kwargs = {"obj_id": obj_id} - if bucket: - kwargs["bucket"] = bucket - if "attributes" in req_json: - attributes = req_json["attributes"] - if not isinstance(attributes, dict): - msg = f"expected list for attributes but got: {type(attributes)}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - # read each attr_item and canonicalize the shape, type, verify value - for attr_name in attributes: - attr_json = attributes[attr_name] - attr_item = await _getAttributeFromRequest(app, attr_json, **kwargs) - attr_items[attr_name] = attr_item - - elif "type" in req_json: - # single attribute create - fake an item list - attr_item = await _getAttributeFromRequest(app, req_json, **kwargs) - if "name" in req_json: - attr_name = req_json["name"] - else: - attr_name = request.match_info.get("name") - validateAttributeName(attr_name) - if not attr_name: - msg = "Missing attribute name" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - attr_items[attr_name] = attr_item - else: - log.debug(f"_getAttributes from request - no attribute defined in {req_json}") - - return attr_items - - async def PUT_Attribute(request): """HTTP method to create a new attribute""" log.request(request) @@ -556,7 +317,7 @@ async def PUT_Attribute(request): log.debug(f"Attribute name: [{attr_name}]") validateAttributeName(attr_name) - log.info(f"PUT Attributes id: {req_obj_id} name: {attr_name}") + log.info(f"PUT Attribute id: {req_obj_id} name: {attr_name}") username, pswd = getUserPasswordFromRequest(request) # write actions need auth await validateUserPassword(app, username, pswd) @@ -588,7 +349,7 @@ async def PUT_Attribute(request): # get attribute from request body kwargs = {"bucket": bucket, "obj_id": req_obj_id} - attr_body = 
await _getAttributeFromRequest(app, body, **kwargs) + attr_body = await getAttributeFromRequest(app, body, **kwargs) # write attribute to DN attr_json = {attr_name: attr_body} @@ -625,7 +386,7 @@ async def PUT_Attributes(request): await validateUserPassword(app, username, pswd) if not request.has_body: - msg = "PUT Attribute with no body" + msg = "PUT Attributes with no body" log.warn(msg) raise HTTPBadRequest(reason=msg) try: @@ -655,10 +416,10 @@ async def PUT_Attributes(request): if not req_obj_id: req_obj_id = domain_json["root"] kwargs = {"obj_id": req_obj_id, "bucket": bucket} - attr_items = await _getAttributesFromRequest(request, body, **kwargs) + attr_items = await getAttributesFromRequest(app, body, **kwargs) if attr_items: - log.debug(f"PUT Attribute {len(attr_items)} attibutes to add") + log.debug(f"PUT Attribute {len(attr_items)} attributes to add") else: log.debug("no attributes defined yet") @@ -667,6 +428,7 @@ async def PUT_Attributes(request): obj_ids = {} if "obj_ids" in body: body_ids = body["obj_ids"] + if isinstance(body_ids, list): # multi cast the attributes - each attribute in attr-items # will be written to each of the objects identified by obj_id @@ -686,7 +448,7 @@ async def PUT_Attributes(request): msg += f"{len(obj_ids)} objects" log.info(msg) elif isinstance(body_ids, dict): - # each value is body_ids is a set of attriutes to write to the object + # each value is body_ids is a set of attributes to write to the object # unlike the above case, different attributes can be written to # different objects if attr_items: @@ -702,7 +464,7 @@ async def PUT_Attributes(request): id_json = body_ids[obj_id] kwargs = {"obj_id": obj_id, "bucket": bucket} - obj_items = await _getAttributesFromRequest(request, id_json, **kwargs) + obj_items = await getAttributesFromRequest(app, id_json, **kwargs) if obj_items: obj_ids[obj_id] = obj_items diff --git a/hsds/ctype_dn.py b/hsds/ctype_dn.py index b63d0a4d..0b14ab41 100755 --- a/hsds/ctype_dn.py +++ b/hsds/ctype_dn.py @@ -122,10 +122,17 @@ async def POST_Datatype(request): raise HTTPInternalServerError() type_json = body["type"] + if "attributes" in body: + # initialize attributes + attrs = body["attributes"] + log.debug(f"POST datatype with attributes: {attrs}") + else: + attrs = {} + # ok - all set, create committed type obj now = getNow(app) - log.info(f"POST_datatype, typejson: {type_json}") + log.info(f"POST_datatype, type_json: {type_json}") ctype_json = { "id": ctype_id, @@ -133,7 +140,7 @@ async def POST_Datatype(request): "created": now, "lastModified": now, "type": type_json, - "attributes": {}, + "attributes": attrs, } kwargs = {"bucket": bucket, "notify": True, "flush": True} @@ -145,7 +152,7 @@ async def POST_Datatype(request): resp_json["created"] = ctype_json["created"] resp_json["lastModified"] = ctype_json["lastModified"] resp_json["type"] = type_json - resp_json["attributeCount"] = 0 + resp_json["attributeCount"] = len(attrs) resp = json_response(resp_json, status=201) log.response(request, resp=resp) diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index dfd026f1..ccf033ac 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -213,10 +213,16 @@ async def POST_Datatype(request): link_title = None obj_id = None h5path = None + attrs = None + if "id" in body: obj_id = body["id"] log.debug(f"POST datatype using client id: {obj_id}") + if "attributes" in body: + attrs = body["attributes"] + log.debug(f"POST datatype attributes: {attrs}") + if "link" in body: if "h5path" in body: msg = "link can't be used with h5path" 
@@ -251,6 +257,8 @@ async def POST_Datatype(request): kwargs = {"bucket": bucket, "obj_type": datatype} if obj_id: kwargs["obj_id"] = obj_id + if attrs: + kwargs["attrs"] = attrs # TBD: creation props for datatype obj? if parent_id: diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index 159d6b63..bca36457 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -135,7 +135,14 @@ async def POST_Dataset(request): # ok - all set, create committed type obj now = getNow(app) - log.debug(f"POST_dataset typejson: {type_json}, shapejson: {shape_json}") + if "attributes" in body: + # initialize attributes + attrs = body["attributes"] + log.debug(f"POST Dataset with attributes: {attrs}") + else: + attrs = {} + + log.debug(f"POST_dataset type_json: {type_json}, shape_json: {shape_json}") dset_json = { "id": dset_id, @@ -144,7 +151,7 @@ async def POST_Dataset(request): "lastModified": now, "type": type_json, "shape": shape_json, - "attributes": {}, + "attributes": attrs, } if "creationProperties" in body: @@ -162,7 +169,7 @@ async def POST_Dataset(request): resp_json["type"] = type_json resp_json["shape"] = shape_json resp_json["lastModified"] = dset_json["lastModified"] - resp_json["attributeCount"] = 0 + resp_json["attributeCount"] = len(attrs) resp = json_response(resp_json, status=201) log.response(request, resp=resp) diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 0c0f4619..c6c5e502 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -859,6 +859,12 @@ async def POST_Dataset(request): else: creationProperties = {} + if "attributes" in body: + attrs = body["attributes"] + log.debug(f"POST Dataset attributes: {attrs}") + else: + attrs = None + # TBD: check for invalid layout class... if layout_props: if layout_props["class"] == "H5D_CONTIGUOUS": @@ -1144,6 +1150,8 @@ async def POST_Dataset(request): kwargs["obj_id"] = obj_id if creationProperties: kwargs["creation_props"] = creationProperties + if attrs: + kwargs["attrs"] = attrs if layout: kwargs["layout"] = layout diff --git a/hsds/group_dn.py b/hsds/group_dn.py index db146a62..dfce8f66 100755 --- a/hsds/group_dn.py +++ b/hsds/group_dn.py @@ -99,7 +99,7 @@ async def POST_Group(request): group_id = get_obj_id(request, body=body) - log.info(f"POST group: {group_id} bucket: {bucket}") + log.info(f"POST group: {group_id} bucket: {bucket} body: {body}") if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() @@ -125,13 +125,20 @@ async def POST_Group(request): # ok - all set, create group obj now = getNow(app) + if "attributes" in body: + # initialize attributes + attrs = body["attributes"] + log.debug(f"POST Group with attributes: {attrs}") + else: + attrs = {} + group_json = { "id": group_id, "root": root_id, "created": now, "lastModified": now, "links": {}, - "attributes": {}, + "attributes": attrs, } if "creationProperties" in body: @@ -147,7 +154,7 @@ async def POST_Group(request): resp_json["created"] = group_json["created"] resp_json["lastModified"] = group_json["lastModified"] resp_json["linkCount"] = 0 - resp_json["attributeCount"] = 0 + resp_json["attributeCount"] = len(attrs) resp = json_response(resp_json, status=201) log.response(request, resp=resp) diff --git a/hsds/group_sn.py b/hsds/group_sn.py index 68f5fab3..e2395826 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -188,6 +188,7 @@ async def POST_Group(request): obj_id = None h5path = None creation_props = None + attrs = None if request.has_body: try: @@ -233,6 +234,9 @@ async def POST_Group(request): 
log.debug(f"POST group using client id: {obj_id}") if "creationProperties" in body: creation_props = body["creationProperties"] + if "attributes" in body: + attrs = body["attributes"] + log.debug(f"POST Group attributes: {attrs}") if parent_id: kwargs = {"bucket": bucket, "parent_id": parent_id, "h5path": h5path} @@ -240,6 +244,8 @@ async def POST_Group(request): kwargs["obj_id"] = obj_id if creation_props: kwargs["creation_props"] = creation_props + if attrs: + kwargs["attrs"] = attrs if implicit: kwargs["implicit"] = True group_json = await createObjectByPath(app, **kwargs) @@ -250,6 +256,8 @@ async def POST_Group(request): kwargs["obj_id"] = obj_id if creation_props: kwargs["creation_props"] = creation_props + if attrs: + kwargs["attrs"] = attrs group_json = await createObject(app, **kwargs) log.debug(f"returning resp: {group_json}") diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 2bb3919e..69b909dc 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -15,15 +15,17 @@ import asyncio import json +import numpy as np from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPGone, HTTPConflict from aiohttp.web_exceptions import HTTPNotFound, HTTPInternalServerError from aiohttp.client_exceptions import ClientOSError, ClientError from aiohttp import ClientResponseError -from h5json.array_util import encodeData +from h5json.array_util import encodeData, decodeData, bytesToArray, bytesArrayToList, jsonToArray from h5json.objid import getCollectionForId, createObjId, getRootObjId from h5json.objid import isSchema2Id, getS3Key, isValidUuid +from h5json.hdf5dtype import getBaseTypeJson, validateTypeItem, createDataType, getItemSize from .util.nodeUtil import getDataNodeUrl from .util.authUtil import getAclKeys @@ -33,6 +35,7 @@ from .util.httpUtil import http_get, http_put, http_post, http_delete from .util.domainUtil import getBucketForDomain, verifyRoot, getLimits from .util.storUtil import getCompressors +from .util.dsetUtil import getShapeDims from .basenode import getVersion from . import hsds_logger as log @@ -888,6 +891,229 @@ async def doFlush(app, root_id, bucket=None): return dn_ids +async def getTypeFromRequest(app, body, obj_id=None, bucket=None): + """ return a type json from the request body """ + if "type" not in body: + msg = "expected type in body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + datatype = body["type"] + + if isinstance(datatype, str) and datatype.startswith("t-"): + # Committed type - fetch type json from DN + ctype_id = datatype + log.debug(f"got ctypeid: {ctype_id}") + ctype_json = await getObjectJson(app, ctype_id, bucket=bucket) + log.debug(f"ctype {ctype_id}: {ctype_json}") + root_id = getRootObjId(obj_id) + if ctype_json["root"] != root_id: + msg = "Referenced committed datatype must belong in same domain" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + datatype = ctype_json["type"] + # add the ctype_id to the type + datatype["id"] = ctype_id + elif isinstance(datatype, str): + try: + # convert predefined type string (e.g. 
"H5T_STD_I32LE") to + # corresponding json representation + datatype = getBaseTypeJson(datatype) + except TypeError: + msg = "PUT attribute with invalid predefined type" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + try: + validateTypeItem(datatype) + except KeyError as ke: + msg = f"KeyError creating type: {ke}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + except TypeError as te: + msg = f"TypeError creating type: {te}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + except ValueError as ve: + msg = f"ValueError creating type: {ve}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + return datatype + + +def getShapeFromRequest(body): + """ get shape json from request body """ + shape_json = {} + if "shape" in body: + shape_body = body["shape"] + shape_class = None + if isinstance(shape_body, dict) and "class" in shape_body: + shape_class = shape_body["class"] + elif isinstance(shape_body, str): + shape_class = shape_body + if shape_class: + if shape_class == "H5S_NULL": + shape_json["class"] = "H5S_NULL" + if isinstance(shape_body, dict) and "dims" in shape_body: + msg = "can't include dims with null shape" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if isinstance(shape_body, dict) and "value" in body: + msg = "can't have H5S_NULL shape with value" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif shape_class == "H5S_SCALAR": + shape_json["class"] = "H5S_SCALAR" + dims = getShapeDims(shape_body) + if len(dims) != 1 or dims[0] != 1: + msg = "dimensions aren't valid for scalar attribute" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif shape_class == "H5S_SIMPLE": + shape_json["class"] = "H5S_SIMPLE" + dims = getShapeDims(shape_body) + shape_json["dims"] = dims + else: + msg = f"Unknown shape class: {shape_class}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + # no class, interpret shape value as dimensions and + # use H5S_SIMPLE as class + if isinstance(shape_body, list) and len(shape_body) == 0: + shape_json["class"] = "H5S_SCALAR" + else: + shape_json["class"] = "H5S_SIMPLE" + dims = getShapeDims(shape_body) + shape_json["dims"] = dims + else: + shape_json["class"] = "H5S_SCALAR" + + return shape_json + + +async def getAttributeFromRequest(app, req_json, obj_id=None, bucket=None): + """ return attribute from given request json """ + attr_item = {} + log.debug(f"getAttributeFromRequest req_json: {req_json} obj_id: {obj_id}") + attr_type = await getTypeFromRequest(app, req_json, obj_id=obj_id, bucket=bucket) + attr_shape = getShapeFromRequest(req_json) + attr_item = {"type": attr_type, "shape": attr_shape} + attr_value = getValueFromRequest(req_json, attr_type, attr_shape) + if attr_value is not None: + if isinstance(attr_value, bytes): + attr_value = encodeData(attr_value) # store as base64 + attr_item["encoding"] = "base64" + else: + # just store the JSON dict or primitive value + attr_item["value"] = attr_value + else: + attr_item["value"] = None + + return attr_item + + +async def getAttributesFromRequest(app, req_json, obj_id=None, bucket=None): + """ read the given JSON dictionary and return dict of attribute json """ + + attr_items = {} + kwargs = {"obj_id": obj_id} + if bucket: + kwargs["bucket"] = bucket + if "attributes" in req_json: + attributes = req_json["attributes"] + if not isinstance(attributes, dict): + msg = f"expected list for attributes but got: {type(attributes)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # read each attr_item and canonicalize the shape, type, verify value + for attr_name in 
attributes: + attr_json = attributes[attr_name] + attr_item = await getAttributeFromRequest(app, attr_json, **kwargs) + attr_items[attr_name] = attr_item + else: + log.debug(f"getAttributesFromRequest - no attribute defined in {req_json}") + + return attr_items + + +def getValueFromRequest(body, data_type, data_shape): + """ Get attribute value from request json """ + dims = getShapeDims(data_shape) + if "value" in body: + if dims is None: + msg = "Bad Request: data can not be included with H5S_NULL space" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + value = body["value"] + # validate that the value agrees with type/shape + arr_dtype = createDataType(data_type) # np datatype + if len(dims) == 0: + np_dims = [1, ] + else: + np_dims = dims + + if body.get("encoding"): + item_size = getItemSize(data_type) + if item_size == "H5T_VARIABLE": + msg = "base64 encoding is not support for variable length attributes" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + try: + data = decodeData(value) + except ValueError: + msg = "unable to decode data" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + expected_byte_count = arr_dtype.itemsize * np.prod(dims) + if len(data) != expected_byte_count: + msg = f"expected: {expected_byte_count} but got: {len(data)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # check to see if this works with our shape and type + try: + arr = bytesToArray(data, arr_dtype, np_dims) + except ValueError as e: + log.debug(f"data: {data}") + log.debug(f"type: {arr_dtype}") + log.debug(f"np_dims: {np_dims}") + msg = f"Bad Request: encoded input data doesn't match shape and type: {e}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + value_json = None + # now try converting to JSON + list_data = arr.tolist() + try: + value_json = bytesArrayToList(list_data) + except ValueError as err: + msg = f"Cannot decode bytes to list: {err}, will store as encoded bytes" + log.warn(msg) + if value_json: + log.debug("will store base64 input as json") + if data_shape["class"] == "H5S_SCALAR": + # just use the scalar value + value = value_json[0] + else: + value = value_json # return this + else: + value = data # return bytes to signal that this needs to be encoded + else: + # verify that the input data matches the array shape and type + try: + jsonToArray(np_dims, arr_dtype, value) + except ValueError as e: + msg = f"Bad Request: input data doesn't match selection: {e}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + value = None + + return value + + async def getAttributes(app, obj_id, attr_names=None, include_data=False, @@ -1053,6 +1279,7 @@ async def createObject(app, obj_shape=None, layout=None, creation_props=None, + attrs=None, bucket=None): """ create a group, ctype, or dataset object and return object json Determination on whether a group, ctype, or dataset is created is based on: @@ -1076,6 +1303,8 @@ async def createObject(app, log.debug(f" layout: {layout}") if creation_props: log.debug(f" cprops: {creation_props}") + if attrs: + log.debug(f" attrs: {attrs}") if obj_id: log.debug(f"using client supplied id: {obj_id}") @@ -1099,6 +1328,13 @@ async def createObject(app, obj_json["layout"] = layout if creation_props: obj_json["creationProperties"] = creation_props + if attrs: + kwargs = {"obj_id": obj_id, "bucket": bucket} + attrs_json = {"attributes": attrs} + attr_items = await getAttributesFromRequest(app, attrs_json, **kwargs) + log.debug(f"got attr_items: {attr_items}") + + obj_json["attributes"] = attr_items log.debug(f"create {collection} 
obj, body: {obj_json}") dn_url = getDataNodeUrl(app, obj_id) req = f"{dn_url}/{collection}" @@ -1117,6 +1353,7 @@ async def createObjectByPath(app, obj_shape=None, layout=None, creation_props=None, + attrs=None, bucket=None): """ create an object at the designated path relative to the parent. @@ -1133,6 +1370,16 @@ async def createObjectByPath(app, log.debug(f"createObjectByPath - parent_id: {parent_id}, h5path: {h5path}") if obj_id: log.debug(f"createObjectByPath using client id: {obj_id}") + if obj_type: + log.debug(f" obj_type: {obj_type}") + if obj_shape: + log.debug(f" obj_shape: {obj_shape}") + if layout: + log.debug(f" layout: {layout}") + if creation_props: + log.debug(f" cprops: {creation_props}") + if attrs: + log.debug(f" attrs: {attrs}") root_id = getRootObjId(parent_id) @@ -1211,6 +1458,8 @@ async def createObjectByPath(app, kwargs["layout"] = layout if creation_props: kwargs["creation_props"] = creation_props + if attrs: + kwargs["attrs"] = attrs if obj_id: kwargs["obj_id"] = obj_id obj_json = await createObject(app, **kwargs) diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index 958b6552..d0b60f1a 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -297,6 +297,44 @@ def testPostDatasetWithId(self): for name in expected_keys: self.assertTrue(name in rspJson) + def testPostDatasetWithAttributes(self): + # test POST with attribute initialization + domain = self.base_domain + "/testPostDatasetWithAttributes.h5" + helper.setupDomain(domain) + print("testPostDatasetWithAttributes", domain) + headers = helper.getRequestHeaders(domain=domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + print("rspJson:", rspJson) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # setup some attributes to include + attr_count = 4 + attributes = {} + extent = 10 + for i in range(attr_count): + value = [i * 10 + j for j in range(extent)] + data = {"type": "H5T_STD_I32LE", "shape": extent, "value": value} + attr_name = f"attr{i + 1:04d}" + attributes[attr_name] = data + + # create new dataset + payload = {"type": "H5T_IEEE_F32LE", "shape": "H5S_SCALAR"} + payload["attributes"] = attributes + payload["link"] = {"id": root_uuid, "name": "linked_datatype"} + + req = helper.getEndpoint() + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["attributeCount"], 4) + self.assertTrue(helper.validateId(rspJson["id"])) + def testScalarEmptyDimsDataset(self): # Test creation/deletion of scalar dataset obj domain = self.base_domain + "/testScalarEmptyDimsDataset.h5" diff --git a/tests/integ/datatype_test.py b/tests/integ/datatype_test.py index 93c0b3d5..ce2a0e1a 100755 --- a/tests/integ/datatype_test.py +++ b/tests/integ/datatype_test.py @@ -123,7 +123,7 @@ def testCommittedType(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 410) - def testPostdTypeWithId(self): + def testPostTypeWithId(self): # Test creation/deletion of datatype obj print("testPostTypeWithId", self.base_domain) @@ -171,6 +171,43 @@ def testPostdTypeWithId(self): self.assertEqual(type_json["class"], "H5T_FLOAT") self.assertEqual(type_json["base"], "H5T_IEEE_F32LE") + def testPostWithAttributes(self): + # test POST with attribute initialization + print("testPostWithAttributes", 
self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # setup some attributes to include + attr_count = 4 + attributes = {} + extent = 10 + for i in range(attr_count): + value = [i * 10 + j for j in range(extent)] + data = {"type": "H5T_STD_I32LE", "shape": extent, "value": value} + attr_name = f"attr{i + 1:04d}" + attributes[attr_name] = data + + # create new datatype + link = {"id": root_uuid, "name": "linked_datatype"} + payload = {"type": "H5T_IEEE_F32LE", "attributes": attributes, "link": link} + req = helper.getEndpoint() + "/datatypes" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue(helper.validateId(rspJson["id"])) + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertEqual(type_json["base"], "H5T_IEEE_F32LE") + self.assertEqual(rspJson["attributeCount"], 4) + def testPostTypes(self): # Test creation with all primitive types diff --git a/tests/integ/group_test.py b/tests/integ/group_test.py index 6c154836..32234245 100755 --- a/tests/integ/group_test.py +++ b/tests/integ/group_test.py @@ -236,9 +236,9 @@ def testPost(self): rsp = self.session.post(req, headers=headers) self.assertEqual(rsp.status_code, 403) # forbidden - def testPostId(self): - # test POST group - print("testPostId", self.base_domain) + def testPostWithId(self): + # test POST group with a client-generated id + print("testPostWithId", self.base_domain) endpoint = helper.getEndpoint() headers = helper.getRequestHeaders(domain=self.base_domain) req = endpoint + "/groups" @@ -261,6 +261,7 @@ def testPostId(self): self.assertEqual(rsp.status_code, 201) rspJson = json.loads(rsp.text) + print("rspJson:", rspJson) self.assertEqual(rspJson["linkCount"], 0) self.assertEqual(rspJson["attributeCount"], 0) self.assertEqual(grp_id, rspJson["id"]) @@ -372,6 +373,39 @@ def testPostIdWithLink(self): self.assertEqual(rspJson["attributeCount"], 0) self.assertEqual(grp_id, rspJson["id"]) + def testPostWithAttributes(self): + # test POST with attribute initialization + print("testPostWithAttributes", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # setup some attributes to include + attr_count = 4 + attributes = {} + extent = 10 + for i in range(attr_count): + value = [i * 10 + j for j in range(extent)] + data = {"type": "H5T_STD_I32LE", "shape": extent, "value": value} + attr_name = f"attr{i + 1:04d}" + attributes[attr_name] = data + + # create new group + payload = {"attributes": attributes, "link": {"id": root_uuid, "name": "linked_group"}} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 4) + self.assertTrue(helper.validateId(rspJson["id"])) + def testPostWithPath(self): # test POST with 
implicit parent group creation print("testPostWithPath", self.base_domain) diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index a6f72aeb..7c909435 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -1767,7 +1767,6 @@ def testLinkCreationOrder(self): self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) links_json = rspJson["links"] - print("params:", params) # verify the links are in order for i in range(link_count - 1): From 00d7c962c2f0195fb82ceeb5cd1b69b529441225 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 7 May 2025 15:28:34 +0200 Subject: [PATCH 16/49] add create timestamps for attributes in obj create --- admin/config/config.yml | 1 + hsds/servicenode_lib.py | 14 ++++++++++++++ tests/integ/dataset_test.py | 14 +++++++++++++- tests/integ/datatype_test.py | 16 ++++++++++++++-- tests/integ/group_test.py | 16 ++++++++++++++-- 5 files changed, 56 insertions(+), 5 deletions(-) diff --git a/admin/config/config.yml b/admin/config/config.yml index 756be465..6e92d65b 100755 --- a/admin/config/config.yml +++ b/admin/config/config.yml @@ -88,6 +88,7 @@ allow_any_bucket_read: true # enable reads to buckets other than default bucket allow_any_bucket_write: true # enable writes to buckets other than default bucket bit_shuffle_default_blocksize: 2048 # default blocksize for bitshuffle filter max_rangeget_gap: 1024 # max gap in byte for intelligent range get requests +predate_maxtime: 10.0 # max delta between object created timestamp in request and actual time # DEPRECATED - the remaining config values are not used in currently but kept for backward compatibility with older container images aws_lambda_chunkread_function: null # name of aws lambda function for chunk reading aws_lambda_threshold: 4 # number of chunks per node per request to reach before using lambda diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 69b909dc..80fe7af1 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -15,6 +15,7 @@ import asyncio import json +import time import numpy as np from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPGone, HTTPConflict @@ -39,6 +40,7 @@ from .basenode import getVersion from . import hsds_logger as log +from . 
import config
 
 
 async def getDomainJson(app, domain, reload=False):
@@ -1010,6 +1012,18 @@ async def getAttributeFromRequest(app, req_json, obj_id=None, bucket=None):
     else:
         attr_item["value"] = None
 
+    now = time.time()
+    if "created" in req_json:
+        created = req_json["created"]
+        # allow "pre-dated" attributes if the timestamp is within the last 10 seconds
+        predate_max_time = config.get("predate_maxtime", default=10.0)
+        if now - created <= predate_max_time:
+            attr_item["created"] = created
+        else:
+            log.warn("stale created timestamp for attribute, ignoring")
+    if "created" not in attr_item:
+        attr_item["created"] = now
+
     return attr_item
 
 
diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py
index d0b60f1a..46726e60 100755
--- a/tests/integ/dataset_test.py
+++ b/tests/integ/dataset_test.py
@@ -333,7 +333,19 @@ def testPostDatasetWithAttributes(self):
         self.assertEqual(rsp.status_code, 201)
         rspJson = json.loads(rsp.text)
         self.assertEqual(rspJson["attributeCount"], 4)
-        self.assertTrue(helper.validateId(rspJson["id"]))
+        dset_id = rspJson["id"]
+        self.assertTrue(helper.validateId(dset_id))
+
+        # fetch the attributes
+        req = f"{helper.getEndpoint()}/datasets/{dset_id}/attributes"
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertFalse("type" in rspJson)
+        self.assertFalse("shape" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        self.assertEqual(len(rspJson["attributes"]), attr_count)
 
     def testScalarEmptyDimsDataset(self):
         # Test creation/deletion of scalar dataset obj
diff --git a/tests/integ/datatype_test.py b/tests/integ/datatype_test.py
index ce2a0e1a..f8f01bea 100755
--- a/tests/integ/datatype_test.py
+++ b/tests/integ/datatype_test.py
@@ -201,12 +201,24 @@ def testPostWithAttributes(self):
         rsp = self.session.post(req, data=json.dumps(payload), headers=headers)
         self.assertEqual(rsp.status_code, 201)
         rspJson = json.loads(rsp.text)
-        self.assertTrue(helper.validateId(rspJson["id"]))
+        ctype_id = rspJson["id"]
+        self.assertTrue(helper.validateId(ctype_id))
         self.assertTrue("type" in rspJson)
         type_json = rspJson["type"]
         self.assertEqual(type_json["class"], "H5T_FLOAT")
         self.assertEqual(type_json["base"], "H5T_IEEE_F32LE")
-        self.assertEqual(rspJson["attributeCount"], 4)
+        self.assertEqual(rspJson["attributeCount"], attr_count)
+
+        # fetch the attributes, check count
+        req = f"{helper.getEndpoint()}/datatypes/{ctype_id}/attributes"
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertFalse("type" in rspJson)
+        self.assertFalse("shape" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        self.assertEqual(len(rspJson["attributes"]), attr_count)
 
     def testPostTypes(self):
         # Test creation with all primitive types
diff --git a/tests/integ/group_test.py b/tests/integ/group_test.py
index 32234245..b78b8e58 100755
--- a/tests/integ/group_test.py
+++ b/tests/integ/group_test.py
@@ -403,8 +403,20 @@ def testPostWithAttributes(self):
         self.assertEqual(rsp.status_code, 201)
         rspJson = json.loads(rsp.text)
         self.assertEqual(rspJson["linkCount"], 0)
-        self.assertEqual(rspJson["attributeCount"], 4)
-        self.assertTrue(helper.validateId(rspJson["id"]))
+        self.assertEqual(rspJson["attributeCount"], attr_count)
+        grp_id = rspJson["id"]
+        self.assertTrue(helper.validateId(grp_id))
+
+        # fetch the attributes, check count
+        req = f"{helper.getEndpoint()}/groups/{grp_id}/attributes"
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("hrefs" in rspJson)
+        self.assertFalse("type" in rspJson)
+        self.assertFalse("shape" in rspJson)
+        self.assertTrue("attributes" in rspJson)
+        self.assertEqual(len(rspJson["attributes"]), attr_count)
 
     def testPostWithPath(self):
         # test POST with implicit parent group creation

From 47b9a6e55050231fd3f4f64e755fc3e6ac4de3fc Mon Sep 17 00:00:00 2001
From: John Readey
Date: Wed, 7 May 2025 18:46:32 +0200
Subject: [PATCH 17/49] enable links to be initialized in post groups

---
 hsds/group_dn.py          | 11 ++++--
 hsds/group_sn.py          | 72 +++++++++++++++++++++++++++++++--------
 hsds/link_sn.py           |  6 +++-
 hsds/servicenode_lib.py   | 19 ++++++++++-
 tests/integ/group_test.py | 60 +++++++++++++++++++++++++++++++-
 5 files changed, 148 insertions(+), 20 deletions(-)

diff --git a/hsds/group_dn.py b/hsds/group_dn.py
index dfce8f66..0a93bed4 100755
--- a/hsds/group_dn.py
+++ b/hsds/group_dn.py
@@ -132,12 +132,19 @@ async def POST_Group(request):
     else:
         attrs = {}
 
+    if "links" in body:
+        # initialize links
+        links = body["links"]
+        log.debug(f"POST Group with links: {links}")
+    else:
+        links = {}
+
     group_json = {
         "id": group_id,
         "root": root_id,
         "created": now,
         "lastModified": now,
-        "links": {},
+        "links": links,
         "attributes": attrs,
     }
 
@@ -153,7 +160,7 @@ async def POST_Group(request):
     resp_json["root"] = root_id
     resp_json["created"] = group_json["created"]
     resp_json["lastModified"] = group_json["lastModified"]
-    resp_json["linkCount"] = 0
+    resp_json["linkCount"] = len(links)
     resp_json["attributeCount"] = len(attrs)
 
     resp = json_response(resp_json, status=201)
diff --git a/hsds/group_sn.py b/hsds/group_sn.py
index e2395826..06874f5c 100755
--- a/hsds/group_sn.py
+++ b/hsds/group_sn.py
@@ -13,6 +13,8 @@
 # group handler for service node of hsds cluster
 #
 
+import time
+
 from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound
 from json import JSONDecodeError
 
@@ -23,11 +25,12 @@
 from .util.authUtil import validateUserPassword
 from .util.domainUtil import getDomainFromRequest, isValidDomain
 from .util.domainUtil import getBucketForDomain, getPathForDomain, verifyRoot
-from .util.linkUtil import validateLinkName
+from .util.linkUtil import validateLinkName, getLinkClass
 from .servicenode_lib import getDomainJson, getObjectJson, validateAction
 from .servicenode_lib import getObjectIdByPath, getPathForObjectId
 from .servicenode_lib import createObject, createObjectByPath, deleteObject
 from . import hsds_logger as log
+from . import config
 
 
 async def GET_Group(request):
@@ -189,6 +192,7 @@ async def POST_Group(request):
     h5path = None
     creation_props = None
     attrs = None
+    links = None
 
     if request.has_body:
         try:
@@ -236,28 +240,66 @@ async def POST_Group(request):
             creation_props = body["creationProperties"]
         if "attributes" in body:
             attrs = body["attributes"]
+            if not isinstance(attrs, dict):
+                msg = f"POST_Group expected dict for attributes, but got: {type(attrs)}"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
             log.debug(f"POST Group attributes: {attrs}")
+        if "links" in body:
+            links = body["links"]
+            if not isinstance(links, dict):
+                msg = f"POST_Group expected dict for links, but got: {type(links)}"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+            # validate the links
+            now = time.time()
+
+            for title in links:
+                try:
+                    validateLinkName(title)
+                    link_item = links[title]
+                    link_class = getLinkClass(link_item)
+                    if "class" in link_item:
+                        if link_class != link_item["class"]:
+                            msg = f"expected link class of: {link_class} but got {link_item}"
+                            log.warn(msg)
+                            raise HTTPBadRequest(reason=msg)
+                    else:
+                        link_item["class"] = link_class
+                        getLinkClass(link_item)
+                    if "created" in link_item:
+                        created = link_item["created"]
+                        # allow "pre-dated" links if recent enough
+                        predate_max_time = config.get("predate_maxtime", default=10.0)
+                        if now - created <= predate_max_time:
+                            link_item["created"] = created
+                        else:
+                            log.warn("stale created timestamp for link, ignoring")
+                    if "created" not in link_item:
+                        link_item["created"] = now
+
+                except ValueError:
+                    raise HTTPBadRequest(reason="invalid link item")
+
+    kwargs = {"bucket": bucket}
+    if obj_id:
+        kwargs["obj_id"] = obj_id
+    if creation_props:
+        kwargs["creation_props"] = creation_props
+    if attrs:
+        kwargs["attrs"] = attrs
+    if links:
+        kwargs["links"] = links
     if parent_id:
-        kwargs = {"bucket": bucket, "parent_id": parent_id, "h5path": h5path}
-        if obj_id:
-            kwargs["obj_id"] = obj_id
-        if creation_props:
-            kwargs["creation_props"] = creation_props
-        if attrs:
-            kwargs["attrs"] = attrs
+        kwargs["parent_id"] = parent_id
+        kwargs["h5path"] = h5path
         if implicit:
             kwargs["implicit"] = True
         group_json = await createObjectByPath(app, **kwargs)
     else:
         # create an anonymous group
-        kwargs = {"bucket": bucket, "root_id": root_id}
-        if obj_id:
-            kwargs["obj_id"] = obj_id
-        if creation_props:
-            kwargs["creation_props"] = creation_props
-        if attrs:
-            kwargs["attrs"] = attrs
+        kwargs["root_id"] = root_id
         group_json = await createObject(app, **kwargs)
 
     log.debug(f"returning resp: {group_json}")
diff --git a/hsds/link_sn.py b/hsds/link_sn.py
index dc80d9e4..bc7f79ee 100755
--- a/hsds/link_sn.py
+++ b/hsds/link_sn.py
@@ -13,7 +13,7 @@
 # service node of hsds cluster
 #
 
-from aiohttp.web_exceptions import HTTPBadRequest
+from aiohttp.web_exceptions import HTTPBadRequest, HTTPInternalServerError
 from json import JSONDecodeError
 
 from h5json.objid import isValidUuid, getCollectionForId
@@ -142,6 +142,10 @@ async def GET_Links(request):
 
     # mix in collection key, target and hrefs
     for link in links:
+        if "class" not in link:
+            log.error("expected to find class key in link")
+            raise HTTPInternalServerError()
+
         if link["class"] == "H5L_TYPE_HARD":
             collection_name = getCollectionForId(link["id"])
             link["collection"] = collection_name
diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py
index 80fe7af1..4247ab7a 100644
--- a/hsds/servicenode_lib.py
+++ b/hsds/servicenode_lib.py
@@ -1294,6 +1294,7 @@ async def createObject(app,
                        layout=None,
                        creation_props=None,
                        attrs=None,
+
links=None, bucket=None): """ create a group, ctype, or dataset object and return object json Determination on whether a group, ctype, or dataset is created is based on: @@ -1319,6 +1320,8 @@ async def createObject(app, log.debug(f" cprops: {creation_props}") if attrs: log.debug(f" attrs: {attrs}") + if links: + log.debug(f" links: {links}") if obj_id: log.debug(f"using client supplied id: {obj_id}") @@ -1347,8 +1350,13 @@ async def createObject(app, attrs_json = {"attributes": attrs} attr_items = await getAttributesFromRequest(app, attrs_json, **kwargs) log.debug(f"got attr_items: {attr_items}") - obj_json["attributes"] = attr_items + if links: + if collection != "groups": + msg = "links can only be used with groups" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + obj_json["links"] = links log.debug(f"create {collection} obj, body: {obj_json}") dn_url = getDataNodeUrl(app, obj_id) req = f"{dn_url}/{collection}" @@ -1368,6 +1376,7 @@ async def createObjectByPath(app, layout=None, creation_props=None, attrs=None, + links=None, bucket=None): """ create an object at the designated path relative to the parent. @@ -1394,6 +1403,12 @@ async def createObjectByPath(app, log.debug(f" cprops: {creation_props}") if attrs: log.debug(f" attrs: {attrs}") + if links: + log.debug(f" links: {links}") + if obj_type: + msg = "only group objects can have links" + log.warn(msg) + raise HTTPBadRequest(reason=msg) root_id = getRootObjId(parent_id) @@ -1474,6 +1489,8 @@ async def createObjectByPath(app, kwargs["creation_props"] = creation_props if attrs: kwargs["attrs"] = attrs + if links: + kwargs["links"] = links if obj_id: kwargs["obj_id"] = obj_id obj_json = await createObject(app, **kwargs) diff --git a/tests/integ/group_test.py b/tests/integ/group_test.py index b78b8e58..d2ac33b5 100755 --- a/tests/integ/group_test.py +++ b/tests/integ/group_test.py @@ -316,7 +316,7 @@ def testPostWithLink(self): self.assertEqual(rspJson["linkCount"], 0) self.assertEqual(rspJson["attributeCount"], 0) new_group_id = rspJson["id"] - self.assertTrue(helper.validateId(rspJson["id"])) + self.assertTrue(helper.validateId(new_group_id)) self.assertTrue(new_group_id != root_uuid) # get root group and verify link count is 1 @@ -418,6 +418,64 @@ def testPostWithAttributes(self): self.assertTrue("attributes") in rspJson self.assertEqual(len(rspJson["attributes"]), attr_count) + def testPostWithLinks(self): + # test POST with attribute initialization + print("testPostWithLinks", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # some objects to link + link_count = 4 + links = {} + req = helper.getEndpoint() + "/groups" + + for i in range(link_count): + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + group_id = rspJson["id"] + self.assertTrue(helper.validateId(group_id)) + links[f"obj_{i}"] = {"id": group_id} + + # create new group + payload = {"links": links, "link": {"id": root_uuid, "name": "g1"}} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], link_count) + self.assertEqual(rspJson["attributeCount"], 0) + grp_id 
= rspJson["id"] + helper.validateId(grp_id) + + # fetch all the links + req = helper.getEndpoint() + "/groups/" + grp_id + "/links" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + self.assertTrue("links" in rspJson) + links_rsp = rspJson["links"] + self.assertEqual(len(links_rsp), link_count) + for i in range(link_count): + link_rsp = links_rsp[i] + self.assertTrue("class" in link_rsp) + self.assertEqual(link_rsp["class"], "H5L_TYPE_HARD") + self.assertTrue("id" in link_rsp) + self.assertTrue("title" in link_rsp) + self.assertEqual(link_rsp["title"], f"obj_{i}") + self.assertTrue("collection" in link_rsp) + self.assertEqual(link_rsp["collection"], "groups") + self.assertTrue("target" in link_rsp) + self.assertTrue("href" in link_rsp) + def testPostWithPath(self): # test POST with implicit parent group creation print("testPostWithPath", self.base_domain) From d9c3e875d87a53c1c60ddff15c732db74a87099c Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 8 May 2025 20:46:51 +0200 Subject: [PATCH 18/49] support dataset value init in post request --- hsds/chunk_crawl.py | 1 + hsds/chunk_sn.py | 192 +---------------------------------- hsds/dset_lib.py | 195 +++++++++++++++++++++++++++++++++++- hsds/dset_sn.py | 51 +++++++++- hsds/servicenode_lib.py | 5 +- tests/integ/dataset_test.py | 1 - tests/integ/value_test.py | 61 +++++++++++ 7 files changed, 307 insertions(+), 199 deletions(-) diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index 960cdadf..47b4b114 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -84,6 +84,7 @@ async def write_chunk_hyperslab( msg = f"write_chunk_hyperslab, chunk_id: {chunk_id}, slices: {slices}, " msg += f"bucket: {bucket}" + msg += f" dset_json: {dset_json}" log.info(msg) if "layout" not in dset_json: log.error(f"No layout found in dset_json: {dset_json}") diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 4bb084b3..87f2fdb4 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -19,7 +19,6 @@ import numpy as np from json import JSONDecodeError -from asyncio import IncompleteReadError from aiohttp.web_exceptions import HTTPException, HTTPBadRequest from aiohttp.web_exceptions import HTTPRequestEntityTooLarge from aiohttp.web_exceptions import HTTPConflict, HTTPInternalServerError @@ -37,11 +36,9 @@ from .util.dsetUtil import isNullSpace, isScalarSpace, get_slices, getShapeDims from .util.dsetUtil import isExtensible, getSelectionPagination from .util.dsetUtil import getSelectionShape, getDsetMaxDims, getChunkLayout -from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .servicenode_lib import getDsetJson, validateAction -from .dset_lib import getSelectionData, getParser, extendShape -from .chunk_crawl import ChunkCrawler +from .dset_lib import getSelectionData, getParser, extendShape, doPointWrite, doHyperslabWrite from . import config from . 
import hsds_logger as log @@ -464,188 +461,6 @@ async def arrayResponse(arr, request, dset_json): return resp -async def _doPointWrite(app, - request, - points=None, - data=None, - dset_json=None, - bucket=None - ): - """ write the given points to the dataset """ - - num_points = len(points) - log.debug(f"doPointWrite - num_points: {num_points}") - dset_id = dset_json["id"] - layout = getChunkLayout(dset_json) - datashape = dset_json["shape"] - dims = getShapeDims(datashape) - rank = len(dims) - - chunk_dict = {} # chunk ids to list of points in chunk - - for pt_indx in range(num_points): - if rank == 1: - point = int(points[pt_indx]) - else: - point_tuple = points[pt_indx] - point = [] - for i in range(len(point_tuple)): - point.append(int(point_tuple[i])) - if rank == 1: - if point < 0 or point >= dims[0]: - msg = f"PUT Value point: {point} is not within the " - msg += "bounds of the dataset" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - if len(point) != rank: - msg = "PUT Value point value did not match dataset rank" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - for i in range(rank): - if point[i] < 0 or point[i] >= dims[i]: - msg = f"PUT Value point: {point} is not within the " - msg += "bounds of the dataset" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - chunk_id = getChunkId(dset_id, point, layout) - # get the pt_indx element from the input data - value = data[pt_indx] - if chunk_id not in chunk_dict: - point_list = [point, ] - point_data = [value, ] - chunk_dict[chunk_id] = {"indices": point_list, "points": point_data} - else: - item = chunk_dict[chunk_id] - point_list = item["indices"] - point_list.append(point) - point_data = item["points"] - point_data.append(value) - - num_chunks = len(chunk_dict) - log.debug(f"num_chunks: {num_chunks}") - max_chunks = int(config.get("max_chunks_per_request", default=1000)) - if num_chunks > max_chunks: - msg = f"PUT value request with more than {max_chunks} chunks" - log.warn(msg) - - chunk_ids = list(chunk_dict.keys()) - chunk_ids.sort() - - crawler = ChunkCrawler( - app, - chunk_ids, - dset_json=dset_json, - bucket=bucket, - points=chunk_dict, - action="write_point_sel", - ) - await crawler.crawl() - - crawler_status = crawler.get_status() - - if crawler_status not in (200, 201): - msg = f"doPointWritte raising HTTPInternalServerError for status: {crawler_status}" - log.error(msg) - raise HTTPInternalServerError() - else: - log.info("doPointWrite success") - - -async def _doHyperslabWrite(app, - request, - page_number=0, - page=None, - data=None, - dset_json=None, - select_dtype=None, - bucket=None - ): - """ write the given page selection to the dataset """ - dset_id = dset_json["id"] - log.info(f"_doHyperslabWrite on {dset_id} - page: {page_number}") - type_json = dset_json["type"] - - if select_dtype is not None: - item_size = getDtypeItemSize(select_dtype) - else: - item_size = getItemSize(type_json) - if item_size == "H5T_VARIABLE" and data is None: - msg = "unexpected call to _doHyperslabWrite for variable length data" - log.error(msg) - raise HTTPInternalServerError() - - layout = getChunkLayout(dset_json) - - num_chunks = getNumChunks(page, layout) - log.debug(f"num_chunks: {num_chunks}") - max_chunks = int(config.get("max_chunks_per_request", default=1000)) - if num_chunks > max_chunks: - msg = f"PUT value chunk count: {num_chunks} exceeds max_chunks: {max_chunks}" - log.warn(msg) - select_shape = getSelectionShape(page) - log.debug(f"got select_shape: {select_shape} for page: {page_number}") - - if data 
is None: - num_bytes = math.prod(select_shape) * item_size - log.debug(f"reading {num_bytes} from request stream") - # read page of data from input stream - try: - page_bytes = await request_read(request, count=num_bytes) - except HTTPRequestEntityTooLarge as tle: - msg = "Got HTTPRequestEntityTooLarge exception during " - msg += f"binary read: {tle}) for page: {page_number}" - log.warn(msg) - raise # re-throw - except IncompleteReadError as ire: - msg = "Got asyncio.IncompleteReadError during binary " - msg += f"read: {ire} for page: {page_number}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - log.debug(f"read {len(page_bytes)} for page: {page_number}") - try: - arr = bytesToArray(page_bytes, select_dtype, select_shape) - except ValueError as ve: - msg = f"bytesToArray value error for page: {page_number}: {ve}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - arr = data # use array provided to function - - try: - chunk_ids = getChunkIds(dset_id, page, layout) - except ValueError: - log.warn("getChunkIds failed") - raise HTTPInternalServerError() - if len(chunk_ids) < 10: - log.debug(f"chunk_ids: {chunk_ids}") - else: - log.debug(f"chunk_ids: {chunk_ids[:10]} ...") - if len(chunk_ids) > max_chunks: - msg = f"got {len(chunk_ids)} for page: {page_number}. max_chunks: {max_chunks}" - log.warn(msg) - - crawler = ChunkCrawler( - app, - chunk_ids, - dset_json=dset_json, - bucket=bucket, - slices=page, - arr=arr, - action="write_chunk_hyperslab", - ) - await crawler.crawl() - - crawler_status = crawler.get_status() - - if crawler_status not in (200, 201): - msg = f"crawler failed for page: {page_number} with status: {crawler_status}" - log.error(msg) - raise HTTPInternalServerError() - else: - log.info("crawler write_chunk_hyperslab successful") - - async def PUT_Value(request): """ Handler for PUT //value request @@ -940,13 +755,13 @@ async def PUT_Value(request): else: kwargs["data"] = None # do write for one page selection - await _doHyperslabWrite(app, request, **kwargs) + await doHyperslabWrite(app, request, **kwargs) else: # # Do point put # kwargs = {"points": points, "data": arr, "dset_json": dset_json, "bucket": bucket} - await _doPointWrite(app, request, **kwargs) + await doPointWrite(app, request, **kwargs) # write successful @@ -1089,7 +904,6 @@ async def GET_Value(request): arr = None # will be set based on returned data if stream_pagination: - # example # get binary data a page at a time and write back to response if item_size == "H5T_VARIABLE": page_item_size = VARIABLE_AVG_ITEM_SIZE # random guess of avg item_size diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 689c2c7e..c40e2c39 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -11,25 +11,28 @@ ############################################################################## import asyncio +from asyncio import IncompleteReadError + import math import numpy as np from aiohttp.client_exceptions import ClientError -from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict, HTTPInternalServerError +from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict +from aiohttp.web_exceptions import HTTPInternalServerError, HTTPRequestEntityTooLarge -from h5json.hdf5dtype import createDataType, getItemSize -from h5json.array_util import getNumpyValue +from h5json.hdf5dtype import createDataType, getItemSize, getDtypeItemSize +from h5json.array_util import getNumpyValue, bytesToArray from h5json.objid import isSchema2Id, getS3Key, getObjId from .util.nodeUtil import getDataNodeUrl from .util.boolparser 
import BooleanParser from .util.dsetUtil import isNullSpace, getDatasetLayout, getDatasetLayoutClass, get_slices -from .util.dsetUtil import getChunkLayout, getSelectionShape, getShapeDims +from .util.dsetUtil import getShapeDims, getSelectionShape, getChunkLayout from .util.chunkUtil import getChunkCoordinate, getChunkIndex, getChunkSuffix from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getQueryDtype, get_chunktable_dims -from .util.httpUtil import http_delete, http_put +from .util.httpUtil import http_delete, http_put, request_read from .util.rangegetUtil import getHyperChunkFactors from .util.storUtil import getStorKeys @@ -1056,3 +1059,185 @@ async def deleteAllChunks(app, dset_id, bucket=None): await removeChunks(app, chunk_ids, bucket=bucket) else: log.info(f"deleteAllChunks for {dset_id} - no chunks need deletion") + + +async def doPointWrite(app, + request, + points=None, + data=None, + dset_json=None, + bucket=None + ): + """ write the given points to the dataset """ + + num_points = len(points) + log.debug(f"doPointWrite - num_points: {num_points}") + dset_id = dset_json["id"] + layout = getChunkLayout(dset_json) + datashape = dset_json["shape"] + dims = getShapeDims(datashape) + rank = len(dims) + + chunk_dict = {} # chunk ids to list of points in chunk + + for pt_indx in range(num_points): + if rank == 1: + point = int(points[pt_indx]) + else: + point_tuple = points[pt_indx] + point = [] + for i in range(len(point_tuple)): + point.append(int(point_tuple[i])) + if rank == 1: + if point < 0 or point >= dims[0]: + msg = f"PUT Value point: {point} is not within the " + msg += "bounds of the dataset" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + if len(point) != rank: + msg = "PUT Value point value did not match dataset rank" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + for i in range(rank): + if point[i] < 0 or point[i] >= dims[i]: + msg = f"PUT Value point: {point} is not within the " + msg += "bounds of the dataset" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + chunk_id = getChunkId(dset_id, point, layout) + # get the pt_indx element from the input data + value = data[pt_indx] + if chunk_id not in chunk_dict: + point_list = [point, ] + point_data = [value, ] + chunk_dict[chunk_id] = {"indices": point_list, "points": point_data} + else: + item = chunk_dict[chunk_id] + point_list = item["indices"] + point_list.append(point) + point_data = item["points"] + point_data.append(value) + + num_chunks = len(chunk_dict) + log.debug(f"num_chunks: {num_chunks}") + max_chunks = int(config.get("max_chunks_per_request", default=1000)) + if num_chunks > max_chunks: + msg = f"PUT value request with more than {max_chunks} chunks" + log.warn(msg) + + chunk_ids = list(chunk_dict.keys()) + chunk_ids.sort() + + crawler = ChunkCrawler( + app, + chunk_ids, + dset_json=dset_json, + bucket=bucket, + points=chunk_dict, + action="write_point_sel", + ) + await crawler.crawl() + + crawler_status = crawler.get_status() + + if crawler_status not in (200, 201): + msg = f"doPointWritte raising HTTPInternalServerError for status: {crawler_status}" + log.error(msg) + raise HTTPInternalServerError() + else: + log.info("doPointWrite success") + + +async def doHyperslabWrite(app, + request, + page_number=0, + page=None, + data=None, + dset_json=None, + select_dtype=None, + bucket=None + ): + """ write the given page selection to the dataset """ + dset_id = dset_json["id"] + 
log.info(f"doHyperslabWrite on {dset_id} - page: {page_number} dset_json: {dset_json}") + type_json = dset_json["type"] + + if select_dtype is not None: + item_size = getDtypeItemSize(select_dtype) + else: + item_size = getItemSize(type_json) + if item_size == "H5T_VARIABLE" and data is None: + msg = "unexpected call to doHyperslabWrite for variable length data" + log.error(msg) + raise HTTPInternalServerError() + + layout = getChunkLayout(dset_json) + + num_chunks = getNumChunks(page, layout) + log.debug(f"num_chunks: {num_chunks}") + max_chunks = int(config.get("max_chunks_per_request", default=1000)) + if num_chunks > max_chunks: + msg = f"PUT value chunk count: {num_chunks} exceeds max_chunks: {max_chunks}" + log.warn(msg) + select_shape = getSelectionShape(page) + log.debug(f"got select_shape: {select_shape} for page: {page_number}") + + if data is None: + num_bytes = math.prod(select_shape) * item_size + log.debug(f"reading {num_bytes} from request stream") + # read page of data from input stream + try: + page_bytes = await request_read(request, count=num_bytes) + except HTTPRequestEntityTooLarge as tle: + msg = "Got HTTPRequestEntityTooLarge exception during " + msg += f"binary read: {tle}) for page: {page_number}" + log.warn(msg) + raise # re-throw + except IncompleteReadError as ire: + msg = "Got asyncio.IncompleteReadError during binary " + msg += f"read: {ire} for page: {page_number}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"read {len(page_bytes)} for page: {page_number}") + try: + arr = bytesToArray(page_bytes, select_dtype, select_shape) + except ValueError as ve: + msg = f"bytesToArray value error for page: {page_number}: {ve}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + arr = data # use array provided to function + + try: + chunk_ids = getChunkIds(dset_id, page, layout) + except ValueError: + log.warn("getChunkIds failed") + raise HTTPInternalServerError() + if len(chunk_ids) < 10: + log.debug(f"chunk_ids: {chunk_ids}") + else: + log.debug(f"chunk_ids: {chunk_ids[:10]} ...") + if len(chunk_ids) > max_chunks: + msg = f"got {len(chunk_ids)} for page: {page_number}. 
max_chunks: {max_chunks}" + log.warn(msg) + + crawler = ChunkCrawler( + app, + chunk_ids, + dset_json=dset_json, + bucket=bucket, + slices=page, + arr=arr, + action="write_chunk_hyperslab", + ) + await crawler.crawl() + + crawler_status = crawler.get_status() + + if crawler_status not in (200, 201): + msg = f"crawler failed for page: {page_number} with status: {crawler_status}" + log.error(msg) + raise HTTPInternalServerError() + else: + log.info("crawler write_chunk_hyperslab successful") diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index c6c5e502..7d50dbef 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -19,7 +19,7 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound from h5json.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson, getItemSize -from h5json.array_util import getNumElements, getNumpyValue +from h5json.array_util import getNumElements, getNumpyValue, jsonToArray from h5json.objid import isValidUuid, isSchema2Id from .util.httpUtil import getHref, respJsonAssemble @@ -36,7 +36,7 @@ from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo from .servicenode_lib import createObject, createObjectByPath, deleteObject -from .dset_lib import updateShape, deleteAllChunks +from .dset_lib import updateShape, deleteAllChunks, doHyperslabWrite from . import config from . import hsds_logger as log @@ -764,7 +764,7 @@ async def POST_Dataset(request): elif shape == "H5S_SCALAR": shape_json["class"] = "H5S_SCALAR" else: - msg = "POST Datset with invalid shape value" + msg = "POST Dataset with invalid shape value" log.warn(msg) raise HTTPBadRequest(reason=msg) elif isinstance(shape, list): @@ -847,6 +847,30 @@ async def POST_Dataset(request): else: shape_json["maxdims"].append(maxextent) + if "value" in body and body["value"]: + # data to initialize dataset included in request + input_data = body["value"] + msg = "input data doesn't match request type and shape" + dims = getShapeDims(shape_json) + if not dims: + log.warn(msg) + raise HTTPBadRequest(reason=msg) + arr_dtype = createDataType(datatype) + try: + input_arr = jsonToArray(dims, arr_dtype, input_data) + except ValueError: + log.warn(f"ValueError: {msg}") + raise HTTPBadRequest(reason=msg) + except TypeError: + log.warn(f"TypeError: {msg}") + raise HTTPBadRequest(reason=msg) + except IndexError: + log.warn(f"IndexError: {msg}") + raise HTTPBadRequest(reason=msg) + log.debug(f"got json arr: {input_arr.shape}") + else: + input_arr = None + layout_props = None min_chunk_size = int(config.get("min_chunk_size")) max_chunk_size = int(config.get("max_chunk_size")) @@ -1168,6 +1192,27 @@ async def POST_Dataset(request): kwargs["root_id"] = root_id dset_json = await createObject(app, **kwargs) + # write data if provided + if input_arr: + log.debug(f"write input_arr: {input_arr}") + # mixin the layout + dset_json["layout"] = layout + # make selection for entire dataspace + dims = getShapeDims(shape_json) + slices = [] + for dim in dims: + s = slice(0, dim, 1) + slices.append(s) + # make a one page list to handle the write in one chunk crawler run + # (larger write request should user binary streaming) + kwargs = {"page_number": 0, "page": slices} + kwargs["dset_json"] = dset_json + kwargs["bucket"] = bucket + kwargs["select_dtype"] = input_arr.dtype + kwargs["data"] = input_arr + # do write + await doHyperslabWrite(app, request, **kwargs) + # dataset creation successful resp = await 
jsonResponse(request, dset_json, status=201) log.response(request, resp=resp) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 4247ab7a..baba0f27 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -20,13 +20,15 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPGone, HTTPConflict from aiohttp.web_exceptions import HTTPNotFound, HTTPInternalServerError + from aiohttp.client_exceptions import ClientOSError, ClientError from aiohttp import ClientResponseError from h5json.array_util import encodeData, decodeData, bytesToArray, bytesArrayToList, jsonToArray from h5json.objid import getCollectionForId, createObjId, getRootObjId from h5json.objid import isSchema2Id, getS3Key, isValidUuid -from h5json.hdf5dtype import getBaseTypeJson, validateTypeItem, createDataType, getItemSize +from h5json.hdf5dtype import getBaseTypeJson, validateTypeItem, createDataType +from h5json.hdf5dtype import getItemSize from .util.nodeUtil import getDataNodeUrl from .util.authUtil import getAclKeys @@ -37,6 +39,7 @@ from .util.domainUtil import getBucketForDomain, verifyRoot, getLimits from .util.storUtil import getCompressors from .util.dsetUtil import getShapeDims + from .basenode import getVersion from . import hsds_logger as log diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index 46726e60..3d4610a4 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -309,7 +309,6 @@ def testPostDatasetWithAttributes(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - print("rspJson:", rspJson) root_uuid = rspJson["root"] helper.validateId(root_uuid) diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index 66287caf..2eca94bb 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -951,6 +951,67 @@ def testPutScalarDataset(self): self.assertTrue("value" in rspJson) self.assertEqual(rspJson["value"], "Hello, world") + def testScalarDatasetInitData(self): + # Test creation/deletion of scalar dataset obj along with initial data + print("testScalarDatasetInitData", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create a dataset obj + data = {"type": "H5T_STD_I32LE", "shape": "H5S_SCALAR", "value": 42} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["attributeCount"], 0) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # read back the obj + req = self.endpoint + "/datasets/" + dset_id + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + expected_keys = [ + "id", + "shape", + "hrefs", + "layout", + "creationProperties", + "attributeCount", + "created", + "lastModified", + "root", + "domain", + ] + + for name in expected_keys: + self.assertTrue(name in rspJson) + self.assertEqual(rspJson["id"], dset_id) + self.assertEqual(rspJson["root"], root_uuid) + self.assertEqual(rspJson["domain"], self.base_domain) + self.assertEqual(rspJson["attributeCount"], 0) + shape_json = rspJson["shape"] + 
self.assertTrue(shape_json["class"], "H5S_SCALAR") + self.assertTrue(rspJson["type"], "H5T_STD_I32LE") + + # read the data back + req += "/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], 42) + def testNullSpaceDataset(self): # Test attempted read/write to null space dataset print("testNullSpaceDataset", self.base_domain) From 4ab24fc6837158ca9eecfb14c3f5c63e391f6190 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 9 May 2025 12:05:00 +0200 Subject: [PATCH 19/49] add compound init value test --- tests/integ/attr_test.py | 1 - tests/integ/value_test.py | 90 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index b9f4dd7e..de54c5ea 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -915,7 +915,6 @@ def testPutCommittedType(self): value.append(i * 0.5) payload = {"type": dtype_uuid, "shape": 10, "value": value} req = self.endpoint + "/groups/" + root_id + "/attributes/" + attr_name - print("req:", req) rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create attribute diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index 2eca94bb..dd4ef4f7 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -1273,6 +1273,96 @@ def testPutCompound(self): self.assertEqual(len(item), 1) self.assertEqual(item[0], i * 10) + def testPutCompoundInitData(self): + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + str_type = { + "charSet": "H5T_CSET_ASCII", + "class": "H5T_STRING", + "strPad": "H5T_STR_NULLPAD", + "length": 1, + } + + fields = ( + {"name": "temp", "type": "H5T_STD_I32LE"}, + {"name": "unit", "type": str_type}, + ) + datatype = {"class": "H5T_COMPOUND", "fields": fields} + + # + # create compound scalar dataset + # + value = (42, 'F') + payload = {"type": datatype} # , "value": value} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # create dataset + + rspJson = json.loads(rsp.text) + dset0d_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset0d_uuid)) + + # verify the shape of the dataset + req = self.endpoint + "/datasets/" + dset0d_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) # get dataset + rspJson = json.loads(rsp.text) + shape = rspJson["shape"] + self.assertEqual(shape["class"], "H5S_SCALAR") + + # write entire array + payload = {"value": value} + req = self.endpoint + "/datasets/" + dset0d_uuid + "/value" + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 200) # write value + + # read back the value + req = self.endpoint + "/datasets/" + dset0d_uuid + "/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + + # + # create 1d dataset + # + num_elements = 10 + payload = {"type": 
datatype, "shape": num_elements} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # create dataset + + rspJson = json.loads(rsp.text) + dset1d_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset1d_uuid)) + + # link new dataset as 'dset1' + name = "dset1d" + helper.getRandomName() + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset1d_uuid} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # write entire array + value = [] + for i in range(num_elements): + item = (i * 10, 'F') + value.append(item) + payload = {"value": value} + + req = self.endpoint + "/datasets/" + dset1d_uuid + "/value" + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 200) # write value + def testSimpleTypeFillValue(self): # test Dataset with simple type and fill value print("testSimpleTypeFillValue", self.base_domain) From fc3ad689abb54ea19787247d2be256aa516f1a2c Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 9 May 2025 14:52:11 +0200 Subject: [PATCH 20/49] added post data with compound data initializer --- hsds/dset_sn.py | 3 ++- hsds/servicenode_lib.py | 4 +++- tests/integ/value_test.py | 45 +++++++++++++++++---------------------- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 7d50dbef..9f0593e6 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -856,6 +856,7 @@ async def POST_Dataset(request): log.warn(msg) raise HTTPBadRequest(reason=msg) arr_dtype = createDataType(datatype) + try: input_arr = jsonToArray(dims, arr_dtype, input_data) except ValueError: @@ -1193,7 +1194,7 @@ async def POST_Dataset(request): dset_json = await createObject(app, **kwargs) # write data if provided - if input_arr: + if input_arr is not None: log.debug(f"write input_arr: {input_arr}") # mixin the layout dset_json["layout"] = layout diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index baba0f27..de1253ee 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -1070,7 +1070,9 @@ def getValueFromRequest(body, data_type, data_shape): else: np_dims = dims - if body.get("encoding"): + if "encoding" in body: + encoding = body["encoding"] + log.debug(f"using encoding: {encoding}") item_size = getItemSize(data_type) if item_size == "H5T_VARIABLE": msg = "base64 encoding is not support for variable length attributes" diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index dd4ef4f7..c9e88afb 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -1288,7 +1288,7 @@ def testPutCompoundInitData(self): "charSet": "H5T_CSET_ASCII", "class": "H5T_STRING", "strPad": "H5T_STR_NULLPAD", - "length": 1, + "length": 5, } fields = ( @@ -1300,8 +1300,8 @@ def testPutCompoundInitData(self): # # create compound scalar dataset # - value = (42, 'F') - payload = {"type": datatype} # , "value": value} + value = (42, 'C') + payload = {"type": datatype, "value": value} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -1318,12 +1318,6 @@ def testPutCompoundInitData(self): shape = rspJson["shape"] self.assertEqual(shape["class"], "H5S_SCALAR") - # write entire array - payload = {"value": value} - req = self.endpoint + "/datasets/" + dset0d_uuid + "/value" - 
rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 200) # write value - # read back the value req = self.endpoint + "/datasets/" + dset0d_uuid + "/value" rsp = self.session.get(req, headers=headers) @@ -1331,12 +1325,19 @@ def testPutCompoundInitData(self): rspJson = json.loads(rsp.text) self.assertTrue("hrefs" in rspJson) self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], [42, 'C']) # # create 1d dataset # + + # make up some data num_elements = 10 - payload = {"type": datatype, "shape": num_elements} + value = [] + for i in range(num_elements): + item = (i * 10, chr(ord('A') + i)) + value.append(item) + payload = {"type": datatype, "shape": num_elements, "value": value} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -1345,23 +1346,15 @@ def testPutCompoundInitData(self): dset1d_uuid = rspJson["id"] self.assertTrue(helper.validateId(dset1d_uuid)) - # link new dataset as 'dset1' - name = "dset1d" + helper.getRandomName() - req = self.endpoint + "/groups/" + root_uuid + "/links/" + name - payload = {"id": dset1d_uuid} - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) - - # write entire array - value = [] - for i in range(num_elements): - item = (i * 10, 'F') - value.append(item) - payload = {"value": value} - + # read back the value req = self.endpoint + "/datasets/" + dset1d_uuid + "/value" - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 200) # write value + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(len(rspJson["value"]), num_elements) + self.assertEqual(rspJson["value"][2], [20, 'C']) def testSimpleTypeFillValue(self): # test Dataset with simple type and fill value From 8a1894558d969017dda50e719f3d3424964e58c5 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 15 May 2025 15:01:30 +0200 Subject: [PATCH 21/49] add post_crawler class --- hsds/domain_crawl.py | 2 +- hsds/dset_lib.py | 2 +- hsds/group_sn.py | 259 ++++++++++++++++++++++---------------- hsds/link_sn.py | 10 +- hsds/post_crawl.py | 213 +++++++++++++++++++++++++++++++ hsds/servicenode_lib.py | 1 - hsds/util/linkUtil.py | 78 ++++++++++++ tests/integ/group_test.py | 94 ++++++++++++-- 8 files changed, 534 insertions(+), 125 deletions(-) create mode 100644 hsds/post_crawl.py diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 656b04e6..0b707329 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -466,7 +466,7 @@ async def crawl(self): pass # ok elif status == 400: log.warn("DomainCrawler - BadRequest") - raise HTTPBadRequest(reason="unkown") + raise HTTPBadRequest(reason="unknown") elif status == 404: log.warn("DomainCrawler - not found") raise HTTPNotFound() diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index c40e2c39..fc1d3626 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -1142,7 +1142,7 @@ async def doPointWrite(app, crawler_status = crawler.get_status() if crawler_status not in (200, 201): - msg = f"doPointWritte raising HTTPInternalServerError for status: {crawler_status}" + msg = f"doPointWrite raising HTTPInternalServerError for status: {crawler_status}" log.error(msg) raise HTTPInternalServerError() else: 
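
Note: with this change POST /groups accepts either a single JSON object or a
list of per-group items (see the reworked handler below). A rough client-side
sketch of the list form; the endpoint, headers, and root_uuid values here are
placeholders, not part of this patch:

    import json
    import requests

    endpoint = "http://localhost:5101"  # placeholder HSDS endpoint
    headers = {}  # domain/auth headers omitted for brevity
    root_uuid = "g-00000000-0000-0000-0000-000000000000"  # placeholder root id

    payload = [
        {},  # anonymous group
        {"link": {"id": root_uuid, "name": "g1"}},  # hard-linked under root
    ]
    rsp = requests.post(endpoint + "/groups", data=json.dumps(payload), headers=headers)
    assert rsp.status_code == 201
    objects = rsp.json()["objects"]  # one response item per request item, in order

Each entry in "objects" mirrors the single-group response body (id, linkCount,
attributeCount, ...); testPostMulti below exercises exactly this flow.
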
diff --git a/hsds/group_sn.py b/hsds/group_sn.py
index 06874f5c..e8dd9325 100755
--- a/hsds/group_sn.py
+++ b/hsds/group_sn.py
@@ -13,8 +13,6 @@
 # group handler for service node of hsds cluster
 #
 
-import time
-
 from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound
 from json import JSONDecodeError
 
@@ -25,11 +23,12 @@
 from .util.authUtil import validateUserPassword
 from .util.domainUtil import getDomainFromRequest, isValidDomain
 from .util.domainUtil import getBucketForDomain, getPathForDomain, verifyRoot
-from .util.linkUtil import validateLinkName, getLinkClass
+from .util.linkUtil import validateLinkName, getRequestLinks
 from .servicenode_lib import getDomainJson, getObjectJson, validateAction
 from .servicenode_lib import getObjectIdByPath, getPathForObjectId
 from .servicenode_lib import createObject, createObjectByPath, deleteObject
 from . import hsds_logger as log
+from .post_crawl import createObjects
 from . import config
 
 
@@ -159,6 +158,114 @@ async def GET_Group(request):
     return resp
 
 
+async def _create_group(app, **kwargs):
+    """ helper method for group creation """
+
+    if kwargs.get("parent_id") and kwargs.get("h5path"):
+        group_json = await createObjectByPath(app, **kwargs)
+    else:
+        # create an anonymous group
+        log.debug(f"_create_group - kwargs: {kwargs}")
+        group_json = await createObject(app, **kwargs)
+
+    return group_json
+
+
+def _get_create_args(body, root_id=None, bucket=None, implicit=False):
+    """ get query args for _create_group from request body """
+    kwargs = {"bucket": bucket}
+    predate_max_time = config.get("predate_max_time", default=10.0)
+
+    parent_id = None
+    link_title = None
+    obj_id = None
+    h5path = None
+
+    if "link" in body:
+        if "h5path" in body:
+            msg = "link can't be used with h5path"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+        link_body = body["link"]
+        log.debug(f"link_body: {link_body}")
+        if "id" in link_body:
+            parent_id = link_body["id"]
+        if "name" in link_body:
+            link_title = link_body["name"]
+            try:
+                # will throw exception if there's a slash in the name
+                validateLinkName(link_title)
+            except ValueError:
+                msg = f"invalid link title: {link_title}"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+
+        if parent_id and link_title:
+            log.debug(f"parent id: {parent_id}, link_title: {link_title}")
+            h5path = link_title  # just use the link name as the h5path
+
+    if "parent_id" not in body:
+        parent_id = root_id
+    else:
+        parent_id = body["parent_id"]
+
+    if "h5path" in body:
+        h5path = body["h5path"]
+        # normalize the h5path
+        if h5path.startswith("/"):
+            if parent_id == root_id:
+                # just adjust the path to be relative
+                h5path = h5path[1:]
+            else:
+                msg = f"PostCrawler expecting relative h5path, but got: {h5path}"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+
+        if h5path.endswith("/"):
+            h5path = h5path[:-1]  # makes iterating through the links a bit easier
+
+    if parent_id and h5path:
+        # these are used by createObjectByPath
+        kwargs["parent_id"] = parent_id
+        kwargs["implicit"] = implicit
+        kwargs["h5path"] = h5path
+    else:
+        kwargs["root_id"] = root_id
+
+    if "id" in body:
+        obj_id = body["id"]
+        # tbd: validate this is a group id
+        kwargs["obj_id"] = obj_id
+        log.debug(f"POST group using client id: {obj_id}")
+
+    if "creationProperties" in body:
+        creation_props = body["creationProperties"]
+        # tbd: validate creation_props
+        kwargs["creation_props"] = creation_props
+
+    if "attributes" in body:
+        attrs = body["attributes"]
+        if not isinstance(attrs, dict):
+            msg = f"POST_Groups expected dict for attributes, but got: 
{type(attrs)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"POST Group attributes: {attrs}") + + # tbd: validate attributes + kwargs["attrs"] = attrs + if "links" in body: + body_links = body["links"] + log.debug(f"got links for new group: {body_links}") + try: + links = getRequestLinks(body["links"], predate_max_time=predate_max_time) + except ValueError: + msg = "invalid link item sent in request" + raise HTTPBadRequest(reason=msg) + log.debug(f"adding links to group POST request: {links}") + kwargs["links"] = links + + return kwargs + + async def POST_Group(request): """HTTP method to create new Group object""" log.request(request) @@ -177,6 +284,7 @@ async def POST_Group(request): bucket = getBucketForDomain(domain) domain_json = await getDomainJson(app, domain, reload=True) + log.debug(f"got domain_json: {domain_json}") # throws exception if not allowed aclCheck(app, domain_json, "create", username) @@ -186,14 +294,8 @@ async def POST_Group(request): # allow parent group creation or not implicit = getBooleanParam(params, "implicit") - - parent_id = None - obj_id = None - h5path = None - creation_props = None - attrs = None - links = None - + kwargs = {} + post_group_rsp = None if request.has_body: try: body = await request.json() @@ -204,107 +306,48 @@ async def POST_Group(request): log.info(f"POST Group body: {body}") if body: - if "link" in body: - if "h5path" in body: - msg = "link can't be used with h5path" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - link_body = body["link"] - log.debug(f"link_body: {link_body}") - if "id" in link_body: - parent_id = link_body["id"] - if "name" in link_body: - link_title = link_body["name"] - try: - # will throw exception if there's a slash in the name - validateLinkName(link_title) - except ValueError: - msg = f"invalid link title: {link_title}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if parent_id and link_title: - log.debug(f"parent id: {parent_id}, link_title: {link_title}") - h5path = link_title # just use the link name as the h5path - - if "h5path" in body: - h5path = body["h5path"] - if "parent_id" not in body: - parent_id = root_id + if isinstance(body, list): + count = len(body) + log.debug(f"multiple group create: {count} items") + if count == 0: + # equivalent to no body, anonymous group case + kwargs = {"root_id": root_id, "bucket": bucket} + elif count == 1: + # just create one object in typical way + kwargs = _get_create_args(body[0], + root_id=root_id, + bucket=bucket, + implicit=implicit) else: - parent_id = body["parent_id"] - if "id" in body: - obj_id = body["id"] - log.debug(f"POST group using client id: {obj_id}") - if "creationProperties" in body: - creation_props = body["creationProperties"] - if "attributes" in body: - attrs = body["attributes"] - if not isinstance(attrs, dict): - msg = f"POST_Groups expected dict for for links, but got: {type(links)}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - log.debug(f"POST Group attributes: {attrs}") - if "links" in body: - links = body["links"] - if not isinstance(links, dict): - msg = f"POST_Groups expected dict for for links, but got: {type(links)}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - # validate the links - now = time.time() - - for title in links: - try: - validateLinkName(title) - link_item = links[title] - link_class = getLinkClass(link_item) - if "class" in link_item: - if link_class != link_item["class"]: - msg = f"expected link class of: {link_class} but got {link_item}" - log.warn(msg) - raise 
HTTPBadRequest(reason=msg) - else: - link_item["class"] = link_class - getLinkClass(link_item) - if "created" in link_item: - created = link_item["created"] - # allow "pre-dated" attributes if recent enough - predate_max_time = config.get("predate_max_time", default=10.0) - if now - created > predate_max_time: - link_item["created"] = created - else: - log.warn("stale created timestamp for link, ignoring") - if "created" not in link_item: - link_item["created"] = now - - except ValueError: - raise HTTPBadRequest(reason="invalid link item") - - kwargs = {"bucket": bucket} - if obj_id: - kwargs["obj_id"] = obj_id - if creation_props: - kwargs["creation_props"] = creation_props - if attrs: - kwargs["attrs"] = attrs - if links: - kwargs["links"] = links - - if parent_id: - kwargs["parent_id"] = parent_id - kwargs["h5path"] = h5path - if implicit: - kwargs["implicit"] = True - group_json = await createObjectByPath(app, **kwargs) + # create multiple group objects + kwarg_list = [] # list of kwargs for each object + + for item in body: + log.debug(f"item: {item}") + if not isinstance(item, dict): + msg = f"PostGroup - invalid item type: {type(item)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs = _get_create_args(item, root_id=root_id, bucket=bucket) + kwarg_list.append(kwargs) + kwargs = {"bucket": bucket, "root_id": root_id} + post_group_rsp = await createObjects(app, kwarg_list, **kwargs) + else: + kwargs = _get_create_args(body, root_id=root_id, bucket=bucket, implicit=implicit) + else: + kwargs["root_id"] = root_id + kwargs["bucket"] = bucket else: - # create an anonymous group - kwargs["root_id"] = root_id - group_json = await createObject(app, **kwargs) + kwargs = {"root_id": root_id, "bucket": bucket} + + if post_group_rsp is None: + # Handle cases other than multi-group create here + log.debug(f"_create_group - kwargs: {kwargs}") + post_group_rsp = await _create_group(app, **kwargs) - log.debug(f"returning resp: {group_json}") + log.debug(f"returning resp: {post_group_rsp}") # group creation successful - resp = await jsonResponse(request, group_json, status=201) + resp = await jsonResponse(request, post_group_rsp, status=201) log.response(request, resp=resp) return resp diff --git a/hsds/link_sn.py b/hsds/link_sn.py index bc7f79ee..66e3a698 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -142,11 +142,15 @@ async def GET_Links(request): # mix in collection key, target and hrefs for link in links: - if "class" not in link: - log.error("expected to find class key in link") - raise HTTPInternalServerError() + for key in ("class", "title"): + if key not in link: + log.error(f"expected to find {key} key in link") + raise HTTPInternalServerError() if link["class"] == "H5L_TYPE_HARD": + if "id" not in link: + log.error("expected to id key in hard link") + raise HTTPInternalServerError() collection_name = getCollectionForId(link["id"]) link["collection"] = collection_name target_uri = "/" + collection_name + "/" + link["id"] diff --git a/hsds/post_crawl.py b/hsds/post_crawl.py new file mode 100644 index 00000000..057cd96d --- /dev/null +++ b/hsds/post_crawl.py @@ -0,0 +1,213 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. 
The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# post crawler +# + +import asyncio + +from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPConflict, HTTPBadRequest +from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone + +from .util.httpUtil import isOK +from .servicenode_lib import createObject, createObjectByPath +from . import hsds_logger as log + + +class PostCrawler: + def __init__( + self, + app, + items=None, + root_id=None, + bucket=None, + max_tasks=40, + ignore_error=False + ): + log.info("PostCrawler.__init__") + self._app = app + self._root_id = root_id + self._bucket = bucket + self._max_tasks = max_tasks + self._ignore_error = ignore_error + + if not items: + log.error("no post requests for crawler to crawl!") + raise ValueError() + if not bucket: + log.error("bucket not set for PostCrawler") + raise ValueError() + self._count = len(items) + self._items = items + self._rsp_objs = [None,] * self._count + self._q = asyncio.Queue() + log.debug(f"PostCrawler adding index 0 - {self._count} to queue") + for i in range(self._count): + self._q.put_nowait(i) + + def get_rsp_objs(self): + """ return list of object responses """ + + return self._rsp_objs + + def get_status(self): + """ return the highest status of any of the returned objects """ + status = None + for i in range(self._count): + item = self._rsp_objs[i] + if not item: + continue # resp not filled in yet + if "status" in item: + item_status = item["status"] + if status is None or item_status > status: + # return the more severe error + log.debug(f"setting status to {item_status}") + status = item_status + elif "id" in item: + # post request succeeded + if status is None: + status = 201 + else: + log.error(f"PostCrawler unexpected response for item {i}: {item}") + status = 500 + + return status + + async def crawl(self): + max_tasks = min(self._max_tasks, self._count) + workers = [asyncio.Task(self.work()) for _ in range(max_tasks)] + # When all work is done, exit. 
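+        # each worker loops on q.get() / create(index) / q.task_done();
+        # join() returns once every queued index has been marked done,
+        # after which the idle workers (blocked in get()) are cancelled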
+ msg = "PostCrawler - await queue.join - " + msg += f"count: {self._count} with {max_tasks} workers" + log.info(msg) + await self._q.join() + msg = "PostCrawler - join complete - " + msg += f"count: {self._count}" + log.info(msg) + + for w in workers: + w.cancel() + log.debug("PostCrawler - workers canceled") + + status = self.get_status() + if status: + log.debug(f"PostCrawler -- status: {status}") + log.debug(f"ignore_error: {self._ignore_error}") + if not self._ignore_error: + # throw the appropriate exception if other than 200, 201 + if isOK(status): + pass # ok + elif status == 400: + log.warn("PostCrawler - BadRequest") + raise HTTPBadRequest(reason="unknown") + elif status == 404: + log.warn("PostCrawler - not found") + raise HTTPNotFound() + elif status == 409: + log.warn("PostCrawler - conflict") + raise HTTPConflict() + elif status == 410: + log.warn("PostCrawler - gone") + raise HTTPGone() + elif status == 500: + log.error("PostCrawler - internal server error") + raise HTTPInternalServerError() + elif status == 503: + log.error("PostCrawler - server busy") + raise HTTPServiceUnavailable() + else: + log.error(f"PostCrawler - unexpected status: {status}") + raise HTTPInternalServerError() + else: + # no tasks returned anything + log.error("PostCrawler - no results returned") + if not self._ignore_error: + raise HTTPInternalServerError() + + async def work(self): + while True: + index = await self._q.get() + await self.create(index) + self._q.task_done() + + async def create(self, index): + log.debug(f"PostCrawler fetch for index: {index}") + item = self._items[index] + log.debug(f"got item: {item}") + kwargs = {"bucket": self._bucket} + + if "obj_id" in item: + kwargs["obj_id"] = item["obj_id"] + if "type" in item: + kwargs["obj_type"] = item["type"] + if "layout" in item: + kwargs["layout"] = item["layout"] + if "creation_props" in item: + kwargs["creation_props"] = item["creation_props"] + if "attrs" in item: + kwargs["attrs"] = item["attrs"] + if "parent_id" in item: + kwargs["parent_id"] = item["parent_id"] + elif "root_id" in item: + kwargs["root_id"] = item["root_id"] + if "h5path" in item: + kwargs["h5path"] = item["h5path"] + if "links" in item: + kwargs["links"] = item["links"] + + log.debug(f"PostCrawler index {index} kwargs: {kwargs}") + rsp_json = None + try: + if kwargs.get("parent_id") and kwargs.get("h5path"): + rsp_json = await createObjectByPath(self._app, **kwargs) + else: + # create an anonymous group + rsp_json = await createObject(self._app, **kwargs) + except HTTPConflict: + log.warn("PostCrawler - got HTTPConflict from http_post") + rsp_json = {"status_code": 409} + except HTTPServiceUnavailable: + rsp_json = {"status_code": 503} + except HTTPInternalServerError: + rsp_json = {"status_code": 500} + except Exception as e: + log.error(f"unexpected exception {e}") + rsp_json = {"status_code": 500} + + log.info(f"PostCrawler - index: {index} post rsp: {rsp_json}") + + self._rsp_objs[index] = rsp_json + + +async def createObjects(app, items, root_id=None, bucket=None): + """ create an objects based on parameters in items list """ + + if not root_id: + msg = "no root_id given for createObjects" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + log.info(f"createObjects with {len(items)} items, root_id: {root_id}") + + post_crawler = PostCrawler(app, root_id=root_id, bucket=bucket, items=items) + await post_crawler.crawl() + if post_crawler.get_status() > 201: + msg = f"createObjects returning status from crawler: {post_crawler.get_status()}" + log.error(msg) 
+ raise HTTPInternalServerError() + + obj_list = post_crawler.get_rsp_objs() + if not isinstance(obj_list, list): + msg = f"createObjects expected list but got: {type(obj_list)}" + log.error(msg) + raise HTTPInternalServerError() + return {"objects": obj_list} diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index de1253ee..dd32bd6e 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -41,7 +41,6 @@ from .util.dsetUtil import getShapeDims from .basenode import getVersion - from . import hsds_logger as log from . import config diff --git a/hsds/util/linkUtil.py b/hsds/util/linkUtil.py index 3469a8a1..f872f848 100644 --- a/hsds/util/linkUtil.py +++ b/hsds/util/linkUtil.py @@ -13,6 +13,7 @@ # linkdUtil: # link related functions # +import time from .. import hsds_logger as log @@ -132,3 +133,80 @@ def h5Join(path, paths): h5path += "/" h5path += s return h5path + + +def getRequestLink(title, link_json, predate_max_time=0.0): + """ return normalized link from request json + Throw value error if badly formatted """ + + if not isinstance(link_json, dict): + msg = f"expected dict for for links, but got: {type(link_json)}" + log.warn(msg) + raise ValueError(msg) + + log.debug(f"getRequestLink title: {title} link_json: {link_json}") + link_item = {} # normalized link item to return + + now = time.time() + + validateLinkName(title) # will raise ValueError is invalid + + link_class = getLinkClass(link_json) + if "class" in link_item: + if link_class != link_json["class"]: + msg = f"expected link class of: {link_class} but got {link_json}" + log.warn(msg) + raise ValueError(msg) + + link_item = {"class": link_class} + + if link_class == "H5L_TYPE_HARD": + if "id" not in link_json: + msg = "expected id key for hard link" + log.warn(msg) + raise ValueError + link_item["id"] = link_json["id"] + else: + if link_class in ("H5L_TYPE_SOFT", "H5L_TYPE_EXTERNAL"): + if "h5path" not in link_json: + msg = "expected h5path key for soft link" + log.warn(msg) + raise ValueError(msg) + link_item["h5path"] = link_json["h5path"] + + if link_class == "H5L_TYPE_EXTERNAL": + if "h5domain" not in link_json: + msg = "expected h5domain key for external link" + log.warn(msg) + raise ValueError(msg) + + if "created" in link_json: + created = link_json["created"] + # allow "pre-dated" attributes if recent enough + if now - created > predate_max_time: + link_item["created"] = created + else: + log.warn("stale created timestamp for link, ignoring") + if "created" not in link_item: + link_item["created"] = now + + return link_item + + +def getRequestLinks(links_json, predate_max_time=0.0): + """ return list of normalized links from request json + Throw value error if any is badly formatted """ + + if not isinstance(links_json, dict): + msg = f"POST_Groups expected dict for for links, but got: {type(links_json)}" + log.warn(msg) + raise ValueError(msg) + + links = {} # normalized link items to return + kwargs = {"predate_max_time": predate_max_time} + + for title in links_json: + links[title] = getRequestLink(title, links_json[title], **kwargs) + + return links + diff --git a/tests/integ/group_test.py b/tests/integ/group_test.py index d2ac33b5..fbbda066 100755 --- a/tests/integ/group_test.py +++ b/tests/integ/group_test.py @@ -261,7 +261,6 @@ def testPostWithId(self): self.assertEqual(rsp.status_code, 201) rspJson = json.loads(rsp.text) - print("rspJson:", rspJson) self.assertEqual(rspJson["linkCount"], 0) self.assertEqual(rspJson["attributeCount"], 0) self.assertEqual(grp_id, rspJson["id"]) @@ 
-360,18 +359,28 @@ def testPostIdWithLink(self): root_uuid = rspJson["root"] helper.validateId(root_uuid) - # create a group id - grp_id = createObjId("groups", root_id=root_uuid) - - # create new group - payload = {"id": grp_id, "link": {"id": root_uuid, "name": "linked_group"}} + grp_count = 3 req = helper.getEndpoint() + "/groups" - rsp = self.session.post(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) + + for i in range(grp_count): + # create a group id + grp_id = createObjId("groups", root_id=root_uuid) + + # create new group + payload = {"id": grp_id, "link": {"id": root_uuid, "name": f"g{i:04d}"}} + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(grp_id, rspJson["id"]) + + # get root group and verify number of links + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertEqual(rspJson["linkCount"], 0) - self.assertEqual(rspJson["attributeCount"], 0) - self.assertEqual(grp_id, rspJson["id"]) + self.assertEqual(rspJson["linkCount"], grp_count) def testPostWithAttributes(self): # test POST with attribute initialization @@ -717,6 +726,69 @@ def testPostWithCreationProps(self): self.assertTrue("alias" in rspJson) self.assertEqual(rspJson["alias"], []) + def testPostMulti(self): + # test POST with multi-object creation + print("testPostMulti", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # get root group and verify link count is 0 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + + # create a set of anonymous groups + grp_count = 3 + req = helper.getEndpoint() + "/groups" + + payload = [{},] * grp_count + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), grp_count) + for i in range(grp_count): + grp_rsp = rsp_objs[i] + self.assertEqual(grp_rsp["linkCount"], 0) + self.assertEqual(grp_rsp["attributeCount"], 0) + group_id = grp_rsp["id"] + self.assertTrue(helper.validateId(group_id)) + + # create a set of linked groups + grp_count = 3 + payload = [] + for i in range(grp_count): + payload.append({"link": {"id": root_uuid, "name": f"g{i}"}}) + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), grp_count) + for i in range(grp_count): + grp_rsp = rsp_objs[i] + self.assertEqual(grp_rsp["linkCount"], 0) + self.assertEqual(grp_rsp["attributeCount"], 0) + group_id = grp_rsp["id"] + self.assertTrue(helper.validateId(group_id)) + + # get root group and verify link count is grp_count + req = 
helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], grp_count) + def testDelete(self): # test Delete print("testDelete", self.base_domain) From a8ec66d9d3325ec62eb7582306c7b2c58e025d5a Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 15 May 2025 19:38:23 +0200 Subject: [PATCH 22/49] avoid exception for mkdir race condition --- hsds/util/fileClient.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/hsds/util/fileClient.py b/hsds/util/fileClient.py index 1bc5e786..feebe2c1 100644 --- a/hsds/util/fileClient.py +++ b/hsds/util/fileClient.py @@ -88,7 +88,7 @@ def _getFileStats(self, filepath, data=None): return key_stats def _file_stats_increment(self, counter, inc=1): - """Incremenet the indicated connter""" + """Increment the indicated counter""" if "file_stats" not in self._app: # setup stats file_stats = {} @@ -175,6 +175,26 @@ async def get_object(self, key, bucket=None, offset=0, length=-1): raise HTTPInternalServerError() return data + def _mkdir(self, dirpath): + """ create the given directory if it doesn't already exist """ + try: + dirpath = pp.normpath(dirpath) + log.debug(f"normpath: {dirpath}") + + if not pp.isdir(dirpath): + log.debug(f"mkdir({dirpath})") + mkdir(dirpath) + else: + log.debug(f"isdir {dirpath} found") + except IOError as ioe: + if ioe.errno == 17: + # likely directory was created by another process since we checked + log.warn(f"mkdir failed, {dirpath} created outside this process") + else: + msg = f"fileClient: IOError on mkdir {dirpath}: {ioe}" + log.warn(msg) + raise HTTPInternalServerError() + async def put_object(self, key, data, bucket=None): """Write data to given key. Returns client specific dict on success @@ -202,15 +222,7 @@ async def put_object(self, key, data, bucket=None): for key_dir in key_dirs: dirpath = pp.join(dirpath, key_dir) log.debug(f"pp.join({key_dir}) => {dirpath}") - - dirpath = pp.normpath(dirpath) - log.debug(f"normpath: {dirpath}") - - if not pp.isdir(dirpath): - log.debug(f"mkdir({dirpath})") - mkdir(dirpath) - else: - log.debug(f"isdir {dirpath} found") + self._mkdir(dirpath) log.debug(f"open({filepath}, 'wb')") async with aiofiles.open(filepath, loop=loop, mode="wb") as f: await f.write(data) From 41e23e91b057eb4cff98a3c1593c79b5824a0fd2 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 15 May 2025 19:39:14 +0200 Subject: [PATCH 23/49] use domain crawler to create links for post group multi --- hsds/domain_crawl.py | 4 ++-- hsds/group_sn.py | 48 ++++++++++++++++++++++++++++++++++++++--- hsds/link_sn.py | 2 +- hsds/post_crawl.py | 4 ++-- hsds/servicenode_lib.py | 21 ++++++++---------- hsds/util/linkUtil.py | 37 ++++++++++++++++--------------- 6 files changed, 77 insertions(+), 39 deletions(-) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 0b707329..35b20bf9 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -248,7 +248,7 @@ async def put_attributes(self, obj_id, attr_items): async def get_obj_json(self, obj_id): """ get the given obj_json for the obj_id. 
for each group found, search the links if follow_links is set """ - log.debug(f"get_obj_json: {obj_id}") + log.debug(f"DomainCrawler get_obj_json: {obj_id}") collection = getCollectionForId(obj_id) kwargs = {"bucket": self._bucket, "include_attrs": self._include_attrs} @@ -408,7 +408,7 @@ async def get_links(self, grp_id, titles=None): async def put_links(self, grp_id, link_items): # write the given links for the obj_id - log.debug(f"put_links for {grp_id}, {len(link_items)} links") + log.debug(f"DomainCrawler put_links for {grp_id}, {len(link_items)} links") req = getDataNodeUrl(self._app, grp_id) req += f"/groups/{grp_id}/links" kwargs = {"bucket": self._bucket} diff --git a/hsds/group_sn.py b/hsds/group_sn.py index e8dd9325..bfad5b28 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -29,6 +29,7 @@ from .servicenode_lib import createObject, createObjectByPath, deleteObject from . import hsds_logger as log from .post_crawl import createObjects +from .domain_crawl import DomainCrawler from . import config @@ -171,7 +172,7 @@ async def _create_group(app, **kwargs): return group_json -def _get_create_args(body, root_id=None, bucket=None, implicit=False): +def _get_create_args(body, root_id=None, bucket=None, implicit=False, ignore_link=False): """ get query args for _create_group from request body """ kwargs = {"bucket": bucket} predate_max_time = config.get("predate_max_time", default=10.0) @@ -187,7 +188,7 @@ def _get_create_args(body, root_id=None, bucket=None, implicit=False): raise HTTPBadRequest(reason=msg) link_body = body["link"] log.debug(f"link_body: {link_body}") - if "id" in link_body: + if "id" in link_body and not ignore_link: parent_id = link_body["id"] if "name" in link_body: link_title = link_body["name"] @@ -201,7 +202,8 @@ def _get_create_args(body, root_id=None, bucket=None, implicit=False): if parent_id and link_title: log.debug(f"parent id: {parent_id}, link_title: {link_title}") - h5path = link_title # just use the link name as the h5path + if not ignore_link: + h5path = link_title # just use the link name as the h5path if "parent_id" not in body: parent_id = root_id @@ -329,6 +331,7 @@ async def POST_Group(request): log.warn(msg) raise HTTPBadRequest(reason=msg) kwargs = _get_create_args(item, root_id=root_id, bucket=bucket) + kwargs["ignore_link"] = True kwarg_list.append(kwargs) kwargs = {"bucket": bucket, "root_id": root_id} post_group_rsp = await createObjects(app, kwarg_list, **kwargs) @@ -346,6 +349,45 @@ async def POST_Group(request): post_group_rsp = await _create_group(app, **kwargs) log.debug(f"returning resp: {post_group_rsp}") + + if "objects" in post_group_rsp: + # add any links in multi request + objects = post_group_rsp["objects"] + obj_count = len(objects) + log.debug(f"PostGroup multi create: {obj_count} objects") + if len(body) != obj_count: + msg = f"Expected {obj_count} objects but got {len(body)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + parent_ids = {} + for index in range(obj_count): + item = body[index] + if "link" in item: + link_item = item["link"] + parent_id = link_item.get("id") + title = link_item.get("name") + if parent_id and title: + # add a hard link + object = objects[index] + obj_id = object["id"] + if parent_id not in parent_ids: + parent_ids[parent_id] = {} + links = parent_ids[parent_id] + links[title] = {"id": obj_id} + if parent_ids: + log.debug(f"POSTGroup multi - adding links: {parent_ids}") + kwargs = {"action": "put_link", "bucket": bucket} + kwargs["replace"] = True + + crawler = DomainCrawler(app, 
parent_ids, **kwargs) + + # will raise exception on not found, server busy, etc. + await crawler.crawl() + + status = crawler.get_status() + + log.info(f"DomainCrawler done for put_links action, status: {status}") + # group creation successful resp = await jsonResponse(request, post_group_rsp, status=201) log.response(request, resp=resp) diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 66e3a698..938f78c2 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -457,7 +457,7 @@ async def PUT_Links(request): count = len(grp_ids) if count == 0: msg = "no grp_ids defined" - log.warn(f"PUT_Attributes: {msg}") + log.warn(f"PUT_Links: {msg}") raise HTTPBadRequest(reason=msg) elif count == 1: # just send one PUT Attributes request to the dn diff --git a/hsds/post_crawl.py b/hsds/post_crawl.py index 057cd96d..88ffab92 100644 --- a/hsds/post_crawl.py +++ b/hsds/post_crawl.py @@ -66,8 +66,8 @@ def get_status(self): item = self._rsp_objs[i] if not item: continue # resp not filled in yet - if "status" in item: - item_status = item["status"] + if "status_code" in item: + item_status = item["status_code"] if status is None or item_status > status: # return the more severe error log.debug(f"setting status to {item_status}") diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index dd32bd6e..8230502e 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -535,7 +535,6 @@ async def putLinks(app, group_id, items, bucket=None): or 200 if it's a duplicate of an existing link. """ isValidUuid(group_id, obj_class="groups") - group_json = None # validate input for title in items: @@ -548,25 +547,23 @@ async def putLinks(app, group_id, items, bucket=None): raise HTTPBadRequest(reason="invalid link") if link_class == "H5L_TYPE_HARD": + if "id" not in item: + msg = "expected id key for hard link class" + log.warn(msg) + raise HTTPBadRequest(reason=msg) tgt_id = item["id"] - isValidUuid(tgt_id) - # for hard links, verify that the referenced id exists and is in - # this domain - ref_json = await getObjectJson(app, tgt_id, bucket=bucket) - if not group_json: - # just need to fetch this once - group_json = await getObjectJson(app, group_id, bucket=bucket) - if ref_json["root"] != group_json["root"]: - msg = "Hard link must reference an object in the same domain" + try: + isValidUuid(tgt_id) + except ValueError: + msg = f"invalid object id: {tgt_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) # ready to add links now req = getDataNodeUrl(app, group_id) req += "/groups/" + group_id + "/links" - log.debug(f"PUT links - PUT request: {req}") + log.debug(f"PUT links {len(items)} items - PUT request: {req}") params = {"bucket": bucket} - data = {"links": items} put_rsp = await http_put(app, req, data=data, params=params) diff --git a/hsds/util/linkUtil.py b/hsds/util/linkUtil.py index f872f848..0090b045 100644 --- a/hsds/util/linkUtil.py +++ b/hsds/util/linkUtil.py @@ -136,17 +136,17 @@ def h5Join(path, paths): def getRequestLink(title, link_json, predate_max_time=0.0): - """ return normalized link from request json + """ return normalized link from request json Throw value error if badly formatted """ - + if not isinstance(link_json, dict): - msg = f"expected dict for for links, but got: {type(link_json)}" - log.warn(msg) - raise ValueError(msg) - - log.debug(f"getRequestLink title: {title} link_json: {link_json}") + msg = f"expected dict for for links, but got: {type(link_json)}" + log.warn(msg) + raise ValueError(msg) + + log.debug(f"getRequestLink title: {title} link_json: {link_json}") 
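+    # the normalized item is rebuilt from scratch below; only recognized keys
+    # (class, id or h5path, created) are carried over from the request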
link_item = {} # normalized link item to return - + now = time.time() validateLinkName(title) # will raise ValueError is invalid @@ -157,7 +157,7 @@ def getRequestLink(title, link_json, predate_max_time=0.0): msg = f"expected link class of: {link_class} but got {link_json}" log.warn(msg) raise ValueError(msg) - + link_item = {"class": link_class} if link_class == "H5L_TYPE_HARD": @@ -173,13 +173,13 @@ def getRequestLink(title, link_json, predate_max_time=0.0): log.warn(msg) raise ValueError(msg) link_item["h5path"] = link_json["h5path"] - + if link_class == "H5L_TYPE_EXTERNAL": if "h5domain" not in link_json: msg = "expected h5domain key for external link" log.warn(msg) raise ValueError(msg) - + if "created" in link_json: created = link_json["created"] # allow "pre-dated" attributes if recent enough @@ -192,16 +192,16 @@ def getRequestLink(title, link_json, predate_max_time=0.0): return link_item - + def getRequestLinks(links_json, predate_max_time=0.0): - """ return list of normalized links from request json + """ return list of normalized links from request json Throw value error if any is badly formatted """ - + if not isinstance(links_json, dict): - msg = f"POST_Groups expected dict for for links, but got: {type(links_json)}" - log.warn(msg) - raise ValueError(msg) - + msg = f"POST_Groups expected dict for for links, but got: {type(links_json)}" + log.warn(msg) + raise ValueError(msg) + links = {} # normalized link items to return kwargs = {"predate_max_time": predate_max_time} @@ -209,4 +209,3 @@ def getRequestLinks(links_json, predate_max_time=0.0): links[title] = getRequestLink(title, links_json[title], **kwargs) return links - From 7cfa3d67ebcbab48a5680e9a2cf7a6d76a93a231 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 16 May 2025 18:21:09 +0200 Subject: [PATCH 24/49] added multi create for datatype objs --- hsds/ctype_sn.py | 174 ++++++------- hsds/dset_sn.py | 71 +---- hsds/group_sn.py | 154 ++--------- hsds/post_crawl.py | 81 ++++-- hsds/servicenode_lib.py | 485 +++++++++++++++++++++++++---------- tests/integ/datatype_test.py | 74 ++++++ tests/integ/group_test.py | 18 ++ 7 files changed, 615 insertions(+), 442 deletions(-) diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index ccf033ac..d9cfb71d 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -17,19 +17,19 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPGone from json import JSONDecodeError -from h5json.hdf5dtype import validateTypeItem, getBaseTypeJson from h5json.objid import isValidUuid from .util.httpUtil import getHref, respJsonAssemble, getBooleanParam from .util.httpUtil import jsonResponse -from .util.linkUtil import validateLinkName from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot from .servicenode_lib import getDomainJson, getObjectJson, validateAction -from .servicenode_lib import getObjectIdByPath, getPathForObjectId -from .servicenode_lib import createObject, createObjectByPath, deleteObject +from .servicenode_lib import getObjectIdByPath, getPathForObjectId, deleteObject +from .servicenode_lib import getCreateArgs, createDatatypeObj +from .post_crawl import createDatatypeObjs +from .domain_crawl import DomainCrawler from . 
import hsds_logger as log @@ -165,35 +165,6 @@ async def POST_Datatype(request): msg = "Unable to load JSON body" log.warn(msg) raise HTTPBadRequest(reason=msg) - if "type" not in body: - msg = "POST Datatype has no type key in body" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - datatype = body["type"] - if isinstance(datatype, str): - try: - # convert predefined type string (e.g. "H5T_STD_I32LE") to - # corresponding json representation - datatype = getBaseTypeJson(datatype) - log.debug(f"got datatype: {datatype}") - except TypeError: - msg = "POST Dataset with invalid predefined type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - try: - validateTypeItem(datatype) - except KeyError as ke: - msg = f"KeyError creating type: {ke}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except TypeError as te: - msg = f"TypeError creating type: {te}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except ValueError as ve: - msg = f"ValueError creating type: {ve}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) domain = getDomainFromRequest(request) if not isValidDomain(domain): @@ -209,73 +180,92 @@ async def POST_Datatype(request): verifyRoot(domain_json) root_id = domain_json["root"] - parent_id = None - link_title = None - obj_id = None - h5path = None - attrs = None - - if "id" in body: - obj_id = body["id"] - log.debug(f"POST datatype using client id: {obj_id}") + # allow parent group creation or not + implicit = getBooleanParam(params, "implicit") - if "attributes" in body: - attrs = body["attributes"] - log.debug(f"POST datatype attributes: {attrs}") + post_rsp = None - if "link" in body: - if "h5path" in body: - msg = "link can't be used with h5path" + if isinstance(body, list): + count = len(body) + log.debug(f"multiple ctype create: {count} items") + if count == 0: + # equivalent to no body + msg = "POST Datatype with no body" log.warn(msg) raise HTTPBadRequest(reason=msg) - link_body = body["link"] - if "id" in link_body: - parent_id = link_body["id"] - - if "name" in link_body: - link_title = link_body["name"] - try: - # will throw exception if there's a slash in the name - validateLinkName(link_title) - except ValueError: - msg = f"invalid link title: {link_title}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if parent_id and link_title: - log.debug(f"parent id: {parent_id}, link_title: {link_title}") - h5path = link_title # just use the link name as the h5path - - if "h5path" in body: - h5path = body["h5path"] - if "parent_id" not in body: - parent_id = root_id + elif count == 1: + # just create one object in typical way + kwargs = getCreateArgs(body[0], + root_id=root_id, + bucket=bucket, + implicit=implicit) else: - parent_id = body["parent_id"] - - # setup args to createObject - kwargs = {"bucket": bucket, "obj_type": datatype} - if obj_id: - kwargs["obj_id"] = obj_id - if attrs: - kwargs["attrs"] = attrs - - # TBD: creation props for datatype obj? 
- if parent_id: - kwargs["parent_id"] = parent_id - kwargs["h5path"] = h5path - # allow parent group creation or not - implicit = getBooleanParam(params, "implicit") - if implicit: - kwargs["implicit"] = True - ctype_json = await createObjectByPath(app, **kwargs) + # create multiple ctype objects + kwarg_list = [] # list of kwargs for each object + + for item in body: + log.debug(f"item: {item}") + if not isinstance(item, dict): + msg = f"Post_Datatype - invalid item type: {type(item)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs = getCreateArgs(item, root_id=root_id, bucket=bucket) + kwargs["ignore_link"] = True + kwarg_list.append(kwargs) + kwargs = {"bucket": bucket, "root_id": root_id} + log.debug(f"createDatatypeObjcs, items: {kwarg_list}") + post_rsp = await createDatatypeObjs(app, kwarg_list, **kwargs) else: - # create an anonymous datatype - kwargs["root_id"] = root_id - ctype_json = await createObject(app, **kwargs) + # single object create + kwargs = getCreateArgs(body, root_id=root_id, bucket=bucket, implicit=implicit) + log.debug(f"kwargs for datatype create: {kwargs}") + + if post_rsp is None: + # Handle cases other than multi ctype create here + post_rsp = await createDatatypeObj(app, **kwargs) + + log.debug(f"returning resp: {post_rsp}") + + if "objects" in post_rsp: + # add any links in multi request + objects = post_rsp["objects"] + obj_count = len(objects) + log.debug(f"Post datatype multi create: {obj_count} objects") + if len(body) != obj_count: + msg = f"Expected {obj_count} objects but got {len(body)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + parent_ids = {} + for index in range(obj_count): + item = body[index] + if "link" in item: + link_item = item["link"] + parent_id = link_item.get("id") + title = link_item.get("name") + if parent_id and title: + # add a hard link + object = objects[index] + obj_id = object["id"] + if parent_id not in parent_ids: + parent_ids[parent_id] = {} + links = parent_ids[parent_id] + links[title] = {"id": obj_id} + if parent_ids: + log.debug(f"POST ctype multi - adding links: {parent_ids}") + kwargs = {"action": "put_link", "bucket": bucket} + kwargs["replace"] = True + + crawler = DomainCrawler(app, parent_ids, **kwargs) + + # will raise exception on not found, server busy, etc. + await crawler.crawl() + + status = crawler.get_status() + + log.info(f"DomainCrawler done for put_links action, status: {status}") # datatype creation successful - resp = await jsonResponse(request, ctype_json, status=201) + resp = await jsonResponse(request, post_rsp, status=201) log.response(request, resp=resp) return resp diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 9f0593e6..3650db7b 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -32,10 +32,9 @@ from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot from .util.storUtil import getSupportedFilters -from .util.linkUtil import validateLinkName from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo -from .servicenode_lib import createObject, createObjectByPath, deleteObject +from .servicenode_lib import getCreateArgs, createDataset, deleteObject from .dset_lib import updateShape, deleteAllChunks, doHyperslabWrite from . import config from . 
import hsds_logger as log @@ -1131,67 +1130,17 @@ async def POST_Dataset(request): log.debug(f"set dataset json creationProperties: {creationProperties}") - parent_id = None - obj_id = None - link_title = None - h5path = None - if "id" in body: - obj_id = body["id"] - log.debug(f"POST dataset using client id: {obj_id}") + # setup args to createDataset + implicit = getBooleanParam(params, "implicit") + kwargs = getCreateArgs(body, root_id=root_id, type=datatype, bucket=bucket, implicit=implicit) + # fill in dataset-specific keys + kwargs["creation_props"] = creationProperties + kwargs["shape"] = shape_json + kwargs["layout"] = layout - if "link" in body: - if "h5path" in body: - msg = "link can't be used with h5path" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - link_body = body["link"] - if "id" in link_body: - parent_id = link_body["id"] + log.debug(f"kwargs for dataset create: {kwargs}") - if "name" in link_body: - link_title = link_body["name"] - try: - # will throw exception if there's a slash in the name - validateLinkName(link_title) - except ValueError: - msg = f"invalid link title: {link_title}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if parent_id and link_title: - log.debug(f"parent id: {parent_id}, link_title: {link_title}") - h5path = link_title # just use the link name as the h5path - - if "h5path" in body: - h5path = body["h5path"] - if "parent_id" not in body: - parent_id = root_id - else: - parent_id = body["parent_id"] - - # setup args to createObject - kwargs = {"bucket": bucket, "obj_type": datatype, "obj_shape": shape_json} - if obj_id: - kwargs["obj_id"] = obj_id - if creationProperties: - kwargs["creation_props"] = creationProperties - if attrs: - kwargs["attrs"] = attrs - if layout: - kwargs["layout"] = layout - - if parent_id: - kwargs["parent_id"] = parent_id - kwargs["h5path"] = h5path - # allow parent group creation or not - implicit = getBooleanParam(params, "implicit") - if implicit: - kwargs["implicit"] = True - dset_json = await createObjectByPath(app, **kwargs) - else: - # create an anonymous datatype - kwargs["root_id"] = root_id - dset_json = await createObject(app, **kwargs) + dset_json = await createDataset(app, **kwargs) # write data if provided if input_arr is not None: diff --git a/hsds/group_sn.py b/hsds/group_sn.py index bfad5b28..4d83e5c7 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -23,14 +23,12 @@ from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, getPathForDomain, verifyRoot -from .util.linkUtil import validateLinkName, getRequestLinks from .servicenode_lib import getDomainJson, getObjectJson, validateAction -from .servicenode_lib import getObjectIdByPath, getPathForObjectId -from .servicenode_lib import createObject, createObjectByPath, deleteObject +from .servicenode_lib import getObjectIdByPath, getPathForObjectId, deleteObject +from .servicenode_lib import getCreateArgs, createGroup from . import hsds_logger as log -from .post_crawl import createObjects +from .post_crawl import createGroups from .domain_crawl import DomainCrawler -from . 
import config async def GET_Group(request): @@ -159,115 +157,6 @@ async def GET_Group(request): return resp -async def _create_group(app, **kwargs): - """ helper method for group creation """ - - if kwargs.get("parent_id") and kwargs.get("h5path"): - group_json = await createObjectByPath(app, **kwargs) - else: - # create an anonymous group - log.debug(f"_create_group - kwargs: {kwargs}") - group_json = await createObject(app, **kwargs) - - return group_json - - -def _get_create_args(body, root_id=None, bucket=None, implicit=False, ignore_link=False): - """ get query args for _create_group from request body """ - kwargs = {"bucket": bucket} - predate_max_time = config.get("predate_max_time", default=10.0) - - parent_id = None - obj_id = None - h5path = None - - if "link" in body: - if "h5path" in body: - msg = "link can't be used with h5path" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - link_body = body["link"] - log.debug(f"link_body: {link_body}") - if "id" in link_body and not ignore_link: - parent_id = link_body["id"] - if "name" in link_body: - link_title = link_body["name"] - try: - # will throw exception if there's a slash in the name - validateLinkName(link_title) - except ValueError: - msg = f"invalid link title: {link_title}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if parent_id and link_title: - log.debug(f"parent id: {parent_id}, link_title: {link_title}") - if not ignore_link: - h5path = link_title # just use the link name as the h5path - - if "parent_id" not in body: - parent_id = root_id - else: - parent_id = body["parent_id"] - - if "h5path" in body: - h5path = body["h5path"] - # normalize the h5path - if h5path.startswith("/"): - if parent_id == root_id: - # just adjust the path to be relative - h5path = h5path[1:] - else: - msg = f"PostCrawler expecting relative h5path, but got: {h5path}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if h5path.endswith("/"): - h5path = h5path[:-1] # makes iterating through the links a bit easier - - if parent_id and h5path: - # these are used by createObjectByPath - kwargs["parent_id"] = parent_id - kwargs["implicit"] = implicit - kwargs["h5path"] = h5path - else: - kwargs["root_id"] = root_id - - if "id" in body: - obj_id = body["id"] - # tbd: validate this is a group id - kwargs["obj_id"] = obj_id - log.debug(f"POST group using client id: {obj_id}") - - if "creationProperties" in body: - creation_props = body["creationProperties"] - # tbd: validate creation_props - kwargs["creation_props"] = creation_props - - if "attributes" in body: - attrs = body["attributes"] - if not isinstance(attrs, dict): - msg = f"POST_Groups expected dict for for attributes, but got: {type(attrs)}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - log.debug(f"POST Group attributes: {attrs}") - - # tbd: validate attributes - kwargs["attrs"] = attrs - if "links" in body: - body_links = body["links"] - log.debug(f"got links for new group: {body_links}") - try: - links = getRequestLinks(body["links"], predate_max_time=predate_max_time) - except ValueError: - msg = "invalid link item sent in request" - raise HTTPBadRequest(reason=msg) - log.debug(f"adding links to group POST request: {links}") - kwargs["links"] = links - - return kwargs - - async def POST_Group(request): """HTTP method to create new Group object""" log.request(request) @@ -297,7 +186,7 @@ async def POST_Group(request): # allow parent group creation or not implicit = getBooleanParam(params, "implicit") kwargs = {} - post_group_rsp = None + post_rsp = None if 
request.has_body: try: body = await request.json() @@ -316,10 +205,10 @@ async def POST_Group(request): kwargs = {"root_id": root_id, "bucket": bucket} elif count == 1: # just create one object in typical way - kwargs = _get_create_args(body[0], - root_id=root_id, - bucket=bucket, - implicit=implicit) + kwargs = getCreateArgs(body[0], + root_id=root_id, + bucket=bucket, + implicit=implicit) else: # create multiple group objects kwarg_list = [] # list of kwargs for each object @@ -330,31 +219,34 @@ async def POST_Group(request): msg = f"PostGroup - invalid item type: {type(item)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - kwargs = _get_create_args(item, root_id=root_id, bucket=bucket) + kwargs = getCreateArgs(item, root_id=root_id, bucket=bucket) kwargs["ignore_link"] = True kwarg_list.append(kwargs) kwargs = {"bucket": bucket, "root_id": root_id} - post_group_rsp = await createObjects(app, kwarg_list, **kwargs) + post_rsp = await createGroups(app, kwarg_list, **kwargs) else: - kwargs = _get_create_args(body, root_id=root_id, bucket=bucket, implicit=implicit) + kwargs = getCreateArgs(body, root_id=root_id, bucket=bucket, implicit=implicit) else: kwargs["root_id"] = root_id kwargs["bucket"] = bucket else: kwargs = {"root_id": root_id, "bucket": bucket} - if post_group_rsp is None: + if post_rsp is None: # Handle cases other than multi-group create here - log.debug(f"_create_group - kwargs: {kwargs}") - post_group_rsp = await _create_group(app, **kwargs) + if "type" in kwargs: + msg = "type key is not allowed for Group creation" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + post_rsp = await createGroup(app, **kwargs) - log.debug(f"returning resp: {post_group_rsp}") + log.debug(f"returning resp: {post_rsp}") - if "objects" in post_group_rsp: + if "objects" in post_rsp: # add any links in multi request - objects = post_group_rsp["objects"] + objects = post_rsp["objects"] obj_count = len(objects) - log.debug(f"PostGroup multi create: {obj_count} objects") + log.debug(f"Post group multi create: {obj_count} objects") if len(body) != obj_count: msg = f"Expected {obj_count} objects but got {len(body)}" log.warn(msg) @@ -375,7 +267,7 @@ async def POST_Group(request): links = parent_ids[parent_id] links[title] = {"id": obj_id} if parent_ids: - log.debug(f"POSTGroup multi - adding links: {parent_ids}") + log.debug(f"POST group multi - adding links: {parent_ids}") kwargs = {"action": "put_link", "bucket": bucket} kwargs["replace"] = True @@ -389,7 +281,7 @@ async def POST_Group(request): log.info(f"DomainCrawler done for put_links action, status: {status}") # group creation successful - resp = await jsonResponse(request, post_group_rsp, status=201) + resp = await jsonResponse(request, post_rsp, status=201) log.response(request, resp=resp) return resp diff --git a/hsds/post_crawl.py b/hsds/post_crawl.py index 88ffab92..225376aa 100644 --- a/hsds/post_crawl.py +++ b/hsds/post_crawl.py @@ -19,7 +19,7 @@ from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone from .util.httpUtil import isOK -from .servicenode_lib import createObject, createObjectByPath +from .servicenode_lib import createObject from . 
import hsds_logger as log
@@ -148,7 +148,7 @@ async def create(self, index):
         if "obj_id" in item:
             kwargs["obj_id"] = item["obj_id"]
         if "type" in item:
-            kwargs["obj_type"] = item["type"]
+            kwargs["type"] = item["type"]
         if "layout" in item:
             kwargs["layout"] = item["layout"]
         if "creation_props" in item:
@@ -167,11 +167,7 @@ async def create(self, index):
         log.debug(f"PostCrawler index {index} kwargs: {kwargs}")
         rsp_json = None
         try:
-            if kwargs.get("parent_id") and kwargs.get("h5path"):
-                rsp_json = await createObjectByPath(self._app, **kwargs)
-            else:
-                # create an anonymous group
-                rsp_json = await createObject(self._app, **kwargs)
+            rsp_json = await createObject(self._app, **kwargs)
         except HTTPConflict:
             log.warn("PostCrawler - got HTTPConflict from http_post")
             rsp_json = {"status_code": 409}
@@ -188,26 +184,75 @@ async def create(self, index):
         self._rsp_objs[index] = rsp_json


-async def createObjects(app, items, root_id=None, bucket=None):
-    """ create an objects based on parameters in items list """
-
-    if not root_id:
-        msg = "no root_id given for createObjects"
-        log.warn(msg)
-        raise HTTPBadRequest(reason=msg)
-
-    log.info(f"createObjects with {len(items)} items, root_id: {root_id}")
+async def _createObjects(app, items: list, root_id=None, bucket=None):
+    """ generic create function """

     post_crawler = PostCrawler(app, root_id=root_id, bucket=bucket, items=items)
     await post_crawler.crawl()
     if post_crawler.get_status() > 201:
-        msg = f"createObjects returning status from crawler: {post_crawler.get_status()}"
+        msg = f"_createObjects returning status from crawler: {post_crawler.get_status()}"
         log.error(msg)
         raise HTTPInternalServerError()
     obj_list = post_crawler.get_rsp_objs()
     if not isinstance(obj_list, list):
-        msg = f"createObjects expected list but got: {type(obj_list)}"
+        msg = f"_createObjects expected list but got: {type(obj_list)}"
         log.error(msg)
         raise HTTPInternalServerError()

     return {"objects": obj_list}
+
+
+async def createGroups(app, items: list, root_id=None, bucket=None):
+    """ create group objects based on parameters in items list """
+
+    if not root_id:
+        msg = "no root_id given for createGroups"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+
+    for item in items:
+        if not isinstance(item, dict):
+            msg = "expected list of dictionary objects for multi-object create"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+        if "type" in item:
+            msg = "type key not allowed for multi-group create"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+        if "shape" in item:
+            msg = "shape key not allowed for multi-group create"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+
+    log.info(f"createGroups with {len(items)} items, root_id: {root_id}")
+
+    rsp_json = await _createObjects(app, items=items, root_id=root_id, bucket=bucket)
+    return rsp_json
+
+
+async def createDatatypeObjs(app, items: list, root_id=None, bucket=None):
+    """ create datatype objects based on parameters in items list """
+
+    if not root_id:
+        msg = "no root_id given for createDatatypeObjs"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+
+    for item in items:
+        if not isinstance(item, dict):
+            msg = "expected list of dictionary objects for multi-object create"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+        if "type" not in item:
+            msg = "type key not provided for multi-datatype create"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+        if "shape" in item:
+            msg = "shape key not allowed for multi-datatype create"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+
+    log.info(f"createDatatypeObjs with {len(items)} items, root_id: {root_id}")
log.info(f"createDatatypes with {len(items)} items, root_id: {root_id}") + + rsp_json = await _createObjects(app, items=items, root_id=root_id, bucket=bucket) + return rsp_json diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 8230502e..fdbfe6c4 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -32,7 +32,7 @@ from .util.nodeUtil import getDataNodeUrl from .util.authUtil import getAclKeys -from .util.linkUtil import h5Join, validateLinkName, getLinkClass +from .util.linkUtil import h5Join, validateLinkName, getLinkClass, getRequestLinks from .util.storUtil import getStorJSONObj, isStorObj from .util.authUtil import aclCheck from .util.httpUtil import http_get, http_put, http_post, http_delete @@ -1287,34 +1287,254 @@ async def deleteObject(app, obj_id, bucket=None): del meta_cache[obj_id] # remove from cache +def getCreateArgs(body, + root_id=None, + bucket=None, + type=None, + shape=None, + implicit=False, + ignore_link=False): + """ get args for createObject from request body """ + + kwargs = {"bucket": bucket} + predate_max_time = config.get("predate_max_time", default=10.0) + + parent_id = None + obj_id = None + h5path = None + + if "link" in body: + if "h5path" in body: + msg = "link can't be used with h5path" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # if ingore_link is set, parent_links will be created post object creation + link_body = body["link"] + log.debug(f"link_body: {link_body}") + if "id" in link_body and not ignore_link: + parent_id = link_body["id"] + if "name" in link_body: + link_title = link_body["name"] + try: + # will throw exception if there's a slash in the name + validateLinkName(link_title) + except ValueError: + msg = f"invalid link title: {link_title}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if parent_id and link_title: + log.debug(f"parent id: {parent_id}, link_title: {link_title}") + if not ignore_link: + h5path = link_title # just use the link name as the h5path + + if "parent_id" not in body: + parent_id = root_id + else: + parent_id = body["parent_id"] + + if "h5path" in body: + h5path = body["h5path"] + # normalize the h5path + if h5path.startswith("/"): + if parent_id == root_id: + # just adjust the path to be relative + h5path = h5path[1:] + else: + msg = f"PostCrawler expecting relative h5path, but got: {h5path}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if h5path.endswith("/"): + h5path = h5path[:-1] # makes iterating through the links a bit easier + + if parent_id and h5path: + # these are used by createObjectByPath + kwargs["parent_id"] = parent_id + kwargs["implicit"] = implicit + kwargs["h5path"] = h5path + else: + kwargs["root_id"] = root_id + + if "id" in body: + obj_id = body["id"] + # tbd: validate this is a group id + kwargs["obj_id"] = obj_id + log.debug(f"createObject will use client id: {obj_id}") + + if "creationProperties" in body: + creation_props = body["creationProperties"] + # tbd: validate creation_props + kwargs["creation_props"] = creation_props + + if "attributes" in body: + attrs = body["attributes"] + if not isinstance(attrs, dict): + msg = f"expected dict for for attributes, but got: {type(attrs)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"createObject attributes: {attrs}") + + # tbd: validate attributes + kwargs["attrs"] = attrs + + if "links" in body: + body_links = body["links"] + log.debug(f"got links for new group: {body_links}") + try: + links = getRequestLinks(body["links"], predate_max_time=predate_max_time) + except 
ValueError: + msg = "invalid link item sent in request" + raise HTTPBadRequest(reason=msg) + log.debug(f"adding links to createObject request: {links}") + kwargs["links"] = links + + if type: + kwargs["type"] = type + elif "type" in body: + datatype = body["type"] + if isinstance(datatype, str): + try: + # convert predefined type string (e.g. "H5T_STD_I32LE") to + # corresponding json representation + datatype = getBaseTypeJson(datatype) + log.debug(f"got datatype: {datatype}") + except TypeError: + msg = f"POST with invalid predefined type: {datatype}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + try: + validateTypeItem(datatype) + except KeyError as ke: + msg = f"KeyError creating type: {ke}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + except TypeError as te: + msg = f"TypeError creating type: {te}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + except ValueError as ve: + msg = f"ValueError creating type: {ve}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs["type"] = datatype + else: + pass # no type + + return kwargs + + +async def createLinkFromParent(app, parent_id, h5path, tgt_id=None, bucket=None, implicit=False): + """ create link or links from parentId to tgt_id. + If implicit is True, create any intermediate group objects needed """ + + if not h5path: + log.warn("createLinkFromParent with null h5path") + return + log.info(f"createLinkFromParent, parent_id: {parent_id} h5path: {h5path} tgt_id={tgt_id}") + if implicit: + log.debug("createLinkFromParent - using implicit creation") + link_titles = h5path.split("/") + log.debug(f"link_titles: {link_titles}") + for i in range(len(link_titles)): + if i == len(link_titles) - 1: + last_link = True + else: + last_link = False + link_title = link_titles[i] + log.debug(f"createLinkFromParent - processing link: {link_title}") + link_json = None + try: + link_json = await getLink(app, parent_id, link_title, bucket=bucket) + except (HTTPNotFound, HTTPGone): + pass # link doesn't exist + + if link_json: + log.debug(f"link for link_title {link_title} found: {link_json}") + # if this is the last link, that's a problem + if last_link: + msg = f"object at {h5path} already exists" + log.warn(msg) + raise HTTPConflict() + # otherwise, verify that this is a hardlink + if link_json.get("class") != "H5L_TYPE_HARD": + msg = "createLinkFromParent - h5path must contain only hard links" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + parent_id = link_json["id"] + if getCollectionForId(parent_id) != "groups": + # parent objects must be groups! 
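+                # (only group objects can contain links)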
+ msg = f"{link_title} is not a group" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + log.debug(f"link: {link_title} to sub-group found") + else: + log.debug(f"link for link_title {link_title} not found") + if last_link: + # create a link to the new object + await putHardLink(app, parent_id, link_title, tgt_id=tgt_id, bucket=bucket) + parent_id = tgt_id # new parent + elif implicit: + # create a new group object + log.info(f"creating intermediate group object for: {link_title}") + kwargs = {"parent_id": parent_id, "bucket": bucket} + grp_id = createObjId("groups", root_id=getRootObjId(parent_id)) + kwargs["obj_id"] = grp_id + # createObject won't call back to this function since we haven't set the h5path + await createObject(app, **kwargs) + # create a link to the subgroup + await putHardLink(app, parent_id, link_title, tgt_id=grp_id, bucket=bucket) + parent_id = grp_id # new parent + else: + if len(link_titles) > 1: + msg = f"createLinkFromParent failed: not all groups in {h5path} exist" + else: + msg = f"createLinkFromParent failed: {h5path} does not exist" + log.warn(msg) + raise HTTPNotFound(reason=msg) + + async def createObject(app, + parent_id=None, root_id=None, + h5path=None, obj_id=None, - obj_type=None, - obj_shape=None, + type=None, + shape=None, layout=None, creation_props=None, attrs=None, links=None, + implicit=None, bucket=None): """ create a group, ctype, or dataset object and return object json Determination on whether a group, ctype, or dataset is created is based on: - 1) if obj_type and obj_shape are set, a dataset object will be created - 2) if obj_type is set but not obj_shape, a datatype object will be created + 1) if type and shape are set, a dataset object will be created + 2) if type is set but not shape, a datatype object will be created 3) otherwise (type and shape are both None), a group object will be created The layout parameter only applies to dataset creation """ - if obj_type and obj_shape: + if type and shape: collection = "datasets" - elif obj_type: + elif type: collection = "datatypes" else: collection = "groups" - log.info(f"createObject for {collection} collection, root: {root_id}, bucket: {bucket}") - if obj_type: - log.debug(f" obj_type: {obj_type}") - if obj_shape: - log.debug(f" obj_shape: {obj_shape}") + + if not root_id: + root_id = getRootObjId(parent_id) + log.info(f"createObject for {collection} collection, root_id: {root_id}, bucket: {bucket}") + if root_id != parent_id: + log.debug(f" parent_id: {parent_id}") + if obj_id: + log.debug(f" obj_id: {obj_id}") + if h5path: + log.debug(f" h5path: {h5path}") + if type: + log.debug(f" type: {type}") + if shape: + log.debug(f" shape: {shape}") if layout: log.debug(f" layout: {layout}") if creation_props: @@ -1324,6 +1544,19 @@ async def createObject(app, if links: log.debug(f" links: {links}") + if h5path: + if h5path.startswith("/"): + if parent_id == root_id: + # just adjust the path to be relative + h5path = h5path[1:] + else: + msg = f"createObject expecting relative h5path, but got: {h5path}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if h5path.endswith("/"): + h5path = h5path[:-1] # makes iterating through the links a bit easier + if obj_id: log.debug(f"using client supplied id: {obj_id}") if not isValidUuid(obj_id, obj_class=collection): @@ -1336,12 +1569,12 @@ async def createObject(app, raise HTTPBadRequest(reason=msg) else: obj_id = createObjId(collection, root_id=root_id) - log.info(f"new obj id: {obj_id}") + log.info(f"new obj id: {obj_id}") obj_json = {"id": 
obj_id, "root": root_id} - if obj_type: - obj_json["type"] = obj_type - if obj_shape: - obj_json["shape"] = obj_shape + if type: + obj_json["type"] = type + if shape: + obj_json["shape"] = shape if layout: obj_json["layout"] = layout if creation_props: @@ -1364,141 +1597,113 @@ async def createObject(app, params = {"bucket": bucket} rsp_json = await http_post(app, req, data=obj_json, params=params) + # object creation successful, create link from parent if requested + if h5path: + kwargs = {"tgt_id": obj_id, "bucket": bucket, "implicit": implicit} + await createLinkFromParent(app, parent_id, h5path, **kwargs) + return rsp_json -async def createObjectByPath(app, - parent_id=None, - obj_id=None, - h5path=None, - implicit=False, - obj_type=None, - obj_shape=None, - layout=None, - creation_props=None, - attrs=None, - links=None, - bucket=None): +async def createGroup(app, + parent_id=None, + root_id=None, + h5path=None, + obj_id=None, + creation_props=None, + attrs=None, + links=None, + implicit=None, + bucket=None): - """ create an object at the designated path relative to the parent. - If implicit is True, make any intermediate groups needed in the h5path. """ + """ create a new group object """ - if not parent_id: - msg = "no parent_id given for createObjectByPath" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if not h5path: - msg = "no h5path given for createObjectByPath" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - log.debug(f"createObjectByPath - parent_id: {parent_id}, h5path: {h5path}") - if obj_id: - log.debug(f"createObjectByPath using client id: {obj_id}") - if obj_type: - log.debug(f" obj_type: {obj_type}") - if obj_shape: - log.debug(f" obj_shape: {obj_shape}") - if layout: - log.debug(f" layout: {layout}") - if creation_props: - log.debug(f" cprops: {creation_props}") - if attrs: - log.debug(f" attrs: {attrs}") - if links: - log.debug(f" links: {links}") - if obj_type: - msg = "only group objects can have links" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + kwargs = {} + kwargs["parent_id"] = parent_id + kwargs["root_id"] = root_id + kwargs["h5path"] = h5path + kwargs["obj_id"] = obj_id + kwargs["creation_props"] = creation_props + kwargs["attrs"] = attrs + kwargs["links"] = links + kwargs["implicit"] = implicit + kwargs["bucket"] = bucket + rsp_json = await createObject(app, **kwargs) + return rsp_json - root_id = getRootObjId(parent_id) - if h5path.startswith("/"): - if parent_id == root_id: - # just adjust the path to be relative - h5path = h5path[1:] - else: - msg = f"createObjectByPath expecting relative h5path, but got: {h5path}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) +async def createDatatypeObj(app, + parent_id=None, + root_id=None, + type=None, + h5path=None, + obj_id=None, + creation_props=None, + attrs=None, + links=None, + implicit=None, + bucket=None): - if h5path.endswith("/"): - h5path = h5path[:-1] # makes iterating through the links a bit easier + """ create a new committed type object""" - if not h5path: - msg = "h5path for createObjectByPath invalid" + if not type: + msg = "type not set for committed type creation" log.warn(msg) raise HTTPBadRequest(reason=msg) - obj_json = None - link_titles = h5path.split("/") - log.debug(f"link_titles: {link_titles}") - for i in range(len(link_titles)): - if i == len(link_titles) - 1: - last_link = True - else: - last_link = False - link_title = link_titles[i] - log.debug(f"createObjectByPath - processing link: {link_title}") - link_json = None - try: - link_json = await getLink(app, 
parent_id, link_title, bucket=bucket) - except (HTTPNotFound, HTTPGone): - pass # link doesn't exist + kwargs = {} + kwargs["parent_id"] = parent_id + kwargs["root_id"] = root_id + kwargs["type"] = type + kwargs["h5path"] = h5path + kwargs["obj_id"] = obj_id + kwargs["creation_props"] = creation_props + kwargs["attrs"] = attrs + kwargs["links"] = links + kwargs["implicit"] = implicit + kwargs["bucket"] = bucket + rsp_json = await createObject(app, **kwargs) + return rsp_json - if link_json: - log.debug(f"link for link_title {link_title} found: {link_json}") - # if this is the last link, that's a problem - if last_link: - msg = f"object at {h5path} already exists" - log.warn(msg) - raise HTTPConflict() - # otherwise, verify that this is a hardlink - if link_json.get("class") != "H5L_TYPE_HARD": - msg = "createObjectByPath - h5path must contain only hardlinks" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - parent_id = link_json["id"] - if getCollectionForId(parent_id) != "groups": - # parent objects must be groups! - msg = f"{link_title} is not a group" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - log.debug(f"link: {link_title} to sub-group found") - else: - log.debug(f"link for link_title {link_title} not found") - if not last_link and not implicit: - if len(link_titles) > 1: - msg = f"createObjectByPath failed: not all groups in {h5path} exist" - else: - msg = f"createObjectByPath failed: {h5path} does not exist" - log.warn(msg) - raise HTTPNotFound(reason=msg) - # create the group or group/datatype/dataset for the last - # item in the path (based on parameters passed in) - kwargs = {"bucket": bucket, "root_id": root_id} - if last_link: - if obj_type: - kwargs["obj_type"] = obj_type - if obj_shape: - kwargs["obj_shape"] = obj_shape - if layout: - kwargs["layout"] = layout - if creation_props: - kwargs["creation_props"] = creation_props - if attrs: - kwargs["attrs"] = attrs - if links: - kwargs["links"] = links - if obj_id: - kwargs["obj_id"] = obj_id - obj_json = await createObject(app, **kwargs) - tgt_id = obj_json["id"] - # create a link to the new object - await putHardLink(app, parent_id, link_title, tgt_id=tgt_id, bucket=bucket) - parent_id = tgt_id # new parent - log.info(f"createObjectByPath {h5path} done, returning obj_json") +async def createDataset(app, + parent_id=None, + root_id=None, + type=None, + shape=None, + h5path=None, + obj_id=None, + creation_props=None, + layout=None, + attrs=None, + links=None, + implicit=None, + bucket=None): + + """ create a new dataset object""" + + if not type: + msg = "type not set for dataset creation" + log.warn(msg) + raise HTTPBadRequest(reason=msg) - return obj_json + if not shape: + msg = "shape not set for dataset creation" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + kwargs = {} + kwargs["parent_id"] = parent_id + kwargs["root_id"] = root_id + kwargs["type"] = type + kwargs["shape"] = shape + kwargs["h5path"] = h5path + kwargs["obj_id"] = obj_id + kwargs["layout"] = layout + kwargs["creation_props"] = creation_props + kwargs["attrs"] = attrs + kwargs["links"] = links + kwargs["implicit"] = implicit + kwargs["bucket"] = bucket + rsp_json = await createObject(app, **kwargs) + return rsp_json diff --git a/tests/integ/datatype_test.py b/tests/integ/datatype_test.py index f8f01bea..9a73d9a5 100755 --- a/tests/integ/datatype_test.py +++ b/tests/integ/datatype_test.py @@ -609,6 +609,80 @@ def testPostWithPath(self): rspJson = json.loads(rsp.text) self.assertEqual(rspJson["id"], new_datatype_id) + def 
testPostMulti(self): + # test POST with multi-object creation + print("testPostMulti", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # get root group and verify link count is 0 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + + str_type = { + "charSet": "H5T_CSET_ASCII", + "class": "H5T_STRING", + "length": 12, + "strPad": "H5T_STR_NULLPAD", + } + + float_type = "H5T_IEEE_F32LE" + + # create a set of anonymous ctypes + fields = ( + {"name": "temp", "type": "H5T_STD_I32LE"}, + {"name": "pressure", "type": "H5T_IEEE_F32LE"}, + ) + compound_type = {"class": "H5T_COMPOUND", "fields": fields} + + payload = [{"type": str_type}, {"type": float_type}, {"type": compound_type}] + req = helper.getEndpoint() + "/datatypes" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), 3) + + for i in range(3): + obj_json = rsp_objs[i] + self.assertEqual(obj_json["attributeCount"], 0) + ctype_id = obj_json["id"] + self.assertTrue(helper.validateId(ctype_id)) + + # create a set of linked ctypes + for i in range(3): + item = payload[i] + item["link"] = {"id": root_uuid, "name": f"ctype_{i + 1}"} + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), 3) + for i in range(3): + json_rsp = rsp_objs[i] + self.assertEqual(json_rsp["attributeCount"], 0) + ctype_id = json_rsp["id"] + self.assertTrue(helper.validateId(ctype_id)) + + # get root group and verify link count is 3 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 3) + if __name__ == "__main__": # setup test files diff --git a/tests/integ/group_test.py b/tests/integ/group_test.py index fbbda066..ce617e4e 100755 --- a/tests/integ/group_test.py +++ b/tests/integ/group_test.py @@ -223,6 +223,22 @@ def testPost(self): self.assertTrue("alias" in rspJson) self.assertEqual(rspJson["alias"], []) + # try with an empty body + payload = {} + req = endpoint + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + group_id = rspJson["id"] + self.assertTrue(helper.validateId(group_id)) + + # try with a type in body (as if we were trying to create a committed type) + payload["type"] = "H5T_IEEE_F32LE" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) + # try POST with user who doesn't have create permission on this domain test_user2 = config.get("user2_name") # some tests will be skipped if not set if not test_user2: @@ -655,6 
+671,8 @@ def testPostIdWithPath(self):

         # try again with implicit creation set
         params = {"implicit": 1}
+        g21_id = createObjId("groups", root_id=root_uuid)
+        payload = {"id": g21_id, "h5path": "g2/g2.1"}
         rsp = self.session.post(req, data=json.dumps(payload), params=params, headers=headers)
         self.assertEqual(rsp.status_code, 201)  # g2 and g2.1 created
         rspJson = json.loads(rsp.text)
From ef746d0f595c7ffed48d38b35be192213822a38b Mon Sep 17 00:00:00 2001
From: John Readey
Date: Sun, 18 May 2025 12:24:22 +0200
Subject: [PATCH 25/49] added datatype test with no type in body

---
 hsds/ctype_sn.py             | 6 +++---
 tests/integ/datatype_test.py | 7 ++++++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py
index d9cfb71d..b2a3d260 100755
--- a/hsds/ctype_sn.py
+++ b/hsds/ctype_sn.py
@@ -155,7 +155,7 @@ async def POST_Datatype(request):
         await validateUserPassword(app, username, pswd)

     if not request.has_body:
-        msg = "POST Datatype with no body"
+        msg = "POST datatype with no body"
         log.warn(msg)
         raise HTTPBadRequest(reason=msg)

@@ -213,7 +213,7 @@ async def POST_Datatype(request):
                 kwargs["ignore_link"] = True
                 kwarg_list.append(kwargs)
             kwargs = {"bucket": bucket, "root_id": root_id}
-            log.debug(f"createDatatypeObjcs, items: {kwarg_list}")
+            log.debug(f"createDatatypeObjects, items: {kwarg_list}")
             post_rsp = await createDatatypeObjs(app, kwarg_list, **kwargs)
     else:
         # single object create
@@ -251,7 +251,7 @@ async def POST_Datatype(request):
                     links = parent_ids[parent_id]
                     links[title] = {"id": obj_id}
         if parent_ids:
-            log.debug(f"POST ctype multi - adding links: {parent_ids}")
+            log.debug(f"POST datatype multi - adding links: {parent_ids}")
             kwargs = {"action": "put_link", "bucket": bucket}
             kwargs["replace"] = True

diff --git a/tests/integ/datatype_test.py b/tests/integ/datatype_test.py
index 9a73d9a5..a3f20fd3 100755
--- a/tests/integ/datatype_test.py
+++ b/tests/integ/datatype_test.py
@@ -140,9 +140,14 @@ def testPostTypeWithId(self):
         # create a datatype id
         ctype_id = createObjId("datatypes", root_id=root_uuid)

+        # try creating a committed type without a type in the body
+        req = self.endpoint + "/datatypes"
+        data = {"id": ctype_id}
+        rsp = self.session.post(req, data=json.dumps(data), headers=headers)
+        self.assertEqual(rsp.status_code, 400)  # bad request
+
         # create a committed type obj
         data = {"id": ctype_id, "type": "H5T_IEEE_F32LE"}
-        req = self.endpoint + "/datatypes"
         rsp = self.session.post(req, data=json.dumps(data), headers=headers)
         self.assertEqual(rsp.status_code, 201)
         rspJson = json.loads(rsp.text)
From b1af9bc7347f51f4447df59fb25e35789224c871 Mon Sep 17 00:00:00 2001
From: John Readey
Date: Tue, 20 May 2025 20:41:57 +0200
Subject: [PATCH 26/49] modularize dataset creation args processing

---
 hsds/dset_sn.py               | 736 +++++-----------------------------
 hsds/post_crawl.py            |  31 +-
 hsds/servicenode_lib.py       | 381 ++++++++++++++++++-
 hsds/util/chunkUtil.py        | 224 ---------
 hsds/util/dsetUtil.py         | 842 +++++++++++++++++++++++++++++++---
 tests/integ/datatype_test.py  |   2 +-
 tests/integ/value_test.py     |   2 +-
 tests/unit/chunk_util_test.py | 287 +-----------
 tests/unit/dset_util_test.py  | 274 ++++++++++++-
 9 files changed, 1569 insertions(+), 1210 deletions(-)

diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py
index 3650db7b..75110bb7 100755
--- a/hsds/dset_sn.py
+++ b/hsds/dset_sn.py
@@ -14,225 +14,29 @@
 # handles dataset requests
 #

-import math
 from json import JSONDecodeError

 from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound

-from h5json.hdf5dtype import validateTypeItem, 
createDataType, getBaseTypeJson, getItemSize -from h5json.array_util import getNumElements, getNumpyValue, jsonToArray +#from h5json.hdf5dtype import createDataType +from h5json.array_util import getNumElements #, jsonToArray from h5json.objid import isValidUuid, isSchema2Id from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse, getBooleanParam -from .util.dsetUtil import getPreviewQuery, getFilterItem, getShapeDims -from .util.chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk -from .util.chunkUtil import getContiguousLayout +from .util.dsetUtil import getPreviewQuery# , getShapeDims, validateChunkLayout from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot -from .util.storUtil import getSupportedFilters from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo -from .servicenode_lib import getCreateArgs, createDataset, deleteObject -from .dset_lib import updateShape, deleteAllChunks, doHyperslabWrite -from . import config +from .servicenode_lib import getDatasetCreateArgs, createDataset, deleteObject +from .dset_lib import updateShape, deleteAllChunks #, doHyperslabWrite +from .post_crawl import createDatasets +from .domain_crawl import DomainCrawler from . import hsds_logger as log -async def validateChunkLayout(app, shape_json, item_size, layout, bucket=None): - """ - Use chunk layout given in the creationPropertiesList (if defined and - layout is valid). - Return chunk_layout_json - """ - - rank = 0 - space_dims = None - chunk_dims = None - max_dims = None - - if "dims" in shape_json: - space_dims = shape_json["dims"] - rank = len(space_dims) - - if "maxdims" in shape_json: - max_dims = shape_json["maxdims"] - if "dims" in layout: - chunk_dims = layout["dims"] - - if chunk_dims: - # validate that the chunk_dims are valid and correlates with the - # dataset shape - if isinstance(chunk_dims, int): - chunk_dims = [ - chunk_dims, - ] # promote to array - if len(chunk_dims) != rank: - msg = "Layout rank does not match shape rank" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - for i in range(rank): - dim_extent = space_dims[i] - chunk_extent = chunk_dims[i] - if not isinstance(chunk_extent, int): - msg = "Layout dims must be integer or integer array" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if chunk_extent <= 0: - msg = "Invalid layout value" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if max_dims is None: - if chunk_extent > dim_extent: - msg = "Invalid layout value" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif max_dims[i] != 0: - if chunk_extent > max_dims[i]: - msg = "Invalid layout value for extensible dimension" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - pass # allow any positive value for unlimited dimensions - - if "class" not in layout: - msg = "class key not found in layout for creation property list" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - layout_class = layout["class"] - - if layout_class == "H5D_CONTIGUOUS_REF": - # reference to a dataset in a traditional HDF5 files with - # contigious storage - if item_size == "H5T_VARIABLE": - # can't be used with variable types... 
- msg = "Datsets with variable types cannot be used with " - msg += "reference layouts" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "file_uri" not in layout: - # needed for H5D_CONTIGUOUS_REF - msg = "'file_uri' key must be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "offset" not in layout: - # needed for H5D_CONTIGUOUS_REF - msg = "'offset' key must be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "size" not in layout: - # needed for H5D_CONTIGUOUS_REF - msg = "'size' key must be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "dims" in layout: - # used defined chunk layout not allowed for H5D_CONTIGUOUS_REF - msg = "'dims' key can not be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_CHUNKED_REF": - # reference to a dataset in a traditional HDF5 files with - # chunked storage - if item_size == "H5T_VARIABLE": - # can't be used with variable types.. - msg = "Datsets with variable types cannot be used with " - msg += "reference layouts" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "file_uri" not in layout: - # needed for H5D_CHUNKED_REF - msg = "'file_uri' key must be provided for " - msg += "H5D_CHUNKED_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "dims" not in layout: - # needed for H5D_CHUNKED_REF - msg = "'dimns' key must be provided for " - msg += "H5D_CHUNKED_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "chunks" not in layout: - msg = "'chunks' key must be provided for " - msg += "H5D_CHUNKED_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_CHUNKED_REF_INDIRECT": - # reference to a dataset in a traditional HDF5 files with chunked - # storage using an auxillary dataset - if item_size == "H5T_VARIABLE": - # can't be used with variable types.. 
- msg = "Datsets with variable types cannot be used with " - msg += "reference layouts" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "dims" not in layout: - # needed for H5D_CHUNKED_REF_INDIRECT - msg = "'dimns' key must be provided for " - msg += "H5D_CHUNKED_REF_INDIRECT layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "chunk_table" not in layout: - msg = "'chunk_table' key must be provided for " - msg += "H5D_CHUNKED_REF_INDIRECT layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - chunktable_id = layout["chunk_table"] - if not isValidUuid(chunktable_id, "Dataset"): - msg = f"Invalid chunk table id: {chunktable_id}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - # verify the chunk table exists and is of reasonable shape - try: - chunktable_json = await getDsetJson(app, chunktable_id, bucket=bucket) - except HTTPNotFound: - msg = f"chunk table id: {chunktable_id} not found" - log.warn(msg) - raise - chunktable_shape = chunktable_json["shape"] - if chunktable_shape["class"] == "H5S_NULL": - msg = "Null space datasets can not be used as chunk tables" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - chunktable_dims = getShapeDims(chunktable_shape) - if len(chunktable_dims) != len(space_dims): - msg = "Chunk table rank must be same as dataspace rank" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_CHUNKED": - if "dims" not in layout: - msg = "dims key not found in layout for creation property list" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if shape_json["class"] != "H5S_SIMPLE": - msg = "Bad Request: chunked layout not valid with shape class: " - msg += f"{shape_json['class']}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_CONTIGUOUS": - if "dims" in layout: - msg = "dims key found in layout for creation property list " - msg += "for H5D_CONTIGUOUS storage class" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_COMPACT": - if "dims" in layout: - msg = "dims key found in layout for creation property list " - msg += "for H5D_COMPACT storage class" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - msg = f"Unexpected layout: {layout_class}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - async def getDatasetDetails(app, dset_id, root_id, bucket=None): """Get extra information about the given dataset""" # Gather additional info on the domain @@ -687,165 +491,118 @@ async def POST_Dataset(request): verifyRoot(domain_json) - # - # validate type input - # - if "type" not in body: - msg = "POST Dataset has no type key in body" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - datatype = body["type"] - log.debug(f"got datatype: {datatype}") - if isinstance(datatype, str) and datatype.startswith("t-"): - # Committed type - fetch type json from DN - ctype_id = datatype - log.debug(f"got ctypeid: {ctype_id}") - ctype_json = await getObjectJson(app, ctype_id, bucket=bucket) - log.debug(f"ctype: {ctype_json}") - if ctype_json["root"] != root_id: - msg = "Referenced committed datatype must belong in same domain" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - datatype = ctype_json["type"] - # add the ctype_id to type type - datatype["id"] = ctype_id - elif isinstance(datatype, str): - try: - # convert predefined type string (e.g. 
"H5T_STD_I32LE") to - # corresponding json representation - datatype = getBaseTypeJson(datatype) - log.debug(f"got datatype: {datatype}") - except TypeError: - msg = "POST Dataset with invalid predefined type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + # allow parent group creation or not + implicit = getBooleanParam(params, "implicit") - try: - validateTypeItem(datatype) - except KeyError as ke: - msg = f"KeyError creating type: {ke}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except TypeError as te: - msg = f"TypeError creating type: {te}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except ValueError as ve: - msg = f"ValueError creating type: {ve}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + post_rsp = None - item_size = getItemSize(datatype) + datatype_json = None # - # Validate shape input + # handle case of committed type input # - dims = None - shape_json = {} - rank = 0 - chunk_size = None - - if "shape" not in body: - shape_json["class"] = "H5S_SCALAR" - else: - shape = body["shape"] - log.debug(f"got shape: {shape}") - if isinstance(shape, int): - shape_json["class"] = "H5S_SIMPLE" - dims = [shape, ] - shape_json["dims"] = dims - rank = 1 - elif isinstance(shape, str): - # only valid string value is H5S_NULL or H5S_SCALAR - if shape == "H5S_NULL": - shape_json["class"] = "H5S_NULL" - elif shape == "H5S_SCALAR": - shape_json["class"] = "H5S_SCALAR" - else: - msg = "POST Dataset with invalid shape value" + if isinstance(body, dict) and "type" in body: + + body_type = body["type"] + log.debug(f"got datatype: {body_type}") + if isinstance(body_type, str) and body_type.startswith("t-"): + ctype_id = body_type + # Committed type - fetch type json from DN + log.debug(f"got ctype_id: {ctype_id}") + ctype_json = await getObjectJson(app, ctype_id, bucket=bucket) + log.debug(f"ctype: {ctype_json}") + if ctype_json["root"] != root_id: + msg = "Referenced committed datatype must belong in same domain" log.warn(msg) raise HTTPBadRequest(reason=msg) - elif isinstance(shape, list): - if len(shape) == 0: - shape_json["class"] = "H5S_SCALAR" - else: - shape_json["class"] = "H5S_SIMPLE" - shape_json["dims"] = shape - dims = shape - rank = len(dims) + datatype_json = ctype_json["type"] + # add the ctype_id to type type + datatype_json["id"] = ctype_id else: - msg = "Bad Request: shape is invalid" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if dims is not None: - for i in range(rank): - extent = dims[i] - if not isinstance(extent, int): - msg = "Invalid shape type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if extent < 0: - msg = "shape dimension is negative" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - maxdims = None - if "maxdims" in body: - if dims is None: - msg = "Maxdims cannot be supplied if space is NULL" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - maxdims = body["maxdims"] - if isinstance(maxdims, int): - dim1 = maxdims - maxdims = [dim1] - elif isinstance(maxdims, list): - pass # can use as is + pass # we'll fetch type in getDatasetCreateArgs + + if isinstance(body, list): + count = len(body) + log.debug(f"multiple dataset create: {count} items") + if count == 0: + # equivalent to no body + msg = "POST Dataset with no body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif count == 1: + # just create one object in typical way + kwargs = getDatasetCreateArgs(body[0], + root_id=root_id, + type=datatype_json, + bucket=bucket, + implicit=implicit) else: - msg = "Bad Request: maxdims is invalid" - 
log.warn(msg) - raise HTTPBadRequest(reason=msg) - if len(dims) != len(maxdims): - msg = "Maxdims rank doesn't match Shape" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if maxdims is not None: - for extent in maxdims: - if not isinstance(extent, int): - msg = "Invalid maxdims type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if extent < 0: - msg = "maxdims dimension is negative" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if len(maxdims) != len(dims): - msg = "Bad Request: maxdims array length must equal " - msg += "shape array length" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - shape_json["maxdims"] = [] - for i in range(rank): - maxextent = maxdims[i] - if not isinstance(maxextent, int): - msg = "Bad Request: maxdims must be integer type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif maxextent == 0: - # unlimited dimension - shape_json["maxdims"].append(0) - elif maxextent < dims[i]: - msg = "Bad Request: maxdims extent can't be smaller " - msg += "than shape extent" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - shape_json["maxdims"].append(maxextent) + # create multiple dataset objects + kwarg_list = [] # list of kwargs for each object + for item in body: + log.debug(f"item: {item}") + if not isinstance(item, dict): + msg = f"Post_Dataset - invalid item type: {type(item)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs = getDatasetCreateArgs(item, root_id=root_id, type=datatype_json, bucket=bucket) + kwargs["ignore_link"] = True + kwarg_list.append(kwargs) + kwargs = {"bucket": bucket, "root_id": root_id} + if datatype_json: + kwargs["type"] = datatype_json + log.debug(f"createDatasetObjects, items: {kwarg_list}") + post_rsp = await createDatasets(app, kwarg_list, **kwargs) + else: + # single object create + kwargs = getDatasetCreateArgs(body, root_id=root_id, type=datatype_json, bucket=bucket, implicit=implicit) + log.debug(f"kwargs for dataset create: {kwargs}") + + if post_rsp is None: + # Handle cases other than multi ctype create here + post_rsp = await createDataset(app, **kwargs) + + log.debug(f"returning resp: {post_rsp}") + + if "objects" in post_rsp: + # add any links in multi request + objects = post_rsp["objects"] + obj_count = len(objects) + log.debug(f"Post datatype multi create: {obj_count} objects") + if len(body) != obj_count: + msg = f"Expected {obj_count} objects but got {len(body)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + parent_ids = {} + for index in range(obj_count): + item = body[index] + if "link" in item: + link_item = item["link"] + parent_id = link_item.get("id") + title = link_item.get("name") + if parent_id and title: + # add a hard link + object = objects[index] + obj_id = object["id"] + if parent_id not in parent_ids: + parent_ids[parent_id] = {} + links = parent_ids[parent_id] + links[title] = {"id": obj_id} + if parent_ids: + log.debug(f"POST dataset multi - adding links: {parent_ids}") + kwargs = {"action": "put_link", "bucket": bucket} + kwargs["replace"] = True + + crawler = DomainCrawler(app, parent_ids, **kwargs) + + # will raise exception on not found, server busy, etc. 
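+        # (one put_link request is issued for each parent group in parent_ids)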
+ await crawler.crawl() + + status = crawler.get_status() + + log.info(f"DomainCrawler done for put_links action, status: {status}") + """ if "value" in body and body["value"]: # data to initialize dataset included in request input_data = body["value"] @@ -871,277 +628,6 @@ async def POST_Dataset(request): else: input_arr = None - layout_props = None - min_chunk_size = int(config.get("min_chunk_size")) - max_chunk_size = int(config.get("max_chunk_size")) - if "creationProperties" in body: - creationProperties = body["creationProperties"] - log.debug(f"got creationProperties: {creationProperties}") - if "layout" in creationProperties: - layout_props = creationProperties["layout"] - await validateChunkLayout(app, shape_json, item_size, layout_props, bucket=bucket) - else: - creationProperties = {} - - if "attributes" in body: - attrs = body["attributes"] - log.debug(f"POST Dataset attributes: {attrs}") - else: - attrs = None - - # TBD: check for invalid layout class... - if layout_props: - if layout_props["class"] == "H5D_CONTIGUOUS": - # treat contiguous as chunked - layout_class = "H5D_CHUNKED" - else: - layout_class = layout_props["class"] - elif shape_json["class"] != "H5S_NULL": - layout_class = "H5D_CHUNKED" - else: - layout_class = None - - if layout_class == "H5D_COMPACT": - layout = {"class": "H5D_COMPACT"} - elif layout_class: - # initialize to H5D_CHUNKED - layout = {"class": "H5D_CHUNKED"} - else: - # null space - no layout - layout = None - - if layout_props and "dims" in layout_props: - chunk_dims = layout_props["dims"] - else: - chunk_dims = None - - if layout_class == "H5D_CONTIGUOUS_REF": - kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} - chunk_dims = getContiguousLayout(shape_json, item_size, **kwargs) - layout["dims"] = chunk_dims - log.debug(f"autoContiguous layout: {layout}") - - if layout_class == "H5D_CHUNKED" and chunk_dims is None: - # do autochunking - chunk_dims = guessChunk(shape_json, item_size) - log.debug(f"initial autochunk layout: {chunk_dims}") - - if layout_class == "H5D_CHUNKED": - chunk_size = getChunkSize(chunk_dims, item_size) - - msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " - msg += f"max: {max_chunk_size}" - log.debug(msg) - # adjust the chunk shape if chunk size is too small or too big - adjusted_chunk_dims = None - if chunk_size < min_chunk_size: - msg = f"chunk size: {chunk_size} less than min size: " - msg += f"{min_chunk_size}, expanding" - log.debug(msg) - kwargs = {"chunk_min": min_chunk_size, "layout_class": layout_class} - adjusted_chunk_dims = expandChunk(chunk_dims, item_size, shape_json, **kwargs) - elif chunk_size > max_chunk_size: - msg = f"chunk size: {chunk_size} greater than max size: " - msg += f"{max_chunk_size}, shrinking" - log.debug(msg) - kwargs = {"chunk_max": max_chunk_size} - adjusted_chunk_dims = shrinkChunk(chunk_dims, item_size, **kwargs) - if adjusted_chunk_dims: - msg = f"requested chunk_dimensions: {chunk_dims} modified " - msg += f"dimensions: {adjusted_chunk_dims}" - log.debug(msg) - layout["dims"] = adjusted_chunk_dims - else: - layout["dims"] = chunk_dims # don't need to adjust chunk size - - # set partition_count if needed: - max_chunks_per_folder = int(config.get("max_chunks_per_folder")) - set_partition = False - if max_chunks_per_folder > 0: - if "dims" in shape_json and "dims" in layout: - set_partition = True - - if set_partition: - chunk_dims = layout["dims"] - shape_dims = shape_json["dims"] - if "maxdims" in shape_json: - max_dims = shape_json["maxdims"] - else: - max_dims 
= None - num_chunks = 1 - rank = len(shape_dims) - unlimited_count = 0 - if max_dims: - for i in range(rank): - if max_dims[i] == 0: - unlimited_count += 1 - msg = f"number of unlimited dimensions: {unlimited_count}" - log.debug(msg) - - for i in range(rank): - max_dim = 1 - if max_dims: - max_dim = max_dims[i] - if max_dim == 0: - # don't really know what the ultimate extent - # could be, but assume 10^6 for total number of - # elements and square-shaped array... - MAX_ELEMENT_GUESS = 10.0 ** 6 - exp = 1 / unlimited_count - max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) - else: - max_dim = shape_dims[i] - num_chunks *= math.ceil(max_dim / chunk_dims[i]) - - if num_chunks > max_chunks_per_folder: - partition_count = math.ceil(num_chunks / max_chunks_per_folder) - msg = f"set partition count to: {partition_count}, " - msg += f"num_chunks: {num_chunks}" - log.info(msg) - layout["partition_count"] = partition_count - else: - msg = "do not need chunk partitions, num_chunks: " - msg += f"{num_chunks} max_chunks_per_folder: " - msg += f"{max_chunks_per_folder}" - log.info(msg) - - if layout_class in ("H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT"): - chunk_size = getChunkSize(chunk_dims, item_size) - - msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " - msg += f"max: {max_chunk_size}" - log.debug(msg) - # nothing to do about inefficiently small chunks, but large chunks - # can be subdivided - if chunk_size < min_chunk_size: - msg = f"chunk size: {chunk_size} less than min size: " - msg += f"{min_chunk_size} for {layout_class} dataset" - log.warn(msg) - elif chunk_size > max_chunk_size: - msg = f"chunk size: {chunk_size} greater than max size: " - msg += f"{max_chunk_size}, for {layout_class} dataset" - log.warn(msg) - layout["dims"] = chunk_dims - - if creationProperties: - # TBD - validate all creationProperties - if "fillValue" in creationProperties: - # validate fill value compatible with type - dt = createDataType(datatype) - fill_value = creationProperties["fillValue"] - if "fillValue_encoding" in creationProperties: - fill_value_encoding = creationProperties["fillValue_encoding"] - - if fill_value_encoding not in ("None", "base64"): - msg = f"unexpected value for fill_value_encoding: {fill_value_encoding}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - # should see a string in this case - if not isinstance(fill_value, str): - msg = f"unexpected fill value: {fill_value} " - msg += f"for encoding: {fill_value_encoding}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - fill_value_encoding = None - - try: - getNumpyValue(fill_value, dt=dt, encoding=fill_value_encoding) - except ValueError: - msg = f"invalid fill value: {fill_value}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if "filters" in creationProperties: - # convert to standard representation - # refer to https://hdf5-json.readthedocs.io/en/latest/bnf/\ - # filters.html#grammar-token-filter_list - f_in = creationProperties["filters"] - supported_filters = getSupportedFilters(include_compressors=True) - log.debug(f"supported_compressors: {supported_filters}") - - log.debug(f"filters provided in creationProperties: {f_in}") - - if not isinstance(f_in, list): - msg = "Expected filters in creationProperties to be a list" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if f_in and chunk_size is None: - # filters can only be used with chunked datasets - msg = "Filters can only be used with chunked datasets" - log.warning(msg) - raise HTTPBadRequest(reason=msg) - - f_out = [] - for filter in 
f_in: - if isinstance(filter, int) or isinstance(filter, str): - item = getFilterItem(filter) - if not item: - msg = f"filter {filter} not recognized" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if item["name"] not in supported_filters: - msg = f"filter {filter} is not supported" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - f_out.append(item) - elif isinstance(filter, dict): - if "class" not in filter: - msg = "expected 'class' key for filter property" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if filter["class"] != "H5Z_FILTER_USER": - item = getFilterItem(filter["class"]) - elif "id" in filter: - item = getFilterItem(filter["id"]) - elif "name" in filter: - item = getFilterItem(filter["name"]) - else: - item = None - if not item: - msg = f"filter {filter['class']} not recognized" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "id" not in filter: - filter["id"] = item["id"] - elif item["id"] != filter["id"]: - msg = f"Expected {filter['class']} to have id: " - msg += f"{item['id']} but got {filter['id']}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "name" not in filter: - filter["name"] = item["name"] - if filter["name"] not in supported_filters: - msg = f"filter {filter} is not supported" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - f_out.append(filter) - else: - msg = f"Unexpected type for filter: {filter}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - # replace filters with our starndardized list - log.debug(f"setting filters to: {f_out}") - creationProperties["filters"] = f_out - - log.debug(f"set dataset json creationProperties: {creationProperties}") - - # setup args to createDataset - implicit = getBooleanParam(params, "implicit") - kwargs = getCreateArgs(body, root_id=root_id, type=datatype, bucket=bucket, implicit=implicit) - # fill in dataset-specific keys - kwargs["creation_props"] = creationProperties - kwargs["shape"] = shape_json - kwargs["layout"] = layout - - log.debug(f"kwargs for dataset create: {kwargs}") - - dset_json = await createDataset(app, **kwargs) - # write data if provided if input_arr is not None: log.debug(f"write input_arr: {input_arr}") @@ -1162,9 +648,9 @@ async def POST_Dataset(request): kwargs["data"] = input_arr # do write await doHyperslabWrite(app, request, **kwargs) - + """ # dataset creation successful - resp = await jsonResponse(request, dset_json, status=201) + resp = await jsonResponse(request, post_rsp, status=201) log.response(request, resp=resp) return resp diff --git a/hsds/post_crawl.py b/hsds/post_crawl.py index 225376aa..198b1492 100644 --- a/hsds/post_crawl.py +++ b/hsds/post_crawl.py @@ -149,6 +149,8 @@ async def create(self, index): kwargs["obj_id"] = item["obj_id"] if "type" in item: kwargs["type"] = item["type"] + if "shape" in item: + kwargs["shape"] = item["shape"] if "layout" in item: kwargs["layout"] = item["layout"] if "creation_props" in item: @@ -231,7 +233,7 @@ async def createGroups(app, items: list, root_id=None, bucket=None): async def createDatatypeObjs(app, items: list, root_id=None, bucket=None): - """ create an datatype objects based on parameters in items list """ + """ create datatype objects based on parameters in items list """ if not root_id: msg = "no root_id given for createDatatypeObjs" @@ -256,3 +258,30 @@ async def createDatatypeObjs(app, items: list, root_id=None, bucket=None): rsp_json = await _createObjects(app, items=items, root_id=root_id, bucket=bucket) return rsp_json + +async def createDatasets(app, items: list, root_id=None, 
bucket=None):
+    """ create dataset objects based on parameters in items list """
+
+    if not root_id:
+        msg = "no root_id given for createDatasets"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+
+    for item in items:
+        if not isinstance(item, dict):
+            msg = "expected list of dictionary objects for multi-object create"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+        if "type" not in item:
+            msg = "type key not provided for multi-dataset create"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+        if "shape" not in item:
+            msg = "shape key not provided for multi-dataset create"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+
+    log.info(f"createDatasets with {len(items)} items, root_id: {root_id}")
+
+    rsp_json = await _createObjects(app, items=items, root_id=root_id, bucket=bucket)
+    return rsp_json
diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py
index fdbfe6c4..bc089420 100644
--- a/hsds/servicenode_lib.py
+++ b/hsds/servicenode_lib.py
@@ -15,6 +15,7 @@
 import asyncio
 import json
+import math
 import time
 import numpy as np
@@ -24,7 +25,8 @@
 from aiohttp.client_exceptions import ClientOSError, ClientError
 from aiohttp import ClientResponseError
-from h5json.array_util import encodeData, decodeData, bytesToArray, bytesArrayToList, jsonToArray
+from h5json.array_util import encodeData, decodeData, bytesToArray, bytesArrayToList
+from h5json.array_util import jsonToArray, getNumpyValue
 from h5json.objid import getCollectionForId, createObjId, getRootObjId
 from h5json.objid import isSchema2Id, getS3Key, isValidUuid
 from h5json.hdf5dtype import getBaseTypeJson, validateTypeItem, createDataType
@@ -33,12 +35,14 @@
 from .util.nodeUtil import getDataNodeUrl
 from .util.authUtil import getAclKeys
 from .util.linkUtil import h5Join, validateLinkName, getLinkClass, getRequestLinks
-from .util.storUtil import getStorJSONObj, isStorObj
+from .util.storUtil import getStorJSONObj, isStorObj, getSupportedFilters
 from .util.authUtil import aclCheck
 from .util.httpUtil import http_get, http_put, http_post, http_delete
 from .util.domainUtil import getBucketForDomain, verifyRoot, getLimits
 from .util.storUtil import getCompressors
-from .util.dsetUtil import getShapeDims
+from .util.dsetUtil import getShapeDims, getShapeJson, getFiltersJson, validateChunkLayout
+from .util.dsetUtil import getContiguousLayout, guessChunk, getChunkSize
+from .util.dsetUtil import expandChunk, shrinkChunk
 from .basenode import getVersion
 from . import hsds_logger as log
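# A minimal usage sketch for the new createDatasets helper (the type and
# shape JSON values below are illustrative hdf5-json, not taken from this
# patch):
#
#     items = []
#     for _ in range(2):
#         item = {"type": {"class": "H5T_FLOAT", "base": "H5T_IEEE_F32LE"},
#                 "shape": {"class": "H5S_SIMPLE", "dims": [100]},
#                 "ignore_link": True}
#         items.append(item)
#     rsp_json = await createDatasets(app, items, root_id=root_id, bucket=bucket)
#     # rsp_json["objects"] holds one entry per created dataset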
@@ -1287,15 +1291,69 @@ async def deleteObject(app, obj_id, bucket=None):
         del meta_cache[obj_id]  # remove from cache
 
+def validateDatasetCreationProps(creation_props, type_json=None, shape=None):
+    """ validate creation props """
+
+    log.debug(f"validateDatasetCreationProps: {creation_props}")
+    if "fillValue" in creation_props:
+        if not type_json or not shape:
+            msg = "shape and type must be set to use fillValue"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+
+        # validate fill value compatible with type
+        dt = createDataType(type_json)
+        fill_value = creation_props["fillValue"]
+        log.debug(f"got fill_value: {fill_value}")
+        if "fillValue_encoding" in creation_props:
+            fill_value_encoding = creation_props["fillValue_encoding"]
+            if fill_value_encoding not in ("None", "base64"):
+                msg = f"unexpected value for fill_value_encoding: {fill_value_encoding}"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+            else:
+                # should see a string in this case
+                if not isinstance(fill_value, str):
+                    msg = f"unexpected fill value: {fill_value} "
+                    msg += f"for encoding: {fill_value_encoding}"
+                    log.warn(msg)
+                    raise HTTPBadRequest(reason=msg)
+        else:
+            fill_value_encoding = None
+
+        try:
+            getNumpyValue(fill_value, dt=dt, encoding=fill_value_encoding)
+        except ValueError:
+            msg = f"invalid fill value: {fill_value}"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+
+    if "filters" in creation_props:
+        if not type_json or not shape:
+            msg = "shape and type must be set to use filters"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+
+        # will raise bad request exception if not valid
+        supported_filters = getSupportedFilters(include_compressors=True)
+        log.debug(f"supported_filters: {supported_filters}")
+        filters_out = getFiltersJson(creation_props, supported_filters=supported_filters)
+        # replace filters with our standardized list
+        log.debug(f"setting filters to: {filters_out}")
+        creation_props["filters"] = filters_out
+
 
 def getCreateArgs(body,
                   root_id=None,
                   bucket=None,
                   type=None,
-                  shape=None, implicit=False,
+                  implicit=False,
+                  chunk_table=None,
                   ignore_link=False):
 
     """ get args for createObject from request body """
 
+    log.debug(f"getCreateArgs with body keys: {list(body.keys())}")
+
     kwargs = {"bucket": bucket}
 
     predate_max_time = config.get("predate_max_time", default=10.0)
@@ -1364,8 +1422,10 @@ def getCreateArgs(body,
 
     if "creationProperties" in body:
         creation_props = body["creationProperties"]
-        # tbd: validate creation_props
-        kwargs["creation_props"] = creation_props
+        # validate after we've checked for shape and type
+    else:
+        creation_props = {}
+    kwargs["creation_props"] = creation_props
 
     if "attributes" in body:
         attrs = body["attributes"]
@@ -1391,20 +1451,25 @@ def getCreateArgs(body,
 
     if type:
         kwargs["type"] = type
+        type_json = type
     elif "type" in body:
-        datatype = body["type"]
-        if isinstance(datatype, str):
+        type_json = body["type"]
+        if isinstance(type_json, str):
             try:
                 # convert predefined type string (e.g. 
"H5T_STD_I32LE") to # corresponding json representation - datatype = getBaseTypeJson(datatype) - log.debug(f"got datatype: {datatype}") + type_json = getBaseTypeJson(type_json) + log.debug(f"got type: {type_json}") except TypeError: - msg = f"POST with invalid predefined type: {datatype}" + msg = f"POST with invalid predefined type: {type_json}" log.warn(msg) raise HTTPBadRequest(reason=msg) + else: + type_json = None + + if type_json: try: - validateTypeItem(datatype) + validateTypeItem(type_json) except KeyError as ke: msg = f"KeyError creating type: {ke}" log.warn(msg) @@ -1417,10 +1482,237 @@ def getCreateArgs(body, msg = f"ValueError creating type: {ve}" log.warn(msg) raise HTTPBadRequest(reason=msg) - kwargs["type"] = datatype + kwargs["type"] = type_json else: pass # no type + return kwargs + +def getDatasetCreateArgs(body, + root_id=None, + bucket=None, + type=None, + implicit=False, + chunk_table=None, + ignore_link=False): + + """ get args for createDataset from request body """ + + # call getCreateArgs for group, datatype objects, then fill in for dataset specific options + kwargs = getCreateArgs(body, + root_id=root_id, + bucket=bucket, + type=type, + implicit=implicit, + ignore_link=ignore_link) + + if not "type" in kwargs: + msg = "no type specified for create dataset" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + type_json = kwargs["type"] + # + # Validate shape if present + # + + # will return scalar shape if no shape key in body + shape_json = getShapeJson(body) + kwargs["shape"] = shape_json + + # get layout for dataset creation + log.debug("getting dataset creation settings") + layout_props = None + min_chunk_size = int(config.get("min_chunk_size")) + max_chunk_size = int(config.get("max_chunk_size")) + type_json = kwargs["type"] + item_size = getItemSize(type_json) + creation_props = kwargs["creation_props"] + layout_props = None + + if creation_props: + validateDatasetCreationProps(creation_props, type_json=type_json, shape=shape_json) + if "layout" in creation_props: + layout_props = creation_props["layout"] + try: + validateChunkLayout(shape_json, item_size, layout_props, chunk_table=chunk_table) + except ValueError: + msg = f"invalid chunk layout: {layout_props}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # TBD: check for invalid layout class... 
+ if layout_props: + if layout_props["class"] == "H5D_CONTIGUOUS": + # treat contiguous as chunked + layout_class = "H5D_CHUNKED" + else: + layout_class = layout_props["class"] + elif shape_json["class"] != "H5S_NULL": + layout_class = "H5D_CHUNKED" + else: + layout_class = None + log.debug(f"using layout_class: {layout_class}") + + if layout_class == "H5D_COMPACT": + layout = {"class": "H5D_COMPACT"} + elif layout_class: + # initialize to H5D_CHUNKED + layout = {"class": "H5D_CHUNKED"} + else: + # null space - no layout + layout = None + + if layout_props and "dims" in layout_props: + chunk_dims = layout_props["dims"] + else: + chunk_dims = None + + if layout_class == "H5D_CONTIGUOUS_REF": + opts = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} + chunk_dims = getContiguousLayout(shape_json, item_size, **opts) + layout["dims"] = chunk_dims + log.debug(f"autoContiguous layout: {layout}") + + if layout_class == "H5D_CHUNKED" and chunk_dims is None: + # do auto-chunking + chunk_dims = guessChunk(shape_json, item_size) + log.debug(f"initial autochunk layout: {chunk_dims}") + if layout_class == "H5D_CHUNKED": + chunk_size = getChunkSize(chunk_dims, item_size) + + msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " + msg += f"max: {max_chunk_size}" + log.debug(msg) + + # adjust the chunk shape if chunk size is too small or too big + adjusted_chunk_dims = None + if chunk_size < min_chunk_size: + msg = f"chunk size: {chunk_size} less than min size: " + msg += f"{min_chunk_size}, expanding" + log.debug(msg) + opts = {"chunk_min": min_chunk_size, "layout_class": layout_class} + adjusted_chunk_dims = expandChunk(chunk_dims, item_size, shape_json, **opts) + elif chunk_size > max_chunk_size: + msg = f"chunk size: {chunk_size} greater than max size: " + msg += f"{max_chunk_size}, shrinking" + log.debug(msg) + opts = {"chunk_max": max_chunk_size} + adjusted_chunk_dims = shrinkChunk(chunk_dims, item_size, **opts) + + if adjusted_chunk_dims: + msg = f"requested chunk_dimensions: {chunk_dims} modified " + msg += f"dimensions: {adjusted_chunk_dims}" + log.debug(msg) + layout["dims"] = adjusted_chunk_dims + else: + layout["dims"] = chunk_dims # don't need to adjust chunk size + + # set partition_count if needed: + max_chunks_per_folder = int(config.get("max_chunks_per_folder")) + set_partition = False + if max_chunks_per_folder > 0: + if "dims" in shape_json and "dims" in layout: + set_partition = True + + if set_partition: + chunk_dims = layout["dims"] + shape_dims = shape_json["dims"] + if "maxdims" in shape_json: + max_dims = shape_json["maxdims"] + else: + max_dims = None + num_chunks = 1 + rank = len(shape_dims) + unlimited_count = 0 + if max_dims: + for i in range(rank): + if max_dims[i] == 0: + unlimited_count += 1 + msg = f"number of unlimited dimensions: {unlimited_count}" + log.debug(msg) + + for i in range(rank): + max_dim = 1 + if max_dims: + max_dim = max_dims[i] + if max_dim == 0: + # don't really know what the ultimate extent + # could be, but assume 10^6 for total number of + # elements and square-shaped array... 
+ MAX_ELEMENT_GUESS = 10.0 ** 6 + exp = 1 / unlimited_count + max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) + else: + max_dim = shape_dims[i] + num_chunks *= math.ceil(max_dim / chunk_dims[i]) + + if num_chunks > max_chunks_per_folder: + partition_count = math.ceil(num_chunks / max_chunks_per_folder) + msg = f"set partition count to: {partition_count}, " + msg += f"num_chunks: {num_chunks}" + log.info(msg) + layout["partition_count"] = partition_count + else: + msg = "do not need chunk partitions, num_chunks: " + msg += f"{num_chunks} max_chunks_per_folder: " + msg += f"{max_chunks_per_folder}" + log.info(msg) + + if layout_class in ("H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT"): + chunk_size = getChunkSize(chunk_dims, item_size) + + msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " + msg += f"max: {max_chunk_size}" + log.debug(msg) + # nothing to do about inefficiently small chunks, but large chunks + # can be subdivided + if chunk_size < min_chunk_size: + msg = f"chunk size: {chunk_size} less than min size: " + msg += f"{min_chunk_size} for {layout_class} dataset" + log.warn(msg) + elif chunk_size > max_chunk_size: + msg = f"chunk size: {chunk_size} greater than max size: " + msg += f"{max_chunk_size}, for {layout_class} dataset" + log.warn(msg) + layout["dims"] = chunk_dims + + if layout: + log.debug(f"setting layout to: {layout}") + kwargs["layout"] = layout + + # + # get input data if present + # + if "value" in body and body["value"]: + # data to initialize dataset included in request + if shape_json["class"] == "H5S_NULL": + msg = "null shape datasets can not have initial values" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + input_data = body["value"] + msg = "input data doesn't match request type and shape" + dims = getShapeDims(shape_json) + if not dims: + log.warn(msg) + raise HTTPBadRequest(reason=msg) + arr_dtype = createDataType(type_json) + + try: + input_arr = jsonToArray(dims, arr_dtype, input_data) + except ValueError: + log.warn(f"ValueError: {msg}") + raise HTTPBadRequest(reason=msg) + except TypeError: + log.warn(f"TypeError: {msg}") + raise HTTPBadRequest(reason=msg) + except IndexError: + log.warn(f"IndexError: {msg}") + raise HTTPBadRequest(reason=msg) + log.debug(f"got json arr: {input_arr.shape}") + kwargs["value"] = input_data + return kwargs @@ -1597,6 +1889,8 @@ async def createObject(app, params = {"bucket": bucket} rsp_json = await http_post(app, req, data=obj_json, params=params) + log.debug(f"createObject: {req} got rsp_json: {rsp_json}") + # object creation successful, create link from parent if requested if h5path: kwargs = {"tgt_id": obj_id, "bucket": bucket, "implicit": implicit} @@ -1675,6 +1969,7 @@ async def createDataset(app, obj_id=None, creation_props=None, layout=None, + value=None, attrs=None, links=None, implicit=None, @@ -1688,9 +1983,8 @@ async def createDataset(app, raise HTTPBadRequest(reason=msg) if not shape: - msg = "shape not set for dataset creation" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + # default to a scalar dataset + shape = {"class": "H5S_SCALAR"} kwargs = {} kwargs["parent_id"] = parent_id @@ -1705,5 +1999,56 @@ async def createDataset(app, kwargs["links"] = links kwargs["implicit"] = implicit kwargs["bucket"] = bucket - rsp_json = await createObject(app, **kwargs) - return rsp_json + dset_json = await createObject(app, **kwargs) + + if value: + log.debug(f"tbd - set dataset value to: {value}") + shape_json = kwargs["shape"] + type_json = kwargs["type"] + # data to initialize dataset included in 
request + msg = "input data doesn't match request type and shape" + dims = getShapeDims(shape_json) + if not dims: + log.warn(msg) + raise HTTPBadRequest(reason=msg) + arr_dtype = createDataType(type_json) + + try: + input_arr = jsonToArray(dims, arr_dtype, value) + except ValueError: + log.warn(f"ValueError: {msg}") + raise HTTPBadRequest(reason=msg) + except TypeError: + log.warn(f"TypeError: {msg}") + raise HTTPBadRequest(reason=msg) + except IndexError: + log.warn(f"IndexError: {msg}") + raise HTTPBadRequest(reason=msg) + log.debug(f"got json arr: {input_arr.shape}") + else: + input_arr = None + + # write data if provided + if input_arr is not None: + log.debug(f"write input_arr: {input_arr}") + # mixin the layout + dset_json["layout"] = layout + # make selection for entire dataspace + dims = getShapeDims(shape_json) + slices = [] + for dim in dims: + s = slice(0, dim, 1) + slices.append(s) + # make a one page list to handle the write in one chunk crawler run + # (larger write request should user binary streaming) + kwargs = {"page_number": 0, "page": slices} + kwargs["dset_json"] = dset_json + kwargs["bucket"] = bucket + kwargs["select_dtype"] = input_arr.dtype + kwargs["data"] = input_arr + log.debug(f"kwargs for hyperslab write: {kwargs}") + # do write + #request = None # don't need in this case since not reading from input stream + #await doHyperslabWrite(app, request, **kwargs) + + return dset_json diff --git a/hsds/util/chunkUtil.py b/hsds/util/chunkUtil.py index 9c984de6..9dd51bf9 100644 --- a/hsds/util/chunkUtil.py +++ b/hsds/util/chunkUtil.py @@ -11,230 +11,6 @@ PRIMES = [29, 31, 37, 41, 43, 47, 53, 59, 61, 67] # for chunk partitioning -def getChunkSize(layout, type_size): - """Return chunk size given layout. - i.e. just the product of the values in the list. - """ - if type_size == "H5T_VARIABLE": - type_size = DEFAULT_TYPE_SIZE - - chunk_size = type_size - for n in layout: - if n <= 0: - raise ValueError("Invalid chunk layout") - chunk_size *= n - return chunk_size - - -def get_dset_size(shape_json, typesize): - """Return the size of the dataspace. For - any unlimited dimensions, assume a value of 1. 
- (so the return size will be the absolute minimum) - """ - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return typesize # just return size for one item - if typesize == "H5T_VARIABLE": - typesize = DEFAULT_TYPE_SIZE # just take a guess at the item size - dset_size = typesize - shape = shape_json["dims"] - rank = len(shape) - - for n in range(rank): - if shape[n] == 0: - # extendable extent with value of 0 - continue # assume this is one - dset_size *= shape[n] - return dset_size - - -def expandChunk( - layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class="H5D_CHUNKED" -): - """Compute an increased chunk shape with a size in bytes greater than chunk_min.""" - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return (1,) # just enough to store one item - - layout = list(layout) - log.debug(f"expandChunk layout: {layout} typesize: {typesize}") - dims = shape_json["dims"] - rank = len(dims) - extendable_dims = 0 # number of dimensions that are extenable - maxdims = None - if "maxdims" in shape_json: - maxdims = shape_json["maxdims"] - for n in range(rank): - if maxdims[n] == 0 or maxdims[n] > dims[n]: - extendable_dims += 1 - - dset_size = get_dset_size(shape_json, typesize) - if dset_size <= chunk_min and extendable_dims == 0: - # just use the entire dataspace shape as one big chunk - return tuple(dims) - - chunk_size = getChunkSize(layout, typesize) - if chunk_size >= chunk_min: - return tuple(layout) # good already - while chunk_size < chunk_min: - # just adjust along extendable dimensions first - old_chunk_size = chunk_size - for n in range(rank): - dim = rank - n - 1 # start from last dim - - if extendable_dims > 0: - if maxdims[dim] == 0: - # infinitely extendable dimensions - layout[dim] *= 2 - chunk_size = getChunkSize(layout, typesize) - if chunk_size > chunk_min: - break - elif maxdims[dim] > layout[dim]: - # can only be extended so much - layout[dim] *= 2 - if layout[dim] >= dims[dim]: - layout[dim] = maxdims[dim] # trim back - extendable_dims -= 1 # one less extenable dimension - - chunk_size = getChunkSize(layout, typesize) - if chunk_size > chunk_min: - break - else: - pass # ignore non-extensible for now - else: - # no extendable dimensions - if dims[dim] > layout[dim]: - # can expand chunk along this dimension - layout[dim] *= 2 - if layout[dim] > dims[dim]: - layout[dim] = dims[dim] # trim back - chunk_size = getChunkSize(layout, typesize) - if chunk_size > chunk_min: - break - else: - pass # can't extend chunk along this dimension - if chunk_size <= old_chunk_size: - # stop iteration if we haven't increased the chunk size - log.debug("stopping expandChunk iteration") - break - elif chunk_size > chunk_min: - break # we're good - else: - pass # do another round - return tuple(layout) - - -def shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX, layout_class="H5D_CHUNKED"): - """Compute a reduced chunk shape with a size in bytes less than chunk_max.""" - layout = list(layout) - chunk_size = getChunkSize(layout, typesize) - if chunk_size <= chunk_max: - return tuple(layout) # good already - log.debug(f"shrinkChunk layout: {layout} typesize: {typesize}") - rank = len(layout) - - while chunk_size > chunk_max: - # just adjust along extendable dimensions first - old_chunk_size = chunk_size - for dim in range(rank): - if layout[dim] > 1: - # tricky way to do x // 2 with ceil - layout[dim] = -(-layout[dim] // 2) - chunk_size = 
getChunkSize(layout, typesize) - if chunk_size <= chunk_max: - break - else: - pass # can't shrink chunk along this dimension - if chunk_size >= old_chunk_size: - # reality check to see if we'll ever break out of the while loop - log.warning("Unexpected error in shrink_chunk") - break - elif chunk_size <= chunk_max: - break # we're good - else: - pass # do another round - return tuple(layout) - - -def guessChunk(shape_json, typesize): - """Guess an appropriate chunk layout for a dataset, given its shape and - the size of each element in bytes. Will allocate chunks only as large - as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of - each axis, slightly favoring bigger values for the last index. - - Undocumented and subject to change without warning. - """ - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return (1,) # just enough to store one item - - if "maxdims" in shape_json: - shape = shape_json["maxdims"] - else: - shape = shape_json["dims"] - - if typesize == "H5T_VARIABLE": - typesize = 128 # just take a guess at the item size - - # For unlimited dimensions we have to guess. use 1024 - shape = tuple((x if x != 0 else 1024) for i, x in enumerate(shape)) - - return shape - - -def getContiguousLayout( - shape_json, item_size, chunk_min=1000 * 1000, chunk_max=4 * 1000 * 1000 -): - """ - create a chunklayout for datasets use continguous storage. - """ - if not isinstance(item_size, int): - msg = "ContiguousLayout can only be used with fixed-length types" - raise ValueError(msg) - if chunk_max < chunk_min: - raise ValueError("chunk_max cannot be less than chunk_min") - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return (1,) # just enough to store one item - dims = shape_json["dims"] - rank = len(dims) - if rank == 0: - raise ValueError("rank must be positive for Contiguous Layout") - for dim in dims: - if dim < 0: - raise ValueError("extents must be positive for Contiguous Layout") - if dim == 0: - # datashape with no elements, just return dims as layout - return dims - - nsize = item_size - layout = [ - 1, - ] * rank - - for i in range(rank): - dim = rank - i - 1 - extent = dims[dim] - if extent * nsize < chunk_max: - # just use the full extent as layout - layout[dim] = extent - nsize *= extent - else: - n = extent - while n > 1: - n = -(-n // 2) # use negatives so we round up on odds - if n * nsize < chunk_max: - break - layout[dim] = n - break # just use 1's for the rest of the layout - - return layout - - def frac(x, d): """ Utility func -- Works like fractional div, but returns ceiling diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index 044127f0..b259aae9 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -13,7 +13,17 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPInternalServerError import math +from h5json.hdf5dtype import getItemSize, isVlen +from h5json.objid import isValidUuid + from .. import hsds_logger as log +from .. import config + +#from .chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk + +CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) +CHUNK_MAX = 2048 * 1024 # Hard upper limit (2M) +DEFAULT_TYPE_SIZE = 128 # Type size case when it is variable """ Filters that are known to HSDS. @@ -74,24 +84,27 @@ "H5D_CONTIGUOUS_REF", ) - -# copied from arrayUtil.py -def isVlen(dt): +def get_dset_size(shape_json, typesize): + """Return the size of the dataspace. 
For
+    any unlimited dimensions, assume a value of 1.
+    (so the return size will be the absolute minimum)
+    """
+    if shape_json is None or shape_json["class"] == "H5S_NULL":
+        return None
+    if shape_json["class"] == "H5S_SCALAR":
+        return typesize  # just return size for one item
+    if typesize == "H5T_VARIABLE":
+        typesize = DEFAULT_TYPE_SIZE  # just take a guess at the item size
+    dset_size = typesize
+    shape = shape_json["dims"]
+    rank = len(shape)
+    for n in range(rank):
+        if shape[n] == 0:
+            # extendable extent with value of 0
+            continue  # assume this is one
+        dset_size *= shape[n]
+    return dset_size
 
 def getFilterItem(key):
     """
@@ -107,6 +120,81 @@ def getFilterItem(key):
     return None  # not found
 
+def getFiltersJson(create_props, supported_filters=None):
+    """ return standardized filter representation from creation properties
+        raise bad request if invalid """
+
+    # refer to https://hdf5-json.readthedocs.io/en/latest/bnf/\
+    # filters.html#grammar-token-filter_list
+
+    if "filters" not in create_props:
+        return []  # no filters specified
+
+    f_in = create_props["filters"]
+
+    log.debug(f"filters provided in creation_prop: {f_in}")
+
+    if not isinstance(f_in, list):
+        msg = "Expected filters in creation_props to be a list"
+        log.warn(msg)
+        raise HTTPBadRequest(reason=msg)
+
+    f_out = []
+    for filter in f_in:
+        if isinstance(filter, int) or isinstance(filter, str):
+            item = getFilterItem(filter)
+            if not item:
+                msg = f"filter {filter} not recognized"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+
+            if item["name"] not in supported_filters:
+                msg = f"filter {filter} is not supported"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+            f_out.append(item)
+        elif isinstance(filter, dict):
+            if "class" not in filter:
+                msg = "expected 'class' key for filter property"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+            if filter["class"] != "H5Z_FILTER_USER":
+                item = getFilterItem(filter["class"])
+            elif "id" in filter:
+                item = getFilterItem(filter["id"])
+            elif "name" in filter:
+                item = getFilterItem(filter["name"])
+            else:
+                item = None
+            if not item:
+                msg = f"filter {filter['class']} not recognized"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+            if "id" not in filter:
+                filter["id"] = item["id"]
+            elif item["id"] != filter["id"]:
+                msg = f"Expected {filter['class']} to have id: "
+                msg += f"{item['id']} but got {filter['id']}"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+            if "name" not in filter:
+                filter["name"] = item["name"]
+            if filter["name"] not in supported_filters:
+                msg = f"filter {filter} is not supported"
+                log.warn(msg)
+                raise HTTPBadRequest(reason=msg)
+
+            f_out.append(filter)
+        else:
+            msg = f"Unexpected type for filter: {filter}"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+
+    # return standardized filter representation
+    log.debug(f"using filters: {f_out}")
+    return f_out
+
+
 def getFilters(dset_json):
     """Return list of filters, or empty list"""
     if "creationProperties" not in dset_json:
@@ -210,27 +298,141 @@ def getFilterOps(app, dset_id, filters, dtype=None, chunk_shape=None):
         return filter_ops
     else:
         return None
+
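+# A sketch of the normalization getFiltersJson performs (values are
+# illustrative; the exact "id" and "name" fields come from the server's
+# filter table via getFilterItem):
+#
+#     creation_props = {"filters": ["gzip"]}
+#     getFiltersJson(creation_props, supported_filters=["gzip"])
+#         -> [{"class": "H5Z_FILTER_DEFLATE", "id": 1, "name": "gzip"}]
+#
+# A dict entry such as {"class": "H5Z_FILTER_DEFLATE"} is passed through
+# with its "id" and "name" keys filled in from the same table.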
+def getShapeJson(body):
+    """ Return normalized json description of data space """
+
+    dims = None
+    maxdims = None
+    shape_class = None
+
+    if "shape" not in body:
+        shape_class = "H5S_SCALAR"
+        log.debug("no shape given - using H5S_SCALAR")
+        return {"class": shape_class}
+
+    body_shape = body["shape"]
+    log.debug(f"got shape: {body_shape}")
+
+    if isinstance(body_shape, int):
+        shape_class = "H5S_SIMPLE"
+        dims = [body_shape, ]
+    elif isinstance(body_shape, str):
+        # only valid string value is H5S_NULL or H5S_SCALAR
+        shape_class = body_shape
+    elif isinstance(body_shape, (tuple, list)):
+        if len(body_shape) == 0:
+            shape_class = "H5S_SCALAR"
+        else:
+            shape_class = "H5S_SIMPLE"
+            dims = body_shape
+    else:
+        msg = f"invalid shape: {body_shape}"
+        log.warn(msg)
+        raise ValueError(msg)
+
+    if shape_class not in ("H5S_NULL", "H5S_SCALAR", "H5S_SIMPLE"):
+        msg = f"invalid shape class: {shape_class}"
+        log.warn(msg)
+        raise ValueError(msg)
+    if shape_class in ("H5S_NULL", "H5S_SCALAR") and dims:
+        msg = f"dims not valid for shape class: {shape_class}"
+        log.warn(msg)
+        raise ValueError(msg)
 
-def getDsetRank(dset_json):
-    """Get rank returning 0 for sclar or NULL datashapes"""
-    datashape = dset_json["shape"]
-    if datashape["class"] == "H5S_NULL":
-        return 0
-    if datashape["class"] == "H5S_SCALAR":
+    if dims is None and shape_class == "H5S_SIMPLE":
+        msg = "dims not specified for H5S_SIMPLE shape"
+        log.warn(msg)
+        raise ValueError(msg)
+
+    if dims is not None:
+        rank = len(dims)
+        for i in range(rank):
+            extent = dims[i]
+            if not isinstance(extent, int):
+                msg = f"Invalid shape dims: {dims}"
+                log.warn(msg)
+                raise ValueError(msg)
+            if extent < 0:
+                msg = f"shape dimension is negative for dims: {dims}"
+                log.warn(msg)
+                raise ValueError(msg)
+
+    if "maxdims" in body:
+        maxdims = body["maxdims"]
+    elif isinstance(body_shape, dict) and "maxdims" in body_shape:
+        maxdims = body_shape["maxdims"]
+    else:
+        maxdims = None
+
+    # validate maxdims
+    if maxdims:
+        if dims is None:
+            msg = f"maxdims cannot be supplied for space class: {shape_class}"
+            log.warn(msg)
+            raise ValueError(msg)
+
+        if isinstance(maxdims, int):
+            dim1 = maxdims
+            maxdims = [dim1]
+        elif isinstance(maxdims, list):
+            pass  # can use as is
+        else:
+            msg = f"Bad Request: maxdims is invalid: {maxdims}"
+            log.warn(msg)
+            raise ValueError(msg)
+        if len(dims) != len(maxdims):
+            msg = "maxdims rank doesn't match dims"
+            log.warn(msg)
+            raise ValueError(msg)
+
+    # return json description of shape
+    shape_json = {"class": shape_class}
+    if shape_class == "H5S_SIMPLE":
+        shape_json["dims"] = dims
+        if maxdims:
+            shape_json["maxdims"] = maxdims
+    log.debug(f"returning shape_json: {shape_json}")
+    return shape_json
+
+def getShapeClass(data_shape):
+    """ Return shape class of the given data shape """
+
+    if not isinstance(data_shape, dict):
+        raise TypeError("expected dict object")
+
+    if "class" not in data_shape:
+        raise KeyError("expected 'class' key for data shape")
+
+    return data_shape["class"]
+
+def getRank(data_shape):
+    """ Return rank of given data shape_json """
+
+    shape_class = getShapeClass(data_shape)
+
+    if shape_class == "H5S_NULL":
         return 0
-    if "dims" not in datashape:
-        log.warn(f"expected to find dims key in shape_json: {datashape}")
+    elif shape_class == "H5S_SCALAR":
         return 0
-    dims = datashape["dims"]
-    rank = len(dims)
-    return rank
+    elif shape_class == "H5S_SIMPLE":
+        if "dims" not in data_shape:
+            raise KeyError("expected dims key for H5S_SIMPLE data shape")
+        return len(data_shape["dims"])
+    else:
+        raise ValueError(f"unexpected data shape class: {shape_class}")
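+
+# A few illustrative getShapeJson cases (the request-body fragments are
+# hypothetical; the results follow the code above):
+#
+#     getShapeJson({})                    -> {"class": "H5S_SCALAR"}
+#     getShapeJson({"shape": 10})         -> {"class": "H5S_SIMPLE", "dims": [10]}
+#     getShapeJson({"shape": "H5S_NULL"}) -> {"class": "H5S_NULL"}
+#     getShapeJson({"shape": [4, 5], "maxdims": [4, 0]})
+#         -> {"class": "H5S_SIMPLE", "dims": [4, 5], "maxdims": [4, 0]}
+
+def getDsetRank(dset_json):
+    """Get rank returning 0 for scalar or NULL data shapes"""
+    data_shape = 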
dset_json["shape"] + return getRank(data_shape) def isNullSpace(dset_json): - """Return true if this dataset is a null dataspace""" - datashape = dset_json["shape"] - if datashape["class"] == "H5S_NULL": + """Return true if this dataset is a null data space""" + shape_class = getShapeClass(dset_json["shape"]) + if shape_class == "H5S_NULL": return True else: return False @@ -238,33 +440,567 @@ def isNullSpace(dset_json): def isScalarSpace(dset_json): """ return true if this is a scalar dataset """ - datashape = dset_json["shape"] - is_scalar = False - if datashape["class"] == "H5S_NULL": - is_scalar = False - elif datashape["class"] == "H5S_SCALAR": - is_scalar = True + + data_shape = dset_json["shape"] + shape_class = getShapeClass(data_shape) + if shape_class == "H5S_NULL": + return False + + rank = getRank(data_shape) + return True if rank == 0 else False + + +def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): + """ + create a chunk layout for datasets use contiguous storage. + """ + if not isinstance(item_size, int): + msg = "ContiguousLayout can only be used with fixed-length types" + log.warn(msg) + raise ValueError(msg) + + if chunk_min is None: + msg = "chunk_min not set" + log.warn(msg) + raise ValueError(msg) + if chunk_max is None: + msg = "chunk_max not set" + log.warn(msg) + raise ValueError(msg) + + if chunk_max < chunk_min: + raise ValueError("chunk_max cannot be less than chunk_min") + + if shape_json is None or shape_json["class"] == "H5S_NULL": + return None + if shape_json["class"] == "H5S_SCALAR": + return (1,) # just enough to store one item + dims = shape_json["dims"] + rank = len(dims) + if rank == 0: + raise ValueError("rank must be positive for Contiguous Layout") + for dim in dims: + if dim < 0: + raise ValueError("extents must be positive for Contiguous Layout") + if dim == 0: + # data shape with no elements, just return dims as layout + return dims + + nsize = item_size + layout = [1,] * rank + + for i in range(rank): + dim = rank - i - 1 + extent = dims[dim] + if extent * nsize < chunk_max: + # just use the full extent as layout + layout[dim] = extent + nsize *= extent + else: + n = extent + while n > 1: + n = -(-n // 2) # use negatives so we round up on odds + if n * nsize < chunk_max: + break + layout[dim] = n + break # just use 1's for the rest of the layout + + return layout + +def getChunkSize(layout, type_size): + """Return chunk size given layout. + i.e. just the product of the values in the list. + """ + if type_size == "H5T_VARIABLE": + type_size = DEFAULT_TYPE_SIZE + + chunk_size = type_size + for n in layout: + if n <= 0: + raise ValueError("Invalid chunk layout") + chunk_size *= n + return chunk_size + +def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): + """ + Use chunk layout given in the creationPropertiesList (if defined and + layout is valid). 
+ Return chunk_layout_json + """ + + rank = 0 + space_dims = None + chunk_dims = None + max_dims = None + + if "dims" in shape_json: + space_dims = shape_json["dims"] + rank = len(space_dims) + + if "maxdims" in shape_json: + max_dims = shape_json["maxdims"] + if "dims" in layout: + chunk_dims = layout["dims"] + + if chunk_dims: + # validate that the chunk_dims are valid and correlates with the + # dataset shape + if isinstance(chunk_dims, int): + chunk_dims = [ + chunk_dims, + ] # promote to array + if len(chunk_dims) != rank: + msg = "Layout rank does not match shape rank" + log.warn(msg) + raise ValueError(msg) + for i in range(rank): + dim_extent = space_dims[i] + chunk_extent = chunk_dims[i] + if not isinstance(chunk_extent, int): + msg = "Layout dims must be integer or integer array" + log.warn(msg) + raise ValueError(msg) + if chunk_extent <= 0: + msg = "Invalid layout value" + log.warn(msg) + raise ValueError(msg) + if max_dims is None: + if chunk_extent > dim_extent: + msg = "Invalid layout value" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif max_dims[i] != 0: + if chunk_extent > max_dims[i]: + msg = "Invalid layout value for extensible dimension" + log.warn(msg) + raise ValueError(msg) + else: + pass # allow any positive value for unlimited dimensions + + if "class" not in layout: + msg = "class key not found in layout for creation property list" + log.warn(msg) + raise ValueError(msg) + + layout_class = layout["class"] + + if layout_class == "H5D_CONTIGUOUS_REF": + # reference to a dataset in a traditional HDF5 files with + # contiguous storage + if item_size == "H5T_VARIABLE": + # can't be used with variable types... + msg = "Datasets with variable types cannot be used with " + msg += "reference layouts" + log.warn(msg) + raise ValueError(msg) + if "file_uri" not in layout: + # needed for H5D_CONTIGUOUS_REF + msg = "'file_uri' key must be provided for " + msg += "H5D_CONTIGUOUS_REF layout" + log.warn(msg) + raise ValueError(msg) + if "offset" not in layout: + # needed for H5D_CONTIGUOUS_REF + msg = "'offset' key must be provided for " + msg += "H5D_CONTIGUOUS_REF layout" + log.warn(msg) + raise ValueError(msg) + if "size" not in layout: + # needed for H5D_CONTIGUOUS_REF + msg = "'size' key must be provided for " + msg += "H5D_CONTIGUOUS_REF layout" + log.warn(msg) + raise ValueError(msg) + if "dims" in layout: + # used defined chunk layout not allowed for H5D_CONTIGUOUS_REF + msg = "'dims' key can not be provided for " + msg += "H5D_CONTIGUOUS_REF layout" + log.warn(msg) + raise ValueError(msg) + elif layout_class == "H5D_CHUNKED_REF": + # reference to a dataset in a traditional HDF5 files with + # chunked storage + if item_size == "H5T_VARIABLE": + # can't be used with variable types.. 
+            msg = "Datasets with variable types cannot be used with "
+            msg += "reference layouts"
+            log.warn(msg)
+            raise ValueError(msg)
+        if "file_uri" not in layout:
+            # needed for H5D_CHUNKED_REF
+            msg = "'file_uri' key must be provided for "
+            msg += "H5D_CHUNKED_REF layout"
+            log.warn(msg)
+            raise ValueError(msg)
+        if "dims" not in layout:
+            # needed for H5D_CHUNKED_REF
+            msg = "'dims' key must be provided for "
+            msg += "H5D_CHUNKED_REF layout"
+            log.warn(msg)
+            raise ValueError(msg)
+        if "chunks" not in layout:
+            msg = "'chunks' key must be provided for "
+            msg += "H5D_CHUNKED_REF layout"
+            log.warn(msg)
+            raise ValueError(msg)
+    elif layout_class == "H5D_CHUNKED_REF_INDIRECT":
+        # reference to a dataset in a traditional HDF5 files with chunked
+        # storage using an auxiliary dataset
+        if item_size == "H5T_VARIABLE":
+            # can't be used with variable types...
+            msg = "Datasets with variable types cannot be used with "
+            msg += "reference layouts"
+            log.warn(msg)
+            raise ValueError(msg)
+        if "dims" not in layout:
+            # needed for H5D_CHUNKED_REF_INDIRECT
+            msg = "'dims' key must be provided for "
+            msg += "H5D_CHUNKED_REF_INDIRECT layout"
+            log.warn(msg)
+            raise ValueError(msg)
+        if "chunk_table" not in layout:
+            msg = "'chunk_table' key must be provided for "
+            msg += "H5D_CHUNKED_REF_INDIRECT layout"
+            log.warn(msg)
+            raise ValueError(msg)
+        chunk_table_id = layout["chunk_table"]
+        if not isValidUuid(chunk_table_id, "Dataset"):
+            msg = f"Invalid chunk table id: {chunk_table_id}"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+
+    elif layout_class == "H5D_CHUNKED":
+        if "dims" not in layout:
+            msg = "dims key not found in layout for creation property list"
+            log.warn(msg)
+            raise ValueError(msg)
+        if shape_json["class"] != "H5S_SIMPLE":
+            msg = "Bad Request: chunked layout not valid with shape class: "
+            msg += f"{shape_json['class']}"
+            log.warn(msg)
+            raise ValueError(msg)
+    elif layout_class == "H5D_CONTIGUOUS":
+        if "dims" in layout:
+            msg = "dims key found in layout for creation property list "
+            msg += "for H5D_CONTIGUOUS storage class"
+            log.warn(msg)
+            raise ValueError(msg)
+    elif layout_class == "H5D_COMPACT":
+        if "dims" in layout:
+            msg = "dims key found in layout for creation property list "
+            msg += "for H5D_COMPACT storage class"
+            log.warn(msg)
+            raise ValueError(msg)
     else:
-        if "dims" not in datashape:
-            log.warn(f"expected to find dims key in shape_json: {datashape}")
-        is_scalar = False
+        msg = f"Unexpected layout: {layout_class}"
+        log.warn(msg)
+        raise ValueError(msg)
+
+def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class="H5D_CHUNKED"):
+    """Compute an increased chunk shape with a size in bytes greater than chunk_min."""
+    if shape_json is None or shape_json["class"] == "H5S_NULL":
+        return None
+    if shape_json["class"] == "H5S_SCALAR":
+        return (1,)  # just enough to store one item
+
+    layout = list(layout)
+    log.debug(f"expandChunk layout: {layout} typesize: {typesize}")
+    dims = shape_json["dims"]
+    rank = len(dims)
+    extendable_dims = 0  # number of dimensions that are extendable
+    maxdims = None
+    if "maxdims" in shape_json:
+        maxdims = shape_json["maxdims"]
+        for n in range(rank):
+            if maxdims[n] == 0 or maxdims[n] > dims[n]:
+                extendable_dims += 1
+
+    dset_size = get_dset_size(shape_json, typesize)
+    if dset_size <= chunk_min and extendable_dims == 0:
+        # just use the entire dataspace shape as one big chunk
+        return tuple(dims)
+
+    chunk_size = getChunkSize(layout, typesize)
+    if chunk_size >= chunk_min:
+        return tuple(layout)  # good already
+    while chunk_size < chunk_min:
+        # just adjust along extendable dimensions first
+        old_chunk_size = chunk_size
+        for n in range(rank):
+            dim = rank - n - 1  # start from last dim
+
+            if extendable_dims > 0:
+                if maxdims[dim] == 0:
+                    # infinitely extendable dimensions
+                    layout[dim] *= 2
+                    chunk_size = getChunkSize(layout, typesize)
+                    if chunk_size > chunk_min:
+                        break
+                elif maxdims[dim] > layout[dim]:
+                    # can only be extended so much
+                    layout[dim] *= 2
+                    if layout[dim] >= dims[dim]:
+                        layout[dim] = maxdims[dim]  # trim back
+                        extendable_dims -= 1  # one less extendable dimension
+
+                    chunk_size = getChunkSize(layout, typesize)
+                    if chunk_size > chunk_min:
+                        break
+                else:
+                    pass  # ignore non-extensible for now
+            else:
+                # no extendable dimensions
+                if dims[dim] > layout[dim]:
+                    # can expand chunk along this dimension
+                    layout[dim] *= 2
+                    if layout[dim] > dims[dim]:
+                        layout[dim] = dims[dim]  # trim back
+                    chunk_size = getChunkSize(layout, typesize)
+                    if chunk_size > chunk_min:
+                        break
+                else:
+                    pass  # can't extend chunk along this dimension
+        if chunk_size <= old_chunk_size:
+            # stop iteration if we haven't increased the chunk size
+            log.debug("stopping expandChunk iteration")
+            break
+        elif chunk_size > chunk_min:
+            break  # we're good
+        else:
+            pass  # do another round
+    return tuple(layout)
+
+
+def shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX, layout_class="H5D_CHUNKED"):
+    """Compute a reduced chunk shape with a size in bytes less than chunk_max."""
+    layout = list(layout)
+    chunk_size = getChunkSize(layout, typesize)
+    if chunk_size <= chunk_max:
+        return tuple(layout)  # good already
+    log.debug(f"shrinkChunk layout: {layout} typesize: {typesize}")
+    rank = len(layout)
+
+    while chunk_size > chunk_max:
+        # just adjust along extendable dimensions first
+        old_chunk_size = chunk_size
+        for dim in range(rank):
+            if layout[dim] > 1:
+                # tricky way to do x // 2 with ceil
+                layout[dim] = -(-layout[dim] // 2)
+                chunk_size = getChunkSize(layout, typesize)
+                if chunk_size <= chunk_max:
+                    break
+            else:
+                pass  # can't shrink chunk along this dimension
+        if chunk_size >= old_chunk_size:
+            # reality check to see if we'll ever break out of the while loop
+            log.warning("Unexpected error in shrinkChunk")
+            break
+        elif chunk_size <= chunk_max:
+            break  # we're good
+        else:
+            pass  # do another round
+    return tuple(layout)
+
+
+def guessChunk(shape_json, typesize):
+    """Guess an appropriate chunk layout for a dataset, given its shape and
+    the size of each element in bytes.  Will allocate chunks only as large
+    as MAX_SIZE.  Chunks are generally close to some power-of-2 fraction of
+    each axis, slightly favoring bigger values for the last index.
+
+    Undocumented and subject to change without warning.
+    """
+    if shape_json is None or shape_json["class"] == "H5S_NULL":
+        return None
+    if shape_json["class"] == "H5S_SCALAR":
+        return (1,)  # just enough to store one item
+
+    if "maxdims" in shape_json:
+        shape = shape_json["maxdims"]
+    else:
+        shape = shape_json["dims"]
+
+    if typesize == "H5T_VARIABLE":
+        typesize = 128  # just take a guess at the item size
+
+    # For unlimited dimensions we have to guess. 
use 1024 + shape = tuple((x if x != 0 else 1024) for i, x in enumerate(shape)) + + return shape + + +def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, chunk_max=None): + """ Get the layout json given by creation_props. + Raise bad request error if invalid """ + + min_chunk_size = int(config.get("min_chunk_size")) + max_chunk_size = int(config.get("max_chunk_size")) + + item_size = getItemSize(type_json) + if chunk_min is None: + chunk_min = 1000 * 1000 + if chunk_max is None: + chunk_max = 4 * 1000 * 1000 + + if chunk_min > chunk_max: + msg = "chunk_max must be larger than chunk_min" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + layout = None + if "layout" in creation_props: + layout_props = creation_props["layout"] + else: + layout_props = None + + if layout_props: + if "class" not in layout_props: + msg = "expected class key in layout props" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + layout_class = layout_props["class"] + if layout_class == "H5D_CONTIGUOUS": + # treat contiguous as chunked + layout_class = "H5D_CHUNKED" + else: + layout_class = layout_props["class"] + elif shape["class"] != "H5S_NULL": + layout_class = "H5D_CHUNKED" + else: + layout_class = None + + if layout_class == "H5D_COMPACT": + layout = {"class": "H5D_COMPACT"} + elif layout_class: + # initialize to H5D_CHUNKED + layout = {"class": "H5D_CHUNKED"} + else: + # null space - no layout + layout = None + + if layout_props and "dims" in layout_props: + chunk_dims = layout_props["dims"] + else: + chunk_dims = None + + if layout_class == "H5D_CONTIGUOUS_REF": + kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} + chunk_dims = getContiguousLayout(shape, item_size, **kwargs) + layout["dims"] = chunk_dims + log.debug(f"autoContiguous layout: {layout}") + + if layout_class == "H5D_CHUNKED" and chunk_dims is None: + # do auto-chunking + chunk_dims = guessChunk(shape, item_size) + log.debug(f"initial autochunk layout: {chunk_dims}") + + if layout_class == "H5D_CHUNKED": + chunk_size = getChunkSize(chunk_dims, item_size) + + msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " + msg += f"max: {max_chunk_size}" + log.debug(msg) + # adjust the chunk shape if chunk size is too small or too big + adjusted_chunk_dims = None + if chunk_size < min_chunk_size: + msg = f"chunk size: {chunk_size} less than min size: " + msg += f"{min_chunk_size}, expanding" + log.debug(msg) + kwargs = {"chunk_min": min_chunk_size, "layout_class": layout_class} + adjusted_chunk_dims = expandChunk(chunk_dims, item_size, shape, **kwargs) + elif chunk_size > max_chunk_size: + msg = f"chunk size: {chunk_size} greater than max size: " + msg += f"{max_chunk_size}, shrinking" + log.debug(msg) + kwargs = {"chunk_max": max_chunk_size} + adjusted_chunk_dims = shrinkChunk(chunk_dims, item_size, **kwargs) + if adjusted_chunk_dims: + msg = f"requested chunk_dimensions: {chunk_dims} modified " + msg += f"dimensions: {adjusted_chunk_dims}" + log.debug(msg) + layout["dims"] = adjusted_chunk_dims + else: + layout["dims"] = chunk_dims # don't need to adjust chunk size + + # set partition_count if needed: + max_chunks_per_folder = int(config.get("max_chunks_per_folder")) + set_partition = False + if max_chunks_per_folder > 0: + if "dims" in shape and "dims" in layout: + set_partition = True + + if set_partition: + chunk_dims = layout["dims"] + shape_dims = shape["dims"] + if "maxdims" in shape: + max_dims = shape["maxdims"] + else: + max_dims = None + num_chunks = 1 + rank = len(shape_dims) + 
unlimited_count = 0 + if max_dims: + for i in range(rank): + if max_dims[i] == 0: + unlimited_count += 1 + msg = f"number of unlimited dimensions: {unlimited_count}" + log.debug(msg) + + for i in range(rank): + max_dim = 1 + if max_dims: + max_dim = max_dims[i] + if max_dim == 0: + # don't really know what the ultimate extent + # could be, but assume 10^6 for total number of + # elements and square-shaped array... + MAX_ELEMENT_GUESS = 10.0 ** 6 + exp = 1 / unlimited_count + max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) + else: + max_dim = shape_dims[i] + num_chunks *= math.ceil(max_dim / chunk_dims[i]) + + if num_chunks > max_chunks_per_folder: + partition_count = math.ceil(num_chunks / max_chunks_per_folder) + msg = f"set partition count to: {partition_count}, " + msg += f"num_chunks: {num_chunks}" + log.info(msg) + layout["partition_count"] = partition_count + else: + msg = "do not need chunk partitions, num_chunks: " + msg += f"{num_chunks} max_chunks_per_folder: " + msg += f"{max_chunks_per_folder}" + log.info(msg) + + if layout_class in ("H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT"): + chunk_size = getChunkSize(chunk_dims, item_size) + + msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " + msg += f"max: {max_chunk_size}" + log.debug(msg) + # nothing to do about inefficiently small chunks, but large chunks + # can be subdivided + if chunk_size < min_chunk_size: + msg = f"chunk size: {chunk_size} less than min size: " + msg += f"{min_chunk_size} for {layout_class} dataset" + log.warn(msg) + elif chunk_size > max_chunk_size: + msg = f"chunk size: {chunk_size} greater than max size: " + msg += f"{max_chunk_size}, for {layout_class} dataset" + log.warn(msg) + layout["dims"] = chunk_dims + + +def getHyperslabSelection(dims, start=None, stop=None, step=None): """ Get slices given lists of start, stop, step values TBD: for step>1, adjust the slice to not extend beyond last data point returned """ - rank = len(dsetshape) + rank = len(dims) if start: if not isinstance(start, (list, tuple)): start = [start] @@ -273,7 +1009,7 @@ def getHyperslabSelection(dsetshape, start=None, stop=None, step=None): log.warn(msg) raise HTTPBadRequest(reason=msg) for dim in range(rank): - if start[dim] < 0 or start[dim] >= dsetshape[dim]: + if start[dim] < 0 or start[dim] >= dims[dim]: msg = "Bad Request: start index invalid for dim: " + str(dim) log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -290,14 +1026,14 @@ def getHyperslabSelection(dsetshape, start=None, stop=None, step=None): log.warn(msg) raise HTTPBadRequest(reason=msg) for dim in range(rank): - if stop[dim] <= start[dim] or stop[dim] > dsetshape[dim]: + if stop[dim] <= start[dim] or stop[dim] > dims[dim]: msg = "Bad Request: stop index invalid for dim: " + str(dim) log.warn(msg) raise HTTPBadRequest(reason=msg) else: stop = [] for dim in range(rank): - stop.append(dsetshape[dim]) + stop.append(dims[dim]) if step: if not isinstance(step, (list, tuple)): @@ -307,7 +1043,7 @@ def getHyperslabSelection(dsetshape, start=None, stop=None, step=None): log.warn(msg) raise HTTPBadRequest(reason=msg) for dim in range(rank): - if step[dim] <= 0 or step[dim] > dsetshape[dim]: + if step[dim] <= 0 or step[dim] > dims[dim]: msg = "Bad Request: step index invalid for dim: " + str(dim) log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -427,11 +1163,11 @@ def isSelectAll(slices, dims): def getQueryParameter(request, query_name, body=None, default=None): """ - Herlper function, get query parameter value from request. 
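A quick worked example of the partition-count math in getLayoutJson above; the
values below are illustrative only, and max_chunks_per_folder is normally read
from config:

    import math

    # shape_dims = [0], max_dims = [0], chunk_dims = (1024,): one unlimited
    # dim, so the total extent is guessed at 10**6 elements
    max_dim = int(math.pow(10.0 ** 6, 1 / 1))       # -> 1000000
    num_chunks = math.ceil(max_dim / 1024)          # -> 977
    partition_count = math.ceil(num_chunks / 200)   # max_chunks_per_folder=200 -> 5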
+ Helper function, get query parameter value from request. If body is provided (as a JSON object) look in JSON and if not found look for query param. Return default value (or None) if not found """ - # as a convience, look up different capitilizations of query name + # as a convenience, look up different capitalizations of query name params = request.rel_url.query query_names = [] query_names.append(query_name.lower()) diff --git a/tests/integ/datatype_test.py b/tests/integ/datatype_test.py index a3f20fd3..ce7418cb 100755 --- a/tests/integ/datatype_test.py +++ b/tests/integ/datatype_test.py @@ -144,7 +144,7 @@ def testPostTypeWithId(self): req = self.endpoint + "/datatypes" data = {"id": ctype_id} rsp = self.session.post(req, data=json.dumps(data), headers=headers) - self.assertEqual(rsp.status_code, 401) # bad request + self.assertEqual(rsp.status_code, 400) # bad request # create a committed type obj data = {"id": ctype_id, "type": "H5T_IEEE_F32LE"} diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index c9e88afb..9ad5c5e6 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -928,7 +928,7 @@ def testPutScalarDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) - # read unintialized value from dataset + # read uninitialized value from dataset req = self.endpoint + "/datasets/" + dset_id + "/value" rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) diff --git a/tests/unit/chunk_util_test.py b/tests/unit/chunk_util_test.py index 37d1e512..7ae16bd5 100755 --- a/tests/unit/chunk_util_test.py +++ b/tests/unit/chunk_util_test.py @@ -23,7 +23,6 @@ chunkReadPoints, chunkWritePoints, chunkQuery, - guessChunk, getNumChunks, getChunkIds, getChunkId, @@ -33,11 +32,7 @@ getChunkSelection, getChunkCoverage, getDataCoverage, - getChunkSize, - shrinkChunk, - expandChunk, getDatasetId, - getContiguousLayout, _getEvalStr, _getWhereFieldName, _getWhereElements, @@ -50,288 +45,8 @@ def __init__(self, *args, **kwargs): # main logging.getLogger().setLevel(logging.ERROR) - def testGuessChunk(self): - - typesize = "H5T_VARIABLE" - logging.debug("hello") - - shape = {"class": "H5S_NULL"} - layout = guessChunk(shape, typesize) - self.assertTrue(layout is None) - - shape = {"class": "H5S_SCALAR"} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, (1,)) - - shape = {"class": "H5S_SIMPLE", "dims": [100, 100]} - layout = guessChunk(shape, typesize) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 100) - - typesize = 8 - layout = guessChunk(shape, typesize) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 100) - - shape = {"class": "H5S_SIMPLE", "dims": [5]} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, (5,)) - - shape = {"class": "H5S_SIMPLE", "dims": [100, 100, 100]} - layout = guessChunk(shape, typesize) - print("layout:", layout) - self.assertTrue(len(layout), 3) - for i in range(3): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 100) - - shape = {"class": "H5S_SIMPLE", "dims": [100, 0], "maxdims": [100, 0]} - layout = guessChunk(shape, typesize) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 1024) - - shape = {"class": "H5S_SCALAR"} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, (1,)) - - 
shape = {"class": "H5S_NULL"} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, None) - - def testShrinkChunk(self): - CHUNK_MIN = 500 - CHUNK_MAX = 5000 - typesize = 1 - layout = (1, 2, 3) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - self.assertEqual(shrunk, layout) - - layout = (100, 200, 300) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes > CHUNK_MAX) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - rank = len(layout) - for i in range(rank): - self.assertTrue(shrunk[i] >= 1) - self.assertTrue(shrunk[i] <= 1000 * (i + 1)) - num_bytes = getChunkSize(shrunk, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - layout = (300, 200, 100) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes > CHUNK_MAX) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - rank = len(layout) - for i in range(rank): - self.assertTrue(shrunk[i] >= 1) - self.assertTrue(shrunk[i] <= 1000 * (3 - i)) - num_bytes = getChunkSize(shrunk, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - CHUNK_MIN = 1 * 1024 * 1024 - CHUNK_MAX = 4 * 1024 * 1024 - typesize = 4 - layout = (117, 201, 189, 1) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes > CHUNK_MAX) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - self.assertEqual(shrunk, (59, 101, 95, 1)) - num_bytes = getChunkSize(shrunk, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - def testExpandChunk(self): - CHUNK_MIN = 5000 - CHUNK_MAX = 50000 - - typesize = 20 - shape = {"class": "H5S_SIMPLE", "dims": [12, ], "maxdims": [20, ]} - layout = (20,) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - # chunk layout can't be larger than dataspace - self.assertTrue(num_bytes < CHUNK_MIN) - self.assertEqual(expanded, (20,)) - - typesize = 1 - shape = {"class": "H5S_SIMPLE", "dims": [10, 10, 10]} - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - # chunk layout can't be larger than dataspace - self.assertTrue(num_bytes < CHUNK_MIN) - self.assertEqual(expanded, (10, 10, 10)) - - shape = {"class": "H5S_SIMPLE", "dims": [1000, 2000, 3000]} - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = {"class": "H5S_SIMPLE", "dims": [1000,]} - layout = (10,) - num_bytes = getChunkSize(layout, "H5T_VARIABLE") - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, "H5T_VARIABLE", shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, "H5T_VARIABLE") - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = { - "class": "H5S_SIMPLE", - "dims": [1000, 10, 1000], - "maxdims": [1000, 100, 1000], - } - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, 
chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = { - "class": "H5S_SIMPLE", - "dims": [1000, 0, 1000], - "maxdims": [1000, 100, 1000], - } - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = { - "class": "H5S_SIMPLE", - "dims": [1000, 10, 1000], - "maxdims": [1000, 0, 1000], - } - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - def testGetContiguiousLayout(self): - - typesize = 4 - chunk_min = 400 - chunk_max = 800 - - def get_num_bytes(dims): - num_bytes = typesize - for n in dims: - num_bytes *= n - return num_bytes - - try: - shape = {"class": "H5S_SIMPLE", "dims": [100, 100]} - layout = getContiguousLayout(shape, "H5T_VARIABLE") - self.assertTrue(False) - except ValueError: - pass # expected - - shape = {"class": "H5S_NULL"} - layout = getContiguousLayout(shape, typesize) - self.assertTrue(layout is None) - - shape = {"class": "H5S_SCALAR"} - layout = getContiguousLayout(shape, typesize) - self.assertEqual(layout, (1,)) - - for extent in (1, 100, 10000): - dims = [ - extent, - ] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout( - shape, typesize, chunk_min=chunk_min, chunk_max=chunk_max - ) - self.assertTrue(len(layout), 1) - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - - self.assertTrue(chunk_bytes <= chunk_max) - - for extent in (1, 9, 90): - dims = [extent, extent] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout( - shape, typesize, chunk_min=chunk_min, chunk_max=chunk_max - ) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= extent) - self.assertEqual(layout[1], extent) - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - self.assertTrue(chunk_bytes <= chunk_max) - - for extent in (1, 10, 100): - dims = [extent, extent, 50] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout( - shape, typesize, chunk_min=chunk_min, chunk_max=chunk_max - ) - self.assertTrue(len(layout), 3) - for i in range(3): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= dims[i]) - - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - self.assertTrue(chunk_bytes <= chunk_max) - - for extent in (1, 100, 1000): - dims = [extent, 4] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout( - shape, typesize, chunk_min=chunk_min, chunk_max=chunk_max - ) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= dims[i]) - - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - - if space_bytes > chunk_min: - 
self.assertTrue(chunk_bytes >= chunk_min) - self.assertTrue(chunk_bytes <= chunk_max) - def testGetNumChunks(self): - datashape = [ - 100, - ] + datashape = [100,] layout = (10,) selection = getHyperslabSelection(datashape) count = getNumChunks(selection, layout) diff --git a/tests/unit/dset_util_test.py b/tests/unit/dset_util_test.py index 0e77ab1b..7c2028b9 100755 --- a/tests/unit/dset_util_test.py +++ b/tests/unit/dset_util_test.py @@ -15,7 +15,8 @@ sys.path.append("../..") from hsds.util.dsetUtil import getHyperslabSelection, getSelectionShape -from hsds.util.dsetUtil import getSelectionList, ItemIterator, getSelectionPagination +from hsds.util.dsetUtil import getSelectionList, ItemIterator, getSelectionPagination, expandChunk +from hsds.util.dsetUtil import guessChunk, shrinkChunk, getChunkSize, getContiguousLayout class DsetUtilTest(unittest.TestCase): @@ -25,6 +26,277 @@ def __init__(self, *args, **kwargs): self.logger = logging.getLogger() self.logger.setLevel(logging.WARNING) + def testGuessChunk(self): + + typesize = "H5T_VARIABLE" + logging.debug("hello") + + shape = {"class": "H5S_NULL"} + layout = guessChunk(shape, typesize) + self.assertTrue(layout is None) + + shape = {"class": "H5S_SCALAR"} + layout = guessChunk(shape, typesize) + self.assertEqual(layout, (1,)) + + shape = {"class": "H5S_SIMPLE", "dims": [100, 100]} + layout = guessChunk(shape, typesize) + self.assertTrue(len(layout), 2) + for i in range(2): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= 100) + + typesize = 8 + layout = guessChunk(shape, typesize) + self.assertTrue(len(layout), 2) + for i in range(2): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= 100) + + shape = {"class": "H5S_SIMPLE", "dims": [5]} + layout = guessChunk(shape, typesize) + self.assertEqual(layout, (5,)) + + shape = {"class": "H5S_SIMPLE", "dims": [100, 100, 100]} + layout = guessChunk(shape, typesize) + print("layout:", layout) + self.assertTrue(len(layout), 3) + for i in range(3): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= 100) + + shape = {"class": "H5S_SIMPLE", "dims": [100, 0], "maxdims": [100, 0]} + layout = guessChunk(shape, typesize) + self.assertTrue(len(layout), 2) + for i in range(2): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= 1024) + + shape = {"class": "H5S_SCALAR"} + layout = guessChunk(shape, typesize) + self.assertEqual(layout, (1,)) + + shape = {"class": "H5S_NULL"} + layout = guessChunk(shape, typesize) + self.assertEqual(layout, None) + + def testShrinkChunk(self): + CHUNK_MIN = 500 + CHUNK_MAX = 5000 + typesize = 1 + layout = (1, 2, 3) + shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) + self.assertEqual(shrunk, layout) + + layout = (100, 200, 300) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes > CHUNK_MAX) + shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) + rank = len(layout) + for i in range(rank): + self.assertTrue(shrunk[i] >= 1) + self.assertTrue(shrunk[i] <= 1000 * (i + 1)) + num_bytes = getChunkSize(shrunk, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + layout = (300, 200, 100) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes > CHUNK_MAX) + shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) + rank = len(layout) + for i in range(rank): + self.assertTrue(shrunk[i] >= 1) + self.assertTrue(shrunk[i] <= 1000 * (3 - i)) + num_bytes = getChunkSize(shrunk, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + 
self.assertTrue(num_bytes < CHUNK_MAX) + + CHUNK_MIN = 1 * 1024 * 1024 + CHUNK_MAX = 4 * 1024 * 1024 + typesize = 4 + layout = (117, 201, 189, 1) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes > CHUNK_MAX) + shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) + self.assertEqual(shrunk, (59, 101, 95, 1)) + num_bytes = getChunkSize(shrunk, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + def testExpandChunk(self): + CHUNK_MIN = 5000 + CHUNK_MAX = 50000 + + typesize = 20 + shape = {"class": "H5S_SIMPLE", "dims": [12, ], "maxdims": [20, ]} + layout = (20,) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + # chunk layout can't be larger than dataspace + self.assertTrue(num_bytes < CHUNK_MIN) + self.assertEqual(expanded, (20,)) + + typesize = 1 + shape = {"class": "H5S_SIMPLE", "dims": [10, 10, 10]} + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + # chunk layout can't be larger than dataspace + self.assertTrue(num_bytes < CHUNK_MIN) + self.assertEqual(expanded, (10, 10, 10)) + + shape = {"class": "H5S_SIMPLE", "dims": [1000, 2000, 3000]} + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + shape = {"class": "H5S_SIMPLE", "dims": [1000,]} + layout = (10,) + num_bytes = getChunkSize(layout, "H5T_VARIABLE") + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, "H5T_VARIABLE", shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, "H5T_VARIABLE") + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + shape = { + "class": "H5S_SIMPLE", + "dims": [1000, 10, 1000], + "maxdims": [1000, 100, 1000], + } + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + shape = { + "class": "H5S_SIMPLE", + "dims": [1000, 0, 1000], + "maxdims": [1000, 100, 1000], + } + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + shape = { + "class": "H5S_SIMPLE", + "dims": [1000, 10, 1000], + "maxdims": [1000, 0, 1000], + } + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + def testGetContiguousLayout(self): + typesize = 4 + chunk_min = 400 + chunk_max = 800 + + kwargs = {"chunk_min": chunk_min, "chunk_max": chunk_max} + + def 
get_num_bytes(dims):
+            num_bytes = typesize
+            for n in dims:
+                num_bytes *= n
+            return num_bytes
+
+        try:
+            shape = {"class": "H5S_SIMPLE", "dims": [100, 100]}
+            layout = getContiguousLayout(shape, "H5T_VARIABLE", **kwargs)
+            self.assertTrue(False)
+        except ValueError:
+            pass  # expected
+
+        shape = {"class": "H5S_NULL"}
+        layout = getContiguousLayout(shape, typesize, **kwargs)
+        self.assertTrue(layout is None)
+
+        shape = {"class": "H5S_SCALAR"}
+        layout = getContiguousLayout(shape, typesize, **kwargs)
+        self.assertEqual(layout, (1,))
+
+        for extent in (1, 100, 10000):
+            dims = [
+                extent,
+            ]
+            shape = {"class": "H5S_SIMPLE", "dims": dims}
+            layout = getContiguousLayout(shape, typesize, **kwargs)
+            self.assertTrue(len(layout), 1)
+            chunk_bytes = get_num_bytes(layout)
+            space_bytes = get_num_bytes(dims)
+            if space_bytes > chunk_min:
+                self.assertTrue(chunk_bytes >= chunk_min)
+
+            self.assertTrue(chunk_bytes <= chunk_max)
+
+        for extent in (1, 9, 90):
+            dims = [extent, extent]
+            shape = {"class": "H5S_SIMPLE", "dims": dims}
+            layout = getContiguousLayout(shape, typesize, **kwargs)
+            self.assertTrue(len(layout), 2)
+            for i in range(2):
+                self.assertTrue(layout[i] >= 1)
+                self.assertTrue(layout[i] <= extent)
+            self.assertEqual(layout[1], extent)
+            chunk_bytes = get_num_bytes(layout)
+            space_bytes = get_num_bytes(dims)
+
+            if space_bytes > chunk_min:
+                self.assertTrue(chunk_bytes >= chunk_min)
+                self.assertTrue(chunk_bytes <= chunk_max)
+
+        for extent in (1, 10, 100):
+            dims = [extent, extent, 50]
+            shape = {"class": "H5S_SIMPLE", "dims": dims}
+            layout = getContiguousLayout(shape, typesize, **kwargs)
+            self.assertTrue(len(layout), 3)
+            for i in range(3):
+                self.assertTrue(layout[i] >= 1)
+                self.assertTrue(layout[i] <= dims[i])
+
+            chunk_bytes = get_num_bytes(layout)
+            space_bytes = get_num_bytes(dims)
+
+            if space_bytes > chunk_min:
+                self.assertTrue(chunk_bytes >= chunk_min)
+                self.assertTrue(chunk_bytes <= chunk_max)
+
+        for extent in (1, 100, 1000):
+            dims = [extent, 4]
+            shape = {"class": "H5S_SIMPLE", "dims": dims}
+            layout = getContiguousLayout(shape, typesize, **kwargs)
+            self.assertTrue(len(layout), 2)
+            for i in range(2):
+                self.assertTrue(layout[i] >= 1)
+                self.assertTrue(layout[i] <= dims[i])
+
+            chunk_bytes = get_num_bytes(layout)
+            space_bytes = get_num_bytes(dims)
+
+            if space_bytes > chunk_min:
+                self.assertTrue(chunk_bytes >= chunk_min)
+                self.assertTrue(chunk_bytes <= chunk_max)
+
     def testGetHyperslabSelection(self):
         # getHyperslabSelection(dsetshape, start, stop, step)
         # 1-D case

From 52f42f3898cab1916cdf816f4926f281ef0d9d17 Mon Sep 17 00:00:00 2001
From: John Readey
Date: Wed, 21 May 2025 12:51:10 +0200
Subject: [PATCH 27/49] refactor post dataset args to service_lib.py

---
 hsds/chunk_sn.py             |   1 +
 hsds/ctype_sn.py             |   2 +-
 hsds/dset_dn.py              |   2 +
 hsds/dset_sn.py              | 152 +++++++++++++++++++++--------------
 hsds/post_crawl.py           |   1 +
 hsds/servicenode_lib.py      |  93 +++++----------------
 tests/integ/dataset_test.py  |  29 ++++---
 tests/integ/datatype_test.py |   1 +
 8 files changed, 137 insertions(+), 144 deletions(-)

diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py
index 87f2fdb4..a039a911 100755
--- a/hsds/chunk_sn.py
+++ b/hsds/chunk_sn.py
@@ -515,6 +515,7 @@ async def PUT_Value(request):
     # get state for dataset from DN - will need this to validate
     # some of the query parameters
     dset_json = await getDsetJson(app, dset_id, bucket=bucket)
+    log.debug(f"got dset_json: {dset_json}")
     datashape = dset_json["shape"]
     if isNullSpace(dset_json):
diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py
index 
b2a3d260..83f581a6 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -210,7 +210,7 @@ async def POST_Datatype(request): log.warn(msg) raise HTTPBadRequest(reason=msg) kwargs = getCreateArgs(item, root_id=root_id, bucket=bucket) - kwargs["ignore_link"] = True + kwargs["ignore_link"] = True # will create parent links later kwarg_list.append(kwargs) kwargs = {"bucket": bucket, "root_id": root_id} log.debug(f"createDatatypeObjects, items: {kwarg_list}") diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index bca36457..5b28f69c 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -170,6 +170,8 @@ async def POST_Dataset(request): resp_json["shape"] = shape_json resp_json["lastModified"] = dset_json["lastModified"] resp_json["attributeCount"] = len(attrs) + if layout is not None: + resp_json["layout"] = layout resp = json_response(resp_json, status=201) log.response(request, resp=resp) diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 75110bb7..a60f87fd 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -15,15 +15,15 @@ # from json import JSONDecodeError -from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound +from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound, HTTPInternalServerError -#from h5json.hdf5dtype import createDataType -from h5json.array_util import getNumElements #, jsonToArray +from h5json.hdf5dtype import createDataType +from h5json.array_util import getNumElements, jsonToArray from h5json.objid import isValidUuid, isSchema2Id from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse, getBooleanParam -from .util.dsetUtil import getPreviewQuery# , getShapeDims, validateChunkLayout +from .util.dsetUtil import getPreviewQuery, getShapeDims from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain @@ -31,7 +31,7 @@ from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo from .servicenode_lib import getDatasetCreateArgs, createDataset, deleteObject -from .dset_lib import updateShape, deleteAllChunks #, doHyperslabWrite +from .dset_lib import updateShape, deleteAllChunks, doHyperslabWrite from .post_crawl import createDatasets from .domain_crawl import DomainCrawler from . 
import hsds_logger as log @@ -497,12 +497,23 @@ async def POST_Dataset(request): post_rsp = None datatype_json = None + init_values = [] # value initializer for each object + + def _updateInitValuesList(kwargs): + # remove value key from kwargs and append + # to init_values list + if "value" in kwargs: + init_values.append(kwargs["value"]) + del kwargs["value"] + else: + # add a placeholder + init_values.append(None) # # handle case of committed type input # if isinstance(body, dict) and "type" in body: - + body_type = body["type"] log.debug(f"got datatype: {body_type}") if isinstance(body_type, str) and body_type.startswith("t-"): @@ -532,10 +543,11 @@ async def POST_Dataset(request): elif count == 1: # just create one object in typical way kwargs = getDatasetCreateArgs(body[0], - root_id=root_id, - type=datatype_json, - bucket=bucket, - implicit=implicit) + root_id=root_id, + type=datatype_json, + bucket=bucket, + implicit=implicit) + _updateInitValuesList(kwargs) else: # create multiple dataset objects kwarg_list = [] # list of kwargs for each object @@ -546,7 +558,11 @@ async def POST_Dataset(request): msg = f"Post_Dataset - invalid item type: {type(item)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - kwargs = getDatasetCreateArgs(item, root_id=root_id, type=datatype_json, bucket=bucket) + kwargs = getDatasetCreateArgs(item, + root_id=root_id, + type=datatype_json, + bucket=bucket) + _updateInitValuesList(kwargs) kwargs["ignore_link"] = True kwarg_list.append(kwargs) kwargs = {"bucket": bucket, "root_id": root_id} @@ -556,7 +572,12 @@ async def POST_Dataset(request): post_rsp = await createDatasets(app, kwarg_list, **kwargs) else: # single object create - kwargs = getDatasetCreateArgs(body, root_id=root_id, type=datatype_json, bucket=bucket, implicit=implicit) + kwargs = getDatasetCreateArgs(body, + root_id=root_id, + type=datatype_json, + bucket=bucket, + implicit=implicit) + _updateInitValuesList(kwargs) log.debug(f"kwargs for dataset create: {kwargs}") if post_rsp is None: @@ -569,52 +590,33 @@ async def POST_Dataset(request): # add any links in multi request objects = post_rsp["objects"] obj_count = len(objects) - log.debug(f"Post datatype multi create: {obj_count} objects") + log.debug(f"Post dataset multi create: {obj_count} objects") if len(body) != obj_count: msg = f"Expected {obj_count} objects but got {len(body)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - parent_ids = {} - for index in range(obj_count): - item = body[index] - if "link" in item: - link_item = item["link"] - parent_id = link_item.get("id") - title = link_item.get("name") - if parent_id and title: - # add a hard link - object = objects[index] - obj_id = object["id"] - if parent_id not in parent_ids: - parent_ids[parent_id] = {} - links = parent_ids[parent_id] - links[title] = {"id": obj_id} - if parent_ids: - log.debug(f"POST dataset multi - adding links: {parent_ids}") - kwargs = {"action": "put_link", "bucket": bucket} - kwargs["replace"] = True - - crawler = DomainCrawler(app, parent_ids, **kwargs) - - # will raise exception on not found, server busy, etc. 
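A minimal sketch of the value-initializer bookkeeping used above (condensed
here with dict.pop; the kwarg_list contents are illustrative — in the handler
each kwargs dict comes from getDatasetCreateArgs):

    init_values = []

    def _updateInitValuesList(kwargs):
        # pop "value" so the create call never sees it; keep a positional
        # placeholder so init_values[i] lines up with objects[i]
        init_values.append(kwargs.pop("value", None))

    kwarg_list = [{"type": "H5T_STD_I32LE", "value": 42}, {"type": "H5T_STD_I32LE"}]
    for kwargs in kwarg_list:
        _updateInitValuesList(kwargs)
    # init_values == [42, None] and no kwargs dict still has a "value" key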
-            await crawler.crawl()
-
-            status = crawler.get_status()
-
-            log.info(f"DomainCrawler done for put_links action, status: {status}")
-    """
-    if "value" in body and body["value"]:
-        # data to initialize dataset included in request
-        input_data = body["value"]
-        msg = "input data doesn't match request type and shape"
+    else:
+        obj_count = 1  # single object create
+        objects = [post_rsp, ]  # treat as an array to make the following code more consistent
+
+    if len(init_values) != obj_count:
+        msg = f"Expected {obj_count} init values"
+        log.error(msg)
+        raise HTTPInternalServerError()
+
+    # write any init data values
+    for index in range(obj_count):
+        init_data = init_values[index]
+        if init_data is None:
+            continue
+        dset_json = objects[index]
+        log.debug(f"init value, post_rsp: {dset_json}")
+        shape_json = dset_json["shape"]
+        type_json = dset_json["type"]
+        arr_dtype = createDataType(type_json)
+        msg = "input data doesn't match request type and shape"
         dims = getShapeDims(shape_json)
-        if not dims:
-            log.warn(msg)
-            raise HTTPBadRequest(reason=msg)
-        arr_dtype = createDataType(datatype)
-
-        try:
-            input_arr = jsonToArray(dims, arr_dtype, input_data)
+        try:
+            input_arr = jsonToArray(dims, arr_dtype, init_data)
         except ValueError:
             log.warn(f"ValueError: {msg}")
             raise HTTPBadRequest(reason=msg)
@@ -625,14 +627,9 @@ async def POST_Dataset(request):
             log.warn(f"IndexError: {msg}")
             raise HTTPBadRequest(reason=msg)
         log.debug(f"got json arr: {input_arr.shape}")
-    else:
-        input_arr = None
-
-    # write data if provided
-    if input_arr is not None:
+        # write data if provided
         log.debug(f"write input_arr: {input_arr}")
-        # mixin the layout
-        dset_json["layout"] = layout
         # make selection for entire dataspace
         dims = getShapeDims(shape_json)
         slices = []
@@ -648,7 +645,44 @@ async def POST_Dataset(request):
         kwargs["data"] = input_arr
         # do write
         await doHyperslabWrite(app, request, **kwargs)
-    """
+
+    if "objects" in post_rsp:
+        # add any links in multi request
+        objects = post_rsp["objects"]
+        obj_count = len(objects)
+        log.debug(f"Post dataset multi create: {obj_count} objects")
+        if len(body) != obj_count:
+            msg = f"Expected {obj_count} objects but got {len(body)}"
+            log.warn(msg)
+            raise HTTPBadRequest(reason=msg)
+        parent_ids = {}
+        for index in range(obj_count):
+            item = body[index]
+            if "link" in item:
+                link_item = item["link"]
+                parent_id = link_item.get("id")
+                title = link_item.get("name")
+                if parent_id and title:
+                    # add a hard link
+                    object = objects[index]
+                    obj_id = object["id"]
+                    if parent_id not in parent_ids:
+                        parent_ids[parent_id] = {}
+                    links = parent_ids[parent_id]
+                    links[title] = {"id": obj_id}
+        if parent_ids:
+            log.debug(f"POST dataset multi - adding links: {parent_ids}")
+            kwargs = {"action": "put_link", "bucket": bucket}
+            kwargs["replace"] = True
+
+            crawler = DomainCrawler(app, parent_ids, **kwargs)
+
+            # will raise exception on not found, server busy, etc.
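+            # (the parent_ids map groups the new links by parent group, so
+            #  the crawler issues one put_link action per distinct parent
+            #  rather than one per created dataset)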
+ await crawler.crawl() + status = crawler.get_status() + + log.info(f"DomainCrawler done for put_links action, status: {status}") + # dataset creation successful resp = await jsonResponse(request, post_rsp, status=201) log.response(request, resp=resp) diff --git a/hsds/post_crawl.py b/hsds/post_crawl.py index 198b1492..2c79b47e 100644 --- a/hsds/post_crawl.py +++ b/hsds/post_crawl.py @@ -259,6 +259,7 @@ async def createDatatypeObjs(app, items: list, root_id=None, bucket=None): rsp_json = await _createObjects(app, items=items, root_id=root_id, bucket=bucket) return rsp_json + async def createDatasets(app, items: list, root_id=None, bucket=None): """ create dataset objects based on parameters in items list """ diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index bc089420..cbc71b0a 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -1300,7 +1300,7 @@ def validateDatasetCreationProps(creation_props, type_json=None, shape=None): msg = "shape and type must be set to use fillValue" log.warn(msg) raise HTTPBadRequest(reason=msg) - + # validate fill value compatible with type dt = createDataType(type_json) fill_value = creation_props["fillValue"] @@ -1327,14 +1327,14 @@ def validateDatasetCreationProps(creation_props, type_json=None, shape=None): msg = f"invalid fill value: {fill_value}" log.warn(msg) raise HTTPBadRequest(reason=msg) - + if "filters" in creation_props: if not type_json or not shape: msg = "shape and type must be set to use filters" log.warn(msg) raise HTTPBadRequest(reason=msg) - - supported_filters = getSupportedFilters() + + supported_filters = getSupportedFilters() # will raise bad request exception if not valid supported_filters = getSupportedFilters(include_compressors=True) log.debug(f"supported_filters: {supported_filters}") @@ -1343,6 +1343,7 @@ def validateDatasetCreationProps(creation_props, type_json=None, shape=None): log.debug(f"setting filters to: {filters_out}") creation_props["filters"] = filters_out + def getCreateArgs(body, root_id=None, bucket=None, @@ -1487,13 +1488,14 @@ def getCreateArgs(body, pass # no type return kwargs + def getDatasetCreateArgs(body, - root_id=None, - bucket=None, - type=None, - implicit=False, - chunk_table=None, - ignore_link=False): + root_id=None, + bucket=None, + type=None, + implicit=False, + chunk_table=None, + ignore_link=False): """ get args for createDataset from request body """ @@ -1504,12 +1506,12 @@ def getDatasetCreateArgs(body, type=type, implicit=implicit, ignore_link=ignore_link) - - if not "type" in kwargs: + + if "type" not in kwargs: msg = "no type specified for create dataset" log.warn(msg) raise HTTPBadRequest(reason=msg) - + type_json = kwargs["type"] # # Validate shape if present @@ -1518,7 +1520,7 @@ def getDatasetCreateArgs(body, # will return scalar shape if no shape key in body shape_json = getShapeJson(body) kwargs["shape"] = shape_json - + # get layout for dataset creation log.debug("getting dataset creation settings") layout_props = None @@ -1584,7 +1586,7 @@ def getDatasetCreateArgs(body, msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " msg += f"max: {max_chunk_size}" log.debug(msg) - + # adjust the chunk shape if chunk size is too small or too big adjusted_chunk_dims = None if chunk_size < min_chunk_size: @@ -1599,7 +1601,7 @@ def getDatasetCreateArgs(body, log.debug(msg) opts = {"chunk_max": max_chunk_size} adjusted_chunk_dims = shrinkChunk(chunk_dims, item_size, **opts) - + if adjusted_chunk_dims: msg = f"requested chunk_dimensions: {chunk_dims} modified " 
msg += f"dimensions: {adjusted_chunk_dims}" @@ -1676,12 +1678,12 @@ def getDatasetCreateArgs(body, msg += f"{max_chunk_size}, for {layout_class} dataset" log.warn(msg) layout["dims"] = chunk_dims - + if layout: log.debug(f"setting layout to: {layout}") kwargs["layout"] = layout - # + # # get input data if present # if "value" in body and body["value"]: @@ -1690,7 +1692,7 @@ def getDatasetCreateArgs(body, msg = "null shape datasets can not have initial values" log.warn(msg) raise HTTPBadRequest(reason=msg) - + input_data = body["value"] msg = "input data doesn't match request type and shape" dims = getShapeDims(shape_json) @@ -1712,7 +1714,7 @@ def getDatasetCreateArgs(body, raise HTTPBadRequest(reason=msg) log.debug(f"got json arr: {input_arr.shape}") kwargs["value"] = input_data - + return kwargs @@ -1969,7 +1971,6 @@ async def createDataset(app, obj_id=None, creation_props=None, layout=None, - value=None, attrs=None, links=None, implicit=None, @@ -2001,54 +2002,4 @@ async def createDataset(app, kwargs["bucket"] = bucket dset_json = await createObject(app, **kwargs) - if value: - log.debug(f"tbd - set dataset value to: {value}") - shape_json = kwargs["shape"] - type_json = kwargs["type"] - # data to initialize dataset included in request - msg = "input data doesn't match request type and shape" - dims = getShapeDims(shape_json) - if not dims: - log.warn(msg) - raise HTTPBadRequest(reason=msg) - arr_dtype = createDataType(type_json) - - try: - input_arr = jsonToArray(dims, arr_dtype, value) - except ValueError: - log.warn(f"ValueError: {msg}") - raise HTTPBadRequest(reason=msg) - except TypeError: - log.warn(f"TypeError: {msg}") - raise HTTPBadRequest(reason=msg) - except IndexError: - log.warn(f"IndexError: {msg}") - raise HTTPBadRequest(reason=msg) - log.debug(f"got json arr: {input_arr.shape}") - else: - input_arr = None - - # write data if provided - if input_arr is not None: - log.debug(f"write input_arr: {input_arr}") - # mixin the layout - dset_json["layout"] = layout - # make selection for entire dataspace - dims = getShapeDims(shape_json) - slices = [] - for dim in dims: - s = slice(0, dim, 1) - slices.append(s) - # make a one page list to handle the write in one chunk crawler run - # (larger write request should user binary streaming) - kwargs = {"page_number": 0, "page": slices} - kwargs["dset_json"] = dset_json - kwargs["bucket"] = bucket - kwargs["select_dtype"] = input_arr.dtype - kwargs["data"] = input_arr - log.debug(f"kwargs for hyperslab write: {kwargs}") - # do write - #request = None # don't need in this case since not reading from input stream - #await doHyperslabWrite(app, request, **kwargs) - return dset_json diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index 3d4610a4..12fe3c1f 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -64,31 +64,34 @@ def testScalarDataset(self): rsp = self.session.post(req, data=json.dumps(data), headers=headers) self.assertEqual(rsp.status_code, 201) rspJson = json.loads(rsp.text) - self.assertEqual(rspJson["attributeCount"], 0) - dset_id = rspJson["id"] - self.assertTrue(helper.validateId(dset_id)) - - # read back the obj - req = self.endpoint + "/datasets/" + dset_id - rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) expected_keys = [ "id", "shape", - "hrefs", "layout", - "creationProperties", "attributeCount", "created", "lastModified", "root", - "domain", ] - for name in expected_keys: self.assertTrue(name in 
rspJson)
+
+        # additional keys expected for GET response
+        expected_keys.append("hrefs")
+        expected_keys.append("creationProperties")
+        expected_keys.append("domain")
+
+        self.assertEqual(rspJson["attributeCount"], 0)
+        dset_id = rspJson["id"]
+        self.assertTrue(helper.validateId(dset_id))
+
+        # read back the obj
+        req = self.endpoint + "/datasets/" + dset_id
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+
+        self.assertEqual(rspJson["id"], dset_id)
         self.assertEqual(rspJson["root"], root_uuid)
         self.assertEqual(rspJson["domain"], domain)
diff --git a/tests/integ/datatype_test.py b/tests/integ/datatype_test.py
index ce7418cb..7bf90d09 100755
--- a/tests/integ/datatype_test.py
+++ b/tests/integ/datatype_test.py
@@ -475,6 +475,7 @@ def testPostWithLink(self):
         rsp = self.session.get(req, headers=headers)
         self.assertEqual(rsp.status_code, 200)  # link doesn't exist yet
         rspJson = json.loads(rsp.text)
+        self.assertTrue("link" in rspJson)
         link_json = rspJson["link"]
         self.assertEqual(link_json["collection"], "datatypes")

From ce45804747568c426721d6ed33723a6eebb0e365 Mon Sep 17 00:00:00 2001
From: John Readey
Date: Wed, 21 May 2025 21:04:10 +0200
Subject: [PATCH 28/49] add multi-dataset test with init data

---
 tests/integ/dataset_test.py | 89 +++++++++++++++++++++++++++++++++++--
 tests/integ/value_test.py   | 80 +++++++++++++++++++++++++++++++++
 2 files changed, 165 insertions(+), 4 deletions(-)

diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py
index 12fe3c1f..cf15ada2 100755
--- a/tests/integ/dataset_test.py
+++ b/tests/integ/dataset_test.py
@@ -20,9 +20,8 @@
 import config

 # min/max chunk size - these can be set by config, but
-# practially the min config value should be larger than
-# CHUNK_MIN and the max config value should less than
-# CHUNK_MAX
+# practically the min config value should be larger than
+# CHUNK_MIN and the max config value should be less than CHUNK_MAX
 CHUNK_MIN = 1024  # lower limit (1024b)
 CHUNK_MAX = 50 * 1024 * 1024  # upper limit (50M)

@@ -2751,7 +2750,7 @@ def testExtendibleDatasetChunkPartitioning(self):
         req = self.endpoint + "/datasets"
         # 50K x 80K x 90K dataset
         dims = [0, 80000, 90000]
-        # unlimited extend in dim 0, fixeed in dimension 2, extenbile by 10x in dim 3
+        # unlimited extent in dim 0, fixed in dim 1, extensible by 10x in dim 2
         max_dims = [0, 80000, 900000]

         payload = {"type": "H5T_IEEE_F32LE", "shape": dims, "maxdims": max_dims}
@@ -2815,6 +2814,88 @@ def testDatasetEmptyChunkExtent(self):
         # Should fail with Bad Request due to invalid layout value
         self.assertEqual(rsp.status_code, 400)  # create dataset

+    def testDatasetPostMulti(self):
+        # test POST with multi-object creation
+        domain = self.base_domain + "/testDatasetPostMulti.h5"
+        helper.setupDomain(domain)
+        print("testDatasetPostMulti", domain)
+        headers = helper.getRequestHeaders(domain=domain)
+
+        # get root id
+        req = helper.getEndpoint() + "/"
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        root_uuid = rspJson["root"]
+        helper.validateId(root_uuid)
+
+        # get root group and verify link count is 0
+        req = helper.getEndpoint() + "/groups/" + root_uuid
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertEqual(rspJson["linkCount"], 0)
+
+        dataset_count = 3
+        datatype = "H5T_STD_I32LE"
+        payload = []
+        for _ in range(dataset_count):
+            dataset_args = {"type": datatype}
+            payload.append(dataset_args)
+
+        req = helper.getEndpoint() + "/datasets"
+        rsp = self.session.post(req, data=json.dumps(payload), headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("objects" in rspJson)
+        rsp_objs = rspJson["objects"]
+        self.assertEqual(len(rsp_objs), dataset_count)
+
+        expected_keys = [
+            "id",
+            "shape",
+            "layout",
+            "attributeCount",
+            "created",
+            "lastModified",
+            "root",
+        ]
+
+        for i in range(dataset_count):
+            obj_json = rsp_objs[i]
+            self.assertEqual(obj_json["attributeCount"], 0)
+            dset_id = obj_json["id"]
+            self.assertTrue(helper.validateId(dset_id))
+            self.assertTrue(dset_id.startswith("d-"))
+            for key in expected_keys:
+                self.assertTrue(key in obj_json)
+
+        # create a set of linked datasets
+        for i in range(dataset_count):
+            item = payload[i]
+            item["link"] = {"id": root_uuid, "name": f"dset_{i + 1}"}
+        rsp = self.session.post(req, data=json.dumps(payload), headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+        rspJson = json.loads(rsp.text)
+        self.assertTrue("objects" in rspJson)
+        rsp_objs = rspJson["objects"]
+        self.assertEqual(len(rsp_objs), dataset_count)
+        for i in range(dataset_count):
+            json_rsp = rsp_objs[i]
+            self.assertEqual(json_rsp["attributeCount"], 0)
+            dset_id = json_rsp["id"]
+            self.assertTrue(helper.validateId(dset_id))
+            for key in expected_keys:
+                self.assertTrue(key in json_rsp)
+
+        # get root group and verify link count is dataset_count
+        req = helper.getEndpoint() + "/groups/" + root_uuid
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        self.assertEqual(rspJson["linkCount"], dataset_count)
+

 if __name__ == "__main__":
     # setup test files
diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py
index 9ad5c5e6..1f34a99e 100755
--- a/tests/integ/value_test.py
+++ b/tests/integ/value_test.py
@@ -1012,6 +1012,86 @@ def testScalarDatasetInitData(self):
         self.assertTrue("value" in rspJson)
         self.assertEqual(rspJson["value"], 42)

+    def testScalarDatasetInitDataMulti(self):
+        # Test creation of multiple scalar dataset objects along with initial data
+        print("testScalarDatasetInitDataMulti", self.base_domain)
+        headers = helper.getRequestHeaders(domain=self.base_domain)
+        req = self.endpoint + "/"
+
+        # Get root uuid
+        rsp = self.session.get(req, headers=headers)
+        self.assertEqual(rsp.status_code, 200)
+        rspJson = json.loads(rsp.text)
+        root_uuid = rspJson["root"]
+        helper.validateId(root_uuid)
+
+        dataset_count = 3
+        datatype = "H5T_STD_I32LE"
+        payload = []
+        for i in range(dataset_count):
+            dataset_args = {"type": datatype}
+            dataset_args["value"] = i
+            payload.append(dataset_args)
+
+        # create dataset objects
+        req = self.endpoint + "/datasets"
+        rsp = self.session.post(req, data=json.dumps(payload), headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+        rspJson = json.loads(rsp.text)
+
+        self.assertTrue("objects" in rspJson)
+        rsp_objs = rspJson["objects"]
+        self.assertEqual(len(rsp_objs), dataset_count)
+
+        for i in range(dataset_count):
+            obj_json = rsp_objs[i]
+            self.assertEqual(obj_json["attributeCount"], 0)
+            dset_id = obj_json["id"]
+            self.assertTrue(helper.validateId(dset_id))
+            self.assertTrue(dset_id.startswith("d-"))
+
+        # read back the obj
+        for i in range(dataset_count):
+            dset_id = rsp_objs[i]["id"]
+            req = self.endpoint + "/datasets/" + dset_id
+            rsp = self.session.get(req, headers=headers)
+            
self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + expected_keys = [ + "id", + "shape", + "hrefs", + "layout", + "creationProperties", + "attributeCount", + "created", + "lastModified", + "root", + "domain", + ] + + for name in expected_keys: + self.assertTrue(name in rspJson) + self.assertEqual(rspJson["id"], dset_id) + self.assertEqual(rspJson["root"], root_uuid) + self.assertEqual(rspJson["domain"], self.base_domain) + self.assertEqual(rspJson["attributeCount"], 0) + shape_json = rspJson["shape"] + self.assertTrue(shape_json["class"], "H5S_SCALAR") + self.assertTrue(rspJson["type"], "H5T_STD_I32LE") + + # read the data back + for i in range(dataset_count): + dset_id = rsp_objs[i]["id"] + req = self.endpoint + "/datasets/" + dset_id + "/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], i) + def testNullSpaceDataset(self): # Test attempted read/write to null space dataset print("testNullSpaceDataset", self.base_domain) From 88e06919b3f41045abfbf6afcba00fe3300774da Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 6 Jun 2025 20:32:21 +0200 Subject: [PATCH 29/49] allow client group id for PUT domain --- hsds/chunk_dn.py | 8 ++-- hsds/chunk_sn.py | 6 +-- hsds/ctype_dn.py | 5 +++ hsds/ctype_sn.py | 8 ++-- hsds/domain_sn.py | 39 +++++++++++++--- hsds/dset_dn.py | 5 +++ hsds/dset_sn.py | 14 +++--- hsds/group_dn.py | 4 ++ hsds/group_sn.py | 6 +-- hsds/servicenode_lib.py | 8 +++- tests/integ/domain_test.py | 91 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 164 insertions(+), 30 deletions(-) diff --git a/hsds/chunk_dn.py b/hsds/chunk_dn.py index 97e86f01..02545b85 100644 --- a/hsds/chunk_dn.py +++ b/hsds/chunk_dn.py @@ -65,7 +65,7 @@ async def PUT_Chunk(request): log.error(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(chunk_id, "Chunk"): + if not isValidUuid(chunk_id, obj_class="chunks"): msg = f"Invalid chunk id: {chunk_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -339,7 +339,7 @@ async def GET_Chunk(request): msg = "Missing chunk id" log.error(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(chunk_id, "Chunk"): + if not isValidUuid(chunk_id, obj_class="chunks"): msg = f"Invalid chunk id: {chunk_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -655,7 +655,7 @@ async def POST_Chunk(request): chunk_index = getChunkIndex(chunk_id) log.debug(f"chunk_index: {chunk_index}") - if not isValidUuid(chunk_id, "Chunk"): + if not isValidUuid(chunk_id, obj_class="chunks"): msg = f"Invalid chunk id: {chunk_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -842,7 +842,7 @@ async def DELETE_Chunk(request): raise HTTPBadRequest(reason=msg) log.info(f"DELETE chunk: {chunk_id}") - if not isValidUuid(chunk_id, "Chunk"): + if not isValidUuid(chunk_id, obj_class="chunks"): msg = f"Invalid chunk id: {chunk_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index a039a911..285617f6 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -497,7 +497,7 @@ async def PUT_Value(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -784,7 +784,7 @@ async def GET_Value(request): msg = "Missing 
dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -1062,7 +1062,7 @@ async def POST_Value(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/ctype_dn.py b/hsds/ctype_dn.py index 0b14ab41..834e02cb 100755 --- a/hsds/ctype_dn.py +++ b/hsds/ctype_dn.py @@ -95,6 +95,11 @@ async def POST_Datatype(request): log.error("Unexpected type_id: {ctype_id}") raise HTTPInternalServerError() + deleted_ids = app["deleted_ids"] + if ctype_id in deleted_ids: + log.warn(f"POST Dataset has id: {ctype_id} that has previously been deleted") + deleted_ids.remove(ctype_id) + # verify the id doesn't already exist obj_found = await check_metadata_obj(app, ctype_id, bucket=bucket) if obj_found: diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index 83f581a6..dfa96f98 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -51,7 +51,7 @@ async def GET_Datatype(request): include_attrs = True if ctype_id: - if not isValidUuid(ctype_id, "datatypes"): + if not isValidUuid(ctype_id, obj_class="datatypes"): msg = f"Invalid type id: {ctype_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -62,7 +62,7 @@ async def GET_Datatype(request): group_id = None if "grpid" in params: group_id = params["grpid"] - if not isValidUuid(group_id, "groups"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid parent group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -103,7 +103,7 @@ async def GET_Datatype(request): # throws 404 if not found kwargs = {"bucket": bucket, "domain": domain} ctype_id, domain, _ = await getObjectIdByPath(app, group_id, h5path, **kwargs) - if not isValidUuid(ctype_id, "datatypes"): + if not isValidUuid(ctype_id, obj_class="datatypes"): msg = f"No datatype exist with the path: {h5path}" log.warn(msg) raise HTTPGone() @@ -280,7 +280,7 @@ async def DELETE_Datatype(request): msg = "Missing committed type id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(ctype_id, "datatypes"): + if not isValidUuid(ctype_id, obj_class="datatypes"): msg = f"Invalid committed type id: {ctype_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index 4436db37..1ef469d5 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -18,12 +18,12 @@ import os.path as op from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound -from aiohttp.web_exceptions import HTTPInternalServerError +from aiohttp.web_exceptions import HTTPInternalServerError, HTTPGone from aiohttp.web_exceptions import HTTPConflict, HTTPServiceUnavailable from aiohttp.web import json_response from h5json.objid import createObjId, getCollectionForId -from h5json.objid import isValidUuid, isSchema2Id +from h5json.objid import isValidUuid, isRootObjId, isSchema2Id from .util.nodeUtil import getNodeCount, getDataNodeUrl from .util.httpUtil import getObjectClass, http_post, http_put, http_delete @@ -99,7 +99,7 @@ async def get_collections(app, root_id, bucket=None, max_objects_limit=None): async def getDomainObjects(app, root_id, include_attrs=False, bucket=None): - """Iterate through all objects in heirarchy and add to obj_dict + 
"""Iterate through all objects in hierarchy and add to obj_dict keyed by obj id """ @@ -754,7 +754,7 @@ async def PUT_Domain(request): username, pswd = getUserPasswordFromRequest(request) await validateUserPassword(app, username, pswd) - # inital perms for owner and default + # initial perms for owner and default owner_perm = { "create": True, "read": True, @@ -858,7 +858,7 @@ async def PUT_Domain(request): if "root" in domain_json: # nothing to update for folders root_id = domain_json["root"] - if not isValidUuid(root_id): + if not isValidUuid(root_id, obj_class="groups"): msg = f"domain: {domain} with invalid root id: {root_id}" log.error(msg) raise HTTPInternalServerError() @@ -985,8 +985,33 @@ async def PUT_Domain(request): if not is_folder and not linked_json: # create a root group for the new domain - root_id = createObjId("groups") - log.debug(f"new root group id: {root_id}") + if body and "root_id" in body: + root_id = body["root_id"] + if not isRootObjId(root_id): + msg = f"invalid client provided root id: {root_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # verify that the group object doesn't already exist + log.debug(f"attempting to fetch root id: {root_id}") + kwargs = { + "refresh": True, + "include_links": False, + "include_attrs": False, + "bucket": bucket, + } + try: + await getObjectJson(app, root_id, **kwargs) + msg = "client specified root_id already exists" + log.warn(msg) + raise HTTPConflict() + except HTTPNotFound: + log.debug(f"root_id: {root_id} not found (expected)") + except HTTPGone: + log.debug(f"root_id: {root_id} has been removed (expected)") + log.debug(f"using client supplied root_id: {root_id}") + else: + root_id = createObjId("groups") + log.debug(f"new root group id: {root_id}") group_json = {"id": root_id, "root": root_id, "domain": domain} log.debug(f"create group for domain, body: {group_json}") diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index 5b28f69c..fc949203 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -98,6 +98,11 @@ async def POST_Dataset(request): log.error(f"Unexpected dataset_id: {dset_id}") raise HTTPInternalServerError() + deleted_ids = app["deleted_ids"] + if dset_id in deleted_ids: + log.warn(f"POST Dataset has id: {dset_id} that has previously been deleted") + deleted_ids.remove(dset_id) + # verify the id doesn't already exist obj_found = await check_metadata_obj(app, dset_id, bucket=bucket) if obj_found: diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index a60f87fd..48ee8609 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -85,7 +85,7 @@ async def GET_Dataset(request): include_attrs = True if dset_id: - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -96,7 +96,7 @@ async def GET_Dataset(request): group_id = None if "grpid" in params: group_id = params["grpid"] - if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid parent group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -140,7 +140,7 @@ async def GET_Dataset(request): # throws 404 if not found kwargs = {"bucket": bucket, "domain": domain} dset_id, domain, _ = await getObjectIdByPath(app, group_id, h5path, **kwargs) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"No dataset exist with the path: {h5path}" log.warn(msg) raise HTTPNotFound() @@ -228,7 +228,7 @@ async def GET_DatasetType(request): 
msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -280,7 +280,7 @@ async def GET_DatasetShape(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -337,7 +337,7 @@ async def PUT_DatasetShape(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -700,7 +700,7 @@ async def DELETE_Dataset(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/group_dn.py b/hsds/group_dn.py index 0a93bed4..e2b69eef 100755 --- a/hsds/group_dn.py +++ b/hsds/group_dn.py @@ -98,6 +98,10 @@ async def POST_Group(request): raise HTTPBadRequest(reason=msg) group_id = get_obj_id(request, body=body) + deleted_ids = app["deleted_ids"] + if group_id in deleted_ids: + log.warn(f"POST Group has id: {group_id} that has previously been deleted") + deleted_ids.remove(group_id) log.info(f"POST group: {group_id} bucket: {bucket} body: {body}") if not isValidUuid(group_id, obj_class="groups"): diff --git a/hsds/group_sn.py b/hsds/group_sn.py index 4d83e5c7..1011b883 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -52,7 +52,7 @@ async def GET_Group(request): if group_id: log.info(f"GET_Group, id: {group_id}") # is the id a group id and not something else? 
- if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -99,7 +99,7 @@ async def GET_Group(request): kwargs = {"bucket": bucket, "domain": domain} group_id, domain, obj_json = await getObjectIdByPath(app, group_id, h5path, **kwargs) - if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"No group exist with the path: {h5path}" log.warn(msg) raise HTTPNotFound() @@ -296,7 +296,7 @@ async def DELETE_Group(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index cbc71b0a..1c17edad 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -1367,7 +1367,7 @@ def getCreateArgs(body, msg = "link can't be used with h5path" log.warn(msg) raise HTTPBadRequest(reason=msg) - # if ingore_link is set, parent_links will be created post object creation + # if ignore_link is set, parent_links will be created post object creation link_body = body["link"] log.debug(f"link_body: {link_body}") if "id" in link_body and not ignore_link: @@ -1417,7 +1417,11 @@ def getCreateArgs(body, if "id" in body: obj_id = body["id"] - # tbd: validate this is a group id + if not isValidUuid(obj_id): + msg = f"Invalid id: {obj_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs["obj_id"] = obj_id log.debug(f"createObject will use client id: {obj_id}") diff --git a/tests/integ/domain_test.py b/tests/integ/domain_test.py index 4f21d44e..1c68cb5e 100755 --- a/tests/integ/domain_test.py +++ b/tests/integ/domain_test.py @@ -13,6 +13,9 @@ import time import json from os import path as pp + +from h5json.objid import createObjId + import config import helper @@ -489,6 +492,94 @@ def testCreateDomain(self): self.assertTrue(k in rspJson) # we should get the same value for root id self.assertEqual(root_id, rspJson["root"]) + + def testCreateDomainWithId(self): + domain = self.base_domain + "/newdomainwithid.h5" + print("testCreateDomainWithId", domain) + headers = helper.getRequestHeaders(domain=domain) + + root_id = createObjId("groups") + body = {"root_id": root_id} + req = helper.getEndpoint() + "/" + + rsp = self.session.put(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + for k in ( + "root", + "owner", + "acls", + "created", + "lastModified", + "version", + "limits", + "compressors", + ): + self.assertTrue(k in rspJson) + + self.assertEqual(rspJson["root"], root_id) + + limit_keys = ("min_chunk_size", "max_chunk_size", "max_request_size") + limits = rspJson["limits"] + for k in limit_keys: + self.assertTrue(k in limits) + limit = limits[k] + self.assertTrue(isinstance(limit, int)) + self.assertTrue(limit > 0) + compressors = rspJson["compressors"] + for compressor in EXPECTED_COMPRESSORS: + self.assertTrue(compressor in compressors) + + # do a get on the new domain + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + for k in ( + "root", + "owner", + "class", + "created", + "lastModified", + "limits", + "version", + ): + self.assertTrue(k in rspJson) + # we should get the same value for root id + self.assertEqual(root_id, 
rspJson["root"]) + # should get limits here too + limits = rspJson["limits"] + for k in limit_keys: + self.assertTrue(k in limits) + limit = limits[k] + self.assertTrue(isinstance(limit, int)) + self.assertTrue(limit > 0) + + # verify we can access root groups + root_req = helper.getEndpoint() + "/groups/" + root_id + headers = helper.getRequestHeaders(domain=domain) + rsp = self.session.get(root_req, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # verify that putting the same domain again fails with a 409 error + rsp = self.session.put(req, headers=headers) + self.assertEqual(rsp.status_code, 409) + + # PUT with a different domain name should also give a 409 + # (due to the root_id conflicting) + domain2 = self.base_domain + "/newdomainwithid2.h5" + headers2 = helper.getRequestHeaders(domain=domain2) + rsp = self.session.put(req, data=json.dumps(body), headers=headers2) + self.assertEqual(rsp.status_code, 409) + + # Delete the original domain + headers = helper.getRequestHeaders(domain=domain) + rsp = self.session.delete(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # re-create the domain with the same root id + rsp = self.session.put(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 201) + """ def testCreateDomainWithCustomClass(self): domain = self.base_domain + "/newclassdomain.h6" From 7561534242e745fea750fca7ac037f8dce712b44 Mon Sep 17 00:00:00 2001 From: John Readey Date: Sun, 8 Jun 2025 12:06:45 +0200 Subject: [PATCH 30/49] fix np.frombuffer error --- hsds/attr_sn.py | 6 +----- hsds/chunk_sn.py | 7 +++---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index d3b05ca0..d7f05f75 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -851,11 +851,7 @@ async def PUT_AttributeValue(request): msg += f"but got {len(binary_data)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - arr = np.fromstring(binary_data, dtype=np_dtype) - if attr_shape["class"] == "H5S_SCALAR": - arr = arr.reshape([]) - else: - arr = arr.reshape(np_shape) # conform to selection shape + arr = bytesToArray(binary_data, np_dtype, np_shape) log.debug(f"got array {arr} from binary data") else: try: diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 285617f6..86d7539f 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -657,7 +657,7 @@ async def PUT_Value(request): log.warn(f"bytesToArray value error: {ve}") raise HTTPBadRequest() else: - # fixed item size + # fixed item size - check against number of bytes if len(input_data) % item_size != 0: msg = f"Expected request size to be a multiple of {item_size}, " msg += f"but {len(input_data)} bytes received" @@ -668,8 +668,7 @@ async def PUT_Value(request): msg = f"expected {item_size * num_elements} bytes but got {len(input_data)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - - arr = np.fromstring(input_data, dtype=dset_dtype) + arr = np.frombuffer(input_data, dtype=dset_dtype) log.debug(f"read fixed type array: {arr}") if bc_shape: @@ -1166,7 +1165,7 @@ async def POST_Value(request): log.warn(msg) raise HTTPBadRequest(reason=msg) num_points = request.content_length // point_dt.itemsize - points = np.fromstring(binary_data, dtype=point_dt) + points = np.frombuffer(binary_data, dtype=point_dt) # reshape the data based on the rank (num_points x rank) if rank > 1: if len(points) % rank != 0: From 25c4cf37d2888b800a2a3f8017b92bccd965179e Mon Sep 17 00:00:00 2001 From: John Readey Date: Sun, 8 Jun 2025 13:29:37 +0200 Subject: [PATCH 31/49] fix dsetUtil 
flake errors --- hsds/util/dsetUtil.py | 50 +++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index b259aae9..a1d20cbf 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -19,7 +19,6 @@ from .. import hsds_logger as log from .. import config -#from .chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) CHUNK_MAX = 2048 * 1024 # Hard upper limit (2M) @@ -84,6 +83,7 @@ "H5D_CONTIGUOUS_REF", ) + def get_dset_size(shape_json, typesize): """Return the size of the dataspace. For any unlimited dimensions, assume a value of 1. @@ -106,6 +106,7 @@ def get_dset_size(shape_json, typesize): dset_size *= shape[n] return dset_size + def getFilterItem(key): """ Return filter code, id, and name, based on an id, a name or a code. @@ -121,15 +122,15 @@ def getFilterItem(key): def getFiltersJson(create_props, supported_filters=None): - """ return standardized filter representation from creation properties + """ return standardized filter representation from creation properties raise bad request if invalid """ - + # refer to https://hdf5-json.readthedocs.io/en/latest/bnf/\ # filters.html#grammar-token-filter_list if "filters" not in create_props: return {} # null set - + f_in = create_props["filters"] log.debug(f"filters provided in creation_prop: {f_in}") @@ -189,11 +190,11 @@ def getFiltersJson(create_props, supported_filters=None): msg = f"Unexpected type for filter: {filter}" log.warn(msg) raise HTTPBadRequest(reason=msg) - + # return standardized filter representation log.debug(f"using filters: {f_out}") return f_out - + def getFilters(dset_json): """Return list of filters, or empty list""" @@ -298,7 +299,8 @@ def getFilterOps(app, dset_id, filters, dtype=None, chunk_shape=None): return filter_ops else: return None - + + def getShapeJson(body): """ Return normalized json description of data space """ @@ -310,8 +312,8 @@ def getShapeJson(body): shape_class = "H5S_SCALAR" log.debug("not shape given - using H5S_SCALAR") return {"class": shape_class} - - body_shape = body["shape"] + + body_shape = body["shape"] log.debug(f"got shape: {body_shape}") if isinstance(body_shape, int): @@ -326,11 +328,11 @@ def getShapeJson(body): else: shape_class = "H5S_SIMPLE" dims = body_shape - else: + else: msg = "invalid shape: {body_shape}" log.warn(msg) raise ValueError(msg) - + if shape_class not in ("H5S_NULL", "H5S_SCALAR", "H5S_SIMPLE"): msg = f"invalid shape class: {shape_class}" log.warn(msg) @@ -386,7 +388,7 @@ def getShapeJson(body): msg = "max_dims rank doesn't match dims" log.warn(msg) raise ValueError(msg) - + # return json description of shape shape_json = {"class": shape_class} if shape_class == "H5S_SIMPLE": @@ -396,6 +398,7 @@ def getShapeJson(body): log.debug(f"returning shape_json: {shape_json}") return shape_json + def getShapeClass(data_shape): """ Return shape class of the given data shape """ @@ -404,11 +407,12 @@ def getShapeClass(data_shape): if "class" not in data_shape: raise KeyError("expected 'class' key for data shape")\ - + return data_shape["class"] + def getRank(data_shape): - """ Return rank of given data shape_json """ + """ Return rank of given data shape_json """ shape_class = getShapeClass(data_shape) @@ -423,6 +427,7 @@ def getRank(data_shape): else: raise ValueError(f"unexpected data shape class: {shape_class}") + def getDsetRank(dset_json): """Get rank returning 0 for scalar or NULL data shapes""" 
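# Per the docstring: H5S_SCALAR and H5S_NULL shapes report rank 0 here,
# while an H5S_SIMPLE shape such as {"class": "H5S_SIMPLE", "dims": [10, 20]}
# has the rank of its dims list (here 2).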
data_shape = dset_json["shape"] @@ -445,7 +450,7 @@ def isScalarSpace(dset_json): shape_class = getShapeClass(data_shape) if shape_class == "H5S_NULL": return False - + rank = getRank(data_shape) return True if rank == 0 else False @@ -458,7 +463,7 @@ def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): msg = "ContiguousLayout can only be used with fixed-length types" log.warn(msg) raise ValueError(msg) - + if chunk_min is None: msg = "chunk_min not set" log.warn(msg) @@ -470,7 +475,7 @@ def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): if chunk_max < chunk_min: raise ValueError("chunk_max cannot be less than chunk_min") - + if shape_json is None or shape_json["class"] == "H5S_NULL": return None if shape_json["class"] == "H5S_SCALAR": @@ -507,6 +512,7 @@ def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): return layout + def getChunkSize(layout, type_size): """Return chunk size given layout. i.e. just the product of the values in the list. @@ -521,6 +527,7 @@ def getChunkSize(layout, type_size): chunk_size *= n return chunk_size + def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): """ Use chunk layout given in the creationPropertiesList (if defined and @@ -668,7 +675,7 @@ def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): msg = f"Invalid chunk table id: {chunk_table_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) - + elif layout_class == "H5D_CHUNKED": if "dims" not in layout: msg = "dims key not found in layout for creation property list" @@ -695,7 +702,8 @@ def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): msg = f"Unexpected layout: {layout_class}" log.warn(msg) raise ValueError(msg) - + + def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class="H5D_CHUNKED"): """Compute an increased chunk shape with a size in bytes greater than chunk_min.""" if shape_json is None or shape_json["class"] == "H5S_NULL": @@ -833,7 +841,7 @@ def guessChunk(shape_json, typesize): def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, chunk_max=None): """ Get the layout json given by creation_props. 
Raise bad request error if invalid """ - + min_chunk_size = int(config.get("min_chunk_size")) max_chunk_size = int(config.get("max_chunk_size")) @@ -853,7 +861,7 @@ def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, ch layout_props = creation_props["layout"] else: layout_props = None - + if layout_props: if "class" not in layout_props: msg = "expected class key in layout props" From 5cc77e7843e573f01ac794cb961573ed1b6e862e Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 14 Jul 2025 20:57:19 +0100 Subject: [PATCH 32/49] expanded link test --- hsds/link_sn.py | 11 +++++++++++ hsds/servicenode_lib.py | 7 +++++-- hsds/util/dsetUtil.py | 2 +- hsds/util/linkUtil.py | 3 ++- tests/integ/link_test.py | 17 ++++++++++++++++- tests/integ/value_test.py | 4 +--- 6 files changed, 36 insertions(+), 8 deletions(-) diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 938f78c2..8b90af12 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -336,6 +336,16 @@ async def PUT_Links(request): msg = "Unable to load JSON body" log.warn(msg) raise HTTPBadRequest(reason=msg) + + if not body: + msg = "PUT links with empty body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if not isinstance(body, dict): + msg = f"PUT links expected dictionary body but got: {type(body)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) domain = getDomainFromRequest(request) if not isValidDomain(domain): @@ -432,6 +442,7 @@ async def PUT_Links(request): link_item = link_items[title] getLinkClass(link_item) except ValueError: + log.warn(f"invalid link for {title}: {link_item}") raise HTTPBadRequest(reason="invalid link item") grp_ids[grp_id] = link_items diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 1c17edad..65214aa0 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -1020,10 +1020,13 @@ async def getAttributeFromRequest(app, req_json, obj_id=None, bucket=None): created = req_json["created"] # allow "pre-dated" attributes if the timestamp is within the last 10 seconds predate_max_time = config.get("predate_max_time", default=10.0) - if now - created > predate_max_time: + if now - created < predate_max_time: attr_item["created"] = created else: - log.warn("stale created timestamp for attribute, ignoring") + msg = "stale created timestamp for attribute, ignoring " + msg += f"predate config: {predate_max_time:6.2f} " + msg += f"age: {(now - created):6.2f}" + log.warn(msg) if "created" not in attr_item: attr_item["created"] = now diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index a1d20cbf..5bf3afc5 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -329,7 +329,7 @@ def getShapeJson(body): shape_class = "H5S_SIMPLE" dims = body_shape else: - msg = "invalid shape: {body_shape}" + msg = f"invalid shape: {body_shape}" log.warn(msg) raise ValueError(msg) diff --git a/hsds/util/linkUtil.py b/hsds/util/linkUtil.py index 0090b045..d0063a39 100644 --- a/hsds/util/linkUtil.py +++ b/hsds/util/linkUtil.py @@ -33,6 +33,7 @@ def validateLinkName(name): def getLinkClass(link_json): """ verify this is a valid link returns the link class """ + log.debug(f"getLinkClass({link_json})") if "class" in link_json: link_class = link_json["class"] else: @@ -183,7 +184,7 @@ def getRequestLink(title, link_json, predate_max_time=0.0): if "created" in link_json: created = link_json["created"] # allow "pre-dated" attributes if recent enough - if now - created > predate_max_time: + if now - created < predate_max_time: link_item["created"] = created else: 
log.warn("stale created timestamp for link, ignoring") diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index 7c909435..801f5d0c 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -68,6 +68,14 @@ def testHardLink(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 404) # link doesn't exist yet + # try creating link with no body + rsp = self.session.put(req, headers=headers) + self.assertEqual(rsp.status_code, 400) + + # try creating link with no items + rsp = self.session.put(req, headers=headers, data=json.dumps({})) + self.assertEqual(rsp.status_code, 400) + # try creating a link with a different user (should fail) if test_user2: headers = helper.getRequestHeaders(domain=domain, username=test_user2) @@ -1481,7 +1489,14 @@ def testPutLinkMultiple(self): links = {} for i in range(grp_count): title = grp_names[i] - links[title] = {"id": grp_ids[i]} + if i%2 == 0: + # create a hardlink implicitly + links[title] = {"id": grp_ids[i]} + else: + # for variety, create a hardlink by providing full link json + links[title] = {"class": "H5L_TYPE_HARD", "id": grp_ids[i]} + print("putLinkMulti:", links[title]) + self.assertTrue(False) # add a soft and external link as well links["softlink"] = {"h5path": "a_path"} diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index 1f34a99e..293c625d 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -111,9 +111,7 @@ def testPut1DDataset(self): rspJson = json.loads(rsp.text) self.assertTrue("hrefs" in rspJson) self.assertTrue("value" in rspJson) - expect_value = [ - 0, - ] + expect_value = [0, ] expect_value *= data["shape"] self.assertEqual(rspJson["value"], expect_value) From 45f3aa5b934227224bf19226713fc4521e40c74a Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 16 Jul 2025 20:33:11 +0100 Subject: [PATCH 33/49] added config to test high latency storage --- admin/config/config.yml | 3 ++- hsds/util/fileClient.py | 23 ++++++++++++++++++++++- tests/integ/link_test.py | 4 +--- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/admin/config/config.yml b/admin/config/config.yml index 6e92d65b..998d3f36 100755 --- a/admin/config/config.yml +++ b/admin/config/config.yml @@ -70,7 +70,7 @@ admin_group: null # enable admin privileges for any user in this group openid_provider: azure # OpenID authentication provider openid_url: null # OpenID connect endpoint if provider is not azure or google openid_audience: null # OpenID audience. This is synonymous with azure_resource_id for azure -openid_claims: unique_name,appid,roles # Comma seperated list of claims to resolve to usernames. +openid_claims: unique_name,appid,roles # Comma separated list of claims to resolve to usernames. chaos_die: 0 # if > 0, have nodes randomly die after n seconds (for testing) standalone_app: false # True when run as a single application blosc_nthreads: 2 # number of threads to use for blosc compression. 
Set to 0 to have blosc auto-determine thread count @@ -89,6 +89,7 @@ allow_any_bucket_write: true # enable writes to buckets other than default bucke bit_shuffle_default_blocksize: 2048 # default blocksize for bitshuffle filter max_rangeget_gap: 1024 # max gap in byte for intelligent range get requests predate_maxtime: 10.0 # max delta between object created timestamp in request and actual time +posix_delay: 0.0 # delay for POSIX IO operations for simulating cloud storage latencies # DEPRECATED - the remaining config values are not used in currently but kept for backward compatibility with older container images aws_lambda_chunkread_function: null # name of aws lambda function for chunk reading aws_lambda_threshold: 4 # number of chunks per node per request to reach before using lambda diff --git a/hsds/util/fileClient.py b/hsds/util/fileClient.py index feebe2c1..0d7d88ba 100644 --- a/hsds/util/fileClient.py +++ b/hsds/util/fileClient.py @@ -173,6 +173,12 @@ async def get_object(self, key, bucket=None, offset=0, length=-1): msg = f"Unexpected Exception {type(e)} get get_object {key}: {e}" log.error(msg) raise HTTPInternalServerError() + + posix_delay = config.get("posix_delay", default=0.0) + if posix_delay > 0.0: + log.warn(f"posix_delay for get_object, sleep for: {posix_delay}") + await asyncio.sleep(posix_delay) + return data def _mkdir(self, dirpath): @@ -254,6 +260,12 @@ async def put_object(self, key, data, bucket=None): msg = f"fileClient.put_object {key} complete, " msg += f"write_rsp: {write_rsp}" log.debug(msg) + + posix_delay = config.get("posix_delay", default=0.0) + if posix_delay > 0.0: + log.warn(f"posix_delay for put_object, sleep for: {posix_delay}") + await asyncio.sleep(posix_delay) + return write_rsp async def delete_object(self, key, bucket=None): @@ -294,7 +306,11 @@ async def delete_object(self, key, bucket=None): msg = f"Unexpected Exception {type(e)} deleting file obj {key}: {e}" log.error(msg) raise HTTPInternalServerError() - await asyncio.sleep(0) # for async compat + + posix_delay = config.get("posix_delay", default=0.0) + if posix_delay > 0.0: + log.warn(f"posix_delay for delete_object , sleep for: {posix_delay}") + await asyncio.sleep(posix_delay) # for async compat async def is_object(self, key, bucket=None): self._validateBucket(bucket) @@ -429,6 +445,11 @@ async def list_keys( msg == f"got {len(key_names)}" log.warning(msg) + posix_delay = config.get("posix_delay", default=0.0) + if posix_delay > 0.0: + log.warn(f"posix_delay for list_keys, sleep for: {posix_delay}") + await asyncio.sleep(posix_delay) + return key_names async def releaseClient(self): diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index 801f5d0c..a796155f 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -1489,14 +1489,12 @@ def testPutLinkMultiple(self): links = {} for i in range(grp_count): title = grp_names[i] - if i%2 == 0: + if i % 2 == 0: # create a hardlink implicitly links[title] = {"id": grp_ids[i]} else: # for variety, create a hardlink by providing full link json links[title] = {"class": "H5L_TYPE_HARD", "id": grp_ids[i]} - print("putLinkMulti:", links[title]) - self.assertTrue(False) # add a soft and external link as well links["softlink"] = {"h5path": "a_path"} From ff1c04367063662e633849b1f49c90e5ff76f2a2 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 22 Jul 2025 12:50:43 +0100 Subject: [PATCH 34/49] added put_data action for DomainCrawler --- admin/config/config.yml | 1 + hsds/domain_crawl.py | 55 
+++++++++++++++++++++++++++++++++++++++++ hsds/dset_dn.py | 2 +- hsds/dset_sn.py | 47 +++++++++++++++++++++++++++++---- hsds/link_sn.py | 4 +-- 5 files changed, 96 insertions(+), 13 deletions(-) diff --git a/admin/config/config.yml b/admin/config/config.yml index 998d3f36..4b9f40d5 100755 --- a/admin/config/config.yml +++ b/admin/config/config.yml @@ -90,6 +90,7 @@ bit_shuffle_default_blocksize: 2048 # default blocksize for bitshuffle filter max_rangeget_gap: 1024 # max gap in byte for intelligent range get requests predate_maxtime: 10.0 # max delta between object created timestamp in request and actual time posix_delay: 0.0 # delay for POSIX IO operations for simulating cloud storage latencies +max_compact_dset_size: 65536 # size in bytes for maximum compact storage size # DEPRECATED - the remaining config values are not used in currently but kept for backward compatibility with older container images aws_lambda_chunkread_function: null # name of aws lambda function for chunk reading aws_lambda_threshold: 4 # number of chunks per node per request to reach before using lambda diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 35b20bf9..5c1fe13a 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -19,9 +19,10 @@ from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone from h5json.objid import getCollectionForId +from h5json.array_util import arrayToBytes from .util.nodeUtil import getDataNodeUrl -from .util.httpUtil import isOK +from .util.httpUtil import isOK, http_put from .util.globparser import globmatch from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks, putLinks from . import hsds_logger as log @@ -233,7 +234,7 @@ async def put_attributes(self, obj_id, attr_items): try: status = await putAttributes(self._app, obj_id, attr_items, **kwargs) except HTTPConflict: - log.warn("DomainCrawler - got HTTPConflict from http_put") + log.warn("DomainCrawler - got HTTPConflict from putAttributes") status = 409 except HTTPServiceUnavailable: status = 503 @@ -419,8 +420,10 @@ async def put_links(self, grp_id, link_items): log.warn("DomainCrawler - got HTTPConflict from http_put") status = 409 except HTTPServiceUnavailable: + log.warn("DomainCrawler - got HTTPServiceUnavailable exception") status = 503 except HTTPInternalServerError: + log.warn("DomainCrawler - got 500 error from DN") status = 500 except Exception as e: log.error(f"unexpected exception {e}") @@ -428,6 +431,38 @@ async def put_links(self, grp_id, link_items): log.debug(f"DomainCrawler fetch for {grp_id} - returning status: {status}") self._obj_dict[grp_id] = {"status": status} + async def put_data(self, chunk_id, arr): + # write a one-chunk dataset value + log.debug(f"DomainCrawler put_data for {chunk_id}, arr: {arr}") + req = getDataNodeUrl(self._app, chunk_id) + req += "/chunks/" + chunk_id + log.debug(f"put_data req: {req}") + params = {"bucket": self._bucket} + + data = arrayToBytes(arr) + + log.debug(f"DomainCrawler - put_data req: {req}, {len(data)} bytes") + + try: + # TBD: setup an http client?
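# Note: as first written here, the success path never assigns `status`, so
# the log.debug call after this try block would raise UnboundLocalError;
# PATCH 35 below ("fix for hang in DomainCrawler put_data handler") corrects
# this by capturing the http_put response and setting status = 200.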
+ await http_put(self._app, req, data=data, params=params, client=None) + log.debug("http_put return") + except HTTPConflict: + log.warn("DomainCrawler - got HTTPConflict from http_put") + status = 409 + except HTTPServiceUnavailable: + log.warn("DomainCrawler - got HTTPServiceUnavailable exception") + status = 503 + except HTTPInternalServerError: + log.warn("DomainCrawler - got 500 error from DN") + status = 500 + except Exception as e: + log.error(f"unexpected exception {e}") + status = 500 + + log.debug(f"DomainCrawler put_data for {chunk_id} - returning status: {status}") + self._obj_dict[chunk_id] = {"status": status} + def get_status(self): """ return the highest status of any of the returned objects """ status = None @@ -528,7 +563,7 @@ async def fetch(self, obj_id): await self.put_attributes(obj_id, attr_items) elif self._action == "get_link": - log.debug("DomainCrawlwer - get links") + log.debug("DomainCrawler - get links") log.debug(f"self._objs: {self._objs}, type: {type(self._objs)}") if self._objs is None or obj_id not in self._objs: @@ -548,7 +583,7 @@ async def fetch(self, obj_id): log.debug(f"DomainCrawler - get link titles: {link_titles}") await self.get_links(obj_id, link_titles) elif self._action == "put_link": - log.debug("DomainCrawlwer - put links") + log.debug("DomainCrawler - put links") # write links if self._objs and obj_id not in self._objs: log.error(f"couldn't find {obj_id} in self._objs") @@ -557,11 +592,21 @@ async def fetch(self, obj_id): log.debug(f"got {len(link_items)} link items for {obj_id}") await self.put_links(obj_id, link_items) + elif self._action == "put_data": + log.debug("DomainCrawler - put data") + # write one chunk per dataset + if self._objs and obj_id not in self._objs: + log.error(f"couldn't find {obj_id} in self._objs") + return + data = self._objs[obj_id] + log.debug(f"got {data} data for {obj_id}") + + await self.put_data(obj_id, data) else: msg = f"DomainCrawler: unexpected action: {self._action}" log.error(msg) msg = f"DomainCrawler - fetch complete obj_id: {obj_id}, " - msg += f"{len(self._obj_dict)} objects found" + msg += f"{len(self._obj_dict)} objects processed" log.debug(msg) log.debug(f"obj_dict: {len(self._obj_dict)} items") diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index fc949203..b2b640e4 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -137,7 +137,7 @@ async def POST_Dataset(request): if "layout" in body: layout = body["layout"] # client specified chunk layout - # ok - all set, create committed type obj + # ok - all set, create dataset obj now = getNow(app) if "attributes" in body: diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 48ee8609..b5c912c8 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -23,7 +23,8 @@ from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse, getBooleanParam -from .util.dsetUtil import getPreviewQuery, getShapeDims +from .util.chunkUtil import getChunkIds +from .util.dsetUtil import getPreviewQuery, getShapeDims, getChunkLayout, getDatasetLayoutClass from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain @@ -31,7 +32,7 @@ from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo from .servicenode_lib import getDatasetCreateArgs, createDataset, deleteObject -from .dset_lib import updateShape, 
deleteAllChunks, doHyperslabWrite +from .dset_lib import updateShape, deleteAllChunks from .post_crawl import createDatasets from .domain_crawl import DomainCrawler from . import hsds_logger as log @@ -605,12 +606,35 @@ def _updateInitValuesList(kwargs): raise HTTPInternalServerError() # write any init data values + init_chunks = {} for index in range(obj_count): init_data = init_values[index] if init_data is None: - continue + continue # no data to initialize dset_json = objects[index] + dset_id = dset_json["id"] log.debug(f"init value, post_rsp: {dset_json}") + layout_class = getDatasetLayoutClass(dset_json) + log.debug(f"layout_class: {layout_class}") + if layout_class != "H5D_CHUNKED": + msg = f"dataset init_data used with unsupported layout_class: {layout_class}" + log.error(msg) + raise HTTPInternalServerError() + layout_dims = getChunkLayout(dset_json) + log.debug(f"init data layout is: {layout_dims}") + # make selection for entire dataspace + dims = getShapeDims(dset_json["shape"]) + slices = [] + for dim in dims: + s = slice(0, dim, 1) + slices.append(s) + chunk_ids = getChunkIds(dset_id, slices, layout_dims) + log.debug(f"init data, got chunk_ids: {chunk_ids}") + if not chunk_ids or len(chunk_ids) != 1: + msg = f"expected one chunk for init_data but got: {chunk_ids}" + log.error(msg) + raise HTTPInternalServerError() + chunk_id = chunk_ids[0] shape_json = dset_json["shape"] type_json = dset_json["type"] arr_dtype = createDataType(type_json) @@ -627,7 +651,8 @@ def _updateInitValuesList(kwargs): log.warn(f"IndexError: {msg}") raise HTTPBadRequest(reason=msg) log.debug(f"got json arr: {input_arr.shape}") - + init_chunks[chunk_id] = input_arr + """ # write data if provided log.debug(f"write input_arr: {input_arr}") # make selection for entire dataspace dims = getShapeDims(shape_json) slices = [] for dim in dims: s = slice(0, dim, 1) slices.append(s) - # make a one page list to handle the write in one chunk crawler run + #make a one page list to handle the write in one chunk crawler run # (larger write request should user binary streaming) kwargs = {"page_number": 0, "page": slices} kwargs["dset_json"] = dset_json @@ -645,6 +670,18 @@ def _updateInitValuesList(kwargs): kwargs["bucket"] = bucket kwargs["select_dtype"] = input_arr.dtype kwargs["data"] = input_arr # do write await doHyperslabWrite(app, request, **kwargs) + """ + if init_chunks: + # write dataset init values using the Domain Crawler + log.debug(f"POST dataset - setting init values: {list(init_chunks.keys())}") + kwargs = {"action": "put_data", "bucket": bucket} + + crawler = DomainCrawler(app, init_chunks, **kwargs) + + # will raise exception on not found, server busy, etc.
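# init_chunks maps each dataset's single chunk id to the numpy array of
# initial values; the crawler issues one put_data request per entry and
# get_status() reduces the per-chunk results to the highest (worst) status.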
+ await crawler.crawl() + status = crawler.get_status() + log.info(f"DomainCrawler done for put_data action, status: {status}") if "objects" in post_rsp: # add any links in multi request diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 8b90af12..94139ce5 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -336,12 +336,12 @@ async def PUT_Links(request): msg = "Unable to load JSON body" log.warn(msg) raise HTTPBadRequest(reason=msg) - + if not body: msg = "PUT links with empty body" log.warn(msg) raise HTTPBadRequest(reason=msg) - + if not isinstance(body, dict): msg = f"PUT links expected dictionary body but got: {type(body)}" log.warn(msg) From cda56cf819767e94fdfc971bbab2c27be1d3f87f Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Jul 2025 11:08:45 +0100 Subject: [PATCH 35/49] fix for hang in DomainCrawler put_data handler --- hsds/domain_crawl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 5c1fe13a..15dc2c6d 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -442,11 +442,10 @@ async def put_data(self, chunk_id, arr): data = arrayToBytes(arr) log.debug(f"DomainCrawler - put_data req: {req}, {len(data)} bytes") - try: - # TBD: setup an http client? - await http_put(self._app, req, data=data, params=params, client=None) - log.debug("http_put return") + rsp = await http_put(self._app, req, data=data, params=params) + log.debug(f"http_put return: {rsp}") + status = 200 except HTTPConflict: log.warn("DomainCrawler - got HTTPConflict from http_put") status = 409 @@ -459,6 +458,8 @@ async def put_data(self, chunk_id, arr): except Exception as e: log.error(f"unexpected exception {e}") status = 500 + finally: + log.debug("DomainCrawler put_data end try") log.debug(f"DomainCrawler put_data for {chunk_id} - returning status: {status}") self._obj_dict[chunk_id] = {"status": status} From 5a2d4d682ba4ad798d179935bad99c843f2b9474 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 23 Jul 2025 12:50:15 +0100 Subject: [PATCH 36/49] reduce log verbosity --- hsds/domain_crawl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 15dc2c6d..19eee5df 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -600,7 +600,7 @@ async def fetch(self, obj_id): log.error(f"couldn't find {obj_id} in self._objs") return data = self._objs[obj_id] - log.debug(f"got {data} data for {obj_id}") + log.debug(f"got {len(data)} data for {obj_id}") await self.put_data(obj_id, data) else: From 053395c5bf41238cc6d157ca7846bfcf9609dcd1 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 29 Jul 2025 11:10:16 +0100 Subject: [PATCH 37/49] fix for regression with h5pyd master branch --- hsds/dset_sn.py | 20 +----------------- hsds/group_sn.py | 1 + hsds/servicenode_lib.py | 45 ++++++++++++++++++++++------------------- 3 files changed, 26 insertions(+), 40 deletions(-) diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index b5c912c8..84a983c9 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -652,25 +652,7 @@ def _updateInitValuesList(kwargs): raise HTTPBadRequest(reason=msg) log.debug(f"got json arr: {input_arr.shape}") init_chunks[chunk_id] = input_arr - """ - # write data if provided - log.debug(f"write input_arr: {input_arr}") - # make selection for entire dataspace - dims = getShapeDims(shape_json) - slices = [] - for dim in dims: - s = slice(0, dim, 1) - slices.append(s) - #make a one page list to handle the write in one chunk crawler run - # 
(larger write request should user binary streaming) - kwargs = {"page_number": 0, "page": slices} - kwargs["dset_json"] = dset_json - kwargs["bucket"] = bucket - kwargs["select_dtype"] = input_arr.dtype - kwargs["data"] = input_arr - # do write - await doHyperslabWrite(app, request, **kwargs) - """ + if init_chunks: # write dataset init values using the Domain Crawler log.debug(f"POST dataset - setting init values: {list(init_chunks.keys())}") diff --git a/hsds/group_sn.py b/hsds/group_sn.py index 1011b883..991b50bd 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -233,6 +233,7 @@ async def POST_Group(request): kwargs = {"root_id": root_id, "bucket": bucket} if post_rsp is None: + log.debug(f"post_rsp is None, call createGroup with kwargs: {kwargs}") # Handle cases other than multi-group create here if "type" in kwargs: msg = "type key is not allowed for Group creation" diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 65214aa0..a36eaeac 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -1357,6 +1357,8 @@ def getCreateArgs(body, """ get args for createObject from request body """ log.debug(f"getCreateArgs with body keys: {list(body.keys())}") + if ignore_link: + log.debug("getCreateArgs, ignore_link is set") kwargs = {"bucket": bucket} predate_max_time = config.get("predate_max_time", default=10.0) @@ -1365,9 +1367,29 @@ def getCreateArgs(body, obj_id = None h5path = None + if "parent_id" not in body: + parent_id = root_id + else: + parent_id = body["parent_id"] + + if "h5path" in body: + h5path = body["h5path"] + # normalize the h5path + if h5path.startswith("/"): + if parent_id == root_id: + # just adjust the path to be relative + h5path = h5path[1:] + else: + msg = f"PostCrawler expecting relative h5path, but got: {h5path}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if h5path.endswith("/"): + h5path = h5path[:-1] # makes iterating through the links a bit easier + if "link" in body: if "h5path" in body: - msg = "link can't be used with h5path" + msg = "'link' key in body can't be used with h5path" log.warn(msg) raise HTTPBadRequest(reason=msg) # if ignore_link is set, parent_links will be created post object creation @@ -1389,26 +1411,7 @@ def getCreateArgs(body, log.debug(f"parent id: {parent_id}, link_title: {link_title}") if not ignore_link: h5path = link_title # just use the link name as the h5path - - if "parent_id" not in body: - parent_id = root_id - else: - parent_id = body["parent_id"] - - if "h5path" in body: - h5path = body["h5path"] - # normalize the h5path - if h5path.startswith("/"): - if parent_id == root_id: - # just adjust the path to be relative - h5path = h5path[1:] - else: - msg = f"PostCrawler expecting relative h5path, but got: {h5path}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if h5path.endswith("/"): - h5path = h5path[:-1] # makes iterating through the links a bit easier + log.debug(f"set h5path to {link_title}") if parent_id and h5path: # these are used by createObjectByPath From 78127f11c989480d443b5d1fb0528b01cf3b2e29 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 8 Sep 2025 17:22:51 +0100 Subject: [PATCH 38/49] enable client-based timestamps for attribute and link creation --- admin/config/config.yml | 1 + hsds/attr_dn.py | 18 ++- hsds/link_dn.py | 16 ++- hsds/link_sn.py | 3 + hsds/servicenode_lib.py | 9 +- tests/integ/attr_test.py | 61 +++++++++ tests/integ/link_test.py | 266 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 365 insertions(+), 9 deletions(-) diff --git 
a/admin/config/config.yml b/admin/config/config.yml index 4b9f40d5..a4303edd 100755 --- a/admin/config/config.yml +++ b/admin/config/config.yml @@ -91,6 +91,7 @@ max_rangeget_gap: 1024 # max gap in byte for intelligent range get requests predate_maxtime: 10.0 # max delta between object created timestamp in request and actual time posix_delay: 0.0 # delay for POSIX IO operations for simulating cloud storage latencies max_compact_dset_size: 65536 # size in bytes for maximum compact storage size +max_timestamp_drift: 300 # number of seconds a client-based timestamp can differ from current time # DEPRECATED - the remaining config values are not used in currently but kept for backward compatibility with older container images aws_lambda_chunkread_function: null # name of aws lambda function for chunk reading aws_lambda_threshold: 4 # number of chunks per node per request to reach before using lambda diff --git a/hsds/attr_dn.py b/hsds/attr_dn.py index cb002623..3c640c3f 100755 --- a/hsds/attr_dn.py +++ b/hsds/attr_dn.py @@ -12,7 +12,6 @@ # # attribute handling routines # -import time from bisect import bisect_left from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict, HTTPNotFound, HTTPGone @@ -28,6 +27,8 @@ from .util.dsetUtil import getShapeDims from .util.domainUtil import isValidBucketName from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj +from .util.timeUtil import getNow +from . import config from . import hsds_logger as log @@ -362,6 +363,8 @@ async def PUT_Attributes(request): params = request.rel_url.query log.debug(f"got PUT_Attributes params: {params}") obj_id = get_obj_id(request) + now = getNow(app) + max_timestamp_drift = int(config.get("max_timestamp_drift", default=300)) if not request.has_body: log.error("PUT_Attribute with no body") @@ -459,11 +462,18 @@ async def PUT_Attributes(request): attributes = obj_json["attributes"] - create_time = time.time() # check for conflicts new_attributes = set() # attribute names that are new or replacements for attr_name in items: attribute = items[attr_name] + if attribute.get("created"): + create_time = attribute["created"] + log.debug(f"attribute {attr_name} has create time: {create_time}") + if abs(create_time - now) > max_timestamp_drift: + log.warn(f"attribute {attr_name} create time stale, ignoring") + create_time = now + else: + create_time = now if attr_name in attributes: log.debug(f"attribute {attr_name} exists") old_item = attributes[attr_name] @@ -511,7 +521,7 @@ async def PUT_Attributes(request): if new_attributes: # update the obj lastModified - now = time.time() + now = getNow(app) obj_json["lastModified"] = now # write back to S3, save to metadata cache await save_metadata_obj(app, obj_id, obj_json, bucket=bucket) @@ -610,7 +620,7 @@ async def DELETE_Attributes(request): if save_obj: # update the object lastModified - now = time.time() + now = getNow(app) obj_json["lastModified"] = now await save_metadata_obj(app, obj_id, obj_json, bucket=bucket) diff --git a/hsds/link_dn.py b/hsds/link_dn.py index 1ad6133e..a35acf17 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -27,6 +27,7 @@ from .util.domainUtil import isValidBucketName from .util.timeUtil import getNow from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj +from . import config from . 
import hsds_logger as log @@ -285,6 +286,8 @@ async def PUT_Links(request): params = request.rel_url.query group_id = get_obj_id(request) log.info(f"PUT links: {group_id}") + now = getNow(app) + max_timestamp_drift = int(config.get("max_timestamp_drift", default=300)) if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") @@ -365,11 +368,16 @@ async def PUT_Links(request): link_delete_set = deleted_links[group_id] else: link_delete_set = set() - - create_time = getNow(app) - for title in new_links: item = items[title] + if item.get("created"): + create_time = item["created"] + log.debug(f"link {title} has create time: {create_time}") + if abs(create_time - now) > max_timestamp_drift: + log.warn(f"link {title} create time stale, ignoring") + create_time = now + else: + create_time = now item["created"] = create_time links[title] = item log.debug(f"added link {title}: {item}") @@ -378,7 +386,7 @@ async def PUT_Links(request): if new_links: # update the group lastModified - group_json["lastModified"] = create_time + group_json["lastModified"] = now # write back to S3, save to metadata cache await save_metadata_obj(app, group_id, group_json, bucket=bucket) diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 94139ce5..2048dd7c 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -299,6 +299,9 @@ async def PUT_Link(request): kwargs["tgt_id"] = body.get("id") kwargs["h5path"] = body.get("h5path") kwargs["h5domain"] = body.get("h5domain") + created = body.get("created") + if created: + kwargs["created"] = created status = await putLink(app, group_id, link_title, **kwargs) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index a36eaeac..3e70a71a 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -448,7 +448,12 @@ async def getLink(app, group_id, title, bucket=None): return link_json -async def putLink(app, group_id, title, tgt_id=None, h5path=None, h5domain=None, bucket=None): +async def putLink(app, group_id, title, + tgt_id=None, + h5path=None, + h5domain=None, + bucket=None, + created=None): """ create a new link. Return 201 if this is a new link, or 200 if it's a duplicate of an existing link. 
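Pass tgt_id for a hard link, h5path for a soft link, or h5path together with h5domain for an external link. If created is given, the client-supplied timestamp is forwarded to the DN and used as the link's creation time, subject to the server's max_timestamp_drift check. A sketch of a hard-link call (assuming a valid group id, target id, and bucket): status = await putLink(app, grp_id, "data", tgt_id=dset_id, bucket=bucket)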
""" @@ -469,6 +474,8 @@ async def putLink(app, group_id, title, tgt_id=None, h5path=None, h5domain=None, link_json["h5path"] = h5path if h5domain: link_json["h5domain"] = h5domain + if created: + link_json["created"] = created try: link_class = getLinkClass(link_json) diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index de54c5ea..e986a464 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -12,6 +12,7 @@ from copy import copy import unittest import json +import time import numpy as np import base64 import helper @@ -500,6 +501,66 @@ def testPutFixedString(self): self.assertTrue("length" in type_json) self.assertEqual(type_json["length"], 7) + def testUseTimestamp(self): + # Test PUT value for 1d attribute with timestamp included + print("testUseTimestamp", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + def _create_attr(attr_name, ts=None): + + # create attr + fixed_str_type = { + "charSet": "H5T_CSET_ASCII", + "class": "H5T_STRING", + "length": 12, + "strPad": "H5T_STR_NULLPAD", + } + data = {"type": fixed_str_type, "value": "XYZ"} + if ts: + data["created"] = ts + req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + def _check_attr_ts(attr_name, min_ts=None, max_ts=None): + # read attr + req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], "XYZ") + self.assertTrue("type" in rspJson) + self.assertTrue("created" in rspJson) + if min_ts: + self.assertGreaterEqual(rspJson["created"], min_ts) + if max_ts: + self.assertLessEqual(rspJson["created"], max_ts) + + now = time.time() + # server-based timestamp + _create_attr("a1") + _check_attr_ts("a1", min_ts=(now - 1), max_ts=(now + 1)) + # client assigned timestamp + _create_attr("a2", ts=now) + _check_attr_ts("a2", min_ts=now, max_ts=now) + # client assigned with small time-skew, ok + _create_attr("a3", ts=int(now)) + _check_attr_ts("a3", min_ts=int(now), max_ts=int(now)) + # client assigned with large time-skew, ignored + _create_attr("a4", ts=999) + _check_attr_ts("a4", min_ts=now, max_ts=(now + 1)) + def testPutFixedStringNullTerm(self): # Test PUT value for 1d attribute with fixed length string/null terminated types print("testPutFixedStringNullTerm", self.base_domain) diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index a796155f..a6142aaf 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -1630,6 +1630,220 @@ def testPutLinkMultiple(self): else: self.assertTrue(False) # unexpected + def testPutLinkMultipleWithTimestamps(self): + domain = self.base_domain + "/testPutLinkMultipleWithTImestamps.h5" + helper.setupDomain(domain) + print("testPutLinkMultipleWithTimestamps", domain) + headers = helper.getRequestHeaders(domain=domain) + req = self.endpoint + "/" + + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_id = rspJson["root"] + + # create a 
group + req = self.endpoint + "/groups" + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + grpA_id = rspJson["id"] + self.assertTrue(helper.validateId(grpA_id)) + + # link new obj as '/grpA' + req = self.endpoint + "/groups/" + root_id + "/links/grpA" + payload = {"id": grpA_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # created + + # create some groups under grp1 + grp_count = 3 + + grp_names = [f"grp{(i + 1):04d}" for i in range(grp_count)] + grp_ids = [] + + for grp_name in grp_names: + # create sub_groups + req = self.endpoint + "/groups" + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + grp_id = rspJson["id"] + self.assertTrue(helper.validateId(grp_id)) + grp_ids.append(grp_id) + + # create some links + links = {} + for i in range(grp_count): + title = grp_names[i] + if i % 2 == 0: + # create a hardlink implicitly + links[title] = {"id": grp_ids[i]} + else: + # for variety, create a hardlink by providing full link json + links[title] = {"class": "H5L_TYPE_HARD", "id": grp_ids[i]} + + # add a soft and external link as well + links["softlink"] = {"h5path": "a_path"} + links["extlink"] = {"h5path": "another_path", "h5domain": "/a_domain"} + link_count = len(links) + # add timestamp + timestamps = set() + for title in links: + link = links[title] + now = time.time() + link["created"] = now + timestamps.add(now) + + # write links to the grpA + data = {"links": links} + req = self.endpoint + "/groups/" + grpA_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # do a get on the links + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + ret_links = rspJson["links"] + self.assertEqual(len(ret_links), link_count) + for link in ret_links: + self.assertTrue("title" in link) + title = link["title"] + self.assertTrue("class" in link) + link_class = link["class"] + if link_class == "H5L_TYPE_HARD": + self.assertTrue("id" in link) + self.assertTrue(link["id"] in grp_ids) + self.assertTrue(title in grp_names) + elif link_class == "H5L_TYPE_SOFT": + self.assertTrue("h5path" in link) + h5path = link["h5path"] + self.assertEqual(h5path, "a_path") + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertTrue("h5path" in link) + h5path = link["h5path"] + self.assertEqual(h5path, "another_path") + self.assertTrue("h5domain" in link) + h5domain = link["h5domain"] + self.assertEqual(h5domain, "/a_domain") + else: + self.assertTrue(False) # unexpected + self.assertTrue("created" in link) + self.assertTrue(link["created"] in timestamps) + + # try writing again, should get 200 (no new links) + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + + # write some links to three group objects + links = {} + links["hardlink_multicast"] = {"id": root_id} + links["softlink_multicast"] = {"h5path": "multi_path"} + links["extlink_multicast"] = {"h5path": "multi_path", "h5domain": "/another_domain"} + link_count = len(links) + timestamps = set() + for title in links: + link = links[title] + now = time.time() + link["created"] = now + timestamps.add(now) + + data = {"links": links, "grp_ids": grp_ids} + req = self.endpoint + "/groups/" + root_id + "/links" + rsp = 
self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # check that the links got created + for grp_id in grp_ids: + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + ret_links = rspJson["links"] + self.assertEqual(len(ret_links), 3) + for ret_link in ret_links: + self.assertTrue("class" in ret_link) + link_class = ret_link["class"] + if link_class == "H5L_TYPE_HARD": + self.assertTrue("id" in ret_link) + self.assertEqual(ret_link["id"], root_id) + elif link_class == "H5L_TYPE_SOFT": + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], "multi_path") + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], "multi_path") + self.assertTrue("h5domain" in ret_link) + self.assertEqual(ret_link["h5domain"], "/another_domain") + else: + self.assertTrue(False) # unexpected + self.assertTrue("created" in ret_link) + self.assertTrue(ret_link["created"] in timestamps) + + # write different links to three group objects + link_data = {} + timestamps = set() + for i in range(grp_count): + grp_id = grp_ids[i] + links = {} + links[f"hardlink_{i}"] = {"id": root_id} + links[f"softlink_{i}"] = {"h5path": f"multi_path_{i}"} + ext_link = {"h5path": f"multi_path_{i}", "h5domain": f"/another_domain/{i}"} + links[f"extlink_{i}"] = ext_link + for title in links: + link = links[title] + now = time.time() + link["created"] = now + timestamps.add(now) + link_data[grp_id] = {"links": links} + + data = {"grp_ids": link_data} + req = self.endpoint + "/groups/" + root_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # check that the new links got created + for i in range(grp_count): + grp_id = grp_ids[i] + titles = [f"hardlink_{i}", f"softlink_{i}", f"extlink_{i}", ] + data = {"titles": titles} + # do a post to just return the links we are interested in + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + ret_links = rspJson["links"] + self.assertEqual(len(ret_links), len(titles)) + for j in range(len(titles)): + ret_link = ret_links[j] + self.assertTrue("class" in ret_link) + link_class = ret_link["class"] + self.assertTrue("title" in ret_link) + link_title = ret_link["title"] + if link_class == "H5L_TYPE_HARD": + self.assertEqual(link_title, f"hardlink_{i}") + self.assertTrue("id" in ret_link) + self.assertEqual(ret_link["id"], root_id) + elif link_class == "H5L_TYPE_SOFT": + self.assertEqual(link_title, f"softlink_{i}") + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], f"multi_path_{i}") + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertEqual(link_title, f"extlink_{i}") + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], f"multi_path_{i}") + self.assertTrue("h5domain" in ret_link) + self.assertEqual(ret_link["h5domain"], f"/another_domain/{i}") + else: + self.assertTrue(False) # unexpected + self.assertTrue("created" in ret_link) + self.assertTrue(ret_link["created"] in timestamps) + print(timestamps) + def testDeleteLinkMultiple(self): domain = self.base_domain + "/testDeleteLinkMultiple.h5" 
helper.setupDomain(domain) @@ -1805,6 +2019,58 @@ def testLinkCreationOrder(self): self.assertEqual(prev_link['title'], sorted(link_names)[i]) self.assertEqual(link['title'], sorted(link_names)[i + 1]) + def testUseTimestamp(self): + # Test PUT value for link with timestamp included + domain = self.base_domain + "/testLinkUseTimestamp.h5" + + helper.setupDomain(domain) + print("testUseTimestamp", domain) + headers = helper.getRequestHeaders(domain=domain) + req = helper.getEndpoint() + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + def _create_link(title, ts=None): + # create link + req = helper.getEndpoint() + f"/groups/{root_uuid}/links/{title}" + body = {"h5path": "some_path"} + if ts: + body["created"] = ts + rsp = self.session.put(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 201) + + def _check_link_ts(title, min_ts=None, max_ts=None): + # read link + req = helper.getEndpoint() + f"/groups/{root_uuid}/links/{title}" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("created" in rspJson) + if min_ts: + self.assertGreaterEqual(rspJson["created"], min_ts) + if max_ts: + self.assertLessEqual(rspJson["created"], max_ts) + + now = time.time() + # server-based timestamp + _create_link("a1", ts=None) + _check_link_ts("a1", min_ts=(now - 1), max_ts=(now + 1)) + # client assigned timestamp + _create_link("a2", ts=now) + _check_link_ts("a2", min_ts=now, max_ts=now) + # client assigned with small time-skew, ok + _create_link("a3", ts=int(now)) + _check_link_ts("a3", min_ts=int(now), max_ts=int(now)) + # client assigned with large time-skew, ignored + _create_link("a4", ts=999) + _check_link_ts("a4", min_ts=now, max_ts=(now + 1)) + if __name__ == "__main__": # setup test files From f96b34c4dac935846080ec6da39e89480cd4c8d5 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Sep 2025 10:46:01 +0100 Subject: [PATCH 39/49] remove python 3.9 from .git workflow --- .github/workflows/python-package.yml | 2 +- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7e24ec14..5175ea54 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-22.04, ubuntu-latest, windows-latest] - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] build-method: ["manual", "docker"] runs-on: ${{ matrix.os }} diff --git a/pyproject.toml b/pyproject.toml index 3f1dc4de..8e260cda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ classifiers = [ "Topic :: Database", "Topic :: Software Development :: Libraries :: Python Modules", ] -requires-python = ">=3.8" +requires-python = ">=3.10" version = "0.9.2" dependencies = [ @@ -45,7 +45,7 @@ dependencies = [ "h5json@git+https://github.com/HDFGroup/hdf5-json@abstract", "importlib_resources", "numcodecs <= 0.15.1", - "numpy >=2.0.0rc1; python_version>='3.9'", + "numpy >=2.0.0", "psutil", "pyjwt", "pytz", From 03e413f4d5a3a890c5d83d97ff2fbd4e5bea7348 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Sep 2025 10:53:53 +0100 Subject: [PATCH 40/49] adjust min time for time skew test --- 
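Note on the skew handling exercised by testUseTimestamp above: a "created" value supplied by the client is kept only when it is close to the server's clock; a value that is far off (the ts=999 case) is ignored and the server substitutes its own time. The lower bound in the final assertion is relaxed by one second because the test client's clock and the server's clock may differ slightly. A minimal sketch of this behavior, assuming the link code applies a drift check similar to the max_timestamp_drift config used in attr_dn.py's PUT_Attributes; the helper name and the 300-second default below are illustrative, not the actual server code:

    import time

    def resolve_created(client_ts=None, max_drift=300.0):
        # Illustrative only, not the HSDS implementation.
        now = time.time()
        if client_ts is None:
            return now              # no client value: stamp with server time
        if abs(now - client_ts) <= max_drift:
            return client_ts        # small skew: honor the client-assigned time
        return now                  # large skew (e.g. ts=999): fall back to server time

Under this model a1 gets a server timestamp, a2 and a3 keep their client-assigned values, and a4 is stamped with server time, which is why its expected range is checked against (now - 1, now + 1) after this patch.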
tests/integ/link_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index a6142aaf..511bdf4e 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -2069,7 +2069,7 @@ def _check_link_ts(title, min_ts=None, max_ts=None): _check_link_ts("a3", min_ts=int(now), max_ts=int(now)) # client assigned with large time-skew, ignored _create_link("a4", ts=999) - _check_link_ts("a4", min_ts=now, max_ts=(now + 1)) + _check_link_ts("a4", min_ts=(now - 1), max_ts=(now + 1)) if __name__ == "__main__": From b6016e078229277b0a45ed87e775cd748593805b Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 29 Oct 2025 14:07:06 +0000 Subject: [PATCH 41/49] use hdf5-json util classes --- admin/config/config.yml | 1 + hsds/async_lib.py | 7 +- hsds/attr_dn.py | 10 +- hsds/attr_sn.py | 7 +- hsds/chunk_crawl.py | 7 +- hsds/chunk_dn.py | 11 +- hsds/chunk_sn.py | 10 +- hsds/datanode_lib.py | 69 ++- hsds/dset_dn.py | 20 +- hsds/dset_lib.py | 23 +- hsds/dset_sn.py | 6 +- hsds/servicenode_lib.py | 281 ++++----- hsds/util/dsetUtil.py | 1034 +--------------------------------- tests/integ/dataset_test.py | 321 +++++------ tests/unit/dset_util_test.py | 274 +-------- 15 files changed, 429 insertions(+), 1652 deletions(-) diff --git a/admin/config/config.yml b/admin/config/config.yml index a4303edd..cb46522a 100755 --- a/admin/config/config.yml +++ b/admin/config/config.yml @@ -47,6 +47,7 @@ flush_sleep_interval: 1 # time to wait between checking on dirty objects flush_timeout: 10 # max time to wait on all I/O operations to complete for a flush min_chunk_size: 1m # 1 MB max_chunk_size: 4m # 4 MB +default_vlen_type_size: 128 # guess for average size of variable length types max_request_size: 100m # 100 MB - should be no smaller than client_max_body_size in nginx tmpl (if using nginx) max_chunks_per_folder: 0 # max number of chunks per s3 folder. 0 for unlimiited max_task_count: 100 # maximum number of concurrent tasks per node before server will return 503 error diff --git a/hsds/async_lib.py b/hsds/async_lib.py index 715e7985..9888d3dd 100755 --- a/hsds/async_lib.py +++ b/hsds/async_lib.py @@ -20,12 +20,15 @@ from h5json.array_util import getNumElements, bytesToArray from h5json.objid import isValidUuid, isSchema2Id, getS3Key, isS3ObjKey from h5json.objid import getObjId, isValidChunkId, getCollectionForId +from h5json.filters import getFilters +from h5json.shape_util import getShapeDims +from h5json.dset_util import getDatasetLayoutClass, getDatasetLayout, getChunkDims from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator -from .util.dsetUtil import getHyperslabSelection, getFilterOps, getChunkDims, getFilters -from .util.dsetUtil import getDatasetLayoutClass, getDatasetLayout, getShapeDims +from .util.dsetUtil import getHyperslabSelection from .util.storUtil import getStorKeys, putStorJSONObj, getStorJSONObj from .util.storUtil import deleteStorObj, getStorBytes, isStorObj +from .datanode_lib import getFilterOps from . import hsds_logger as log from . 
import config import time diff --git a/hsds/attr_dn.py b/hsds/attr_dn.py index 3c640c3f..43c04232 100755 --- a/hsds/attr_dn.py +++ b/hsds/attr_dn.py @@ -21,10 +21,10 @@ from h5json.hdf5dtype import getItemSize, createDataType from h5json.array_util import arrayToBytes, jsonToArray, decodeData from h5json.array_util import bytesToArray, bytesArrayToList, getNumElements +from h5json.shape_util import getShapeDims from .util.attrUtil import validateAttributeName, isEqualAttr from .util.globparser import globmatch -from .util.dsetUtil import getShapeDims from .util.domainUtil import isValidBucketName from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj from .util.timeUtil import getNow @@ -361,7 +361,7 @@ async def PUT_Attributes(request): log.request(request) app = request.app params = request.rel_url.query - log.debug(f"got PUT_Attributes params: {params}") + log.debug(f"got PUT_Attributes params: {dict(params)}") obj_id = get_obj_id(request) now = getNow(app) max_timestamp_drift = int(config.get("max_timestamp_drift", default=300)) @@ -371,7 +371,7 @@ async def PUT_Attributes(request): raise HTTPBadRequest(message="body expected") body = await request.json() - log.debug(f"got body: {body}") + log.debug(f"PUT_Attributes got body: {body}") if "bucket" in params: bucket = params["bucket"] elif "bucket" in body: @@ -440,8 +440,8 @@ async def PUT_Attributes(request): data = arr.tolist() try: json_data = bytesArrayToList(data) - log.debug(f"converted encoded data to {json_data}") - if attr_shape["class"] == "H5S_SCALAR": + log.debug(f"converted encoded data to '{json_data}'") + if attr_shape["class"] == "H5S_SCALAR" and isinstance(json_data, list): attr_json["value"] = json_data[0] # just store the scalar else: attr_json["value"] = json_data diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index d7f05f75..44346929 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -22,6 +22,7 @@ from h5json.array_util import jsonToArray, getNumElements from h5json.array_util import bytesToArray, arrayToBytes, decodeData, encodeData from h5json.objid import isValidUuid +from h5json.shape_util import getShapeDims from .util.httpUtil import getAcceptType, jsonResponse, getHref, getBooleanParam from .util.globparser import globmatch @@ -29,7 +30,6 @@ from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot from .util.attrUtil import validateAttributeName, getRequestCollectionName -from .util.dsetUtil import getShapeDims from .servicenode_lib import getDomainJson, getAttributeFromRequest, getAttributesFromRequest from .servicenode_lib import getAttributes, putAttributes, deleteAttributes, validateAction @@ -358,7 +358,7 @@ async def PUT_Attribute(request): kwargs = {"bucket": bucket} if "replace" in params and params["replace"]: # allow attribute to be overwritten - log.debug("setting replace for PUT Atttribute") + log.debug("setting replace for PUT Attribute") kwargs["replace"] = True else: log.debug("replace is not set for PUT Attribute") @@ -819,7 +819,7 @@ async def PUT_AttributeValue(request): log.debug("PUT AttributeValue - request_type is binary") request_type = "binary" elif "application/json" in content_type: - log.debug("PUT AttribueValue - request type is json") + log.debug("PUT AttributeValue - request type is json") else: msg = f"Unknown content_type: {content_type}" log.warn(msg) @@ -896,6 +896,7 @@ async def PUT_AttributeValue(request): attr_body["value"] = data.decode("ascii") attr_body["encoding"] = "base64" 
attr_json = {attr_name: attr_body} + log.debug(f"putting attr {attr_name} to DN: {attr_json}") kwargs = {"bucket": bucket, "replace": True} diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index 47b4b114..a92bdf36 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -27,12 +27,13 @@ from h5json.hdf5dtype import createDataType from h5json.array_util import jsonToArray, getNumpyValue from h5json.array_util import getNumElements, arrayToBytes, bytesToArray +from h5json.shape_util import getShapeDims +from h5json.dset_util import getChunkDims from .util.nodeUtil import getDataNodeUrl, getNodeCount from .util.httpUtil import http_get, http_put, http_post, get_http_client from .util.httpUtil import isUnixDomainUrl -from .util.dsetUtil import getSliceQueryParam, getShapeDims -from .util.dsetUtil import getSelectionShape, getChunkLayout +from .util.dsetUtil import getSliceQueryParam, getSelectionShape from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getChunkIdForPartition, getQueryDtype @@ -108,7 +109,7 @@ async def write_chunk_hyperslab( log.debug(f"setting fields_param to: {fields_param}") params["fields"] = fields_param - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) log.debug(f"getChunkCoverage({chunk_id}, {slices}, {layout})") chunk_sel = getChunkCoverage(chunk_id, slices, layout) if chunk_sel is None: diff --git a/hsds/chunk_dn.py b/hsds/chunk_dn.py index 02545b85..839da5ac 100644 --- a/hsds/chunk_dn.py +++ b/hsds/chunk_dn.py @@ -23,11 +23,12 @@ from h5json.hdf5dtype import createDataType, getSubType from h5json.array_util import bytesToArray, arrayToBytes, getBroadcastShape from h5json.objid import getS3Key, isValidUuid +from h5json.shape_util import getShapeDims +from h5json.dset_util import getChunkDims from .util.httpUtil import request_read, getContentType from .util.storUtil import isStorObj, deleteStorObj -from .util.dsetUtil import getSelectionList, getChunkLayout, getShapeDims -from .util.dsetUtil import getSelectionShape, getChunkInitializer +from .util.dsetUtil import getSelectionList, getSelectionShape, getChunkInitializer from .util.chunkUtil import getChunkIndex, getDatasetId, chunkQuery from .util.chunkUtil import chunkWriteSelection, chunkReadSelection from .util.chunkUtil import chunkWritePoints, chunkReadPoints @@ -131,7 +132,7 @@ async def PUT_Chunk(request): dset_json = await get_metadata_obj(app, dset_id, bucket=bucket) # TBD - does this work with linked datasets? 
- dims = getChunkLayout(dset_json) + dims = getChunkDims(dset_json) rank = len(dims) type_json = dset_json["type"] @@ -435,7 +436,7 @@ async def GET_Chunk(request): dset_json = await get_metadata_obj(app, dset_id, bucket=bucket) shape_dims = getShapeDims(dset_json["shape"]) log.debug(f"shape_dims: {shape_dims}") - dims = getChunkLayout(dset_json) + dims = getChunkDims(dset_json) log.debug(f"GET_Chunk - got dims: {dims}") # get chunk selection from query params @@ -682,7 +683,7 @@ async def POST_Chunk(request): dset_json = await get_metadata_obj(app, dset_id, bucket=bucket) log.debug(f"get_metadata_obj for {dset_id} returned {dset_json}") - dims = getChunkLayout(dset_json) + dims = getChunkDims(dset_json) rank = len(dims) type_json = dset_json["type"] diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 86d7539f..a1cd3d06 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -28,14 +28,14 @@ from h5json.array_util import bytesArrayToList, jsonToArray, getNumElements, arrayToBytes from h5json.array_util import bytesToArray, squeezeArray, getBroadcastShape from h5json.objid import isValidUuid +from h5json.shape_util import isNullSpace, isScalar, getShapeDims +from h5json.dset_util import getChunkDims, isExtensible, getDsetMaxDims from .util.httpUtil import getHref, getAcceptType, getContentType from .util.httpUtil import request_read, jsonResponse, isAWSLambda from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain -from .util.dsetUtil import isNullSpace, isScalarSpace, get_slices, getShapeDims -from .util.dsetUtil import isExtensible, getSelectionPagination -from .util.dsetUtil import getSelectionShape, getDsetMaxDims, getChunkLayout +from .util.dsetUtil import getSelectionShape, getSelectionPagination, get_slices from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .servicenode_lib import getDsetJson, validateAction from .dset_lib import getSelectionData, getParser, extendShape, doPointWrite, doHyperslabWrite @@ -819,7 +819,7 @@ async def GET_Value(request): log.debug(f"dset shape: {dims}") rank = len(dims) - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) log.debug(f"chunk layout: {layout}") await validateAction(app, domain, dset_id, username, "read") @@ -1103,7 +1103,7 @@ async def POST_Value(request): msg = "POST value not supported for datasets with NULL shape" log.warn(msg) raise HTTPBadRequest(reason=msg) - if isScalarSpace(dset_json): + if isScalar(dset_json): msg = "POST value not supported for datasets with SCALAR shape" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py index 48843a25..9bd2b0a5 100644 --- a/hsds/datanode_lib.py +++ b/hsds/datanode_lib.py @@ -20,11 +20,14 @@ from aiohttp.web_exceptions import HTTPNotFound, HTTPForbidden from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPBadRequest -from h5json.hdf5dtype import createDataType +from h5json.hdf5dtype import createDataType, isVlen from h5json.array_util import arrayToBytes, bytesToArray, jsonToArray +from h5json.filters import getFilters, getCompressionFilter, getShuffleFilter from h5json.objid import getS3Key, isValidUuid from h5json.objid import isValidChunkId, isSchema2Id from h5json.objid import getRootObjId, isRootObjId +from h5json.shape_util import getShapeDims +from h5json.dset_util import getChunkDims from .util.nodeUtil import getDataNodeUrl from .util.storUtil import getStorJSONObj, putStorJSONObj, putStorBytes @@ -33,8 +36,7 @@ from 
.util.domainUtil import isValidDomain, getBucketForDomain from .util.attrUtil import getRequestCollectionName from .util.httpUtil import http_post -from .util.dsetUtil import getChunkLayout, getFilterOps, getShapeDims -from .util.dsetUtil import getChunkInitializer, getSliceQueryParam, getFilters +from .util.dsetUtil import getChunkInitializer, getSliceQueryParam from .util.chunkUtil import getDatasetId, getChunkSelection, getChunkIndex from .util.nodeUtil import validateInPartition from .util.rangegetUtil import ChunkLocation, chunkMunge, getHyperChunkIndex, getHyperChunkFactors @@ -558,6 +560,54 @@ async def delete_metadata_obj(app, obj_id, notify=True, root_id=None, bucket=Non log.debug(f"delete_metadata_obj for {obj_id} done") +def getFilterOps(app, dset_id, filters, dtype=None, chunk_shape=None): + """Get list of filter operations to be used for this dataset""" + filter_map = app["filter_map"] + + if dset_id in filter_map: + return filter_map[dset_id] + + compressionFilter = getCompressionFilter(filters) + + filter_ops = {} + + shuffleFilter = getShuffleFilter(filters) + + if shuffleFilter and not isVlen(dtype): + shuffle_name = shuffleFilter["name"] + if shuffle_name == "shuffle": + filter_ops["shuffle"] = 1 # use regular shuffle + elif shuffle_name == "bitshuffle": + filter_ops["shuffle"] = 2 # use bitshuffle + else: + filter_ops["shuffle"] = 0 # no shuffle + else: + filter_ops["shuffle"] = 0 # no shuffle + + if compressionFilter: + if compressionFilter["class"] == "H5Z_FILTER_DEFLATE": + filter_ops["compressor"] = "zlib" # blosc compressor + else: + if "name" in compressionFilter: + filter_ops["compressor"] = compressionFilter["name"] + else: + filter_ops["compressor"] = "lz4" # default to lz4 + if "level" not in compressionFilter: + filter_ops["level"] = 5 # medium level + else: + filter_ops["level"] = int(compressionFilter["level"]) + + if filter_ops: + # save the chunk shape and dtype + filter_ops["chunk_shape"] = chunk_shape + filter_ops["dtype"] = dtype + filter_map[dset_id] = filter_ops # save + + return filter_ops + else: + return None + + def arange_chunk_init( app, initializer, @@ -588,9 +638,8 @@ def arange_chunk_init( log.warn(msg) raise None - try: - chunk_layout = getChunkLayout(dset_json) - except HTTPInternalServerError: + chunk_layout = getChunkDims(dset_json) + if chunk_layout is None: msg = "non-chunked dataset" log.warning(msg) raise None @@ -714,7 +763,7 @@ async def run_chunk_initializer( dims = getShapeDims(datashape) log.debug(f"dataset shape: {dims}") # get the chunk layout for this dataset - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) log.debug(f"chunk layout: {layout}") rank = len(dims) @@ -1008,12 +1057,12 @@ async def get_chunk( log.debug(msg) chunk_arr = None - dims = getChunkLayout(dset_json) + dims = getChunkDims(dset_json) type_json = dset_json["type"] dt = createDataType(type_json) layout_json = dset_json["layout"] layout_class = layout_json.get("class") - chunk_dims = getChunkLayout(dset_json) + chunk_dims = getChunkDims(dset_json) fill_value = getFillValue(dset_json) # note - officially we should follow the order in which the filters are @@ -1167,7 +1216,7 @@ def save_chunk(app, chunk_id, dset_json, chunk_arr, bucket=None): dset_id = dset_json["id"] dtype = createDataType(dset_json["type"]) - chunk_shape = getChunkLayout(dset_json) + chunk_shape = getChunkDims(dset_json) # will store filter options into app['filter_map'] filters = getFilters(dset_json) diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index 
b2b640e4..7b2029f8 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -57,9 +57,12 @@ async def GET_Dataset(request): resp_json["shape"] = dset_json["shape"] resp_json["attributeCount"] = len(dset_json["attributes"]) if "creationProperties" in dset_json: - resp_json["creationProperties"] = dset_json["creationProperties"] + cpl = dset_json["creationProperties"] + else: + cpl = {} if "layout" in dset_json: - resp_json["layout"] = dset_json["layout"] + cpl["layout"] = dset_json["layout"] + resp_json["creationProperties"] = cpl if "include_attrs" in params and params["include_attrs"]: resp_json["attributes"] = dset_json["attributes"] @@ -133,9 +136,8 @@ async def POST_Dataset(request): raise HTTPInternalServerError() shape_json = body["shape"] - layout = None if "layout" in body: - layout = body["layout"] # client specified chunk layout + log.error("unexpected key for POST Dataset: 'layout'") # ok - all set, create dataset obj now = getNow(app) @@ -160,9 +162,10 @@ async def POST_Dataset(request): } if "creationProperties" in body: - dset_json["creationProperties"] = body["creationProperties"] - if layout is not None: - dset_json["layout"] = layout + cpl = body["creationProperties"] + else: + cpl = {} + dset_json["creationProperties"] = cpl kwargs = {"bucket": bucket, "notify": True, "flush": True} await save_metadata_obj(app, dset_id, dset_json, **kwargs) @@ -175,8 +178,7 @@ async def POST_Dataset(request): resp_json["shape"] = shape_json resp_json["lastModified"] = dset_json["lastModified"] resp_json["attributeCount"] = len(attrs) - if layout is not None: - resp_json["layout"] = layout + resp_json["creationProperties"] = cpl resp = json_response(resp_json, status=201) log.response(request, resp=resp) diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index fc1d3626..384defe7 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -23,11 +23,12 @@ from h5json.hdf5dtype import createDataType, getItemSize, getDtypeItemSize from h5json.array_util import getNumpyValue, bytesToArray from h5json.objid import isSchema2Id, getS3Key, getObjId +from h5json.shape_util import isNullSpace, getShapeDims +from h5json.dset_util import getChunkDims, getDatasetLayout, getDatasetLayoutClass from .util.nodeUtil import getDataNodeUrl from .util.boolparser import BooleanParser -from .util.dsetUtil import isNullSpace, getDatasetLayout, getDatasetLayoutClass, get_slices -from .util.dsetUtil import getShapeDims, getSelectionShape, getChunkLayout +from .util.dsetUtil import get_slices, getSelectionShape from .util.chunkUtil import getChunkCoordinate, getChunkIndex, getChunkSuffix from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId from .util.chunkUtil import getChunkCoverage, getDataCoverage @@ -370,7 +371,7 @@ def get_chunk_selections(chunk_map, chunk_ids, slices, dset_json): log.debug("no slices set, returning") return # nothing to do log.debug(f"slices: {slices}") - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) for chunk_id in chunk_ids: if chunk_id in chunk_map: item = chunk_map[chunk_id] @@ -448,7 +449,7 @@ async def getSelectionData( log.error("getSelectionData - expected either slices or points to be set") raise HTTPInternalServerError() - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) chunkinfo = {} @@ -861,7 +862,15 @@ async def reduceShape(app, dset_json, shape_update, bucket=None): arr = np.zeros([1], dtype=dt, order="C") # and the chunk layout - layout = tuple(getChunkLayout(dset_json)) + layout = getChunkDims(dset_json) + if not layout: + 
layout = dset_json.get("layout") # older storage version put layout here + if layout: + log.warn(f"got layout for {dset_id} from dataset_json") + if not layout: + msg = f"no layout found for {dset_id}" + log.error(msg) + raise HTTPInternalServerError() log.debug(f"got layout: {layout}") # get all chunk ids for chunks that have been allocated @@ -1073,7 +1082,7 @@ async def doPointWrite(app, num_points = len(points) log.debug(f"doPointWrite - num_points: {num_points}") dset_id = dset_json["id"] - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) datashape = dset_json["shape"] dims = getShapeDims(datashape) rank = len(dims) @@ -1172,7 +1181,7 @@ async def doHyperslabWrite(app, log.error(msg) raise HTTPInternalServerError() - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) num_chunks = getNumChunks(page, layout) log.debug(f"num_chunks: {num_chunks}") diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 84a983c9..7a05999f 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -20,11 +20,13 @@ from h5json.hdf5dtype import createDataType from h5json.array_util import getNumElements, jsonToArray from h5json.objid import isValidUuid, isSchema2Id +from h5json.shape_util import getShapeDims +from h5json.dset_util import getChunkDims, getDatasetLayoutClass from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse, getBooleanParam from .util.chunkUtil import getChunkIds -from .util.dsetUtil import getPreviewQuery, getShapeDims, getChunkLayout, getDatasetLayoutClass +from .util.dsetUtil import getPreviewQuery from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain @@ -620,7 +622,7 @@ def _updateInitValuesList(kwargs): msg = f"dataset init_data used with unsupported layout_class: {layout_class}" log.error(msg) raise HTTPInternalServerError() - layout_dims = getChunkLayout(dset_json) + layout_dims = getChunkDims(dset_json) log.debug(f"init data layout is: {layout_dims}") # make selection for entire dataspace dims = getShapeDims(dset_json["shape"]) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 3e70a71a..b26ba8ee 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -29,8 +29,12 @@ from h5json.array_util import jsonToArray, getNumpyValue from h5json.objid import getCollectionForId, createObjId, getRootObjId from h5json.objid import isSchema2Id, getS3Key, isValidUuid -from h5json.hdf5dtype import getBaseTypeJson, validateTypeItem, createDataType -from h5json.hdf5dtype import getItemSize +from h5json.hdf5dtype import getBaseTypeJson, validateTypeItem, createDataType, getItemSize +from h5json.filters import getFiltersJson +from h5json.shape_util import getShapeDims, getShapeClass +from h5json.dset_util import guessChunk, getChunkSize +from h5json.dset_util import validateChunkLayout, getDataSize, getDsetMaxDims +from h5json.dset_util import LAYOUT_CLASSES from .util.nodeUtil import getDataNodeUrl from .util.authUtil import getAclKeys @@ -39,10 +43,8 @@ from .util.authUtil import aclCheck from .util.httpUtil import http_get, http_put, http_post, http_delete from .util.domainUtil import getBucketForDomain, verifyRoot, getLimits +from .util.dsetUtil import getShapeJson from .util.storUtil import getCompressors -from .util.dsetUtil import getShapeDims, getShapeJson, getFiltersJson, validateChunkLayout -from .util.dsetUtil import getContiguousLayout, guessChunk, 
getChunkSize -from .util.dsetUtil import expandChunk, shrinkChunk from .basenode import getVersion from . import hsds_logger as log @@ -976,8 +978,7 @@ def getShapeFromRequest(body): raise HTTPBadRequest(reason=msg) elif shape_class == "H5S_SCALAR": shape_json["class"] = "H5S_SCALAR" - dims = getShapeDims(shape_body) - if len(dims) != 1 or dims[0] != 1: + if "dims" in shape_body: msg = "dimensions aren't valid for scalar attribute" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -1230,7 +1231,7 @@ async def putAttributes(app, req = getDataNodeUrl(app, obj_id) collection = getCollectionForId(obj_id) req += f"/{collection}/{obj_id}/attributes" - log.info(f"putAttribute: {req}") + log.info(f"putAttributes: {req}") params = {} if replace: @@ -1304,7 +1305,7 @@ async def deleteObject(app, obj_id, bucket=None): def validateDatasetCreationProps(creation_props, type_json=None, shape=None): """ validate creation props """ - log.debug(f"validateCreationProps: {creation_props}") + log.debug(f"validateDatasetCreationProps: {creation_props}") if "fillValue" in creation_props: if not type_json or not shape: msg = "shape and type must be set to use fillValue" @@ -1344,12 +1345,16 @@ def validateDatasetCreationProps(creation_props, type_json=None, shape=None): log.warn(msg) raise HTTPBadRequest(reason=msg) - supported_filters = getSupportedFilters() - # will raise bad request exception if not valid supported_filters = getSupportedFilters(include_compressors=True) log.debug(f"supported_filters: {supported_filters}") - filters_out = getFiltersJson(creation_props, supported_filters=supported_filters) - # replace filters with our starndardized list + try: + filters_out = getFiltersJson(creation_props, supported_filters=supported_filters) + except (KeyError, ValueError): + # raise bad request exception if not valid + msg = "invalid filter provided" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # replace filters with our standardized list log.debug(f"setting filters to: {filters_out}") creation_props["filters"] = filters_out @@ -1536,6 +1541,16 @@ def getDatasetCreateArgs(body, # will return scalar shape if no shape key in body shape_json = getShapeJson(body) + try: + shape_class = getShapeClass(shape_json) + shape_dims = getShapeDims(shape_json) + except (KeyError, TypeError, ValueError): + msg = f"Invalid shape: {shape_json}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + log.debug(f"got createArgs: {list(kwargs.keys())}") + kwargs["shape"] = shape_json # get layout for dataset creation @@ -1545,6 +1560,8 @@ def getDatasetCreateArgs(body, max_chunk_size = int(config.get("max_chunk_size")) type_json = kwargs["type"] item_size = getItemSize(type_json) + if item_size == "H5T_VARIABLE": + item_size = config.get("default_vlen_type_size", default=128) creation_props = kwargs["creation_props"] layout_props = None @@ -1558,125 +1575,129 @@ def getDatasetCreateArgs(body, msg = f"invalid chunk layout: {layout_props}" log.warn(msg) raise HTTPBadRequest(reason=msg) + else: + creation_props = {} - # TBD: check for invalid layout class... 
+ layout_class = None + chunk_dims = None if layout_props: - if layout_props["class"] == "H5D_CONTIGUOUS": - # treat contiguous as chunked - layout_class = "H5D_CHUNKED" - else: - layout_class = layout_props["class"] - elif shape_json["class"] != "H5S_NULL": - layout_class = "H5D_CHUNKED" - else: - layout_class = None - log.debug(f"using layout_class: {layout_class}") + layout_class = layout_props.get("class") - if layout_class == "H5D_COMPACT": - layout = {"class": "H5D_COMPACT"} - elif layout_class: - # initialize to H5D_CHUNKED - layout = {"class": "H5D_CHUNKED"} - else: - # null space - no layout - layout = None + if layout_class: + if layout_class not in LAYOUT_CLASSES: + msg = f"unknown layout_class: {layout_class}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # check dims is defined for any chunked layout + if layout_class.startswith("H5D_CHUNKED"): + if "dims" not in layout_props: + msg = "chunked layout specified without dims" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + chunk_dims = layout_props["dims"] + if len(chunk_dims) != len(shape_dims): + msg = "chunk dimensions have different rank than dataset" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif layout_class == "H5D_CONTIGUOUS_REF" and getItemSize(type_json) == "H5T_VARIABLE": + # ref dataset does not work with vlen type + msg = "H5D_CONTIGUOUS_REF cannot be used with variable length types" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + pass - if layout_props and "dims" in layout_props: - chunk_dims = layout_props["dims"] + elif shape_class == "H5S_NULL": + layout_class = None + log.debug("using None layout for H5S_NULL dataset") + elif shape_class == "H5S_SCALAR": + layout_class = "H5D_CONTIGUOUS" + log.debug("Using H5D_CONTIGUOUS for H5S_SCALAR dataset") + elif shape_class == "H5S_SIMPLE": + dset_size = getDataSize(shape_dims, item_size) + if dset_size <= min_chunk_size: + # default to contiguous + layout_class = "H5D_CONTIGUOUS" + log.debug(f"Using H5D_CONTIGUOUS for small (<{min_chunk_size}) dataset") + else: + layout_class = "H5D_CHUNKED" + log.debug(f"shape_json: {shape_json}") + log.debug(f"item_size: {item_size}") + log.debug(f"chunk_min: {min_chunk_size}") + log.debug(f"chunk_max: {max_chunk_size}") + kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} + chunk_dims = guessChunk(shape_json, item_size, **kwargs) + log.debug(f"initial autochunk layout: {chunk_dims}") + chunk_size = getChunkSize(chunk_dims, item_size) + + # log warning if the chunk shape if chunk size is too small or too big + if chunk_size < min_chunk_size: + msg = f"chunk size: {chunk_size} less than recommended min size: {min_chunk_size}" + log.warn(msg) + elif chunk_size > max_chunk_size: + msg = f"chunk size: {chunk_size} greater than recommended " + msg += f"max size: {max_chunk_size}" + log.debug(msg) else: - chunk_dims = None - - if layout_class == "H5D_CONTIGUOUS_REF": - opts = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} - chunk_dims = getContiguousLayout(shape_json, item_size, **opts) - layout["dims"] = chunk_dims - log.debug(f"autoContiguous layout: {layout}") - - if layout_class == "H5D_CHUNKED" and chunk_dims is None: - # do auto-chunking - chunk_dims = guessChunk(shape_json, item_size) - log.debug(f"initial autochunk layout: {chunk_dims}") - - if layout_class == "H5D_CHUNKED": - chunk_size = getChunkSize(chunk_dims, item_size) - - msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " - msg += f"max: {max_chunk_size}" - log.debug(msg) + msg = f"unexpected 
shape_class: {shape_class}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) - # adjust the chunk shape if chunk size is too small or too big - adjusted_chunk_dims = None - if chunk_size < min_chunk_size: - msg = f"chunk size: {chunk_size} less than min size: " - msg += f"{min_chunk_size}, expanding" - log.debug(msg) - opts = {"chunk_min": min_chunk_size, "layout_class": layout_class} - adjusted_chunk_dims = expandChunk(chunk_dims, item_size, shape_json, **opts) - elif chunk_size > max_chunk_size: - msg = f"chunk size: {chunk_size} greater than max size: " - msg += f"{max_chunk_size}, shrinking" + if not layout_props: + layout_props = {"class": layout_class} + if chunk_dims: + layout_props["dims"] = chunk_dims + log.debug(f"using dataset layout: {layout_props}") + creation_props["layout"] = layout_props + + # set partition_count if needed: + max_chunks_per_folder = int(config.get("max_chunks_per_folder")) + set_partition = False + if max_chunks_per_folder > 0: + if "dims" in layout_props: + set_partition = True + + if set_partition: + log.debug(f"updating layout for partition constraint: {max_chunks_per_folder}") + shape_dims = getShapeDims(shape_json) + max_dims = getDsetMaxDims(shape_json) + + num_chunks = 1 + rank = len(shape_dims) + unlimited_count = 0 + if max_dims: + for i in range(rank): + if max_dims[i] == 0: + unlimited_count += 1 + msg = f"number of unlimited dimensions: {unlimited_count}" log.debug(msg) - opts = {"chunk_max": max_chunk_size} - adjusted_chunk_dims = shrinkChunk(chunk_dims, item_size, **opts) - if adjusted_chunk_dims: - msg = f"requested chunk_dimensions: {chunk_dims} modified " - msg += f"dimensions: {adjusted_chunk_dims}" - log.debug(msg) - layout["dims"] = adjusted_chunk_dims - else: - layout["dims"] = chunk_dims # don't need to adjust chunk size - - # set partition_count if needed: - max_chunks_per_folder = int(config.get("max_chunks_per_folder")) - set_partition = False - if max_chunks_per_folder > 0: - if "dims" in shape_json and "dims" in layout: - set_partition = True - - if set_partition: - chunk_dims = layout["dims"] - shape_dims = shape_json["dims"] - if "maxdims" in shape_json: - max_dims = shape_json["maxdims"] - else: - max_dims = None - num_chunks = 1 - rank = len(shape_dims) - unlimited_count = 0 + for i in range(rank): + max_dim = 1 if max_dims: - for i in range(rank): - if max_dims[i] == 0: - unlimited_count += 1 - msg = f"number of unlimited dimensions: {unlimited_count}" - log.debug(msg) - - for i in range(rank): - max_dim = 1 - if max_dims: - max_dim = max_dims[i] - if max_dim == 0: - # don't really know what the ultimate extent - # could be, but assume 10^6 for total number of - # elements and square-shaped array... - MAX_ELEMENT_GUESS = 10.0 ** 6 - exp = 1 / unlimited_count - max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) - else: - max_dim = shape_dims[i] - num_chunks *= math.ceil(max_dim / chunk_dims[i]) - - if num_chunks > max_chunks_per_folder: - partition_count = math.ceil(num_chunks / max_chunks_per_folder) - msg = f"set partition count to: {partition_count}, " - msg += f"num_chunks: {num_chunks}" - log.info(msg) - layout["partition_count"] = partition_count + max_dim = max_dims[i] + if max_dim == 0: + # don't really know what the ultimate extent + # could be, but assume 10^6 for total number of + # elements and square-shaped array... 
+ MAX_ELEMENT_GUESS = 10.0 ** 6 + exp = 1 / unlimited_count + max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) else: - msg = "do not need chunk partitions, num_chunks: " - msg += f"{num_chunks} max_chunks_per_folder: " - msg += f"{max_chunks_per_folder}" - log.info(msg) + max_dim = shape_dims[i] + num_chunks *= math.ceil(max_dim / chunk_dims[i]) + + if num_chunks > max_chunks_per_folder: + partition_count = math.ceil(num_chunks / max_chunks_per_folder) + msg = f"set partition count to: {partition_count}, " + msg += f"num_chunks: {num_chunks}" + log.info(msg) + layout_props["partition_count"] = partition_count + else: + msg = "do not need chunk partitions, num_chunks: " + msg += f"{num_chunks} max_chunks_per_folder: " + msg += f"{max_chunks_per_folder}" + log.info(msg) if layout_class in ("H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT"): chunk_size = getChunkSize(chunk_dims, item_size) @@ -1694,11 +1715,10 @@ def getDatasetCreateArgs(body, msg = f"chunk size: {chunk_size} greater than max size: " msg += f"{max_chunk_size}, for {layout_class} dataset" log.warn(msg) - layout["dims"] = chunk_dims + layout_props["dims"] = chunk_dims - if layout: - log.debug(f"setting layout to: {layout}") - kwargs["layout"] = layout + creation_props["layout"] = layout_props + kwargs["creation_props"] = creation_props # # get input data if present @@ -1813,7 +1833,6 @@ async def createObject(app, obj_id=None, type=None, shape=None, - layout=None, creation_props=None, attrs=None, links=None, @@ -1846,8 +1865,6 @@ async def createObject(app, log.debug(f" type: {type}") if shape: log.debug(f" shape: {shape}") - if layout: - log.debug(f" layout: {layout}") if creation_props: log.debug(f" cprops: {creation_props}") if attrs: @@ -1886,10 +1903,10 @@ async def createObject(app, obj_json["type"] = type if shape: obj_json["shape"] = shape - if layout: - obj_json["layout"] = layout if creation_props: obj_json["creationProperties"] = creation_props + else: + obj_json["creationProperties"] = {} if attrs: kwargs = {"obj_id": obj_id, "bucket": bucket} attrs_json = {"attributes": attrs} @@ -1987,7 +2004,6 @@ async def createDataset(app, h5path=None, obj_id=None, creation_props=None, - layout=None, attrs=None, links=None, implicit=None, @@ -2011,7 +2027,6 @@ async def createDataset(app, kwargs["shape"] = shape kwargs["h5path"] = h5path kwargs["obj_id"] = obj_id - kwargs["layout"] = layout kwargs["creation_props"] = creation_props kwargs["attrs"] = attrs kwargs["links"] = links diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index 5bf3afc5..3704822d 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -10,297 +10,14 @@ # request a copy from help@hdfgroup.org. # ############################################################################## -from aiohttp.web_exceptions import HTTPBadRequest, HTTPInternalServerError +from aiohttp.web_exceptions import HTTPBadRequest import math -from h5json.hdf5dtype import getItemSize, isVlen -from h5json.objid import isValidUuid +from h5json.shape_util import getShapeDims from .. import hsds_logger as log -from .. import config - - -CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) -CHUNK_MAX = 2048 * 1024 # Hard upper limit (2M) -DEFAULT_TYPE_SIZE = 128 # Type size case when it is variable - -""" -Filters that are known to HSDS. -Format is: - FILTER_CODE, FILTER_ID, Name - - H5Z_FILTER_FLETCHER32, H5Z_FILTER_SZIP, H5Z_FILTER_NBIT, - and H5Z_FILTER_SCALEOFFSET, are not currently supported. 
- - Non-supported filters metadata will be stored, but are - not (currently) used for compression/decompression. -""" - -FILTER_DEFS = ( - ("H5Z_FILTER_NONE", 0, "none"), - ("H5Z_FILTER_DEFLATE", 1, "gzip"), # aka as "zlib" for blosc - ("H5Z_FILTER_SHUFFLE", 2, "shuffle"), - ("H5Z_FILTER_FLETCHER32", 3, "fletcher32"), - ("H5Z_FILTER_SZIP", 4, "szip"), - ("H5Z_FILTER_NBIT", 5, "nbit"), - ("H5Z_FILTER_SCALEOFFSET", 6, "scaleoffset"), - ("H5Z_FILTER_LZF", 32000, "lzf"), - ("H5Z_FILTER_BLOSC", 32001, "blosclz"), - ("H5Z_FILTER_SNAPPY", 32003, "snappy"), - ("H5Z_FILTER_LZ4", 32004, "lz4"), - ("H5Z_FILTER_LZ4HC", 32005, "lz4hc"), - ("H5Z_FILTER_BITSHUFFLE", 32008, "bitshuffle"), - ("H5Z_FILTER_ZSTD", 32015, "zstd"), -) - -COMPRESSION_FILTER_IDS = ( - "H5Z_FILTER_DEFLATE", - "H5Z_FILTER_SZIP", - "H5Z_FILTER_SCALEOFFSET", - "H5Z_FILTER_LZF", - "H5Z_FILTER_BLOSC", - "H5Z_FILTER_SNAPPY", - "H5Z_FILTER_LZ4", - "H5Z_FILTER_LZ4HC", - "H5Z_FILTER_ZSTD", -) - -COMPRESSION_FILTER_NAMES = ( - "gzip", - "szip", - "lzf", - "blosclz", - "snappy", - "lz4", - "lz4hc", - "zstd", -) - -CHUNK_LAYOUT_CLASSES = ( - "H5D_CHUNKED", - "H5D_CHUNKED_REF", - "H5D_CHUNKED_REF_INDIRECT", - "H5D_CONTIGUOUS_REF", -) - - -def get_dset_size(shape_json, typesize): - """Return the size of the dataspace. For - any unlimited dimensions, assume a value of 1. - (so the return size will be the absolute minimum) - """ - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return typesize # just return size for one item - if typesize == "H5T_VARIABLE": - typesize = DEFAULT_TYPE_SIZE # just take a guess at the item size - dset_size = typesize - shape = shape_json["dims"] - rank = len(shape) - - for n in range(rank): - if shape[n] == 0: - # extendable extent with value of 0 - continue # assume this is one - dset_size *= shape[n] - return dset_size - - -def getFilterItem(key): - """ - Return filter code, id, and name, based on an id, a name or a code. 
- """ - - if key == "deflate": - key = "gzip" # use gzip as equivalent - for item in FILTER_DEFS: - for i in range(3): - if key == item[i]: - return {"class": item[0], "id": item[1], "name": item[2]} - return None # not found - - -def getFiltersJson(create_props, supported_filters=None): - """ return standardized filter representation from creation properties - raise bad request if invalid """ - - # refer to https://hdf5-json.readthedocs.io/en/latest/bnf/\ - # filters.html#grammar-token-filter_list - - if "filters" not in create_props: - return {} # null set - - f_in = create_props["filters"] - - log.debug(f"filters provided in creation_prop: {f_in}") - - if not isinstance(f_in, list): - msg = "Expected filters in creation_props to be a list" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - f_out = [] - for filter in f_in: - if isinstance(filter, int) or isinstance(filter, str): - item = getFilterItem(filter) - if not item: - msg = f"filter {filter} not recognized" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if item["name"] not in supported_filters: - msg = f"filter {filter} is not supported" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - f_out.append(item) - elif isinstance(filter, dict): - if "class" not in filter: - msg = "expected 'class' key for filter property" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if filter["class"] != "H5Z_FILTER_USER": - item = getFilterItem(filter["class"]) - elif "id" in filter: - item = getFilterItem(filter["id"]) - elif "name" in filter: - item = getFilterItem(filter["name"]) - else: - item = None - if not item: - msg = f"filter {filter['class']} not recognized" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "id" not in filter: - filter["id"] = item["id"] - elif item["id"] != filter["id"]: - msg = f"Expected {filter['class']} to have id: " - msg += f"{item['id']} but got {filter['id']}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "name" not in filter: - filter["name"] = item["name"] - if filter["name"] not in supported_filters: - msg = f"filter {filter} is not supported" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - f_out.append(filter) - else: - msg = f"Unexpected type for filter: {filter}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - # return standardized filter representation - log.debug(f"using filters: {f_out}") - return f_out - - -def getFilters(dset_json): - """Return list of filters, or empty list""" - if "creationProperties" not in dset_json: - return [] - creationProperties = dset_json["creationProperties"] - if "filters" not in creationProperties: - return [] - filters = creationProperties["filters"] - return filters - - -def getCompressionFilter(filters): - """Return compression filter from filters, or None""" - for filter in filters: - if "class" not in filter: - msg = f"filter option: {filter} with no class key" - log.warn(msg) - continue - filter_class = filter["class"] - if filter_class in COMPRESSION_FILTER_IDS: - return filter - if all( - ( - filter_class == "H5Z_FILTER_USER", - "name" in filter, - filter["name"] in COMPRESSION_FILTER_NAMES, - ) - ): - return filter - return None - - -def getShuffleFilter(filters): - """Return shuffle filter, or None""" - FILTER_CLASSES = ("H5Z_FILTER_SHUFFLE", "H5Z_FILTER_BITSHUFFLE") - for filter in filters: - log.debug(f"filter: {filter}") - if "class" not in filter: - log.warn(f"filter option: {filter} with no class key") - continue - filter_class = filter["class"] - if filter_class in FILTER_CLASSES: - log.debug(f"found filter: 
{filter}") - return filter - - log.debug("Shuffle filter not used") - return None - - -def getFilterOps(app, dset_id, filters, dtype=None, chunk_shape=None): - """Get list of filter operations to be used for this dataset""" - filter_map = app["filter_map"] - - try: - if dset_id in filter_map: - log.debug(f"returning filter from filter_map for dset: {dset_id}") - return filter_map[dset_id] - except TypeError: - log.error(f"getFilterOps TypeError - dset_id: {dset_id} filter_map: {filter_map}") - raise - - compressionFilter = getCompressionFilter(filters) - log.debug(f"got compressionFilter: {compressionFilter}") - - filter_ops = {} - - shuffleFilter = getShuffleFilter(filters) - - if shuffleFilter and not isVlen(dtype): - shuffle_name = shuffleFilter["name"] - if shuffle_name == "shuffle": - filter_ops["shuffle"] = 1 # use regular shuffle - elif shuffle_name == "bitshuffle": - filter_ops["shuffle"] = 2 # use bitshuffle - else: - log.warn(f"unexpected shuffleFilter: {shuffle_name}") - filter_ops["shuffle"] = 0 # no shuffle - else: - filter_ops["shuffle"] = 0 # no shuffle - - if compressionFilter: - if compressionFilter["class"] == "H5Z_FILTER_DEFLATE": - filter_ops["compressor"] = "zlib" # blosc compressor - else: - if "name" in compressionFilter: - filter_ops["compressor"] = compressionFilter["name"] - else: - filter_ops["compressor"] = "lz4" # default to lz4 - if "level" not in compressionFilter: - filter_ops["level"] = 5 # medium level - else: - filter_ops["level"] = int(compressionFilter["level"]) - - if filter_ops: - # save the chunk shape and dtype - filter_ops["chunk_shape"] = chunk_shape - filter_ops["dtype"] = dtype - log.debug(f"save filter ops for {dset_id}") - filter_map[dset_id] = filter_ops # save - - return filter_ops - else: - return None - + def getShapeJson(body): """ Return normalized json description of data space """ @@ -399,608 +116,6 @@ def getShapeJson(body): return shape_json -def getShapeClass(data_shape): - """ Return shape class of the given data shape """ - - if not isinstance(data_shape, dict): - raise TypeError("expected dict object") - - if "class" not in data_shape: - raise KeyError("expected 'class' key for data shape")\ - - return data_shape["class"] - - -def getRank(data_shape): - """ Return rank of given data shape_json """ - - shape_class = getShapeClass(data_shape) - - if shape_class == "H5S_NULL": - return 0 - elif shape_class == "H5S_SCALAR": - return 0 - elif shape_class == "H5S_SIMPLE": - if "dims" not in data_shape: - raise KeyError("expected dims key for H5S_SIMPLE data shape") - return len(data_shape["dims"]) - else: - raise ValueError(f"unexpected data shape class: {shape_class}") - - -def getDsetRank(dset_json): - """Get rank returning 0 for scalar or NULL data shapes""" - data_shape = dset_json["shape"] - return getRank(data_shape) - - -def isNullSpace(dset_json): - """Return true if this dataset is a null data space""" - shape_class = getShapeClass(dset_json["shape"]) - if shape_class == "H5S_NULL": - return True - else: - return False - - -def isScalarSpace(dset_json): - """ return true if this is a scalar dataset """ - - data_shape = dset_json["shape"] - shape_class = getShapeClass(data_shape) - if shape_class == "H5S_NULL": - return False - - rank = getRank(data_shape) - return True if rank == 0 else False - - -def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): - """ - create a chunk layout for datasets use contiguous storage. 
- """ - if not isinstance(item_size, int): - msg = "ContiguousLayout can only be used with fixed-length types" - log.warn(msg) - raise ValueError(msg) - - if chunk_min is None: - msg = "chunk_min not set" - log.warn(msg) - raise ValueError(msg) - if chunk_max is None: - msg = "chunk_max not set" - log.warn(msg) - raise ValueError(msg) - - if chunk_max < chunk_min: - raise ValueError("chunk_max cannot be less than chunk_min") - - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return (1,) # just enough to store one item - dims = shape_json["dims"] - rank = len(dims) - if rank == 0: - raise ValueError("rank must be positive for Contiguous Layout") - for dim in dims: - if dim < 0: - raise ValueError("extents must be positive for Contiguous Layout") - if dim == 0: - # data shape with no elements, just return dims as layout - return dims - - nsize = item_size - layout = [1,] * rank - - for i in range(rank): - dim = rank - i - 1 - extent = dims[dim] - if extent * nsize < chunk_max: - # just use the full extent as layout - layout[dim] = extent - nsize *= extent - else: - n = extent - while n > 1: - n = -(-n // 2) # use negatives so we round up on odds - if n * nsize < chunk_max: - break - layout[dim] = n - break # just use 1's for the rest of the layout - - return layout - - -def getChunkSize(layout, type_size): - """Return chunk size given layout. - i.e. just the product of the values in the list. - """ - if type_size == "H5T_VARIABLE": - type_size = DEFAULT_TYPE_SIZE - - chunk_size = type_size - for n in layout: - if n <= 0: - raise ValueError("Invalid chunk layout") - chunk_size *= n - return chunk_size - - -def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): - """ - Use chunk layout given in the creationPropertiesList (if defined and - layout is valid). - Return chunk_layout_json - """ - - rank = 0 - space_dims = None - chunk_dims = None - max_dims = None - - if "dims" in shape_json: - space_dims = shape_json["dims"] - rank = len(space_dims) - - if "maxdims" in shape_json: - max_dims = shape_json["maxdims"] - if "dims" in layout: - chunk_dims = layout["dims"] - - if chunk_dims: - # validate that the chunk_dims are valid and correlates with the - # dataset shape - if isinstance(chunk_dims, int): - chunk_dims = [ - chunk_dims, - ] # promote to array - if len(chunk_dims) != rank: - msg = "Layout rank does not match shape rank" - log.warn(msg) - raise ValueError(msg) - for i in range(rank): - dim_extent = space_dims[i] - chunk_extent = chunk_dims[i] - if not isinstance(chunk_extent, int): - msg = "Layout dims must be integer or integer array" - log.warn(msg) - raise ValueError(msg) - if chunk_extent <= 0: - msg = "Invalid layout value" - log.warn(msg) - raise ValueError(msg) - if max_dims is None: - if chunk_extent > dim_extent: - msg = "Invalid layout value" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif max_dims[i] != 0: - if chunk_extent > max_dims[i]: - msg = "Invalid layout value for extensible dimension" - log.warn(msg) - raise ValueError(msg) - else: - pass # allow any positive value for unlimited dimensions - - if "class" not in layout: - msg = "class key not found in layout for creation property list" - log.warn(msg) - raise ValueError(msg) - - layout_class = layout["class"] - - if layout_class == "H5D_CONTIGUOUS_REF": - # reference to a dataset in a traditional HDF5 files with - # contiguous storage - if item_size == "H5T_VARIABLE": - # can't be used with variable types... 
- msg = "Datasets with variable types cannot be used with " - msg += "reference layouts" - log.warn(msg) - raise ValueError(msg) - if "file_uri" not in layout: - # needed for H5D_CONTIGUOUS_REF - msg = "'file_uri' key must be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise ValueError(msg) - if "offset" not in layout: - # needed for H5D_CONTIGUOUS_REF - msg = "'offset' key must be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise ValueError(msg) - if "size" not in layout: - # needed for H5D_CONTIGUOUS_REF - msg = "'size' key must be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise ValueError(msg) - if "dims" in layout: - # used defined chunk layout not allowed for H5D_CONTIGUOUS_REF - msg = "'dims' key can not be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise ValueError(msg) - elif layout_class == "H5D_CHUNKED_REF": - # reference to a dataset in a traditional HDF5 files with - # chunked storage - if item_size == "H5T_VARIABLE": - # can't be used with variable types.. - msg = "Datasets with variable types cannot be used with " - msg += "reference layouts" - log.warn(msg) - raise ValueError(msg) - if "file_uri" not in layout: - # needed for H5D_CHUNKED_REF - msg = "'file_uri' key must be provided for " - msg += "H5D_CHUNKED_REF layout" - log.warn(msg) - raise ValueError(msg) - if "dims" not in layout: - # needed for H5D_CHUNKED_REF - msg = "'dimns' key must be provided for " - msg += "H5D_CHUNKED_REF layout" - log.warn(msg) - raise ValueError(msg) - if "chunks" not in layout: - msg = "'chunks' key must be provided for " - msg += "H5D_CHUNKED_REF layout" - log.warn(msg) - raise ValueError(msg) - elif layout_class == "H5D_CHUNKED_REF_INDIRECT": - # reference to a dataset in a traditional HDF5 files with chunked - # storage using an auxiliary dataset - if item_size == "H5T_VARIABLE": - # can't be used with variable types.. 
- msg = "Datasets with variable types cannot be used with " - msg += "reference layouts" - log.warn(msg) - raise ValueError(msg) - if "dims" not in layout: - # needed for H5D_CHUNKED_REF_INDIRECT - msg = "'dims' key must be provided for " - msg += "H5D_CHUNKED_REF_INDIRECT layout" - log.warn(msg) - raise ValueError(msg) - if "chunk_table" not in layout: - msg = "'chunk_table' key must be provided for " - msg += "H5D_CHUNKED_REF_INDIRECT layout" - log.warn(msg) - raise ValueError(msg) - chunk_table_id = layout["chunk_table"] - if not isValidUuid(chunk_table_id, "Dataset"): - msg = f"Invalid chunk table id: {chunk_table_id}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - elif layout_class == "H5D_CHUNKED": - if "dims" not in layout: - msg = "dims key not found in layout for creation property list" - log.warn(msg) - raise ValueError(msg) - if shape_json["class"] != "H5S_SIMPLE": - msg = "Bad Request: chunked layout not valid with shape class: " - msg += f"{shape_json['class']}" - log.warn(msg) - raise ValueError(msg) - elif layout_class == "H5D_CONTIGUOUS": - if "dims" in layout: - msg = "dims key found in layout for creation property list " - msg += "for H5D_CONTIGUOUS storage class" - log.warn(msg) - raise ValueError(msg) - elif layout_class == "H5D_COMPACT": - if "dims" in layout: - msg = "dims key found in layout for creation property list " - msg += "for H5D_COMPACT storage class" - log.warn(msg) - raise ValueError(msg) - else: - msg = f"Unexpected layout: {layout_class}" - log.warn(msg) - raise ValueError(msg) - - -def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class="H5D_CHUNKED"): - """Compute an increased chunk shape with a size in bytes greater than chunk_min.""" - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return (1,) # just enough to store one item - - layout = list(layout) - log.debug(f"expandChunk layout: {layout} typesize: {typesize}") - dims = shape_json["dims"] - rank = len(dims) - extendable_dims = 0 # number of dimensions that are extenable - maxdims = None - if "maxdims" in shape_json: - maxdims = shape_json["maxdims"] - for n in range(rank): - if maxdims[n] == 0 or maxdims[n] > dims[n]: - extendable_dims += 1 - - dset_size = get_dset_size(shape_json, typesize) - if dset_size <= chunk_min and extendable_dims == 0: - # just use the entire dataspace shape as one big chunk - return tuple(dims) - - chunk_size = getChunkSize(layout, typesize) - if chunk_size >= chunk_min: - return tuple(layout) # good already - while chunk_size < chunk_min: - # just adjust along extendable dimensions first - old_chunk_size = chunk_size - for n in range(rank): - dim = rank - n - 1 # start from last dim - - if extendable_dims > 0: - if maxdims[dim] == 0: - # infinitely extendable dimensions - layout[dim] *= 2 - chunk_size = getChunkSize(layout, typesize) - if chunk_size > chunk_min: - break - elif maxdims[dim] > layout[dim]: - # can only be extended so much - layout[dim] *= 2 - if layout[dim] >= dims[dim]: - layout[dim] = maxdims[dim] # trim back - extendable_dims -= 1 # one less extenable dimension - - chunk_size = getChunkSize(layout, typesize) - if chunk_size > chunk_min: - break - else: - pass # ignore non-extensible for now - else: - # no extendable dimensions - if dims[dim] > layout[dim]: - # can expand chunk along this dimension - layout[dim] *= 2 - if layout[dim] > dims[dim]: - layout[dim] = dims[dim] # trim back - chunk_size = getChunkSize(layout, typesize) - if chunk_size > 
chunk_min: - break - else: - pass # can't extend chunk along this dimension - if chunk_size <= old_chunk_size: - # stop iteration if we haven't increased the chunk size - log.debug("stopping expandChunk iteration") - break - elif chunk_size > chunk_min: - break # we're good - else: - pass # do another round - return tuple(layout) - - -def shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX, layout_class="H5D_CHUNKED"): - """Compute a reduced chunk shape with a size in bytes less than chunk_max.""" - layout = list(layout) - chunk_size = getChunkSize(layout, typesize) - if chunk_size <= chunk_max: - return tuple(layout) # good already - log.debug(f"shrinkChunk layout: {layout} typesize: {typesize}") - rank = len(layout) - - while chunk_size > chunk_max: - # just adjust along extendable dimensions first - old_chunk_size = chunk_size - for dim in range(rank): - if layout[dim] > 1: - # tricky way to do x // 2 with ceil - layout[dim] = -(-layout[dim] // 2) - chunk_size = getChunkSize(layout, typesize) - if chunk_size <= chunk_max: - break - else: - pass # can't shrink chunk along this dimension - if chunk_size >= old_chunk_size: - # reality check to see if we'll ever break out of the while loop - log.warning("Unexpected error in shrink_chunk") - break - elif chunk_size <= chunk_max: - break # we're good - else: - pass # do another round - return tuple(layout) - - -def guessChunk(shape_json, typesize): - """Guess an appropriate chunk layout for a dataset, given its shape and - the size of each element in bytes. Will allocate chunks only as large - as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of - each axis, slightly favoring bigger values for the last index. - - Undocumented and subject to change without warning. - """ - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return (1,) # just enough to store one item - - if "maxdims" in shape_json: - shape = shape_json["maxdims"] - else: - shape = shape_json["dims"] - - if typesize == "H5T_VARIABLE": - typesize = 128 # just take a guess at the item size - - # For unlimited dimensions we have to guess. use 1024 - shape = tuple((x if x != 0 else 1024) for i, x in enumerate(shape)) - - return shape - - -def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, chunk_max=None): - """ Get the layout json given by creation_props. 
- Raise bad request error if invalid """ - - min_chunk_size = int(config.get("min_chunk_size")) - max_chunk_size = int(config.get("max_chunk_size")) - - item_size = getItemSize(type_json) - if chunk_min is None: - chunk_min = 1000 * 1000 - if chunk_max is None: - chunk_max = 4 * 1000 * 1000 - - if chunk_min > chunk_max: - msg = "chunk_max must be larger than chunk_min" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - layout = None - if "layout" in creation_props: - layout_props = creation_props["layout"] - else: - layout_props = None - - if layout_props: - if "class" not in layout_props: - msg = "expected class key in layout props" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - layout_class = layout_props["class"] - if layout_class == "H5D_CONTIGUOUS": - # treat contiguous as chunked - layout_class = "H5D_CHUNKED" - else: - layout_class = layout_props["class"] - elif shape["class"] != "H5S_NULL": - layout_class = "H5D_CHUNKED" - else: - layout_class = None - - if layout_class == "H5D_COMPACT": - layout = {"class": "H5D_COMPACT"} - elif layout_class: - # initialize to H5D_CHUNKED - layout = {"class": "H5D_CHUNKED"} - else: - # null space - no layout - layout = None - - if layout_props and "dims" in layout_props: - chunk_dims = layout_props["dims"] - else: - chunk_dims = None - - if layout_class == "H5D_CONTIGUOUS_REF": - kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} - chunk_dims = getContiguousLayout(shape, item_size, **kwargs) - layout["dims"] = chunk_dims - log.debug(f"autoContiguous layout: {layout}") - - if layout_class == "H5D_CHUNKED" and chunk_dims is None: - # do auto-chunking - chunk_dims = guessChunk(shape, item_size) - log.debug(f"initial autochunk layout: {chunk_dims}") - - if layout_class == "H5D_CHUNKED": - chunk_size = getChunkSize(chunk_dims, item_size) - - msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " - msg += f"max: {max_chunk_size}" - log.debug(msg) - # adjust the chunk shape if chunk size is too small or too big - adjusted_chunk_dims = None - if chunk_size < min_chunk_size: - msg = f"chunk size: {chunk_size} less than min size: " - msg += f"{min_chunk_size}, expanding" - log.debug(msg) - kwargs = {"chunk_min": min_chunk_size, "layout_class": layout_class} - adjusted_chunk_dims = expandChunk(chunk_dims, item_size, shape, **kwargs) - elif chunk_size > max_chunk_size: - msg = f"chunk size: {chunk_size} greater than max size: " - msg += f"{max_chunk_size}, shrinking" - log.debug(msg) - kwargs = {"chunk_max": max_chunk_size} - adjusted_chunk_dims = shrinkChunk(chunk_dims, item_size, **kwargs) - if adjusted_chunk_dims: - msg = f"requested chunk_dimensions: {chunk_dims} modified " - msg += f"dimensions: {adjusted_chunk_dims}" - log.debug(msg) - layout["dims"] = adjusted_chunk_dims - else: - layout["dims"] = chunk_dims # don't need to adjust chunk size - - # set partition_count if needed: - max_chunks_per_folder = int(config.get("max_chunks_per_folder")) - set_partition = False - if max_chunks_per_folder > 0: - if "dims" in shape and "dims" in layout: - set_partition = True - - if set_partition: - chunk_dims = layout["dims"] - shape_dims = shape["dims"] - if "maxdims" in shape: - max_dims = shape["maxdims"] - else: - max_dims = None - num_chunks = 1 - rank = len(shape_dims) - unlimited_count = 0 - if max_dims: - for i in range(rank): - if max_dims[i] == 0: - unlimited_count += 1 - msg = f"number of unlimited dimensions: {unlimited_count}" - log.debug(msg) - - for i in range(rank): - max_dim = 1 - if max_dims: - max_dim = max_dims[i] 
- if max_dim == 0: - # don't really know what the ultimate extent - # could be, but assume 10^6 for total number of - # elements and square-shaped array... - MAX_ELEMENT_GUESS = 10.0 ** 6 - exp = 1 / unlimited_count - max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) - else: - max_dim = shape_dims[i] - num_chunks *= math.ceil(max_dim / chunk_dims[i]) - - if num_chunks > max_chunks_per_folder: - partition_count = math.ceil(num_chunks / max_chunks_per_folder) - msg = f"set partition count to: {partition_count}, " - msg += f"num_chunks: {num_chunks}" - log.info(msg) - layout["partition_count"] = partition_count - else: - msg = "do not need chunk partitions, num_chunks: " - msg += f"{num_chunks} max_chunks_per_folder: " - msg += f"{max_chunks_per_folder}" - log.info(msg) - - if layout_class in ("H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT"): - chunk_size = getChunkSize(chunk_dims, item_size) - - msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " - msg += f"max: {max_chunk_size}" - log.debug(msg) - # nothing to do about inefficiently small chunks, but large chunks - # can be subdivided - if chunk_size < min_chunk_size: - msg = f"chunk size: {chunk_size} less than min size: " - msg += f"{min_chunk_size} for {layout_class} dataset" - log.warn(msg) - elif chunk_size > max_chunk_size: - msg = f"chunk size: {chunk_size} greater than max size: " - msg += f"{max_chunk_size}, for {layout_class} dataset" - log.warn(msg) - layout["dims"] = chunk_dims - - def getHyperslabSelection(dims, start=None, stop=None, step=None): """ Get slices given lists of start, stop, step values @@ -1117,40 +232,6 @@ def getSelectionShape(selection): return shape -def getShapeDims(shape): - """ - Get dims from a given shape json. Return [1,] for Scalar datasets, - None for null dataspaces - """ - dims = None - if isinstance(shape, int): - dims = [shape, ] - elif isinstance(shape, list) or isinstance(shape, tuple): - dims = shape # can use as is - elif isinstance(shape, str): - # only valid string value is H5S_NULL - if shape != "H5S_NULL": - raise ValueError("Invalid value for shape") - dims = None - elif isinstance(shape, dict): - if "class" not in shape: - raise ValueError("'class' key not found in shape") - if shape["class"] == "H5S_NULL": - dims = None - elif shape["class"] == "H5S_SCALAR": - dims = [1,] - elif shape["class"] == "H5S_SIMPLE": - if "dims" not in shape: - raise ValueError("'dims' key expected for shape") - dims = shape["dims"] - else: - raise ValueError("Unknown shape class: {}".format(shape["class"])) - else: - raise ValueError(f"Unexpected shape class: {type(shape)}") - - return dims - - def isSelectAll(slices, dims): """ return True if the selection covers the entire dataspace """ if len(slices) != len(dims): @@ -1605,58 +686,10 @@ def setChunkDimQueryParam(params, dims): extent = dims[i] dim_param += str(extent) dim_param += "]" - log.debug("dim query param: {}".format(dim_param)) + log.debug(f"dim query param: {dim_param}") params["dim"] = dim_param -def getDsetMaxDims(dset_json): - """ - Get maxdims from a given shape. Return [1,] for Scalar datasets - - Use with H5S_NULL datasets will throw a 400 error. 
- """ - if "shape" not in dset_json: - log.error("No shape found in dset_json") - raise HTTPInternalServerError() - shape_json = dset_json["shape"] - maxdims = None - if shape_json["class"] == "H5S_NULL": - msg = "Expected shape class other than H5S_NULL" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif shape_json["class"] == "H5S_SCALAR": - maxdims = [ - 1, - ] - elif shape_json["class"] == "H5S_SIMPLE": - if "maxdims" in shape_json: - maxdims = shape_json["maxdims"] - else: - log.error("Unexpected shape class: {}".format(shape_json["class"])) - raise HTTPInternalServerError() - return maxdims - - -def getChunkLayout(dset_json): - """Get chunk layout. Throw 500 if used with non-H5D_CHUNKED layout""" - if "layout" not in dset_json: - log.error("No layout found in dset_json") - raise HTTPInternalServerError() - layout_json = dset_json["layout"] - if "class" not in layout_json: - log.error(f"Expected class key for layout: {layout_json}") - raise HTTPInternalServerError() - layout_class = layout_json["class"] - if layout_class not in CHUNK_LAYOUT_CLASSES: - log.error(f"Unexpected shape layout: {layout_class}") - raise HTTPInternalServerError() - if "dims" not in layout_json: - log.error(f"Expected dims key in layout: {layout_json}") - raise HTTPInternalServerError() - layout = layout_json["dims"] - return layout - - def getChunkInitializer(dset_json): """ get initializer application and arguments if set """ initializer = None @@ -1708,65 +741,6 @@ def getPreviewQuery(dims): return select -def isExtensible(dims, maxdims): - """ - Determine if the dataset can be extended - """ - if maxdims is None or len(dims) == 0: - return False - rank = len(dims) - if len(maxdims) != rank: - raise ValueError("rank of maxdims does not match dataset") - for n in range(rank): - # TBD - shouldn't have H5S_UNLIMITED in any new files. 
- # Remove check once this is confirmed - if maxdims[n] in (0, "H5S_UNLIMITED") or maxdims[n] > dims[n]: - return True - return False - - -def getDatasetLayout(dset_json): - """ Return layout json from creation property list or layout json """ - layout = None - - if "creationProperties" in dset_json: - cp = dset_json["creationProperties"] - if "layout" in cp: - layout = cp["layout"] - if not layout and "layout" in dset_json: - layout = dset_json["layout"] - if not layout: - log.warn(f"no layout for {dset_json}") - return layout - - -def getDatasetLayoutClass(dset_json): - """ return layout class """ - layout = getDatasetLayout(dset_json) - if layout and "class" in layout: - layout_class = layout["class"] - else: - layout_class = None - return layout_class - - -def getChunkDims(dset_json): - """ get chunk shape for given dset_json """ - - layout = getDatasetLayout(dset_json) - if layout and "dims" in layout: - return layout["dims"] - else: - # H5D_COMPACT and H5D_CONTIGUOUS will not have a dims key - # Check the layout dict in dset_json to see if it's - # defined there - if "layout" in dset_json: - layout = dset_json["layout"] - if "dims" in layout: - return layout["dims"] - return None - - class ItemIterator: """ Class to iterator through items in a selection diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index cf15ada2..e8d4feb3 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -19,12 +19,6 @@ import helper import config -# min/max chunk size - these can be set by config, but -# practically the min config value should be larger than -# CHUNK_MIN and the max config value should less than CHUNK_MAX -CHUNK_MIN = 1024 # lower limit (1024b) -CHUNK_MAX = 50 * 1024 * 1024 # upper limit (50M) - class DatasetTest(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -67,7 +61,7 @@ def testScalarDataset(self): expected_keys = [ "id", "shape", - "layout", + "creationProperties", "attributeCount", "created", "lastModified", @@ -232,7 +226,6 @@ def testPostDatasetWithId(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -423,7 +416,6 @@ def testGet(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -447,7 +439,8 @@ def testGet(self): self.assertEqual(shape["dims"], [10, 10]) self.assertEqual(shape["maxdims"], [10, 10]) - layout = rspJson["layout"] + cpl = rspJson["creationProperties"] + layout = cpl["layout"] self.assertEqual(layout["class"], "H5D_CHUNKED") self.assertEqual(layout["dims"], [10, 10]) self.assertTrue("partition_count" not in layout) @@ -517,7 +510,6 @@ def testGetByPath(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -539,7 +531,6 @@ def testGetByPath(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -601,7 +592,6 @@ def testGetVerbose(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -1265,6 +1255,95 @@ def testExtend3DDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 409) # tried to extend a non-extensible dimension + def testInvalidDatasetLayout(self): + # test that various invalid layouts fail with a 400 status + domain = self.base_domain + "/testInvalidDatasetLayout.h5" + helper.setupDomain(domain) + + print("testInvalidDatasetLayout", domain) + headers = helper.getRequestHeaders(domain=domain) + # get domain + req = 
helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("root" in rspJson) + + # dataset create + req = self.endpoint + "/datasets" + dims = [365, 780, 1024] + maxdims = [0, 780, 1024] + payload = { + "type": "H5T_IEEE_F32LE", + "shape": dims, + "maxdims": maxdims, + } + # bad layout class + payload["creationProperties"] = { + "layout": {"class": "H5D_XYZ", "dims": [1, 390, 512]}, + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset + + # chunked layout with mismatched dimensions + payload["creationProperties"] = { + "layout": {"class": "H5D_CHUNKED", "dims": [1, 390]}, + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset + + # chunked layout with negative dimensions + payload["creationProperties"] = { + "layout": {"class": "H5D_CHUNKED", "dims": [100, 200, -300]}, + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset + + file_uri = "s3://a-storage-bucket/some-file.h5" + offset = 1234 + size = dims[0] * dims[1] * dims[2] * 4 + + # H5D_CONTIGUOUS_REF layout, omitting one required key at a time... + for key in ("file_uri", "offset", "size"): + layout = {"class": "H5D_CONTIGUOUS_REF"} + if key != "file_uri": + layout["file_uri"] = file_uri + if key != "offset": + layout["offset"] = offset + if key != "size": + layout["size"] = size + + payload["creationProperties"] = { + "layout": layout + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset + + # H5D_CONTIGUOUS_REF with a vlen type + type_vstr = { + "charSet": "H5T_CSET_ASCII", + "class": "H5T_STRING", + "strPad": "H5T_STR_NULLTERM", + "length": "H5T_VARIABLE", + } + payload = { + "type": type_vstr, + "shape": dims, + } + layout = { + "class": "H5D_CONTIGUOUS_REF", + "file_uri": file_uri, + "offset": offset, + "size": size + } + payload["creationProperties"] = { + "layout": layout + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset + def testCreationPropertiesLayoutDataset(self): # test Dataset with creation property list domain = self.base_domain + "/testCreationPropertiesLayoutDataset.h5" @@ -1288,6 +1367,7 @@ def testCreationPropertiesLayoutDataset(self): "shape": [365, 780, 1024], "maxdims": [0, 780, 1024], } + # define a chunk layout with 4 chunks per 'slice' # chunk size is 798720 bytes gzip_filter = { @@ -1308,7 +1388,6 @@ def testCreationPropertiesLayoutDataset(self): fletcher32_filter, ], } - req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset rspJson = json.loads(rsp.text) @@ -1326,12 +1405,14 @@ def testCreationPropertiesLayoutDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], 
"H5D_CHUNKED") self.assertTrue("dims" in layout_json) - self.assertEqual(layout_json["dims"], [1, 390, 1024]) + self.assertEqual(layout_json["dims"], [1, 390, 512]) if config.get("max_chunks_per_folder") > 0: self.assertTrue("partition_count" in layout_json) self.assertEqual(layout_json["partition_count"], 10) @@ -1407,16 +1488,13 @@ def testCreationPropertiesContiguousDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("dims" in layout_json) - self.assertEqual(layout_json["dims"], [10, 20]) - # verify creation properties are preserved self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] + self.assertTrue("class" in layout_json) + self.assertEqual(layout_json["class"], "H5D_CONTIGUOUS") + self.assertFalse("dims" in layout_json) def testCompressionFiltersDataset(self): # test Dataset with creation property list @@ -1463,10 +1541,12 @@ def testCompressionFiltersDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") + self.assertEqual(layout_json["class"], "H5D_CONTIGUOUS") # verify compression self.assertTrue("creationProperties" in rspJson) @@ -1525,10 +1605,13 @@ def testCompressionFilterOptionDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") + self.assertEqual(layout_json["class"], "H5D_CONTIGUOUS") + self.assertFalse("dims" in layout_json) # verify compression self.assertTrue("creationProperties" in rspJson) @@ -1762,9 +1845,7 @@ def testAutoChunk1dDataset(self): req = self.endpoint + "/datasets" # 50K x 80K dataset extent = 1000 * 1000 * 1000 - dims = [ - extent, - ] + dims = [extent, ] fields = ( {"name": "x", "type": "H5T_IEEE_F64LE"}, {"name": "y", "type": "H5T_IEEE_F64LE"}, @@ -1773,13 +1854,12 @@ def testAutoChunk1dDataset(self): datatype = {"class": "H5T_COMPOUND", "fields": fields} payload = {"type": datatype, "shape": dims} - # the following should get ignored as too small + # the following specifies an inefficiently small chunk size + chunk_dims = [10,] payload["creationProperties"] = { "layout": { "class": "H5D_CHUNKED", - "dims": [ - 10, - ], + "dims": chunk_dims } } req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) @@ -1802,19 +1882,14 @@ def testAutoChunk1dDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + 
self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) - self.assertTrue("partition_count" not in layout_json) - layout = layout_json["dims"] - self.assertEqual(len(layout), 1) - self.assertTrue(layout[0] < dims[0]) - chunk_size = layout[0] * 8 * 3 # three 64bit - # chunk size should be between chunk min and max - self.assertTrue(chunk_size >= CHUNK_MIN) - self.assertTrue(chunk_size <= CHUNK_MAX) + self.assertEqual(layout_json["dims"], chunk_dims) def testAutoChunk2dDataset(self): # test Dataset where chunk layout is set automatically @@ -1855,8 +1930,10 @@ def testAutoChunk2dDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) @@ -1865,64 +1942,10 @@ def testAutoChunk2dDataset(self): self.assertTrue(layout[0] < dims[0]) self.assertTrue(layout[1] < dims[1]) chunk_size = layout[0] * layout[1] * 4 - # chunk size should be between chunk min and max - self.assertTrue(chunk_size >= CHUNK_MIN) - self.assertTrue(chunk_size <= CHUNK_MAX) - - def testMinChunkSizeDataset(self): - # test Dataset where chunk layout is adjusted if provided - # layout is too small - domain = self.base_domain + "/testMinChunkSizeDataset.h5" - helper.setupDomain(domain) - print("testMinChunkSizeDataset", domain) - headers = helper.getRequestHeaders(domain=domain) - # get domain - req = helper.getEndpoint() + "/" - rsp = self.session.get(req, headers=headers) - rspJson = json.loads(rsp.text) - self.assertTrue("root" in rspJson) - root_uuid = rspJson["root"] - - # create the dataset - req = self.endpoint + "/datasets" - # 50K x 80K dataset - dims = [50000, 80000] - payload = {"type": "H5T_IEEE_F32LE", "shape": dims} - # define a chunk layout with lots of small chunks - payload["creationProperties"] = { - "layout": {"class": "H5D_CHUNKED", "dims": [10, 10]} - } - - req = self.endpoint + "/datasets" - rsp = self.session.post(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) # create dataset - rspJson = json.loads(rsp.text) - dset_uuid = rspJson["id"] - self.assertTrue(helper.validateId(dset_uuid)) - # link new dataset as 'dset' - name = "dset" + helper.getRandomName() - req = self.endpoint + "/groups/" + root_uuid + "/links/" + name - payload = {"id": dset_uuid} - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) - - # verify layout - req = helper.getEndpoint() + "/datasets/" + dset_uuid - rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("dims" in layout_json) - layout = layout_json["dims"] - self.assertEqual(len(layout), 2) - self.assertTrue(layout[0] < dims[0]) - self.assertTrue(layout[1] < dims[1]) - chunk_size = layout[0] * layout[1] * 4 - # chunk size should be between chunk min and max + # chunk size 
will be based on server config, but assume a min/max of 1MB to 1GB + CHUNK_MIN = 1024 * 1024 + CHUNK_MAX = 1024 * 1024 * 1024 self.assertTrue(chunk_size >= CHUNK_MIN) self.assertTrue(chunk_size <= CHUNK_MAX) @@ -2307,17 +2330,13 @@ def testContiguousRefDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("dims" in layout_json) - chunk_dims = layout_json["dims"] - self.assertEqual(len(chunk_dims), 2) - chunk_size = chunk_dims[0] * chunk_dims[1] * 4 - # chunk size should be between chunk min and max - self.assertTrue(chunk_size >= CHUNK_MIN) - self.assertTrue(chunk_size <= CHUNK_MAX) + self.assertEqual(layout_json["class"], "H5D_CONTIGUOUS_REF") + self.assertFalse("dims" in layout_json) # verify cpl self.assertTrue("creationProperties" in rspJson) @@ -2380,23 +2399,13 @@ def testContiguousRefZeroDimDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) - # verify layout + # get dataset json req = helper.getEndpoint() + "/datasets/" + dset_uuid rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("dims" in layout_json) - chunk_dims = layout_json["dims"] - self.assertEqual(len(chunk_dims), 2) - # layout should be same as the dims - self.assertEqual(chunk_dims[0], dims[0]) - self.assertEqual(chunk_dims[1], dims[1]) - # verify cpl + # verify layout self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] self.assertTrue("layout" in cpl) @@ -2470,13 +2479,6 @@ def testChunkedRefDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("dims" in layout_json) - chunk_dims = layout_json["dims"] - self.assertEqual(len(chunk_dims), 2) self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] self.assertTrue("layout" in cpl) @@ -2549,21 +2551,15 @@ def testChunkedRefIndirectDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) - # verify layout + # get dataset json req = helper.getEndpoint() + "/datasets/" + dset_uuid rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("chunks" not in layout_json) - chunk_dims = layout_json["dims"] - self.assertEqual(len(chunk_dims), 2) self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] self.assertTrue("layout" in cpl) cpl_layout = cpl["layout"] 
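All of the test updates in this area follow one pattern: the chunk layout is no longer reported as a top-level "layout" key in the dataset GET response, but nested under "creationProperties". A minimal client-side sketch of the new access pattern (endpoint, dset_uuid, and headers assumed to be set up as in the tests):

    import json
    import requests

    rsp = requests.get(endpoint + "/datasets/" + dset_uuid, headers=headers)
    rsp_json = json.loads(rsp.text)
    cpl = rsp_json["creationProperties"]    # was: rsp_json["layout"]
    layout_json = cpl.get("layout", {})
    print(layout_json.get("class"), layout_json.get("dims"))

The CHUNK_MIN/CHUNK_MAX bounds assumed above are compared against a chunk byte count that is just the product of the chunk extents times the element size. In sketch form (the function name is illustrative; math.prod requires Python 3.8+):

    import math

    def chunk_num_bytes(chunk_dims, item_size):
        # bytes in one chunk: product of the extents times the element size
        return math.prod(chunk_dims) * item_size

    # matches the "chunk size is 798720 bytes" comment earlier:
    # a [1, 390, 512] chunk of 4-byte floats
    assert chunk_num_bytes([1, 390, 512], 4) == 798720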
self.assertTrue("class" in cpl_layout) @@ -2645,19 +2641,11 @@ def testChunkedRefIndirectS3UriDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) - # verify layout + # fetch dataset json req = helper.getEndpoint() + "/datasets/" + dset_uuid rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("chunks" not in layout_json) - self.assertTrue("dims" in layout_json) - chunk_dims = layout_json["dims"] - self.assertEqual(len(chunk_dims), 2) self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] @@ -2711,8 +2699,10 @@ def testDatasetChunkPartitioning(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) @@ -2728,10 +2718,6 @@ def testDatasetChunkPartitioning(self): self.assertTrue(layout[0] < dims[0]) self.assertTrue(layout[1] < dims[1]) self.assertTrue(layout[2] < dims[2]) - chunk_size = layout[0] * layout[1] * layout[2] * 4 - # chunk size should be between chunk min and max - self.assertTrue(chunk_size >= CHUNK_MIN) - self.assertTrue(chunk_size <= CHUNK_MAX) def testExtendibleDatasetChunkPartitioning(self): # test Dataset partitioning logic for large datasets @@ -2750,9 +2736,17 @@ def testExtendibleDatasetChunkPartitioning(self): req = self.endpoint + "/datasets" # 50K x 80K x 90K dataset dims = [0, 80000, 90000] + # unlimited extent in dim 0, fixed in dim 1, extensible by 10x in dim 2 max_dims = [0, 80000, 900000] + chunk_shape = [1000, 1000, 1000] + layout = { + "class": "H5D_CHUNKED", + "dims": chunk_shape + } + cpl = {"layout": layout} payload = {"type": "H5T_IEEE_F32LE", "shape": dims, "maxdims": max_dims} + payload["creationProperties"] = cpl req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) @@ -2774,8 +2768,10 @@ def testExtendibleDatasetChunkPartitioning(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) @@ -2785,10 +2781,6 @@ layout = layout_json["dims"] self.assertEqual(len(layout), 3) - chunk_size = layout[0] * layout[1] * layout[2] * 4 - # chunk size should be between chunk min and max - self.assertTrue(chunk_size >= CHUNK_MIN) - self.assertTrue(chunk_size <= CHUNK_MAX) def testDatasetEmptyChunkExtent(self): # Attempting to create 0-extent chunks should respond with Bad Request @@ -2855,7 +2847,6 @@ def testDatasetPostMulti(self): expected_keys = [ "id", "shape", - "layout", 
"attributeCount", "created", "lastModified", diff --git a/tests/unit/dset_util_test.py b/tests/unit/dset_util_test.py index 7c2028b9..0e77ab1b 100755 --- a/tests/unit/dset_util_test.py +++ b/tests/unit/dset_util_test.py @@ -15,8 +15,7 @@ sys.path.append("../..") from hsds.util.dsetUtil import getHyperslabSelection, getSelectionShape -from hsds.util.dsetUtil import getSelectionList, ItemIterator, getSelectionPagination, expandChunk -from hsds.util.dsetUtil import guessChunk, shrinkChunk, getChunkSize, getContiguousLayout +from hsds.util.dsetUtil import getSelectionList, ItemIterator, getSelectionPagination class DsetUtilTest(unittest.TestCase): @@ -26,277 +25,6 @@ def __init__(self, *args, **kwargs): self.logger = logging.getLogger() self.logger.setLevel(logging.WARNING) - def testGuessChunk(self): - - typesize = "H5T_VARIABLE" - logging.debug("hello") - - shape = {"class": "H5S_NULL"} - layout = guessChunk(shape, typesize) - self.assertTrue(layout is None) - - shape = {"class": "H5S_SCALAR"} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, (1,)) - - shape = {"class": "H5S_SIMPLE", "dims": [100, 100]} - layout = guessChunk(shape, typesize) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 100) - - typesize = 8 - layout = guessChunk(shape, typesize) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 100) - - shape = {"class": "H5S_SIMPLE", "dims": [5]} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, (5,)) - - shape = {"class": "H5S_SIMPLE", "dims": [100, 100, 100]} - layout = guessChunk(shape, typesize) - print("layout:", layout) - self.assertTrue(len(layout), 3) - for i in range(3): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 100) - - shape = {"class": "H5S_SIMPLE", "dims": [100, 0], "maxdims": [100, 0]} - layout = guessChunk(shape, typesize) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 1024) - - shape = {"class": "H5S_SCALAR"} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, (1,)) - - shape = {"class": "H5S_NULL"} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, None) - - def testShrinkChunk(self): - CHUNK_MIN = 500 - CHUNK_MAX = 5000 - typesize = 1 - layout = (1, 2, 3) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - self.assertEqual(shrunk, layout) - - layout = (100, 200, 300) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes > CHUNK_MAX) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - rank = len(layout) - for i in range(rank): - self.assertTrue(shrunk[i] >= 1) - self.assertTrue(shrunk[i] <= 1000 * (i + 1)) - num_bytes = getChunkSize(shrunk, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - layout = (300, 200, 100) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes > CHUNK_MAX) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - rank = len(layout) - for i in range(rank): - self.assertTrue(shrunk[i] >= 1) - self.assertTrue(shrunk[i] <= 1000 * (3 - i)) - num_bytes = getChunkSize(shrunk, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - CHUNK_MIN = 1 * 1024 * 1024 - CHUNK_MAX = 4 * 1024 * 1024 - typesize = 4 - layout = (117, 201, 189, 1) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes > 
CHUNK_MAX) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - self.assertEqual(shrunk, (59, 101, 95, 1)) - num_bytes = getChunkSize(shrunk, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - def testExpandChunk(self): - CHUNK_MIN = 5000 - CHUNK_MAX = 50000 - - typesize = 20 - shape = {"class": "H5S_SIMPLE", "dims": [12, ], "maxdims": [20, ]} - layout = (20,) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - # chunk layout can't be larger than dataspace - self.assertTrue(num_bytes < CHUNK_MIN) - self.assertEqual(expanded, (20,)) - - typesize = 1 - shape = {"class": "H5S_SIMPLE", "dims": [10, 10, 10]} - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - # chunk layout can't be larger than dataspace - self.assertTrue(num_bytes < CHUNK_MIN) - self.assertEqual(expanded, (10, 10, 10)) - - shape = {"class": "H5S_SIMPLE", "dims": [1000, 2000, 3000]} - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = {"class": "H5S_SIMPLE", "dims": [1000,]} - layout = (10,) - num_bytes = getChunkSize(layout, "H5T_VARIABLE") - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, "H5T_VARIABLE", shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, "H5T_VARIABLE") - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = { - "class": "H5S_SIMPLE", - "dims": [1000, 10, 1000], - "maxdims": [1000, 100, 1000], - } - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = { - "class": "H5S_SIMPLE", - "dims": [1000, 0, 1000], - "maxdims": [1000, 100, 1000], - } - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = { - "class": "H5S_SIMPLE", - "dims": [1000, 10, 1000], - "maxdims": [1000, 0, 1000], - } - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - def testGetContiguousLayout(self): - typesize = 4 - chunk_min = 400 - chunk_max = 800 - - kwargs = {"chunk_min": chunk_min, "chunk_max": chunk_max} - - def get_num_bytes(dims): - num_bytes = typesize - for n in dims: - num_bytes *= n - return num_bytes - - try: - shape = {"class": "H5S_SIMPLE", "dims": [100, 100]} - layout = getContiguousLayout(shape, "H5T_VARIABLE", **kwargs) - 
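The expected shrink result asserted above, (117, 201, 189, 1) reduced to (59, 101, 95, 1), comes from halving each extent with a ceiling so that odd extents round up and never collapse to zero. The -(-x // 2) idiom used by shrinkChunk does this with pure integer math:

    def halve_ceil(x):
        # ceiling division by 2: negate, floor-divide, negate again
        return -(-x // 2)

    assert halve_ceil(117) == 59
    assert halve_ceil(201) == 101
    assert halve_ceil(189) == 95
    assert halve_ceil(1) == 1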
self.assertTrue(False) - except ValueError: - pass # expected - - shape = {"class": "H5S_NULL"} - layout = getContiguousLayout(shape, typesize, **kwargs) - self.assertTrue(layout is None) - - shape = {"class": "H5S_SCALAR"} - layout = getContiguousLayout(shape, typesize, **kwargs) - self.assertEqual(layout, (1,)) - - for extent in (1, 100, 10000): - dims = [ - extent, - ] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout(shape, typesize, **kwargs) - self.assertTrue(len(layout), 1) - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - - self.assertTrue(chunk_bytes <= chunk_max) - - for extent in (1, 9, 90): - dims = [extent, extent] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout(shape, typesize, **kwargs) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= extent) - self.assertEqual(layout[1], extent) - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - self.assertTrue(chunk_bytes <= chunk_max) - - for extent in (1, 10, 100): - dims = [extent, extent, 50] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout(shape, typesize, **kwargs) - self.assertTrue(len(layout), 3) - for i in range(3): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= dims[i]) - - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - self.assertTrue(chunk_bytes <= chunk_max) - - for extent in (1, 100, 1000): - dims = [extent, 4] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout(shape, typesize, **kwargs) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= dims[i]) - - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - self.assertTrue(chunk_bytes <= chunk_max) - def testGetHyperslabSelection(self): # getHyperslabSelection(dsetshape, start, stop, step) # 1-D case From 61d38fd9d1229ddffc2b25c358fcb046534215df Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 13 Nov 2025 16:56:52 +0100 Subject: [PATCH 42/49] update requirement.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 5aa9d39b..7dfad721 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ aiofiles azure-storage-blob cryptography h5py>=3.6.0 +hdf5-json>1.0.0 numcodecs numpy>=2.0.0rc1 psutil From 73d822313093edb3658f616b1d98718e2cc6300b Mon Sep 17 00:00:00 2001 From: Joshua Stillerman Date: Tue, 16 Dec 2025 08:15:49 +0000 Subject: [PATCH 43/49] updates to support h5json latest --- hsds/chunk_crawl.py | 4 +- hsds/chunk_sn.py | 14 ++- hsds/datanode_lib.py | 5 +- hsds/domain_crawl.py | 7 +- hsds/dset_sn.py | 14 +-- hsds/servicenode_lib.py | 222 ++++++++++++---------------------- hsds/util/chunkUtil.py | 26 +++- hsds/util/dsetUtil.py | 36 ++++-- hsds/util/storUtil.py | 2 +- tests/integ/dataset_test.py | 101 ++++++++++------ tests/integ/filter_test.py | 23 +++- tests/integ/link_test.py | 1 - tests/integ/pointsel_test.py | 6 +- tests/integ/value_test.py | 7 +- tests/unit/chunk_util_test.py | 33 +++++ tests/unit/dset_util_test.py | 34 +++++- 16 
files changed, 302 insertions(+), 233 deletions(-) diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index a92bdf36..142577d8 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -87,9 +87,7 @@ async def write_chunk_hyperslab( msg += f"bucket: {bucket}" msg += f" dset_json: {dset_json}" log.info(msg) - if "layout" not in dset_json: - log.error(f"No layout found in dset_json: {dset_json}") - raise HTTPInternalServerError() + partition_chunk_id = getChunkIdForPartition(chunk_id, dset_json) if partition_chunk_id != chunk_id: log.debug(f"using partition_chunk_id: {partition_chunk_id}") diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index a1cd3d06..f278e4e9 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -201,7 +201,7 @@ def _getSelect(params, dset_json, body=None): """ return selection region if any as a list of slices. """ slices = None - log.debug(f"_getSelect params: {params} body: {body}") + log.debug(f"_getSelect params: {dict(params)} body: {body}") try: if body and isinstance(body, dict): if "select" in body and body["select"]: @@ -214,6 +214,7 @@ def _getSelect(params, dset_json, body=None): if slices: msg = "select defined in both request body and query parameters" raise ValueError(msg) + log.debug(f"_getSelect - select param: {select}") slices = get_slices(select, dset_json) except ValueError as ve: log.warn(f"Invalid selection: {ve}") @@ -226,12 +227,17 @@ def _getSelect(params, dset_json, body=None): if not slices: # just return the entire dataspace + log.debug("_getSelect - no selection, using entire dataspace") datashape = dset_json["shape"] dims = getShapeDims(datashape) slices = [] - for dim in dims: - s = slice(0, dim, 1) - slices.append(s) + if dims: + for dim in dims: + s = slice(0, dim, 1) + slices.append(s) + else: + # scalar dataset + slices.append(slice(0, 1, 1)) log.debug(f"_getSelect returning: {slices}") return slices diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py index 9bd2b0a5..a6adbe28 100644 --- a/hsds/datanode_lib.py +++ b/hsds/datanode_lib.py @@ -27,7 +27,7 @@ from h5json.objid import isValidChunkId, isSchema2Id from h5json.objid import getRootObjId, isRootObjId from h5json.shape_util import getShapeDims -from h5json.dset_util import getChunkDims +from h5json.dset_util import getChunkDims, getDatasetLayoutClass from .util.nodeUtil import getDataNodeUrl from .util.storUtil import getStorJSONObj, putStorJSONObj, putStorBytes @@ -1060,8 +1060,7 @@ async def get_chunk( dims = getChunkDims(dset_json) type_json = dset_json["type"] dt = createDataType(type_json) - layout_json = dset_json["layout"] - layout_class = layout_json.get("class") + layout_class = getDatasetLayoutClass(dset_json) chunk_dims = getChunkDims(dset_json) fill_value = getFillValue(dset_json) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 19eee5df..d9285d45 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -433,10 +433,9 @@ async def put_links(self, grp_id, link_items): async def put_data(self, chunk_id, arr): # write a one-chunk dataset value - log.debug(f"DomainCrawler put_data for {chunk_id}, arr: {arr}") + log.debug(f"DomainCrawler put_data for {chunk_id}, arr.shape: {arr.shape}") req = getDataNodeUrl(self._app, chunk_id) req += "/chunks/" + chunk_id - log.debug(f"put_data req: {req}") params = {"bucket": self._bucket} data = arrayToBytes(arr) @@ -600,7 +599,9 @@ async def fetch(self, obj_id): log.error(f"couldn't find {obj_id} in self._objs") return data = self._objs[obj_id] - log.debug(f"got {len(data)} data for {obj_id}") + if data 
is None: + log.error(f"no data found for {obj_id}") + return await self.put_data(obj_id, data) else: diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 7a05999f..e314e22c 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -26,7 +26,7 @@ from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse, getBooleanParam from .util.chunkUtil import getChunkIds -from .util.dsetUtil import getPreviewQuery +from .util.dsetUtil import getPreviewQuery, getHyperslabSelection from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain @@ -476,7 +476,7 @@ async def POST_Dataset(request): log.warn(msg) raise HTTPBadRequest(reason=msg) - log.debug(f"got body: {body}") + log.debug(f"POST_Dataset got body: {body}") # get domain, check authorization domain = getDomainFromRequest(request) if not isValidDomain(domain): @@ -507,6 +507,7 @@ def _updateInitValuesList(kwargs): # to init_values list if "value" in kwargs: init_values.append(kwargs["value"]) + log.debug(f"init value appended: {kwargs['value']}") del kwargs["value"] else: # add a placeholder @@ -613,12 +614,13 @@ def _updateInitValuesList(kwargs): init_data = init_values[index] if init_data is None: continue # no data to initialize + log.debug(f"init data: {init_data}") dset_json = objects[index] dset_id = dset_json["id"] log.debug(f"init value, post_rsp: {dset_json}") layout_class = getDatasetLayoutClass(dset_json) log.debug(f"layout_class: {layout_class}") - if layout_class != "H5D_CHUNKED": + if layout_class not in ("H5D_CONTIGUOUS", "H5D_CHUNKED"): msg = f"dataset init_data used with unsupported layout_class: {layout_class}" log.error(msg) raise HTTPInternalServerError() @@ -626,10 +628,8 @@ def _updateInitValuesList(kwargs): log.debug(f"init data layout is: {layout_dims}") # make selection for entire dataspace dims = getShapeDims(dset_json["shape"]) - slices = [] - for dim in dims: - s = slice(0, dim, 1) - slices.append(s) + slices = getHyperslabSelection(dims) + chunk_ids = getChunkIds(dset_id, slices, layout_dims) log.debug(f"init data, got chunk_ids: {chunk_ids}") if not chunk_ids or len(chunk_ids) != 1: diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index b26ba8ee..b9a179c7 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -26,15 +26,14 @@ from aiohttp import ClientResponseError from h5json.array_util import encodeData, decodeData, bytesToArray, bytesArrayToList -from h5json.array_util import jsonToArray, getNumpyValue +from h5json.array_util import jsonToArray from h5json.objid import getCollectionForId, createObjId, getRootObjId from h5json.objid import isSchema2Id, getS3Key, isValidUuid from h5json.hdf5dtype import getBaseTypeJson, validateTypeItem, createDataType, getItemSize -from h5json.filters import getFiltersJson from h5json.shape_util import getShapeDims, getShapeClass -from h5json.dset_util import guessChunk, getChunkSize -from h5json.dset_util import validateChunkLayout, getDataSize, getDsetMaxDims -from h5json.dset_util import LAYOUT_CLASSES +from h5json.filters import getFiltersJson +from h5json.dset_util import guessChunk, getChunkSize, validateDatasetCreationProps +from h5json.dset_util import getDataSize, isExtensible from .util.nodeUtil import getDataNodeUrl from .util.authUtil import getAclKeys @@ -1302,63 +1301,6 @@ async def deleteObject(app, obj_id, bucket=None): del meta_cache[obj_id] # remove from cache -def 
validateDatasetCreationProps(creation_props, type_json=None, shape=None): - """ validate creation props """ - - log.debug(f"validateDatasetCreationProps: {creation_props}") - if "fillValue" in creation_props: - if not type_json or not shape: - msg = "shape and type must be set to use fillValue" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - # validate fill value compatible with type - dt = createDataType(type_json) - fill_value = creation_props["fillValue"] - log.debug(f"got fill_value: {fill_value}") - if "fillValue_encoding" in creation_props: - fill_value_encoding = creation_props["fillValue_encoding"] - if fill_value_encoding not in ("None", "base64"): - msg = f"unexpected value for fill_value_encoding: {fill_value_encoding}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - # should see a string in this case - if not isinstance(fill_value, str): - msg = f"unexpected fill value: {fill_value} " - msg += f"for encoding: {fill_value_encoding}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - fill_value_encoding = None - - try: - getNumpyValue(fill_value, dt=dt, encoding=fill_value_encoding) - except ValueError: - msg = f"invalid fill value: {fill_value}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if "filters" in creation_props: - if not type_json or not shape: - msg = "shape and type must be set to use filters" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - supported_filters = getSupportedFilters(include_compressors=True) - log.debug(f"supported_filters: {supported_filters}") - try: - filters_out = getFiltersJson(creation_props, supported_filters=supported_filters) - except (KeyError, ValueError): - # raise bad request exception if not valid - msg = "invalid filter provided" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - # replace filters with our standardized list - log.debug(f"setting filters to: {filters_out}") - creation_props["filters"] = filters_out - - def getCreateArgs(body, root_id=None, bucket=None, @@ -1544,10 +1486,18 @@ def getDatasetCreateArgs(body, try: shape_class = getShapeClass(shape_json) shape_dims = getShapeDims(shape_json) + if "maxdims" in shape_json: + max_dims = shape_json["maxdims"] + is_extensible = isExtensible(shape_dims, max_dims) + else: + max_dims = None + is_extensible = False except (KeyError, TypeError, ValueError): msg = f"Invalid shape: {shape_json}" log.warn(msg) raise HTTPBadRequest(reason=msg) + + log.debug(f"shape_class: {shape_class}, shape_dims: {shape_dims}") log.debug(f"got createArgs: {list(kwargs.keys())}") @@ -1555,58 +1505,73 @@ def getDatasetCreateArgs(body, # get layout for dataset creation log.debug("getting dataset creation settings") - layout_props = None min_chunk_size = int(config.get("min_chunk_size")) max_chunk_size = int(config.get("max_chunk_size")) type_json = kwargs["type"] + item_size = getItemSize(type_json) if item_size == "H5T_VARIABLE": item_size = config.get("default_vlen_type_size", default=128) - creation_props = kwargs["creation_props"] - layout_props = None - - if creation_props: - validateDatasetCreationProps(creation_props, type_json=type_json, shape=shape_json) - if "layout" in creation_props: - layout_props = creation_props["layout"] - try: - validateChunkLayout(shape_json, item_size, layout_props, chunk_table=chunk_table) - except ValueError: - msg = f"invalid chunk layout: {layout_props}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + if shape_dims is None: + dset_size = 0 else: - creation_props = {} + dset_size = getDataSize(shape_dims, item_size) + 
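When the request does not specify an explicit layout, the selection logic that follows boils down to: null shapes get no layout, scalar and small non-extensible simple datasets become contiguous, and everything else is chunked. A condensed sketch of that branching (the function name is illustrative, not part of the patch):

    def pick_layout_class(shape_class, dset_size, is_extensible, min_chunk_size):
        if shape_class == "H5S_NULL":
            return None                 # no storage layout at all
        if shape_class == "H5S_SCALAR":
            return "H5D_CONTIGUOUS"
        if dset_size <= min_chunk_size and not is_extensible:
            return "H5D_CONTIGUOUS"     # small fixed-size dataset
        return "H5D_CHUNKED"

    assert pick_layout_class("H5S_SIMPLE", 500, False, 1024 * 1024) == "H5D_CONTIGUOUS"
    assert pick_layout_class("H5S_SIMPLE", 500, True, 1024 * 1024) == "H5D_CHUNKED"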
creation_props = kwargs["creation_props"] layout_class = None + layout_json = {} chunk_dims = None - if layout_props: - layout_class = layout_props.get("class") + partition_count = None - if layout_class: - if layout_class not in LAYOUT_CLASSES: - msg = f"unknown layout_class: {layout_class}" + if creation_props: + log.debug(f"POST_Dataset creation props: {creation_props}") + try: + validateDatasetCreationProps(creation_props, type_json=type_json, shape=shape_json) + except ValueError as ve: + msg = f"Provided creation properties are invalid: {ve}" log.warn(msg) raise HTTPBadRequest(reason=msg) - # check dims is defined for any chunked layout - if layout_class.startswith("H5D_CHUNKED"): - if "dims" not in layout_props: - msg = "chunked layout specified without dims" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - chunk_dims = layout_props["dims"] - if len(chunk_dims) != len(shape_dims): - msg = "chunk dimensions have different rank than dataset" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_CONTIGUOUS_REF" and getItemSize(type_json) == "H5T_VARIABLE": + log.debug(f"create_props after validation: {creation_props}") + if creation_props.get("layout"): + layout_json = creation_props["layout"] + layout_class = layout_json.get("class") + if "filters" in creation_props: + # normalize filter format + filters = getFiltersJson(creation_props) + supported_filters = getSupportedFilters() + log.debug(f"supported filters: {supported_filters}") + for filter_item in filters: + if filter_item["name"] not in supported_filters: + msg = f"Unsupported filter id: {filter_item['id']}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + creation_props["filters"] = filters + log.debug(f"post validate creation properties: {creation_props}") + + if layout_class: + if layout_class == "H5D_CONTIGUOUS_REF" and getItemSize(type_json) == "H5T_VARIABLE": # ref dataset does not work with vlen type - msg = "H5D_CONTIGUOUS_REF cannot be used with variable length types" + msg = "H5D_CONTIGUOUS_REF datasets cannot be used with variable length types" log.warn(msg) raise HTTPBadRequest(reason=msg) + + if "dims" in layout_json: + chunk_dims = layout_json["dims"] + if chunk_dims: + # log a warning if the chunk size is too small or too big + chunk_size = getChunkSize(chunk_dims, item_size) + if chunk_size < min_chunk_size: + msg = f"chunk size: {chunk_size} less than recommended min size: {min_chunk_size}" + log.warn(msg) + elif chunk_size > max_chunk_size: + msg = f"chunk size: {chunk_size} greater than recommended " + msg += f"max size: {max_chunk_size}" + log.debug(msg) else: - pass - + # log a warning if a contiguous layout is used with too large a dataset + if dset_size > max_chunk_size: + msg = f"dataset larger than recommended {max_chunk_size} for CONTIGUOUS storage" + log.warn(msg) elif shape_class == "H5S_NULL": layout_class = None log.debug("using None layout for H5S_NULL dataset") @@ -1614,8 +1579,7 @@ def getDatasetCreateArgs(body, layout_class = "H5D_CONTIGUOUS" log.debug("Using H5D_CONTIGUOUS for H5S_SCALAR dataset") elif shape_class == "H5S_SIMPLE": - dset_size = getDataSize(shape_dims, item_size) - if dset_size <= min_chunk_size: + if dset_size <= min_chunk_size and not is_extensible: # default to contiguous layout_class = "H5D_CONTIGUOUS" log.debug(f"Using H5D_CONTIGUOUS for small (<{min_chunk_size}) dataset") @@ -1625,42 +1589,25 @@ log.debug(f"item_size: {item_size}") log.debug(f"chunk_min: {min_chunk_size}") log.debug(f"chunk_max: 
{max_chunk_size}") - kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} - chunk_dims = guessChunk(shape_json, item_size, **kwargs) + args = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} + chunk_dims = guessChunk(shape_json, item_size, **args) log.debug(f"initial autochunk layout: {chunk_dims}") chunk_size = getChunkSize(chunk_dims, item_size) - - # log warning if the chunk shape if chunk size is too small or too big - if chunk_size < min_chunk_size: - msg = f"chunk size: {chunk_size} less than recommended min size: {min_chunk_size}" - log.warn(msg) - elif chunk_size > max_chunk_size: - msg = f"chunk size: {chunk_size} greater than recommended " - msg += f"max size: {max_chunk_size}" - log.debug(msg) + log.debug(f"chunk_size: {chunk_size}") else: msg = f"unexpected shape_class: {shape_class}" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not layout_props: - layout_props = {"class": layout_class} - if chunk_dims: - layout_props["dims"] = chunk_dims - log.debug(f"using dataset layout: {layout_props}") - creation_props["layout"] = layout_props - # set partition_count if needed: max_chunks_per_folder = int(config.get("max_chunks_per_folder")) set_partition = False if max_chunks_per_folder > 0: - if "dims" in layout_props: + if "dims" in layout_json: set_partition = True - if set_partition: + if set_partition and dset_size > max_chunk_size: log.debug(f"updating layout for partition constraint: {max_chunks_per_folder}") - shape_dims = getShapeDims(shape_json) - max_dims = getDsetMaxDims(shape_json) num_chunks = 1 rank = len(shape_dims) @@ -1692,33 +1639,22 @@ def getDatasetCreateArgs(body, msg = f"set partition count to: {partition_count}, " msg += f"num_chunks: {num_chunks}" log.info(msg) - layout_props["partition_count"] = partition_count else: msg = "do not need chunk partitions, num_chunks: " msg += f"{num_chunks} max_chunks_per_folder: " msg += f"{max_chunks_per_folder}" log.info(msg) - if layout_class in ("H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT"): - chunk_size = getChunkSize(chunk_dims, item_size) - - msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " - msg += f"max: {max_chunk_size}" - log.debug(msg) - # nothing to do about inefficiently small chunks, but large chunks - # can be subdivided - if chunk_size < min_chunk_size: - msg = f"chunk size: {chunk_size} less than min size: " - msg += f"{min_chunk_size} for {layout_class} dataset" - log.warn(msg) - elif chunk_size > max_chunk_size: - msg = f"chunk size: {chunk_size} greater than max size: " - msg += f"{max_chunk_size}, for {layout_class} dataset" - log.warn(msg) - layout_props["dims"] = chunk_dims - - creation_props["layout"] = layout_props + if layout_class: + # should be set if shape is not H5S_NULL + if "class" not in layout_json: + layout_json["class"] = layout_class + if chunk_dims: + layout_json["dims"] = chunk_dims + log.debug(f"using dataset layout: {layout_json}") + creation_props["layout"] = layout_json kwargs["creation_props"] = creation_props + log.debug(f"updated creation props: {creation_props}") # # get input data if present @@ -1733,9 +1669,7 @@ def getDatasetCreateArgs(body, input_data = body["value"] msg = "input data doesn't match request type and shape" dims = getShapeDims(shape_json) - if not dims: - log.warn(msg) - raise HTTPBadRequest(reason=msg) + arr_dtype = createDataType(type_json) try: diff --git a/hsds/util/chunkUtil.py b/hsds/util/chunkUtil.py index 9dd51bf9..7d299d9a 100644 --- a/hsds/util/chunkUtil.py +++ b/hsds/util/chunkUtil.py @@ -1,6 +1,7 @@ import 
numpy as np from h5json.array_util import ndarray_compare +from h5json.dset_util import getDatasetLayout from .. import hsds_logger as log @@ -37,7 +38,16 @@ def getNumChunks(selection, layout): If selection is provided (a list of slices), return the number of chunks that intersect with the selection. """ + print(f"getNumChunks: {selection}, layout: {layout}") + if len(selection) == 0: + print("zero length selection") + return 0 + rank = len(layout) + if rank == 1 and layout[0] == 1: + # scalar dataset + print("scalar dset") + return 1 if len(selection) != rank: msg = f"selection list has {len(selection)} items, but rank is {rank}" raise ValueError(msg) @@ -47,10 +57,12 @@ def getNumChunks(selection, layout): if isinstance(s, slice): if s.stop <= s.start: log.debug("null selection") + print("null selection") return 0 else: # coordinate list if len(s) == 0: + print("null coordinate list") return 0 # first, get the number of chunks needed for any coordinate selection chunk_indices = [] @@ -80,6 +92,8 @@ def getNumChunks(selection, layout): else: num_chunks = 1 + print("num_chunks:", num_chunks) + # now deal with any slices in the selection for i in range(len(selection)): s = selection[i] @@ -207,11 +221,8 @@ def getPartitionKey(chunk_id, partition_count): def getChunkIdForPartition(chunk_id, dset_json): """Return the partition specific chunk id for given chunk""" - if "layout" not in dset_json: - msg = "No layout found in dset_json" - log.error(msg) - raise KeyError(msg) - layout_json = dset_json["layout"] + + layout_json = getDatasetLayout(dset_json) if "partition_count" in layout_json: partition_count = layout_json["partition_count"] partition = getChunkPartition(chunk_id) @@ -250,7 +261,12 @@ def chunk_id_to_index(chunk_id): indices.append(x) return indices + log.debug(f"getChunkIds - dset_id: {dset_id}, selection: {selection}, layout: {layout}") + if prefix: + log.debug(f"prefix: {prefix}") + num_chunks = getNumChunks(selection, layout) + log.debug(f"getChunkIds - num_chunks: {num_chunks}") if num_chunks == 0: return [] # empty list if prefix is None: diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index 3704822d..49723750 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -17,7 +17,7 @@ from .. 
import hsds_logger as log - + def getShapeJson(body): """ Return normalized json description of data space """ @@ -27,11 +27,11 @@ if "shape" not in body: shape_class = "H5S_SCALAR" - log.debug("not shape given - using H5S_SCALAR") + log.debug("getShapeJson - no shape given, using H5S_SCALAR") return {"class": shape_class} body_shape = body["shape"] - log.debug(f"got shape: {body_shape}") + log.debug(f"getShapeJson - got shape: {body_shape}") if isinstance(body_shape, int): shape_class = "H5S_SIMPLE" @@ -112,7 +112,7 @@ def getShapeJson(body): shape_json["dims"] = dims if maxdims: shape_json["maxdims"] = maxdims - log.debug(f"returning shape_json: {shape_json}") + log.debug(f"getShapeJson - returning shape_json: {shape_json}") return shape_json @@ -123,6 +123,11 @@ def getHyperslabSelection(dims, start=None, stop=None, step=None): TBD: for step>1, adjust the slice to not extend beyond last data point returned """ + + if len(dims) == 0: + # scalar dataset + dims = (1,) + rank = len(dims) if start: if not isinstance(start, (list, tuple)): @@ -494,20 +499,25 @@ def get_slices(select, dset_json): dset_id = dset_json["id"] datashape = dset_json["shape"] - if datashape["class"] == "H5S_NULL": + shape_class = datashape["class"] + if shape_class == "H5S_NULL": msg = "Null space datasets can not be used as target for GET value" log.warn(msg) raise HTTPBadRequest(reason=msg) - dims = getShapeDims(datashape) # throws 400 for HS_NULL dsets + if shape_class == "H5S_SCALAR": + # return single slice + slices = [slice(0, 1, 1), ] + else: + dims = getShapeDims(datashape) # throws 400 for H5S_NULL dsets - try: - slices = getSelectionList(select, dims) - except ValueError: - msg = f"Invalid selection: {select} on dims: {dims} " - msg += f"for dataset: {dset_id}" - log.warn(msg) - raise + try: + slices = getSelectionList(select, dims) + except ValueError: + msg = f"Invalid selection: {select} on dims: {dims} " + msg += f"for dataset: {dset_id}" + log.warn(msg) + raise return slices diff --git a/hsds/util/storUtil.py b/hsds/util/storUtil.py index 3bbb073c..b37e25bc 100644 --- a/hsds/util/storUtil.py +++ b/hsds/util/storUtil.py @@ -69,7 +69,7 @@ def getCompressors(): def getSupportedFilters(include_compressors=True): """return list of other supported filters""" filters = [ - # "bitshuffle", + "bitshuffle", "shuffle", "fletcher32", "nbit", # No-op diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index e8d4feb3..5979e042 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -785,10 +785,19 @@ def testResizableDataset(self): # create the dataset req = self.endpoint + "/datasets" payload = {"type": "H5T_IEEE_F32LE", "shape": 10, "maxdims": 20} - payload["creationProperties"] = {"fillValue": 3.12} + contiguous_layout = {"class": "H5D_CONTIGUOUS"} + cpl = {"fillValue": 3.12, "layout": contiguous_layout} + payload["creationProperties"] = cpl + req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) # create dataset + self.assertEqual(rsp.status_code, 400) # need chunk layout for resizable + + # if we remove the layout, HSDS will set up a chunked layout for us + del cpl["layout"] + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) dset_uuid = rspJson["id"] self.assertTrue(helper.validateId(dset_uuid)) @@ -816,8 +825,16 @@ def testResizableDataset(self): 
self.assertTrue("maxdims" in shape) self.assertEqual(shape["maxdims"][0], 20) + self.assertTrue("creationProperties" in rspJson) creationProps = rspJson["creationProperties"] + self.assertTrue("fillValue" in creationProps) self.assertEqual(creationProps["fillValue"], 3.12) + self.assertTrue("layout" in creationProps) + layout = creationProps["layout"] + self.assertTrue("class" in layout) + self.assertEqual(layout["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout) + self.assertEqual(len(layout["dims"]), 1) # verify shape using the GET shape request req = req + "/shape" @@ -1271,8 +1288,11 @@ def testInvalidDatasetLayout(self): # dataset create req = self.endpoint + "/datasets" - dims = [365, 780, 1024] - maxdims = [0, 780, 1024] + # TBD: the larger dimensions are causing SN to crash + # dims = [365, 780, 1024] + dims = [365, 780, 10] + # maxdims = [0, 780, 1024] + maxdims = [0, 780, 10] payload = { "type": "H5T_IEEE_F32LE", "shape": dims, @@ -1455,7 +1475,7 @@ def testCreationPropertiesContiguousDataset(self): req = self.endpoint + "/datasets" # Create ~1GB dataset - layout = {"class": "H5D_CONTIGUOUS"} + layout = {"class": "H5D_CHUNKED", "dims": [10, 20]} gzip_filter = { "class": "H5Z_FILTER_DEFLATE", "id": 1, @@ -1493,8 +1513,9 @@ def testCreationPropertiesContiguousDataset(self): self.assertTrue("layout" in cpl) layout_json = cpl["layout"] self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CONTIGUOUS") - self.assertFalse("dims" in layout_json) + self.assertEqual(layout_json["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout_json) + self.assertEqual(layout_json["dims"], [10, 20]) def testCompressionFiltersDataset(self): # test Dataset with creation property list @@ -1519,11 +1540,10 @@ def testCompressionFiltersDataset(self): req = self.endpoint + "/datasets" payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]} - payload["creationProperties"] = { - "filters": [ - compressor, - ] - } + filters = [compressor, ] + layout = {"class": "H5D_CHUNKED", "dims": [10, 20]} + cpl = {"filters": filters, "layout": layout} + payload["creationProperties"] = cpl req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -1546,7 +1566,7 @@ def testCompressionFiltersDataset(self): self.assertTrue("layout" in cpl) layout_json = cpl["layout"] self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CONTIGUOUS") + self.assertEqual(layout_json["class"], "H5D_CHUNKED") # verify compression self.assertTrue("creationProperties" in rspJson) @@ -1580,14 +1600,14 @@ def testCompressionFilterOptionDataset(self): # create the dataset req = self.endpoint + "/datasets" - compressor = {"class": "H5Z_FILTER_USER", "name": "lz4", "level": 5} + compressor = {"class": "H5Z_FILTER_LZ4", "name": "lz4", "level": 5} + filters = [compressor, ] payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]} - payload["creationProperties"] = { - "filters": [ - compressor, - ] - } + layout = {"class": "H5D_CHUNKED", "dims": [10, 20]} + cpl = {"filters": filters, "layout": layout} + payload["creationProperties"] = cpl + req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -1610,8 +1630,8 @@ def testCompressionFilterOptionDataset(self): self.assertTrue("layout" in cpl) layout_json = cpl["layout"] self.assertTrue("class" in layout_json) - 
self.assertEqual(layout_json["class"], "H5D_CONTIGUOUS") - self.assertFalse("dims" in layout_json) + self.assertEqual(layout_json["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout_json) # verify compression self.assertTrue("creationProperties" in rspJson) @@ -1622,7 +1642,7 @@ def testCompressionFilterOptionDataset(self): filter = filters[0] self.assertTrue(isinstance(filter, dict)) self.assertTrue("class" in filter) - self.assertEqual(filter["class"], "H5Z_FILTER_USER") + self.assertEqual(filter["class"], "H5Z_FILTER_LZ4") self.assertTrue("id" in filter) self.assertTrue("name" in filter) self.assertEqual(filter["name"], "lz4") @@ -1852,16 +1872,11 @@ def testAutoChunk1dDataset(self): {"name": "z", "type": "H5T_IEEE_F64LE"}, ) datatype = {"class": "H5T_COMPOUND", "fields": fields} + item_size = 12 # 3 fields of 4 bytes each + cpl = {"fillValue": 3.12} # no layout given + + payload = {"type": datatype, "shape": dims, "creationProperties": cpl} - payload = {"type": datatype, "shape": dims} - # the following specifies an efficiently small chunk size - chunk_dims = [10,] - payload["creationProperties"] = { - "layout": { - "class": "H5D_CHUNKED", - "dims": chunk_dims - } - } req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -1884,12 +1899,23 @@ def testAutoChunk1dDataset(self): rspJson = json.loads(rsp.text) self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] + self.assertTrue("fillValue" in cpl) self.assertTrue("layout" in cpl) layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) - self.assertEqual(layout_json["dims"], chunk_dims) + chunk_dims = layout_json["dims"] + self.assertEqual(len(chunk_dims), 1) + self.assertTrue(chunk_dims[0] < dims[0]) + + chunk_size = chunk_dims[0] * item_size + + # chunk size will be based on server config, but assume a min/max of 1MB to 1GB + CHUNK_MIN = 1024 * 1024 + CHUNK_MAX = 1024 * 1024 * 1024 + self.assertTrue(chunk_size >= CHUNK_MIN) + self.assertTrue(chunk_size <= CHUNK_MAX) def testAutoChunk2dDataset(self): # test Dataset where chunk layout is set automatically @@ -1908,6 +1934,7 @@ def testAutoChunk2dDataset(self): req = self.endpoint + "/datasets" # 50K x 80K dataset dims = [50000, 80000] + item_size = 4 # 4 bytes per float32 payload = {"type": "H5T_IEEE_F32LE", "shape": dims} req = self.endpoint + "/datasets" @@ -1937,11 +1964,11 @@ def testAutoChunk2dDataset(self): self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) - layout = layout_json["dims"] - self.assertEqual(len(layout), 2) - self.assertTrue(layout[0] < dims[0]) - self.assertTrue(layout[1] < dims[1]) - chunk_size = layout[0] * layout[1] * 4 + chunk_dims = layout_json["dims"] + self.assertEqual(len(chunk_dims), 2) + self.assertTrue(chunk_dims[0] < dims[0]) + self.assertTrue(chunk_dims[1] < dims[1]) + chunk_size = chunk_dims[0] * chunk_dims[1] * item_size # chunk size will be based on server config, but assume a min/max of 1MB to 1GB CHUNK_MIN = 1024 * 1024 diff --git a/tests/integ/filter_test.py b/tests/integ/filter_test.py index ea2df637..0cd7fdb5 100755 --- a/tests/integ/filter_test.py +++ b/tests/integ/filter_test.py @@ -58,6 +58,8 @@ def testDeflateCompression(self): # Create ~1MB dataset payload = {"type": "H5T_STD_I8LE", "shape": [1024, 1024]} + # 
use a chunked layout for compression + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} # define deflate compression gzip_filter = { "class": "H5Z_FILTER_DEFLATE", @@ -65,7 +67,7 @@ def testDeflateCompression(self): "level": 9, "name": "deflate", } - payload["creationProperties"] = {"filters": [gzip_filter]} + payload["creationProperties"] = {"layout": layout, "filters": [gzip_filter]} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -120,7 +122,9 @@ def testShuffleFilter(self): payload = {"type": "H5T_STD_I32LE", "shape": [1024, 1024]} # define sshufle compression shuffle_filter = {"class": "H5Z_FILTER_SHUFFLE", "id": 2, "name": "shuffle"} - payload["creationProperties"] = {"filters": [shuffle_filter]} + # use chunked layout for compression + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} + payload["creationProperties"] = {"filters": [shuffle_filter], "layout": layout} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -182,7 +186,11 @@ def testShuffleAndDeflate(self): } # and shuffle compression shuffle_filter = {"class": "H5Z_FILTER_SHUFFLE", "id": 2, "name": "shuffle"} - payload["creationProperties"] = {"filters": [shuffle_filter, gzip_filter]} + filters = [shuffle_filter, gzip_filter] + # use chunked layout + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} + payload["creationProperties"] = {"layout": layout, "filters": filters} + req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -237,7 +245,9 @@ def testBitShuffle(self): # bit shuffle bitshuffle_filter = {"class": "H5Z_FILTER_BITSHUFFLE", "id": 32008, "name": "bitshuffle"} - payload["creationProperties"] = {"filters": [bitshuffle_filter, ]} + # use chunked layout + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} + payload["creationProperties"] = {"filters": [bitshuffle_filter], "layout": layout} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -299,7 +309,10 @@ def testBitShuffleAndDeflate(self): } # and bit shuffle bitshuffle_filter = {"class": "H5Z_FILTER_BITSHUFFLE", "id": 32008, "name": "bitshuffle"} - payload["creationProperties"] = {"filters": [bitshuffle_filter, gzip_filter]} + filters = [bitshuffle_filter, gzip_filter] + # use chunked layout + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} + payload["creationProperties"] = {"filters": filters, "layout": layout} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index 511bdf4e..244f8f5d 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -1842,7 +1842,6 @@ def testPutLinkMultipleWithTimestamps(self): self.assertTrue(False) # unexpected self.assertTrue("created" in ret_link) self.assertTrue(ret_link["created"] in timestamps) - print(timestamps) def testDeleteLinkMultiple(self): domain = self.base_domain + "/testDeleteLinkMultiple.h5" diff --git a/tests/integ/pointsel_test.py b/tests/integ/pointsel_test.py index 194eb2ce..57949114 100755 --- a/tests/integ/pointsel_test.py +++ b/tests/integ/pointsel_test.py @@ 
-1370,8 +1370,10 @@ def testDatasetChunkPartitioning(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + creation_props = rspJson["creationProperties"] + self.assertTrue("layout" in creation_props) + layout_json = creation_props["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index 293c625d..02a4f990 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -982,7 +982,6 @@ def testScalarDatasetInitData(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -1060,7 +1059,6 @@ def testScalarDatasetInitDataMulti(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -2133,7 +2131,7 @@ def testResizable1DValue(self): # read values from the extended region req = self.endpoint + "/datasets/" + dset_uuid + "/value" - params = {"select": "[{}:{}]".format(0, num_elements)} + params = {"select": f"[0:{num_elements}]"} rsp = self.session.get(req, params=params, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) @@ -3236,7 +3234,7 @@ def testARangeInitializerDataset(self): extent = 1_000_000_000 # one billion elements dset_dims = [extent, ] layout = {"class": "H5D_CHUNKED"} - layout["dims"] = dset_dims + layout["dims"] = [1_000, ] range_start = 0 # -0.25 range_step = 1 @@ -3286,6 +3284,7 @@ def testARangeInitializerDataset(self): def testIntelligentRangeGet1D(self): test_name = "testIntelligentRangeGet1D" + print(test_name, self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) diff --git a/tests/unit/chunk_util_test.py b/tests/unit/chunk_util_test.py index 7ae16bd5..623f26ca 100755 --- a/tests/unit/chunk_util_test.py +++ b/tests/unit/chunk_util_test.py @@ -115,11 +115,44 @@ def testGetNumChunks(self): selection = getHyperslabSelection(datashape, (0, 0), (100, 100), (20, 40)) count = getNumChunks(selection, layout) self.assertEqual(count, 15) + # test with scalar + datashape = () + layout = (1, ) + selection = getHyperslabSelection(datashape, 0, 1) + print("selection:", selection) + count = getNumChunks(selection, layout) + self.assertEqual(count, 1) def testGetChunkIds(self): # getChunkIds(dset_id, selection, layout, dim=0, prefix=None, chunk_ids=None): dset_id = "d-12345678-1234-1234-1234-1234567890ab" + datashape = [] + layout = (1,) + + selection = getHyperslabSelection(datashape, 0, 1) + num_chunks = getNumChunks(selection, layout) + + self.assertEqual(num_chunks, 1) + chunk_ids = getChunkIds(dset_id, selection, layout) + self.assertEqual(len(chunk_ids), 1) + chunk_id = chunk_ids[0] + self.assertTrue(chunk_id.startswith("c-")) + self.assertTrue(chunk_id.endswith("_0")) + self.assertEqual(chunk_id[2:-2], dset_id[2:]) + self.assertEqual(len(chunk_id), 2 + 36 + 2) + self.assertEqual(getDatasetId(chunk_id), dset_id) + + selection = getHyperslabSelection(datashape) + chunk_ids = getChunkIds(dset_id, selection, layout) + self.assertEqual(len(chunk_ids), 1) + chunk_id = chunk_ids[0] + self.assertTrue(chunk_id.startswith("c-")) + self.assertTrue(chunk_id.endswith("_0")) + self.assertEqual(chunk_id[2:-2], dset_id[2:]) + self.assertEqual(len(chunk_id), 2 + 36 + 2) + 
self.assertEqual(getDatasetId(chunk_id), dset_id) + datashape = [1,] layout = (1,) selection = getHyperslabSelection(datashape) diff --git a/tests/unit/dset_util_test.py b/tests/unit/dset_util_test.py index 0e77ab1b..f89690d9 100755 --- a/tests/unit/dset_util_test.py +++ b/tests/unit/dset_util_test.py @@ -14,7 +14,7 @@ import sys sys.path.append("../..") -from hsds.util.dsetUtil import getHyperslabSelection, getSelectionShape +from hsds.util.dsetUtil import getHyperslabSelection, getSelectionShape, get_slices from hsds.util.dsetUtil import getSelectionList, ItemIterator, getSelectionPagination @@ -25,8 +25,40 @@ def __init__(self, *args, **kwargs): self.logger = logging.getLogger() self.logger.setLevel(logging.WARNING) + def testGetSlices(self): + dset_json = {"id": "d-b4b3b3d6-94343adc-1727-28bebf-12caac"} + datashape = {"class": "H5S_SCALAR"} + cprops = {"layout": {"class": "H5D_CONTIGUOUS"}} + dtype_json = {"class": "H5T_INTEGER", "base": "H5T_STD_I32LE"} + dset_json["shape"] = datashape + dset_json["creationProperties"] = cprops + dset_json["type"] = dtype_json + + slices = get_slices("", dset_json) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + + slices = get_slices(None, dset_json) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + def testGetHyperslabSelection(self): # getHyperslabSelection(dsetshape, start, stop, step) + + # Scalar case + datashape = [] + slices = getHyperslabSelection(datashape) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + + slices = getHyperslabSelection(datashape, 0) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + + slices = getHyperslabSelection(datashape, 0, 1) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + # 1-D case datashape = [100,] slices = getHyperslabSelection(datashape) From a2ca1ee159f8ac2b1d8dc7f86a0a0a93c2126300 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 26 Dec 2025 14:25:01 +0800 Subject: [PATCH 44/49] updated for new hdf5-json methods --- hsds/async_lib.py | 9 +- hsds/attr_dn.py | 3 +- hsds/chunk_crawl.py | 11 ++- hsds/chunk_sn.py | 8 +- hsds/chunklocator.py | 6 +- hsds/ctype_dn.py | 3 +- hsds/datanode_lib.py | 3 +- hsds/domain_dn.py | 3 +- hsds/domain_sn.py | 3 +- hsds/dset_dn.py | 2 +- hsds/dset_sn.py | 4 +- hsds/folder_crawl.py | 9 +- hsds/group_dn.py | 2 +- hsds/headnode.py | 13 ++- hsds/hsds_app.py | 18 ++-- hsds/link_dn.py | 2 +- hsds/servicenode.py | 7 +- hsds/servicenode_lib.py | 159 +++++++++++------------------------- hsds/util/chunkUtil.py | 9 +- hsds/util/timeUtil.py | 83 ------------------- pyproject.toml | 2 +- tests/integ/dataset_test.py | 76 ++++++++++------- 22 files changed, 157 insertions(+), 278 deletions(-) delete mode 100755 hsds/util/timeUtil.py diff --git a/hsds/async_lib.py b/hsds/async_lib.py index 9888d3dd..674caa98 100755 --- a/hsds/async_lib.py +++ b/hsds/async_lib.py @@ -23,6 +23,7 @@ from h5json.filters import getFilters from h5json.shape_util import getShapeDims from h5json.dset_util import getDatasetLayoutClass, getDatasetLayout, getChunkDims +from h5json.time_util import getNow from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator from .util.dsetUtil import getHyperslabSelection @@ -31,7 +32,7 @@ from .datanode_lib import getFilterOps from . import hsds_logger as log from . import config -import time + # List all keys under given root and optionally update info.json # Note: only works with schema v2 domains! 
@@ -78,7 +79,7 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None): return type_json = dset_json["type"] item_size = getItemSize(type_json) - if "layout" not in dset_json: + if not getDatasetLayout(dset_json): msg = "updateDatasetInfo - expected to find layout in dataset_json " msg += f"for {dset_id}" log.warn(msg) @@ -387,7 +388,7 @@ async def scanRoot(app, rootid, update=False, bucket=None): results["logical_bytes"] = 0 results["checksums"] = {} # map of objid to checksums results["bucket"] = bucket - results["scan_start"] = time.time() + results["scan_start"] = getNow(app=app) app["scanRoot_results"] = results app["scanRoot_keyset"] = set() @@ -442,7 +443,7 @@ async def scanRoot(app, rootid, update=False, bucket=None): # free up memory used by the checksums del results["checksums"] - results["scan_complete"] = time.time() + results["scan_complete"] = getNow(app=app) if update: # write .info object back to S3 diff --git a/hsds/attr_dn.py b/hsds/attr_dn.py index 43c04232..a660b836 100755 --- a/hsds/attr_dn.py +++ b/hsds/attr_dn.py @@ -22,12 +22,13 @@ from h5json.array_util import arrayToBytes, jsonToArray, decodeData from h5json.array_util import bytesToArray, bytesArrayToList, getNumElements from h5json.shape_util import getShapeDims +from h5json.time_util import getNow from .util.attrUtil import validateAttributeName, isEqualAttr from .util.globparser import globmatch from .util.domainUtil import isValidBucketName from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj -from .util.timeUtil import getNow + from . import config from . import hsds_logger as log diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index 142577d8..38da4d3e 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -15,7 +15,6 @@ # import asyncio -import time import traceback import random from asyncio import CancelledError @@ -29,6 +28,7 @@ from h5json.array_util import getNumElements, arrayToBytes, bytesToArray from h5json.shape_util import getShapeDims from h5json.dset_util import getChunkDims +from h5json.time_util import getNow from .util.nodeUtil import getDataNodeUrl, getNodeCount from .util.httpUtil import http_get, http_put, http_post, get_http_client @@ -87,7 +87,7 @@ async def write_chunk_hyperslab( msg += f"bucket: {bucket}" msg += f" dset_json: {dset_json}" log.info(msg) - + partition_chunk_id = getChunkIdForPartition(chunk_id, dset_json) if partition_chunk_id != chunk_id: log.debug(f"using partition_chunk_id: {partition_chunk_id}") @@ -676,6 +676,9 @@ def __init__( app["cc_clients"] = {} self._clients = app["cc_clients"] + def now(self): + return getNow(app=self._app) + def get_status(self): if len(self._status_map) != len(self._chunk_ids): msg = "get_status code while crawler not complete" @@ -720,7 +723,7 @@ async def work(self): log.info(f"ChunkCrawler - client_name: {client_name}") while True: try: - start = time.time() + start = self.now() chunk_id = await self._q.get() if self._limit > 0 and self._hits >= self._limit: msg = f"ChunkCrawler - maxhits exceeded, skipping fetch for chunk: {chunk_id}" @@ -745,7 +748,7 @@ async def work(self): await self.do_work(chunk_id, client=client) self._q.task_done() - elapsed = time.time() - start + elapsed = self.now() - start msg = f"ChunkCrawler - task {chunk_id} start: {start:.3f} " msg += f"elapsed: {elapsed:.3f}" log.debug(msg) diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index f278e4e9..1cca80f1 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -28,8 +28,8 @@ from h5json.array_util import 
bytesArrayToList, jsonToArray, getNumElements, arrayToBytes from h5json.array_util import bytesToArray, squeezeArray, getBroadcastShape from h5json.objid import isValidUuid -from h5json.shape_util import isNullSpace, isScalar, getShapeDims -from h5json.dset_util import getChunkDims, isExtensible, getDsetMaxDims +from h5json.shape_util import isNullSpace, isScalar, getShapeDims, getMaxDims +from h5json.dset_util import getChunkDims, isExtensible from .util.httpUtil import getHref, getAcceptType, getContentType from .util.httpUtil import request_read, jsonResponse, isAWSLambda @@ -163,8 +163,8 @@ def _getAppendRows(params, dset_json, body=None): datashape = dset_json["shape"] dims = getShapeDims(datashape) rank = len(dims) - maxdims = getDsetMaxDims(dset_json) - if not isExtensible(dims, maxdims): + maxdims = getMaxDims(datashape) + if not isExtensible(datashape): msg = "Dataset shape must be extensible for packet updates" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/chunklocator.py b/hsds/chunklocator.py index 2f8bfbaf..f2cd93d7 100644 --- a/hsds/chunklocator.py +++ b/hsds/chunklocator.py @@ -1,11 +1,11 @@ import sys -import time import h5py import s3fs import numpy as np from . import config from . import hsds_logger as log +from h5json.time_util import getNow from h5json.array_util import bytesArrayToList, getNumElements from .util.dsetUtil import getSelectionList, getSelectionShape @@ -191,7 +191,7 @@ def main(): prefix = config.get("log_prefix") log_timestamps = config.get("log_timestamps", default=False) log.setLogConfig(log_level, prefix=prefix, timestamps=log_timestamps) - start_time = time.time() + start_time = getNow() log.info(f"chunklocator start: {start_time:.2f}") cmd_options = get_cmd_options() @@ -231,6 +231,6 @@ def main(): sys.exit(1) log.info('done') - stop_time = time.time() + stop_time = getNow() log.info(f"chunklocator stop: {stop_time:.2f}") log.info(f"chunklocator elapsed: {(stop_time - start_time):.2f}") diff --git a/hsds/ctype_dn.py b/hsds/ctype_dn.py index 834e02cb..465d8916 100755 --- a/hsds/ctype_dn.py +++ b/hsds/ctype_dn.py @@ -19,11 +19,12 @@ from aiohttp.web import json_response from h5json.objid import isValidUuid, validateUuid +from h5json.time_util import getNow from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj from .datanode_lib import delete_metadata_obj, check_metadata_obj from .util.domainUtil import isValidBucketName -from .util.timeUtil import getNow + from . import hsds_logger as log diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py index a6adbe28..6b76e37a 100644 --- a/hsds/datanode_lib.py +++ b/hsds/datanode_lib.py @@ -28,6 +28,7 @@ from h5json.objid import getRootObjId, isRootObjId from h5json.shape_util import getShapeDims from h5json.dset_util import getChunkDims, getDatasetLayoutClass +from h5json.time_util import getNow from .util.nodeUtil import getDataNodeUrl from .util.storUtil import getStorJSONObj, putStorJSONObj, putStorBytes @@ -40,7 +41,7 @@ from .util.chunkUtil import getDatasetId, getChunkSelection, getChunkIndex from .util.nodeUtil import validateInPartition from .util.rangegetUtil import ChunkLocation, chunkMunge, getHyperChunkIndex, getHyperChunkFactors -from .util.timeUtil import getNow + from . import config from . 
import hsds_logger as log from .dset_lib import getFillValue diff --git a/hsds/domain_dn.py b/hsds/domain_dn.py index 0fe0d01c..5b14ba70 100755 --- a/hsds/domain_dn.py +++ b/hsds/domain_dn.py @@ -16,10 +16,11 @@ from aiohttp.web_exceptions import HTTPConflict, HTTPInternalServerError from aiohttp.web import json_response +from h5json.time_util import getNow + from .util.authUtil import getAclKeys from .util.domainUtil import isValidDomain, getBucketForDomain from .util.nodeUtil import validateInPartition -from .util.timeUtil import getNow from .datanode_lib import get_metadata_obj, save_metadata_obj from .datanode_lib import delete_metadata_obj, check_metadata_obj from . import hsds_logger as log diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index 1ef469d5..cc634526 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -24,6 +24,8 @@ from h5json.objid import createObjId, getCollectionForId from h5json.objid import isValidUuid, isRootObjId, isSchema2Id +from h5json.time_util import getNow + from .util.nodeUtil import getNodeCount, getDataNodeUrl from .util.httpUtil import getObjectClass, http_post, http_put, http_delete @@ -37,7 +39,6 @@ from .util.storUtil import getStorKeys, getCompressors from .util.boolparser import BooleanParser from .util.globparser import globmatch -from .util.timeUtil import getNow from .servicenode_lib import getDomainJson, getObjectJson, getObjectIdByPath from .servicenode_lib import getRootInfo, checkBucketAccess, doFlush, getDomainResponse from .basenode import getVersion diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index 7b2029f8..5b99711d 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -18,9 +18,9 @@ from aiohttp.web import json_response from h5json.objid import isValidUuid, validateUuid +from h5json.time_util import getNow from .util.domainUtil import isValidBucketName -from .util.timeUtil import getNow from .datanode_lib import get_obj_id, check_metadata_obj, get_metadata_obj from .datanode_lib import save_metadata_obj, delete_metadata_obj from . 
import hsds_logger as log diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index e314e22c..bd7ad394 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -507,7 +507,7 @@ def _updateInitValuesList(kwargs): # to init_values list if "value" in kwargs: init_values.append(kwargs["value"]) - log.debug(f"init value appended: {kwargs['value']}") + log.debug(f"init value appended: {kwargs['value']}") del kwargs["value"] else: # add a placeholder @@ -629,7 +629,7 @@ def _updateInitValuesList(kwargs): # make selection for entire dataspace dims = getShapeDims(dset_json["shape"]) slices = getHyperslabSelection(dims) - + chunk_ids = getChunkIds(dset_id, slices, layout_dims) log.debug(f"init data, got chunk_ids: {chunk_ids}") if not chunk_ids or len(chunk_ids) != 1: diff --git a/hsds/folder_crawl.py b/hsds/folder_crawl.py index 05048758..f1b3fcf2 100644 --- a/hsds/folder_crawl.py +++ b/hsds/folder_crawl.py @@ -13,12 +13,12 @@ # service node of hsds cluster # -import time import asyncio from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound from aiohttp.web_exceptions import HTTPGone, HTTPInternalServerError from aiohttp.web_exceptions import HTTPServiceUnavailable +from h5json.time_util import getNow from .servicenode_lib import getObjectJson, getDomainResponse, getDomainJson from .util.nodeUtil import getNodeCount @@ -51,6 +51,9 @@ def __init__( else: self._max_tasks = len(domains) + def now(self): + return getNow(app=self._app) + async def crawl(self): workers = [asyncio.Task(self.work()) for _ in range(self._max_tasks)] # When all work is done, exit. @@ -68,11 +71,11 @@ async def crawl(self): async def work(self): while True: - start = time.time() + start = self.now() domain = await self._q.get() await self.fetch(domain) self._q.task_done() - elapsed = time.time() - start + elapsed = self.now() - start msg = f"FolderCrawler - task {domain} start: {start:.3f} " msg += f"elapsed: {elapsed:.3f}" log.debug(msg) diff --git a/hsds/group_dn.py b/hsds/group_dn.py index e2b69eef..69fa35d4 100755 --- a/hsds/group_dn.py +++ b/hsds/group_dn.py @@ -20,9 +20,9 @@ from aiohttp.web import json_response from h5json.objid import isValidUuid, isSchema2Id, isRootObjId, getRootObjId +from h5json.time_util import getNow from .util.domainUtil import isValidBucketName -from .util.timeUtil import getNow from .datanode_lib import get_obj_id, check_metadata_obj, get_metadata_obj from .datanode_lib import save_metadata_obj, delete_metadata_obj from . import hsds_logger as log diff --git a/hsds/headnode.py b/hsds/headnode.py index 354a17bc..41501d00 100755 --- a/hsds/headnode.py +++ b/hsds/headnode.py @@ -15,13 +15,12 @@ import asyncio import os -import time from aiohttp.web import Application, StreamResponse, run_app, json_response from aiohttp.web_exceptions import HTTPBadRequest, HTTPInternalServerError +from h5json.time_util import unixTimeToUTC, elapsedTime, getNow from . import config -from .util.timeUtil import unixTimeToUTC, elapsedTime from .util.nodeUtil import createNodeId from . 
import hsds_logger as log from .util import query_marathon as marathonClient @@ -46,7 +45,7 @@ def __init__(self, node_id=None, node_type=None, node_host=None, node_port=None) self._type = node_type self._host = node_host self._port = node_port - now = time.time() + now = getNow() self._create_time = now self._last_poll = now self._stats = {} @@ -87,13 +86,13 @@ def get_info(self): return info def poll_update(self): - now = time.time() + now = getNow() self._last_poll = now def is_healthy(self): sleep_sec = int(config.get("node_sleep_time")) - now = time.time() + now = getNow() if now - self._last_poll < sleep_sec * 2: return True else: @@ -301,7 +300,7 @@ async def register(request): answer["dn_ids"] = dn_ids answer["req_ip"] = node_host log.debug(f"register returning: {answer}") - app["last_health_check"] = int(time.time()) + app["last_health_check"] = int(getNow()) resp = json_response(answer) log.response(request, resp=resp) @@ -475,7 +474,7 @@ async def init(): app["nodes"] = nodes app["dead_node_ids"] = set() - app["start_time"] = int(time.time()) # seconds after epoch + app["start_time"] = int(getNow()) # seconds after epoch app["last_health_check"] = 0 app["max_task_count"] = config.get("max_task_count") app.router.add_get("/", info) diff --git a/hsds/hsds_app.py b/hsds/hsds_app.py index e690b68d..d0d25d6d 100644 --- a/hsds/hsds_app.py +++ b/hsds/hsds_app.py @@ -3,12 +3,14 @@ from pathlib import Path import site import subprocess -import time import queue import threading +import time import logging from shutil import which +from h5json.time_util import getNow + def _enqueue_output(out, queue, loglevel): try: @@ -318,7 +320,7 @@ def run(self): self._threads.append(t) # wait to sockets are initialized - start_ts = time.time() + start_ts = getNow() SLEEP_TIME = 1 # time to sleep between checking on socket connection MAX_INIT_TIME = 10.0 # max time to wait for socket to be initialized @@ -329,7 +331,7 @@ def run(self): if os.path.exists(socket_path): ready += 1 else: - if time.time() > start_ts + 5: + if getNow() > start_ts + 5: # TBD - put a real ready check here ready = count if ready == count: @@ -339,12 +341,12 @@ def run(self): self.log.debug(f"{ready}/{count} ready") self.log.debug(f"sleeping for {SLEEP_TIME}") time.sleep(SLEEP_TIME) - if time.time() > start_ts + MAX_INIT_TIME: + if getNow() > start_ts + MAX_INIT_TIME: msg = f"failed to initialize after {MAX_INIT_TIME} seconds" self.log.error(msg) raise IOError(msg) - self.log.info(f"Ready after: {(time.time() - start_ts):4.2f} s") + self.log.info(f"Ready after: {(getNow() - start_ts):4.2f} s") self._ready = True def stop(self): @@ -352,7 +354,7 @@ def stop(self): if not self._processes: return - now = time.time() + now = getNow() logging.info(f"hsds app stop at {now}") for pname in self._processes: @@ -363,7 +365,7 @@ def stop(self): # wait for sub-proccesses to exit SLEEP_TIME = 0.1 # time to sleep between checking on process state MAX_WAIT_TIME = 10.0 # max time to wait for sub-process to terminate - start_ts = time.time() + start_ts = getNow() while True: is_alive_cnt = 0 for pname in self._processes: @@ -380,7 +382,7 @@ def stop(self): else: logging.debug("all subprocesses exited") break - if time.time() > start_ts + MAX_WAIT_TIME: + if getNow() > start_ts + MAX_WAIT_TIME: msg = f"failed to terminate after {MAX_WAIT_TIME} seconds" self.log.error(msg) break diff --git a/hsds/link_dn.py b/hsds/link_dn.py index a35acf17..f602a405 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -21,11 +21,11 @@ from aiohttp.web import 
json_response from h5json.objid import isValidUuid +from h5json.time_util import getNow from .util.globparser import globmatch from .util.linkUtil import validateLinkName, getLinkClass, isEqualLink from .util.domainUtil import isValidBucketName -from .util.timeUtil import getNow from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj from . import config from . import hsds_logger as log diff --git a/hsds/servicenode.py b/hsds/servicenode.py index 8a5ddaee..7ec4f5a3 100755 --- a/hsds/servicenode.py +++ b/hsds/servicenode.py @@ -14,9 +14,10 @@ # import asyncio -import time from aiohttp.web import run_app import aiohttp_cors +from h5json.time_util import getNow + from .util.lruCache import LruCache from .util.httpUtil import isUnixDomainUrl, bindToSocket, getPortFromUrl from .util.httpUtil import release_http_client, jsonResponse @@ -217,10 +218,10 @@ async def preStop(request): log.request(request) app = request.app - shutdown_start = time.time() + shutdown_start = getNow() log.warn(f"preStop request calling on_shutdown at {shutdown_start:.2f}") await on_shutdown(app) - shutdown_elapse_time = time.time() - shutdown_start + shutdown_elapse_time = getNow() - shutdown_start msg = f"shutdown took: {shutdown_elapse_time:.2f} seconds" if shutdown_elapse_time > 2.0: # 2.0 is the default grace period for kubernetes diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index b9a179c7..48a5538b 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -15,8 +15,6 @@ import asyncio import json -import math -import time import numpy as np from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPGone, HTTPConflict @@ -31,9 +29,9 @@ from h5json.objid import isSchema2Id, getS3Key, isValidUuid from h5json.hdf5dtype import getBaseTypeJson, validateTypeItem, createDataType, getItemSize from h5json.shape_util import getShapeDims, getShapeClass -from h5json.filters import getFiltersJson -from h5json.dset_util import guessChunk, getChunkSize, validateDatasetCreationProps -from h5json.dset_util import getDataSize, isExtensible +from h5json.dset_util import getChunkSize, generateLayout +from h5json.dset_util import getDataSize, validateDatasetCreationProps +from h5json.time_util import getNow from .util.nodeUtil import getDataNodeUrl from .util.authUtil import getAclKeys @@ -1022,7 +1020,7 @@ async def getAttributeFromRequest(app, req_json, obj_id=None, bucket=None): else: attr_item["value"] = None - now = time.time() + now = getNow() if "created" in req_json: created = req_json["created"] # allow "pre-dated" attributes if the timestamp is within the last 10 seconds @@ -1453,12 +1451,28 @@ def getCreateArgs(body, return kwargs +def genLayout(shape_json, item_size, has_filters=False): + """ create a chunked or contiguous layout based on shape and itemsize """ + + min_chunk_size = int(config.get("min_chunk_size")) + max_chunk_size = int(config.get("max_chunk_size")) + max_chunks_per_folder = int(config.get("max_chunks_per_folder", default=0)) + kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} + # pass the chunk partition constraint on to generateLayout when configured + if max_chunks_per_folder > 0: + kwargs["max_chunks_per_folder"] = max_chunks_per_folder + if has_filters: + kwargs["chunks"] = True # force a chunked layout to support compression + + layout_json = generateLayout(shape_json, item_size, **kwargs) + return layout_json + + def getDatasetCreateArgs(body, root_id=None, bucket=None, type=None, implicit=False, - chunk_table=None, 
ignore_link=False): """ get args for createDataset from request body """ @@ -1486,17 +1500,11 @@ try: shape_class = getShapeClass(shape_json) shape_dims = getShapeDims(shape_json) - if "maxdims" in shape_json: - max_dims = shape_json["maxdims"] - is_extensible = isExtensible(shape_dims, max_dims) - else: - max_dims = None - is_extensible = False except (KeyError, TypeError, ValueError): msg = f"Invalid shape: {shape_json}" log.warn(msg) raise HTTPBadRequest(reason=msg) - + log.debug(f"shape_class: {shape_class}, shape_dims: {shape_dims}") log.debug(f"got createArgs: {list(kwargs.keys())}") @@ -1517,27 +1525,27 @@ else: dset_size = getDataSize(shape_dims, item_size) - creation_props = kwargs["creation_props"] - layout_class = None - layout_json = {} + if "creationProperties" in body: + creation_props = body["creationProperties"] + else: + creation_props = None + layout_json = None chunk_dims = None - partition_count = None + has_filters = False if creation_props: log.debug(f"POST_Dataset creation props: {creation_props}") try: validateDatasetCreationProps(creation_props, type_json=type_json, shape=shape_json) - except ValueError as ve: - msg = f"Provided creation properties are invalid: {ve}" + except (KeyError, TypeError, ValueError) as e: + msg = f"Provided creation properties are invalid: {e}" log.warn(msg) raise HTTPBadRequest(reason=msg) log.debug(f"create_props after validation: {creation_props}") - if creation_props.get("layout"): - layout_json = creation_props["layout"] - layout_class = layout_json.get("class") + if "filters" in creation_props: - # normalize filter format - filters = getFiltersJson(creation_props) + # check that the given filters are supported by HSDS + filters = creation_props["filters"] supported_filters = getSupportedFilters() log.debug(f"supported filters: {supported_filters}") for filter_item in filters: @@ -1545,19 +1553,21 @@ msg = f"Unsupported filter: {filter_item['name']}" log.warn(msg) raise HTTPBadRequest(reason=msg) + if filters: + has_filters = True creation_props["filters"] = filters log.debug(f"post validate creation properties: {creation_props}") + if "layout" in creation_props: + layout_json = creation_props["layout"] + else: + creation_props = {} - if layout_class: - if layout_class == "H5D_CONTIGUOUS_REF" and getItemSize(type_json) == "H5T_VARIABLE": - # ref dataset does not work with vlen type - msg = "H5D_CONTIGUOUS_REF datasets cannot be used with variable length types" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - + if "layout" in creation_props: + layout_json = creation_props["layout"] + # layout_json was validated in validateDatasetCreationProps, but issue + # a warning if the chunk size is outside suggested range if "dims" in layout_json: chunk_dims = layout_json["dims"] - if chunk_dims: # log a warning if the chunk size is too small or too big chunk_size = getChunkSize(chunk_dims, item_size) if chunk_size < min_chunk_size: @@ -1572,87 +1582,12 @@ if dset_size > max_chunk_size: msg = f"dataset larger than recommended {max_chunk_size} for CONTIGUOUS storage" log.warn(msg) - elif shape_class == "H5S_NULL": - layout_class = None - log.debug("using None layout for H5S_NULL dataset") - elif shape_class == "H5S_SCALAR": - layout_class = "H5D_CONTIGUOUS" - log.debug("Using H5D_CONTIGUOUS for H5S_SCALAR dataset") - elif shape_class == "H5S_SIMPLE": - if dset_size <= min_chunk_size and not is_extensible: - 
# default to contiguous - layout_class = "H5D_CONTIGUOUS" - log.debug(f"Using H5D_CONTIGUOUS for small (<{min_chunk_size}) dataset") - else: - layout_class = "H5D_CHUNKED" - log.debug(f"shape_json: {shape_json}") - log.debug(f"item_size: {item_size}") - log.debug(f"chunk_min: {min_chunk_size}") - log.debug(f"chunk_max: {max_chunk_size}") - args = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} - chunk_dims = guessChunk(shape_json, item_size, **args) - log.debug(f"initial autochunk layout: {chunk_dims}") - chunk_size = getChunkSize(chunk_dims, item_size) - log.debug(f"chunk_size: {chunk_size}") else: - msg = f"unexpected shape_class: {shape_class}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + # no layout, create one based on shape and itemsize + layout_json = genLayout(shape_json, item_size, has_filters=has_filters) + log.info(f"created chunk layout for new dset: {layout_json}") + creation_props["layout"] = layout_json - # set partition_count if needed: - max_chunks_per_folder = int(config.get("max_chunks_per_folder")) - set_partition = False - if max_chunks_per_folder > 0: - if "dims" in layout_json: - set_partition = True - - if set_partition and dset_size > max_chunk_size: - log.debug(f"updating layout for partition constraint: {max_chunks_per_folder}") - - num_chunks = 1 - rank = len(shape_dims) - unlimited_count = 0 - if max_dims: - for i in range(rank): - if max_dims[i] == 0: - unlimited_count += 1 - msg = f"number of unlimited dimensions: {unlimited_count}" - log.debug(msg) - - for i in range(rank): - max_dim = 1 - if max_dims: - max_dim = max_dims[i] - if max_dim == 0: - # don't really know what the ultimate extent - # could be, but assume 10^6 for total number of - # elements and square-shaped array... - MAX_ELEMENT_GUESS = 10.0 ** 6 - exp = 1 / unlimited_count - max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) - else: - max_dim = shape_dims[i] - num_chunks *= math.ceil(max_dim / chunk_dims[i]) - - if num_chunks > max_chunks_per_folder: - partition_count = math.ceil(num_chunks / max_chunks_per_folder) - msg = f"set partition count to: {partition_count}, " - msg += f"num_chunks: {num_chunks}" - log.info(msg) - else: - msg = "do not need chunk partitions, num_chunks: " - msg += f"{num_chunks} max_chunks_per_folder: " - msg += f"{max_chunks_per_folder}" - log.info(msg) - - if layout_class: - # should be set if shape is not H5S_NULL - if "class" not in layout_json: - layout_json["class"] = layout_class - if chunk_dims: - layout_json["dims"] = chunk_dims - log.debug(f"using dataset layout: {layout_json}") - creation_props["layout"] = layout_json kwargs["creation_props"] = creation_props log.debug(f"updated creation props: {creation_props}") diff --git a/hsds/util/chunkUtil.py b/hsds/util/chunkUtil.py index 7d299d9a..8715ac3d 100644 --- a/hsds/util/chunkUtil.py +++ b/hsds/util/chunkUtil.py @@ -38,15 +38,14 @@ def getNumChunks(selection, layout): If selection is provided (a list of slices), return the number of chunks that intersect with the selection. 
""" - print(f"getNumChunks: {selection}, layout: {layout}") + if len(selection) == 0: - print("zero length selection") + # zero length selection return 0 rank = len(layout) if rank == 1 and layout[0] == 1: # scalar dataset - print("scalar dset") return 1 if len(selection) != rank: msg = f"selection list has {len(selection)} items, but rank is {rank}" @@ -57,12 +56,10 @@ def getNumChunks(selection, layout): if isinstance(s, slice): if s.stop <= s.start: log.debug("null selection") - print("null selection") return 0 else: # coordinate list if len(s) == 0: - print("null coordinate list") return 0 # first, get the number of chunks needed for any coordinate selection chunk_indices = [] @@ -92,8 +89,6 @@ def getNumChunks(selection, layout): else: num_chunks = 1 - print("num_chunks:", num_chunks) - # now deal with any slices in the selection for i in range(len(selection)): s = selection[i] diff --git a/hsds/util/timeUtil.py b/hsds/util/timeUtil.py deleted file mode 100755 index e4ae9d3f..00000000 --- a/hsds/util/timeUtil.py +++ /dev/null @@ -1,83 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## -from datetime import datetime -import time -import os -import pytz - - -def unixTimeToUTC(timestamp): - """Convert unix timestamp (seconds since Jan 1, 1970, to ISO-8601 - compatible UTC time string. - - """ - utc = pytz.utc - dtTime = datetime.fromtimestamp(timestamp, utc) - iso_str = dtTime.isoformat() - # isoformat returns a string like this: - # '2014-10-30T04:25:21+00:00' - # strip off the '+00:00' and replace - # with 'Z' (both are ISO-8601 compatible) - npos = iso_str.rfind("+") - iso_z = iso_str[:npos] + "Z" - return iso_z - - -def elapsedTime(timestamp): - """Get Elapsed time from given timestamp""" - delta = int(time.time()) - timestamp - if delta < 0: - return "Invalid timestamp!" - day_length = 24 * 60 * 60 - days = 0 - hour_length = 60 * 60 - hours = 0 - minute_length = 60 - minutes = 0 - ret_str = "" - - if delta > day_length: - days = delta // day_length - delta = delta % day_length - ret_str += "{} days ".format(days) - if delta > hour_length or days > 0: - hours = delta // hour_length - delta = delta % hour_length - ret_str += "{} hours ".format(hours) - if delta > minute_length or days > 0 or hours > 0: - minutes = delta // minute_length - delta = delta % minute_length - ret_str += "{} minutes ".format(minutes) - ret_str += "{} seconds".format(delta) - return ret_str - - -def getNow(app): - """ - Get current time in unix timestamp - - Returns a precise timestamp even on platforms where - time.time() has low resolution (e.g. 
Windows) - """ - system = os.name - current_time = 0 - - if system == "nt": - # Windows - current_time = (time.perf_counter() - app["start_time_relative"]) + app["start_time"] - elif system == "posix": - # Unix - current_time = time.time() - else: - raise ValueError(f"Unsupported OS: {system}") - - return current_time diff --git a/pyproject.toml b/pyproject.toml index 8e260cda..422750fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ dependencies = [ "bitshuffle >=0.5.2", "cryptography", "h5py >= 3.6.0", - "h5json@git+https://github.com/HDFGroup/hdf5-json@abstract", + "h5json >= 1.0.0", "importlib_resources", "numcodecs <= 0.15.1", "numpy >=2.0.0", diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index 5979e042..697258da 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -15,6 +15,7 @@ import numpy as np from h5json.objid import createObjId +from h5json.filters import getFilterItem import helper import config @@ -1401,25 +1402,21 @@ def testCreationPropertiesLayoutDataset(self): "id": 3, "name": "fletcher32" } - payload["creationProperties"] = { - "layout": {"class": "H5D_CHUNKED", "dims": [1, 390, 512]}, + contiguous_layout = {"class": "H5D_CONTIGUOUS"} + chunked_layout = {"class": "H5D_CHUNKED", "dims": [1, 390, 512]} + creationProps = { "filters": [ gzip_filter, fletcher32_filter, ], } + payload["creationProperties"] = creationProps rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset rspJson = json.loads(rsp.text) dset_uuid = rspJson["id"] self.assertTrue(helper.validateId(dset_uuid)) - # link new dataset as 'chunktest' - name = "chunktest" - req = self.endpoint + "/groups/" + root_uuid + "/links/" + name - payload = {"id": dset_uuid} - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) # verify layout req = helper.getEndpoint() + "/datasets/" + dset_uuid rsp = self.session.get(req, headers=headers) @@ -1431,12 +1428,41 @@ def testCreationPropertiesLayoutDataset(self): layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout_json) # layout created automatically + + # add an explicit layout to creation props and verify contiguous + creationProps["layout"] = contiguous_layout + payload["creationProperties"] = creationProps + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) + + # use a chunk layout to creation props and verify success + creationProps["layout"] = chunked_layout + payload["creationProperties"] = creationProps + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] + self.assertTrue("class" in layout_json) + self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) + self.assertEqual(layout_json["dims"], [1, 390, 512]) if config.get("max_chunks_per_folder") > 0: self.assertTrue("partition_count" in layout_json) self.assertEqual(layout_json["partition_count"], 10) + # link new dataset as 'chunktest' + name = "chunktest" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": 
+        rsp = self.session.put(req, data=json.dumps(payload), headers=headers)
+        self.assertEqual(rsp.status_code, 201)
+
         # verify compression
         self.assertTrue("creationProperties" in rspJson)
         cpl = rspJson["creationProperties"]
@@ -1540,7 +1566,8 @@ def testCompressionFiltersDataset(self):
             req = self.endpoint + "/datasets"
             payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]}
-            filters = [compressor, ]
+            filter_item = getFilterItem(compressor)
+            filters = [filter_item, ]
             layout = {"class": "H5D_CHUNKED", "dims": [10, 20]}
             cpl = {"filters": filters, "layout": layout}
             payload["creationProperties"] = cpl
@@ -1600,8 +1627,9 @@ def testCompressionFilterOptionDataset(self):
         # create the dataset
         req = self.endpoint + "/datasets"
-        compressor = {"class": "H5Z_FILTER_LZ4", "name": "lz4", "level": 5}
-        filters = [compressor, ]
+        filter_item = getFilterItem("lz4", options={"level": 4})
+        filters = [filter_item, ]
 
         payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]}
         layout = {"class": "H5D_CHUNKED", "dims": [10, 20]}
@@ -1661,25 +1689,15 @@ def testInvalidCompressionFilter(self):
         rspJson = json.loads(rsp.text)
         self.assertTrue("root" in rspJson)
 
-        bad_compressors = ("shrink-o-rama")
-        for compressor_name in bad_compressors:
-            # create the dataset
-            req = self.endpoint + "/datasets"
-            compressor = {
-                "class": "H5Z_FILTER_USER",
-                "name": compressor_name,
-                "level": 5,
-            }
+        # try creating a dataset with an unknown filter class
+        filter_item = {"class": "H5Z_FILTER_FOOBAR", "id": 123, "name": "foobar"}
+        req = self.endpoint + "/datasets"
 
-            payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]}
-            payload["creationProperties"] = {
-                "filters": [
-                    compressor,
-                ]
-            }
-            req = self.endpoint + "/datasets"
-            rsp = self.session.post(req, data=json.dumps(payload), headers=headers)
-            self.assertEqual(rsp.status_code, 400)  # create dataset
+        payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]}
+        payload["creationProperties"] = {"filters": [filter_item, ]}
+        rsp = self.session.post(req, data=json.dumps(payload), headers=headers)
+        self.assertEqual(rsp.status_code, 400)  # invalid filter is rejected
 
     def testInvalidFillValue(self):
         # test Dataset with simple type and fill value that is incompatible with the type
 
From 55c85981796d7c42d45799aa2ac3f489057dffef Mon Sep 17 00:00:00 2001
From: John Readey
Date: Sun, 4 Jan 2026 17:09:01 +0800
Subject: [PATCH 45/49] update for h5json changes

---
 hsds/servicenode_lib.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py
index 48a5538b..717912cc 100644
--- a/hsds/servicenode_lib.py
+++ b/hsds/servicenode_lib.py
@@ -1451,7 +1451,7 @@ def getCreateArgs(body,
     return kwargs
 
 
-def genLayout(shape_json, item_size, has_filters=False):
-    """ create a chunked or contiguous layout based on shape and itemsize """
+def genLayout(shape_json, type_json, has_filters=False):
+    """ create a chunked or contiguous layout based on shape and type """
 
     min_chunk_size = int(config.get("min_chunk_size"))
@@ -1464,7 +1464,7 @@ def genLayout(shape_json, item_size, has_filters=False):
     if has_filters:
         kwargs["chunks"] = True  # force a chunked layout to support compression
 
-    layout_json = generateLayout(shape_json, item_size, **kwargs)
+    layout_json = generateLayout(shape_json, type_json, **kwargs)
 
     return layout_json
 
@@ -1584,7 +1584,7 @@ def getDatasetCreateArgs(body,
             log.warn(msg)
     else:
-        # no layout, create one based on shape and itemsize
-        layout_json = genLayout(shape_json, item_size, has_filters=has_filters)
+        # no layout, create one based on shape and type
+        layout_json = genLayout(shape_json, type_json, has_filters=has_filters)
         log.info(f"created chunk layout for new dset: {layout_json}")
         creation_props["layout"] = layout_json
 
From 77042d85a70eb54b1aca2c30e6768569c70016c5 Mon Sep 17 00:00:00 2001
From: John Readey
Date: Tue, 6 Jan 2026 09:39:00 +0800
Subject: [PATCH 46/49] added consolidated metadata support

---
 hsds/async_lib.py           | 188 ++++++++++++++++++++++++++++++++++--
 hsds/datanode_lib.py        |   2 +-
 hsds/domain_sn.py           |  20 ++--
 hsds/servicenode_lib.py     |  35 ++++++-
 hsds/util/storUtil.py       |   2 +-
 tests/integ/dataset_test.py |  24 +++++
 tests/integ/vlen_test.py    |  10 +-
 7 files changed, 250 insertions(+), 31 deletions(-)

diff --git a/hsds/async_lib.py b/hsds/async_lib.py
index 674caa98..997432cc 100755
--- a/hsds/async_lib.py
+++ b/hsds/async_lib.py
@@ -17,15 +17,15 @@
 from aiohttp.web_exceptions import HTTPForbidden
 from h5json.hdf5dtype import getItemSize
 from h5json.hdf5dtype import createDataType
-from h5json.array_util import getNumElements, bytesToArray
+from h5json.array_util import getNumElements, bytesToArray, bytesArrayToList
 from h5json.objid import isValidUuid, isSchema2Id, getS3Key, isS3ObjKey
 from h5json.objid import getObjId, isValidChunkId, getCollectionForId
 from h5json.filters import getFilters
-from h5json.shape_util import getShapeDims
+from h5json.shape_util import getShapeDims, getDataSize
 from h5json.dset_util import getDatasetLayoutClass, getDatasetLayout, getChunkDims
 from h5json.time_util import getNow
 
-from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator
-from .util.dsetUtil import getHyperslabSelection
+from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator, getChunkIndex, getChunkIds
+from .util.dsetUtil import getHyperslabSelection, getFilterOps
 from .util.storUtil import getStorKeys, putStorJSONObj, getStorJSONObj
 from .util.storUtil import deleteStorObj, getStorBytes, isStorObj
@@ -77,6 +77,7 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None):
         msg += f"{dset_id}"
         log.warn(msg)
         return
+
     type_json = dset_json["type"]
     item_size = getItemSize(type_json)
     if not getDatasetLayout(dset_json):
@@ -112,7 +113,7 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None):
     if layout_class == "H5D_CONTIGUOUS_REF":
         # In H5D_CONTIGUOUS_REF a non-compressed part of the HDF5 is divided
         # into equal size chunks, so we can just compute link bytes and num
-        # chunks based on the size of the coniguous dataset
+        # chunks based on the size of the contiguous dataset
         layout_dims = getChunkDims(dset_json)
         num_chunks = getNumChunks(selection, layout_dims)
         chunk_size = item_size
@@ -268,20 +269,26 @@ def scanRootCallback(app, s3keys):
     results = app["scanRoot_results"]
     scanRoot_keyset = app["scanRoot_keyset"]
     checksums = results["checksums"]
+
     for s3key in s3keys.keys():
         if not isS3ObjKey(s3key):
-            log.info(f"not s3obj key, ignoring: {s3key}")
+            log.info(f"scanRoot - not s3obj key, ignoring: {s3key}")
             continue
         if s3key in scanRoot_keyset:
             log.warn(f"scanRoot - dejavu for key: {s3key}")
             continue
         scanRoot_keyset.add(s3key)
-        msg = f"scanRoot adding key: {s3key} to keyset, "
+        msg = f"scanRoot - adding key: {s3key} to keyset, "
         msg += f"{len(scanRoot_keyset)} keys"
         log.debug(msg)
         objid = getObjId(s3key)
+
+        if objid in app["deleted_ids"]:
+            log.debug(f"scanRoot - skipping deleted id: {objid}")
+            continue
+
         etag = None
         obj_size = None
         lastModified = None
@@ -306,8 +313,15 @@ def scanRootCallback(app, s3keys):
             is_chunk = True
             results["num_chunks"] += 1
             results["allocated_bytes"] += obj_size
+            chunk_index = getChunkIndex(objid)
+            if max(chunk_index) == 0:
+                # save the id of the first chunk (if present) to the obj_ids set;
+                # it will be used later to read dataset values for small datasets
+                results["obj_ids"].add(objid)
         else:
             results["metadata_bytes"] += obj_size
+            results["obj_ids"].add(objid)
 
         if is_chunk or getCollectionForId(objid) == "datasets":
             if is_chunk:
@@ -345,6 +359,144 @@ def scanRootCallback(app, s3keys):
     log.error(msg)
 
 
+async def _getDatasetValueJson(app, dset_id, dset_json, obj_ids, size_limit=None, bucket=None):
+    """ If the dataset size is less than size_limit, and the chunk_ids for the dataset are
+    available, return a JSON representation of the dataset values. Otherwise, return None """
+
+    dims = getShapeDims(dset_json)
+    if dims is None:
+        return None  # null dataspace
+    if "type" not in dset_json:
+        msg = f"_getDatasetValueJson - expected to find type in dataset_json for {dset_id}"
+        log.warn(msg)
+        return None
+    type_json = dset_json["type"]
+    item_size = getItemSize(type_json)
+    if item_size == "H5T_VARIABLE":
+        item_size = 1024  # make a guess for variable length types
+    dataset_size = getDataSize(dims, item_size)
+    if size_limit is not None and dataset_size > size_limit:
+        log.debug(f"_getDatasetValueJson - dataset size {dataset_size} exceeds limit {size_limit}")
+        return None
+
+    chunk_dims = getChunkDims(dset_json)
+    if not chunk_dims:
+        log.warn(f"_getDatasetValueJson - no layout found for dataset: {dset_id}")
+        return None
+    if chunk_dims != dims:
+        msg = f"_getDatasetValueJson - dataset layout {chunk_dims} does not match dims {dims} "
+        msg += f"for dataset: {dset_id}, ignoring"
+        log.warn(msg)
+        return None
+    select_all = getHyperslabSelection(dims)  # select the entire dataspace
+    chunk_ids = getChunkIds(dset_id, select_all, dims)
+    if len(chunk_ids) == 0:
+        log.debug(f"_getDatasetValueJson - no chunk ids found for dataset: {dset_id}")
+        return None
+    if len(chunk_ids) > 1:
+        log.debug(f"_getDatasetValueJson - more than one chunk id found for dataset: {dset_id}")
+        return None
+    chunk_id = chunk_ids[0]
+    if chunk_id not in obj_ids:
+        log.debug(f"_getDatasetValueJson - chunk id {chunk_id} not in scanned obj_ids")
+        return None
+    log.debug(f"using chunk: {chunk_id} to get dataset value for {dset_id}")
+
+    # fetch the chunk - using getStorBytes since this will not be used with
+    # chunk cache or chunk crawlers
+    # TBD: need parameters for s3path, s3offset, s3size for ref layouts
+    # regular store read
+
+    filters = getFilters(dset_json)
+    dt = createDataType(type_json)
+    filter_ops = getFilterOps(app, dset_id, filters, dtype=dt, chunk_shape=chunk_dims)
+
+    kwargs = {
+        "filter_ops": filter_ops,
+        "offset": None,
+        "length": None,
+        "bucket": bucket
+    }
+    s3key = getS3Key(chunk_id)
+
+    try:
+        chunk_bytes = await getStorBytes(app, s3key, **kwargs)
+    except HTTPNotFound:
+        log.warn(f"_getDatasetValueJson - HTTPNotFound for chunk {chunk_id} bucket:{bucket}")
+        return None
+    except HTTPForbidden:
+        log.warn(f"_getDatasetValueJson - HTTPForbidden for chunk {chunk_id} bucket:{bucket}")
+        return None
+    except HTTPInternalServerError:
+        msg = "_getDatasetValueJson - "
+        msg += f"HTTPInternalServerError for chunk {chunk_id} bucket:{bucket}"
+        log.warn(msg)
+        return None
+
+    if chunk_bytes is None:
+        msg = f"_getDatasetValueJson - read {chunk_id} bucket: {bucket} returned None"
+        log.warn(msg)
+        return None
+
+    arr = bytesToArray(chunk_bytes, dt, chunk_dims)
+
+    json_value = bytesArrayToList(arr)
+    log.debug(f"_getDatasetValueJson - returning {json_value}")
+
+    return json_value
+
+
+async def getConsolidatedMetaData(app, obj_ids, bucket=None):
+    # create a consolidated metadata summary for all objects in the domain
+    # return a dict of obj_ids to their metadata summaries
+    log.info("getConsolidatedMetaData - creating consolidated metadata summary")
+    consolidated_metadata = {}
+    for obj_id in obj_ids:
+        if isValidChunkId(obj_id):
+            # skip chunks - we may use the chunk later when processing its dataset object
+            continue
+        s3_key = getS3Key(obj_id)
+        try:
+            obj_json = await getStorJSONObj(app, s3_key, bucket=bucket)
+        except HTTPNotFound:
+            log.warn(f"HTTPNotFound for {s3_key} bucket:{bucket}")
+            continue
+        except HTTPForbidden:
+            log.warn(f"HTTPForbidden error for {s3_key} bucket:{bucket}")
+            continue
+        except HTTPInternalServerError:
+            msg = f"HTTPInternalServerError error for {s3_key} bucket:{bucket}"
+            log.warn(msg)
+            continue
+        log.debug(f"getConsolidatedMetaData - got json for obj_id: {obj_id}: {obj_json}")
+        # extract relevant metadata
+        metadata_summary = {}
+        if "type" in obj_json:
+            metadata_summary["type"] = obj_json["type"]
+        if "shape" in obj_json:
+            metadata_summary["shape"] = obj_json["shape"]
+        if "attributes" in obj_json:
+            metadata_summary["attributes"] = obj_json["attributes"]
+        if "links" in obj_json:
+            metadata_summary["links"] = obj_json["links"]
+        if "creationProperties" in obj_json:
+            metadata_summary["creationProperties"] = obj_json["creationProperties"]
+        if getCollectionForId(obj_id) == "datasets":
+            log.debug("getConsolidatedMetaData - got dataset")
+            size_limit = 4096  # TBD - make this a config
+            kwargs = {"size_limit": size_limit, "bucket": bucket}
+            json_value = await _getDatasetValueJson(app, obj_id, obj_json, obj_ids, **kwargs)
+            if json_value is not None:
+                log.debug(f"adding dataset value to metadata summary for dataset: {obj_id}")
+                metadata_summary["value"] = json_value
+        else:
+            log.debug("getConsolidatedMetaData - not a dataset")
+
+        consolidated_metadata[obj_id] = metadata_summary
+    log.info("getConsolidatedMetaData - done creating consolidated metadata summary")
+    return consolidated_metadata
+
+
 async def scanRoot(app, rootid, update=False, bucket=None):
     # iterate through all s3 keys under the given root.
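# Reviewer reference - a minimal sketch (not part of the patch) of the summary
# dict that getConsolidatedMetaData() returns and that scanRoot() writes to the
# .summary.json key. The ids, type, shape, and value below are illustrative only:
#
#   {
#       "g-1e76d862-...": {
#           "attributes": {},
#           "links": {"dset1": {"class": "H5L_TYPE_HARD", "id": "d-5ab417a1-..."}}
#       },
#       "d-5ab417a1-...": {
#           "type": {"class": "H5T_INTEGER", "base": "H5T_STD_I32LE"},
#           "shape": {"class": "H5S_SIMPLE", "dims": [4]},
#           "creationProperties": {"layout": {"class": "H5D_CHUNKED", "dims": [4]}},
#           "value": [1, 2, 3, 4]
#       }
#   }
#
# The "value" key is only present for small datasets whose single chunk was seen
# during the scan (see _getDatasetValueJson above).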
@@ -386,7 +538,8 @@ async def scanRoot(app, rootid, update=False, bucket=None):
     results["num_linked_chunks"] = 0
     results["linked_bytes"] = 0
     results["logical_bytes"] = 0
-    results["checksums"] = {}  # map of objid to checksums
+    results["obj_ids"] = set()  # set of object ids scanned (plus the first chunk id of each dataset)
+    results["checksums"] = {}  # map of objid to checksums
     results["bucket"] = bucket
     results["scan_start"] = getNow(app=app)
@@ -405,6 +558,9 @@ async def scanRoot(app, rootid, update=False, bucket=None):
     num_objects += len(results["datasets"])
     num_objects += results["num_chunks"]
     log.info(f"scanRoot - got {num_objects} keys for rootid: {rootid}")
+    obj_ids = results["obj_ids"]
+    log.info(f"scanRoot - got {len(obj_ids)} unique object ids")
+    log.debug(f"scanRoot - obj_ids: {obj_ids}")
 
     dataset_results = results["datasets"]
     for dsetid in dataset_results:
@@ -445,6 +601,11 @@ async def scanRoot(app, rootid, update=False, bucket=None):
 
     results["scan_complete"] = getNow(app=app)
 
+    # extract the obj_ids set - it won't be written to .info.json
+    obj_ids = results["obj_ids"]
+    del results["obj_ids"]
+    log.debug(f"obj_ids set: {obj_ids}")
+
     if update:
         # write .info object back to S3
         info_key = root_prefix + ".info.json"
@@ -452,6 +613,17 @@ async def scanRoot(app, rootid, update=False, bucket=None):
         msg += f"{results}"
         log.info(msg)
         await putStorJSONObj(app, info_key, results, bucket=bucket)
+
+        # create a json summary of objects in this domain
+        log.debug(f"Creating consolidated metadata summary for root {rootid}")
+        summary_key = root_prefix + ".summary.json"
+        summary_data = await getConsolidatedMetaData(app, obj_ids, bucket=bucket)
+        if summary_data:
+            log.info(f"Got consolidated metadata summary for root {rootid}")
+            log.debug(f"Summary data: {summary_data}")
+            await putStorJSONObj(app, summary_key, summary_data, bucket=bucket)
+        else:
+            log.info(f"No consolidated metadata summary for root {rootid}")
 
     return results
 
diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py
index 6b76e37a..d2b1840c 100644
--- a/hsds/datanode_lib.py
+++ b/hsds/datanode_lib.py
@@ -1094,7 +1094,7 @@ async def get_chunk(
             log.debug(msg)
     else:
         s3key = getS3Key(chunk_id)
-        log.debug(f"getChunk chunkid: {chunk_id} bucket: {bucket}")
+        log.debug(f"getChunk chunkid: {chunk_id} bucket: {bucket} using key: {s3key}")
         if chunk_id in chunk_cache:
             log.debug(f"getChunk chunkid: {chunk_id} found in cache")
             chunk_arr = chunk_cache[chunk_id]
diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py
index cc634526..46f2f890 100755
--- a/hsds/domain_sn.py
+++ b/hsds/domain_sn.py
@@ -462,6 +462,11 @@ async def GET_Domain(request):
     if "verbose" in params and params["verbose"]:
         verbose = True
 
+    getobjs = False
+    # include domain objects if requested
+    if params.get("getobjs"):
+        getobjs = True
+
     if not domain:
         log.info("no domain passed in, returning all top-level domains")
         # no domain passed in, return top-level domains for this request
@@ -543,22 +548,9 @@
         return resp
 
     # return just the keys as per the REST API
-    kwargs = {"verbose": verbose, "bucket": bucket}
+    kwargs = {"verbose": verbose, "getobjs": getobjs, "bucket": bucket}
     rsp_json = await getDomainResponse(app, domain_json, **kwargs)
 
-    # include domain objects if requested
-    if params.get("getobjs") and "root" in domain_json:
-
-        log.debug("getting all domain objects")
-        root_id = domain_json["root"]
-        kwargs = {"include_attrs": include_attrs, "bucket": bucket}
-        domain_objs = await getDomainObjects(app, root_id, **kwargs)
-        if domain_objs:
-            rsp_json["domain_objs"] = domain_objs
-
-    # include domain class if present
-    # if "class" in domain_json:
-    #    rsp_json["class"] = domain_json["class"]
 
     # include dn_ids if requested
     if "getdnids" in params and params["getdnids"]:
diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py
index 717912cc..921e2f2d 100644
--- a/hsds/servicenode_lib.py
+++ b/hsds/servicenode_lib.py
@@ -116,7 +116,7 @@ async def getDomainJson(app, domain, reload=False):
     return domain_json
 
 
-async def getDomainResponse(app, domain_json, bucket=None, verbose=False):
+async def getDomainResponse(app, domain_json, bucket=None, verbose=False, getobjs=False):
     """ construct JSON response for domain request """
     rsp_json = {}
     if "root" in domain_json:
@@ -189,6 +189,13 @@ async def getDomainResponse(app, domain_json, bucket=None, verbose=False):
             rsp_json["num_linked_chunks"] = num_linked_chunks
             rsp_json["md5_sum"] = md5_sum
 
+    if getobjs and "root" in domain_json:
+        root_id = domain_json["root"]
+        domain_objs = await getDomainObjs(app, root_id, bucket=bucket)
+        if domain_objs:
+            log.debug(f"returning {len(domain_objs)} objs for root_id: {root_id}")
+            rsp_json["domain_objs"] = domain_objs
+
     # pass back config parameters the client may care about
     rsp_json["limits"] = getLimits()
@@ -849,8 +856,32 @@ async def getRootInfo(app, root_id, bucket=None):
     return info_json
 
 
+async def getDomainObjs(app, root_id, bucket=None):
+    """ Return domain objects if available for this root id """
+    log.debug(f"getDomainObjs {root_id}")
+
+    s3_key = getS3Key(root_id)
+
+    parts = s3_key.split("/")
+    # the s3 key for a root id is in the format: db/<root>/.group.json
+    # derive the key for the summary object as: db/<root>/.summary.json
+    if len(parts) != 3:
+        log.error(f"Unexpected s3key format: {s3_key}")
+        return None
+
+    summary_key = f"db/{parts[1]}/.summary.json"
+
+    try:
+        summary_json = await getStorJSONObj(app, summary_key, bucket=bucket)
+    except HTTPNotFound:
+        log.warn(f".summary.json not found for key: {summary_key}")
+        return None
+
+    return summary_json
+
+
 async def doFlush(app, root_id, bucket=None):
-    """return wnen all DN nodes have wrote any pending changes to S3"""
+    """return when all DN nodes have written any pending changes to S3 """
     log.info(f"doFlush {root_id}")
     params = {"flush": 1}
     if bucket:
diff --git a/hsds/util/storUtil.py b/hsds/util/storUtil.py
index b37e25bc..7b3b8a4e 100644
--- a/hsds/util/storUtil.py
+++ b/hsds/util/storUtil.py
@@ -493,7 +493,7 @@ async def getStorBytes(app,
         chunk_bytes = []
         for chunk_location in chunk_locations:
-            log.debug(f"getStoreBytes - processing chunk_location: {chunk_location}")
+            log.debug(f"getStorBytes - processing chunk_location: {chunk_location}")
             n = chunk_location.offset - offset
             if n < 0:
                 log.warn(f"getStorBytes - unexpected offset for chunk_location: {chunk_location}")
diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py
index 697258da..04d7f20b 100755
--- a/tests/integ/dataset_test.py
+++ b/tests/integ/dataset_test.py
@@ -342,6 +342,30 @@ def testPostDatasetWithAttributes(self):
         self.assertTrue("attributes" in rspJson)
         self.assertEqual(len(rspJson["attributes"]), attr_count)
 
+        # try fetching the domain objects in the domain response
+        req = helper.getEndpoint() + "/"
+        params = {"getobjs": 1}
+        for i in range(10):
+            # try a few times to allow for async update of summary info
+            time.sleep(5)
+            rsp = self.session.get(req, params=params, headers=headers)
+            self.assertEqual(rsp.status_code, 200)
+            rspJson = json.loads(rsp.text)
+            if "domain_objs" in rspJson:
+                break
+
+        self.assertTrue("domain_objs" in 
rspJson) + domain_objs = rspJson["domain_objs"] + self.assertTrue(root_uuid in domain_objs) + self.assertTrue(dset_id in domain_objs) + dset_json = domain_objs[dset_id] + self.assertTrue("attributes" in dset_json) + self.assertEqual(len(dset_json["attributes"]), attr_count) + self.assertTrue("type" in dset_json) + self.assertTrue("shape" in dset_json) + self.assertTrue("creationProperties" in dset_json) + self.assertFalse("value" in dset_json) # no data written yet + def testScalarEmptyDimsDataset(self): # Test creation/deletion of scalar dataset obj domain = self.base_domain + "/testScalarEmptyDimsDataset.h5" diff --git a/tests/integ/vlen_test.py b/tests/integ/vlen_test.py index d3d44ab5..28bb3e90 100755 --- a/tests/integ/vlen_test.py +++ b/tests/integ/vlen_test.py @@ -35,7 +35,7 @@ def tearDown(self): # main def testPutVLenInt(self): - # Test PUT value for 1d attribute with variable length int types + # Test PUT value for 1d dataset with variable length int types print("testPutVLenInt", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) @@ -120,7 +120,7 @@ def testPutVLenInt(self): self.assertEqual(value[1], [1, 2, 3, 4]) def testPutVLenIntBinary(self): - # Test PUT value for 1d attribute with variable length int types using binary transfer + # Test PUT value for 1d dataset with variable length int types using binary transfer print("testPutVLenIntBinary", self.base_domain) count = 4 @@ -217,7 +217,7 @@ def testPutVLenIntBinary(self): self.assertEqual(value[0], [1, 2, 3]) def testPutVLen2DInt(self): - # Test PUT value for 1d attribute with variable length int types + # Test PUT value for 1d dataset with variable length int types print("testPutVLen2DInt", self.base_domain) nrow = 2 ncol = 2 @@ -294,7 +294,7 @@ def testPutVLen2DInt(self): self.assertEqual(value[0][1], [1, 2]) def testPutVLenString(self): - # Test PUT value for 1d attribute with variable length string types + # Test PUT value for 1d dataset with variable length string types print("testPutVLenString", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) @@ -364,7 +364,7 @@ def testPutVLenString(self): self.assertEqual(value[1], data[3]) def testPutVLenStringBinary(self): - # Test PUT value for 1d attribute with variable length string types + # Test PUT value for 1d dataset with variable length string types print("testPutVLenStringBinary", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) From 23bb24bd8441428233c6ba1b513991cac181fb7e Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 6 Jan 2026 18:03:31 +0800 Subject: [PATCH 47/49] fix for use of H5S_UNLIMITED in maxdims --- hsds/domain_sn.py | 1 - hsds/dset_dn.py | 2 +- hsds/dset_lib.py | 2 +- hsds/util/dsetUtil.py | 5 ----- tests/integ/dataset_test.py | 8 ++++---- 5 files changed, 6 insertions(+), 12 deletions(-) diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index 46f2f890..5758cd0d 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -551,7 +551,6 @@ async def GET_Domain(request): kwargs = {"verbose": verbose, "getobjs": getobjs, "bucket": bucket} rsp_json = await getDomainResponse(app, domain_json, **kwargs) - # include dn_ids if requested if "getdnids" in params and params["getdnids"]: rsp_json["dn_ids"] = app["dn_ids"] diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index 5b99711d..3d5a261e 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -290,7 +290,7 @@ async def PUT_DatasetShape(request): if i == extend_dim: lb = dims[i] ub = lb + extension - if maxdims[extend_dim] != 0 
and ub > maxdims[extend_dim]: + if maxdims[extend_dim] not in (0, "H5S_UNLIMITED") and ub > maxdims[extend_dim]: msg = "maximum extent exceeded" log.warn(msg) raise HTTPConflict() diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 384defe7..a6c58b45 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -1007,7 +1007,7 @@ async def updateShape(app, dset_json, shape_update, bucket=None): raise HTTPBadRequest(reason=msg) decreasing_dims.append(i) elif shape_update[i] > dims[i]: - if maxdims[i] != 0 and shape_update[i] > maxdims[i]: + if maxdims[i] not in (0, "H5S_UNLIMITED") and shape_update[i] > maxdims[i]: msg = "Extension dimension can not be extended past max extent" log.warn(msg) raise HTTPConflict() diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index 49723750..fb7d21c5 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -50,11 +50,6 @@ def getShapeJson(body): log.warn(msg) raise ValueError(msg) - if shape_class not in ("H5S_NULL", "H5S_SCALAR", "H5S_SIMPLE"): - msg = f"invalid shape class: {shape_class}" - log.warn(msg) - raise ValueError(msg) - if shape_class in ("H5S_NULL", "H5S_SCALAR") and dims: msg = f"dims not valid for shape class: {body_shape}" log.warn(msg) diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index 04d7f20b..17357119 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -938,7 +938,7 @@ def testResizableUnlimitedDataset(self): # create the dataset req = self.endpoint + "/datasets" - payload = {"type": "H5T_IEEE_F32LE", "shape": [10, 20], "maxdims": [30, 0]} + payload = {"type": "H5T_IEEE_F32LE", "shape": [10, 20], "maxdims": [30, "H5S_UNLIMITED"]} payload["creationProperties"] = {"fillValue": 3.12} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) @@ -970,7 +970,7 @@ def testResizableUnlimitedDataset(self): self.assertEqual(shape["dims"][1], 20) self.assertTrue("maxdims" in shape) self.assertEqual(shape["maxdims"][0], 30) - self.assertEqual(shape["maxdims"][1], 0) + self.assertEqual(shape["maxdims"][1], "H5S_UNLIMITED") # verify shape using the GET shape request req = req + "/shape" @@ -987,7 +987,7 @@ def testResizableUnlimitedDataset(self): self.assertTrue("maxdims" in shape) self.assertEqual(len(shape["maxdims"]), 2) self.assertEqual(shape["maxdims"][0], 30) - self.assertEqual(shape["maxdims"][1], 0) + self.assertEqual(shape["maxdims"][1], "H5S_UNLIMITED") # resize the second dimension to 500 elements payload = {"shape": [10, 500]} @@ -1009,7 +1009,7 @@ def testResizableUnlimitedDataset(self): self.assertTrue("maxdims" in shape) self.assertEqual(len(shape["maxdims"]), 2) self.assertEqual(shape["maxdims"][0], 30) - self.assertEqual(shape["maxdims"][1], 0) + self.assertEqual(shape["maxdims"][1], "H5S_UNLIMITED") def testExtendDataset(self): # test extending dataset From c66d632161f29e1753565d903b2041e52cb2be37 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 6 Jan 2026 21:17:22 +0800 Subject: [PATCH 48/49] fix for domain_test --- tests/integ/domain_test.py | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/tests/integ/domain_test.py b/tests/integ/domain_test.py index 1c68cb5e..f01dcc93 100755 --- a/tests/integ/domain_test.py +++ b/tests/integ/domain_test.py @@ -14,7 +14,7 @@ import json from os import path as pp -from h5json.objid import createObjId +from h5json.objid import createObjId, getCollectionForId import config import helper @@ -116,33 +116,21 @@ def 
testGetDomain(self):
         attr_count = 0
         for objid in domain_objs:
             obj_json = domain_objs[objid]
-            self.assertTrue("id" in obj_json)
-            self.assertTrue("attributeCount" in obj_json)
-            attr_count += obj_json["attributeCount"]
-            self.assertFalse("attributes" in obj_json)
+            collection_type = getCollectionForId(objid)
+            if collection_type == "datasets":
+                self.assertTrue("attributes" in obj_json)
+                self.assertTrue("type" in obj_json)
+                self.assertTrue("shape" in obj_json)
+                self.assertTrue("creationProperties" in obj_json)
+            elif collection_type == "groups":
+                self.assertTrue("attributes" in obj_json)
+                self.assertTrue("links" in obj_json)
+            else:
+                self.assertTrue(False)  # unexpected type
+            attr_count += len(obj_json["attributes"])
         self.assertEqual(attr_count, 4)
 
-        # get a dict of all objects in the domain including any attributes
-        params["include_attrs"] = 1
-        rsp = self.session.get(req, headers=headers, params=params)
-        self.assertEqual(rsp.status_code, 200)
-        rspJson = json.loads(rsp.text)
-        self.assertTrue("domain_objs" in rspJson)
-        domain_objs = rspJson["domain_objs"]
-        self.assertEqual(len(domain_objs), 10)
-        attr_count = 0
-        for objid in domain_objs:
-            obj_json = domain_objs[objid]
-            self.assertTrue("attributeCount" in obj_json)
-            self.assertTrue("attributes" in obj_json)
-            attributes = obj_json["attributes"]
-            for attr_name in attributes:
-                # only the names "attr1" and "attr2" are used in this domain
-                self.assertTrue(attr_name in ("attr1", "attr2"))
-                attr_count += 1
-        self.assertEqual(attr_count, 4)
-
         # passing domain via the host header is deprecated
         # Previously this returned 200, now it is a 400
         del headers["X-Hdf-domain"]
 
From 6917c5d8df1dc47ce3c11caeedd77bfc5632d5d4 Mon Sep 17 00:00:00 2001
From: John Readey
Date: Thu, 8 Jan 2026 13:17:36 +0800
Subject: [PATCH 49/49] refactor linkUtil with h5json

---
 hsds/link_dn.py          |  17 +++--
 hsds/link_sn.py          |  51 +++++++++-----
 hsds/servicenode_lib.py  |   7 +-
 hsds/util/linkUtil.py    | 139 ++------------------------------------
 tests/integ/link_test.py |  67 ++++++++++---------
 5 files changed, 92 insertions(+), 189 deletions(-)

diff --git a/hsds/link_dn.py b/hsds/link_dn.py
index f602a405..ef1c0438 100755
--- a/hsds/link_dn.py
+++ b/hsds/link_dn.py
@@ -22,9 +22,9 @@
 from h5json.objid import isValidUuid
 from h5json.time_util import getNow
+from h5json.link_util import validateLinkName, getLinkClass, isEqualLink
 
 from .util.globparser import globmatch
-from .util.linkUtil import validateLinkName, getLinkClass, isEqualLink
 from .util.domainUtil import isValidBucketName
 from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj
 from . import config
@@ -156,6 +156,10 @@ async def GET_Links(request):
         link = copy(link_dict[title])
         log.debug(f"link list[{i}]: {link}")
         link["title"] = title
+        if link.get("h5domain"):
+            # "h5domain" key is deprecated - replace with "file"
+            link["file"] = link["h5domain"]
+            del link["h5domain"]
         link_list.append(link)
 
     resp_json = {"links": link_list}
@@ -218,6 +222,7 @@ async def POST_Links(request):
             log.info(f"Link name {title} not found in group: {group_id}")
             continue
         link_json = links[title]
+        log.debug(f"POST Links got link_json: {link_json}")
         item = {}
         if "class" not in link_json:
             log.warn(f"expected to find class key for link: {title}")
@@ -245,15 +250,19 @@ async def POST_Links(request):
                 log.warn(f"expected to find h5path for external link: {title}")
                 continue
             item["h5path"] = link_json["h5path"]
-            if "h5domain" not in link_json:
-                log.warn(f"expted to find h5domain for external link: {title}")
+            if "h5domain" in link_json:
+                item["file"] = link_json["h5domain"]
+            elif "file" in link_json:
+                item["file"] = link_json["file"]
+            else:
+                log.warn(f"expected to find h5domain or file for external link: {title}")
                 continue
-            item["h5domain"] = link_json["h5domain"]
         else:
             log.warn(f"unexpected link class {link_class} for link: {title}")
             continue
 
         item["title"] = title
+        log.debug(f"adding link item: {item}")
 
         link_list.append(item)
diff --git a/hsds/link_sn.py b/hsds/link_sn.py
index 2048dd7c..a7dcc6a0 100755
--- a/hsds/link_sn.py
+++ b/hsds/link_sn.py
@@ -17,19 +17,23 @@
 from json import JSONDecodeError
 
 from h5json.objid import isValidUuid, getCollectionForId
+from h5json.link_util import validateLinkName, getLinkClass, getLinkId
+from h5json.link_util import getLinkPath, getLinkFilePath
 
 from .util.nodeUtil import getDataNodeUrl
 from .util.httpUtil import getHref, getBooleanParam
 from .util.httpUtil import jsonResponse
 from .util.globparser import globmatch
 from .util.authUtil import getUserPasswordFromRequest, validateUserPassword
-from .util.domainUtil import getDomainFromRequest, isValidDomain, verifyRoot
-from .util.domainUtil import getBucketForDomain
-from .util.linkUtil import validateLinkName, getLinkClass
+from .util.domainUtil import getDomainFromRequest, isValidDomain, verifyRoot, getBucketForDomain
+from .util.linkUtil import getRequestLink
+
+
 from .servicenode_lib import getDomainJson, validateAction
 from .servicenode_lib import getLink, putLink, putLinks, getLinks, deleteLinks
 from .domain_crawl import DomainCrawler
 from . import hsds_logger as log
+from . 
import config async def GET_Links(request): @@ -221,13 +225,13 @@ async def GET_Link(request): link_class = link_json["class"] resp_link["class"] = link_class if link_class == "H5L_TYPE_HARD": - resp_link["id"] = link_json["id"] + resp_link["id"] = getLinkId(link_json) resp_link["collection"] = getCollectionForId(link_json["id"]) elif link_class == "H5L_TYPE_SOFT": - resp_link["h5path"] = link_json["h5path"] + resp_link["h5path"] = getLinkPath(link_json) elif link_class == "H5L_TYPE_EXTERNAL": - resp_link["h5path"] = link_json["h5path"] - resp_link["h5domain"] = link_json["h5domain"] + resp_link["h5path"] = getLinkPath(link_json) + resp_link["file"] = getLinkFilePath(link_json) else: log.warn(f"Unexpected link class: {link_class}") resp_json = {} @@ -291,17 +295,32 @@ async def PUT_Link(request): msg = f"Invalid domain: {domain}" log.warn(msg) raise HTTPBadRequest(reason=msg) - bucket = getBucketForDomain(domain) await validateAction(app, domain, group_id, username, "create") - # putLink will validate these arguments - kwargs = {"bucket": bucket} - kwargs["tgt_id"] = body.get("id") - kwargs["h5path"] = body.get("h5path") - kwargs["h5domain"] = body.get("h5domain") - created = body.get("created") - if created: - kwargs["created"] = created + + predate_max_time = config.get("predate_max_time", default=10.0) + + try: + link_json = getRequestLink(link_title, body, predate_max_time=predate_max_time) + except (KeyError, TypeError, ValueError) as e: + raise HTTPBadRequest(reason=str(e)) + + link_class = getLinkClass(link_json) + + kwargs = {} + kwargs["bucket"] = getBucketForDomain(domain) + if link_class == "H5L_TYPE_HARD": + kwargs["tgt_id"] = getLinkId(link_json) + elif link_class == "H5L_TYPE_SOFT": + kwargs["h5path"] = getLinkPath(link_json) + elif link_class == "H5L_TYPE_EXTERNAL": + kwargs["h5path"] = getLinkPath(link_json) + kwargs["h5domain"] = getLinkFilePath(link_json) + else: + raise HTTPBadRequest(reason=f"unexpected link class: {link_class}") + + if "created" in link_json: + kwargs["created"] = link_json["created"] status = await putLink(app, group_id, link_title, **kwargs) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 921e2f2d..f20908bf 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -31,11 +31,12 @@ from h5json.shape_util import getShapeDims, getShapeClass from h5json.dset_util import getChunkSize, generateLayout from h5json.dset_util import getDataSize, validateDatasetCreationProps +from h5json.link_util import h5Join, validateLinkName, getLinkClass, getLinkFilePath from h5json.time_util import getNow from .util.nodeUtil import getDataNodeUrl from .util.authUtil import getAclKeys -from .util.linkUtil import h5Join, validateLinkName, getLinkClass, getRequestLinks +from .util.linkUtil import getRequestLinks from .util.storUtil import getStorJSONObj, isStorObj, getSupportedFilters from .util.authUtil import aclCheck from .util.httpUtil import http_get, http_put, http_post, http_delete @@ -479,7 +480,7 @@ async def putLink(app, group_id, title, if h5path: link_json["h5path"] = h5path if h5domain: - link_json["h5domain"] = h5domain + link_json["file"] = h5domain if created: link_json["created"] = created @@ -659,7 +660,7 @@ async def getObjectIdByPath(app, obj_id, h5path, bucket=None, refresh=False, dom raise HTTPBadRequest(reason=msg) # find domain object is stored under - domain = link_json["h5domain"] + domain = getLinkFilePath(link_json) if domain.startswith("hdf5:/"): # strip off prefix diff --git a/hsds/util/linkUtil.py 
b/hsds/util/linkUtil.py
index d0063a39..65939e7d 100644
--- a/hsds/util/linkUtil.py
+++ b/hsds/util/linkUtil.py
@@ -13,129 +13,12 @@
 # linkUtil:
 # link related functions
 #
-import time
+from h5json.time_util import getNow
+from h5json.link_util import validateLinkName, getLinkClass, getLinkPath, getLinkFilePath
 
 from .. import hsds_logger as log
 
 
-def validateLinkName(name):
-    """ verify the link name is valid """
-    if not isinstance(name, str):
-        msg = "Unexpected type for link name"
-        log.warn(msg)
-        raise ValueError(msg)
-    if name.find("/") >= 0:
-        msg = "link name contains slash"
-        log.warn(msg)
-        raise ValueError(msg)
-
-
-def getLinkClass(link_json):
-    """ verify this is a valid link
-        returns the link class """
-    log.debug(f"getLinkClass({link_json})")
-    if "class" in link_json:
-        link_class = link_json["class"]
-    else:
-        link_class = None
-    if "h5path" in link_json and "id" in link_json:
-        msg = "link tgt_id and h5path both set"
-        log.warn(msg)
-        raise ValueError(msg)
-    if "id" in link_json:
-        tgt_id = link_json["id"]
-        if not isinstance(tgt_id, str) or len(tgt_id) < 38:
-            msg = f"link with invalid id: {tgt_id}"
-            log.warn(msg)
-            raise ValueError(msg)
-        if tgt_id[:2] not in ("g-", "t-", "d-"):
-            msg = "link tgt must be group, datatype or dataset uuid"
-            log.warn(msg)
-            raise ValueError(msg)
-        if link_class:
-            if link_class != "H5L_TYPE_HARD":
-                msg = f"expected link class to be H5L_TYPE_HARD but got: {link_class}"
-                log.warn(msg)
-                raise ValueError(msg)
-        else:
-            link_class = "H5L_TYPE_HARD"
-    elif "h5path" in link_json:
-        h5path = link_json["h5path"]
-        log.debug(f"link path: {h5path}")
-        if "h5domain" in link_json:
-            if link_class:
-                if link_class != "H5L_TYPE_EXTERNAL":
-                    msg = f"expected link class to be H5L_TYPE_EXTERNAL but got: {link_class}"
-                    log.warn(msg)
-                    raise ValueError(msg)
-            else:
-                link_class = "H5L_TYPE_EXTERNAL"
-        else:
-            if link_class:
-                if link_class != "H5L_TYPE_SOFT":
-                    msg = f"expected link class to be H5L_TYPE_SOFT but got: {link_class}"
-                    log.warn(msg)
-                    raise ValueError(msg)
-            else:
-                link_class = "H5L_TYPE_SOFT"
-    else:
-        msg = "link with no id or h5path"
-        log.warn(msg)
-        raise ValueError(msg)
-
-    return link_class
-
-
-def isEqualLink(link1, link2):
-    """ Return True if the two links are the same """
-
-    for obj in (link1, link2):
-        if not isinstance(obj, dict):
-            raise TypeError(f"unexpected type: {type(obj)}")
-        if "class" not in obj:
-            raise TypeError("expected class key for link")
-    if link1["class"] != link2["class"]:
-        return False  # different link types
-    link_class = link1["class"]
-    if link_class == "H5L_TYPE_HARD":
-        for obj in (link1, link2):
-            if "id" not in obj:
-                raise TypeError(f"expected id key for link: {obj}")
-        if link1["id"] != link2["id"]:
-            return False
-    elif link_class == "H5L_TYPE_SOFT":
-        for obj in (link1, link2):
-            if "h5path" not in obj:
-                raise TypeError(f"expected h5path key for link: {obj}")
-        if link1["h5path"] != link2["h5path"]:
-            return False
-    elif link_class == "H5L_TYPE_EXTERNAL":
-        for obj in (link1, link2):
-            for k in ("h5path", "h5domain"):
-                if k not in obj:
-                    raise TypeError(f"expected {k} key for link: {obj}")
-        if link1["h5path"] != link2["h5path"]:
-            return False
-        if link1["h5domain"] != link2["h5domain"]:
-            return False
-    else:
-        raise TypeError(f"unexpected link class: {link_class}")
-    return True
-
-
-def h5Join(path, paths):
-    h5path = path
-    if not paths:
-        return h5path
-    if isinstance(paths, str):
-        paths = (paths,)
-    for s in paths:
-        if h5path[-1] != "/":
-            h5path += "/"
-        h5path += s
-    return h5path
-
-
 def getRequestLink(title, link_json, predate_max_time=0.0):
     """ return normalized link from request json.
        Throws ValueError if badly formatted """
@@ -148,16 +31,11 @@ def getRequestLink(title, link_json, predate_max_time=0.0):
 
     log.debug(f"getRequestLink title: {title} link_json: {link_json}")
 
     link_item = {}  # normalized link item to return
-    now = time.time()
+    now = getNow()
 
     validateLinkName(title)  # will raise ValueError if invalid
 
     link_class = getLinkClass(link_json)
 
-    if "class" in link_item:
-        if link_class != link_json["class"]:
-            msg = f"expected link class of: {link_class} but got {link_json}"
-            log.warn(msg)
-            raise ValueError(msg)
 
     link_item = {"class": link_class}
 
@@ -169,17 +47,10 @@ def getRequestLink(title, link_json, predate_max_time=0.0):
         link_item["id"] = link_json["id"]
     else:
         if link_class in ("H5L_TYPE_SOFT", "H5L_TYPE_EXTERNAL"):
-            if "h5path" not in link_json:
-                msg = "expected h5path key for soft link"
-                log.warn(msg)
-                raise ValueError(msg)
-            link_item["h5path"] = link_json["h5path"]
+            link_item["h5path"] = getLinkPath(link_json)
 
         if link_class == "H5L_TYPE_EXTERNAL":
-            if "h5domain" not in link_json:
-                msg = "expected h5domain key for external link"
-                log.warn(msg)
-                raise ValueError(msg)
+            link_item["file"] = getLinkFilePath(link_json)
 
     if "created" in link_json:
         created = link_json["created"]
diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py
index 244f8f5d..d95e6834 100755
--- a/tests/integ/link_test.py
+++ b/tests/integ/link_test.py
@@ -270,7 +270,7 @@ def testExternalLink(self):
         target_path = "somewhere"
         link_title = "external_link"
         req = helper.getEndpoint() + "/groups/" + root_id + "/links/" + link_title
-        payload = {"h5path": target_path, "h5domain": target_domain}
+        payload = {"h5path": target_path, "file": target_domain}
         rsp = self.session.put(req, data=json.dumps(payload), headers=headers)
         self.assertEqual(rsp.status_code, 201)  # created
@@ -294,7 +294,7 @@ def testExternalLink(self):
         self.assertEqual(rspLink["title"], link_title)
         self.assertEqual(rspLink["class"], "H5L_TYPE_EXTERNAL")
         self.assertEqual(rspLink["h5path"], target_path)
-        self.assertEqual(rspLink["h5domain"], target_domain)
+        self.assertEqual(rspLink["file"], target_domain)
 
     def testGetLinks(self):
        domain = self.base_domain + "/testGetLinks.h5"
@@ -524,10 +524,10 @@ def testGet(self):
                 self.assertTrue(link["created"] < now - 10)
             else:
                 self.assertEqual(link_class, "H5L_TYPE_EXTERNAL")
-                for name in ("created", "class", "h5domain", "h5path", "title", "href"):
+                for name in ("created", "class", "file", "h5path", "title", "href"):
                     self.assertTrue(name in link)
                 self.assertEqual(link["title"], "extlink")
-                extlink_file = link["h5domain"]
+                extlink_file = link["file"]
                 self.assertEqual(extlink_file, "somefile")
                 self.assertEqual(link["h5path"], "somepath")
                 self.assertTrue(link["created"] < now - 10)
@@ -555,7 +555,8 @@ def testGet(self):
             self.assertTrue(name in link)
 
         self.assertEqual(link["class"], "H5L_TYPE_SOFT")
-        self.assertFalse("h5domain" in link)  # only for external links
+        self.assertFalse("h5domain" in link)  # deprecated name
+        self.assertFalse("file" in link)  # only for external links
         self.assertEqual(link["title"], "slink")
         self.assertEqual(link["h5path"], "somevalue")
@@ -618,12 +619,14 @@ def testGetRecursive(self):
                 softlink_count += 1
                 self.assertTrue("h5path" in link)
                 self.assertFalse("h5domain" in link)
+                self.assertFalse("file" in link)
                 self.assertFalse("id" in link)
                 self.assertTrue(link_title in expected_soft_links)
             elif link_class == "H5L_TYPE_EXTERNAL":
extlink_count += 1 self.assertTrue("h5path" in link) - self.assertTrue("h5domain" in link) + self.assertTrue("file" in link) + self.assertFalse("h5domain" in link) # deprecated name self.assertFalse("id" in link) self.assertTrue(link_title in expected_external_links) else: @@ -690,7 +693,7 @@ def testGetPattern(self): self.assertEqual(len(links), 1) # only extlink should be returned link = links[0] - for name in ("created", "class", "h5domain", "h5path", "title"): + for name in ("created", "class", "file", "h5path", "title"): self.assertTrue(name in link) if use_post: pass # no href with post @@ -698,7 +701,7 @@ def testGetPattern(self): self.assertTrue("href" in link) self.assertEqual(link["class"], "H5L_TYPE_EXTERNAL") self.assertEqual(link["title"], "extlink") - self.assertEqual(link["h5domain"], "somefile") + self.assertEqual(link["file"], "somefile") self.assertEqual(link["h5path"], "somepath") self.assertTrue(link["created"] < now - 10) @@ -926,7 +929,7 @@ def testExternalLinkTraversal(self): target_path = "/external_group" link_title = "external_link_to_group" req = helper.getEndpoint() + "/groups/" + root_id + "/links/" + link_title - payload = {"h5path": target_path, "h5domain": second_domain} + payload = {"h5path": target_path, "file": second_domain} headers = helper.getRequestHeaders(domain=domain) rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) @@ -959,7 +962,7 @@ def testExternalLinkTraversal(self): target_path = "/external_group" link_title = "external_link_to_group_prefix" req = helper.getEndpoint() + "/groups/" + root_id + "/links/" + link_title - payload = {"h5path": target_path, "h5domain": f"hdf5:/{second_domain}"} + payload = {"h5path": target_path, "file": f"hdf5:/{second_domain}"} headers = helper.getRequestHeaders(domain=domain) rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) @@ -1222,10 +1225,10 @@ def testPostLinkSingle(self): self.assertTrue(link["created"] < now - 10) else: self.assertEqual(link_class, "H5L_TYPE_EXTERNAL") - for name in ("created", "class", "h5domain", "h5path", "title"): + for name in ("created", "class", "file", "h5path", "title"): self.assertTrue(name in link) self.assertEqual(link["title"], "extlink") - extlink_file = link["h5domain"] + extlink_file = link["file"] self.assertEqual(extlink_file, "somefile") self.assertEqual(link["h5path"], "somepath") self.assertTrue(link["created"] < now - 10) @@ -1296,7 +1299,7 @@ def testPostLinkMultiple(self): # soft or external link self.assertEqual(link["h5path"], expected["h5path"]) if link_class == "H5L_TYPE_EXTERNAL": - self.assertEqual(link["h5domain"], expected["h5domain"]) + self.assertEqual(link["file"], expected["file"]) # get just the requested links for each group req = helper.getEndpoint() + "/groups/" + root_id + "/links" @@ -1498,7 +1501,7 @@ def testPutLinkMultiple(self): # add a soft and external link as well links["softlink"] = {"h5path": "a_path"} - links["extlink"] = {"h5path": "another_path", "h5domain": "/a_domain"} + links["extlink"] = {"h5path": "another_path", "file": "/a_domain"} link_count = len(links) # write links to the grpA @@ -1531,8 +1534,8 @@ def testPutLinkMultiple(self): self.assertTrue("h5path" in link) h5path = link["h5path"] self.assertEqual(h5path, "another_path") - self.assertTrue("h5domain" in link) - h5domain = link["h5domain"] + self.assertTrue("file" in link) + h5domain = link["file"] self.assertEqual(h5domain, "/a_domain") else: 
self.assertTrue(False) # unexpected @@ -1545,7 +1548,7 @@ def testPutLinkMultiple(self): links = {} links["hardlink_multicast"] = {"id": root_id} links["softlink_multicast"] = {"h5path": "multi_path"} - links["extlink_multicast"] = {"h5path": "multi_path", "h5domain": "/another_domain"} + links["extlink_multicast"] = {"h5path": "multi_path", "file": "/another_domain"} link_count = len(links) data = {"links": links, "grp_ids": grp_ids} req = self.endpoint + "/groups/" + root_id + "/links" @@ -1573,8 +1576,8 @@ def testPutLinkMultiple(self): elif link_class == "H5L_TYPE_EXTERNAL": self.assertTrue("h5path" in ret_link) self.assertEqual(ret_link["h5path"], "multi_path") - self.assertTrue("h5domain" in ret_link) - self.assertEqual(ret_link["h5domain"], "/another_domain") + self.assertTrue("file" in ret_link) + self.assertEqual(ret_link["file"], "/another_domain") else: self.assertTrue(False) # unexpected @@ -1585,7 +1588,7 @@ def testPutLinkMultiple(self): links = {} links[f"hardlink_{i}"] = {"id": root_id} links[f"softlink_{i}"] = {"h5path": f"multi_path_{i}"} - ext_link = {"h5path": f"multi_path_{i}", "h5domain": f"/another_domain/{i}"} + ext_link = {"h5path": f"multi_path_{i}", "file": f"/another_domain/{i}"} links[f"extlink_{i}"] = ext_link link_data[grp_id] = {"links": links} @@ -1625,8 +1628,8 @@ def testPutLinkMultiple(self): self.assertEqual(link_title, f"extlink_{i}") self.assertTrue("h5path" in ret_link) self.assertEqual(ret_link["h5path"], f"multi_path_{i}") - self.assertTrue("h5domain" in ret_link) - self.assertEqual(ret_link["h5domain"], f"/another_domain/{i}") + self.assertTrue("file" in ret_link) + self.assertEqual(ret_link["file"], f"/another_domain/{i}") else: self.assertTrue(False) # unexpected @@ -1685,7 +1688,7 @@ def testPutLinkMultipleWithTimestamps(self): # add a soft and external link as well links["softlink"] = {"h5path": "a_path"} - links["extlink"] = {"h5path": "another_path", "h5domain": "/a_domain"} + links["extlink"] = {"h5path": "another_path", "file": "/a_domain"} link_count = len(links) # add timestamp timestamps = set() @@ -1725,8 +1728,8 @@ def testPutLinkMultipleWithTimestamps(self): self.assertTrue("h5path" in link) h5path = link["h5path"] self.assertEqual(h5path, "another_path") - self.assertTrue("h5domain" in link) - h5domain = link["h5domain"] + self.assertTrue("file" in link) + h5domain = link["file"] self.assertEqual(h5domain, "/a_domain") else: self.assertTrue(False) # unexpected @@ -1741,7 +1744,7 @@ def testPutLinkMultipleWithTimestamps(self): links = {} links["hardlink_multicast"] = {"id": root_id} links["softlink_multicast"] = {"h5path": "multi_path"} - links["extlink_multicast"] = {"h5path": "multi_path", "h5domain": "/another_domain"} + links["extlink_multicast"] = {"h5path": "multi_path", "file": "/another_domain"} link_count = len(links) timestamps = set() for title in links: @@ -1776,8 +1779,8 @@ def testPutLinkMultipleWithTimestamps(self): elif link_class == "H5L_TYPE_EXTERNAL": self.assertTrue("h5path" in ret_link) self.assertEqual(ret_link["h5path"], "multi_path") - self.assertTrue("h5domain" in ret_link) - self.assertEqual(ret_link["h5domain"], "/another_domain") + self.assertTrue("file" in ret_link) + self.assertEqual(ret_link["file"], "/another_domain") else: self.assertTrue(False) # unexpected self.assertTrue("created" in ret_link) @@ -1791,7 +1794,7 @@ def testPutLinkMultipleWithTimestamps(self): links = {} links[f"hardlink_{i}"] = {"id": root_id} links[f"softlink_{i}"] = {"h5path": f"multi_path_{i}"} - ext_link = {"h5path": 
f"multi_path_{i}", "h5domain": f"/another_domain/{i}"} + ext_link = {"h5path": f"multi_path_{i}", "file": f"/another_domain/{i}"} links[f"extlink_{i}"] = ext_link for title in links: link = links[title] @@ -1836,8 +1839,8 @@ def testPutLinkMultipleWithTimestamps(self): self.assertEqual(link_title, f"extlink_{i}") self.assertTrue("h5path" in ret_link) self.assertEqual(ret_link["h5path"], f"multi_path_{i}") - self.assertTrue("h5domain" in ret_link) - self.assertEqual(ret_link["h5domain"], f"/another_domain/{i}") + self.assertTrue("file" in ret_link) + self.assertEqual(ret_link["file"], f"/another_domain/{i}") else: self.assertTrue(False) # unexpected self.assertTrue("created" in ret_link) @@ -1886,7 +1889,7 @@ def testDeleteLinkMultiple(self): links[title] = {"h5path": "a_path"} titles.append(title) title = "extlink" - links[title] = {"h5path": "another_path", "h5domain": "/a_domain"} + links[title] = {"h5path": "another_path", "file": "/a_domain"} titles.append(title) link_count = len(links)
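
For reference, a minimal client-side sketch (not part of the patch) of the
external-link key rename exercised by the tests above. The endpoint, domain
name, and link names are illustrative, and auth headers are omitted for brevity:

    import json
    import requests

    endpoint = "http://localhost:5101"  # assumed local HSDS endpoint
    headers = {"X-Hdf-domain": "/home/test_user1/linkdemo.h5"}  # illustrative domain

    # the domain response includes the root group id
    root_id = requests.get(endpoint + "/", headers=headers).json()["root"]

    # create an external link - the "file" key replaces the deprecated "h5domain"
    payload = {"h5path": "/somewhere", "file": "/home/test_user1/other.h5"}
    req = endpoint + "/groups/" + root_id + "/links/extlink"
    rsp = requests.put(req, data=json.dumps(payload), headers=headers)
    assert rsp.status_code == 201

    # the link response also uses "file" rather than "h5domain"
    link = requests.get(req, headers=headers).json()["link"]
    assert link["class"] == "H5L_TYPE_EXTERNAL"
    assert link["file"] == "/home/test_user1/other.h5"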