diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7e24ec14..5175ea54 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-22.04, ubuntu-latest, windows-latest] - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] build-method: ["manual", "docker"] runs-on: ${{ matrix.os }} diff --git a/admin/config/config.yml b/admin/config/config.yml index 756be465..cb46522a 100755 --- a/admin/config/config.yml +++ b/admin/config/config.yml @@ -47,6 +47,7 @@ flush_sleep_interval: 1 # time to wait between checking on dirty objects flush_timeout: 10 # max time to wait on all I/O operations to complete for a flush min_chunk_size: 1m # 1 MB max_chunk_size: 4m # 4 MB +default_vlen_type_size: 128 # guess for average size of variable length types max_request_size: 100m # 100 MB - should be no smaller than client_max_body_size in nginx tmpl (if using nginx) max_chunks_per_folder: 0 # max number of chunks per s3 folder. 0 for unlimiited max_task_count: 100 # maximum number of concurrent tasks per node before server will return 503 error @@ -70,7 +71,7 @@ admin_group: null # enable admin privileges for any user in this group openid_provider: azure # OpenID authentication provider openid_url: null # OpenID connect endpoint if provider is not azure or google openid_audience: null # OpenID audience. This is synonymous with azure_resource_id for azure -openid_claims: unique_name,appid,roles # Comma seperated list of claims to resolve to usernames. +openid_claims: unique_name,appid,roles # Comma separated list of claims to resolve to usernames. chaos_die: 0 # if > 0, have nodes randomly die after n seconds (for testing) standalone_app: false # True when run as a single application blosc_nthreads: 2 # number of threads to use for blosc compression. 
Set to 0 to have blosc auto-determine thread count @@ -88,6 +89,10 @@ allow_any_bucket_read: true # enable reads to buckets other than default bucket allow_any_bucket_write: true # enable writes to buckets other than default bucket bit_shuffle_default_blocksize: 2048 # default blocksize for bitshuffle filter max_rangeget_gap: 1024 # max gap in byte for intelligent range get requests +predate_maxtime: 10.0 # max delta between object created timestamp in request and actual time +posix_delay: 0.0 # delay for POSIX IO operations for simulating cloud storage latencies +max_compact_dset_size: 65536 # size in bytes for maximum compact storage size +max_timestamp_drift: 300 # number of seconds a client-based timestamp can differ from current time # DEPRECATED - the remaining config values are not used in currently but kept for backward compatibility with older container images aws_lambda_chunkread_function: null # name of aws lambda function for chunk reading aws_lambda_threshold: 4 # number of chunks per node per request to reach before using lambda diff --git a/hsds/async_lib.py b/hsds/async_lib.py index e749e8a2..997432cc 100755 --- a/hsds/async_lib.py +++ b/hsds/async_lib.py @@ -15,18 +15,24 @@ from aiohttp.client_exceptions import ClientError from aiohttp.web_exceptions import HTTPNotFound, HTTPInternalServerError from aiohttp.web_exceptions import HTTPForbidden -from .util.idUtil import isValidUuid, isSchema2Id, getS3Key, isS3ObjKey -from .util.idUtil import getObjId, isValidChunkId, getCollectionForId -from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator -from .util.hdf5dtype import getItemSize, createDataType -from .util.arrayUtil import getNumElements, bytesToArray -from .util.dsetUtil import getHyperslabSelection, getFilterOps, getChunkDims, getFilters -from .util.dsetUtil import getDatasetLayoutClass, getDatasetLayout, getShapeDims +from h5json.hdf5dtype import getItemSize +from h5json.hdf5dtype import createDataType +from h5json.array_util import getNumElements, bytesToArray, bytesArrayToList +from h5json.objid import isValidUuid, isSchema2Id, getS3Key, isS3ObjKey +from h5json.objid import getObjId, isValidChunkId, getCollectionForId +from h5json.filters import getFilters +from h5json.shape_util import getShapeDims, getDataSize +from h5json.dset_util import getDatasetLayoutClass, getDatasetLayout, getChunkDims +from h5json.time_util import getNow + +from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator, getChunkIndex, getChunkIds +from .util.dsetUtil import getHyperslabSelection from .util.storUtil import getStorKeys, putStorJSONObj, getStorJSONObj from .util.storUtil import deleteStorObj, getStorBytes, isStorObj +from .datanode_lib import getFilterOps from . import hsds_logger as log from . import config -import time + # List all keys under given root and optionally update info.json # Note: only works with schema v2 domains! 
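Aside: the new `max_timestamp_drift` config entry added above is consumed later in this patch by `PUT_Attributes` in `attr_dn.py`. A minimal sketch of that pattern, assuming the hsds package context; the `resolve_create_time` helper name is illustrative only and not part of the patch:

```python
# Sketch only - mirrors the timestamp-drift check added to attr_dn.py below.
from h5json.time_util import getNow

from . import config


def resolve_create_time(app, client_created=None):
    """Return a creation timestamp, falling back to server time when the
    client-supplied value drifts too far from the current time."""
    now = getNow(app)
    max_drift = int(config.get("max_timestamp_drift", default=300))
    if client_created is None:
        return now
    if abs(client_created - now) > max_drift:
        # client clock is stale or skewed - use server time instead
        return now
    return client_created
```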
@@ -71,9 +77,10 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None): msg += f"{dset_id}" log.warn(msg) return + type_json = dset_json["type"] item_size = getItemSize(type_json) - if "layout" not in dset_json: + if not getDatasetLayout(dset_json): msg = "updateDatasetInfo - expected to find layout in dataset_json " msg += f"for {dset_id}" log.warn(msg) return @@ -106,7 +113,7 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None): if layout_class == "H5D_CONTIGUOUS_REF": # In H5D_CONTIGUOUS_REF a non-compressed part of the HDF5 is divided # into equal size chunks, so we can just compute link bytes and num - # chunks based on the size of the coniguous dataset + # chunks based on the size of the contiguous dataset layout_dims = getChunkDims(dset_json) num_chunks = getNumChunks(selection, layout_dims) chunk_size = item_size @@ -262,20 +269,26 @@ def scanRootCallback(app, s3keys): results = app["scanRoot_results"] scanRoot_keyset = app["scanRoot_keyset"] checksums = results["checksums"] + for s3key in s3keys.keys(): if not isS3ObjKey(s3key): - log.info(f"not s3obj key, ignoring: {s3key}") + log.info(f"scanRoot - not s3obj key, ignoring: {s3key}") continue if s3key in scanRoot_keyset: - log.warn(f"scanRoot - dejavu for key: {s3key}") + log.warn(f"scanRoot - dejavu for key: {s3key}") continue scanRoot_keyset.add(s3key) - msg = f"scanRoot adding key: {s3key} to keyset, " + msg = f"scanRoot - adding key: {s3key} to keyset, " msg += f"{len(scanRoot_keyset)} keys" log.debug(msg) objid = getObjId(s3key) + + if objid in app["deleted_ids"]: + log.debug(f"scanRoot - skipping deleted id: {objid}") + continue + etag = None obj_size = None lastModified = None @@ -300,8 +313,15 @@ def scanRootCallback(app, s3keys): is_chunk = True results["num_chunks"] += 1 results["allocated_bytes"] += obj_size + chunk_index = getChunkIndex(objid) + if max(chunk_index) == 0: + # save the first chunk if present + # this will be used to save dataset values to + # the obj_ids set for small datasets + results["obj_ids"].add(objid) else: results["metadata_bytes"] += obj_size + results["obj_ids"].add(objid) if is_chunk or getCollectionForId(objid) == "datasets": if is_chunk: @@ -339,6 +359,144 @@ def scanRootCallback(app, s3keys): log.error(msg) +async def _getDatsetValueJson(app, dset_id, dset_json, obj_ids, size_limit=None, bucket=None): + """ If the dataset size is less than size_limit, and the chunk_ids for the dataset are + available, return a JSON representation of the dataset values.
Otherwise, return None """ + + dims = getShapeDims(dset_json) + if dims is None: + return None # null dataspace + if "type" not in dset_json: + msg = f"_getDatsetValueJson - expected to find type in dataset_json for {dset_id}" + log.warn(msg) + return None + type_json = dset_json["type"] + item_size = getItemSize(type_json) + if item_size == "H5T_VARIABLE": + item_size = 1024 # make a guess for variable length types + dataset_size = getDataSize(dims, item_size) + if dataset_size > size_limit: + log.debug(f"_getDatasetValueJson - dataset size {dataset_size} exceeds limit {size_limit}") + return None + + chunk_dims = getChunkDims(dset_json) + if not chunk_dims: + log.warning(f"_getDatasetValueJson - no layout found for dataset: {dset_id}") + return None + if chunk_dims != dims: + msg = f"_getDatasetValueJson - dataset layout {chunk_dims} does not match dims {dims} " + msg += f"for dataset: {dset_id}, ignoring" + log.warning(msg) + return None + select_all = getHyperslabSelection(dims) # select entire datashape + chunk_ids = getChunkIds(dset_id, select_all, dims) + if len(chunk_ids) == 0: + log.debug(f"_getDatasetValueJson - no chunk ids found for dataset: {dset_id}") + return None + if len(chunk_ids) > 1: + log.debug(f"_getDatasetValueJson - more than one chunk id found for dataset: {dset_id}") + return None + chunk_id = chunk_ids[0] + if chunk_id not in obj_ids: + log.debug(f"_getDatasetValueJson - chunk id {chunk_id} not in scanned obj_ids") + return None + log.debug(f"using chunk: {chunk_id} to get dataset value for {dset_id}") + + # fetch the chunk - using getStorBytes since this will not be used with + # chunk cache or chunk crawlers + # TBD: need parameters for s3path, s3offset, s3size for ref layouts + # regular store read + + filters = getFilters(dset_json) + dt = createDataType(type_json) + filter_ops = getFilterOps(app, dset_id, filters, dtype=dt, chunk_shape=chunk_dims) + + kwargs = { + "filter_ops": filter_ops, + "offset": None, + "length": None, + "bucket": bucket + } + s3key = getS3Key(chunk_id) + + try: + chunk_bytes = await getStorBytes(app, s3key, **kwargs) + except HTTPNotFound: + log.warning(f"_getDatasetValueJson - HTTPNotFound for chunk {chunk_id} bucket:{bucket}") + return None + except HTTPForbidden: + log.warning(f"_getDatasetValueJson - HTTPForbidden for chunk {chunk_id} bucket:{bucket}") + return None + except HTTPInternalServerError: + msg = "_getDatasetValueJson - " + msg += f"HTTPInternalServerError for chunk {chunk_id} bucket:{bucket}" + log.warning(msg) + return None + + if chunk_bytes is None: + msg = f"_getDatasetValueJson - read {chunk_id} bucket: {bucket} returned None" + log.warning(msg) + return None + + arr = bytesToArray(chunk_bytes, dt, chunk_dims) + + json_value = bytesArrayToList(arr) + log.debug(f"_getDatsetValueJson - returning {json_value}") + + return json_value + + +async def getConsolidatedMetaData(app, obj_ids, bucket=None): + # create a consolidated metadata summary for all objects in the domain + # return a dict of obj_ids to their metadata summaries + log.info("getConsolidatedMetaData - creating consolidated metadata summary") + consolidated_metadata = {} + for obj_id in obj_ids: + if isValidChunkId(obj_id): + # skip chunks - we may use the chunk later when processing its dataset object + continue + s3_key = getS3Key(obj_id) + try: + obj_json = await getStorJSONObj(app, s3_key, bucket=bucket) + except HTTPNotFound: + log.warn(f"HTTPNotFound for {s3_key} bucket:{bucket}") + continue + except HTTPForbidden: + log.warn(f"HTTPForbidden error
for {s3_key} bucket:{bucket}") + continue + except HTTPInternalServerError: + msg = f"HTTPInternalServerError error for {s3_key} bucket:{bucket}" + log.warn(msg) + continue + log.debug(f"getConsolidatedMetaData - got json for obj_id: {obj_id}: {obj_json}") + # extract relevant metadata + metadata_summary = {} + if "type" in obj_json: + metadata_summary["type"] = obj_json["type"] + if "shape" in obj_json: + metadata_summary["shape"] = obj_json["shape"] + if "attributes" in obj_json: + metadata_summary["attributes"] = obj_json["attributes"] + if "links" in obj_json: + metadata_summary["links"] = obj_json["links"] + if "creationProperties" in obj_json: + metadata_summary["creationProperties"] = obj_json["creationProperties"] + if getCollectionForId(obj_id) == "datasets": + log.debug("getConsolidatedMetaData - got dataset") + size_limit = 4096 # TBD - make this a config + kwargs = {"size_limit": size_limit, "bucket": bucket} + json_value = await _getDatsetValueJson(app, obj_id, obj_json, obj_ids, **kwargs) + if json_value is not None: + log.debug(f"adding dataset value to metadata summary for dataset: {obj_id}") + metadata_summary["value"] = json_value + else: + log.debug("getConsolidatedMetaData - not a dataset") + + consolidated_metadata[obj_id] = metadata_summary + log.info("getConsolidatedMetaData - done creating consolidated metadata summary") + return consolidated_metadata + + async def scanRoot(app, rootid, update=False, bucket=None): # iterate through all s3 keys under the given root. @@ -380,9 +538,10 @@ async def scanRoot(app, rootid, update=False, bucket=None): results["num_linked_chunks"] = 0 results["linked_bytes"] = 0 results["logical_bytes"] = 0 - results["checksums"] = {} # map of objid to checksums + results["obj_ids"] = set() # map of object ids scanned (and first chunk id for datasets) + results["checksums"] = {} # map of objid to checksums results["bucket"] = bucket - results["scan_start"] = time.time() + results["scan_start"] = getNow(app=app) app["scanRoot_results"] = results app["scanRoot_keyset"] = set() @@ -399,6 +558,9 @@ async def scanRoot(app, rootid, update=False, bucket=None): num_objects += len(results["datasets"]) num_objects += results["num_chunks"] log.info(f"scanRoot - got {num_objects} keys for rootid: {rootid}") + obj_ids = results["obj_ids"] + log.info(f"scanRoot - got {len(obj_ids)} unique object ids") + log.debug(f"scanRoot - obj_ids: {obj_ids}") dataset_results = results["datasets"] for dsetid in dataset_results: @@ -437,7 +599,12 @@ async def scanRoot(app, rootid, update=False, bucket=None): # free up memory used by the checksums del results["checksums"] - results["scan_complete"] = time.time() + results["scan_complete"] = getNow(app=app) + + # extract the obj_ids set, that won't go into .info.json + obj_ids = results["obj_ids"] + del results["obj_ids"] + log.debug(f"obj_ids set: {obj_ids}") if update: # write .info object back to S3 @@ -446,6 +613,17 @@ async def scanRoot(app, rootid, update=False, bucket=None): msg += f"{results}" log.info(msg) await putStorJSONObj(app, info_key, results, bucket=bucket) + + # create a json summary of objects in ths domain + log.debug(f"Creating consolidated metadata summary for root {rootid}") + summary_key = root_prefix + ".summary.json" + summary_data = await getConsolidatedMetaData(app, obj_ids, bucket=bucket) + if summary_data: + log.info(f"Got consolidated metadata summary for root {rootid}") + log.debug(f"Summary data: {summary_data}") + await putStorJSONObj(app, summary_key, summary_data, bucket=bucket) + 
else: + log.info(f"No consolidated metadata summary for root {rootid}") return results diff --git a/hsds/attr_dn.py b/hsds/attr_dn.py index 456e9854..a660b836 100755 --- a/hsds/attr_dn.py +++ b/hsds/attr_dn.py @@ -12,21 +12,24 @@ # # attribute handling routines # -import time from bisect import bisect_left from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict, HTTPNotFound, HTTPGone from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response +from h5json.hdf5dtype import getItemSize, createDataType +from h5json.array_util import arrayToBytes, jsonToArray, decodeData +from h5json.array_util import bytesToArray, bytesArrayToList, getNumElements +from h5json.shape_util import getShapeDims +from h5json.time_util import getNow + from .util.attrUtil import validateAttributeName, isEqualAttr -from .util.hdf5dtype import getItemSize, createDataType from .util.globparser import globmatch -from .util.dsetUtil import getShapeDims -from .util.arrayUtil import arrayToBytes, jsonToArray, decodeData -from .util.arrayUtil import bytesToArray, bytesArrayToList, getNumElements from .util.domainUtil import isValidBucketName from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj + +from . import config from . import hsds_logger as log @@ -359,15 +362,17 @@ async def PUT_Attributes(request): log.request(request) app = request.app params = request.rel_url.query - log.debug(f"got PUT_Attributes params: {params}") + log.debug(f"got PUT_Attributes params: {dict(params)}") obj_id = get_obj_id(request) + now = getNow(app) + max_timestamp_drift = int(config.get("max_timestamp_drift", default=300)) if not request.has_body: log.error("PUT_Attribute with no body") raise HTTPBadRequest(message="body expected") body = await request.json() - log.debug(f"got body: {body}") + log.debug(f"PUT_Attributes got body: {body}") if "bucket" in params: bucket = params["bucket"] elif "bucket" in body: @@ -436,8 +441,8 @@ async def PUT_Attributes(request): data = arr.tolist() try: json_data = bytesArrayToList(data) - log.debug(f"converted encoded data to {json_data}") - if attr_shape["class"] == "H5S_SCALAR": + log.debug(f"converted encoded data to '{json_data}'") + if attr_shape["class"] == "H5S_SCALAR" and isinstance(json_data, list): attr_json["value"] = json_data[0] # just store the scalar else: attr_json["value"] = json_data @@ -458,11 +463,18 @@ async def PUT_Attributes(request): attributes = obj_json["attributes"] - create_time = time.time() # check for conflicts new_attributes = set() # attribute names that are new or replacements for attr_name in items: attribute = items[attr_name] + if attribute.get("created"): + create_time = attribute["created"] + log.debug(f"attribute {attr_name} has create time: {create_time}") + if abs(create_time - now) > max_timestamp_drift: + log.warn(f"attribute {attr_name} create time stale, ignoring") + create_time = now + else: + create_time = now if attr_name in attributes: log.debug(f"attribute {attr_name} exists") old_item = attributes[attr_name] @@ -510,7 +522,7 @@ async def PUT_Attributes(request): if new_attributes: # update the obj lastModified - now = time.time() + now = getNow(app) obj_json["lastModified"] = now # write back to S3, save to metadata cache await save_metadata_obj(app, obj_id, obj_json, bucket=bucket) @@ -609,7 +621,7 @@ async def DELETE_Attributes(request): if save_obj: # update the object lastModified - now = time.time() + now = getNow(app) obj_json["lastModified"] = now await save_metadata_obj(app, obj_id, 
obj_json, bucket=bucket) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index b7ecdce4..44346929 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -18,21 +18,21 @@ from aiohttp.web import StreamResponse from json import JSONDecodeError +from h5json.hdf5dtype import createDataType, getItemSize +from h5json.array_util import jsonToArray, getNumElements +from h5json.array_util import bytesToArray, arrayToBytes, decodeData, encodeData +from h5json.objid import isValidUuid +from h5json.shape_util import getShapeDims + from .util.httpUtil import getAcceptType, jsonResponse, getHref, getBooleanParam from .util.globparser import globmatch -from .util.idUtil import isValidUuid, getRootObjId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot from .util.attrUtil import validateAttributeName, getRequestCollectionName -from .util.hdf5dtype import validateTypeItem, getBaseTypeJson -from .util.hdf5dtype import createDataType, getItemSize -from .util.arrayUtil import jsonToArray, getNumElements, bytesArrayToList -from .util.arrayUtil import bytesToArray, arrayToBytes, decodeData, encodeData -from .util.dsetUtil import getShapeDims - -from .servicenode_lib import getDomainJson, getObjectJson, validateAction -from .servicenode_lib import getAttributes, putAttributes, deleteAttributes + +from .servicenode_lib import getDomainJson, getAttributeFromRequest, getAttributesFromRequest +from .servicenode_lib import getAttributes, putAttributes, deleteAttributes, validateAction from .domain_crawl import DomainCrawler from . import hsds_logger as log from . import config @@ -295,244 +295,6 @@ async def GET_Attribute(request): return resp -async def _getTypeFromRequest(app, body, obj_id=None, bucket=None): - """ return a type json from the request body """ - if "type" not in body: - msg = "PUT attribute with no type in body" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - datatype = body["type"] - - if isinstance(datatype, str) and datatype.startswith("t-"): - # Committed type - fetch type json from DN - ctype_id = datatype - log.debug(f"got ctypeid: {ctype_id}") - ctype_json = await getObjectJson(app, ctype_id, bucket=bucket) - log.debug(f"ctype {ctype_id}: {ctype_json}") - root_id = getRootObjId(obj_id) - if ctype_json["root"] != root_id: - msg = "Referenced committed datatype must belong in same domain" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - datatype = ctype_json["type"] - # add the ctype_id to the type - datatype["id"] = ctype_id - elif isinstance(datatype, str): - try: - # convert predefined type string (e.g. 
"H5T_STD_I32LE") to - # corresponding json representation - datatype = getBaseTypeJson(datatype) - except TypeError: - msg = "PUT attribute with invalid predefined type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - try: - validateTypeItem(datatype) - except KeyError as ke: - msg = f"KeyError creating type: {ke}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except TypeError as te: - msg = f"TypeError creating type: {te}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except ValueError as ve: - msg = f"ValueError creating type: {ve}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - return datatype - - -def _getShapeFromRequest(body): - """ get shape json from request body """ - shape_json = {} - if "shape" in body: - shape_body = body["shape"] - shape_class = None - if isinstance(shape_body, dict) and "class" in shape_body: - shape_class = shape_body["class"] - elif isinstance(shape_body, str): - shape_class = shape_body - if shape_class: - if shape_class == "H5S_NULL": - shape_json["class"] = "H5S_NULL" - if isinstance(shape_body, dict) and "dims" in shape_body: - msg = "can't include dims with null shape" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if isinstance(shape_body, dict) and "value" in body: - msg = "can't have H5S_NULL shape with value" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif shape_class == "H5S_SCALAR": - shape_json["class"] = "H5S_SCALAR" - dims = getShapeDims(shape_body) - if len(dims) != 1 or dims[0] != 1: - msg = "dimensions aren't valid for scalar attribute" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif shape_class == "H5S_SIMPLE": - shape_json["class"] = "H5S_SIMPLE" - dims = getShapeDims(shape_body) - shape_json["dims"] = dims - else: - msg = f"Unknown shape class: {shape_class}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - # no class, interpet shape value as dimensions and - # use H5S_SIMPLE as class - if isinstance(shape_body, list) and len(shape_body) == 0: - shape_json["class"] = "H5S_SCALAR" - else: - shape_json["class"] = "H5S_SIMPLE" - dims = getShapeDims(shape_body) - shape_json["dims"] = dims - else: - shape_json["class"] = "H5S_SCALAR" - - return shape_json - - -def _getValueFromRequest(body, data_type, data_shape): - """ Get attribute value from request json """ - dims = getShapeDims(data_shape) - if "value" in body: - if dims is None: - msg = "Bad Request: data can not be included with H5S_NULL space" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - value = body["value"] - # validate that the value agrees with type/shape - arr_dtype = createDataType(data_type) # np datatype - if len(dims) == 0: - np_dims = [1, ] - else: - np_dims = dims - - if body.get("encoding"): - item_size = getItemSize(data_type) - if item_size == "H5T_VARIABLE": - msg = "base64 encoding is not support for variable length attributes" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - try: - data = decodeData(value) - except ValueError: - msg = "unable to decode data" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - expected_numbytes = arr_dtype.itemsize * np.prod(dims) - if len(data) != expected_numbytes: - msg = f"expected: {expected_numbytes} but got: {len(data)}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - # check to see if this works with our shape and type - try: - arr = bytesToArray(data, arr_dtype, np_dims) - except ValueError as e: - log.debug(f"data: {data}") - log.debug(f"type: {arr_dtype}") - log.debug(f"np_dims: {np_dims}") - msg = f"Bad Request: encoded input data 
doesn't match shape and type: {e}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - value_json = None - # now try converting to JSON - list_data = arr.tolist() - try: - value_json = bytesArrayToList(list_data) - except ValueError as err: - msg = f"Cannot decode bytes to list: {err}, will store as encoded bytes" - log.warn(msg) - if value_json: - log.debug("will store base64 input as json") - if data_shape["class"] == "H5S_SCALAR": - # just use the scalar value - value = value_json[0] - else: - value = value_json # return this - else: - value = data # return bytes to signal that this needs to be encoded - else: - # verify that the input data matches the array shape and type - try: - jsonToArray(np_dims, arr_dtype, value) - except ValueError as e: - msg = f"Bad Request: input data doesn't match selection: {e}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - value = None - - return value - - -async def _getAttributeFromRequest(app, req_json, obj_id=None, bucket=None): - """ return attribute from given request json """ - attr_item = {} - attr_type = await _getTypeFromRequest(app, req_json, obj_id=obj_id, bucket=bucket) - attr_shape = _getShapeFromRequest(req_json) - attr_item = {"type": attr_type, "shape": attr_shape} - attr_value = _getValueFromRequest(req_json, attr_type, attr_shape) - if attr_value is not None: - if isinstance(attr_value, bytes): - attr_value = encodeData(attr_value) # store as base64 - attr_item["encoding"] = "base64" - else: - # just store the JSON dict or primitive value - attr_item["value"] = attr_value - else: - attr_item["value"] = None - - return attr_item - - -async def _getAttributesFromRequest(request, req_json, obj_id=None, bucket=None): - """ read the given JSON dictinary and return dict of attribute json """ - - app = request.app - attr_items = {} - kwargs = {"obj_id": obj_id} - if bucket: - kwargs["bucket"] = bucket - if "attributes" in req_json: - attributes = req_json["attributes"] - if not isinstance(attributes, dict): - msg = f"expected list for attributes but got: {type(attributes)}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - # read each attr_item and canonicalize the shape, type, verify value - for attr_name in attributes: - attr_json = attributes[attr_name] - attr_item = await _getAttributeFromRequest(app, attr_json, **kwargs) - attr_items[attr_name] = attr_item - - elif "type" in req_json: - # single attribute create - fake an item list - attr_item = await _getAttributeFromRequest(app, req_json, **kwargs) - if "name" in req_json: - attr_name = req_json["name"] - else: - attr_name = request.match_info.get("name") - validateAttributeName(attr_name) - if not attr_name: - msg = "Missing attribute name" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - attr_items[attr_name] = attr_item - else: - log.debug(f"_getAttributes from request - no attribute defined in {req_json}") - - return attr_items - - async def PUT_Attribute(request): """HTTP method to create a new attribute""" log.request(request) @@ -555,7 +317,7 @@ async def PUT_Attribute(request): log.debug(f"Attribute name: [{attr_name}]") validateAttributeName(attr_name) - log.info(f"PUT Attributes id: {req_obj_id} name: {attr_name}") + log.info(f"PUT Attribute id: {req_obj_id} name: {attr_name}") username, pswd = getUserPasswordFromRequest(request) # write actions need auth await validateUserPassword(app, username, pswd) @@ -587,7 +349,7 @@ async def PUT_Attribute(request): # get attribute from request body kwargs = {"bucket": bucket, "obj_id": req_obj_id} - attr_body = 
await _getAttributeFromRequest(app, body, **kwargs) + attr_body = await getAttributeFromRequest(app, body, **kwargs) # write attribute to DN attr_json = {attr_name: attr_body} @@ -596,7 +358,7 @@ async def PUT_Attribute(request): kwargs = {"bucket": bucket} if "replace" in params and params["replace"]: # allow attribute to be overwritten - log.debug("setting replace for PUT Atttribute") + log.debug("setting replace for PUT Attribute") kwargs["replace"] = True else: log.debug("replace is not set for PUT Attribute") @@ -624,7 +386,7 @@ async def PUT_Attributes(request): await validateUserPassword(app, username, pswd) if not request.has_body: - msg = "PUT Attribute with no body" + msg = "PUT Attributes with no body" log.warn(msg) raise HTTPBadRequest(reason=msg) try: @@ -654,10 +416,10 @@ async def PUT_Attributes(request): if not req_obj_id: req_obj_id = domain_json["root"] kwargs = {"obj_id": req_obj_id, "bucket": bucket} - attr_items = await _getAttributesFromRequest(request, body, **kwargs) + attr_items = await getAttributesFromRequest(app, body, **kwargs) if attr_items: - log.debug(f"PUT Attribute {len(attr_items)} attibutes to add") + log.debug(f"PUT Attribute {len(attr_items)} attributes to add") else: log.debug("no attributes defined yet") @@ -666,6 +428,7 @@ async def PUT_Attributes(request): obj_ids = {} if "obj_ids" in body: body_ids = body["obj_ids"] + if isinstance(body_ids, list): # multi cast the attributes - each attribute in attr-items # will be written to each of the objects identified by obj_id @@ -685,7 +448,7 @@ async def PUT_Attributes(request): msg += f"{len(obj_ids)} objects" log.info(msg) elif isinstance(body_ids, dict): - # each value is body_ids is a set of attriutes to write to the object + # each value is body_ids is a set of attributes to write to the object # unlike the above case, different attributes can be written to # different objects if attr_items: @@ -701,7 +464,7 @@ async def PUT_Attributes(request): id_json = body_ids[obj_id] kwargs = {"obj_id": obj_id, "bucket": bucket} - obj_items = await _getAttributesFromRequest(request, id_json, **kwargs) + obj_items = await getAttributesFromRequest(app, id_json, **kwargs) if obj_items: obj_ids[obj_id] = obj_items @@ -1056,7 +819,7 @@ async def PUT_AttributeValue(request): log.debug("PUT AttributeValue - request_type is binary") request_type = "binary" elif "application/json" in content_type: - log.debug("PUT AttribueValue - request type is json") + log.debug("PUT AttributeValue - request type is json") else: msg = f"Unknown content_type: {content_type}" log.warn(msg) @@ -1088,11 +851,7 @@ async def PUT_AttributeValue(request): msg += f"but got {len(binary_data)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - arr = np.fromstring(binary_data, dtype=np_dtype) - if attr_shape["class"] == "H5S_SCALAR": - arr = arr.reshape([]) - else: - arr = arr.reshape(np_shape) # conform to selection shape + arr = bytesToArray(binary_data, np_dtype, np_shape) log.debug(f"got array {arr} from binary data") else: try: @@ -1137,6 +896,7 @@ async def PUT_AttributeValue(request): attr_body["value"] = data.decode("ascii") attr_body["encoding"] = "base64" attr_json = {attr_name: attr_body} + log.debug(f"putting attr {attr_name} to DN: {attr_json}") kwargs = {"bucket": bucket, "replace": True} diff --git a/hsds/basenode.py b/hsds/basenode.py index f3356f34..1b1bbbd0 100644 --- a/hsds/basenode.py +++ b/hsds/basenode.py @@ -25,15 +25,17 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web_exceptions import 
HTTPServiceUnavailable + from . import config from .util.httpUtil import http_get, http_post, jsonResponse -from .util.idUtil import createNodeId, getNodeNumber, getNodeCount from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.authUtil import isAdminUser from .util.k8sClient import getDnLabelSelector, getPodIps +from .util.nodeUtil import createNodeId, getNodeNumber, getNodeCount + from . import hsds_logger as log -HSDS_VERSION = "0.9.2" +HSDS_VERSION = "1.0.0" def getVersion(): diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index 847f0933..38da4d3e 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -15,7 +15,6 @@ # import asyncio -import time import traceback import random from asyncio import CancelledError @@ -24,16 +23,19 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.client_exceptions import ClientError +from h5json.hdf5dtype import createDataType +from h5json.array_util import jsonToArray, getNumpyValue +from h5json.array_util import getNumElements, arrayToBytes, bytesToArray +from h5json.shape_util import getShapeDims +from h5json.dset_util import getChunkDims +from h5json.time_util import getNow + +from .util.nodeUtil import getDataNodeUrl, getNodeCount from .util.httpUtil import http_get, http_put, http_post, get_http_client from .util.httpUtil import isUnixDomainUrl -from .util.idUtil import getDataNodeUrl, getNodeCount -from .util.hdf5dtype import createDataType -from .util.dsetUtil import getSliceQueryParam, getShapeDims -from .util.dsetUtil import getSelectionShape, getChunkLayout +from .util.dsetUtil import getSliceQueryParam, getSelectionShape from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getChunkIdForPartition, getQueryDtype -from .util.arrayUtil import jsonToArray, getNumpyValue -from .util.arrayUtil import getNumElements, arrayToBytes, bytesToArray from . import config from . 
import hsds_logger as log @@ -83,10 +85,9 @@ async def write_chunk_hyperslab( msg = f"write_chunk_hyperslab, chunk_id: {chunk_id}, slices: {slices}, " msg += f"bucket: {bucket}" + msg += f" dset_json: {dset_json}" log.info(msg) - if "layout" not in dset_json: - log.error(f"No layout found in dset_json: {dset_json}") - raise HTTPInternalServerError() + partition_chunk_id = getChunkIdForPartition(chunk_id, dset_json) if partition_chunk_id != chunk_id: log.debug(f"using partition_chunk_id: {partition_chunk_id}") @@ -106,7 +107,7 @@ async def write_chunk_hyperslab( log.debug(f"setting fields_param to: {fields_param}") params["fields"] = fields_param - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) log.debug(f"getChunkCoverage({chunk_id}, {slices}, {layout})") chunk_sel = getChunkCoverage(chunk_id, slices, layout) if chunk_sel is None: @@ -675,6 +676,9 @@ def __init__( app["cc_clients"] = {} self._clients = app["cc_clients"] + def now(self): + return getNow(app=self._app) + def get_status(self): if len(self._status_map) != len(self._chunk_ids): msg = "get_status code while crawler not complete" @@ -719,7 +723,7 @@ async def work(self): log.info(f"ChunkCrawler - client_name: {client_name}") while True: try: - start = time.time() + start = self.now() chunk_id = await self._q.get() if self._limit > 0 and self._hits >= self._limit: msg = f"ChunkCrawler - maxhits exceeded, skipping fetch for chunk: {chunk_id}" @@ -744,7 +748,7 @@ async def work(self): await self.do_work(chunk_id, client=client) self._q.task_done() - elapsed = time.time() - start + elapsed = self.now() - start msg = f"ChunkCrawler - task {chunk_id} start: {start:.3f} " msg += f"elapsed: {elapsed:.3f}" log.debug(msg) diff --git a/hsds/chunk_dn.py b/hsds/chunk_dn.py index e2671b61..839da5ac 100644 --- a/hsds/chunk_dn.py +++ b/hsds/chunk_dn.py @@ -20,18 +20,21 @@ from aiohttp.web_exceptions import HTTPNotFound, HTTPServiceUnavailable from aiohttp.web import json_response, StreamResponse +from h5json.hdf5dtype import createDataType, getSubType +from h5json.array_util import bytesToArray, arrayToBytes, getBroadcastShape +from h5json.objid import getS3Key, isValidUuid +from h5json.shape_util import getShapeDims +from h5json.dset_util import getChunkDims + from .util.httpUtil import request_read, getContentType -from .util.arrayUtil import bytesToArray, arrayToBytes, getBroadcastShape -from .util.idUtil import getS3Key, validateInPartition, isValidUuid from .util.storUtil import isStorObj, deleteStorObj -from .util.hdf5dtype import createDataType, getSubType -from .util.dsetUtil import getSelectionList, getChunkLayout, getShapeDims -from .util.dsetUtil import getSelectionShape, getChunkInitializer +from .util.dsetUtil import getSelectionList, getSelectionShape, getChunkInitializer from .util.chunkUtil import getChunkIndex, getDatasetId, chunkQuery from .util.chunkUtil import chunkWriteSelection, chunkReadSelection from .util.chunkUtil import chunkWritePoints, chunkReadPoints from .util.domainUtil import isValidBucketName from .util.boolparser import BooleanParser +from .util.nodeUtil import validateInPartition from .datanode_lib import get_metadata_obj, get_chunk, save_chunk from . 
import hsds_logger as log @@ -63,7 +66,7 @@ async def PUT_Chunk(request): log.error(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(chunk_id, "Chunk"): + if not isValidUuid(chunk_id, obj_class="chunks"): msg = f"Invalid chunk id: {chunk_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -129,7 +132,7 @@ async def PUT_Chunk(request): dset_json = await get_metadata_obj(app, dset_id, bucket=bucket) # TBD - does this work with linked datasets? - dims = getChunkLayout(dset_json) + dims = getChunkDims(dset_json) rank = len(dims) type_json = dset_json["type"] @@ -337,7 +340,7 @@ async def GET_Chunk(request): msg = "Missing chunk id" log.error(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(chunk_id, "Chunk"): + if not isValidUuid(chunk_id, obj_class="chunks"): msg = f"Invalid chunk id: {chunk_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -433,7 +436,7 @@ async def GET_Chunk(request): dset_json = await get_metadata_obj(app, dset_id, bucket=bucket) shape_dims = getShapeDims(dset_json["shape"]) log.debug(f"shape_dims: {shape_dims}") - dims = getChunkLayout(dset_json) + dims = getChunkDims(dset_json) log.debug(f"GET_Chunk - got dims: {dims}") # get chunk selection from query params @@ -653,7 +656,7 @@ async def POST_Chunk(request): chunk_index = getChunkIndex(chunk_id) log.debug(f"chunk_index: {chunk_index}") - if not isValidUuid(chunk_id, "Chunk"): + if not isValidUuid(chunk_id, obj_class="chunks"): msg = f"Invalid chunk id: {chunk_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -680,7 +683,7 @@ async def POST_Chunk(request): dset_json = await get_metadata_obj(app, dset_id, bucket=bucket) log.debug(f"get_metadata_obj for {dset_id} returned {dset_json}") - dims = getChunkLayout(dset_json) + dims = getChunkDims(dset_json) rank = len(dims) type_json = dset_json["type"] @@ -840,7 +843,7 @@ async def DELETE_Chunk(request): raise HTTPBadRequest(reason=msg) log.info(f"DELETE chunk: {chunk_id}") - if not isValidUuid(chunk_id, "Chunk"): + if not isValidUuid(chunk_id, obj_class="chunks"): msg = f"Invalid chunk id: {chunk_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 68575007..1cca80f1 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -19,29 +19,26 @@ import numpy as np from json import JSONDecodeError -from asyncio import IncompleteReadError from aiohttp.web_exceptions import HTTPException, HTTPBadRequest from aiohttp.web_exceptions import HTTPRequestEntityTooLarge from aiohttp.web_exceptions import HTTPConflict, HTTPInternalServerError from aiohttp.web import StreamResponse +from h5json.hdf5dtype import getItemSize, getDtypeItemSize, getSubType, createDataType +from h5json.array_util import bytesArrayToList, jsonToArray, getNumElements, arrayToBytes +from h5json.array_util import bytesToArray, squeezeArray, getBroadcastShape +from h5json.objid import isValidUuid +from h5json.shape_util import isNullSpace, isScalar, getShapeDims, getMaxDims +from h5json.dset_util import getChunkDims, isExtensible + from .util.httpUtil import getHref, getAcceptType, getContentType from .util.httpUtil import request_read, jsonResponse, isAWSLambda -from .util.idUtil import isValidUuid from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain -from .util.hdf5dtype import getItemSize, getDtypeItemSize, getSubType, createDataType -from .util.dsetUtil import isNullSpace, isScalarSpace, get_slices, getShapeDims -from .util.dsetUtil import isExtensible, 
getSelectionPagination -from .util.dsetUtil import getSelectionShape, getDsetMaxDims, getChunkLayout -from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId -from .util.arrayUtil import bytesArrayToList, jsonToArray -from .util.arrayUtil import getNumElements, arrayToBytes, bytesToArray -from .util.arrayUtil import squeezeArray, getBroadcastShape +from .util.dsetUtil import getSelectionShape, getSelectionPagination, get_slices from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .servicenode_lib import getDsetJson, validateAction -from .dset_lib import getSelectionData, getParser, extendShape -from .chunk_crawl import ChunkCrawler +from .dset_lib import getSelectionData, getParser, extendShape, doPointWrite, doHyperslabWrite from . import config from . import hsds_logger as log @@ -166,8 +163,8 @@ def _getAppendRows(params, dset_json, body=None): datashape = dset_json["shape"] dims = getShapeDims(datashape) rank = len(dims) - maxdims = getDsetMaxDims(dset_json) - if not isExtensible(dims, maxdims): + maxdims = getMaxDims(datashape) + if not isExtensible(datashape): msg = "Dataset shape must be extensible for packet updates" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -204,7 +201,7 @@ def _getSelect(params, dset_json, body=None): """ return selection region if any as a list of slices. """ slices = None - log.debug(f"_getSelect params: {params} body: {body}") + log.debug(f"_getSelect params: {dict(params)} body: {body}") try: if body and isinstance(body, dict): if "select" in body and body["select"]: @@ -217,6 +214,7 @@ def _getSelect(params, dset_json, body=None): if slices: msg = "select defined in both request body and query parameters" raise ValueError(msg) + log.debug(f"_getSelect - select param: {select}") slices = get_slices(select, dset_json) except ValueError as ve: log.warn(f"Invalid selection: {ve}") @@ -229,12 +227,17 @@ def _getSelect(params, dset_json, body=None): if not slices: # just return the entire dataspace + log.debug("_getSelect - no selection, using entire dataspace") datashape = dset_json["shape"] dims = getShapeDims(datashape) slices = [] - for dim in dims: - s = slice(0, dim, 1) - slices.append(s) + if dims: + for dim in dims: + s = slice(0, dim, 1) + slices.append(s) + else: + # scalar dataset + slices.append(slice(0, 1, 1)) log.debug(f"_getSelect returning: {slices}") return slices @@ -464,188 +467,6 @@ async def arrayResponse(arr, request, dset_json): return resp -async def _doPointWrite(app, - request, - points=None, - data=None, - dset_json=None, - bucket=None - ): - """ write the given points to the dataset """ - - num_points = len(points) - log.debug(f"doPointWrite - num_points: {num_points}") - dset_id = dset_json["id"] - layout = getChunkLayout(dset_json) - datashape = dset_json["shape"] - dims = getShapeDims(datashape) - rank = len(dims) - - chunk_dict = {} # chunk ids to list of points in chunk - - for pt_indx in range(num_points): - if rank == 1: - point = int(points[pt_indx]) - else: - point_tuple = points[pt_indx] - point = [] - for i in range(len(point_tuple)): - point.append(int(point_tuple[i])) - if rank == 1: - if point < 0 or point >= dims[0]: - msg = f"PUT Value point: {point} is not within the " - msg += "bounds of the dataset" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - if len(point) != rank: - msg = "PUT Value point value did not match dataset rank" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - for i in range(rank): - if point[i] < 0 or point[i] >= dims[i]: - msg = f"PUT 
Value point: {point} is not within the " - msg += "bounds of the dataset" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - chunk_id = getChunkId(dset_id, point, layout) - # get the pt_indx element from the input data - value = data[pt_indx] - if chunk_id not in chunk_dict: - point_list = [point, ] - point_data = [value, ] - chunk_dict[chunk_id] = {"indices": point_list, "points": point_data} - else: - item = chunk_dict[chunk_id] - point_list = item["indices"] - point_list.append(point) - point_data = item["points"] - point_data.append(value) - - num_chunks = len(chunk_dict) - log.debug(f"num_chunks: {num_chunks}") - max_chunks = int(config.get("max_chunks_per_request", default=1000)) - if num_chunks > max_chunks: - msg = f"PUT value request with more than {max_chunks} chunks" - log.warn(msg) - - chunk_ids = list(chunk_dict.keys()) - chunk_ids.sort() - - crawler = ChunkCrawler( - app, - chunk_ids, - dset_json=dset_json, - bucket=bucket, - points=chunk_dict, - action="write_point_sel", - ) - await crawler.crawl() - - crawler_status = crawler.get_status() - - if crawler_status not in (200, 201): - msg = f"doPointWritte raising HTTPInternalServerError for status: {crawler_status}" - log.error(msg) - raise HTTPInternalServerError() - else: - log.info("doPointWrite success") - - -async def _doHyperslabWrite(app, - request, - page_number=0, - page=None, - data=None, - dset_json=None, - select_dtype=None, - bucket=None - ): - """ write the given page selection to the dataset """ - dset_id = dset_json["id"] - log.info(f"_doHyperslabWrite on {dset_id} - page: {page_number}") - type_json = dset_json["type"] - - if select_dtype is not None: - item_size = getDtypeItemSize(select_dtype) - else: - item_size = getItemSize(type_json) - if item_size == "H5T_VARIABLE" and data is None: - msg = "unexpected call to _doHyperslabWrite for variable length data" - log.error(msg) - raise HTTPInternalServerError() - - layout = getChunkLayout(dset_json) - - num_chunks = getNumChunks(page, layout) - log.debug(f"num_chunks: {num_chunks}") - max_chunks = int(config.get("max_chunks_per_request", default=1000)) - if num_chunks > max_chunks: - msg = f"PUT value chunk count: {num_chunks} exceeds max_chunks: {max_chunks}" - log.warn(msg) - select_shape = getSelectionShape(page) - log.debug(f"got select_shape: {select_shape} for page: {page_number}") - - if data is None: - num_bytes = math.prod(select_shape) * item_size - log.debug(f"reading {num_bytes} from request stream") - # read page of data from input stream - try: - page_bytes = await request_read(request, count=num_bytes) - except HTTPRequestEntityTooLarge as tle: - msg = "Got HTTPRequestEntityTooLarge exception during " - msg += f"binary read: {tle}) for page: {page_number}" - log.warn(msg) - raise # re-throw - except IncompleteReadError as ire: - msg = "Got asyncio.IncompleteReadError during binary " - msg += f"read: {ire} for page: {page_number}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - log.debug(f"read {len(page_bytes)} for page: {page_number}") - try: - arr = bytesToArray(page_bytes, select_dtype, select_shape) - except ValueError as ve: - msg = f"bytesToArray value error for page: {page_number}: {ve}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - arr = data # use array provided to function - - try: - chunk_ids = getChunkIds(dset_id, page, layout) - except ValueError: - log.warn("getChunkIds failed") - raise HTTPInternalServerError() - if len(chunk_ids) < 10: - log.debug(f"chunk_ids: {chunk_ids}") - else: - log.debug(f"chunk_ids: 
{chunk_ids[:10]} ...") - if len(chunk_ids) > max_chunks: - msg = f"got {len(chunk_ids)} for page: {page_number}. max_chunks: {max_chunks}" - log.warn(msg) - - crawler = ChunkCrawler( - app, - chunk_ids, - dset_json=dset_json, - bucket=bucket, - slices=page, - arr=arr, - action="write_chunk_hyperslab", - ) - await crawler.crawl() - - crawler_status = crawler.get_status() - - if crawler_status not in (200, 201): - msg = f"crawler failed for page: {page_number} with status: {crawler_status}" - log.error(msg) - raise HTTPInternalServerError() - else: - log.info("crawler write_chunk_hyperslab successful") - - async def PUT_Value(request): """ Handler for PUT //value request @@ -682,7 +503,7 @@ async def PUT_Value(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -700,6 +521,7 @@ async def PUT_Value(request): # get state for dataset from DN - will need this to validate # some of the query parameters dset_json = await getDsetJson(app, dset_id, bucket=bucket) + log.debug(f"got dset_json: {dset_json}") datashape = dset_json["shape"] if isNullSpace(dset_json): @@ -841,7 +663,7 @@ async def PUT_Value(request): log.warn(f"bytesToArray value error: {ve}") raise HTTPBadRequest() else: - # fixed item size + # fixed item size - check against number of bytes if len(input_data) % item_size != 0: msg = f"Expected request size to be a multiple of {item_size}, " msg += f"but {len(input_data)} bytes received" @@ -852,8 +674,7 @@ async def PUT_Value(request): msg = f"expected {item_size * num_elements} bytes but got {len(input_data)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - - arr = np.fromstring(input_data, dtype=dset_dtype) + arr = np.frombuffer(input_data, dtype=dset_dtype) log.debug(f"read fixed type array: {arr}") if bc_shape: @@ -940,13 +761,13 @@ async def PUT_Value(request): else: kwargs["data"] = None # do write for one page selection - await _doHyperslabWrite(app, request, **kwargs) + await doHyperslabWrite(app, request, **kwargs) else: # # Do point put # kwargs = {"points": points, "data": arr, "dset_json": dset_json, "bucket": bucket} - await _doPointWrite(app, request, **kwargs) + await doPointWrite(app, request, **kwargs) # write successful @@ -968,7 +789,7 @@ async def GET_Value(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -1004,7 +825,7 @@ async def GET_Value(request): log.debug(f"dset shape: {dims}") rank = len(dims) - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) log.debug(f"chunk layout: {layout}") await validateAction(app, domain, dset_id, username, "read") @@ -1089,7 +910,6 @@ async def GET_Value(request): arr = None # will be set based on returned data if stream_pagination: - # example # get binary data a page at a time and write back to response if item_size == "H5T_VARIABLE": page_item_size = VARIABLE_AVG_ITEM_SIZE # random guess of avg item_size @@ -1247,7 +1067,7 @@ async def POST_Value(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise 
HTTPBadRequest(reason=msg) @@ -1289,7 +1109,7 @@ async def POST_Value(request): msg = "POST value not supported for datasets with NULL shape" log.warn(msg) raise HTTPBadRequest(reason=msg) - if isScalarSpace(dset_json): + if isScalar(dset_json): msg = "POST value not supported for datasets with SCALAR shape" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -1351,7 +1171,7 @@ async def POST_Value(request): log.warn(msg) raise HTTPBadRequest(reason=msg) num_points = request.content_length // point_dt.itemsize - points = np.fromstring(binary_data, dtype=point_dt) + points = np.frombuffer(binary_data, dtype=point_dt) # reshape the data based on the rank (num_points x rank) if rank > 1: if len(points) % rank != 0: diff --git a/hsds/chunklocator.py b/hsds/chunklocator.py index 6727de9e..f2cd93d7 100644 --- a/hsds/chunklocator.py +++ b/hsds/chunklocator.py @@ -1,11 +1,13 @@ import sys -import time import h5py import s3fs import numpy as np from . import config from . import hsds_logger as log -from .util.arrayUtil import bytesArrayToList, getNumElements + +from h5json.time_util import getNow +from h5json.array_util import bytesArrayToList, getNumElements + from .util.dsetUtil import getSelectionList, getSelectionShape @@ -189,7 +191,7 @@ def main(): prefix = config.get("log_prefix") log_timestamps = config.get("log_timestamps", default=False) log.setLogConfig(log_level, prefix=prefix, timestamps=log_timestamps) - start_time = time.time() + start_time = getNow() log.info(f"chunklocator start: {start_time:.2f}") cmd_options = get_cmd_options() @@ -229,6 +231,6 @@ def main(): sys.exit(1) log.info('done') - stop_time = time.time() + stop_time = getNow() log.info(f"chunklocator stop: {stop_time:.2f}") log.info(f"chunklocator elapsed: {(stop_time - start_time):.2f}") diff --git a/hsds/ctype_dn.py b/hsds/ctype_dn.py index f06b98b3..465d8916 100755 --- a/hsds/ctype_dn.py +++ b/hsds/ctype_dn.py @@ -18,11 +18,13 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response -from .util.idUtil import isValidUuid, validateUuid +from h5json.objid import isValidUuid, validateUuid +from h5json.time_util import getNow + from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj from .datanode_lib import delete_metadata_obj, check_metadata_obj from .util.domainUtil import isValidBucketName -from .util.timeUtil import getNow + from . 
import hsds_logger as log @@ -33,7 +35,7 @@ async def GET_Datatype(request): params = request.rel_url.query ctype_id = get_obj_id(request) - if not isValidUuid(ctype_id, obj_class="type"): + if not isValidUuid(ctype_id, obj_class="datatypes"): log.error(f"Unexpected type_id: {ctype_id}") raise HTTPInternalServerError() @@ -90,15 +92,21 @@ async def POST_Datatype(request): raise HTTPBadRequest(reason=msg) ctype_id = get_obj_id(request, body=body) - if not isValidUuid(ctype_id, obj_class="datatype"): + if not isValidUuid(ctype_id, obj_class="datatypes"): log.error("Unexpected type_id: {ctype_id}") raise HTTPInternalServerError() + deleted_ids = app["deleted_ids"] + if ctype_id in deleted_ids: + log.warn(f"POST Dataset has id: {ctype_id} that has previously been deleted") + deleted_ids.remove(ctype_id) + # verify the id doesn't already exist obj_found = await check_metadata_obj(app, ctype_id, bucket=bucket) if obj_found: - log.error(f"Post with existing type_id: {ctype_id}") - raise HTTPInternalServerError() + msg = f"Post with existing type_id: {ctype_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) root_id = None @@ -120,10 +128,17 @@ async def POST_Datatype(request): raise HTTPInternalServerError() type_json = body["type"] + if "attributes" in body: + # initialize attributes + attrs = body["attributes"] + log.debug(f"POST datatype with attributes: {attrs}") + else: + attrs = {} + # ok - all set, create committed type obj now = getNow(app) - log.info(f"POST_datatype, typejson: {type_json}") + log.info(f"POST_datatype, type_json: {type_json}") ctype_json = { "id": ctype_id, @@ -131,7 +146,7 @@ async def POST_Datatype(request): "created": now, "lastModified": now, "type": type_json, - "attributes": {}, + "attributes": attrs, } kwargs = {"bucket": bucket, "notify": True, "flush": True} @@ -143,7 +158,7 @@ async def POST_Datatype(request): resp_json["created"] = ctype_json["created"] resp_json["lastModified"] = ctype_json["lastModified"] resp_json["type"] = type_json - resp_json["attributeCount"] = 0 + resp_json["attributeCount"] = len(attrs) resp = json_response(resp_json, status=201) log.response(request, resp=resp) diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index 84cdd17f..dfa96f98 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -16,18 +16,20 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPGone from json import JSONDecodeError + +from h5json.objid import isValidUuid + from .util.httpUtil import getHref, respJsonAssemble, getBooleanParam from .util.httpUtil import jsonResponse -from .util.idUtil import isValidUuid -from .util.linkUtil import validateLinkName from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot -from .util.hdf5dtype import validateTypeItem, getBaseTypeJson from .servicenode_lib import getDomainJson, getObjectJson, validateAction -from .servicenode_lib import getObjectIdByPath, getPathForObjectId -from .servicenode_lib import createObject, createObjectByPath, deleteObject +from .servicenode_lib import getObjectIdByPath, getPathForObjectId, deleteObject +from .servicenode_lib import getCreateArgs, createDatatypeObj +from .post_crawl import createDatatypeObjs +from .domain_crawl import DomainCrawler from . 
import hsds_logger as log @@ -49,7 +51,7 @@ async def GET_Datatype(request): include_attrs = True if ctype_id: - if not isValidUuid(ctype_id, "Type"): + if not isValidUuid(ctype_id, obj_class="datatypes"): msg = f"Invalid type id: {ctype_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -60,7 +62,7 @@ async def GET_Datatype(request): group_id = None if "grpid" in params: group_id = params["grpid"] - if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid parent group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -101,7 +103,7 @@ async def GET_Datatype(request): # throws 404 if not found kwargs = {"bucket": bucket, "domain": domain} ctype_id, domain, _ = await getObjectIdByPath(app, group_id, h5path, **kwargs) - if not isValidUuid(ctype_id, "Datatype"): + if not isValidUuid(ctype_id, obj_class="datatypes"): msg = f"No datatype exist with the path: {h5path}" log.warn(msg) raise HTTPGone() @@ -153,7 +155,7 @@ async def POST_Datatype(request): await validateUserPassword(app, username, pswd) if not request.has_body: - msg = "POST Datatype with no body" + msg = "POST datatype with no body" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -163,35 +165,6 @@ async def POST_Datatype(request): msg = "Unable to load JSON body" log.warn(msg) raise HTTPBadRequest(reason=msg) - if "type" not in body: - msg = "POST Datatype has no type key in body" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - datatype = body["type"] - if isinstance(datatype, str): - try: - # convert predefined type string (e.g. "H5T_STD_I32LE") to - # corresponding json representation - datatype = getBaseTypeJson(datatype) - log.debug(f"got datatype: {datatype}") - except TypeError: - msg = "POST Dataset with invalid predefined type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - try: - validateTypeItem(datatype) - except KeyError as ke: - msg = f"KeyError creating type: {ke}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except TypeError as te: - msg = f"TypeError creating type: {te}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except ValueError as ve: - msg = f"ValueError creating type: {ve}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) domain = getDomainFromRequest(request) if not isValidDomain(domain): @@ -207,56 +180,92 @@ async def POST_Datatype(request): verifyRoot(domain_json) root_id = domain_json["root"] - parent_id = None - link_title = None - h5path = None - if "link" in body: - if "h5path" in body: - msg = "link can't be used with h5path" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - link_body = body["link"] - if "id" in link_body: - parent_id = link_body["id"] - if "name" in link_body: - link_title = link_body["name"] - try: - # will throw exception if there's a slash in the name - validateLinkName(link_title) - except ValueError: - msg = f"invalid link title: {link_title}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + # allow parent group creation or not + implicit = getBooleanParam(params, "implicit") - if parent_id and link_title: - log.debug(f"parent id: {parent_id}, link_title: {link_title}") - h5path = link_title # just use the link name as the h5path + post_rsp = None - if "h5path" in body: - h5path = body["h5path"] - if "parent_id" not in body: - parent_id = root_id + if isinstance(body, list): + count = len(body) + log.debug(f"multiple ctype create: {count} items") + if count == 0: + # equivalent to no body + msg = "POST Datatype with no body" + log.warn(msg) + raise 
HTTPBadRequest(reason=msg) + elif count == 1: + # just create one object in typical way + kwargs = getCreateArgs(body[0], + root_id=root_id, + bucket=bucket, + implicit=implicit) else: - parent_id = body["parent_id"] - - # setup args to createObject - kwargs = {"bucket": bucket, "obj_type": datatype} - # TBD: creation props for datatype obj? - if parent_id: - kwargs["parent_id"] = parent_id - kwargs["h5path"] = h5path - # allow parent group creation or not - implicit = getBooleanParam(params, "implicit") - if implicit: - kwargs["implicit"] = True - ctype_json = await createObjectByPath(app, **kwargs) + # create multiple ctype objects + kwarg_list = [] # list of kwargs for each object + + for item in body: + log.debug(f"item: {item}") + if not isinstance(item, dict): + msg = f"Post_Datatype - invalid item type: {type(item)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs = getCreateArgs(item, root_id=root_id, bucket=bucket) + kwargs["ignore_link"] = True # will create parent links later + kwarg_list.append(kwargs) + kwargs = {"bucket": bucket, "root_id": root_id} + log.debug(f"createDatatypeObjects, items: {kwarg_list}") + post_rsp = await createDatatypeObjs(app, kwarg_list, **kwargs) else: - # create an anonymous datatype - kwargs["root_id"] = root_id - ctype_json = await createObject(app, **kwargs) + # single object create + kwargs = getCreateArgs(body, root_id=root_id, bucket=bucket, implicit=implicit) + log.debug(f"kwargs for datatype create: {kwargs}") + + if post_rsp is None: + # Handle cases other than multi ctype create here + post_rsp = await createDatatypeObj(app, **kwargs) + + log.debug(f"returning resp: {post_rsp}") + + if "objects" in post_rsp: + # add any links in multi request + objects = post_rsp["objects"] + obj_count = len(objects) + log.debug(f"Post datatype multi create: {obj_count} objects") + if len(body) != obj_count: + msg = f"Expected {obj_count} objects but got {len(body)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + parent_ids = {} + for index in range(obj_count): + item = body[index] + if "link" in item: + link_item = item["link"] + parent_id = link_item.get("id") + title = link_item.get("name") + if parent_id and title: + # add a hard link + object = objects[index] + obj_id = object["id"] + if parent_id not in parent_ids: + parent_ids[parent_id] = {} + links = parent_ids[parent_id] + links[title] = {"id": obj_id} + if parent_ids: + log.debug(f"POST datatype multi - adding links: {parent_ids}") + kwargs = {"action": "put_link", "bucket": bucket} + kwargs["replace"] = True + + crawler = DomainCrawler(app, parent_ids, **kwargs) + + # will raise exception on not found, server busy, etc. 
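For reference, a minimal sketch of the list-style request body this multi-create branch is written to accept (an assumed payload shape, inferred from the per-item "type"/"link" handling here and below rather than taken verbatim from the patch):

# hypothetical POST body for creating several committed types in one request;
# each item's "link" supplies the parent group id and link name that the
# parent_ids/put_link crawl below turns into hard links
body = [
    {"type": "H5T_IEEE_F32LE", "link": {"id": root_id, "name": "ctype_a"}},
    {"type": "H5T_STD_I64LE", "link": {"id": root_id, "name": "ctype_b"}},
]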
+ await crawler.crawl() + + status = crawler.get_status() + + log.info(f"DomainCrawler done for put_links action, status: {status}") # datatype creation successful - resp = await jsonResponse(request, ctype_json, status=201) + resp = await jsonResponse(request, post_rsp, status=201) log.response(request, resp=resp) return resp @@ -271,7 +280,7 @@ async def DELETE_Datatype(request): msg = "Missing committed type id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(ctype_id, "Type"): + if not isValidUuid(ctype_id, obj_class="datatypes"): msg = f"Invalid committed type id: {ctype_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/datanode.py b/hsds/datanode.py index b7c00b9d..cef44bd0 100644 --- a/hsds/datanode.py +++ b/hsds/datanode.py @@ -17,10 +17,11 @@ import traceback from aiohttp.web import run_app +from h5json.objid import isValidUuid, isSchema2Id, getCollectionForId +from h5json.objid import isRootObjId + from . import config from .util.lruCache import LruCache -from .util.idUtil import isValidUuid, isSchema2Id, getCollectionForId -from .util.idUtil import isRootObjId from .util.httpUtil import isUnixDomainUrl, bindToSocket, getPortFromUrl from .util.httpUtil import jsonResponse, release_http_client from .util.storUtil import setBloscThreads, getBloscThreads diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py index 08ecc52a..d2b1840c 100644 --- a/hsds/datanode_lib.py +++ b/hsds/datanode_lib.py @@ -19,22 +19,29 @@ from aiohttp.web_exceptions import HTTPGone, HTTPInternalServerError from aiohttp.web_exceptions import HTTPNotFound, HTTPForbidden from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPBadRequest -from .util.idUtil import validateInPartition, getS3Key, isValidUuid -from .util.idUtil import isValidChunkId, getDataNodeUrl, isSchema2Id -from .util.idUtil import getRootObjId, isRootObjId + +from h5json.hdf5dtype import createDataType, isVlen +from h5json.array_util import arrayToBytes, bytesToArray, jsonToArray +from h5json.filters import getFilters, getCompressionFilter, getShuffleFilter +from h5json.objid import getS3Key, isValidUuid +from h5json.objid import isValidChunkId, isSchema2Id +from h5json.objid import getRootObjId, isRootObjId +from h5json.shape_util import getShapeDims +from h5json.dset_util import getChunkDims, getDatasetLayoutClass +from h5json.time_util import getNow + +from .util.nodeUtil import getDataNodeUrl from .util.storUtil import getStorJSONObj, putStorJSONObj, putStorBytes from .util.storUtil import getStorBytes, isStorObj, deleteStorObj, getHyperChunks from .util.storUtil import getBucketFromStorURI, getKeyFromStorURI, getURIFromKey from .util.domainUtil import isValidDomain, getBucketForDomain from .util.attrUtil import getRequestCollectionName from .util.httpUtil import http_post -from .util.dsetUtil import getChunkLayout, getFilterOps, getShapeDims -from .util.dsetUtil import getChunkInitializer, getSliceQueryParam, getFilters +from .util.dsetUtil import getChunkInitializer, getSliceQueryParam from .util.chunkUtil import getDatasetId, getChunkSelection, getChunkIndex -from .util.arrayUtil import arrayToBytes, bytesToArray, jsonToArray -from .util.hdf5dtype import createDataType +from .util.nodeUtil import validateInPartition from .util.rangegetUtil import ChunkLocation, chunkMunge, getHyperChunkIndex, getHyperChunkFactors -from .util.timeUtil import getNow + from . import config from . 
import hsds_logger as log from .dset_lib import getFillValue @@ -554,6 +561,54 @@ async def delete_metadata_obj(app, obj_id, notify=True, root_id=None, bucket=Non log.debug(f"delete_metadata_obj for {obj_id} done") +def getFilterOps(app, dset_id, filters, dtype=None, chunk_shape=None): + """Get list of filter operations to be used for this dataset""" + filter_map = app["filter_map"] + + if dset_id in filter_map: + return filter_map[dset_id] + + compressionFilter = getCompressionFilter(filters) + + filter_ops = {} + + shuffleFilter = getShuffleFilter(filters) + + if shuffleFilter and not isVlen(dtype): + shuffle_name = shuffleFilter["name"] + if shuffle_name == "shuffle": + filter_ops["shuffle"] = 1 # use regular shuffle + elif shuffle_name == "bitshuffle": + filter_ops["shuffle"] = 2 # use bitshuffle + else: + filter_ops["shuffle"] = 0 # no shuffle + else: + filter_ops["shuffle"] = 0 # no shuffle + + if compressionFilter: + if compressionFilter["class"] == "H5Z_FILTER_DEFLATE": + filter_ops["compressor"] = "zlib" # blosc compressor + else: + if "name" in compressionFilter: + filter_ops["compressor"] = compressionFilter["name"] + else: + filter_ops["compressor"] = "lz4" # default to lz4 + if "level" not in compressionFilter: + filter_ops["level"] = 5 # medium level + else: + filter_ops["level"] = int(compressionFilter["level"]) + + if filter_ops: + # save the chunk shape and dtype + filter_ops["chunk_shape"] = chunk_shape + filter_ops["dtype"] = dtype + filter_map[dset_id] = filter_ops # save + + return filter_ops + else: + return None + + def arange_chunk_init( app, initializer, @@ -584,9 +639,8 @@ def arange_chunk_init( log.warn(msg) raise None - try: - chunk_layout = getChunkLayout(dset_json) - except HTTPInternalServerError: + chunk_layout = getChunkDims(dset_json) + if chunk_layout is None: msg = "non-chunked dataset" log.warning(msg) raise None @@ -710,7 +764,7 @@ async def run_chunk_initializer( dims = getShapeDims(datashape) log.debug(f"dataset shape: {dims}") # get the chunk layout for this dataset - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) log.debug(f"chunk layout: {layout}") rank = len(dims) @@ -1004,12 +1058,11 @@ async def get_chunk( log.debug(msg) chunk_arr = None - dims = getChunkLayout(dset_json) + dims = getChunkDims(dset_json) type_json = dset_json["type"] dt = createDataType(type_json) - layout_json = dset_json["layout"] - layout_class = layout_json.get("class") - chunk_dims = getChunkLayout(dset_json) + layout_class = getDatasetLayoutClass(dset_json) + chunk_dims = getChunkDims(dset_json) fill_value = getFillValue(dset_json) # note - officially we should follow the order in which the filters are @@ -1041,7 +1094,7 @@ async def get_chunk( log.debug(msg) else: s3key = getS3Key(chunk_id) - log.debug(f"getChunk chunkid: {chunk_id} bucket: {bucket}") + log.debug(f"getChunk chunkid: {chunk_id} bucket: {bucket} using key: {s3key}") if chunk_id in chunk_cache: log.debug(f"getChunk chunkid: {chunk_id} found in cache") chunk_arr = chunk_cache[chunk_id] @@ -1163,7 +1216,7 @@ def save_chunk(app, chunk_id, dset_json, chunk_arr, bucket=None): dset_id = dset_json["id"] dtype = createDataType(dset_json["type"]) - chunk_shape = getChunkLayout(dset_json) + chunk_shape = getChunkDims(dset_json) # will store filter options into app['filter_map'] filters = getFilters(dset_json) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index b8e0ba39..d9285d45 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -18,8 +18,11 @@ from 
aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPConflict, HTTPBadRequest from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone -from .util.httpUtil import isOK -from .util.idUtil import getCollectionForId, getDataNodeUrl +from h5json.objid import getCollectionForId +from h5json.array_util import arrayToBytes + +from .util.nodeUtil import getDataNodeUrl +from .util.httpUtil import isOK, http_put from .util.globparser import globmatch from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks, putLinks from . import hsds_logger as log @@ -231,7 +234,7 @@ async def put_attributes(self, obj_id, attr_items): try: status = await putAttributes(self._app, obj_id, attr_items, **kwargs) except HTTPConflict: - log.warn("DomainCrawler - got HTTPConflict from http_put") + log.warn("DomainCrawler - got HTTPConflict from putAttributers") status = 409 except HTTPServiceUnavailable: status = 503 @@ -246,7 +249,7 @@ async def put_attributes(self, obj_id, attr_items): async def get_obj_json(self, obj_id): """ get the given obj_json for the obj_id. for each group found, search the links if follow_links is set """ - log.debug(f"get_obj_json: {obj_id}") + log.debug(f"DomainCrawler get_obj_json: {obj_id}") collection = getCollectionForId(obj_id) kwargs = {"bucket": self._bucket, "include_attrs": self._include_attrs} @@ -406,7 +409,7 @@ async def get_links(self, grp_id, titles=None): async def put_links(self, grp_id, link_items): # write the given links for the obj_id - log.debug(f"put_links for {grp_id}, {len(link_items)} links") + log.debug(f"DomainCrawler put_links for {grp_id}, {len(link_items)} links") req = getDataNodeUrl(self._app, grp_id) req += f"/groups/{grp_id}/links" kwargs = {"bucket": self._bucket} @@ -417,8 +420,10 @@ async def put_links(self, grp_id, link_items): log.warn("DomainCrawler - got HTTPConflict from http_put") status = 409 except HTTPServiceUnavailable: + log.warn("DomainCrawler - got HTTPServiceUnavailable exception") status = 503 except HTTPInternalServerError: + log.warn("DomainCrawler - got 500 error from DN") status = 500 except Exception as e: log.error(f"unexpected exception {e}") @@ -426,6 +431,38 @@ async def put_links(self, grp_id, link_items): log.debug(f"DomainCrawler fetch for {grp_id} - returning status: {status}") self._obj_dict[grp_id] = {"status": status} + async def put_data(self, chunk_id, arr): + # write a one-chunk dataset value + log.debug(f"DomainCrawler put_data for {chunk_id}, arr.shape: {arr.shape}") + req = getDataNodeUrl(self._app, chunk_id) + req += "/chunks/" + chunk_id + params = {"bucket": self._bucket} + + data = arrayToBytes(arr) + + log.debug(f"DomainCrawler - put_data req: {req}, {len(data)} bytes") + try: + rsp = await http_put(self._app, req, data=data, params=params) + log.debug(f"http_put return: {rsp}") + status = 200 + except HTTPConflict: + log.warn("DomainCrawler - got HTTPConflict from http_put") + status = 409 + except HTTPServiceUnavailable: + log.warn("DomainCrawler - got HTTPServiceUnavailable exception") + status = 503 + except HTTPInternalServerError: + log.warn("DomainCrawler - got 500 error from DN") + status = 500 + except Exception as e: + log.error(f"unexpected exception {e}") + status = 500 + finally: + log.debug("DomainCrawler put_data end try") + + log.debug(f"DomainCrawler put_data for {chunk_id} - returning status: {status}") + self._obj_dict[chunk_id] = {"status": status} + def get_status(self): """ return the highest status of any of the returned objects """ 
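A minimal sketch of how the new put_data action is expected to be driven, based on the fetch() dispatch further below; it assumes the object dict passed to the crawler maps each chunk id to the numpy array holding that dataset's single-chunk value:

# assumed usage; the DomainCrawler constructor call mirrors the put_link
# invocation elsewhere in this patch
objs = {chunk_id: arr}  # one chunk per dataset; arr is a numpy array
crawler = DomainCrawler(app, objs, action="put_data", bucket=bucket)
await crawler.crawl()  # raises on not-found, server-busy, etc., like the other actions
status = crawler.get_status()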
status = None @@ -464,7 +501,7 @@ async def crawl(self): pass # ok elif status == 400: log.warn("DomainCrawler - BadRequest") - raise HTTPBadRequest(reason="unkown") + raise HTTPBadRequest(reason="unknown") elif status == 404: log.warn("DomainCrawler - not found") raise HTTPNotFound() @@ -526,7 +563,7 @@ async def fetch(self, obj_id): await self.put_attributes(obj_id, attr_items) elif self._action == "get_link": - log.debug("DomainCrawlwer - get links") + log.debug("DomainCrawler - get links") log.debug(f"self._objs: {self._objs}, type: {type(self._objs)}") if self._objs is None or obj_id not in self._objs: @@ -546,7 +583,7 @@ async def fetch(self, obj_id): log.debug(f"DomainCrawler - get link titles: {link_titles}") await self.get_links(obj_id, link_titles) elif self._action == "put_link": - log.debug("DomainCrawlwer - put links") + log.debug("DomainCrawler - put links") # write links if self._objs and obj_id not in self._objs: log.error(f"couldn't find {obj_id} in self._objs") @@ -555,11 +592,23 @@ async def fetch(self, obj_id): log.debug(f"got {len(link_items)} link items for {obj_id}") await self.put_links(obj_id, link_items) + elif self._action == "put_data": + log.debug("DomainCrawler - put data") + # write one chunk per dataset + if self._objs and obj_id not in self._objs: + log.error(f"couldn't find {obj_id} in self._objs") + return + data = self._objs[obj_id] + if data is None: + log.error(f"no data found for {obj_id}") + return + + await self.put_data(obj_id, data) else: msg = f"DomainCrawler: unexpected action: {self._action}" log.error(msg) msg = f"DomainCrawler - fetch complete obj_id: {obj_id}, " - msg += f"{len(self._obj_dict)} objects found" + msg += f"{len(self._obj_dict)} objects processed" log.debug(msg) log.debug(f"obj_dict: {len(self._obj_dict)} items") diff --git a/hsds/domain_dn.py b/hsds/domain_dn.py index 83932e5d..5b14ba70 100755 --- a/hsds/domain_dn.py +++ b/hsds/domain_dn.py @@ -16,10 +16,11 @@ from aiohttp.web_exceptions import HTTPConflict, HTTPInternalServerError from aiohttp.web import json_response +from h5json.time_util import getNow + from .util.authUtil import getAclKeys from .util.domainUtil import isValidDomain, getBucketForDomain -from .util.idUtil import validateInPartition -from .util.timeUtil import getNow +from .util.nodeUtil import validateInPartition from .datanode_lib import get_metadata_obj, save_metadata_obj from .datanode_lib import delete_metadata_obj, check_metadata_obj from . 
import hsds_logger as log diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index 56d3611a..5758cd0d 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -18,15 +18,19 @@ import os.path as op from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound -from aiohttp.web_exceptions import HTTPInternalServerError +from aiohttp.web_exceptions import HTTPInternalServerError, HTTPGone from aiohttp.web_exceptions import HTTPConflict, HTTPServiceUnavailable from aiohttp.web import json_response +from h5json.objid import createObjId, getCollectionForId +from h5json.objid import isValidUuid, isRootObjId, isSchema2Id +from h5json.time_util import getNow + + +from .util.nodeUtil import getNodeCount, getDataNodeUrl from .util.httpUtil import getObjectClass, http_post, http_put, http_delete from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse -from .util.idUtil import getDataNodeUrl, createObjId, getCollectionForId -from .util.idUtil import isValidUuid, isSchema2Id, getNodeCount from .util.authUtil import getUserPasswordFromRequest, aclCheck, isAdminUser from .util.authUtil import validateUserPassword, getAclKeys from .util.domainUtil import getParentDomain, getDomainFromRequest @@ -35,7 +39,6 @@ from .util.storUtil import getStorKeys, getCompressors from .util.boolparser import BooleanParser from .util.globparser import globmatch -from .util.timeUtil import getNow from .servicenode_lib import getDomainJson, getObjectJson, getObjectIdByPath from .servicenode_lib import getRootInfo, checkBucketAccess, doFlush, getDomainResponse from .basenode import getVersion @@ -97,7 +100,7 @@ async def get_collections(app, root_id, bucket=None, max_objects_limit=None): async def getDomainObjects(app, root_id, include_attrs=False, bucket=None): - """Iterate through all objects in heirarchy and add to obj_dict + """Iterate through all objects in hierarchy and add to obj_dict keyed by obj id """ @@ -459,6 +462,11 @@ async def GET_Domain(request): if "verbose" in params and params["verbose"]: verbose = True + getobjs = False + # include domain objects if requested + if params.get("getobjs"): + getobjs = True + if not domain: log.info("no domain passed in, returning all top-level domains") # no domain passed in, return top-level domains for this request @@ -540,23 +548,9 @@ async def GET_Domain(request): return resp # return just the keys as per the REST API - kwargs = {"verbose": verbose, "bucket": bucket} + kwargs = {"verbose": verbose, "getobjs": getobjs, "bucket": bucket} rsp_json = await getDomainResponse(app, domain_json, **kwargs) - # include domain objects if requested - if params.get("getobjs") and "root" in domain_json: - - log.debug("getting all domain objects") - root_id = domain_json["root"] - kwargs = {"include_attrs": include_attrs, "bucket": bucket} - domain_objs = await getDomainObjects(app, root_id, **kwargs) - if domain_objs: - rsp_json["domain_objs"] = domain_objs - - # include domain class if present - # if "class" in domain_json: - # rsp_json["class"] = domain_json["class"] - # include dn_ids if requested if "getdnids" in params and params["getdnids"]: rsp_json["dn_ids"] = app["dn_ids"] @@ -752,7 +746,7 @@ async def PUT_Domain(request): username, pswd = getUserPasswordFromRequest(request) await validateUserPassword(app, username, pswd) - # inital perms for owner and default + # initial perms for owner and default owner_perm = { "create": True, "read": True, @@ -856,7 +850,7 @@ async def PUT_Domain(request): if "root" in domain_json: # 
nothing to update for folders root_id = domain_json["root"] - if not isValidUuid(root_id): + if not isValidUuid(root_id, obj_class="groups"): msg = f"domain: {domain} with invalid root id: {root_id}" log.error(msg) raise HTTPInternalServerError() @@ -983,8 +977,33 @@ async def PUT_Domain(request): if not is_folder and not linked_json: # create a root group for the new domain - root_id = createObjId("roots") - log.debug(f"new root group id: {root_id}") + if body and "root_id" in body: + root_id = body["root_id"] + if not isRootObjId(root_id): + msg = f"invalid client provided root id: {root_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # verify that the group object doesn't already exist + log.debug(f"attempting to fetch root id: {root_id}") + kwargs = { + "refresh": True, + "include_links": False, + "include_attrs": False, + "bucket": bucket, + } + try: + await getObjectJson(app, root_id, **kwargs) + msg = "client specified root_id already exists" + log.warn(msg) + raise HTTPConflict() + except HTTPNotFound: + log.debug(f"root_id: {root_id} not found (expected)") + except HTTPGone: + log.debug(f"root_id: {root_id} has been removed (expected)") + log.debug(f"using client supplied root_id: {root_id}") + else: + root_id = createObjId("groups") + log.debug(f"new root group id: {root_id}") group_json = {"id": root_id, "root": root_id, "domain": domain} log.debug(f"create group for domain, body: {group_json}") diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index 34a8ff6f..3d5a261e 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -17,10 +17,10 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response +from h5json.objid import isValidUuid, validateUuid +from h5json.time_util import getNow -from .util.idUtil import isValidUuid, validateUuid from .util.domainUtil import isValidBucketName -from .util.timeUtil import getNow from .datanode_lib import get_obj_id, check_metadata_obj, get_metadata_obj from .datanode_lib import save_metadata_obj, delete_metadata_obj from . 
import hsds_logger as log @@ -33,7 +33,7 @@ async def GET_Dataset(request): params = request.rel_url.query dset_id = get_obj_id(request) - if not isValidUuid(dset_id, obj_class="dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): log.error(f"Unexpected dataset_id: {dset_id}") raise HTTPInternalServerError() if "bucket" in params: @@ -57,9 +57,12 @@ async def GET_Dataset(request): resp_json["shape"] = dset_json["shape"] resp_json["attributeCount"] = len(dset_json["attributes"]) if "creationProperties" in dset_json: - resp_json["creationProperties"] = dset_json["creationProperties"] + cpl = dset_json["creationProperties"] + else: + cpl = {} if "layout" in dset_json: - resp_json["layout"] = dset_json["layout"] + cpl["layout"] = dset_json["layout"] + resp_json["creationProperties"] = cpl if "include_attrs" in params and params["include_attrs"]: resp_json["attributes"] = dset_json["attributes"] @@ -94,15 +97,21 @@ async def POST_Dataset(request): raise HTTPBadRequest(reason=msg) dset_id = get_obj_id(request, body=body) - if not isValidUuid(dset_id, obj_class="dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): log.error(f"Unexpected dataset_id: {dset_id}") raise HTTPInternalServerError() + deleted_ids = app["deleted_ids"] + if dset_id in deleted_ids: + log.warn(f"POST Dataset has id: {dset_id} that has previously been deleted") + deleted_ids.remove(dset_id) + # verify the id doesn't already exist obj_found = await check_metadata_obj(app, dset_id, bucket=bucket) if obj_found: - log.error("Post with existing dset_id: {}".format(dset_id)) - raise HTTPInternalServerError() + msg = f"Post with existing dset_id: {dset_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) if "root" not in body: msg = "POST_Dataset with no root" @@ -127,14 +136,20 @@ async def POST_Dataset(request): raise HTTPInternalServerError() shape_json = body["shape"] - layout = None if "layout" in body: - layout = body["layout"] # client specified chunk layout + log.error("unexpected key for POST Dataset: 'layout'") - # ok - all set, create committed type obj + # ok - all set, create dataset obj now = getNow(app) - log.debug(f"POST_dataset typejson: {type_json}, shapejson: {shape_json}") + if "attributes" in body: + # initialize attributes + attrs = body["attributes"] + log.debug(f"POST Dataset with attributes: {attrs}") + else: + attrs = {} + + log.debug(f"POST_dataset type_json: {type_json}, shape_json: {shape_json}") dset_json = { "id": dset_id, @@ -143,13 +158,14 @@ async def POST_Dataset(request): "lastModified": now, "type": type_json, "shape": shape_json, - "attributes": {}, + "attributes": attrs, } if "creationProperties" in body: - dset_json["creationProperties"] = body["creationProperties"] - if layout is not None: - dset_json["layout"] = layout + cpl = body["creationProperties"] + else: + cpl = {} + dset_json["creationProperties"] = cpl kwargs = {"bucket": bucket, "notify": True, "flush": True} await save_metadata_obj(app, dset_id, dset_json, **kwargs) @@ -161,7 +177,8 @@ async def POST_Dataset(request): resp_json["type"] = type_json resp_json["shape"] = shape_json resp_json["lastModified"] = dset_json["lastModified"] - resp_json["attributeCount"] = 0 + resp_json["attributeCount"] = len(attrs) + resp_json["creationProperties"] = cpl resp = json_response(resp_json, status=201) log.response(request, resp=resp) @@ -176,7 +193,7 @@ async def DELETE_Dataset(request): dset_id = request.match_info.get("id") log.info(f"DELETE dataset: {dset_id}") - if not isValidUuid(dset_id, obj_class="dataset"): + 
if not isValidUuid(dset_id, obj_class="datasets"): log.error(f"Unexpected dataset id: {dset_id}") raise HTTPInternalServerError() @@ -220,7 +237,7 @@ async def PUT_DatasetShape(request): params = request.rel_url.query dset_id = request.match_info.get("id") - if not isValidUuid(dset_id, obj_class="dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): log.error(f"Unexpected dset_id: {dset_id}") raise HTTPInternalServerError() @@ -273,7 +290,7 @@ async def PUT_DatasetShape(request): if i == extend_dim: lb = dims[i] ub = lb + extension - if maxdims[extend_dim] != 0 and ub > maxdims[extend_dim]: + if maxdims[extend_dim] not in (0, "H5S_UNLIMITED") and ub > maxdims[extend_dim]: msg = "maximum extent exceeded" log.warn(msg) raise HTTPConflict() diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 1fe89b3e..a6c58b45 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -11,22 +11,29 @@ ############################################################################## import asyncio +from asyncio import IncompleteReadError + import math import numpy as np from aiohttp.client_exceptions import ClientError -from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict, HTTPInternalServerError -from .util.arrayUtil import getNumpyValue +from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict +from aiohttp.web_exceptions import HTTPInternalServerError, HTTPRequestEntityTooLarge + +from h5json.hdf5dtype import createDataType, getItemSize, getDtypeItemSize +from h5json.array_util import getNumpyValue, bytesToArray +from h5json.objid import isSchema2Id, getS3Key, getObjId +from h5json.shape_util import isNullSpace, getShapeDims +from h5json.dset_util import getChunkDims, getDatasetLayout, getDatasetLayoutClass + +from .util.nodeUtil import getDataNodeUrl from .util.boolparser import BooleanParser -from .util.dsetUtil import isNullSpace, getDatasetLayout, getDatasetLayoutClass, get_slices -from .util.dsetUtil import getChunkLayout, getSelectionShape, getShapeDims +from .util.dsetUtil import get_slices, getSelectionShape from .util.chunkUtil import getChunkCoordinate, getChunkIndex, getChunkSuffix from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getQueryDtype, get_chunktable_dims -from .util.hdf5dtype import createDataType, getItemSize -from .util.httpUtil import http_delete, http_put -from .util.idUtil import getDataNodeUrl, isSchema2Id, getS3Key, getObjId +from .util.httpUtil import http_delete, http_put, request_read from .util.rangegetUtil import getHyperChunkFactors from .util.storUtil import getStorKeys @@ -364,7 +371,7 @@ def get_chunk_selections(chunk_map, chunk_ids, slices, dset_json): log.debug("no slices set, returning") return # nothing to do log.debug(f"slices: {slices}") - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) for chunk_id in chunk_ids: if chunk_id in chunk_map: item = chunk_map[chunk_id] @@ -442,7 +449,7 @@ async def getSelectionData( log.error("getSelectionData - expected either slices or points to be set") raise HTTPInternalServerError() - layout = getChunkLayout(dset_json) + layout = getChunkDims(dset_json) chunkinfo = {} @@ -855,7 +862,15 @@ async def reduceShape(app, dset_json, shape_update, bucket=None): arr = np.zeros([1], dtype=dt, order="C") # and the chunk layout - layout = tuple(getChunkLayout(dset_json)) + layout = getChunkDims(dset_json) + if not layout: + layout = dset_json.get("layout") # older storage version put layout 
here + if layout: + log.warn(f"got layout for {dset_id} from dataset_json") + if not layout: + msg = f"no layout found for {dset_id}" + log.error(msg) + raise HTTPInternalServerError() log.debug(f"got layout: {layout}") # get all chunk ids for chunks that have been allocated @@ -992,7 +1007,7 @@ async def updateShape(app, dset_json, shape_update, bucket=None): raise HTTPBadRequest(reason=msg) decreasing_dims.append(i) elif shape_update[i] > dims[i]: - if maxdims[i] != 0 and shape_update[i] > maxdims[i]: + if maxdims[i] not in (0, "H5S_UNLIMITED") and shape_update[i] > maxdims[i]: msg = "Extension dimension can not be extended past max extent" log.warn(msg) raise HTTPConflict() @@ -1053,3 +1068,185 @@ async def deleteAllChunks(app, dset_id, bucket=None): await removeChunks(app, chunk_ids, bucket=bucket) else: log.info(f"deleteAllChunks for {dset_id} - no chunks need deletion") + + +async def doPointWrite(app, + request, + points=None, + data=None, + dset_json=None, + bucket=None + ): + """ write the given points to the dataset """ + + num_points = len(points) + log.debug(f"doPointWrite - num_points: {num_points}") + dset_id = dset_json["id"] + layout = getChunkDims(dset_json) + datashape = dset_json["shape"] + dims = getShapeDims(datashape) + rank = len(dims) + + chunk_dict = {} # chunk ids to list of points in chunk + + for pt_indx in range(num_points): + if rank == 1: + point = int(points[pt_indx]) + else: + point_tuple = points[pt_indx] + point = [] + for i in range(len(point_tuple)): + point.append(int(point_tuple[i])) + if rank == 1: + if point < 0 or point >= dims[0]: + msg = f"PUT Value point: {point} is not within the " + msg += "bounds of the dataset" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + if len(point) != rank: + msg = "PUT Value point value did not match dataset rank" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + for i in range(rank): + if point[i] < 0 or point[i] >= dims[i]: + msg = f"PUT Value point: {point} is not within the " + msg += "bounds of the dataset" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + chunk_id = getChunkId(dset_id, point, layout) + # get the pt_indx element from the input data + value = data[pt_indx] + if chunk_id not in chunk_dict: + point_list = [point, ] + point_data = [value, ] + chunk_dict[chunk_id] = {"indices": point_list, "points": point_data} + else: + item = chunk_dict[chunk_id] + point_list = item["indices"] + point_list.append(point) + point_data = item["points"] + point_data.append(value) + + num_chunks = len(chunk_dict) + log.debug(f"num_chunks: {num_chunks}") + max_chunks = int(config.get("max_chunks_per_request", default=1000)) + if num_chunks > max_chunks: + msg = f"PUT value request with more than {max_chunks} chunks" + log.warn(msg) + + chunk_ids = list(chunk_dict.keys()) + chunk_ids.sort() + + crawler = ChunkCrawler( + app, + chunk_ids, + dset_json=dset_json, + bucket=bucket, + points=chunk_dict, + action="write_point_sel", + ) + await crawler.crawl() + + crawler_status = crawler.get_status() + + if crawler_status not in (200, 201): + msg = f"doPointWrite raising HTTPInternalServerError for status: {crawler_status}" + log.error(msg) + raise HTTPInternalServerError() + else: + log.info("doPointWrite success") + + +async def doHyperslabWrite(app, + request, + page_number=0, + page=None, + data=None, + dset_json=None, + select_dtype=None, + bucket=None + ): + """ write the given page selection to the dataset """ + dset_id = dset_json["id"] + log.info(f"doHyperslabWrite on {dset_id} - page: {page_number} 
dset_json: {dset_json}") + type_json = dset_json["type"] + + if select_dtype is not None: + item_size = getDtypeItemSize(select_dtype) + else: + item_size = getItemSize(type_json) + if item_size == "H5T_VARIABLE" and data is None: + msg = "unexpected call to doHyperslabWrite for variable length data" + log.error(msg) + raise HTTPInternalServerError() + + layout = getChunkDims(dset_json) + + num_chunks = getNumChunks(page, layout) + log.debug(f"num_chunks: {num_chunks}") + max_chunks = int(config.get("max_chunks_per_request", default=1000)) + if num_chunks > max_chunks: + msg = f"PUT value chunk count: {num_chunks} exceeds max_chunks: {max_chunks}" + log.warn(msg) + select_shape = getSelectionShape(page) + log.debug(f"got select_shape: {select_shape} for page: {page_number}") + + if data is None: + num_bytes = math.prod(select_shape) * item_size + log.debug(f"reading {num_bytes} from request stream") + # read page of data from input stream + try: + page_bytes = await request_read(request, count=num_bytes) + except HTTPRequestEntityTooLarge as tle: + msg = "Got HTTPRequestEntityTooLarge exception during " + msg += f"binary read: {tle}) for page: {page_number}" + log.warn(msg) + raise # re-throw + except IncompleteReadError as ire: + msg = "Got asyncio.IncompleteReadError during binary " + msg += f"read: {ire} for page: {page_number}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"read {len(page_bytes)} for page: {page_number}") + try: + arr = bytesToArray(page_bytes, select_dtype, select_shape) + except ValueError as ve: + msg = f"bytesToArray value error for page: {page_number}: {ve}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + arr = data # use array provided to function + + try: + chunk_ids = getChunkIds(dset_id, page, layout) + except ValueError: + log.warn("getChunkIds failed") + raise HTTPInternalServerError() + if len(chunk_ids) < 10: + log.debug(f"chunk_ids: {chunk_ids}") + else: + log.debug(f"chunk_ids: {chunk_ids[:10]} ...") + if len(chunk_ids) > max_chunks: + msg = f"got {len(chunk_ids)} for page: {page_number}. 
max_chunks: {max_chunks}" + log.warn(msg) + + crawler = ChunkCrawler( + app, + chunk_ids, + dset_json=dset_json, + bucket=bucket, + slices=page, + arr=arr, + action="write_chunk_hyperslab", + ) + await crawler.crawl() + + crawler_status = crawler.get_status() + + if crawler_status not in (200, 201): + msg = f"crawler failed for page: {page_number} with status: {crawler_status}" + log.error(msg) + raise HTTPInternalServerError() + else: + log.info("crawler write_chunk_hyperslab successful") diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 38e1156a..bd7ad394 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -14,226 +14,32 @@ # handles dataset requests # -import math from json import JSONDecodeError -from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound +from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound, HTTPInternalServerError + +from h5json.hdf5dtype import createDataType +from h5json.array_util import getNumElements, jsonToArray +from h5json.objid import isValidUuid, isSchema2Id +from h5json.shape_util import getShapeDims +from h5json.dset_util import getChunkDims, getDatasetLayoutClass from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse, getBooleanParam -from .util.idUtil import isValidUuid, isSchema2Id -from .util.dsetUtil import getPreviewQuery, getFilterItem, getShapeDims -from .util.arrayUtil import getNumElements, getNumpyValue -from .util.chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk -from .util.chunkUtil import getContiguousLayout +from .util.chunkUtil import getChunkIds +from .util.dsetUtil import getPreviewQuery, getHyperslabSelection from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot -from .util.storUtil import getSupportedFilters -from .util.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson -from .util.hdf5dtype import getItemSize -from .util.linkUtil import validateLinkName from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo -from .servicenode_lib import createObject, createObjectByPath, deleteObject +from .servicenode_lib import getDatasetCreateArgs, createDataset, deleteObject from .dset_lib import updateShape, deleteAllChunks -from . import config +from .post_crawl import createDatasets +from .domain_crawl import DomainCrawler from . import hsds_logger as log -async def validateChunkLayout(app, shape_json, item_size, layout, bucket=None): - """ - Use chunk layout given in the creationPropertiesList (if defined and - layout is valid). 
- Return chunk_layout_json - """ - - rank = 0 - space_dims = None - chunk_dims = None - max_dims = None - - if "dims" in shape_json: - space_dims = shape_json["dims"] - rank = len(space_dims) - - if "maxdims" in shape_json: - max_dims = shape_json["maxdims"] - if "dims" in layout: - chunk_dims = layout["dims"] - - if chunk_dims: - # validate that the chunk_dims are valid and correlates with the - # dataset shape - if isinstance(chunk_dims, int): - chunk_dims = [ - chunk_dims, - ] # promote to array - if len(chunk_dims) != rank: - msg = "Layout rank does not match shape rank" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - for i in range(rank): - dim_extent = space_dims[i] - chunk_extent = chunk_dims[i] - if not isinstance(chunk_extent, int): - msg = "Layout dims must be integer or integer array" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if chunk_extent <= 0: - msg = "Invalid layout value" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if max_dims is None: - if chunk_extent > dim_extent: - msg = "Invalid layout value" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif max_dims[i] != 0: - if chunk_extent > max_dims[i]: - msg = "Invalid layout value for extensible dimension" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - pass # allow any positive value for unlimited dimensions - - if "class" not in layout: - msg = "class key not found in layout for creation property list" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - layout_class = layout["class"] - - if layout_class == "H5D_CONTIGUOUS_REF": - # reference to a dataset in a traditional HDF5 files with - # contigious storage - if item_size == "H5T_VARIABLE": - # can't be used with variable types... - msg = "Datsets with variable types cannot be used with " - msg += "reference layouts" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "file_uri" not in layout: - # needed for H5D_CONTIGUOUS_REF - msg = "'file_uri' key must be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "offset" not in layout: - # needed for H5D_CONTIGUOUS_REF - msg = "'offset' key must be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "size" not in layout: - # needed for H5D_CONTIGUOUS_REF - msg = "'size' key must be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "dims" in layout: - # used defined chunk layout not allowed for H5D_CONTIGUOUS_REF - msg = "'dims' key can not be provided for " - msg += "H5D_CONTIGUOUS_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_CHUNKED_REF": - # reference to a dataset in a traditional HDF5 files with - # chunked storage - if item_size == "H5T_VARIABLE": - # can't be used with variable types.. 
- msg = "Datsets with variable types cannot be used with " - msg += "reference layouts" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "file_uri" not in layout: - # needed for H5D_CHUNKED_REF - msg = "'file_uri' key must be provided for " - msg += "H5D_CHUNKED_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "dims" not in layout: - # needed for H5D_CHUNKED_REF - msg = "'dimns' key must be provided for " - msg += "H5D_CHUNKED_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "chunks" not in layout: - msg = "'chunks' key must be provided for " - msg += "H5D_CHUNKED_REF layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_CHUNKED_REF_INDIRECT": - # reference to a dataset in a traditional HDF5 files with chunked - # storage using an auxillary dataset - if item_size == "H5T_VARIABLE": - # can't be used with variable types.. - msg = "Datsets with variable types cannot be used with " - msg += "reference layouts" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "dims" not in layout: - # needed for H5D_CHUNKED_REF_INDIRECT - msg = "'dimns' key must be provided for " - msg += "H5D_CHUNKED_REF_INDIRECT layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "chunk_table" not in layout: - msg = "'chunk_table' key must be provided for " - msg += "H5D_CHUNKED_REF_INDIRECT layout" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - chunktable_id = layout["chunk_table"] - if not isValidUuid(chunktable_id, "Dataset"): - msg = f"Invalid chunk table id: {chunktable_id}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - # verify the chunk table exists and is of reasonable shape - try: - chunktable_json = await getDsetJson(app, chunktable_id, bucket=bucket) - except HTTPNotFound: - msg = f"chunk table id: {chunktable_id} not found" - log.warn(msg) - raise - chunktable_shape = chunktable_json["shape"] - if chunktable_shape["class"] == "H5S_NULL": - msg = "Null space datasets can not be used as chunk tables" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - chunktable_dims = getShapeDims(chunktable_shape) - if len(chunktable_dims) != len(space_dims): - msg = "Chunk table rank must be same as dataspace rank" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_CHUNKED": - if "dims" not in layout: - msg = "dims key not found in layout for creation property list" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if shape_json["class"] != "H5S_SIMPLE": - msg = "Bad Request: chunked layout not valid with shape class: " - msg += f"{shape_json['class']}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_CONTIGUOUS": - if "dims" in layout: - msg = "dims key found in layout for creation property list " - msg += "for H5D_CONTIGUOUS storage class" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif layout_class == "H5D_COMPACT": - if "dims" in layout: - msg = "dims key found in layout for creation property list " - msg += "for H5D_COMPACT storage class" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - msg = f"Unexpected layout: {layout_class}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - async def getDatasetDetails(app, dset_id, root_id, bucket=None): """Get extra information about the given dataset""" # Gather additional info on the domain @@ -282,7 +88,7 @@ async def GET_Dataset(request): include_attrs = True if dset_id: - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid 
dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -293,7 +99,7 @@ async def GET_Dataset(request): group_id = None if "grpid" in params: group_id = params["grpid"] - if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid parent group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -337,7 +143,7 @@ async def GET_Dataset(request): # throws 404 if not found kwargs = {"bucket": bucket, "domain": domain} dset_id, domain, _ = await getObjectIdByPath(app, group_id, h5path, **kwargs) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"No dataset exist with the path: {h5path}" log.warn(msg) raise HTTPNotFound() @@ -425,7 +231,7 @@ async def GET_DatasetType(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -477,7 +283,7 @@ async def GET_DatasetShape(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -534,7 +340,7 @@ async def PUT_DatasetShape(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -670,7 +476,7 @@ async def POST_Dataset(request): log.warn(msg) raise HTTPBadRequest(reason=msg) - log.debug(f"got body: {body}") + log.debug(f"POST_Dataset got body: {body}") # get domain, check authorization domain = getDomainFromRequest(request) if not isValidDomain(domain): @@ -688,472 +494,218 @@ async def POST_Dataset(request): verifyRoot(domain_json) - # - # validate type input - # - if "type" not in body: - msg = "POST Dataset has no type key in body" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + # allow parent group creation or not + implicit = getBooleanParam(params, "implicit") - datatype = body["type"] - log.debug(f"got datatype: {datatype}") - if isinstance(datatype, str) and datatype.startswith("t-"): - # Committed type - fetch type json from DN - ctype_id = datatype - log.debug(f"got ctypeid: {ctype_id}") - ctype_json = await getObjectJson(app, ctype_id, bucket=bucket) - log.debug(f"ctype: {ctype_json}") - if ctype_json["root"] != root_id: - msg = "Referenced committed datatype must belong in same domain" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - datatype = ctype_json["type"] - # add the ctype_id to type type - datatype["id"] = ctype_id - elif isinstance(datatype, str): - try: - # convert predefined type string (e.g. 
"H5T_STD_I32LE") to - # corresponding json representation - datatype = getBaseTypeJson(datatype) - log.debug(f"got datatype: {datatype}") - except TypeError: - msg = "POST Dataset with invalid predefined type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + post_rsp = None - try: - validateTypeItem(datatype) - except KeyError as ke: - msg = f"KeyError creating type: {ke}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except TypeError as te: - msg = f"TypeError creating type: {te}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - except ValueError as ve: - msg = f"ValueError creating type: {ve}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + datatype_json = None + init_values = [] # value initializer for each object - item_size = getItemSize(datatype) + def _updateInitValuesList(kwargs): + # remove value key from kwargs and append + # to init_values list + if "value" in kwargs: + init_values.append(kwargs["value"]) + log.debug(f"init value appended: {kwargs['value']}") + del kwargs["value"] + else: + # add a placeholder + init_values.append(None) # - # Validate shape input + # handle case of committed type input # - dims = None - shape_json = {} - rank = 0 - chunk_size = None - - if "shape" not in body: - shape_json["class"] = "H5S_SCALAR" - else: - shape = body["shape"] - log.debug(f"got shape: {shape}") - if isinstance(shape, int): - shape_json["class"] = "H5S_SIMPLE" - dims = [shape, ] - shape_json["dims"] = dims - rank = 1 - elif isinstance(shape, str): - # only valid string value is H5S_NULL or H5S_SCALAR - if shape == "H5S_NULL": - shape_json["class"] = "H5S_NULL" - elif shape == "H5S_SCALAR": - shape_json["class"] = "H5S_SCALAR" - else: - msg = "POST Datset with invalid shape value" + if isinstance(body, dict) and "type" in body: + + body_type = body["type"] + log.debug(f"got datatype: {body_type}") + if isinstance(body_type, str) and body_type.startswith("t-"): + ctype_id = body_type + # Committed type - fetch type json from DN + log.debug(f"got ctype_id: {ctype_id}") + ctype_json = await getObjectJson(app, ctype_id, bucket=bucket) + log.debug(f"ctype: {ctype_json}") + if ctype_json["root"] != root_id: + msg = "Referenced committed datatype must belong in same domain" log.warn(msg) raise HTTPBadRequest(reason=msg) - elif isinstance(shape, list): - if len(shape) == 0: - shape_json["class"] = "H5S_SCALAR" - else: - shape_json["class"] = "H5S_SIMPLE" - shape_json["dims"] = shape - dims = shape - rank = len(dims) - else: - msg = "Bad Request: shape is invalid" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if dims is not None: - for i in range(rank): - extent = dims[i] - if not isinstance(extent, int): - msg = "Invalid shape type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if extent < 0: - msg = "shape dimension is negative" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - maxdims = None - if "maxdims" in body: - if dims is None: - msg = "Maxdims cannot be supplied if space is NULL" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - maxdims = body["maxdims"] - if isinstance(maxdims, int): - dim1 = maxdims - maxdims = [dim1] - elif isinstance(maxdims, list): - pass # can use as is - else: - msg = "Bad Request: maxdims is invalid" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if len(dims) != len(maxdims): - msg = "Maxdims rank doesn't match Shape" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if maxdims is not None: - for extent in maxdims: - if not isinstance(extent, int): - msg = "Invalid maxdims type" - log.warn(msg) - 
raise HTTPBadRequest(reason=msg) - if extent < 0: - msg = "maxdims dimension is negative" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if len(maxdims) != len(dims): - msg = "Bad Request: maxdims array length must equal " - msg += "shape array length" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - shape_json["maxdims"] = [] - for i in range(rank): - maxextent = maxdims[i] - if not isinstance(maxextent, int): - msg = "Bad Request: maxdims must be integer type" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif maxextent == 0: - # unlimited dimension - shape_json["maxdims"].append(0) - elif maxextent < dims[i]: - msg = "Bad Request: maxdims extent can't be smaller " - msg += "than shape extent" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - shape_json["maxdims"].append(maxextent) - - layout_props = None - min_chunk_size = int(config.get("min_chunk_size")) - max_chunk_size = int(config.get("max_chunk_size")) - if "creationProperties" in body: - creationProperties = body["creationProperties"] - log.debug(f"got creationProperties: {creationProperties}") - if "layout" in creationProperties: - layout_props = creationProperties["layout"] - await validateChunkLayout(app, shape_json, item_size, layout_props, bucket=bucket) - else: - creationProperties = {} - - # TBD: check for invalid layout class... - if layout_props: - if layout_props["class"] == "H5D_CONTIGUOUS": - # treat contiguous as chunked - layout_class = "H5D_CHUNKED" + datatype_json = ctype_json["type"] + # add the ctype_id to type type + datatype_json["id"] = ctype_id else: - layout_class = layout_props["class"] - elif shape_json["class"] != "H5S_NULL": - layout_class = "H5D_CHUNKED" - else: - layout_class = None - - if layout_class == "H5D_COMPACT": - layout = {"class": "H5D_COMPACT"} - elif layout_class: - # initialize to H5D_CHUNKED - layout = {"class": "H5D_CHUNKED"} - else: - # null space - no layout - layout = None - - if layout_props and "dims" in layout_props: - chunk_dims = layout_props["dims"] - else: - chunk_dims = None - - if layout_class == "H5D_CONTIGUOUS_REF": - kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} - chunk_dims = getContiguousLayout(shape_json, item_size, **kwargs) - layout["dims"] = chunk_dims - log.debug(f"autoContiguous layout: {layout}") - - if layout_class == "H5D_CHUNKED" and chunk_dims is None: - # do autochunking - chunk_dims = guessChunk(shape_json, item_size) - log.debug(f"initial autochunk layout: {chunk_dims}") - - if layout_class == "H5D_CHUNKED": - chunk_size = getChunkSize(chunk_dims, item_size) - - msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " - msg += f"max: {max_chunk_size}" - log.debug(msg) - # adjust the chunk shape if chunk size is too small or too big - adjusted_chunk_dims = None - if chunk_size < min_chunk_size: - msg = f"chunk size: {chunk_size} less than min size: " - msg += f"{min_chunk_size}, expanding" - log.debug(msg) - kwargs = {"chunk_min": min_chunk_size, "layout_class": layout_class} - adjusted_chunk_dims = expandChunk(chunk_dims, item_size, shape_json, **kwargs) - elif chunk_size > max_chunk_size: - msg = f"chunk size: {chunk_size} greater than max size: " - msg += f"{max_chunk_size}, shrinking" - log.debug(msg) - kwargs = {"chunk_max": max_chunk_size} - adjusted_chunk_dims = shrinkChunk(chunk_dims, item_size, **kwargs) - if adjusted_chunk_dims: - msg = f"requested chunk_dimensions: {chunk_dims} modified " - msg += f"dimensions: {adjusted_chunk_dims}" - log.debug(msg) - layout["dims"] = adjusted_chunk_dims + pass 
# we'll fetch type in getDatasetCreateArgs + + if isinstance(body, list): + count = len(body) + log.debug(f"multiple dataset create: {count} items") + if count == 0: + # equivalent to no body + msg = "POST Dataset with no body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif count == 1: + # just create one object in typical way + kwargs = getDatasetCreateArgs(body[0], + root_id=root_id, + type=datatype_json, + bucket=bucket, + implicit=implicit) + _updateInitValuesList(kwargs) else: - layout["dims"] = chunk_dims # don't need to adjust chunk size - - # set partition_count if needed: - max_chunks_per_folder = int(config.get("max_chunks_per_folder")) - set_partition = False - if max_chunks_per_folder > 0: - if "dims" in shape_json and "dims" in layout: - set_partition = True - - if set_partition: - chunk_dims = layout["dims"] - shape_dims = shape_json["dims"] - if "maxdims" in shape_json: - max_dims = shape_json["maxdims"] - else: - max_dims = None - num_chunks = 1 - rank = len(shape_dims) - unlimited_count = 0 - if max_dims: - for i in range(rank): - if max_dims[i] == 0: - unlimited_count += 1 - msg = f"number of unlimited dimensions: {unlimited_count}" - log.debug(msg) - - for i in range(rank): - max_dim = 1 - if max_dims: - max_dim = max_dims[i] - if max_dim == 0: - # don't really know what the ultimate extent - # could be, but assume 10^6 for total number of - # elements and square-shaped array... - MAX_ELEMENT_GUESS = 10.0 ** 6 - exp = 1 / unlimited_count - max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) - else: - max_dim = shape_dims[i] - num_chunks *= math.ceil(max_dim / chunk_dims[i]) - - if num_chunks > max_chunks_per_folder: - partition_count = math.ceil(num_chunks / max_chunks_per_folder) - msg = f"set partition count to: {partition_count}, " - msg += f"num_chunks: {num_chunks}" - log.info(msg) - layout["partition_count"] = partition_count - else: - msg = "do not need chunk partitions, num_chunks: " - msg += f"{num_chunks} max_chunks_per_folder: " - msg += f"{max_chunks_per_folder}" - log.info(msg) - - if layout_class in ("H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT"): - chunk_size = getChunkSize(chunk_dims, item_size) - - msg = f"chunk_size: {chunk_size}, min: {min_chunk_size}, " - msg += f"max: {max_chunk_size}" - log.debug(msg) - # nothing to do about inefficiently small chunks, but large chunks - # can be subdivided - if chunk_size < min_chunk_size: - msg = f"chunk size: {chunk_size} less than min size: " - msg += f"{min_chunk_size} for {layout_class} dataset" - log.warn(msg) - elif chunk_size > max_chunk_size: - msg = f"chunk size: {chunk_size} greater than max size: " - msg += f"{max_chunk_size}, for {layout_class} dataset" - log.warn(msg) - layout["dims"] = chunk_dims - - if creationProperties: - # TBD - validate all creationProperties - if "fillValue" in creationProperties: - # validate fill value compatible with type - dt = createDataType(datatype) - fill_value = creationProperties["fillValue"] - if "fillValue_encoding" in creationProperties: - fill_value_encoding = creationProperties["fillValue_encoding"] - - if fill_value_encoding not in ("None", "base64"): - msg = f"unexpected value for fill_value_encoding: {fill_value_encoding}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - # should see a string in this case - if not isinstance(fill_value, str): - msg = f"unexpected fill value: {fill_value} " - msg += f"for encoding: {fill_value_encoding}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - fill_value_encoding = None + # create 
multiple dataset objects + kwarg_list = [] # list of kwargs for each object - try: - getNumpyValue(fill_value, dt=dt, encoding=fill_value_encoding) - except ValueError: - msg = f"invalid fill value: {fill_value}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if "filters" in creationProperties: - # convert to standard representation - # refer to https://hdf5-json.readthedocs.io/en/latest/bnf/\ - # filters.html#grammar-token-filter_list - f_in = creationProperties["filters"] - supported_filters = getSupportedFilters(include_compressors=True) - log.debug(f"supported_compressors: {supported_filters}") - - log.debug(f"filters provided in creationProperties: {f_in}") - - if not isinstance(f_in, list): - msg = "Expected filters in creationProperties to be a list" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if f_in and chunk_size is None: - # filters can only be used with chunked datasets - msg = "Filters can only be used with chunked datasets" - log.warning(msg) - raise HTTPBadRequest(reason=msg) - - f_out = [] - for filter in f_in: - if isinstance(filter, int) or isinstance(filter, str): - item = getFilterItem(filter) - if not item: - msg = f"filter {filter} not recognized" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if item["name"] not in supported_filters: - msg = f"filter {filter} is not supported" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - f_out.append(item) - elif isinstance(filter, dict): - if "class" not in filter: - msg = "expected 'class' key for filter property" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if filter["class"] != "H5Z_FILTER_USER": - item = getFilterItem(filter["class"]) - elif "id" in filter: - item = getFilterItem(filter["id"]) - elif "name" in filter: - item = getFilterItem(filter["name"]) - else: - item = None - if not item: - msg = f"filter {filter['class']} not recognized" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "id" not in filter: - filter["id"] = item["id"] - elif item["id"] != filter["id"]: - msg = f"Expected {filter['class']} to have id: " - msg += f"{item['id']} but got {filter['id']}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if "name" not in filter: - filter["name"] = item["name"] - if filter["name"] not in supported_filters: - msg = f"filter {filter} is not supported" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - f_out.append(filter) - else: - msg = f"Unexpected type for filter: {filter}" + for item in body: + log.debug(f"item: {item}") + if not isinstance(item, dict): + msg = f"Post_Dataset - invalid item type: {type(item)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - # replace filters with our starndardized list - log.debug(f"setting filters to: {f_out}") - creationProperties["filters"] = f_out - - log.debug(f"set dataset json creationPropries: {creationProperties}") - - parent_id = None - link_title = None - h5path = None - if "link" in body: - if "h5path" in body: - msg = "link can't be used with h5path" + kwargs = getDatasetCreateArgs(item, + root_id=root_id, + type=datatype_json, + bucket=bucket) + _updateInitValuesList(kwargs) + kwargs["ignore_link"] = True + kwarg_list.append(kwargs) + kwargs = {"bucket": bucket, "root_id": root_id} + if datatype_json: + kwargs["type"] = datatype_json + log.debug(f"createDatasetObjects, items: {kwarg_list}") + post_rsp = await createDatasets(app, kwarg_list, **kwargs) + else: + # single object create + kwargs = getDatasetCreateArgs(body, + root_id=root_id, + type=datatype_json, + bucket=bucket, + implicit=implicit) + 
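For context, the list-body branch above implies a request payload along the following lines. The field names are inferred from the keys read by this handler and by createDatasets ("type", "shape", optional "link" with a parent id and title); treat the endpoint, the exact schema, and the placeholder ids as assumptions rather than API documentation.

import json

# hypothetical body for a multi-dataset create (e.g. POST /datasets) -- field names
# inferred from the handler above, ids are placeholders
payload = [
    {
        "type": "H5T_STD_I32LE",
        "shape": [100, 100],
        "link": {"id": "g-<parent-group-id>", "name": "dset1"},
    },
    {
        "type": "H5T_IEEE_F64LE",
        "shape": [256],
        "link": {"id": "g-<parent-group-id>", "name": "dset2"},
    },
]
print(json.dumps(payload, indent=2))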
_updateInitValuesList(kwargs) + log.debug(f"kwargs for dataset create: {kwargs}") + + if post_rsp is None: + # Handle cases other than multi ctype create here + post_rsp = await createDataset(app, **kwargs) + + log.debug(f"returning resp: {post_rsp}") + + if "objects" in post_rsp: + # add any links in multi request + objects = post_rsp["objects"] + obj_count = len(objects) + log.debug(f"Post dataset multi create: {obj_count} objects") + if len(body) != obj_count: + msg = f"Expected {obj_count} objects but got {len(body)}" log.warn(msg) raise HTTPBadRequest(reason=msg) - link_body = body["link"] - if "id" in link_body: - parent_id = link_body["id"] - if "name" in link_body: - link_title = link_body["name"] - try: - # will throw exception if there's a slash in the name - validateLinkName(link_title) - except ValueError: - msg = f"invalid link title: {link_title}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if parent_id and link_title: - log.debug(f"parent id: {parent_id}, link_title: {link_title}") - h5path = link_title # just use the link name as the h5path - - if "h5path" in body: - h5path = body["h5path"] - if "parent_id" not in body: - parent_id = root_id - else: - parent_id = body["parent_id"] - - # setup args to createObject - kwargs = {"bucket": bucket, "obj_type": datatype, "obj_shape": shape_json} - if creationProperties: - kwargs["creation_props"] = creationProperties - if layout: - kwargs["layout"] = layout - - if parent_id: - kwargs["parent_id"] = parent_id - kwargs["h5path"] = h5path - # allow parent group creation or not - implicit = getBooleanParam(params, "implicit") - if implicit: - kwargs["implicit"] = True - dset_json = await createObjectByPath(app, **kwargs) else: - # create an anonymous datatype - kwargs["root_id"] = root_id - dset_json = await createObject(app, **kwargs) + obj_count = 1 # single object create + objects = [post_rsp, ] # treat as an array to make the following code more consistent + + if len(init_values) != obj_count: + msg = f"Expected {obj_count} init values" + log.error(msg) + raise HTTPInternalServerError() + + # write any init data values + init_chunks = {} + for index in range(obj_count): + init_data = init_values[index] + if init_data is None: + continue # no data to initialize + log.debug(f"init data: {init_data}") + dset_json = objects[index] + dset_id = dset_json["id"] + log.debug(f"init value, post_rsp: {dset_json}") + layout_class = getDatasetLayoutClass(dset_json) + log.debug(f"layout_class: {layout_class}") + if layout_class not in ("H5D_CONTIGUOUS", "H5D_CHUNKED"): + msg = f"dataset init_data used with unsupported layout_class: {layout_class}" + log.error(msg) + raise HTTPInternalServerError() + layout_dims = getChunkDims(dset_json) + log.debug(f"init data layout is: {layout_dims}") + # make selection for entire dataspace + dims = getShapeDims(dset_json["shape"]) + slices = getHyperslabSelection(dims) + + chunk_ids = getChunkIds(dset_id, slices, layout_dims) + log.debug(f"init data, got chunk_ids: {chunk_ids}") + if not chunk_ids or len(chunk_ids) != 1: + msg = "expected one chunk for init_data but got: {chunk_ids}" + log.error(msg) + raise HTTPInternalServerError() + chunk_id = chunk_ids[0] + shape_json = dset_json["shape"] + type_json = dset_json["type"] + arr_dtype = createDataType(type_json) + dims = getShapeDims(shape_json) + try: + input_arr = jsonToArray(dims, arr_dtype, init_data) + except ValueError: + log.warn(f"ValueError: {msg}") + raise HTTPBadRequest(reason=msg) + except TypeError: + log.warn(f"TypeError: {msg}") + 
raise HTTPBadRequest(reason=msg) + except IndexError: + log.warn(f"IndexError: {msg}") + raise HTTPBadRequest(reason=msg) + log.debug(f"got json arr: {input_arr.shape}") + init_chunks[chunk_id] = input_arr + + if init_chunks: + # write dataset init values using the Domain Crawler + log.debug(f"POST dataset - setting init values: {list(init_chunks.keys())}") + kwargs = {"action": "put_data", "bucket": bucket} + + crawler = DomainCrawler(app, init_chunks, **kwargs) + + # will raise exception on not found, server busy, etc. + await crawler.crawl() + status = crawler.get_status() + log.info(f"DomainCrawler done for put_data action, status: {status}") + + if "objects" in post_rsp: + # add any links in multi request + objects = post_rsp["objects"] + obj_count = len(objects) + log.debug(f"Post datatype multi create: {obj_count} objects") + if len(body) != obj_count: + msg = f"Expected {obj_count} objects but got {len(body)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + parent_ids = {} + for index in range(obj_count): + item = body[index] + if "link" in item: + link_item = item["link"] + parent_id = link_item.get("id") + title = link_item.get("name") + if parent_id and title: + # add a hard link + object = objects[index] + obj_id = object["id"] + if parent_id not in parent_ids: + parent_ids[parent_id] = {} + links = parent_ids[parent_id] + links[title] = {"id": obj_id} + if parent_ids: + log.debug(f"POST dataset multi - adding links: {parent_ids}") + kwargs = {"action": "put_link", "bucket": bucket} + kwargs["replace"] = True + + crawler = DomainCrawler(app, parent_ids, **kwargs) + + # will raise exception on not found, server busy, etc. + await crawler.crawl() + status = crawler.get_status() + + log.info(f"DomainCrawler done for put_links action, status: {status}") # dataset creation successful - resp = await jsonResponse(request, dset_json, status=201) + resp = await jsonResponse(request, post_rsp, status=201) log.response(request, resp=resp) return resp @@ -1169,7 +721,7 @@ async def DELETE_Dataset(request): msg = "Missing dataset id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(dset_id, "Dataset"): + if not isValidUuid(dset_id, obj_class="datasets"): msg = f"Invalid dataset id: {dset_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/folder_crawl.py b/hsds/folder_crawl.py index 48f37ce6..f1b3fcf2 100644 --- a/hsds/folder_crawl.py +++ b/hsds/folder_crawl.py @@ -13,14 +13,15 @@ # service node of hsds cluster # -import time import asyncio from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound from aiohttp.web_exceptions import HTTPGone, HTTPInternalServerError from aiohttp.web_exceptions import HTTPServiceUnavailable -from .util.idUtil import getNodeCount +from h5json.time_util import getNow from .servicenode_lib import getObjectJson, getDomainResponse, getDomainJson +from .util.nodeUtil import getNodeCount + from . import hsds_logger as log @@ -50,6 +51,9 @@ def __init__( else: self._max_tasks = len(domains) + def now(self): + return getNow(app=self._app) + async def crawl(self): workers = [asyncio.Task(self.work()) for _ in range(self._max_tasks)] # When all work is done, exit. 
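The link step in the multi-create handling above boils down to one piece of bookkeeping: bucket the requested hard links by parent group id into the {parent_id: {title: {"id": obj_id}}} shape that the DomainCrawler put_link action consumes. A distilled, standalone version (hypothetical helper name, toy ids):

def collect_links(body_items, created_objects):
    # pair each request item with the object created for it and bucket the new
    # hard links by parent group id
    parent_ids = {}
    for item, obj in zip(body_items, created_objects):
        link_item = item.get("link")
        if not link_item:
            continue
        parent_id = link_item.get("id")
        title = link_item.get("name")
        if parent_id and title:
            parent_ids.setdefault(parent_id, {})[title] = {"id": obj["id"]}
    return parent_ids

items = [{"link": {"id": "g-123", "name": "dset1"}}, {"link": {"id": "g-123", "name": "dset2"}}]
objs = [{"id": "d-aaa"}, {"id": "d-bbb"}]
print(collect_links(items, objs))
# {'g-123': {'dset1': {'id': 'd-aaa'}, 'dset2': {'id': 'd-bbb'}}}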
@@ -67,11 +71,11 @@ async def crawl(self): async def work(self): while True: - start = time.time() + start = self.now() domain = await self._q.get() await self.fetch(domain) self._q.task_done() - elapsed = time.time() - start + elapsed = self.now() - start msg = f"FolderCrawler - task {domain} start: {start:.3f} " msg += f"elapsed: {elapsed:.3f}" log.debug(msg) diff --git a/hsds/group_dn.py b/hsds/group_dn.py index 0a6bb937..69fa35d4 100755 --- a/hsds/group_dn.py +++ b/hsds/group_dn.py @@ -19,9 +19,10 @@ from aiohttp.web_exceptions import HTTPNotFound, HTTPServiceUnavailable from aiohttp.web import json_response -from .util.idUtil import isValidUuid, isSchema2Id, isRootObjId, getRootObjId +from h5json.objid import isValidUuid, isSchema2Id, isRootObjId, getRootObjId +from h5json.time_util import getNow + from .util.domainUtil import isValidBucketName -from .util.timeUtil import getNow from .datanode_lib import get_obj_id, check_metadata_obj, get_metadata_obj from .datanode_lib import save_metadata_obj, delete_metadata_obj from . import hsds_logger as log @@ -46,7 +47,7 @@ async def GET_Group(request): log.info(f"GET group: {group_id} bucket: {bucket}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() @@ -97,9 +98,13 @@ async def POST_Group(request): raise HTTPBadRequest(reason=msg) group_id = get_obj_id(request, body=body) + deleted_ids = app["deleted_ids"] + if group_id in deleted_ids: + log.warn(f"POST Group has id: {group_id} that has previously been deleted") + deleted_ids.remove(group_id) - log.info(f"POST group: {group_id} bucket: {bucket}") - if not isValidUuid(group_id, obj_class="group"): + log.info(f"POST group: {group_id} bucket: {bucket} body: {body}") + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() if "root" not in body: @@ -110,12 +115,13 @@ async def POST_Group(request): # verify the id doesn't already exist obj_found = await check_metadata_obj(app, group_id, bucket=bucket) if obj_found: - log.error(f"Post with existing group_id: {group_id}") - raise HTTPInternalServerError() + msg = f"Post with existing group_id: {group_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) root_id = body["root"] - if not isValidUuid(root_id, obj_class="group"): + if not isValidUuid(root_id, obj_class="groups"): msg = "Invalid root_id: " + root_id log.error(msg) raise HTTPInternalServerError() @@ -123,13 +129,27 @@ async def POST_Group(request): # ok - all set, create group obj now = getNow(app) + if "attributes" in body: + # initialize attributes + attrs = body["attributes"] + log.debug(f"POST Group with attributes: {attrs}") + else: + attrs = {} + + if "links" in body: + # initialize links + links = body["links"] + log.debug(f"POST Group with links: {links}") + else: + links = {} + group_json = { "id": group_id, "root": root_id, "created": now, "lastModified": now, - "links": {}, - "attributes": {}, + "links": links, + "attributes": attrs, } if "creationProperties" in body: @@ -144,8 +164,8 @@ async def POST_Group(request): resp_json["root"] = root_id resp_json["created"] = group_json["created"] resp_json["lastModified"] = group_json["lastModified"] - resp_json["linkCount"] = 0 - resp_json["attributeCount"] = 0 + resp_json["linkCount"] = len(links) + resp_json["attributeCount"] = len(attrs) resp = json_response(resp_json, status=201) log.response(request, resp=resp) @@ -178,7 
+198,7 @@ async def PUT_Group(request): # don't really need bucket param since the dirty ids know which bucket # they should write too - if not isValidUuid(root_id, obj_class="group"): + if not isValidUuid(root_id, obj_class="groups"): log.error(f"Unexpected group_id: {root_id}") raise HTTPInternalServerError() @@ -247,7 +267,7 @@ async def DELETE_Group(request): params = request.rel_url.query group_id = get_obj_id(request) - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() diff --git a/hsds/group_sn.py b/hsds/group_sn.py index 2b573985..991b50bd 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -16,17 +16,19 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound from json import JSONDecodeError +from h5json.objid import isValidUuid + from .util.httpUtil import getHref, jsonResponse, getBooleanParam -from .util.idUtil import isValidUuid from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, getPathForDomain, verifyRoot -from .util.linkUtil import validateLinkName from .servicenode_lib import getDomainJson, getObjectJson, validateAction -from .servicenode_lib import getObjectIdByPath, getPathForObjectId -from .servicenode_lib import createObject, createObjectByPath, deleteObject +from .servicenode_lib import getObjectIdByPath, getPathForObjectId, deleteObject +from .servicenode_lib import getCreateArgs, createGroup from . import hsds_logger as log +from .post_crawl import createGroups +from .domain_crawl import DomainCrawler async def GET_Group(request): @@ -50,7 +52,7 @@ async def GET_Group(request): if group_id: log.info(f"GET_Group, id: {group_id}") # is the id a group id and not something else? 
- if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -97,7 +99,7 @@ async def GET_Group(request): kwargs = {"bucket": bucket, "domain": domain} group_id, domain, obj_json = await getObjectIdByPath(app, group_id, h5path, **kwargs) - if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"No group exist with the path: {h5path}" log.warn(msg) raise HTTPNotFound() @@ -173,6 +175,7 @@ async def POST_Group(request): bucket = getBucketForDomain(domain) domain_json = await getDomainJson(app, domain, reload=True) + log.debug(f"got domain_json: {domain_json}") # throws exception if not allowed aclCheck(app, domain_json, "create", username) @@ -182,11 +185,8 @@ async def POST_Group(request): # allow parent group creation or not implicit = getBooleanParam(params, "implicit") - - parent_id = None - h5path = None - creation_props = None - + kwargs = {} + post_rsp = None if request.has_body: try: body = await request.json() @@ -197,55 +197,92 @@ async def POST_Group(request): log.info(f"POST Group body: {body}") if body: - if "link" in body: - if "h5path" in body: - msg = "link can't be used with h5path" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - link_body = body["link"] - log.debug(f"link_body: {link_body}") - if "id" in link_body: - parent_id = link_body["id"] - if "name" in link_body: - link_title = link_body["name"] - try: - # will throw exception if there's a slash in the name - validateLinkName(link_title) - except ValueError: - msg = f"invalid link title: {link_title}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if parent_id and link_title: - log.debug(f"parent id: {parent_id}, link_title: {link_title}") - h5path = link_title # just use the link name as the h5path - - if "h5path" in body: - h5path = body["h5path"] - if "parent_id" not in body: - parent_id = root_id + if isinstance(body, list): + count = len(body) + log.debug(f"multiple group create: {count} items") + if count == 0: + # equivalent to no body, anonymous group case + kwargs = {"root_id": root_id, "bucket": bucket} + elif count == 1: + # just create one object in typical way + kwargs = getCreateArgs(body[0], + root_id=root_id, + bucket=bucket, + implicit=implicit) else: - parent_id = body["parent_id"] - if "creationProperties" in body: - creation_props = body["creationProperties"] - - if parent_id: - kwargs = {"bucket": bucket, "parent_id": parent_id, "h5path": h5path} - if creation_props: - kwargs["creation_props"] = creation_props - if implicit: - kwargs["implicit"] = True - group_json = await createObjectByPath(app, **kwargs) + # create multiple group objects + kwarg_list = [] # list of kwargs for each object + + for item in body: + log.debug(f"item: {item}") + if not isinstance(item, dict): + msg = f"PostGroup - invalid item type: {type(item)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs = getCreateArgs(item, root_id=root_id, bucket=bucket) + kwargs["ignore_link"] = True + kwarg_list.append(kwargs) + kwargs = {"bucket": bucket, "root_id": root_id} + post_rsp = await createGroups(app, kwarg_list, **kwargs) + else: + kwargs = getCreateArgs(body, root_id=root_id, bucket=bucket, implicit=implicit) + else: + kwargs["root_id"] = root_id + kwargs["bucket"] = bucket else: - # create an anonymous group - kwargs = {"bucket": bucket, "root_id": root_id} - if creation_props: - kwargs["creation_props"] = creation_props - group_json = await 
createObject(app, **kwargs) + kwargs = {"root_id": root_id, "bucket": bucket} + + if post_rsp is None: + log.debug(f"post_rsp is None, call createGroup with kwargs: {kwargs}") + # Handle cases other than multi-group create here + if "type" in kwargs: + msg = "type key is not allowed for Group creation" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + post_rsp = await createGroup(app, **kwargs) + + log.debug(f"returning resp: {post_rsp}") + + if "objects" in post_rsp: + # add any links in multi request + objects = post_rsp["objects"] + obj_count = len(objects) + log.debug(f"Post group multi create: {obj_count} objects") + if len(body) != obj_count: + msg = f"Expected {obj_count} objects but got {len(body)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + parent_ids = {} + for index in range(obj_count): + item = body[index] + if "link" in item: + link_item = item["link"] + parent_id = link_item.get("id") + title = link_item.get("name") + if parent_id and title: + # add a hard link + object = objects[index] + obj_id = object["id"] + if parent_id not in parent_ids: + parent_ids[parent_id] = {} + links = parent_ids[parent_id] + links[title] = {"id": obj_id} + if parent_ids: + log.debug(f"POST group multi - adding links: {parent_ids}") + kwargs = {"action": "put_link", "bucket": bucket} + kwargs["replace"] = True + + crawler = DomainCrawler(app, parent_ids, **kwargs) + + # will raise exception on not found, server busy, etc. + await crawler.crawl() + + status = crawler.get_status() + + log.info(f"DomainCrawler done for put_links action, status: {status}") - log.debug(f"returning resp: {group_json}") # group creation successful - resp = await jsonResponse(request, group_json, status=201) + resp = await jsonResponse(request, post_rsp, status=201) log.response(request, resp=resp) return resp @@ -260,7 +297,7 @@ async def DELETE_Group(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, "Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/headnode.py b/hsds/headnode.py index 9b49517d..41501d00 100755 --- a/hsds/headnode.py +++ b/hsds/headnode.py @@ -15,14 +15,13 @@ import asyncio import os -import time from aiohttp.web import Application, StreamResponse, run_app, json_response from aiohttp.web_exceptions import HTTPBadRequest, HTTPInternalServerError +from h5json.time_util import unixTimeToUTC, elapsedTime, getNow from . import config -from .util.timeUtil import unixTimeToUTC, elapsedTime -from .util.idUtil import createNodeId +from .util.nodeUtil import createNodeId from . 
import hsds_logger as log from .util import query_marathon as marathonClient @@ -46,7 +45,7 @@ def __init__(self, node_id=None, node_type=None, node_host=None, node_port=None) self._type = node_type self._host = node_host self._port = node_port - now = time.time() + now = getNow() self._create_time = now self._last_poll = now self._stats = {} @@ -87,13 +86,13 @@ def get_info(self): return info def poll_update(self): - now = time.time() + now = getNow() self._last_poll = now def is_healthy(self): sleep_sec = int(config.get("node_sleep_time")) - now = time.time() + now = getNow() if now - self._last_poll < sleep_sec * 2: return True else: @@ -301,7 +300,7 @@ async def register(request): answer["dn_ids"] = dn_ids answer["req_ip"] = node_host log.debug(f"register returning: {answer}") - app["last_health_check"] = int(time.time()) + app["last_health_check"] = int(getNow()) resp = json_response(answer) log.response(request, resp=resp) @@ -475,7 +474,7 @@ async def init(): app["nodes"] = nodes app["dead_node_ids"] = set() - app["start_time"] = int(time.time()) # seconds after epoch + app["start_time"] = int(getNow()) # seconds after epoch app["last_health_check"] = 0 app["max_task_count"] = config.get("max_task_count") app.router.add_get("/", info) diff --git a/hsds/hsds_app.py b/hsds/hsds_app.py index e690b68d..d0d25d6d 100644 --- a/hsds/hsds_app.py +++ b/hsds/hsds_app.py @@ -3,12 +3,14 @@ from pathlib import Path import site import subprocess -import time import queue import threading +import time import logging from shutil import which +from h5json.time_util import getNow + def _enqueue_output(out, queue, loglevel): try: @@ -318,7 +320,7 @@ def run(self): self._threads.append(t) # wait to sockets are initialized - start_ts = time.time() + start_ts = getNow() SLEEP_TIME = 1 # time to sleep between checking on socket connection MAX_INIT_TIME = 10.0 # max time to wait for socket to be initialized @@ -329,7 +331,7 @@ def run(self): if os.path.exists(socket_path): ready += 1 else: - if time.time() > start_ts + 5: + if getNow() > start_ts + 5: # TBD - put a real ready check here ready = count if ready == count: @@ -339,12 +341,12 @@ def run(self): self.log.debug(f"{ready}/{count} ready") self.log.debug(f"sleeping for {SLEEP_TIME}") time.sleep(SLEEP_TIME) - if time.time() > start_ts + MAX_INIT_TIME: + if getNow() > start_ts + MAX_INIT_TIME: msg = f"failed to initialize after {MAX_INIT_TIME} seconds" self.log.error(msg) raise IOError(msg) - self.log.info(f"Ready after: {(time.time() - start_ts):4.2f} s") + self.log.info(f"Ready after: {(getNow() - start_ts):4.2f} s") self._ready = True def stop(self): @@ -352,7 +354,7 @@ def stop(self): if not self._processes: return - now = time.time() + now = getNow() logging.info(f"hsds app stop at {now}") for pname in self._processes: @@ -363,7 +365,7 @@ def stop(self): # wait for sub-proccesses to exit SLEEP_TIME = 0.1 # time to sleep between checking on process state MAX_WAIT_TIME = 10.0 # max time to wait for sub-process to terminate - start_ts = time.time() + start_ts = getNow() while True: is_alive_cnt = 0 for pname in self._processes: @@ -380,7 +382,7 @@ def stop(self): else: logging.debug("all subprocesses exited") break - if time.time() > start_ts + MAX_WAIT_TIME: + if getNow() > start_ts + MAX_WAIT_TIME: msg = f"failed to terminate after {MAX_WAIT_TIME} seconds" self.log.error(msg) break diff --git a/hsds/link_dn.py b/hsds/link_dn.py index f7ec5956..ef1c0438 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -20,12 +20,14 @@ from aiohttp.web_exceptions 
import HTTPInternalServerError from aiohttp.web import json_response -from .util.idUtil import isValidUuid +from h5json.objid import isValidUuid +from h5json.time_util import getNow +from h5json.link_util import validateLinkName, getLinkClass, isEqualLink + from .util.globparser import globmatch -from .util.linkUtil import validateLinkName, getLinkClass, isEqualLink from .util.domainUtil import isValidBucketName -from .util.timeUtil import getNow from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj +from . import config from . import hsds_logger as log @@ -74,7 +76,7 @@ async def GET_Links(request): log.debug(f"GET_Links params: {params}") group_id = get_obj_id(request) log.info(f"GET links: {group_id}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() @@ -154,6 +156,10 @@ async def GET_Links(request): link = copy(link_dict[title]) log.debug(f"link list[{i}: {link}") link["title"] = title + if link.get("h5domain"): + # deprecated key, replace with file + link["file"] = link["h5domain"] + del link["h5domain"] link_list.append(link) resp_json = {"links": link_list} @@ -170,7 +176,7 @@ async def POST_Links(request): group_id = get_obj_id(request) log.info(f"POST_Links: {group_id}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() @@ -216,6 +222,7 @@ async def POST_Links(request): log.info(f"Link name {title} not found in group: {group_id}") continue link_json = links[title] + log.debug(f"POST Links got link_json: {link_json}") item = {} if "class" not in link_json: log.warn(f"expected to find class key for link: {title}") @@ -243,15 +250,19 @@ async def POST_Links(request): log.warn(f"expected to find h5path for external link: {title}") continue item["h5path"] = link_json["h5path"] - if "h5domain" not in link_json: - log.warn(f"expted to find h5domain for external link: {title}") + if "h5domain" in link_json: + item["file"] = link_json["h5domain"] + elif "file" in link_json: + item["file"] = link_json["file"] + else: + log.warn(f"expected to find h5domain or file for external link: {title}") continue - item["h5domain"] = link_json["h5domain"] else: log.warn(f"unexpected to link class {link_class} for link: {title}") continue item["title"] = title + log.debug(f"adding link item: {item}") link_list.append(item) @@ -284,8 +295,10 @@ async def PUT_Links(request): params = request.rel_url.query group_id = get_obj_id(request) log.info(f"PUT links: {group_id}") + now = getNow(app) + max_timestamp_drift = int(config.get("max_timestamp_drift", default=300)) - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() @@ -364,11 +377,16 @@ async def PUT_Links(request): link_delete_set = deleted_links[group_id] else: link_delete_set = set() - - create_time = getNow(app) - for title in new_links: item = items[title] + if item.get("created"): + create_time = item["created"] + log.debug(f"link {title} has create time: {create_time}") + if abs(create_time - now) > max_timestamp_drift: + log.warn(f"link {title} create time stale, ignoring") + create_time = now + else: + create_time = now item["created"] = create_time links[title] = item log.debug(f"added link {title}: {item}") @@ -377,8 +395,7 @@ async def 
PUT_Links(request): if new_links: # update the group lastModified - group_json["lastModified"] = create_time - log.debug(f"tbd: group_json: {group_json}") + group_json["lastModified"] = now # write back to S3, save to metadata cache await save_metadata_obj(app, group_id, group_json, bucket=bucket) @@ -405,7 +422,7 @@ async def DELETE_Links(request): group_id = get_obj_id(request) log.info(f"DELETE links: {group_id}") - if not isValidUuid(group_id, obj_class="group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Unexpected group_id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 71e39246..a7dcc6a0 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -13,21 +13,27 @@ # service node of hsds cluster # -from aiohttp.web_exceptions import HTTPBadRequest +from aiohttp.web_exceptions import HTTPBadRequest, HTTPInternalServerError from json import JSONDecodeError +from h5json.objid import isValidUuid, getCollectionForId +from h5json.link_util import validateLinkName, getLinkClass, getLinkId +from h5json.link_util import getLinkPath, getLinkFilePath + +from .util.nodeUtil import getDataNodeUrl from .util.httpUtil import getHref, getBooleanParam from .util.httpUtil import jsonResponse from .util.globparser import globmatch -from .util.idUtil import isValidUuid, getDataNodeUrl, getCollectionForId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword -from .util.domainUtil import getDomainFromRequest, isValidDomain, verifyRoot -from .util.domainUtil import getBucketForDomain -from .util.linkUtil import validateLinkName, getLinkClass +from .util.domainUtil import getDomainFromRequest, isValidDomain, verifyRoot, getBucketForDomain +from .util.linkUtil import getRequestLink + + from .servicenode_lib import getDomainJson, validateAction from .servicenode_lib import getLink, putLink, putLinks, getLinks, deleteLinks from .domain_crawl import DomainCrawler from . import hsds_logger as log +from . 
import config async def GET_Links(request): @@ -43,7 +49,7 @@ async def GET_Links(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, obj_class="Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -140,7 +146,15 @@ async def GET_Links(request): # mix in collection key, target and hrefs for link in links: + for key in ("class", "title"): + if key not in link: + log.error(f"expected to find {key} key in link") + raise HTTPInternalServerError() + if link["class"] == "H5L_TYPE_HARD": + if "id" not in link: + log.error("expected to id key in hard link") + raise HTTPInternalServerError() collection_name = getCollectionForId(link["id"]) link["collection"] = collection_name target_uri = "/" + collection_name + "/" + link["id"] @@ -175,7 +189,7 @@ async def GET_Link(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, obj_class="Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -211,13 +225,13 @@ async def GET_Link(request): link_class = link_json["class"] resp_link["class"] = link_class if link_class == "H5L_TYPE_HARD": - resp_link["id"] = link_json["id"] + resp_link["id"] = getLinkId(link_json) resp_link["collection"] = getCollectionForId(link_json["id"]) elif link_class == "H5L_TYPE_SOFT": - resp_link["h5path"] = link_json["h5path"] + resp_link["h5path"] = getLinkPath(link_json) elif link_class == "H5L_TYPE_EXTERNAL": - resp_link["h5path"] = link_json["h5path"] - resp_link["h5domain"] = link_json["h5domain"] + resp_link["h5path"] = getLinkPath(link_json) + resp_link["file"] = getLinkFilePath(link_json) else: log.warn(f"Unexpected link class: {link_class}") resp_json = {} @@ -281,14 +295,32 @@ async def PUT_Link(request): msg = f"Invalid domain: {domain}" log.warn(msg) raise HTTPBadRequest(reason=msg) - bucket = getBucketForDomain(domain) await validateAction(app, domain, group_id, username, "create") - # putLink will validate these arguments - kwargs = {"bucket": bucket} - kwargs["tgt_id"] = body.get("id") - kwargs["h5path"] = body.get("h5path") - kwargs["h5domain"] = body.get("h5domain") + + predate_max_time = config.get("predate_max_time", default=10.0) + + try: + link_json = getRequestLink(link_title, body, predate_max_time=predate_max_time) + except (KeyError, TypeError, ValueError) as e: + raise HTTPBadRequest(reason=str(e)) + + link_class = getLinkClass(link_json) + + kwargs = {} + kwargs["bucket"] = getBucketForDomain(domain) + if link_class == "H5L_TYPE_HARD": + kwargs["tgt_id"] = getLinkId(link_json) + elif link_class == "H5L_TYPE_SOFT": + kwargs["h5path"] = getLinkPath(link_json) + elif link_class == "H5L_TYPE_EXTERNAL": + kwargs["h5path"] = getLinkPath(link_json) + kwargs["h5domain"] = getLinkFilePath(link_json) + else: + raise HTTPBadRequest(reason=f"unexpected link class: {link_class}") + + if "created" in link_json: + kwargs["created"] = link_json["created"] status = await putLink(app, group_id, link_title, **kwargs) @@ -327,6 +359,16 @@ async def PUT_Links(request): log.warn(msg) raise HTTPBadRequest(reason=msg) + if not body: + msg = "PUT links with empty body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if not isinstance(body, dict): + msg = f"PUT links expected dictionary body but got: {type(body)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + 
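The per-class dispatch in PUT_Link above (hard links carry a target id, soft links an h5path, external links an h5path plus a file/h5domain) can be summarized with plain dicts. This sketch assumes the link JSON uses the keys shown in this hunk; the real handler resolves them through the h5json link_util helpers (getLinkId, getLinkPath, getLinkFilePath) rather than direct key access.

def link_kwargs(link_json, bucket):
    # map a validated link item onto the keyword arguments putLink() expects
    kwargs = {"bucket": bucket}
    link_class = link_json["class"]
    if link_class == "H5L_TYPE_HARD":
        kwargs["tgt_id"] = link_json["id"]
    elif link_class == "H5L_TYPE_SOFT":
        kwargs["h5path"] = link_json["h5path"]
    elif link_class == "H5L_TYPE_EXTERNAL":
        kwargs["h5path"] = link_json["h5path"]
        kwargs["h5domain"] = link_json["file"]  # "file" supersedes the deprecated "h5domain" key
    else:
        raise ValueError(f"unexpected link class: {link_class}")
    if "created" in link_json:
        kwargs["created"] = link_json["created"]  # pre-dated timestamp, subject to drift checks
    return kwargs

example = {"class": "H5L_TYPE_EXTERNAL", "h5path": "/g1/dset1", "file": "/shared/other_domain"}
print(link_kwargs(example, bucket="hsds-bucket"))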
domain = getDomainFromRequest(request) if not isValidDomain(domain): msg = f"Invalid domain: {domain}" @@ -422,6 +464,7 @@ async def PUT_Links(request): link_item = link_items[title] getLinkClass(link_item) except ValueError: + log.warn(f"invalid link for {title}: {link_item}") raise HTTPBadRequest(reason="invalid link item") grp_ids[grp_id] = link_items @@ -447,7 +490,7 @@ async def PUT_Links(request): count = len(grp_ids) if count == 0: msg = "no grp_ids defined" - log.warn(f"PUT_Attributes: {msg}") + log.warn(f"PUT_Links: {msg}") raise HTTPBadRequest(reason=msg) elif count == 1: # just send one PUT Attributes request to the dn @@ -493,7 +536,7 @@ async def DELETE_Links(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, obj_class="Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -640,7 +683,7 @@ async def POST_Links(request): # do a check that everything is as it should with the item list for group_id in items: - if not isValidUuid(group_id, obj_class="Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) @@ -747,7 +790,7 @@ async def DELETE_Link(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, obj_class="Group"): + if not isValidUuid(group_id, obj_class="groups"): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) diff --git a/hsds/post_crawl.py b/hsds/post_crawl.py new file mode 100644 index 00000000..2c79b47e --- /dev/null +++ b/hsds/post_crawl.py @@ -0,0 +1,288 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# post crawler +# + +import asyncio + +from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPConflict, HTTPBadRequest +from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone + +from .util.httpUtil import isOK +from .servicenode_lib import createObject +from . 
import hsds_logger as log + + +class PostCrawler: + def __init__( + self, + app, + items=None, + root_id=None, + bucket=None, + max_tasks=40, + ignore_error=False + ): + log.info("PostCrawler.__init__") + self._app = app + self._root_id = root_id + self._bucket = bucket + self._max_tasks = max_tasks + self._ignore_error = ignore_error + + if not items: + log.error("no post requests for crawler to crawl!") + raise ValueError() + if not bucket: + log.error("bucket not set for PostCrawler") + raise ValueError() + self._count = len(items) + self._items = items + self._rsp_objs = [None,] * self._count + self._q = asyncio.Queue() + log.debug(f"PostCrawler adding index 0 - {self._count} to queue") + for i in range(self._count): + self._q.put_nowait(i) + + def get_rsp_objs(self): + """ return list of object responses """ + + return self._rsp_objs + + def get_status(self): + """ return the highest status of any of the returned objects """ + status = None + for i in range(self._count): + item = self._rsp_objs[i] + if not item: + continue # resp not filled in yet + if "status_code" in item: + item_status = item["status_code"] + if status is None or item_status > status: + # return the more severe error + log.debug(f"setting status to {item_status}") + status = item_status + elif "id" in item: + # post request succeeded + if status is None: + status = 201 + else: + log.error(f"PostCrawler unexpected response for item {i}: {item}") + status = 500 + + return status + + async def crawl(self): + max_tasks = min(self._max_tasks, self._count) + workers = [asyncio.Task(self.work()) for _ in range(max_tasks)] + # When all work is done, exit. + msg = "PostCrawler - await queue.join - " + msg += f"count: {self._count} with {max_tasks} workers" + log.info(msg) + await self._q.join() + msg = "PostCrawler - join complete - " + msg += f"count: {self._count}" + log.info(msg) + + for w in workers: + w.cancel() + log.debug("PostCrawler - workers canceled") + + status = self.get_status() + if status: + log.debug(f"PostCrawler -- status: {status}") + log.debug(f"ignore_error: {self._ignore_error}") + if not self._ignore_error: + # throw the appropriate exception if other than 200, 201 + if isOK(status): + pass # ok + elif status == 400: + log.warn("PostCrawler - BadRequest") + raise HTTPBadRequest(reason="unknown") + elif status == 404: + log.warn("PostCrawler - not found") + raise HTTPNotFound() + elif status == 409: + log.warn("PostCrawler - conflict") + raise HTTPConflict() + elif status == 410: + log.warn("PostCrawler - gone") + raise HTTPGone() + elif status == 500: + log.error("PostCrawler - internal server error") + raise HTTPInternalServerError() + elif status == 503: + log.error("PostCrawler - server busy") + raise HTTPServiceUnavailable() + else: + log.error(f"PostCrawler - unexpected status: {status}") + raise HTTPInternalServerError() + else: + # no tasks returned anything + log.error("PostCrawler - no results returned") + if not self._ignore_error: + raise HTTPInternalServerError() + + async def work(self): + while True: + index = await self._q.get() + await self.create(index) + self._q.task_done() + + async def create(self, index): + log.debug(f"PostCrawler fetch for index: {index}") + item = self._items[index] + log.debug(f"got item: {item}") + kwargs = {"bucket": self._bucket} + + if "obj_id" in item: + kwargs["obj_id"] = item["obj_id"] + if "type" in item: + kwargs["type"] = item["type"] + if "shape" in item: + kwargs["shape"] = item["shape"] + if "layout" in item: + kwargs["layout"] = item["layout"] + 
if "creation_props" in item: + kwargs["creation_props"] = item["creation_props"] + if "attrs" in item: + kwargs["attrs"] = item["attrs"] + if "parent_id" in item: + kwargs["parent_id"] = item["parent_id"] + elif "root_id" in item: + kwargs["root_id"] = item["root_id"] + if "h5path" in item: + kwargs["h5path"] = item["h5path"] + if "links" in item: + kwargs["links"] = item["links"] + + log.debug(f"PostCrawler index {index} kwargs: {kwargs}") + rsp_json = None + try: + rsp_json = await createObject(self._app, **kwargs) + except HTTPConflict: + log.warn("PostCrawler - got HTTPConflict from http_post") + rsp_json = {"status_code": 409} + except HTTPServiceUnavailable: + rsp_json = {"status_code": 503} + except HTTPInternalServerError: + rsp_json = {"status_code": 500} + except Exception as e: + log.error(f"unexpected exception {e}") + rsp_json = {"status_code": 500} + + log.info(f"PostCrawler - index: {index} post rsp: {rsp_json}") + + self._rsp_objs[index] = rsp_json + + +async def _createObjects(app, items: list, root_id=None, bucket=None): + """ generic create function """ + + post_crawler = PostCrawler(app, root_id=root_id, bucket=bucket, items=items) + await post_crawler.crawl() + if post_crawler.get_status() > 201: + msg = f"createGroups returning status from crawler: {post_crawler.get_status()}" + log.error(msg) + raise HTTPInternalServerError() + + obj_list = post_crawler.get_rsp_objs() + if not isinstance(obj_list, list): + msg = f"createGroups expected list but got: {type(obj_list)}" + log.error(msg) + raise HTTPInternalServerError() + return {"objects": obj_list} + + +async def createGroups(app, items: list, root_id=None, bucket=None): + """ create an group objects based on parameters in items list """ + + if not root_id: + msg = "no root_id given for createObjects" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + for item in items: + if not isinstance(item, dict): + msg = "expected list of dictionary objects for multi-object create" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if "type" in item: + msg = "type key not allowed for multi-group create" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if "shape" in item: + msg = "shape key not allowed for multi-group create" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + log.info(f"createGroups with {len(items)} items, root_id: {root_id}") + + rsp_json = await _createObjects(app, items=items, root_id=root_id, bucket=bucket) + return rsp_json + + +async def createDatatypeObjs(app, items: list, root_id=None, bucket=None): + """ create datatype objects based on parameters in items list """ + + if not root_id: + msg = "no root_id given for createDatatypeObjs" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + for item in items: + if not isinstance(item, dict): + msg = "expected list of dictionary objects for multi-object create" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if "type" not in item: + msg = "type key not provided for multi-datatype create" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if "shape" in item: + msg = "shape key not allowed for multi-datatype create" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + log.info(f"createDatatypes with {len(items)} items, root_id: {root_id}") + + rsp_json = await _createObjects(app, items=items, root_id=root_id, bucket=bucket) + return rsp_json + + +async def createDatasets(app, items: list, root_id=None, bucket=None): + """ create dataset objects based on parameters in items list """ + + if not root_id: + msg = "no root_id given for 
createDatatypeObjs" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + for item in items: + if not isinstance(item, dict): + msg = "expected list of dictionary objects for multi-object create" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if "type" not in item: + msg = "type key not provided for multi-dataset create" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if "shape" not in item: + msg = "shape key not provided for multi-dataset create" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + log.info(f"createDatasets with {len(items)} items, root_id: {root_id}") + + rsp_json = await _createObjects(app, items=items, root_id=root_id, bucket=bucket) + return rsp_json diff --git a/hsds/servicenode.py b/hsds/servicenode.py index 8a5ddaee..7ec4f5a3 100755 --- a/hsds/servicenode.py +++ b/hsds/servicenode.py @@ -14,9 +14,10 @@ # import asyncio -import time from aiohttp.web import run_app import aiohttp_cors +from h5json.time_util import getNow + from .util.lruCache import LruCache from .util.httpUtil import isUnixDomainUrl, bindToSocket, getPortFromUrl from .util.httpUtil import release_http_client, jsonResponse @@ -217,10 +218,10 @@ async def preStop(request): log.request(request) app = request.app - shutdown_start = time.time() + shutdown_start = getNow() log.warn(f"preStop request calling on_shutdown at {shutdown_start:.2f}") await on_shutdown(app) - shutdown_elapse_time = time.time() - shutdown_start + shutdown_elapse_time = getNow() - shutdown_start msg = f"shutdown took: {shutdown_elapse_time:.2f} seconds" if shutdown_elapse_time > 2.0: # 2.0 is the default grace period for kubernetes diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index c8c84f75..f20908bf 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -15,25 +15,38 @@ import asyncio import json +import numpy as np from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPGone, HTTPConflict from aiohttp.web_exceptions import HTTPNotFound, HTTPInternalServerError + from aiohttp.client_exceptions import ClientOSError, ClientError from aiohttp import ClientResponseError +from h5json.array_util import encodeData, decodeData, bytesToArray, bytesArrayToList +from h5json.array_util import jsonToArray +from h5json.objid import getCollectionForId, createObjId, getRootObjId +from h5json.objid import isSchema2Id, getS3Key, isValidUuid +from h5json.hdf5dtype import getBaseTypeJson, validateTypeItem, createDataType, getItemSize +from h5json.shape_util import getShapeDims, getShapeClass +from h5json.dset_util import getChunkSize, generateLayout +from h5json.dset_util import getDataSize, validateDatasetCreationProps +from h5json.link_util import h5Join, validateLinkName, getLinkClass, getLinkFilePath +from h5json.time_util import getNow + +from .util.nodeUtil import getDataNodeUrl from .util.authUtil import getAclKeys -from .util.arrayUtil import encodeData -from .util.idUtil import getDataNodeUrl, getCollectionForId, createObjId, getRootObjId -from .util.idUtil import isSchema2Id, getS3Key, isValidUuid -from .util.linkUtil import h5Join, validateLinkName, getLinkClass -from .util.storUtil import getStorJSONObj, isStorObj +from .util.linkUtil import getRequestLinks +from .util.storUtil import getStorJSONObj, isStorObj, getSupportedFilters from .util.authUtil import aclCheck from .util.httpUtil import http_get, http_put, http_post, http_delete from .util.domainUtil import getBucketForDomain, verifyRoot, getLimits +from .util.dsetUtil import getShapeJson from .util.storUtil import 
getCompressors -from .basenode import getVersion +from .basenode import getVersion from . import hsds_logger as log +from . import config async def getDomainJson(app, domain, reload=False): @@ -104,7 +117,7 @@ async def getDomainJson(app, domain, reload=False): return domain_json -async def getDomainResponse(app, domain_json, bucket=None, verbose=False): +async def getDomainResponse(app, domain_json, bucket=None, verbose=False, getobjs=False): """ construct JSON response for domain request """ rsp_json = {} if "root" in domain_json: @@ -177,6 +190,13 @@ async def getDomainResponse(app, domain_json, bucket=None, verbose=False): rsp_json["num_linked_chunks"] = num_linked_chunks rsp_json["md5_sum"] = md5_sum + if getobjs and "root" in domain_json: + root_id = domain_json["root"] + domain_objs = await getDomainObjs(app, root_id, bucket=bucket) + if domain_objs: + log.debug(f"returning {len(domain_objs)} for root_id: {root_id}") + rsp_json["domain_objs"] = domain_objs + # pass back config parameters the client may care about rsp_json["limits"] = getLimits() @@ -435,7 +455,12 @@ async def getLink(app, group_id, title, bucket=None): return link_json -async def putLink(app, group_id, title, tgt_id=None, h5path=None, h5domain=None, bucket=None): +async def putLink(app, group_id, title, + tgt_id=None, + h5path=None, + h5domain=None, + bucket=None, + created=None): """ create a new link. Return 201 if this is a new link, or 200 if it's a duplicate of an existing link. """ @@ -455,7 +480,9 @@ async def putLink(app, group_id, title, tgt_id=None, h5path=None, h5domain=None, if h5path: link_json["h5path"] = h5path if h5domain: - link_json["h5domain"] = h5domain + link_json["file"] = h5domain + if created: + link_json["created"] = created try: link_class = getLinkClass(link_json) @@ -525,8 +552,7 @@ async def putLinks(app, group_id, items, bucket=None): """ create a new links. Return 201 if any item is a new link, or 200 if it's a duplicate of an existing link. 
""" - isValidUuid(group_id, obj_class="group") - group_json = None + isValidUuid(group_id, obj_class="groups") # validate input for title in items: @@ -539,25 +565,23 @@ async def putLinks(app, group_id, items, bucket=None): raise HTTPBadRequest(reason="invalid link") if link_class == "H5L_TYPE_HARD": + if "id" not in item: + msg = "expected id key for hard link class" + log.warn(msg) + raise HTTPBadRequest(reason=msg) tgt_id = item["id"] - isValidUuid(tgt_id) - # for hard links, verify that the referenced id exists and is in - # this domain - ref_json = await getObjectJson(app, tgt_id, bucket=bucket) - if not group_json: - # just need to fetch this once - group_json = await getObjectJson(app, group_id, bucket=bucket) - if ref_json["root"] != group_json["root"]: - msg = "Hard link must reference an object in the same domain" + try: + isValidUuid(tgt_id) + except ValueError: + msg = f"invalid object id: {tgt_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) # ready to add links now req = getDataNodeUrl(app, group_id) req += "/groups/" + group_id + "/links" - log.debug(f"PUT links - PUT request: {req}") + log.debug(f"PUT links {len(items)} items - PUT request: {req}") params = {"bucket": bucket} - data = {"links": items} put_rsp = await http_put(app, req, data=data, params=params) @@ -636,7 +660,7 @@ async def getObjectIdByPath(app, obj_id, h5path, bucket=None, refresh=False, dom raise HTTPBadRequest(reason=msg) # find domain object is stored under - domain = link_json["h5domain"] + domain = getLinkFilePath(link_json) if domain.startswith("hdf5:/"): # strip off prefix @@ -833,8 +857,32 @@ async def getRootInfo(app, root_id, bucket=None): return info_json +async def getDomainObjs(app, root_id, bucket=None): + """ Return domain objects if available for this root id """ + log.debug(f"getDomainObjs {root_id}") + + s3_key = getS3Key(root_id) + + parts = s3_key.split("/") + # dset_key is in the format db//d//.dataset.json + # get the key for the root info object as: db//.summary.json + if len(parts) != 3: + log.error(f"Unexpected s3key format: {s3_key}") + return None + + summary_key = f"db/{parts[1]}/.summary.json" + + try: + summary_json = await getStorJSONObj(app, summary_key, bucket=bucket) + except HTTPNotFound: + log.warn(f".summary.json not found for key: {summary_key}") + return None + + return summary_json + + async def doFlush(app, root_id, bucket=None): - """return wnen all DN nodes have wrote any pending changes to S3""" + """return wnen all DN nodes have wrote any pending changes to S3 """ log.info(f"doFlush {root_id}") params = {"flush": 1} if bucket: @@ -886,6 +934,245 @@ async def doFlush(app, root_id, bucket=None): return dn_ids +async def getTypeFromRequest(app, body, obj_id=None, bucket=None): + """ return a type json from the request body """ + if "type" not in body: + msg = "expected type in body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + datatype = body["type"] + + if isinstance(datatype, str) and datatype.startswith("t-"): + # Committed type - fetch type json from DN + ctype_id = datatype + log.debug(f"got ctypeid: {ctype_id}") + ctype_json = await getObjectJson(app, ctype_id, bucket=bucket) + log.debug(f"ctype {ctype_id}: {ctype_json}") + root_id = getRootObjId(obj_id) + if ctype_json["root"] != root_id: + msg = "Referenced committed datatype must belong in same domain" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + datatype = ctype_json["type"] + # add the ctype_id to the type + datatype["id"] = ctype_id + elif isinstance(datatype, str): + try: + # 
convert predefined type string (e.g. "H5T_STD_I32LE") to + # corresponding json representation + datatype = getBaseTypeJson(datatype) + except TypeError: + msg = "PUT attribute with invalid predefined type" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + try: + validateTypeItem(datatype) + except KeyError as ke: + msg = f"KeyError creating type: {ke}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + except TypeError as te: + msg = f"TypeError creating type: {te}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + except ValueError as ve: + msg = f"ValueError creating type: {ve}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + return datatype + + +def getShapeFromRequest(body): + """ get shape json from request body """ + shape_json = {} + if "shape" in body: + shape_body = body["shape"] + shape_class = None + if isinstance(shape_body, dict) and "class" in shape_body: + shape_class = shape_body["class"] + elif isinstance(shape_body, str): + shape_class = shape_body + if shape_class: + if shape_class == "H5S_NULL": + shape_json["class"] = "H5S_NULL" + if isinstance(shape_body, dict) and "dims" in shape_body: + msg = "can't include dims with null shape" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if isinstance(shape_body, dict) and "value" in body: + msg = "can't have H5S_NULL shape with value" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif shape_class == "H5S_SCALAR": + shape_json["class"] = "H5S_SCALAR" + if "dims" in shape_body: + msg = "dimensions aren't valid for scalar attribute" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif shape_class == "H5S_SIMPLE": + shape_json["class"] = "H5S_SIMPLE" + dims = getShapeDims(shape_body) + shape_json["dims"] = dims + else: + msg = f"Unknown shape class: {shape_class}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + # no class, interpret shape value as dimensions and + # use H5S_SIMPLE as class + if isinstance(shape_body, list) and len(shape_body) == 0: + shape_json["class"] = "H5S_SCALAR" + else: + shape_json["class"] = "H5S_SIMPLE" + dims = getShapeDims(shape_body) + shape_json["dims"] = dims + else: + shape_json["class"] = "H5S_SCALAR" + + return shape_json + + +async def getAttributeFromRequest(app, req_json, obj_id=None, bucket=None): + """ return attribute from given request json """ + attr_item = {} + log.debug(f"getAttributeFromRequest req_json: {req_json} obj_id: {obj_id}") + attr_type = await getTypeFromRequest(app, req_json, obj_id=obj_id, bucket=bucket) + attr_shape = getShapeFromRequest(req_json) + attr_item = {"type": attr_type, "shape": attr_shape} + attr_value = getValueFromRequest(req_json, attr_type, attr_shape) + if attr_value is not None: + if isinstance(attr_value, bytes): + attr_value = encodeData(attr_value) # store as base64 + attr_item["encoding"] = "base64" + else: + # just store the JSON dict or primitive value + attr_item["value"] = attr_value + else: + attr_item["value"] = None + + now = getNow() + if "created" in req_json: + created = req_json["created"] + # allow "pre-dated" attributes if the timestamp is within the last 10 seconds + predate_max_time = config.get("predate_max_time", default=10.0) + if now - created < predate_max_time: + attr_item["created"] = created + else: + msg = "stale created timestamp for attribute, ignoring " + msg += f"predate config: {predate_max_time:6.2f} " + msg += f"age: {(now - created):6.2f}" + log.warn(msg) + if "created" not in attr_item: + attr_item["created"] = now + + return attr_item + + +async def 
+async def getAttributesFromRequest(app, req_json, obj_id=None, bucket=None): + """ read the given JSON dictionary and return dict of attribute json """ + + attr_items = {} + kwargs = {"obj_id": obj_id} + if bucket: + kwargs["bucket"] = bucket + if "attributes" in req_json: + attributes = req_json["attributes"] + if not isinstance(attributes, dict): + msg = f"expected dict for attributes but got: {type(attributes)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # read each attr_item and canonicalize the shape, type, verify value + for attr_name in attributes: + attr_json = attributes[attr_name] + attr_item = await getAttributeFromRequest(app, attr_json, **kwargs) + attr_items[attr_name] = attr_item + else: + log.debug(f"getAttributesFromRequest - no attributes defined in {req_json}") + + return attr_items + + +def getValueFromRequest(body, data_type, data_shape): + """ Get attribute value from request json """ + dims = getShapeDims(data_shape) + if "value" in body: + if dims is None: + msg = "Bad Request: data cannot be included with H5S_NULL space" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + value = body["value"] + # validate that the value agrees with type/shape + arr_dtype = createDataType(data_type) # np datatype + if len(dims) == 0: + np_dims = [1, ] + else: + np_dims = dims + + if "encoding" in body: + encoding = body["encoding"] + log.debug(f"using encoding: {encoding}") + item_size = getItemSize(data_type) + if item_size == "H5T_VARIABLE": + msg = "base64 encoding is not supported for variable length attributes" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + try: + data = decodeData(value) + except ValueError: + msg = "unable to decode data" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + expected_byte_count = arr_dtype.itemsize * np.prod(np_dims) + if len(data) != expected_byte_count: + msg = f"expected: {expected_byte_count} but got: {len(data)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # check to see if this works with our shape and type + try: + arr = bytesToArray(data, arr_dtype, np_dims) + except ValueError as e: + log.debug(f"data: {data}") + log.debug(f"type: {arr_dtype}") + log.debug(f"np_dims: {np_dims}") + msg = f"Bad Request: encoded input data doesn't match shape and type: {e}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + value_json = None + # now try converting to JSON + list_data = arr.tolist() + try: + value_json = bytesArrayToList(list_data) + except ValueError as err: + msg = f"Cannot decode bytes to list: {err}, will store as encoded bytes" + log.warn(msg) + if value_json: + log.debug("will store base64 input as json") + if data_shape["class"] == "H5S_SCALAR": + # just use the scalar value + value = value_json[0] + else: + value = value_json # return this + else: + value = data # return bytes to signal that this needs to be encoded + else: + # verify that the input data matches the array shape and type + try: + jsonToArray(np_dims, arr_dtype, value) + except ValueError as e: + msg = f"Bad Request: input data doesn't match selection: {e}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + value = None + + return value + + async def getAttributes(app, obj_id, attr_names=None, include_data=False, @@ -973,7 +1260,7 @@ async def putAttributes(app, req = getDataNodeUrl(app, obj_id) collection = getCollectionForId(obj_id) req += f"/{collection}/{obj_id}/attributes" - log.info(f"putAttribute: {req}") + log.info(f"putAttributes: {req}") params = {} if replace: @@ -1044,99 +1331,341 @@ async def deleteObject(app, obj_id,
bucket=None): del meta_cache[obj_id] # remove from cache -async def createObject(app, - root_id=None, - obj_type=None, - obj_shape=None, - layout=None, - creation_props=None, - bucket=None): - """ create a group, ctype, or dataset object and return object json - Determination on whether a group, ctype, or dataset is created is based on: - 1) if obj_type and obj_shape are set, a dataset object will be created - 2) if obj_type is set but not obj_shape, a datatype object will be created - 3) otherwise (type and shape are both None), a group object will be created - The layout parameter only applies to dataset creation - """ - if obj_type and obj_shape: - collection = "datasets" - elif obj_type: - collection = "datatypes" +def getCreateArgs(body, + root_id=None, + bucket=None, + type=None, + implicit=False, + chunk_table=None, + ignore_link=False): + """ get args for createObject from request body """ + + log.debug(f"getCreateArgs with body keys: {list(body.keys())}") + if ignore_link: + log.debug("getCreateArgs, ignore_link is set") + + kwargs = {"bucket": bucket} + predate_max_time = config.get("predate_max_time", default=10.0) + + parent_id = None + obj_id = None + h5path = None + + if "parent_id" not in body: + parent_id = root_id else: - collection = "groups" - log.info(f"createObject for {collection} collection, root: {root_id}, bucket: {bucket}") - if obj_type: - log.debug(f" obj_type: {obj_type}") - if obj_shape: - log.debug(f" obj_shape: {obj_shape}") - if layout: - log.debug(f" layout: {layout}") - if creation_props: - log.debug(f" cprops: {creation_props}") + parent_id = body["parent_id"] + + if "h5path" in body: + h5path = body["h5path"] + # normalize the h5path + if h5path.startswith("/"): + if parent_id == root_id: + # just adjust the path to be relative + h5path = h5path[1:] + else: + msg = f"PostCrawler expecting relative h5path, but got: {h5path}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) - obj_id = createObjId(collection, rootid=root_id) - log.info(f"new obj id: {obj_id}") - obj_json = {"id": obj_id, "root": root_id} - if obj_type: - obj_json["type"] = obj_type - if obj_shape: - obj_json["shape"] = obj_shape - if layout: - obj_json["layout"] = layout - if creation_props: - obj_json["creationProperties"] = creation_props - log.debug(f"create {collection} obj, body: {obj_json}") - dn_url = getDataNodeUrl(app, obj_id) - req = f"{dn_url}/{collection}" - params = {"bucket": bucket} - rsp_json = await http_post(app, req, data=obj_json, params=params) + if h5path.endswith("/"): + h5path = h5path[:-1] # makes iterating through the links a bit easier - return rsp_json + if "link" in body: + if "h5path" in body: + msg = "'link' key in body can't be used with h5path" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # if ignore_link is set, parent_links will be created post object creation + link_body = body["link"] + log.debug(f"link_body: {link_body}") + if "id" in link_body and not ignore_link: + parent_id = link_body["id"] + if "name" in link_body: + link_title = link_body["name"] + try: + # will throw exception if there's a slash in the name + validateLinkName(link_title) + except ValueError: + msg = f"invalid link title: {link_title}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if parent_id and link_title: + log.debug(f"parent id: {parent_id}, link_title: {link_title}") + if not ignore_link: + h5path = link_title # just use the link name as the h5path + log.debug(f"set h5path to {link_title}") + + if parent_id and h5path: + # these are used by 
createObjectByPath + kwargs["parent_id"] = parent_id + kwargs["implicit"] = implicit + kwargs["h5path"] = h5path + else: + kwargs["root_id"] = root_id + if "id" in body: + obj_id = body["id"] + if not isValidUuid(obj_id): + msg = f"Invalid id: {obj_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) -async def createObjectByPath(app, - parent_id=None, - h5path=None, - implicit=False, - obj_type=None, - obj_shape=None, - layout=None, - creation_props=None, - bucket=None): + kwargs["obj_id"] = obj_id + log.debug(f"createObject will use client id: {obj_id}") - """ create an object at the designated path relative to the parent. - If implicit is True, make any intermediate groups needed in the h5path. """ + if "creationProperties" in body: + creation_props = body["creationProperties"] + # validate after we've checked for shape and type + else: + creation_props = {} + kwargs["creation_props"] = creation_props - if not parent_id: - msg = "no parent_id given for createObjectByPath" + if "attributes" in body: + attrs = body["attributes"] + if not isinstance(attrs, dict): + msg = f"expected dict for for attributes, but got: {type(attrs)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"createObject attributes: {attrs}") + + # tbd: validate attributes + kwargs["attrs"] = attrs + + if "links" in body: + body_links = body["links"] + log.debug(f"got links for new group: {body_links}") + try: + links = getRequestLinks(body["links"], predate_max_time=predate_max_time) + except ValueError: + msg = "invalid link item sent in request" + raise HTTPBadRequest(reason=msg) + log.debug(f"adding links to createObject request: {links}") + kwargs["links"] = links + + if type: + kwargs["type"] = type + type_json = type + elif "type" in body: + type_json = body["type"] + if isinstance(type_json, str): + try: + # convert predefined type string (e.g. 
"H5T_STD_I32LE") to + # corresponding json representation + type_json = getBaseTypeJson(type_json) + log.debug(f"got type: {type_json}") + except TypeError: + msg = f"POST with invalid predefined type: {type_json}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + type_json = None + + if type_json: + try: + validateTypeItem(type_json) + except KeyError as ke: + msg = f"KeyError creating type: {ke}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + except TypeError as te: + msg = f"TypeError creating type: {te}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + except ValueError as ve: + msg = f"ValueError creating type: {ve}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs["type"] = type_json + else: + pass # no type + return kwargs + + +def genLayout(shape_json, type_json, has_filters=False): + """ create a chunked or contiguous layout based on shape and itemsize """ + + min_chunk_size = int(config.get("min_chunk_size")) + max_chunk_size = int(config.get("max_chunk_size")) + max_chunks_per_folder = int(config.get("max_chunks_per_folder", default=0)) + kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} + max_chunks_per_folder = int(config.get("max_chunks_per_folder")) + if max_chunks_per_folder > 0: + kwargs["max_chunks_per_folder"] = max_chunks_per_folder + if has_filters: + kwargs["chunks"] = True # force a chunked layout to support compression + + layout_json = generateLayout(shape_json, type_json, **kwargs) + return layout_json + + +def getDatasetCreateArgs(body, + root_id=None, + bucket=None, + type=None, + implicit=False, + ignore_link=False): + + """ get args for createDataset from request body """ + + # call getCreateArgs for group, datatype objects, then fill in for dataset specific options + kwargs = getCreateArgs(body, + root_id=root_id, + bucket=bucket, + type=type, + implicit=implicit, + ignore_link=ignore_link) + + if "type" not in kwargs: + msg = "no type specified for create dataset" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not h5path: - msg = "no h5path given for createObjectByPath" + + type_json = kwargs["type"] + # + # Validate shape if present + # + + # will return scalar shape if no shape key in body + shape_json = getShapeJson(body) + try: + shape_class = getShapeClass(shape_json) + shape_dims = getShapeDims(shape_json) + except (KeyError, TypeError, ValueError): + msg = f"Invalid shape: {shape_json}" log.warn(msg) raise HTTPBadRequest(reason=msg) - log.debug(f"createObjectByPath - parent_id: {parent_id}, h5path: {h5path}") - root_id = getRootObjId(parent_id) + log.debug(f"shape_class: {shape_class}, shape_dims: {shape_dims}") + + log.debug(f"got createArgs: {list(kwargs.keys())}") + + kwargs["shape"] = shape_json - if h5path.startswith("/"): - if parent_id == root_id: - # just adjust the path to be relative - h5path = h5path[1:] + # get layout for dataset creation + log.debug("getting dataset creation settings") + min_chunk_size = int(config.get("min_chunk_size")) + max_chunk_size = int(config.get("max_chunk_size")) + type_json = kwargs["type"] + + item_size = getItemSize(type_json) + if item_size == "H5T_VARIABLE": + item_size = config.get("default_vlen_type_size", default=128) + if shape_dims is None: + dset_size = 0 + else: + dset_size = getDataSize(shape_dims, item_size) + + if "creationProperties" in body: + creation_props = body["creationProperties"] + else: + creation_props = None + layout_json = None + chunk_dims = None + has_filters = False + + if creation_props: + log.debug(f"POST_Dataset creation 
props: {creation_props}") + try: + validateDatasetCreationProps(creation_props, type_json=type_json, shape=shape_json) + except (KeyError, TypeError, ValueError) as e: + msg = f"Provided creation properties are invalid: {e}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"create_props after validation: {creation_props}") + + if "filters" in creation_props: + # check that the given filters are supported by HSDS + filters = creation_props["filters"] + supported_filters = getSupportedFilters() + log.debug(f"supported filters: {supported_filters}") + for filter_item in filters: + if filter_item["name"] not in supported_filters: + msg = f"Unsupported filter id: {filter_item['id']}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if filters: + has_filters = True + creation_props["filters"] = filters + log.debug(f"post validate creation properties: {creation_props}") + if "layout" in creation_props: + layout_json = creation_props["layout"] + else: + creation_props = {} + + if "layout" in creation_props: + layout_json = creation_props["layout"] + # layout_json was validated in validateDatasetCreationProps, but issue + # a warning if the chunk size is outside suggested range + if "dims" in layout_json: + chunk_dims = layout_json["dims"] + # log warning if the chunk shape if chunk size is too small or too big + chunk_size = getChunkSize(chunk_dims, item_size) + if chunk_size < min_chunk_size: + msg = f"chunk size: {chunk_size} less than recommended min size: {min_chunk_size}" + log.warn(msg) + elif chunk_size > max_chunk_size: + msg = f"chunk size: {chunk_size} greater than recommended " + msg += f"max size: {max_chunk_size}" + log.debug(msg) else: - msg = f"createObjectByPath expecting relative h5path, but got: {h5path}" + # log warning if contiguous layout used with too large datadset + if dset_size > max_chunk_size: + msg = f"dataset larger than recommended {max_chunk_size} for CONTIGUOUS storage" + log.warn(msg) + else: + # no layout, create one based on shape and itemsize + layout_json = genLayout(shape_json, type_json, has_filters=has_filters) + log.info(f"created chunk layout for new dset: {layout_json}") + creation_props["layout"] = layout_json + + kwargs["creation_props"] = creation_props + log.debug(f"updated creation props: {creation_props}") + + # + # get input data if present + # + if "value" in body and body["value"]: + # data to initialize dataset included in request + if shape_json["class"] == "H5S_NULL": + msg = "null shape datasets can not have initial values" log.warn(msg) raise HTTPBadRequest(reason=msg) - if h5path.endswith("/"): - h5path = h5path[:-1] # makes iterating through the links a bit easier + input_data = body["value"] + msg = "input data doesn't match request type and shape" + dims = getShapeDims(shape_json) - if not h5path: - msg = "h5path for createObjectByPath invalid" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + arr_dtype = createDataType(type_json) - obj_json = None + try: + input_arr = jsonToArray(dims, arr_dtype, input_data) + except ValueError: + log.warn(f"ValueError: {msg}") + raise HTTPBadRequest(reason=msg) + except TypeError: + log.warn(f"TypeError: {msg}") + raise HTTPBadRequest(reason=msg) + except IndexError: + log.warn(f"IndexError: {msg}") + raise HTTPBadRequest(reason=msg) + log.debug(f"got json arr: {input_arr.shape}") + kwargs["value"] = input_data + + return kwargs + + +async def createLinkFromParent(app, parent_id, h5path, tgt_id=None, bucket=None, implicit=False): + """ create link or links from parentId to tgt_id. 
+ If implicit is True, create any intermediate group objects needed """ + + if not h5path: + log.warn("createLinkFromParent with null h5path") + return + log.info(f"createLinkFromParent, parent_id: {parent_id} h5path: {h5path} tgt_id={tgt_id}") + if implicit: + log.debug("createLinkFromParent - using implicit creation") link_titles = h5path.split("/") log.debug(f"link_titles: {link_titles}") for i in range(len(link_titles)): @@ -1145,7 +1674,7 @@ async def createObjectByPath(app, else: last_link = False link_title = link_titles[i] - log.debug(f"createObjectByPath - processing link: {link_title}") + log.debug(f"createLinkFromParent - processing link: {link_title}") link_json = None try: link_json = await getLink(app, parent_id, link_title, bucket=bucket) @@ -1161,7 +1690,7 @@ async def createObjectByPath(app, raise HTTPConflict() # otherwise, verify that this is a hardlink if link_json.get("class") != "H5L_TYPE_HARD": - msg = "createObjectByPath - h5path must contain only hardlinks" + msg = "createLinkFromParent - h5path must contain only hard links" log.warn(msg) raise HTTPBadRequest(reason=msg) parent_id = link_json["id"] @@ -1174,31 +1703,236 @@ async def createObjectByPath(app, log.debug(f"link: {link_title} to sub-group found") else: log.debug(f"link for link_title {link_title} not found") - if not last_link and not implicit: + if last_link: + # create a link to the new object + await putHardLink(app, parent_id, link_title, tgt_id=tgt_id, bucket=bucket) + parent_id = tgt_id # new parent + elif implicit: + # create a new group object + log.info(f"creating intermediate group object for: {link_title}") + kwargs = {"parent_id": parent_id, "bucket": bucket} + grp_id = createObjId("groups", root_id=getRootObjId(parent_id)) + kwargs["obj_id"] = grp_id + # createObject won't call back to this function since we haven't set the h5path + await createObject(app, **kwargs) + # create a link to the subgroup + await putHardLink(app, parent_id, link_title, tgt_id=grp_id, bucket=bucket) + parent_id = grp_id # new parent + else: if len(link_titles) > 1: - msg = f"createObjectByPath failed: not all groups in {h5path} exist" + msg = f"createLinkFromParent failed: not all groups in {h5path} exist" else: - msg = f"createObjectByPath failed: {h5path} does not exist" + msg = f"createLinkFromParent failed: {h5path} does not exist" log.warn(msg) raise HTTPNotFound(reason=msg) - # create the group or group/datatype/dataset for the last - # item in the path (based on parameters passed in) - kwargs = {"bucket": bucket, "root_id": root_id} - if last_link: - if obj_type: - kwargs["obj_type"] = obj_type - if obj_shape: - kwargs["obj_shape"] = obj_shape - if layout: - kwargs["layout"] = layout - if creation_props: - kwargs["creation_props"] = creation_props - obj_json = await createObject(app, **kwargs) - obj_id = obj_json["id"] - # create a link to the new object - await putHardLink(app, parent_id, link_title, tgt_id=obj_id, bucket=bucket) - parent_id = obj_id # new parent - log.info(f"createObjectByPath {h5path} done, returning obj_json") - return obj_json +async def createObject(app, + parent_id=None, + root_id=None, + h5path=None, + obj_id=None, + type=None, + shape=None, + creation_props=None, + attrs=None, + links=None, + implicit=None, + bucket=None): + """ create a group, ctype, or dataset object and return object json + Determination on whether a group, ctype, or dataset is created is based on: + 1) if type and shape are set, a dataset object will be created + 2) if type is set but not shape, a datatype 
object will be created + 3) otherwise (type and shape are both None), a group object will be created + The layout parameter only applies to dataset creation + """ + if type and shape: + collection = "datasets" + elif type: + collection = "datatypes" + else: + collection = "groups" + + if not root_id: + root_id = getRootObjId(parent_id) + log.info(f"createObject for {collection} collection, root_id: {root_id}, bucket: {bucket}") + if root_id != parent_id: + log.debug(f" parent_id: {parent_id}") + if obj_id: + log.debug(f" obj_id: {obj_id}") + if h5path: + log.debug(f" h5path: {h5path}") + if type: + log.debug(f" type: {type}") + if shape: + log.debug(f" shape: {shape}") + if creation_props: + log.debug(f" cprops: {creation_props}") + if attrs: + log.debug(f" attrs: {attrs}") + if links: + log.debug(f" links: {links}") + + if h5path: + if h5path.startswith("/"): + if parent_id == root_id: + # just adjust the path to be relative + h5path = h5path[1:] + else: + msg = f"createObject expecting relative h5path, but got: {h5path}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if h5path.endswith("/"): + h5path = h5path[:-1] # makes iterating through the links a bit easier + + if obj_id: + log.debug(f"using client supplied id: {obj_id}") + if not isValidUuid(obj_id, obj_class=collection): + msg = f"invalid id: {obj_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if getRootObjId(obj_id) != root_id: + msg = f"id: {obj_id} is not valid for root: {root_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + obj_id = createObjId(collection, root_id=root_id) + log.info(f"new obj id: {obj_id}") + obj_json = {"id": obj_id, "root": root_id} + if type: + obj_json["type"] = type + if shape: + obj_json["shape"] = shape + if creation_props: + obj_json["creationProperties"] = creation_props + else: + obj_json["creationProperties"] = {} + if attrs: + kwargs = {"obj_id": obj_id, "bucket": bucket} + attrs_json = {"attributes": attrs} + attr_items = await getAttributesFromRequest(app, attrs_json, **kwargs) + log.debug(f"got attr_items: {attr_items}") + obj_json["attributes"] = attr_items + if links: + if collection != "groups": + msg = "links can only be used with groups" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + obj_json["links"] = links + log.debug(f"create {collection} obj, body: {obj_json}") + dn_url = getDataNodeUrl(app, obj_id) + req = f"{dn_url}/{collection}" + params = {"bucket": bucket} + rsp_json = await http_post(app, req, data=obj_json, params=params) + + log.debug(f"createObject: {req} got rsp_json: {rsp_json}") + + # object creation successful, create link from parent if requested + if h5path: + kwargs = {"tgt_id": obj_id, "bucket": bucket, "implicit": implicit} + await createLinkFromParent(app, parent_id, h5path, **kwargs) + + return rsp_json + + +async def createGroup(app, + parent_id=None, + root_id=None, + h5path=None, + obj_id=None, + creation_props=None, + attrs=None, + links=None, + implicit=None, + bucket=None): + + """ create a new group object """ + + kwargs = {} + kwargs["parent_id"] = parent_id + kwargs["root_id"] = root_id + kwargs["h5path"] = h5path + kwargs["obj_id"] = obj_id + kwargs["creation_props"] = creation_props + kwargs["attrs"] = attrs + kwargs["links"] = links + kwargs["implicit"] = implicit + kwargs["bucket"] = bucket + rsp_json = await createObject(app, **kwargs) + return rsp_json + + +async def createDatatypeObj(app, + parent_id=None, + root_id=None, + type=None, + h5path=None, + obj_id=None, + creation_props=None, + attrs=None, + 
links=None, + implicit=None, + bucket=None): + + """ create a new committed type object""" + + if not type: + msg = "type not set for committed type creation" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + kwargs = {} + kwargs["parent_id"] = parent_id + kwargs["root_id"] = root_id + kwargs["type"] = type + kwargs["h5path"] = h5path + kwargs["obj_id"] = obj_id + kwargs["creation_props"] = creation_props + kwargs["attrs"] = attrs + kwargs["links"] = links + kwargs["implicit"] = implicit + kwargs["bucket"] = bucket + rsp_json = await createObject(app, **kwargs) + return rsp_json + + +async def createDataset(app, + parent_id=None, + root_id=None, + type=None, + shape=None, + h5path=None, + obj_id=None, + creation_props=None, + attrs=None, + links=None, + implicit=None, + bucket=None): + + """ create a new dataset object""" + + if not type: + msg = "type not set for dataset creation" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if not shape: + # default to a scalar dataset + shape = {"class": "H5S_SCALAR"} + + kwargs = {} + kwargs["parent_id"] = parent_id + kwargs["root_id"] = root_id + kwargs["type"] = type + kwargs["shape"] = shape + kwargs["h5path"] = h5path + kwargs["obj_id"] = obj_id + kwargs["creation_props"] = creation_props + kwargs["attrs"] = attrs + kwargs["links"] = links + kwargs["implicit"] = implicit + kwargs["bucket"] = bucket + dset_json = await createObject(app, **kwargs) + + return dset_json diff --git a/hsds/util/arrayUtil.py b/hsds/util/arrayUtil.py deleted file mode 100644 index 67c847c3..00000000 --- a/hsds/util/arrayUtil.py +++ /dev/null @@ -1,731 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## - -import math -import base64 -import binascii -import numpy as np - -MAX_VLEN_ELEMENT = 1_000_000 # restrict largest vlen element to one million - - -def bytesArrayToList(data): - """ - Convert list that may contain bytes type elements to list of string elements - - TBD: Need to deal with non-string byte data (hexencode?) - """ - if type(data) in (bytes, str): - is_list = False - elif isinstance(data, (np.ndarray, np.generic)): - if len(data.shape) == 0: - is_list = False - data = data.tolist() # tolist will return a scalar in this case - if type(data) in (list, tuple): - is_list = True - else: - is_list = False - else: - is_list = True - elif type(data) in (list, tuple): - is_list = True - else: - is_list = False - - if is_list: - out = [] - for item in data: - try: - rec_item = bytesArrayToList(item) # recursive call - out.append(rec_item) - except ValueError as err: - raise err - elif type(data) is bytes: - try: - out = data.decode("utf-8") - except UnicodeDecodeError as err: - raise ValueError(err) - else: - out = data - - return out - - -def toTuple(rank, data): - """ - Convert a list to a tuple, recursively. - Example. 
[[1,2],[3,4]] -> ((1,2),(3,4)) - """ - if type(data) in (list, tuple): - if rank > 0: - return list(toTuple(rank - 1, x) for x in data) - else: - return tuple(toTuple(rank - 1, x) for x in data) - else: - if isinstance(data, str): - data = data.encode("utf8") - return data - - -def getArraySize(arr): - """ - Get size in bytes of a numpy array. - """ - nbytes = arr.dtype.itemsize - for n in arr.shape: - nbytes *= n - return nbytes - - -def getNumElements(dims): - """ - Get num elements defined by a shape - """ - num_elements = 0 - if isinstance(dims, int): - num_elements = dims - elif isinstance(dims, (list, tuple)): - num_elements = 1 - for dim in dims: - num_elements *= dim - else: - raise ValueError("Unexpected argument") - return num_elements - - -def isVlen(dt): - """ - Return True if the type contains variable length elements - """ - is_vlen = False - if len(dt) > 1: - names = dt.names - for name in names: - if isVlen(dt[name]): - is_vlen = True - break - else: - if dt.metadata and "vlen" in dt.metadata: - is_vlen = True - return is_vlen - - -def jsonToArray(data_shape, data_dtype, data_json): - """ - Return numpy array from the given json array. - """ - def fillVlenArray(rank, data, arr, index): - for i in range(len(data)): - if rank > 1: - index = fillVlenArray(rank - 1, data[i], arr, index) - else: - arr[index] = data[i] - index += 1 - return index - - if data_json is None: - return np.array([]).astype(data_dtype) - - if isinstance(data_json, (list, tuple)): - if None in data_json: - return np.array([]).astype(data_dtype) - - # need some special conversion for compound types -- - # each element must be a tuple, but the JSON decoder - # gives us a list instead. - if len(data_dtype) > 1 and not isinstance(data_json, (list, tuple)): - raise TypeError("expected list data for compound data type") - npoints = getNumElements(data_shape) - np_shape_rank = len(data_shape) - - if type(data_json) in (list, tuple): - converted_data = [] - if npoints == 1 and len(data_json) == len(data_dtype): - converted_data.append(toTuple(0, data_json)) - else: - converted_data = toTuple(np_shape_rank, data_json) - data_json = converted_data - else: - if isinstance(data_json, str): - data_json = data_json.encode("utf8") - data_json = [data_json,] # listify - - if isVlen(data_dtype): - arr = np.zeros((npoints,), dtype=data_dtype) - fillVlenArray(np_shape_rank, data_json, arr, 0) - else: - try: - arr = np.array(data_json, dtype=data_dtype) - except UnicodeEncodeError as ude: - msg = "Unable to encode data" - raise ValueError(msg) from ude - # raise an exception of the array shape doesn't match the selection shape - # allow if the array is a scalar and the selection shape is one element, - # numpy is ok with this - if arr.size != npoints: - msg = "Input data doesn't match selection number of elements" - msg += f" Expected {npoints}, but received: {arr.size}" - raise ValueError(msg) - if arr.shape != data_shape: - arr = arr.reshape(data_shape) # reshape to match selection - - return arr - - -def getElementSize(e, dt): - """ - Get number of byte needed to given element as a bytestream - """ - # print(f"getElementSize - e: {e} dt: {dt} metadata: {dt.metadata}") - if len(dt) > 1: - count = 0 - for name in dt.names: - field_dt = dt[name] - field_val = e[name] - count += getElementSize(field_val, field_dt) - elif not dt.metadata or "vlen" not in dt.metadata: - count = dt.itemsize # fixed size element - else: - # variable length element - vlen = dt.metadata["vlen"] - if isinstance(e, int): - if e == 0: - count = 4 # 
non-initialized element - else: - raise ValueError("Unexpected value: {}".format(e)) - elif isinstance(e, bytes): - count = len(e) + 4 - elif isinstance(e, str): - count = len(e.encode("utf-8")) + 4 - elif isinstance(e, np.ndarray): - nElements = math.prod(e.shape) - if e.dtype.kind != "O": - count = e.dtype.itemsize * nElements - else: - arr1d = e.reshape((nElements,)) - count = 0 - for item in arr1d: - count += getElementSize(item, dt) - count += 4 # byte count - elif isinstance(e, list) or isinstance(e, tuple): - if not e: - # empty list, just add byte count - count = 4 - else: - # not sure how to deal with this - count = len(e) * vlen.itemsize + 4 # +4 for byte count - else: - raise TypeError("unexpected type: {}".format(type(e))) - return count - - -def getByteArraySize(arr): - """ - Get number of bytes needed to store given numpy array as a bytestream - """ - if not isVlen(arr.dtype): - return arr.itemsize * math.prod(arr.shape) - nElements = math.prod(arr.shape) - # reshape to 1d for easier iteration - arr1d = arr.reshape((nElements,)) - dt = arr1d.dtype - count = 0 - for e in arr1d: - count += getElementSize(e, dt) - return count - - -def copyBuffer(src, des, offset): - """ - Copy to buffer at given offset - """ - # print(f"copyBuffer - src: {src} offset: {offset}") - # TBD: just do: des[offset:] = src[:] ? - for i in range(len(src)): - des[i + offset] = src[i] - - # print("returning:", offset + len(src)) - return offset + len(src) - - -def copyElement(e, dt, buffer, offset): - """ - Copy element to bytearray - """ - # print(f"copyElement - dt: {dt} offset: {offset}") - if len(dt) > 1: - for name in dt.names: - field_dt = dt[name] - field_val = e[name] - offset = copyElement(field_val, field_dt, buffer, offset) - elif not dt.metadata or "vlen" not in dt.metadata: - # print(f"e vlen: {e} type: {type(e)} itemsize: {dt.itemsize}") - e_buf = e.tobytes() - # print("tobytes:", e_buf) - if len(e_buf) < dt.itemsize: - # extend the buffer for fixed size strings - # print("extending buffer") - e_buf_ex = bytearray(dt.itemsize) - for i in range(len(e_buf)): - e_buf_ex[i] = e_buf[i] - e_buf = bytes(e_buf_ex) - - # print("length:", len(e_buf)) - offset = copyBuffer(e_buf, buffer, offset) - else: - # variable length element - vlen = dt.metadata["vlen"] - # print("copyBuffer vlen:", vlen) - if isinstance(e, int): - # print("copyBuffer int") - if e == 0: - # write 4-byte integer 0 to buffer - offset = copyBuffer(b"\x00\x00\x00\x00", buffer, offset) - else: - raise ValueError("Unexpected value: {}".format(e)) - elif isinstance(e, bytes): - # print("copyBuffer bytes") - count = np.int32(len(e)) - if count > MAX_VLEN_ELEMENT: - raise ValueError("vlen element too large") - offset = copyBuffer(count.tobytes(), buffer, offset) - offset = copyBuffer(e, buffer, offset) - elif isinstance(e, str): - # print("copyBuffer, str") - text = e.encode("utf-8") - count = np.int32(len(text)) - if count > MAX_VLEN_ELEMENT: - raise ValueError("vlen element too large") - offset = copyBuffer(count.tobytes(), buffer, offset) - offset = copyBuffer(text, buffer, offset) - - elif isinstance(e, np.ndarray): - nElements = math.prod(e.shape) - # print("copyBuffer ndarray, nElements:", nElements) - - if e.dtype.kind != "O": - count = np.int32(e.dtype.itemsize * nElements) - # print("copyBuffeer got vlen count:", count) - # print("copyBuffer e:", e) - if count > MAX_VLEN_ELEMENT: - raise ValueError("vlen element too large") - offset = copyBuffer(count.tobytes(), buffer, offset) - # print("copyBuffer write new count, offset:", 
offset) - offset = copyBuffer(e.tobytes(), buffer, offset) - # print("copyBuffer write data, offset:", offset) - else: - arr1d = e.reshape((nElements,)) - for item in arr1d: - offset = copyElement(item, dt, buffer, offset) - - elif isinstance(e, list) or isinstance(e, tuple): - # print("cooyBuffer list/tuple vlen:", vlen, "e:", e) - count = np.int32(len(e) * vlen.itemsize) - offset = copyBuffer(count.tobytes(), buffer, offset) - if isinstance(e, np.ndarray): - arr = e - else: - arr = np.asarray(e, dtype=vlen) - offset = copyBuffer(arr.tobytes(), buffer, offset) - - else: - raise TypeError("unexpected type: {}".format(type(e))) - # print("buffer: {}".format(buffer)) - return offset - - -def getElementCount(buffer, offset=0): - """ - Get the count value from persisted vlen array - """ - - n = offset - m = offset + 4 - count_bytes = bytes(buffer[n:m]) - - try: - count = int(np.frombuffer(count_bytes, dtype=" MAX_VLEN_ELEMENT: - # expect variable length element to be between 0 and 1mb - raise ValueError("varlen element size expected to be less than 1MB") - return count - - -def readElement(buffer, offset, arr, index, dt): - """ - Read a single element from buffer into array. - - Parameters: - buffer (bytearray): Byte array to read an element from. - offset (int): Starting offset in the buffer. - arr (numpy.ndarray): Array to store the element. - index (int): Index in 'arr' at which to store the element. - dt (numpy.dtype): Numpy datatype of the element. - - Note: If the provided datatype is a variable-length sequence, - this function will read the byte count from the first 4 bytes - of the buffer, and then read the entire sequence. - - Returns: - int: The updated offset value after reading the element. - """ - if len(dt) > 1: - e = arr[index] - for name in dt.names: - field_dt = dt[name] - offset = readElement(buffer, offset, e, name, field_dt) - elif not dt.metadata or "vlen" not in dt.metadata: - count = dt.itemsize - n = offset - m = offset + count - e_buffer = buffer[n:m] - offset += count - try: - e = np.frombuffer(bytes(e_buffer), dtype=dt) - arr[index] = e[0] - except ValueError: - print(f"ERROR: ValueError setting {e_buffer} and dtype: {dt}") - raise - else: - # variable length element - vlenBaseType = dt.metadata["vlen"] - e = arr[index] - - if isinstance(e, np.ndarray): - nelements = math.prod(dt.shape) - e.reshape((nelements,)) - for i in range(nelements): - offset = readElement(buffer, offset, e, i, dt) - e.reshape(dt.shape) - else: - # total number of bytes in the vlen sequence/variable-length string - count = getElementCount(buffer, offset=offset) - offset += 4 - n = offset - m = offset + count - if count > 0: - e_buffer = buffer[n:m] - offset += count - - if vlenBaseType is bytes: - arr[index] = bytes(e_buffer) - elif vlenBaseType is str: - s = e_buffer.decode("utf-8") - arr[index] = s - else: - try: - e = np.frombuffer(bytes(e_buffer), dtype=vlenBaseType) - except ValueError: - msg = f"Failed to parse vlen data: {e_buffer} with dtype: {vlenBaseType}" - raise ValueError(msg) - arr[index] = e - return offset - - -def encodeData(data, encoding="base64"): - """ Encode given data """ - if encoding != "base64": - raise ValueError("only base64 encoding is supported") - try: - if isinstance(data, str): - data = data.encode("utf8") - except UnicodeEncodeError: - raise ValueError("can not encode string value") - if not isinstance(data, bytes): - msg = "Expected str or bytes type to encodeData, " - msg += f"but got: {type(data)}" - raise TypeError(msg) - try: - encoded_data = 
base64.b64encode(data) - except Exception as e: - # TBD: what exceptions can be raised? - raise ValueError(f"Unable to encode: {e}") - return encoded_data - - -def decodeData(data, encoding="base64"): - if encoding != "base64": - raise ValueError("only base64 decoding is supported") - try: - decoded_data = base64.b64decode(data) - except Exception as e: - # TBD: catch actual exception - raise ValueError(f"Unable to decode: {e}") - return decoded_data - - -def arrayToBytes(arr, encoding=None): - """ - Return byte representation of numpy array - """ - if isVlen(arr.dtype): - nSize = getByteArraySize(arr) - buffer = bytearray(nSize) - offset = 0 - nElements = math.prod(arr.shape) - arr1d = arr.reshape((nElements,)) - for e in arr1d: - # print("arrayToBytes:", e) - offset = copyElement(e, arr1d.dtype, buffer, offset) - data = bytes(buffer) - else: - # fixed length type - data = arr.tobytes() - - if encoding: - data = encodeData(data) - return data - - -def bytesToArray(data, dt, shape, encoding=None): - """ - Create numpy array based on byte representation - """ - if encoding: - # decode the data - # will raise ValueError if non-decodeable - data = decodeData(data) - if not isVlen(dt): - # regular numpy from string - arr = np.frombuffer(data, dtype=dt) - else: - nelements = getNumElements(shape) - - arr = np.zeros((nelements,), dtype=dt) - offset = 0 - for index in range(nelements): - offset = readElement(data, offset, arr, index, dt) - if shape is not None: - arr = arr.reshape(shape) - # check that we can update the array if needed - # Note: this seems to have been required starting with numpuy v 1.17 - # Setting the flag directly is not recommended. - # cf: https://github.com/numpy/numpy/issues/9440 - - if not arr.flags["WRITEABLE"]: - arr_copy = arr.copy() - arr = arr_copy - - return arr - - -def getNumpyValue(value, dt=None, encoding=None): - """ - Return value as numpy type for given dtype and encoding - Encoding is expected to be one of None or "base64" - """ - # create a scalar numpy array - arr = np.zeros((), dtype=dt) - - if encoding and not isinstance(value, str): - msg = "Expected value to be string to use encoding" - raise ValueError(msg) - - if encoding == "base64": - try: - data = base64.decodebytes(value.encode("utf-8")) - except binascii.Error: - msg = "Unable to decode base64 string: {value}" - # log.warn(msg) - raise ValueError(msg) - arr = bytesToArray(data, dt, dt.shape) - else: - if isinstance(value, list): - # convert to tuple - value = tuple(value) - elif dt.kind == "f" and isinstance(value, str) and value == "nan": - value = np.nan - else: - # use as is - pass - arr = np.asarray(value, dtype=dt.base) - return arr[()] - - -def squeezeArray(data): - """ - Reduce dimensions by removing any 1-extent dimensions. 
- Just return input if no 1-extent dimensions - - Note: only works with ndarrays (for now at least) - """ - if not isinstance(data, np.ndarray): - raise TypeError("expected ndarray") - if len(data.shape) <= 1: - return data - can_reduce = True - for extent in data.shape: - if extent == 1: - can_reduce = True - break - if can_reduce: - data = data.squeeze() - return data - - -class IndexIterator(object): - """ - Class to iterate through list of chunks of a given dataset - """ - - def __init__(self, shape, sel=None): - self._shape = shape - self._rank = len(self._shape) - self._stop = False - - if self._rank < 1: - raise ValueError("IndexIterator can not be used on arrays of zero rank") - - if sel is None: - # select over entire dataset - slices = [] - for dim in range(self._rank): - slices.append(slice(0, self._shape[dim])) - self._sel = tuple(slices) - else: - if isinstance(sel, slice): - self._sel = (sel,) - else: - self._sel = sel - if len(self._sel) != self._rank: - raise ValueError("Invalid selection - selection region must have same rank as shape") - self._index = [] - for dim in range(self._rank): - s = self._sel[dim] - if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start: - raise ValueError( - "Invalid selection - selection region must be within dataset space" - ) - self._index.append(s.start) - - def __iter__(self): - return self - - def __next__(self): - if self._stop: - raise StopIteration() - # bump up the last index and carry forward if we run outside the selection - dim = self._rank - 1 - ret_index = self._index.copy() - while True: - s = self._sel[dim] - if s.step: - step = s.step - else: - step = 1 - self._index[dim] += step - - if self._index[dim] < s.stop: - # we still have room to extend along this dimensions - break - - # reset to the start and continue iterating with higher dimension - self._index[dim] = s.start - dim -= 1 - if dim < 0: - # ran past last index, stop iteration on next run - self._stop = True - - return tuple(ret_index) - - -def ndarray_compare(arr1, arr2): - # compare two numpy arrays. - # return true if the same (exclusive of null vs. empty array) - # false otherwise - # TBD: this is slow for multi-megabyte vlen arrays, needs to be optimized - if not isinstance(arr1, np.ndarray) and not isinstance(arr2, np.ndarray): - if not isinstance(arr1, np.void) and not isinstance(arr2, np.void): - return arr1 == arr2 - if isinstance(arr1, np.void) and not isinstance(arr2, np.void): - if arr1.size == 0 and not arr2: - return True - else: - return False - if not isinstance(arr1, np.void) and isinstance(arr2, np.void): - if not arr1 and arr2.size == 0: - return True - else: - return False - # both np.voids - if arr1.size != arr2.size: - return False - - if len(arr1) != len(arr2): - return False - - for i in range(len(arr1)): - if not ndarray_compare(arr1[i], arr2[i]): - return False - return True - - if isinstance(arr1, np.ndarray) and not isinstance(arr2, np.ndarray): - # same only if arr1 is empty and arr2 is 0 - if arr1.size == 0 and not arr2: - return True - else: - return False - if not isinstance(arr1, np.ndarray) and isinstance(arr2, np.ndarray): - # same only if arr1 is empty and arr2 size is 0 - if not arr1 and arr2.size == 0: - return True - else: - return False - - # two ndarrays... 
- if arr1.shape != arr2.shape: - return False - if arr2.dtype != arr2.dtype: - return False - - if isVlen(arr1.dtype): - # need to compare element by element - - nElements = np.prod(arr1.shape) - arr1 = arr1.reshape((nElements,)) - arr2 = arr2.reshape((nElements,)) - for i in range(nElements): - if not ndarray_compare(arr1[i], arr2[i]): - return False - return True - else: - # can just us np array_compare - return np.array_equal(arr1, arr2) - - -def getBroadcastShape(mshape, element_count): - # if element_count is less than the number of elements - # defined by mshape, return a numpy compatible broadcast - # shape that contains element_count elements. - # If non exists return None - - if np.prod(mshape) == element_count: - return None - - if element_count == 1: - # this always works - return [1,] - - bcshape = [] - rank = len(mshape) - for n in range(rank - 1): - bcshape.insert(0, mshape[rank - n - 1]) - if element_count == np.prod(bcshape): - return bcshape # have a match - - return None # no broadcast found diff --git a/hsds/util/chunkUtil.py b/hsds/util/chunkUtil.py index dc03cc89..8715ac3d 100644 --- a/hsds/util/chunkUtil.py +++ b/hsds/util/chunkUtil.py @@ -1,6 +1,9 @@ import numpy as np + +from h5json.array_util import ndarray_compare +from h5json.dset_util import getDatasetLayout + from .. import hsds_logger as log -from .arrayUtil import ndarray_compare CHUNK_BASE = 16 * 1024 # Multiplier by which chunks are adjusted CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) @@ -9,230 +12,6 @@ PRIMES = [29, 31, 37, 41, 43, 47, 53, 59, 61, 67] # for chunk partitioning -def getChunkSize(layout, type_size): - """Return chunk size given layout. - i.e. just the product of the values in the list. - """ - if type_size == "H5T_VARIABLE": - type_size = DEFAULT_TYPE_SIZE - - chunk_size = type_size - for n in layout: - if n <= 0: - raise ValueError("Invalid chunk layout") - chunk_size *= n - return chunk_size - - -def get_dset_size(shape_json, typesize): - """Return the size of the dataspace. For - any unlimited dimensions, assume a value of 1. 
- (so the return size will be the absolute minimum) - """ - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return typesize # just return size for one item - if typesize == "H5T_VARIABLE": - typesize = DEFAULT_TYPE_SIZE # just take a guess at the item size - dset_size = typesize - shape = shape_json["dims"] - rank = len(shape) - - for n in range(rank): - if shape[n] == 0: - # extendable extent with value of 0 - continue # assume this is one - dset_size *= shape[n] - return dset_size - - -def expandChunk( - layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class="H5D_CHUNKED" -): - """Compute an increased chunk shape with a size in bytes greater than chunk_min.""" - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return (1,) # just enough to store one item - - layout = list(layout) - log.debug(f"expandChunk layout: {layout} typesize: {typesize}") - dims = shape_json["dims"] - rank = len(dims) - extendable_dims = 0 # number of dimensions that are extenable - maxdims = None - if "maxdims" in shape_json: - maxdims = shape_json["maxdims"] - for n in range(rank): - if maxdims[n] == 0 or maxdims[n] > dims[n]: - extendable_dims += 1 - - dset_size = get_dset_size(shape_json, typesize) - if dset_size <= chunk_min and extendable_dims == 0: - # just use the entire dataspace shape as one big chunk - return tuple(dims) - - chunk_size = getChunkSize(layout, typesize) - if chunk_size >= chunk_min: - return tuple(layout) # good already - while chunk_size < chunk_min: - # just adjust along extendable dimensions first - old_chunk_size = chunk_size - for n in range(rank): - dim = rank - n - 1 # start from last dim - - if extendable_dims > 0: - if maxdims[dim] == 0: - # infinitely extendable dimensions - layout[dim] *= 2 - chunk_size = getChunkSize(layout, typesize) - if chunk_size > chunk_min: - break - elif maxdims[dim] > layout[dim]: - # can only be extended so much - layout[dim] *= 2 - if layout[dim] >= dims[dim]: - layout[dim] = maxdims[dim] # trim back - extendable_dims -= 1 # one less extenable dimension - - chunk_size = getChunkSize(layout, typesize) - if chunk_size > chunk_min: - break - else: - pass # ignore non-extensible for now - else: - # no extendable dimensions - if dims[dim] > layout[dim]: - # can expand chunk along this dimension - layout[dim] *= 2 - if layout[dim] > dims[dim]: - layout[dim] = dims[dim] # trim back - chunk_size = getChunkSize(layout, typesize) - if chunk_size > chunk_min: - break - else: - pass # can't extend chunk along this dimension - if chunk_size <= old_chunk_size: - # stop iteration if we haven't increased the chunk size - log.debug("stopping expandChunk iteration") - break - elif chunk_size > chunk_min: - break # we're good - else: - pass # do another round - return tuple(layout) - - -def shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX, layout_class="H5D_CHUNKED"): - """Compute a reduced chunk shape with a size in bytes less than chunk_max.""" - layout = list(layout) - chunk_size = getChunkSize(layout, typesize) - if chunk_size <= chunk_max: - return tuple(layout) # good already - log.debug(f"shrinkChunk layout: {layout} typesize: {typesize}") - rank = len(layout) - - while chunk_size > chunk_max: - # just adjust along extendable dimensions first - old_chunk_size = chunk_size - for dim in range(rank): - if layout[dim] > 1: - # tricky way to do x // 2 with ceil - layout[dim] = -(-layout[dim] // 2) - chunk_size = 
getChunkSize(layout, typesize) - if chunk_size <= chunk_max: - break - else: - pass # can't shrink chunk along this dimension - if chunk_size >= old_chunk_size: - # reality check to see if we'll ever break out of the while loop - log.warning("Unexpected error in shrink_chunk") - break - elif chunk_size <= chunk_max: - break # we're good - else: - pass # do another round - return tuple(layout) - - -def guessChunk(shape_json, typesize): - """Guess an appropriate chunk layout for a dataset, given its shape and - the size of each element in bytes. Will allocate chunks only as large - as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of - each axis, slightly favoring bigger values for the last index. - - Undocumented and subject to change without warning. - """ - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return (1,) # just enough to store one item - - if "maxdims" in shape_json: - shape = shape_json["maxdims"] - else: - shape = shape_json["dims"] - - if typesize == "H5T_VARIABLE": - typesize = 128 # just take a guess at the item size - - # For unlimited dimensions we have to guess. use 1024 - shape = tuple((x if x != 0 else 1024) for i, x in enumerate(shape)) - - return shape - - -def getContiguousLayout( - shape_json, item_size, chunk_min=1000 * 1000, chunk_max=4 * 1000 * 1000 -): - """ - create a chunklayout for datasets use continguous storage. - """ - if not isinstance(item_size, int): - msg = "ContiguousLayout can only be used with fixed-length types" - raise ValueError(msg) - if chunk_max < chunk_min: - raise ValueError("chunk_max cannot be less than chunk_min") - if shape_json is None or shape_json["class"] == "H5S_NULL": - return None - if shape_json["class"] == "H5S_SCALAR": - return (1,) # just enough to store one item - dims = shape_json["dims"] - rank = len(dims) - if rank == 0: - raise ValueError("rank must be positive for Contiguous Layout") - for dim in dims: - if dim < 0: - raise ValueError("extents must be positive for Contiguous Layout") - if dim == 0: - # datashape with no elements, just return dims as layout - return dims - - nsize = item_size - layout = [ - 1, - ] * rank - - for i in range(rank): - dim = rank - i - 1 - extent = dims[dim] - if extent * nsize < chunk_max: - # just use the full extent as layout - layout[dim] = extent - nsize *= extent - else: - n = extent - while n > 1: - n = -(-n // 2) # use negatives so we round up on odds - if n * nsize < chunk_max: - break - layout[dim] = n - break # just use 1's for the rest of the layout - - return layout - - def frac(x, d): """ Utility func -- Works like fractional div, but returns ceiling @@ -259,7 +38,15 @@ def getNumChunks(selection, layout): If selection is provided (a list of slices), return the number of chunks that intersect with the selection. 
""" + + if len(selection) == 0: + # zero length selection + return 0 + rank = len(layout) + if rank == 1 and layout[0] == 1: + # scalar dataset + return 1 if len(selection) != rank: msg = f"selection list has {len(selection)} items, but rank is {rank}" raise ValueError(msg) @@ -429,11 +216,8 @@ def getPartitionKey(chunk_id, partition_count): def getChunkIdForPartition(chunk_id, dset_json): """Return the partition specific chunk id for given chunk""" - if "layout" not in dset_json: - msg = "No layout found in dset_json" - log.error(msg) - raise KeyError(msg) - layout_json = dset_json["layout"] + + layout_json = getDatasetLayout(dset_json) if "partition_count" in layout_json: partition_count = layout_json["partition_count"] partition = getChunkPartition(chunk_id) @@ -472,7 +256,12 @@ def chunk_id_to_index(chunk_id): indices.append(x) return indices + log.debug(f"getChunkIds - dset_id: {dset_id}, selection: {selection}, layout: {layout}") + if prefix: + log.debug(f"prefix: {prefix}") + num_chunks = getNumChunks(selection, layout) + log.debug(f"getChunkIds - num_chunks: {num_chunks}") if num_chunks == 0: return [] # empty list if prefix is None: diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index 044127f0..fb7d21c5 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -10,261 +10,120 @@ # request a copy from help@hdfgroup.org. # ############################################################################## -from aiohttp.web_exceptions import HTTPBadRequest, HTTPInternalServerError +from aiohttp.web_exceptions import HTTPBadRequest import math +from h5json.shape_util import getShapeDims + from .. import hsds_logger as log -""" -Filters that are known to HSDS. -Format is: - FILTER_CODE, FILTER_ID, Name - - H5Z_FILTER_FLETCHER32, H5Z_FILTER_SZIP, H5Z_FILTER_NBIT, - and H5Z_FILTER_SCALEOFFSET, are not currently supported. - - Non-supported filters metadata will be stored, but are - not (currently) used for compression/decompression. 
-""" - -FILTER_DEFS = ( - ("H5Z_FILTER_NONE", 0, "none"), - ("H5Z_FILTER_DEFLATE", 1, "gzip"), # aka as "zlib" for blosc - ("H5Z_FILTER_SHUFFLE", 2, "shuffle"), - ("H5Z_FILTER_FLETCHER32", 3, "fletcher32"), - ("H5Z_FILTER_SZIP", 4, "szip"), - ("H5Z_FILTER_NBIT", 5, "nbit"), - ("H5Z_FILTER_SCALEOFFSET", 6, "scaleoffset"), - ("H5Z_FILTER_LZF", 32000, "lzf"), - ("H5Z_FILTER_BLOSC", 32001, "blosclz"), - ("H5Z_FILTER_SNAPPY", 32003, "snappy"), - ("H5Z_FILTER_LZ4", 32004, "lz4"), - ("H5Z_FILTER_LZ4HC", 32005, "lz4hc"), - ("H5Z_FILTER_BITSHUFFLE", 32008, "bitshuffle"), - ("H5Z_FILTER_ZSTD", 32015, "zstd"), -) - -COMPRESSION_FILTER_IDS = ( - "H5Z_FILTER_DEFLATE", - "H5Z_FILTER_SZIP", - "H5Z_FILTER_SCALEOFFSET", - "H5Z_FILTER_LZF", - "H5Z_FILTER_BLOSC", - "H5Z_FILTER_SNAPPY", - "H5Z_FILTER_LZ4", - "H5Z_FILTER_LZ4HC", - "H5Z_FILTER_ZSTD", -) - -COMPRESSION_FILTER_NAMES = ( - "gzip", - "szip", - "lzf", - "blosclz", - "snappy", - "lz4", - "lz4hc", - "zstd", -) - -CHUNK_LAYOUT_CLASSES = ( - "H5D_CHUNKED", - "H5D_CHUNKED_REF", - "H5D_CHUNKED_REF_INDIRECT", - "H5D_CONTIGUOUS_REF", -) - - -# copied from arrayUtil.py -def isVlen(dt): - """ - Return True if the type contains variable length elements - """ - is_vlen = False - if len(dt) > 1: - names = dt.names - for name in names: - if isVlen(dt[name]): - is_vlen = True - break - else: - if dt.metadata and "vlen" in dt.metadata: - is_vlen = True - return is_vlen +def getShapeJson(body): + """ Return normalized json description of data space """ -def getFilterItem(key): - """ - Return filter code, id, and name, based on an id, a name or a code. - """ - - if key == "deflate": - key = "gzip" # use gzip as equivalent - for item in FILTER_DEFS: - for i in range(3): - if key == item[i]: - return {"class": item[0], "id": item[1], "name": item[2]} - return None # not found - - -def getFilters(dset_json): - """Return list of filters, or empty list""" - if "creationProperties" not in dset_json: - return [] - creationProperties = dset_json["creationProperties"] - if "filters" not in creationProperties: - return [] - filters = creationProperties["filters"] - return filters - - -def getCompressionFilter(filters): - """Return compression filter from filters, or None""" - for filter in filters: - if "class" not in filter: - msg = f"filter option: {filter} with no class key" - log.warn(msg) - continue - filter_class = filter["class"] - if filter_class in COMPRESSION_FILTER_IDS: - return filter - if all( - ( - filter_class == "H5Z_FILTER_USER", - "name" in filter, - filter["name"] in COMPRESSION_FILTER_NAMES, - ) - ): - return filter - return None - - -def getShuffleFilter(filters): - """Return shuffle filter, or None""" - FILTER_CLASSES = ("H5Z_FILTER_SHUFFLE", "H5Z_FILTER_BITSHUFFLE") - for filter in filters: - log.debug(f"filter: {filter}") - if "class" not in filter: - log.warn(f"filter option: {filter} with no class key") - continue - filter_class = filter["class"] - if filter_class in FILTER_CLASSES: - log.debug(f"found filter: {filter}") - return filter - - log.debug("Shuffle filter not used") - return None - - -def getFilterOps(app, dset_id, filters, dtype=None, chunk_shape=None): - """Get list of filter operations to be used for this dataset""" - filter_map = app["filter_map"] - - try: - if dset_id in filter_map: - log.debug(f"returning filter from filter_map for dset: {dset_id}") - return filter_map[dset_id] - except TypeError: - log.error(f"getFilterOps TypeError - dset_id: {dset_id} filter_map: {filter_map}") - raise - - compressionFilter = 
getCompressionFilter(filters) - log.debug(f"got compressionFilter: {compressionFilter}") - - filter_ops = {} - - shuffleFilter = getShuffleFilter(filters) - - if shuffleFilter and not isVlen(dtype): - shuffle_name = shuffleFilter["name"] - if shuffle_name == "shuffle": - filter_ops["shuffle"] = 1 # use regular shuffle - elif shuffle_name == "bitshuffle": - filter_ops["shuffle"] = 2 # use bitshuffle - else: - log.warn(f"unexpected shuffleFilter: {shuffle_name}") - filter_ops["shuffle"] = 0 # no shuffle - else: - filter_ops["shuffle"] = 0 # no shuffle - - if compressionFilter: - if compressionFilter["class"] == "H5Z_FILTER_DEFLATE": - filter_ops["compressor"] = "zlib" # blosc compressor - else: - if "name" in compressionFilter: - filter_ops["compressor"] = compressionFilter["name"] - else: - filter_ops["compressor"] = "lz4" # default to lz4 - if "level" not in compressionFilter: - filter_ops["level"] = 5 # medium level + dims = None + maxdims = None + shape_class = None + + if "shape" not in body: + shape_class = "H5S_SCALAR" + log.debug("getShapeJson - no shape given, using H5S_SCALAR") + return {"class": shape_class} + + body_shape = body["shape"] + log.debug(f"getShapeJson - got shape: {body_shape}") + + if isinstance(body_shape, int): + shape_class = "H5S_SIMPLE" + dims = [body_shape, ] + elif isinstance(body_shape, str): + # only valid string value is H5S_NULL or H5S_SCALAR + shape_class = body_shape + elif isinstance(body_shape, (tuple, list)): + if len(body_shape) == 0: + shape_class = "H5S_SCALAR" else: - filter_ops["level"] = int(compressionFilter["level"]) - - if filter_ops: - # save the chunk shape and dtype - filter_ops["chunk_shape"] = chunk_shape - filter_ops["dtype"] = dtype - log.debug(f"save filter ops for {dset_id}") - filter_map[dset_id] = filter_ops # save - - return filter_ops + shape_class = "H5S_SIMPLE" + dims = body_shape else: - return None + msg = f"invalid shape: {body_shape}" + log.warn(msg) + raise ValueError(msg) + if shape_class in ("H5S_NULL", "H5S_SCALAR") and dims: + msg = f"dims not valid for shape class: {body_shape}" + log.warn(msg) + raise ValueError(msg) -def getDsetRank(dset_json): - """Get rank returning 0 for sclar or NULL datashapes""" - datashape = dset_json["shape"] - if datashape["class"] == "H5S_NULL": - return 0 - if datashape["class"] == "H5S_SCALAR": - return 0 - if "dims" not in datashape: - log.warn(f"expected to find dims key in shape_json: {datashape}") - return 0 - dims = datashape["dims"] - rank = len(dims) - return rank + if dims is None and shape_class == "H5S_SIMPLE": + msg = "dims not specified for H5S_SIMPLE shape" + log.warn(msg) + raise ValueError(msg) + if dims is not None: + rank = len(dims) + for i in range(rank): + extent = dims[i] + if not isinstance(extent, int): + msg = f"Invalid shape dims: {dims}" + log.warn(msg) + raise ValueError(msg) + if extent < 0: + msg = f"shape dimension is negative for dims: {dims}" + log.warn(msg) + raise ValueError(msg) -def isNullSpace(dset_json): - """Return true if this dataset is a null dataspace""" - datashape = dset_json["shape"] - if datashape["class"] == "H5S_NULL": - return True + if "maxdims" in body: + maxdims = body["maxdims"] + elif isinstance(body_shape, dict) and "maxdims" in body_shape: + maxdims = body_shape["maxdims"] else: - return False + maxdims = None + # validate maxdims + if maxdims: + if dims is None: + msg = f"maxdims cannot be supplied for space class: {shape_class}" + log.warn(msg) + raise ValueError(msg) -def isScalarSpace(dset_json): - """ return true if this is a 
scalar dataset """ - datashape = dset_json["shape"] - is_scalar = False - if datashape["class"] == "H5S_NULL": - is_scalar = False - elif datashape["class"] == "H5S_SCALAR": - is_scalar = True - else: - if "dims" not in datashape: - log.warn(f"expected to find dims key in shape_json: {datashape}") - is_scalar = False + if isinstance(maxdims, int): + dim1 = maxdims + maxdims = [dim1] + elif isinstance(maxdims, list): + pass # can use as is else: - dims = datashape["dims"] - if len(dims) == 0: - # guess this properly be a H5S_SCALAR class - # but treat this as equivalent - is_scalar = True - return is_scalar + msg = f"Bad Request: maxdims is invalid: {maxdims}" + log.warn(msg) + raise ValueError(msg) + if len(dims) != len(maxdims): + msg = "max_dims rank doesn't match dims" + log.warn(msg) + raise ValueError(msg) + # return json description of shape + shape_json = {"class": shape_class} + if shape_class == "H5S_SIMPLE": + shape_json["dims"] = dims + if maxdims: + shape_json["maxdims"] = maxdims + log.debug(f"getShapeJson - returning shape_json: {shape_json}") + return shape_json -def getHyperslabSelection(dsetshape, start=None, stop=None, step=None): + +def getHyperslabSelection(dims, start=None, stop=None, step=None): """ Get slices given lists of start, stop, step values TBD: for step>1, adjust the slice to not extend beyond last data point returned """ - rank = len(dsetshape) + + if len(dims) == 0: + # scalar dataset + dims = (1,) + + rank = len(dims) if start: if not isinstance(start, (list, tuple)): start = [start] @@ -273,7 +132,7 @@ def getHyperslabSelection(dsetshape, start=None, stop=None, step=None): log.warn(msg) raise HTTPBadRequest(reason=msg) for dim in range(rank): - if start[dim] < 0 or start[dim] >= dsetshape[dim]: + if start[dim] < 0 or start[dim] >= dims[dim]: msg = "Bad Request: start index invalid for dim: " + str(dim) log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -290,14 +149,14 @@ def getHyperslabSelection(dsetshape, start=None, stop=None, step=None): log.warn(msg) raise HTTPBadRequest(reason=msg) for dim in range(rank): - if stop[dim] <= start[dim] or stop[dim] > dsetshape[dim]: + if stop[dim] <= start[dim] or stop[dim] > dims[dim]: msg = "Bad Request: stop index invalid for dim: " + str(dim) log.warn(msg) raise HTTPBadRequest(reason=msg) else: stop = [] for dim in range(rank): - stop.append(dsetshape[dim]) + stop.append(dims[dim]) if step: if not isinstance(step, (list, tuple)): @@ -307,7 +166,7 @@ def getHyperslabSelection(dsetshape, start=None, stop=None, step=None): log.warn(msg) raise HTTPBadRequest(reason=msg) for dim in range(rank): - if step[dim] <= 0 or step[dim] > dsetshape[dim]: + if step[dim] <= 0 or step[dim] > dims[dim]: msg = "Bad Request: step index invalid for dim: " + str(dim) log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -373,40 +232,6 @@ def getSelectionShape(selection): return shape -def getShapeDims(shape): - """ - Get dims from a given shape json. 
Return [1,] for Scalar datasets, - None for null dataspaces - """ - dims = None - if isinstance(shape, int): - dims = [shape, ] - elif isinstance(shape, list) or isinstance(shape, tuple): - dims = shape # can use as is - elif isinstance(shape, str): - # only valid string value is H5S_NULL - if shape != "H5S_NULL": - raise ValueError("Invalid value for shape") - dims = None - elif isinstance(shape, dict): - if "class" not in shape: - raise ValueError("'class' key not found in shape") - if shape["class"] == "H5S_NULL": - dims = None - elif shape["class"] == "H5S_SCALAR": - dims = [1,] - elif shape["class"] == "H5S_SIMPLE": - if "dims" not in shape: - raise ValueError("'dims' key expected for shape") - dims = shape["dims"] - else: - raise ValueError("Unknown shape class: {}".format(shape["class"])) - else: - raise ValueError(f"Unexpected shape class: {type(shape)}") - - return dims - - def isSelectAll(slices, dims): """ return True if the selection covers the entire dataspace """ if len(slices) != len(dims): @@ -427,11 +252,11 @@ def isSelectAll(slices, dims): def getQueryParameter(request, query_name, body=None, default=None): """ - Herlper function, get query parameter value from request. + Helper function, get query parameter value from request. If body is provided (as a JSON object) look in JSON and if not found look for query param. Return default value (or None) if not found """ - # as a convience, look up different capitilizations of query name + # as a convenience, look up different capitalizations of query name params = request.rel_url.query query_names = [] query_names.append(query_name.lower()) @@ -669,20 +494,25 @@ def get_slices(select, dset_json): dset_id = dset_json["id"] datashape = dset_json["shape"] - if datashape["class"] == "H5S_NULL": + shape_class = datashape["class"] + if shape_class == "H5S_NULL": msg = "Null space datasets can not be used as target for GET value" log.warn(msg) raise HTTPBadRequest(reason=msg) - dims = getShapeDims(datashape) # throws 400 for HS_NULL dsets + if shape_class == "H5S_SCALAR": + # return single slice + slices = [slice(0, 1, 1), ] + else: + dims = getShapeDims(datashape) # throws 400 for HS_NULL dsets - try: - slices = getSelectionList(select, dims) - except ValueError: - msg = f"Invalid selection: {select} on dims: {dims} " - msg += f"for dataset: {dset_id}" - log.warn(msg) - raise + try: + slices = getSelectionList(select, dims) + except ValueError: + msg = f"Invalid selection: {select} on dims: {dims} " + msg += f"for dataset: {dset_id}" + log.warn(msg) + raise return slices @@ -861,58 +691,10 @@ def setChunkDimQueryParam(params, dims): extent = dims[i] dim_param += str(extent) dim_param += "]" - log.debug("dim query param: {}".format(dim_param)) + log.debug(f"dim query param: {dim_param}") params["dim"] = dim_param -def getDsetMaxDims(dset_json): - """ - Get maxdims from a given shape. Return [1,] for Scalar datasets - - Use with H5S_NULL datasets will throw a 400 error. 
- """ - if "shape" not in dset_json: - log.error("No shape found in dset_json") - raise HTTPInternalServerError() - shape_json = dset_json["shape"] - maxdims = None - if shape_json["class"] == "H5S_NULL": - msg = "Expected shape class other than H5S_NULL" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - elif shape_json["class"] == "H5S_SCALAR": - maxdims = [ - 1, - ] - elif shape_json["class"] == "H5S_SIMPLE": - if "maxdims" in shape_json: - maxdims = shape_json["maxdims"] - else: - log.error("Unexpected shape class: {}".format(shape_json["class"])) - raise HTTPInternalServerError() - return maxdims - - -def getChunkLayout(dset_json): - """Get chunk layout. Throw 500 if used with non-H5D_CHUNKED layout""" - if "layout" not in dset_json: - log.error("No layout found in dset_json") - raise HTTPInternalServerError() - layout_json = dset_json["layout"] - if "class" not in layout_json: - log.error(f"Expected class key for layout: {layout_json}") - raise HTTPInternalServerError() - layout_class = layout_json["class"] - if layout_class not in CHUNK_LAYOUT_CLASSES: - log.error(f"Unexpected shape layout: {layout_class}") - raise HTTPInternalServerError() - if "dims" not in layout_json: - log.error(f"Expected dims key in layout: {layout_json}") - raise HTTPInternalServerError() - layout = layout_json["dims"] - return layout - - def getChunkInitializer(dset_json): """ get initializer application and arguments if set """ initializer = None @@ -964,65 +746,6 @@ def getPreviewQuery(dims): return select -def isExtensible(dims, maxdims): - """ - Determine if the dataset can be extended - """ - if maxdims is None or len(dims) == 0: - return False - rank = len(dims) - if len(maxdims) != rank: - raise ValueError("rank of maxdims does not match dataset") - for n in range(rank): - # TBD - shouldn't have H5S_UNLIMITED in any new files. 
- # Remove check once this is confirmed - if maxdims[n] in (0, "H5S_UNLIMITED") or maxdims[n] > dims[n]: - return True - return False - - -def getDatasetLayout(dset_json): - """ Return layout json from creation property list or layout json """ - layout = None - - if "creationProperties" in dset_json: - cp = dset_json["creationProperties"] - if "layout" in cp: - layout = cp["layout"] - if not layout and "layout" in dset_json: - layout = dset_json["layout"] - if not layout: - log.warn(f"no layout for {dset_json}") - return layout - - -def getDatasetLayoutClass(dset_json): - """ return layout class """ - layout = getDatasetLayout(dset_json) - if layout and "class" in layout: - layout_class = layout["class"] - else: - layout_class = None - return layout_class - - -def getChunkDims(dset_json): - """ get chunk shape for given dset_json """ - - layout = getDatasetLayout(dset_json) - if layout and "dims" in layout: - return layout["dims"] - else: - # H5D_COMPACT and H5D_CONTIGUOUS will not have a dims key - # Check the layout dict in dset_json to see if it's - # defined there - if "layout" in dset_json: - layout = dset_json["layout"] - if "dims" in layout: - return layout["dims"] - return None - - class ItemIterator: """ Class to iterator through items in a selection diff --git a/hsds/util/fileClient.py b/hsds/util/fileClient.py index 1bc5e786..0d7d88ba 100644 --- a/hsds/util/fileClient.py +++ b/hsds/util/fileClient.py @@ -88,7 +88,7 @@ def _getFileStats(self, filepath, data=None): return key_stats def _file_stats_increment(self, counter, inc=1): - """Incremenet the indicated connter""" + """Increment the indicated counter""" if "file_stats" not in self._app: # setup stats file_stats = {} @@ -173,8 +173,34 @@ async def get_object(self, key, bucket=None, offset=0, length=-1): msg = f"Unexpected Exception {type(e)} get get_object {key}: {e}" log.error(msg) raise HTTPInternalServerError() + + posix_delay = config.get("posix_delay", default=0.0) + if posix_delay > 0.0: + log.warn(f"posix_delay for get_object, sleep for: {posix_delay}") + await asyncio.sleep(posix_delay) + return data + def _mkdir(self, dirpath): + """ create the given directory if it doesn't already exist """ + try: + dirpath = pp.normpath(dirpath) + log.debug(f"normpath: {dirpath}") + + if not pp.isdir(dirpath): + log.debug(f"mkdir({dirpath})") + mkdir(dirpath) + else: + log.debug(f"isdir {dirpath} found") + except IOError as ioe: + if ioe.errno == 17: + # likely directory was created by another process since we checked + log.warn(f"mkdir failed, {dirpath} created outside this process") + else: + msg = f"fileClient: IOError on mkdir {dirpath}: {ioe}" + log.warn(msg) + raise HTTPInternalServerError() + async def put_object(self, key, data, bucket=None): """Write data to given key. 
Returns client specific dict on success @@ -202,15 +228,7 @@ async def put_object(self, key, data, bucket=None): for key_dir in key_dirs: dirpath = pp.join(dirpath, key_dir) log.debug(f"pp.join({key_dir}) => {dirpath}") - - dirpath = pp.normpath(dirpath) - log.debug(f"normpath: {dirpath}") - - if not pp.isdir(dirpath): - log.debug(f"mkdir({dirpath})") - mkdir(dirpath) - else: - log.debug(f"isdir {dirpath} found") + self._mkdir(dirpath) log.debug(f"open({filepath}, 'wb')") async with aiofiles.open(filepath, loop=loop, mode="wb") as f: await f.write(data) @@ -242,6 +260,12 @@ async def put_object(self, key, data, bucket=None): msg = f"fileClient.put_object {key} complete, " msg += f"write_rsp: {write_rsp}" log.debug(msg) + + posix_delay = config.get("posix_delay", default=0.0) + if posix_delay > 0.0: + log.warn(f"posix_delay for put_object, sleep for: {posix_delay}") + await asyncio.sleep(posix_delay) + return write_rsp async def delete_object(self, key, bucket=None): @@ -282,7 +306,11 @@ async def delete_object(self, key, bucket=None): msg = f"Unexpected Exception {type(e)} deleting file obj {key}: {e}" log.error(msg) raise HTTPInternalServerError() - await asyncio.sleep(0) # for async compat + + posix_delay = config.get("posix_delay", default=0.0) + if posix_delay > 0.0: + log.warn(f"posix_delay for delete_object , sleep for: {posix_delay}") + await asyncio.sleep(posix_delay) # for async compat async def is_object(self, key, bucket=None): self._validateBucket(bucket) @@ -417,6 +445,11 @@ async def list_keys( msg == f"got {len(key_names)}" log.warning(msg) + posix_delay = config.get("posix_delay", default=0.0) + if posix_delay > 0.0: + log.warn(f"posix_delay for list_keys, sleep for: {posix_delay}") + await asyncio.sleep(posix_delay) + return key_names async def releaseClient(self): diff --git a/hsds/util/hdf5dtype.py b/hsds/util/hdf5dtype.py deleted file mode 100644 index 3d7d1d2f..00000000 --- a/hsds/util/hdf5dtype.py +++ /dev/null @@ -1,876 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## - -import weakref -import numpy as np - - -class Reference: - """ - Represents an HDF5 object reference - """ - - @property - def id(self): - """Low-level identifier appropriate for this object""" - return self._id - - @property - def objref(self): - """Weak reference to object""" - return self._objref # return weak ref to ref'd object - - def __init__(self, bind): - """Create a new reference by binding to - a group/dataset/committed type - """ - self._id = bind._id - self._objref = weakref.ref(bind) - - def __repr__(self): - # TBD: this is not consistent with hsds or h5py... 
- if not isinstance(self._id.id, str): - raise TypeError("Expected string id") - item = None - - collection_type = self._id.collection_type - item = f"{collection_type}/{self._id.id}" - return item - - def tolist(self): - if type(self._id.id) is not str: - raise TypeError("Expected string id") - if self._id.objtype_code == "d": - return [ - ("datasets/" + self._id.id), - ] - elif self._id.objtype_code == "g": - return [ - ("groups/" + self._id.id), - ] - elif self._id.objtype_code == "t": - return [ - ("datatypes/" + self._id.id), - ] - else: - raise TypeError("Unexpected id type") - - -class RegionReference: - """ - Represents an HDF5 region reference - """ - - @property - def id(self): - """Low-level identifier appropriate for this object""" - return self._id - - @property - def objref(self): - """Weak reference to object""" - return self._objref # return weak ref to ref'd object - - def __init__(self, bind): - """Create a new reference by binding to - a group/dataset/committed type - """ - self._id = bind._id - self._objref = weakref.ref(bind) - - def __repr__(self): - return "" - - -def special_dtype(**kwds): - """Create a new h5py "special" type. Only one keyword may be given. - - Legal keywords are: - - vlen = basetype - Base type for HDF5 variable-length datatype. This can be Python - str type or instance of np.dtype. - Example: special_dtype( vlen=str ) - - enum = (basetype, values_dict) - Create a NumPy representation of an HDF5 enumerated type. Provide - a 2-tuple containing an (integer) base dtype and a dict mapping - string names to integer values. - - ref = Reference | RegionReference - Create a NumPy representation of an HDF5 object or region reference - type.""" - - if len(kwds) != 1: - raise TypeError("Exactly one keyword may be provided") - - name, val = kwds.popitem() - - if name == "vlen": - - return np.dtype("O", metadata={"vlen": val}) - - if name == "enum": - - try: - dt, enum_vals = val - except TypeError: - msg = "Enums must be created from a 2-tuple " - msg += "(basetype, values_dict)" - raise TypeError(msg) - - dt = np.dtype(dt) - if dt.kind not in "iu": - raise TypeError("Only integer types can be used as enums") - - return np.dtype(dt, metadata={"enum": enum_vals}) - - if name == "ref": - dt = None - if val is Reference: - dt = np.dtype("S48", metadata={"ref": Reference}) - elif val is RegionReference: - dt = np.dtype("S48", metadata={"ref": RegionReference}) - else: - raise ValueError("Ref class must be Reference or RegionReference") - - return dt - - raise TypeError(f'Unknown special type "{name}"') - - -def check_dtype(**kwds): - """Check a dtype for h5py special type "hint" information. Only one - keyword may be given. - - vlen = dtype - If the dtype represents an HDF5 vlen, returns the Python base class. - Currently only builting string vlens (str) are supported. Returns - None if the dtype does not represent an HDF5 vlen. - - enum = dtype - If the dtype represents an HDF5 enumerated type, returns the dictionary - mapping string names to integer values. Returns None if the dtype does - not represent an HDF5 enumerated type. - - ref = dtype - If the dtype represents an HDF5 reference type, returns the reference - class (either Reference or RegionReference). Returns None if the dtype - does not represent an HDF5 reference type. 
- """ - - if len(kwds) != 1: - raise TypeError("Exactly one keyword may be provided") - - name, dt = kwds.popitem() - - if name not in ("vlen", "enum", "ref"): - raise TypeError('Unknown special type "%s"' % name) - - try: - return dt.metadata[name] - except TypeError: - return None - except KeyError: - return None - - -def getTypeResponse(typeItem): - """ - Convert the given type item to a predefined type string for - predefined integer and floating point types ("H5T_STD_I64LE", et. al). - For compound types, recursively iterate through the typeItem and do - same conversion for fields of the compound type.""" - response = None - if "uuid" in typeItem: - # committed type, just return uuid - response = "datatypes/" + typeItem["uuid"] - elif typeItem["class"] in ("H5T_INTEGER", "H5T_FLOAT"): - # just return the class and base for pre-defined types - response = {} - response["class"] = typeItem["class"] - response["base"] = typeItem["base"] - elif typeItem["class"] == "H5T_OPAQUE": - response = {} - response["class"] = "H5T_OPAQUE" - response["size"] = typeItem["size"] - elif typeItem["class"] == "H5T_REFERENCE": - response = {} - response["class"] = "H5T_REFERENCE" - response["base"] = typeItem["base"] - elif typeItem["class"] == "H5T_COMPOUND": - response = {} - response["class"] = "H5T_COMPOUND" - fieldList = [] - for field in typeItem["fields"]: - fieldItem = {} - fieldItem["name"] = field["name"] - fieldItem["type"] = getTypeResponse(field["type"]) # recurse call - fieldList.append(fieldItem) - response["fields"] = fieldList - else: - response = {} # otherwise, return full type - for k in typeItem.keys(): - if k == "base": - if isinstance(typeItem[k], dict): - response[k] = getTypeResponse(typeItem[k]) # recurse call - else: - response[k] = typeItem[k] # predefined type - elif k not in ("size", "base_size"): - response[k] = typeItem[k] - return response - - -def getTypeItem(dt, metadata=None): - """ - Return type info. 
- For primitive types, return string with typename - For compound types return array of dictionary items - """ - predefined_int_types = { - "int8": "H5T_STD_I8", - "uint8": "H5T_STD_U8", - "int16": "H5T_STD_I16", - "uint16": "H5T_STD_U16", - "int32": "H5T_STD_I32", - "uint32": "H5T_STD_U32", - "int64": "H5T_STD_I64", - "uint64": "H5T_STD_U64", - } - predefined_float_types = { - "float16": "H5T_IEEE_F16", - "float32": "H5T_IEEE_F32", - "float64": "H5T_IEEE_F64", - } - # print(">getTypeItem:", dt.str) - if not metadata and dt.metadata: - metadata = dt.metadata - # if metadata: - # print("> metadata:", metadata) - # if dt.shape: - # print("> shape:", dt.shape) - # if len(dt) > 1: - # print("> len:", len(dt)) - - type_info = {} - if len(dt) > 1: - # compound type - names = dt.names - type_info["class"] = "H5T_COMPOUND" - fields = [] - for name in names: - field = {"name": name} - field["type"] = getTypeItem(dt[name]) - fields.append(field) - type_info["fields"] = fields - elif dt.shape: - # array type - if dt.base == dt: - raise TypeError("Expected base type to be different than parent") - # array type - type_info["dims"] = dt.shape - type_info["class"] = "H5T_ARRAY" - # print("> array type, metadata:", metadata) - type_info["base"] = getTypeItem(dt.base, metadata=metadata) - elif dt.kind == "O": - # vlen string or data - # - # check for h5py variable length extension - - if metadata and "vlen" in metadata: - vlen_check = metadata["vlen"] - if vlen_check is not None and not isinstance(vlen_check, np.dtype): - vlen_check = np.dtype(vlen_check) - - if metadata and "ref" in metadata: - ref_check = metadata["ref"] - else: - ref_check = check_dtype(ref=dt.base) - if vlen_check == bytes: - type_info["class"] = "H5T_STRING" - type_info["length"] = "H5T_VARIABLE" - type_info["charSet"] = "H5T_CSET_ASCII" - type_info["strPad"] = "H5T_STR_NULLTERM" - elif vlen_check == str: - type_info["class"] = "H5T_STRING" - type_info["length"] = "H5T_VARIABLE" - type_info["charSet"] = "H5T_CSET_UTF8" - type_info["strPad"] = "H5T_STR_NULLTERM" - elif isinstance(vlen_check, np.dtype): - # vlen data - type_info["class"] = "H5T_VLEN" - type_info["size"] = "H5T_VARIABLE" - type_info["base"] = getTypeItem(vlen_check) - elif vlen_check is not None: - # unknown vlen type - raise TypeError("Unknown h5py vlen type: " + str(vlen_check)) - elif ref_check is not None: - # a reference type - type_info["class"] = "H5T_REFERENCE" - - if ref_check is Reference: - type_info["base"] = "H5T_STD_REF_OBJ" # objref - elif ref_check is RegionReference: - type_info["base"] = "H5T_STD_REF_DSETREG" # region ref - else: - raise TypeError("unexpected reference type") - else: - raise TypeError("unknown object type") - elif dt.kind == "V": - # void type - type_info["class"] = "H5T_OPAQUE" - type_info["size"] = dt.itemsize - type_info["tag"] = "" # todo - determine tag - elif dt.base.kind == "S": - # check for object reference - ref_check = check_dtype(ref=dt.base) - if ref_check is not None: - # a reference type - type_info["class"] = "H5T_REFERENCE" - - if ref_check is Reference: - type_info["base"] = "H5T_STD_REF_OBJ" # objref - elif ref_check is RegionReference: - type_info["base"] = "H5T_STD_REF_DSETREG" # region ref - else: - raise TypeError("unexpected reference type") - else: - # Fixed length string type - type_info["class"] = "H5T_STRING" - type_info["length"] = dt.itemsize - type_info["charSet"] = "H5T_CSET_ASCII" - type_info["strPad"] = "H5T_STR_NULLPAD" - elif dt.base.kind == "U": - # Fixed length unicode type - ref_check = 
check_dtype(ref=dt.base) - if ref_check is not None: - raise TypeError("unexpected reference type") - - # Fixed length string type with unicode support - type_info["class"] = "H5T_STRING" - - # this can be problematic if the encoding of the string is not valid, - # or reqires too many bytes. Use variable length strings to handle all - # UTF8 strings correctly - type_info["charSet"] = "H5T_CSET_UTF8" - # convert from UTF32 length to a fixed length - type_info["length"] = dt.itemsize - type_info["strPad"] = "H5T_STR_NULLPAD" - - elif dt.kind == "b": - # boolean type - h5py stores as enum - # assume LE unless the numpy byteorder is '>' - byteorder = "LE" - if dt.base.byteorder == ">": - byteorder = "BE" - # this mapping is an h5py convention for boolean support - mapping = {"FALSE": 0, "TRUE": 1} - type_info["class"] = "H5T_ENUM" - type_info["mapping"] = mapping - base_info = {"class": "H5T_INTEGER"} - base_info["base"] = "H5T_STD_I8" + byteorder - type_info["base"] = base_info - elif dt.kind == "f": - # floating point type - type_info["class"] = "H5T_FLOAT" - byteorder = "LE" - if dt.byteorder == ">": - byteorder = "BE" - if dt.name in predefined_float_types: - # maps to one of the HDF5 predefined types - float_type = predefined_float_types[dt.base.name] - type_info["base"] = float_type + byteorder - else: - raise TypeError("Unexpected floating point type: " + dt.name) - elif dt.kind == "i" or dt.kind == "u": - # integer type - - # assume LE unless the numpy byteorder is '>' - byteorder = "LE" - if dt.base.byteorder == ">": - byteorder = "BE" - - # numpy integer type - but check to see if this is the hypy - # enum extension - if metadata and "enum" in metadata: - # yes, this is an enum! - mapping = metadata["enum"] - type_info["class"] = "H5T_ENUM" - type_info["mapping"] = mapping - if dt.name not in predefined_int_types: - raise TypeError("Unexpected integer type: " + dt.name) - # maps to one of the HDF5 predefined types - base_info = {"class": "H5T_INTEGER"} - base_info["base"] = predefined_int_types[dt.name] + byteorder - type_info["base"] = base_info - else: - type_info["class"] = "H5T_INTEGER" - base_name = dt.name - - if dt.name not in predefined_int_types: - raise TypeError("Unexpected integer type: " + dt.name) - - type_info["base"] = predefined_int_types[base_name] + byteorder - - else: - # unexpected kind - raise TypeError(f"unexpected dtype kind: {dt.kind}") - - return type_info - - -def getItemSize(typeItem): - """ - Get size of an item in bytes. - For variable length types (e.g. 
variable length strings), - return the string "H5T_VARIABLE" - """ - # handle the case where we are passed a primitive type first - if isinstance(typeItem, str) or isinstance(typeItem, bytes): - for type_prefix in ("H5T_STD_I", "H5T_STD_U", "H5T_IEEE_F"): - if typeItem.startswith(type_prefix): - nlen = len(type_prefix) - num_bits = typeItem[nlen:] - if num_bits[-2:] in ("LE", "BE"): - num_bits = num_bits[:-2] - try: - return int(num_bits) // 8 - except ValueError: - raise TypeError("Invalid Type") - # none of the expect primative types mathched - raise TypeError("Invalid Type") - if not isinstance(typeItem, dict): - raise TypeError("invalid type") - - item_size = 0 - if "class" not in typeItem: - raise KeyError("'class' not provided") - typeClass = typeItem["class"] - - if typeClass == "H5T_INTEGER": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_FLOAT": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_STRING": - if "length" not in typeItem: - raise KeyError("'length' not provided") - item_size = typeItem["length"] - - elif typeClass == "H5T_VLEN": - item_size = "H5T_VARIABLE" - elif typeClass == "H5T_OPAQUE": - if "size" not in typeItem: - raise KeyError("'size' not provided") - item_size = int(typeItem["size"]) - - elif typeClass == "H5T_ARRAY": - if "dims" not in typeItem: - raise KeyError("'dims' must be provided for array types") - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_ENUM": - if "base" not in typeItem: - raise KeyError("'base' must be provided for enum types") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_REFERENCE": - if "length" in typeItem: - item_size = typeItem["length"] - elif "base" in typeItem and typeItem["base"] == "H5T_STD_REF_OBJ": - # obj ref values are in the form: "groups/" or - # "datasets/" or "datatypes/" - item_size = 48 - else: - raise KeyError("Unable to determine item size for reference type") - elif typeClass == "H5T_COMPOUND": - if "fields" not in typeItem: - raise KeyError("'fields' not provided for compound type") - fields = typeItem["fields"] - if not isinstance(fields, list): - raise TypeError("Type Error: expected list type for 'fields'") - if not fields: - raise KeyError("no 'field' elements provided") - # add up the size of each sub-field - for field in fields: - if not isinstance(field, dict): - raise TypeError("Expected dictionary type for field") - if "type" not in field: - raise KeyError("'type' missing from field") - subtype_size = getItemSize(field["type"]) # recursive call - if subtype_size == "H5T_VARIABLE": - item_size = "H5T_VARIABLE" - break # don't need to look at the rest - - item_size += subtype_size - else: - raise TypeError("Invalid type class") - - # calculate array type - if "dims" in typeItem and isinstance(item_size, int): - dims = typeItem["dims"] - for dim in dims: - item_size *= dim - - return item_size - - -def getDtypeItemSize(dtype): - """ Return size of dtype in bytes - For variable length types (e.g. 
variable length strings), - return the string "H5T_VARIABLE - """ - item_size = 0 - if len(dtype) > 0: - # compound dtype - for i in range(len(dtype)): - sub_dt = dtype[i] - sub_dt_size = getDtypeItemSize(sub_dt) - if sub_dt_size == "H5T_VARIABLE": - item_size = "H5T_VARIABLE" # return variable if any component is variable - break - item_size += sub_dt_size - else: - # primitive type - if dtype.metadata and "vlen" in dtype.metadata: - item_size = "H5T_VARIABLE" - else: - item_size = dtype.itemsize - return item_size - - -def getNumpyTypename(hdf5TypeName, typeClass=None): - predefined_int_types = { - "H5T_STD_I8": "i1", - "H5T_STD_U8": "u1", - "H5T_STD_I16": "i2", - "H5T_STD_U16": "u2", - "H5T_STD_I32": "i4", - "H5T_STD_U32": "u4", - "H5T_STD_I64": "i8", - "H5T_STD_U64": "u8", - } - predefined_float_types = { - "H5T_IEEE_F16": "f2", - "H5T_IEEE_F32": "f4", - "H5T_IEEE_F64": "f8", - } - - if len(hdf5TypeName) < 3: - raise Exception("Type Error: invalid typename: ") - endian = "<" # default endian - key = hdf5TypeName - if hdf5TypeName.endswith("LE"): - key = hdf5TypeName[:-2] - elif hdf5TypeName.endswith("BE"): - key = hdf5TypeName[:-2] - endian = ">" - - if key in predefined_int_types and ( - typeClass is None or typeClass == "H5T_INTEGER" - ): - return endian + predefined_int_types[key] - if key in predefined_float_types and ( - typeClass is None or typeClass == "H5T_FLOAT" - ): - return endian + predefined_float_types[key] - raise TypeError("Type Error: invalid type") - - -def createBaseDataType(typeItem): - dtRet = None - if isinstance(typeItem, str): - # should be one of the predefined types - dtName = getNumpyTypename(typeItem) - dtRet = np.dtype(dtName) - return dtRet # return predefined type - - if not isinstance(typeItem, dict): - raise TypeError("Type Error: invalid type") - - if "class" not in typeItem: - raise KeyError("'class' not provided") - typeClass = typeItem["class"] - - dims = "" - if "dims" in typeItem: - if typeClass != "H5T_ARRAY": - raise TypeError("'dims' only supported for integer types") - - dims = None - if isinstance(typeItem["dims"], int): - dims = typeItem["dims"] # make into a tuple - elif not isinstance(typeItem["dims"], list) and not isinstance( - typeItem["dims"], tuple - ): - raise TypeError("expected list or integer for dims") - else: - dims = typeItem["dims"] - dims = str(tuple(dims)) - - if typeClass == "H5T_INTEGER": - if "base" not in typeItem: - raise KeyError("'base' not provided") - baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_INTEGER") - dtRet = np.dtype(dims + baseType) - elif typeClass == "H5T_FLOAT": - if "base" not in typeItem: - raise KeyError("'base' not provided") - baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_FLOAT") - dtRet = np.dtype(dims + baseType) - elif typeClass == "H5T_STRING": - if "length" not in typeItem: - raise KeyError("'length' not provided") - if "charSet" not in typeItem: - raise KeyError("'charSet' not provided") - - if typeItem["length"] == "H5T_VARIABLE": - if dims: - msg = "ArrayType is not supported for variable len types" - raise TypeError(msg) - if typeItem["charSet"] == "H5T_CSET_ASCII": - dtRet = special_dtype(vlen=bytes) - elif typeItem["charSet"] == "H5T_CSET_UTF8": - dtRet = special_dtype(vlen=str) - else: - raise TypeError("unexpected 'charSet' value") - else: - nStrSize = typeItem["length"] - if not isinstance(nStrSize, int): - raise TypeError("expecting integer value for 'length'") - type_code = None - if typeItem["charSet"] == "H5T_CSET_ASCII": - type_code = "S" - elif 
typeItem["charSet"] == "H5T_CSET_UTF8": - # use the same type_code as ascii strings - # (othewise, numpy will reserve bytes for UTF32 representation) - type_code = "S" - else: - raise TypeError("unexpected 'charSet' value") - # a fixed size string - dtRet = np.dtype(dims + type_code + str(nStrSize)) - elif typeClass == "H5T_VLEN": - if dims: - msg = "ArrayType is not supported for variable len types" - raise TypeError(msg) - if "base" not in typeItem: - raise KeyError("'base' not provided") - baseType = createBaseDataType(typeItem["base"]) - dtRet = special_dtype(vlen=np.dtype(baseType)) - elif typeClass == "H5T_OPAQUE": - if dims: - msg = "Opaque Type is not supported for variable len types" - raise TypeError(msg) - if "size" not in typeItem: - raise KeyError("'size' not provided") - nSize = int(typeItem["size"]) - if nSize <= 0: - raise TypeError("'size' must be non-negative") - dtRet = np.dtype("V" + str(nSize)) - elif typeClass == "H5T_ARRAY": - if not dims: - raise KeyError("'dims' must be provided for array types") - if "base" not in typeItem: - raise KeyError("'base' not provided") - arrayBaseType = typeItem["base"] - if isinstance(arrayBaseType, dict): - if "class" not in arrayBaseType: - raise KeyError("'class' not provided for array base type") - type_classes = ("H5T_INTEGER", "H5T_FLOAT", "H5T_STRING", "H5T_ARRAY") - if arrayBaseType["class"] not in type_classes: - msg = "Array Type base type must be integer, float, string, or array" - raise TypeError(msg) - baseType = createDataType(arrayBaseType) - metadata = None - if baseType.metadata: - metadata = dict(baseType.metadata) - dtRet = np.dtype(dims + baseType.str, metadata=metadata) - else: - dtRet = np.dtype(dims + baseType.str) - return dtRet # return predefined type - elif typeClass == "H5T_REFERENCE": - if "base" not in typeItem: - raise KeyError("'base' not provided") - if typeItem["base"] == "H5T_STD_REF_OBJ": - dtRet = special_dtype(ref=Reference) - elif typeItem["base"] == "H5T_STD_REF_DSETREG": - dtRet = special_dtype(ref=RegionReference) - else: - raise TypeError("Invalid base type for reference type") - - elif typeClass == "H5T_ENUM": - if "base" not in typeItem: - raise KeyError("Expected 'base' to be provided for enum type") - base_json = typeItem["base"] - if "class" not in base_json: - raise KeyError("Expected class field in base type") - if base_json["class"] != "H5T_INTEGER": - msg = "Only integer base types can be used with enum type" - raise TypeError(msg) - if "mapping" not in typeItem: - raise KeyError("'mapping' not provided for enum type") - mapping = typeItem["mapping"] - if len(mapping) == 0: - raise KeyError("empty enum map") - - dt = createBaseDataType(base_json) - if all( - ( - dt.kind == "i", - dt.name == "int8", - len(mapping) == 2, - "TRUE" in mapping, - "FALSE" in mapping, - ) - ): - # convert to numpy boolean type - dtRet = np.dtype("bool") - else: - # not a boolean enum, use h5py special dtype - dtRet = special_dtype(enum=(dt, mapping)) - - else: - raise TypeError("Invalid type class") - - return dtRet - - -def createDataType(typeItem): - """ - Create a numpy datatype given a json type - """ - dtRet = None - if type(typeItem) in (str, bytes): - # should be one of the predefined types - dtName = getNumpyTypename(typeItem) - dtRet = np.dtype(dtName) - return dtRet # return predefined type - - if not isinstance(typeItem, dict): - raise TypeError("invalid type") - - if "class" not in typeItem: - raise KeyError("'class' not provided") - typeClass = typeItem["class"] - - if typeClass == 
"H5T_COMPOUND": - if "fields" not in typeItem: - raise KeyError("'fields' not provided for compound type") - fields = typeItem["fields"] - if type(fields) is not list: - raise TypeError("Type Error: expected list type for 'fields'") - if not fields: - raise KeyError("no 'field' elements provided") - subtypes = [] - for field in fields: - - if not isinstance(field, dict): - raise TypeError("Expected dictionary type for field") - if "name" not in field: - raise KeyError("'name' missing from field") - if "type" not in field: - raise KeyError("'type' missing from field") - field_name = field["name"] - if not isinstance(field_name, str): - raise TypeError("field names must be strings") - # verify the field name is ascii - try: - field_name.encode("ascii") - except UnicodeEncodeError: - raise TypeError("non-ascii field name not allowed") - - dt = createDataType(field["type"]) # recursive call - if dt is None: - raise Exception("unexpected error") - subtypes.append((field["name"], dt)) # append tuple - - dtRet = np.dtype(subtypes) - else: - dtRet = createBaseDataType(typeItem) # create non-compound dt - return dtRet - - -def validateTypeItem(typeItem): - """ - Validate a json type - call createDataType and if no exception, - it's valid - """ - createDataType(typeItem) - # throws KeyError, TypeError, or ValueError - - -def getBaseTypeJson(type_name): - """ - Return JSON representation of a predefined type string - """ - predefined_int_types = ( - "H5T_STD_I8", - "H5T_STD_U8", - "H5T_STD_I16", - "H5T_STD_U16", - "H5T_STD_I32", - "H5T_STD_U32", - "H5T_STD_I64", - "H5T_STD_U64", - ) - predefined_float_types = ("H5T_IEEE_F16", "H5T_IEEE_F32", "H5T_IEEE_F64") - type_json = {} - # predefined typenames start with 'H5T' and end with "LE" or "BE" - if all( - ( - type_name.startswith("H5T_"), - type_name[-1] == "E", - type_name[-2] in ("L", "B"), - ) - ): - # trime of the "BE/"LE" - type_prefix = type_name[:-2] - if type_prefix in predefined_int_types: - type_json["class"] = "H5T_INTEGER" - type_json["base"] = type_name - elif type_prefix in predefined_float_types: - type_json["class"] = "H5T_FLOAT" - type_json["base"] = type_name - else: - raise TypeError("Invalid type name") - else: - raise TypeError("Invalid type name") - return type_json - - -def getSubType(dt_parent, fields): - """ Return a dtype that is a compound type composed of - the fields given in the field_names list - """ - if len(dt_parent) == 0: - raise TypeError("getSubType - parent must be compound type") - if not fields: - raise TypeError("null field specification") - if isinstance(fields, str): - fields = [fields,] # convert to a list - - field_names = set(dt_parent.names) - dt_items = [] - for field in fields: - if field not in field_names: - raise TypeError(f"field: {field} is not defined in parent type") - dt_items.append((field, dt_parent[field])) - dt = np.dtype(dt_items) - - return dt diff --git a/hsds/util/httpUtil.py b/hsds/util/httpUtil.py index 0d43ae4a..3ca19f19 100644 --- a/hsds/util/httpUtil.py +++ b/hsds/util/httpUtil.py @@ -25,7 +25,8 @@ from aiohttp.web_exceptions import HTTPRequestEntityTooLarge from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPBadRequest from aiohttp.client_exceptions import ClientError -from hsds.util.idUtil import isValidUuid + +from h5json.objid import isValidUuid from .. import hsds_logger as log from .. 
import config diff --git a/hsds/util/idUtil.py b/hsds/util/idUtil.py deleted file mode 100644 index fe21bbb0..00000000 --- a/hsds/util/idUtil.py +++ /dev/null @@ -1,540 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## -# -# idUtil: -# id (uuid) related functions -# - -import os.path -import hashlib -import uuid -from aiohttp.web_exceptions import HTTPServiceUnavailable -from .. import hsds_logger as log - - -S3_URI = "s3://" -FILE_URI = "file://" -AZURE_URI = "blob.core.windows.net/" # preceded with "https://" - - -def _getStorageProtocol(uri): - """ returns 's3://', 'file://', or 'https://...net/' prefix if present. - If the prefix is in the form: https://myaccount.blob.core.windows.net/mycontainer - (references Azure blob storage), return: https://myaccount.blob.core.windows.net/ - otherwise None """ - - if not uri: - protocol = None - elif uri.startswith(S3_URI): - protocol = S3_URI - elif uri.startswith(FILE_URI): - protocol = FILE_URI - elif uri.startswith("https://") and uri.find(AZURE_URI) > 0: - n = uri.find(AZURE_URI) + len(AZURE_URI) - protocol = uri[:n] - elif uri.find("://") >= 0: - raise ValueError(f"storage uri: {uri} not supported") - else: - protocol = None - return protocol - - -def _getBaseName(uri): - """ Return the part of the URI after the storage protocol (if any) """ - - protocol = _getStorageProtocol(uri) - if not protocol: - return uri - else: - return uri[len(protocol):] - - -def getIdHash(id): - """Return md5 prefix based on id value""" - m = hashlib.new("md5") - m.update(id.encode("utf8")) - hexdigest = m.hexdigest() - return hexdigest[:5] - - -def isSchema2Id(id): - """return true if this is a v2 id""" - # v1 ids are in the standard UUID format: 8-4-4-4-12 - # v2 ids are in the non-standard: 8-8-4-6-6 - parts = id.split("-") - if len(parts) != 6: - raise ValueError(f"Unexpected id formation for uuid: {id}") - if len(parts[2]) == 8: - return True - else: - return False - - -def getIdHexChars(id): - """get the hex chars of the given id""" - if id[0] == "c": - # don't include chunk index - index = id.index("_") - parts = id[0:index].split("-") - else: - parts = id.split("-") - if len(parts) != 6: - raise ValueError(f"Unexpected id format for uuid: {id}") - return "".join(parts[1:]) - - -def hexRot(ch): - """rotate hex character by 8""" - return format((int(ch, base=16) + 8) % 16, "x") - - -def isRootObjId(id): - """returns true if this is a root id (only for v2 schema)""" - if not isSchema2Id(id): - raise ValueError("isRootObjId can only be used with v2 ids") - validateUuid(id) # will throw ValueError exception if not a objid - if id[0] != "g": - return False # not a group - token = getIdHexChars(id) - # root ids will have last 16 chars rotated version of the first 16 - is_root = True - for i in range(16): - if token[i] != hexRot(token[i + 16]): - is_root = False - break - return is_root - - -def getRootObjId(id): - """returns root id for this objid if this is a root id - (only for v2 schema) - 
""" - if isRootObjId(id): - return id # this is the root id - token = list(getIdHexChars(id)) - # root ids will have last 16 chars rotated version of the first 16 - for i in range(16): - token[i + 16] = hexRot(token[i]) - token = "".join(token) - root_id = "g-" + token[0:8] + "-" + token[8:16] + "-" + token[16:20] - root_id += "-" + token[20:26] + "-" + token[26:32] - - return root_id - - -def createObjId(obj_type, rootid=None): - if obj_type not in ("groups", "datasets", "datatypes", "chunks", "roots"): - raise ValueError("unexpected obj_type") - - prefix = None - if obj_type == "datatypes": - prefix = "t" # don't collide with datasets - elif obj_type == "roots": - prefix = "g" # root obj is a group - else: - prefix = obj_type[0] - if not rootid and obj_type != "roots": - # v1 schema - folder - objid = prefix + "-" + str(uuid.uuid1()) - elif rootid and not isSchema2Id(rootid): - # v1 schema - domain - objid = prefix + "-" + str(uuid.uuid1()) - else: - # schema v2 - salt = uuid.uuid4().hex - # take a hash to randomize the uuid - token = list(hashlib.sha256(salt.encode()).hexdigest()) - - if rootid: - # replace first 16 chars of token with first 16 chars of root id - root_hex = getIdHexChars(rootid) - token[0:16] = root_hex[0:16] - else: - # obj_type == "roots" - # use only 16 chars, but make it look a 32 char id - for i in range(16): - token[16 + i] = hexRot(token[i]) - # format as a string - token = "".join(token) - objid = prefix + "-" + token[0:8] + "-" + token[8:16] + "-" - objid += token[16:20] + "-" + token[20:26] + "-" + token[26:32] - - return objid - - -def getS3Key(id): - """Return s3 key for given id. - - For schema v1: - A md5 prefix is added to the front of the returned key to better - distribute S3 objects. - For schema v2: - The id is converted to the pattern: "db/{rootid[0:16]}" for rootids and - "db/id[0:16]/{prefix}/id[16-32]" for other ids - Chunk ids have the chunk index added after the slash: - "db/id[0:16]/d/id[16:32]/x_y_z - - For domain id's: - Return a key with the .domain suffix and no preceding slash. - For non-default buckets, use the format: /s3_key - If the id has a storage specifier ("s3://", "file://", etc.) - include that along with the bucket name. e.g.: "s3://mybucket/a_folder/a_file.h5" - """ - - base_id = _getBaseName(id) # strip any s3://, etc. 
- if base_id.find("/") > 0: - # a domain id - domain_suffix = ".domain.json" - index = base_id.find("/") + 1 - key = base_id[index:] - if not key.endswith(domain_suffix): - if key[-1] != "/": - key += "/" - key += domain_suffix - else: - if isSchema2Id(id): - # schema v2 id - hexid = getIdHexChars(id) - prefix = id[0] # one of g, d, t, c - if prefix not in ("g", "d", "t", "c"): - raise ValueError(f"Unexpected id: {id}") - - if isRootObjId(id): - key = f"db/{hexid[0:8]}-{hexid[8:16]}" - else: - partition = "" - if prefix == "c": - # use 'g' so that chunks will show up under their dataset - s3col = "d" - n = id.find("-") - if n > 1: - # extract the partition index if present - partition = "p" + id[1:n] - else: - s3col = prefix - key = f"db/{hexid[0:8]}-{hexid[8:16]}/{s3col}/{hexid[16:20]}" - key += f"-{hexid[20:26]}-{hexid[26:32]}" - if prefix == "c": - if partition: - key += "/" - key += partition - # add the chunk coordinate - index = id.index("_") # will raise ValueError if not found - n = index + 1 - coord = id[n:] - key += "/" - key += coord - elif prefix == "g": - # add key suffix for group - key += "/.group.json" - elif prefix == "d": - # add key suffix for dataset - key += "/.dataset.json" - else: - # add key suffix for datatype - key += "/.datatype.json" - else: - # v1 id - # schema v1 id - idhash = getIdHash(id) - key = f"{idhash}-{id}" - - return key - - -def getObjId(s3key): - """Return object id given valid s3key""" - if all( - ( - len(s3key) >= 44 and s3key[0:5].isalnum(), - len(s3key) >= 44 and s3key[5] == "-", - len(s3key) >= 44 and s3key[6] in ("g", "d", "c", "t"), - ) - ): - # v1 obj keys - objid = s3key[6:] - elif s3key.endswith("/.domain.json"): - objid = "/" + s3key[: -(len("/.domain.json"))] - elif s3key.startswith("db/"): - # schema v2 object key - parts = s3key.split("/") - chunk_coord = "" # used only for chunk ids - partition = "" # likewise - token = [] - for ch in parts[1]: - if ch != "-": - token.append(ch) - - if len(parts) == 3: - # root id - # last part should be ".group.json" - if parts[2] != ".group.json": - raise ValueError(f"unexpected S3Key: {s3key}") - # add 16 more chars using rotated version of first 16 - for i in range(16): - token.append(hexRot(token[i])) - prefix = "g" - elif len(parts) == 5: - # group, dataset, or datatype or chunk - for ch in parts[3]: - if ch != "-": - token.append(ch) - - if parts[2] == "g" and parts[4] == ".group.json": - prefix = "g" # group json - elif parts[2] == "t" and parts[4] == ".datatype.json": - prefix = "t" # datatype json - elif parts[2] == "d": - if parts[4] == ".dataset.json": - prefix = "d" # dataset json - else: - # chunk object - prefix = "c" - chunk_coord = "_" + parts[4] - else: - raise ValueError(f"unexpected S3Key: {s3key}") - elif len(parts) == 6: - # chunk key with partitioning - for ch in parts[3]: - if ch != "-": - token.append(ch) - if parts[2][0] != "d": - raise ValueError(f"unexpected S3Key: {s3key}") - prefix = "c" - partition = parts[4] - if partition[0] != "p": - raise ValueError(f"unexpected S3Key: {s3key}") - partition = partition[1:] # strip off the p - chunk_coord = "_" + parts[5] - else: - raise ValueError(f"unexpected S3Key: {s3key}") - - token = "".join(token) - objid = prefix + partition + "-" + token[0:8] + "-" + token[8:16] - objid += "-" + token[16:20] + "-" + token[20:26] + "-" - objid += token[26:32] + chunk_coord - else: - msg = f"unexpected S3Key: {s3key}" - log.warn(msg) - raise ValueError(msg) - return objid - - -def isS3ObjKey(s3key): - valid = False - try: - objid = 
getObjId(s3key) - if objid: - valid = True - except KeyError: - pass # ignore - except ValueError: - pass # ignore - return valid - - -def createNodeId(prefix, node_number=None): - """Create a random id used to identify nodes""" - node_id = "" # nothing too bad happens if this doesn't get set - if node_number is not None: - # just make an id based on the node_number - hash_key = f"{node_number + 1:03d}" - else: - # use the container id if we are running inside docker - hash_key = getIdHash(str(uuid.uuid1())) - proc_file = "/proc/self/cgroup" - if os.path.isfile(proc_file): - with open(proc_file) as f: - first_line = f.readline() - if first_line: - fields = first_line.split(":") - if len(fields) >= 3: - field = fields[2] - if field.startswith("/docker/"): - docker_len = len("/docker/") - - if len(field) > docker_len + 12: - n = docker_len - m = n + 12 - node_id = field[n:m] - - if node_id: - key = f"{prefix}-{node_id}-{hash_key}" - else: - key = f"{prefix}-{hash_key}" - return key - - -def getCollectionForId(obj_id): - """return groups/datasets/datatypes based on id""" - if not isinstance(obj_id, str): - raise ValueError("invalid object id") - collection = None - if obj_id.startswith("g-"): - collection = "groups" - elif obj_id.startswith("d-"): - collection = "datasets" - elif obj_id.startswith("t-"): - collection = "datatypes" - else: - raise ValueError("not a collection id") - return collection - - -def validateUuid(id, obj_class=None): - if not isinstance(id, str): - raise ValueError("Expected string type") - if len(id) < 38: - # id should be prefix (e.g. "g-") and uuid value - raise ValueError("Unexpected id length") - if id[0] not in ("g", "d", "t", "c"): - raise ValueError("Unexpected prefix") - if id[0] != "c" and id[1] != "-": - # chunk ids may have a partition index following the c - raise ValueError("Unexpected prefix") - if obj_class is not None: - obj_class = obj_class.lower() - prefix = obj_class[0] - if obj_class.startswith("datatype"): - prefix = "t" - if id[0] != prefix: - raise ValueError(f"Unexpected prefix for class: {obj_class}") - if id[0] == "c": - # trim the type char and any partition id - n = id.find("-") - if n == -1: - raise ValueError("Invalid chunk id") - - # trim the chunk index for chunk ids - m = id.find("_") - if m == -1: - raise ValueError("Invalid chunk id") - n += 1 - id = "c-" + id[n:m] - if len(id) != 38: - # id should be 36 now - raise ValueError("Unexpected id length") - - for ch in id: - if ch.isalnum(): - continue - if ch == "-": - continue - raise ValueError(f"Unexpected character in uuid: {ch}") - - -def isValidUuid(id, obj_class=None): - try: - validateUuid(id, obj_class) - return True - except ValueError: - return False - - -def isValidChunkId(id): - if not isValidUuid(id): - return False - if id[0] != "c": - return False - return True - - -def getClassForObjId(id): - """return domains/chunks/groups/datasets/datatypes based on id""" - if not isinstance(id, str): - raise ValueError("Expected string type") - if len(id) == 0: - raise ValueError("Empty string") - if id[0] == "/": - return "domains" - if isValidChunkId(id): - return "chunks" - else: - return getCollectionForId(id) - - -def isObjId(id): - """return true if uuid or domain""" - if not isinstance(id, str) or len(id) == 0: - return False - if id.find("/") > 0: - # domain id is any string in the form / - return True - return isValidUuid(id) - - -def getUuidFromId(id): - """strip off the type prefix ('g-' or 'd-', or 't-') - and return the uuid part""" - return id[2:] - - -def 
getObjPartition(id, count): - """Get the id of the dn node that should be handling the given obj id""" - hash_code = getIdHash(id) - hash_value = int(hash_code, 16) - number = hash_value % count - return number - - -def getNodeNumber(app): - if app["node_type"] == "sn": - log.error("node number if only for DN nodes") - raise ValueError() - - dn_ids = app["dn_ids"] - log.debug(f"getNodeNumber(from dn_ids: {dn_ids})") - for i in range(len(dn_ids)): - dn_id = dn_ids[i] - if dn_id == app["id"]: - log.debug(f"returning nodeNumber: {i}") - return i - log.error("getNodeNumber, no matching id") - return -1 - - -def getNodeCount(app): - dn_urls = app["dn_urls"] - log.debug(f"getNodeCount for dn_urls: {dn_urls}") - dn_node_count = len(dn_urls) - return dn_node_count - - -def validateInPartition(app, obj_id): - node_number = getNodeNumber(app) - node_count = getNodeCount(app) - msg = f"obj_id: {obj_id}, node_count: {node_count}, " - msg += f"node_number: {node_number}" - log.debug(msg) - partition_number = getObjPartition(obj_id, node_count) - if partition_number != node_number: - # The request shouldn't have come to this node' - msg = f"wrong node for 'id':{obj_id}, expected node {node_number} " - msg += f"got {partition_number}" - log.error(msg) - raise KeyError(msg) - - -def getDataNodeUrl(app, obj_id): - """Return host/port for datanode for given obj_id. - Throw exception if service is not ready""" - dn_urls = app["dn_urls"] - dn_node_count = getNodeCount(app) - node_state = app["node_state"] - if node_state != "READY" or dn_node_count <= 0: - msg = "Service not ready" - log.warn(msg) - raise HTTPServiceUnavailable() - dn_number = getObjPartition(obj_id, dn_node_count) - url = dn_urls[dn_number] - log.debug(f"got dn_url: {url} for obj_id: {obj_id}") - return url diff --git a/hsds/util/linkUtil.py b/hsds/util/linkUtil.py index 3469a8a1..65939e7d 100644 --- a/hsds/util/linkUtil.py +++ b/hsds/util/linkUtil.py @@ -13,122 +13,71 @@ # linkdUtil: # link related functions # +from h5json.time_util import getNow +from h5json.link_util import validateLinkName, getLinkClass, getLinkPath, getLinkFilePath from .. 
import hsds_logger as log -def validateLinkName(name): - """ verify the link name is valid """ - if not isinstance(name, str): - msg = "Unexpected type for link name" - log.warn(msg) - raise ValueError(msg) - if name.find("/") >= 0: - msg = "link name contains slash" +def getRequestLink(title, link_json, predate_max_time=0.0): + """ return normalized link from request json + Throw ValueError if badly formatted """ + + if not isinstance(link_json, dict): + msg = f"expected dict for links, but got: {type(link_json)}" log.warn(msg) raise ValueError(msg) + log.debug(f"getRequestLink title: {title} link_json: {link_json}") + link_item = {} # normalized link item to return -def getLinkClass(link_json): - """ verify this is a valid link - returns the link class """ - if "class" in link_json: - link_class = link_json["class"] - else: - link_class = None - if "h5path" in link_json and "id" in link_json: - msg = "link tgt_id and h5path both set" - log.warn(msg) - raise ValueError(msg) - if "id" in link_json: - tgt_id = link_json["id"] - if not isinstance(tgt_id, str) or len(tgt_id) < 38: - msg = f"link with invalid id: {tgt_id}" - log.warn(msg) - raise ValueError(msg) - if tgt_id[:2] not in ("g-", "t-", "d-"): - msg = "link tgt must be group, datatype or dataset uuid" + now = getNow() + + validateLinkName(title) # will raise ValueError if invalid + + link_class = getLinkClass(link_json) + + link_item = {"class": link_class} + + if link_class == "H5L_TYPE_HARD": + if "id" not in link_json: + msg = "expected id key for hard link" log.warn(msg) - raise ValueError(msg) - if link_class: - if link_class != "H5L_TYPE_HARD": - msg = f"expected link class to be H5L_TYPE_HARD but got: {link_class}" - log.warn(msg) - raise ValueError(msg) - else: - link_class = "H5L_TYPE_HARD" - elif "h5path" in link_json: - h5path = link_json["h5path"] - log.debug(f"link path: {h5path}") - if "h5domain" in link_json: - if link_class: - if link_class != "H5L_TYPE_EXTERNAL": - msg = f"expected link class to be H5L_TYPE_EXTERNAL but got: {link_class}" - log.warn(msg) - raise ValueError(msg) - else: - link_class = "H5L_TYPE_EXTERNAL" - else: - if link_class: - if link_class != "H5L_TYPE_SOFT": - msg = f"expected link class to be H5L_TYPE_SOFT but got: {link_class}" - log.warn(msg) - raise ValueError(msg) - else: - link_class = "H5L_TYPE_SOFT" + raise ValueError(msg) + link_item["id"] = link_json["id"] else: - msg = "link with no id or h5path" + if link_class in ("H5L_TYPE_SOFT", "H5L_TYPE_EXTERNAL"): + link_item["h5path"] = getLinkPath(link_json) + + if link_class == "H5L_TYPE_EXTERNAL": + link_item["file"] = getLinkFilePath(link_json) + + if "created" in link_json: + created = link_json["created"] + # allow "pre-dated" links if recent enough + if now - created < predate_max_time: + link_item["created"] = created + else: + log.warn("stale created timestamp for link, ignoring") + if "created" not in link_item: + link_item["created"] = now + + return link_item + + +def getRequestLinks(links_json, predate_max_time=0.0): + """ return dict of normalized links from request json + Throw ValueError if any is badly formatted """ + + if not isinstance(links_json, dict): + msg = f"getRequestLinks expected dict for links, but got: {type(links_json)}" log.warn(msg) raise ValueError(msg) - return link_class + links = {} # normalized link items to return + kwargs = {"predate_max_time": predate_max_time} + for title in links_json: + links[title] = getRequestLink(title, links_json[title], **kwargs) -def isEqualLink(link1, link2): - """
Return True if the two links are the same """ - - for obj in (link1, link2): - if not isinstance(obj, dict): - raise TypeError(f"unexpected type: {type(obj)}") - if "class" not in obj: - raise TypeError("expected class key for link") - if link1["class"] != link2["class"]: - return False # different link types - link_class = link1["class"] - if link_class == "H5L_TYPE_HARD": - for obj in (link1, link2): - if "id" not in obj: - raise TypeError(f"expected id key for link: {obj}") - if link1["id"] != link2["id"]: - return False - elif link_class == "H5L_TYPE_SOFT": - for obj in (link1, link2): - if "h5path" not in obj: - raise TypeError(f"expected h5path key for link: {obj}") - if link1["h5path"] != link2["h5path"]: - return False - elif link_class == "H5L_TYPE_EXTERNAL": - for obj in (link1, link2): - for k in ("h5path", "h5domain"): - if k not in obj: - raise TypeError(f"expected {k} key for link: {obj}") - if link1["h5path"] != link2["h5path"]: - return False - if link1["h5domain"] != link2["h5domain"]: - return False - else: - raise TypeError(f"unexpected link class: {link_class}") - return True - - -def h5Join(path, paths): - h5path = path - if not paths: - return h5path - if isinstance(paths, str): - paths = (paths,) - for s in paths: - if h5path[-1] != "/": - h5path += "/" - h5path += s - return h5path + return links diff --git a/hsds/util/nodeUtil.py b/hsds/util/nodeUtil.py new file mode 100644 index 00000000..8f67f400 --- /dev/null +++ b/hsds/util/nodeUtil.py @@ -0,0 +1,124 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# nodeUtil: +# node (SN/DN mapping) related functions +# +import hashlib +import os.path +import uuid + +from aiohttp.web_exceptions import HTTPServiceUnavailable + +from .. 
import hsds_logger as log + + +def _getIdHash(id): + """Return md5 prefix based on id value""" + m = hashlib.new("md5") + m.update(id.encode("utf8")) + hexdigest = m.hexdigest() + return hexdigest[:5] + + +def createNodeId(prefix, node_number=None): + """Create a random id used to identify nodes""" + node_id = "" # nothing too bad happens if this doesn't get set + if node_number is not None: + # just make an id based on the node_number + hash_key = f"{node_number + 1:03d}" + else: + # use the container id if we are running inside docker + hash_key = _getIdHash(str(uuid.uuid1())) + proc_file = "/proc/self/cgroup" + if os.path.isfile(proc_file): + with open(proc_file) as f: + first_line = f.readline() + if first_line: + fields = first_line.split(":") + if len(fields) >= 3: + field = fields[2] + if field.startswith("/docker/"): + docker_len = len("/docker/") + + if len(field) > docker_len + 12: + n = docker_len + m = n + 12 + node_id = field[n:m] + + if node_id: + key = f"{prefix}-{node_id}-{hash_key}" + else: + key = f"{prefix}-{hash_key}" + return key + + +def getObjPartition(id, count): + """Get the id of the dn node that should be handling the given obj id""" + hash_code = _getIdHash(id) + hash_value = int(hash_code, 16) + number = hash_value % count + return number + + +def getNodeNumber(app): + if app["node_type"] == "sn": + log.error("node number is only for DN nodes") + raise ValueError() + + dn_ids = app["dn_ids"] + log.debug(f"getNodeNumber from dn_ids: {dn_ids}") + for i in range(len(dn_ids)): + dn_id = dn_ids[i] + if dn_id == app["id"]: + log.debug(f"returning nodeNumber: {i}") + return i + log.error("getNodeNumber, no matching id") + return -1 + + +def getNodeCount(app): + dn_urls = app["dn_urls"] + log.debug(f"getNodeCount for dn_urls: {dn_urls}") + dn_node_count = len(dn_urls) + return dn_node_count + + +def validateInPartition(app, obj_id): + node_number = getNodeNumber(app) + node_count = getNodeCount(app) + msg = f"obj_id: {obj_id}, node_count: {node_count}, " + msg += f"node_number: {node_number}" + log.debug(msg) + partition_number = getObjPartition(obj_id, node_count) + if partition_number != node_number: + # The request shouldn't have come to this node + msg = f"wrong node for 'id':{obj_id}, expected node {node_number} " + msg += f"got {partition_number}" + log.error(msg) + raise KeyError(msg) + + +def getDataNodeUrl(app, obj_id): + """Return host/port for datanode for given obj_id.
+ Throw exception if service is not ready""" + dn_urls = app["dn_urls"] + dn_node_count = getNodeCount(app) + node_state = app["node_state"] + if node_state != "READY" or dn_node_count <= 0: + msg = "Service not ready" + log.warn(msg) + raise HTTPServiceUnavailable() + dn_number = getObjPartition(obj_id, dn_node_count) + url = dn_urls[dn_number] + log.debug(f"got dn_url: {url} for obj_id: {obj_id}") + return url diff --git a/hsds/util/storUtil.py b/hsds/util/storUtil.py index 3bbb073c..7b3b8a4e 100644 --- a/hsds/util/storUtil.py +++ b/hsds/util/storUtil.py @@ -69,7 +69,7 @@ def getCompressors(): def getSupportedFilters(include_compressors=True): """return list of other supported filters""" filters = [ - # "bitshuffle", + "bitshuffle", "shuffle", "fletcher32", "nbit", # No-op @@ -493,7 +493,7 @@ async def getStorBytes(app, chunk_bytes = [] for chunk_location in chunk_locations: - log.debug(f"getStoreBytes - processing chunk_location: {chunk_location}") + log.debug(f"getStorBytes - processing chunk_location: {chunk_location}") n = chunk_location.offset - offset if n < 0: log.warn(f"getStorBytes - unexpected offset for chunk_location: {chunk_location}") diff --git a/hsds/util/timeUtil.py b/hsds/util/timeUtil.py deleted file mode 100755 index e4ae9d3f..00000000 --- a/hsds/util/timeUtil.py +++ /dev/null @@ -1,83 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## -from datetime import datetime -import time -import os -import pytz - - -def unixTimeToUTC(timestamp): - """Convert unix timestamp (seconds since Jan 1, 1970, to ISO-8601 - compatible UTC time string. - - """ - utc = pytz.utc - dtTime = datetime.fromtimestamp(timestamp, utc) - iso_str = dtTime.isoformat() - # isoformat returns a string like this: - # '2014-10-30T04:25:21+00:00' - # strip off the '+00:00' and replace - # with 'Z' (both are ISO-8601 compatible) - npos = iso_str.rfind("+") - iso_z = iso_str[:npos] + "Z" - return iso_z - - -def elapsedTime(timestamp): - """Get Elapsed time from given timestamp""" - delta = int(time.time()) - timestamp - if delta < 0: - return "Invalid timestamp!" - day_length = 24 * 60 * 60 - days = 0 - hour_length = 60 * 60 - hours = 0 - minute_length = 60 - minutes = 0 - ret_str = "" - - if delta > day_length: - days = delta // day_length - delta = delta % day_length - ret_str += "{} days ".format(days) - if delta > hour_length or days > 0: - hours = delta // hour_length - delta = delta % hour_length - ret_str += "{} hours ".format(hours) - if delta > minute_length or days > 0 or hours > 0: - minutes = delta // minute_length - delta = delta % minute_length - ret_str += "{} minutes ".format(minutes) - ret_str += "{} seconds".format(delta) - return ret_str - - -def getNow(app): - """ - Get current time in unix timestamp - - Returns a precise timestamp even on platforms where - time.time() has low resolution (e.g. 
Windows) - """ - system = os.name - current_time = 0 - - if system == "nt": - # Windows - current_time = (time.perf_counter() - app["start_time_relative"]) + app["start_time"] - elif system == "posix": - # Unix - current_time = time.time() - else: - raise ValueError(f"Unsupported OS: {system}") - - return current_time diff --git a/pyproject.toml b/pyproject.toml index af575c13..422750fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ classifiers = [ "Topic :: Database", "Topic :: Software Development :: Libraries :: Python Modules", ] -requires-python = ">=3.8" +requires-python = ">=3.10" version = "0.9.2" dependencies = [ @@ -42,9 +42,10 @@ dependencies = [ "bitshuffle >=0.5.2", "cryptography", "h5py >= 3.6.0", + "h5json >= 1.0.0", "importlib_resources", - "numcodecs", - "numpy >=2.0.0rc1; python_version>='3.9'", + "numcodecs <= 0.15.1", + "numpy >=2.0.0", "psutil", "pyjwt", "pytz", diff --git a/requirements.txt b/requirements.txt index 5aa9d39b..7dfad721 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ aiofiles azure-storage-blob cryptography h5py>=3.6.0 +hdf5-json>1.0.0 numcodecs numpy>=2.0.0rc1 psutil diff --git a/testall.py b/testall.py index 1e8ea348..480ab693 100755 --- a/testall.py +++ b/testall.py @@ -15,8 +15,8 @@ PYTHON_CMD = "python" # change to "python3" if "python" invokes python version 2.x -unit_tests = ('array_util_test', 'chunk_util_test', 'compression_test', 'domain_util_test', - 'dset_util_test', 'hdf5_dtype_test', 'id_util_test', 'lru_cache_test', +unit_tests = ('chunk_util_test', 'compression_test', 'domain_util_test', + 'dset_util_test', 'lru_cache_test', 'shuffle_test', 'rangeget_util_test') integ_tests = ('uptest', 'setup_test', 'domain_test', 'group_test', diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index de54c5ea..e986a464 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -12,6 +12,7 @@ from copy import copy import unittest import json +import time import numpy as np import base64 import helper @@ -500,6 +501,66 @@ def testPutFixedString(self): self.assertTrue("length" in type_json) self.assertEqual(type_json["length"], 7) + def testUseTimestamp(self): + # Test PUT value for 1d attribute with timestamp included + print("testUseTimestamp", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + def _create_attr(attr_name, ts=None): + + # create attr + fixed_str_type = { + "charSet": "H5T_CSET_ASCII", + "class": "H5T_STRING", + "length": 12, + "strPad": "H5T_STR_NULLPAD", + } + data = {"type": fixed_str_type, "value": "XYZ"} + if ts: + data["created"] = ts + req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + def _check_attr_ts(attr_name, min_ts=None, max_ts=None): + # read attr + req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], "XYZ") + self.assertTrue("type" in rspJson) + self.assertTrue("created" in rspJson) + if min_ts: + 
self.assertGreaterEqual(rspJson["created"], min_ts) + if max_ts: + self.assertLessEqual(rspJson["created"], max_ts) + + now = time.time() + # server-based timestamp + _create_attr("a1") + _check_attr_ts("a1", min_ts=(now - 1), max_ts=(now + 1)) + # client assigned timestamp + _create_attr("a2", ts=now) + _check_attr_ts("a2", min_ts=now, max_ts=now) + # client assigned with small time-skew, ok + _create_attr("a3", ts=int(now)) + _check_attr_ts("a3", min_ts=int(now), max_ts=int(now)) + # client assigned with large time-skew, ignored + _create_attr("a4", ts=999) + _check_attr_ts("a4", min_ts=now, max_ts=(now + 1)) + def testPutFixedStringNullTerm(self): # Test PUT value for 1d attribute with fixed length string/null terminated types print("testPutFixedStringNullTerm", self.base_domain) diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index 280877cf..17357119 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -13,16 +13,13 @@ import json import time import numpy as np + +from h5json.objid import createObjId +from h5json.filters import getFilterItem + import helper import config -# min/max chunk size - these can be set by config, but -# practially the min config value should be larger than -# CHUNK_MIN and the max config value should less than -# CHUNK_MAX -CHUNK_MIN = 1024 # lower limit (1024b) -CHUNK_MAX = 50 * 1024 * 1024 # upper limit (50M) - class DatasetTest(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -61,31 +58,34 @@ def testScalarDataset(self): rsp = self.session.post(req, data=json.dumps(data), headers=headers) self.assertEqual(rsp.status_code, 201) rspJson = json.loads(rsp.text) - self.assertEqual(rspJson["attributeCount"], 0) - dset_id = rspJson["id"] - self.assertTrue(helper.validateId(dset_id)) - - # read back the obj - req = self.endpoint + "/datasets/" + dset_id - rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) expected_keys = [ "id", "shape", - "hrefs", - "layout", "creationProperties", "attributeCount", "created", "lastModified", "root", - "domain", ] - for name in expected_keys: self.assertTrue(name in rspJson) + + # additional keys expected for GET response + expected_keys.append("hrefs") + expected_keys.append("creationProperties") + expected_keys.append("domain") + + self.assertEqual(rspJson["attributeCount"], 0) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # read back the obj + req = self.endpoint + "/datasets/" + dset_id + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["id"], dset_id) self.assertEqual(rspJson["root"], root_uuid) self.assertEqual(rspJson["domain"], domain) @@ -190,6 +190,182 @@ def testScalarDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 410) + def testPostDatasetWithId(self): + # Test creation of a dataset obj with client creating obj id + domain = self.base_domain + "/testPostDatasetWithId.h5" + helper.setupDomain(domain) + print("testPostDatasetWithId", domain) + headers = helper.getRequestHeaders(domain=domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # make a new dataset id + dset_id = createObjId("datasets", root_id=root_uuid) + + # create a dataset obj 
+ data = {"id": dset_id, "type": "H5T_IEEE_F32LE", "shape": "H5S_SCALAR"} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(rspJson["id"], dset_id) + + # read back the obj + req = self.endpoint + "/datasets/" + dset_id + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + expected_keys = [ + "id", + "shape", + "hrefs", + "creationProperties", + "attributeCount", + "created", + "lastModified", + "root", + "domain", + ] + + for name in expected_keys: + self.assertTrue(name in rspJson) + self.assertEqual(rspJson["id"], dset_id) + self.assertEqual(rspJson["root"], root_uuid) + self.assertEqual(rspJson["domain"], domain) + self.assertEqual(rspJson["attributeCount"], 0) + shape_json = rspJson["shape"] + self.assertTrue(shape_json["class"], "H5S_SCALAR") + self.assertTrue(rspJson["type"], "H5T_IEEE_F32LE") + + # Get the type + rsp = self.session.get(req + "/type", headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("type" in rspJson) + self.assertTrue(rspJson["type"], "H5T_IEEE_F32LE") + self.assertTrue("hrefs" in rspJson) + hrefs = rspJson["hrefs"] + self.assertEqual(len(hrefs), 3) + + # Get the shape + rsp = self.session.get(req + "/shape", headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("created" in rspJson) + self.assertTrue("lastModified" in rspJson) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("shape" in rspJson) + shape_json = rspJson["shape"] + self.assertTrue(shape_json["class"], "H5S_SCALAR") + + # try getting verbose info + params = {"verbose": 1} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + for name in expected_keys: + self.assertTrue(name in rspJson) + + # flush to storage and force an immediate rescan + domain_req = self.endpoint + "/" + domain_params = {"flush": 1, "rescan": 1} + rsp = self.session.put(domain_req, params=domain_params, headers=headers) + # should get a NO_CONTENT code, + self.assertEqual(rsp.status_code, 204) + + # do a get and verify the additional keys are present + expected_keys.append("num_chunks") + expected_keys.append("allocated_size") + + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + for name in expected_keys: + self.assertTrue(name in rspJson) + + def testPostDatasetWithAttributes(self): + # test POST with attribute initialization + domain = self.base_domain + "/testPostDatasetWithAttributes.h5" + helper.setupDomain(domain) + print("testPostDatasetWithAttributes", domain) + headers = helper.getRequestHeaders(domain=domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # setup some attributes to include + attr_count = 4 + attributes = {} + extent = 10 + for i in range(attr_count): + value = [i * 10 + j for j in range(extent)] + data = {"type": "H5T_STD_I32LE", "shape": extent, "value": value} + attr_name = f"attr{i + 1:04d}" + attributes[attr_name] = data + + # create new dataset + payload = 
{"type": "H5T_IEEE_F32LE", "shape": "H5S_SCALAR"} + payload["attributes"] = attributes + payload["link"] = {"id": root_uuid, "name": "linked_datatype"} + + req = helper.getEndpoint() + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["attributeCount"], 4) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # fetch the attributes + req = f"{helper.getEndpoint()}/datasets/{dset_id}/attributes" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertFalse("type" in rspJson) + self.assertFalse("shape" in rspJson) + self.assertTrue("attributes") in rspJson + self.assertEqual(len(rspJson["attributes"]), attr_count) + + # try fetching the objson in domain resp + req = helper.getEndpoint() + "/" + params = {"getobjs": 1} + for i in range(10): + # try a few times to allow for async update of summary info + time.sleep(5) + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + if "domain_objs" in rspJson: + break + + self.assertTrue("domain_objs" in rspJson) + domain_objs = rspJson["domain_objs"] + self.assertTrue(root_uuid in domain_objs) + self.assertTrue(dset_id in domain_objs) + dset_json = domain_objs[dset_id] + self.assertTrue("attributes" in dset_json) + self.assertEqual(len(dset_json["attributes"]), attr_count) + self.assertTrue("type" in dset_json) + self.assertTrue("shape" in dset_json) + self.assertTrue("creationProperties" in dset_json) + self.assertFalse("value" in dset_json) # no data written yet + def testScalarEmptyDimsDataset(self): # Test creation/deletion of scalar dataset obj domain = self.base_domain + "/testScalarEmptyDimsDataset.h5" @@ -265,7 +441,6 @@ def testGet(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -289,7 +464,8 @@ def testGet(self): self.assertEqual(shape["dims"], [10, 10]) self.assertEqual(shape["maxdims"], [10, 10]) - layout = rspJson["layout"] + cpl = rspJson["creationProperties"] + layout = cpl["layout"] self.assertEqual(layout["class"], "H5D_CHUNKED") self.assertEqual(layout["dims"], [10, 10]) self.assertTrue("partition_count" not in layout) @@ -359,7 +535,6 @@ def testGetByPath(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -381,7 +556,6 @@ def testGetByPath(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -443,7 +617,6 @@ def testGetVerbose(self): "id", "shape", "hrefs", - "layout", "creationProperties", "attributeCount", "created", @@ -637,10 +810,19 @@ def testResizableDataset(self): # create the dataset req = self.endpoint + "/datasets" payload = {"type": "H5T_IEEE_F32LE", "shape": 10, "maxdims": 20} - payload["creationProperties"] = {"fillValue": 3.12} + contiguous_layout = {"class": "H5D_CONTIGUOUS"} + cpl = {"fillValue": 3.12, "layout": contiguous_layout} + payload["creationProperties"] = cpl + req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) # create dataset + self.assertEqual(rsp.status_code, 400) # need chunk layout for resizable + + # if we remove the layout, HSDS will setup a chunked layout for us + del cpl["layout"] + rsp = self.session.post(req, 
data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) dset_uuid = rspJson["id"] self.assertTrue(helper.validateId(dset_uuid)) @@ -668,8 +850,16 @@ def testResizableDataset(self): self.assertTrue("maxdims" in shape) self.assertEqual(shape["maxdims"][0], 20) + self.assertTrue("creationProperties" in rspJson) creationProps = rspJson["creationProperties"] + self.assertTrue("fillValue" in creationProps) self.assertEqual(creationProps["fillValue"], 3.12) + self.assertTrue("layout" in creationProps) + layout = creationProps["layout"] + self.assertTrue("class" in layout) + self.assertEqual(layout["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout) + self.assertEqual(len(layout["dims"]), 1) # verify shape using the GET shape request req = req + "/shape" @@ -748,7 +938,7 @@ def testResizableUnlimitedDataset(self): # create the dataset req = self.endpoint + "/datasets" - payload = {"type": "H5T_IEEE_F32LE", "shape": [10, 20], "maxdims": [30, 0]} + payload = {"type": "H5T_IEEE_F32LE", "shape": [10, 20], "maxdims": [30, "H5S_UNLIMITED"]} payload["creationProperties"] = {"fillValue": 3.12} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) @@ -780,7 +970,7 @@ def testResizableUnlimitedDataset(self): self.assertEqual(shape["dims"][1], 20) self.assertTrue("maxdims" in shape) self.assertEqual(shape["maxdims"][0], 30) - self.assertEqual(shape["maxdims"][1], 0) + self.assertEqual(shape["maxdims"][1], "H5S_UNLIMITED") # verify shape using the GET shape request req = req + "/shape" @@ -797,7 +987,7 @@ def testResizableUnlimitedDataset(self): self.assertTrue("maxdims" in shape) self.assertEqual(len(shape["maxdims"]), 2) self.assertEqual(shape["maxdims"][0], 30) - self.assertEqual(shape["maxdims"][1], 0) + self.assertEqual(shape["maxdims"][1], "H5S_UNLIMITED") # resize the second dimension to 500 elements payload = {"shape": [10, 500]} @@ -819,7 +1009,7 @@ def testResizableUnlimitedDataset(self): self.assertTrue("maxdims" in shape) self.assertEqual(len(shape["maxdims"]), 2) self.assertEqual(shape["maxdims"][0], 30) - self.assertEqual(shape["maxdims"][1], 0) + self.assertEqual(shape["maxdims"][1], "H5S_UNLIMITED") def testExtendDataset(self): # test extending dataset @@ -1107,6 +1297,98 @@ def testExtend3DDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 409) # tried to extend a non-extensible dimension + def testInvalidDatasetLayout(self): + # test that various invalid layouts fail with a 400 status + domain = self.base_domain + "/testInvalidDatasetLayout.h5" + helper.setupDomain(domain) + + print("testInvalidDatasetLayout", domain) + headers = helper.getRequestHeaders(domain=domain) + # get domain + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("root" in rspJson) + + # dataset create + req = self.endpoint + "/datasets" + # TBD: the larger dimensions are causing SN to crash + # dims = [365, 780, 1024] + dims = [365, 780, 10] + # maxdims = [0, 780, 1024] + maxdims = [0, 780, 10] + payload = { + "type": "H5T_IEEE_F32LE", + "shape": dims, + "maxdims": maxdims, + } + # bad layout class + payload["creationProperties"] = { + "layout": {"class": "H5D_XYZ", "dims": [1, 390, 512]}, + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # 
create dataset + + # chunked layout with mismatched dimensions + payload["creationProperties"] = { + "layout": {"class": "H5D_CHUNKED", "dims": [1, 390]}, + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset + + # chunked layout with negative dimensions + payload["creationProperties"] = { + "layout": {"class": "H5D_CHUNKED", "dims": [100, 200, -300]}, + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset + + file_uri = "s3://a-storage-bucket/some-file.h5" + offset = 1234 + size = dims[0] * dims[1] * dims[2] * 4 + + # H5D_CONTIGUOUS_REF layout missing different required keys... + for key in ("file_uri", "offset", "size"): + layout = {"class": "H5D_CONTIGUOUS_REF"} + if key != "file_uri": + layout["file_uri"] = file_uri + elif key != "offset": + layout["offset"] = offset + elif key != "size": + layout["size"] = size + else: + self.assertTrue(False) # one of the above should be true + + payload["creationProperties"] = { + "layout": layout + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset + + # H5D_CONTIGUOUS_REF with a vlen type + type_vstr = { + "charSet": "H5T_CSET_ASCII", + "class": "H5T_STRING", + "strPad": "H5T_STR_NULLTERM", + "length": "H5T_VARIABLE", + } + payload = { + "type": type_vstr, + "shape": dims, + } + layout = { + "class": "H5D_CONTIGUOUS_REF", + "file_uri": file_uri, + "offset": offset, + "size": size + } + payload["creationProperties"] = { + "layout": layout + } + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset + def testCreationPropertiesLayoutDataset(self): # test Dataset with creation property list domain = self.base_domain + "/testCreationPropertiesLayoutDataset.h5" @@ -1130,6 +1412,7 @@ def testCreationPropertiesLayoutDataset(self): "shape": [365, 780, 1024], "maxdims": [0, 780, 1024], } + # define a chunk layout with 4 chunks per 'slice' # chunk size is 798720 bytes gzip_filter = { @@ -1143,41 +1426,67 @@ "id": 3, "name": "fletcher32" } - payload["creationProperties"] = { - "layout": {"class": "H5D_CHUNKED", "dims": [1, 390, 512]}, + contiguous_layout = {"class": "H5D_CONTIGUOUS"} + chunked_layout = {"class": "H5D_CHUNKED", "dims": [1, 390, 512]} + creationProps = { "filters": [ gzip_filter, fletcher32_filter, ], } - req = self.endpoint + "/datasets" + payload["creationProperties"] = creationProps rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset rspJson = json.loads(rsp.text) dset_uuid = rspJson["id"] self.assertTrue(helper.validateId(dset_uuid)) - # link new dataset as 'chunktest' - name = "chunktest" - req = self.endpoint + "/groups/" + root_uuid + "/links/" + name - payload = {"id": dset_uuid} - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) # verify layout req = helper.getEndpoint() + "/datasets/" + dset_uuid rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] +
self.assertTrue("class" in layout_json) + self.assertEqual(layout_json["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout_json) # layout created automatically + + # add an explicit layout to creation props and verify contiguous + creationProps["layout"] = contiguous_layout + payload["creationProperties"] = creationProps + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) + + # use a chunk layout to creation props and verify success + creationProps["layout"] = chunked_layout + payload["creationProperties"] = creationProps + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) - self.assertEqual(layout_json["dims"], [1, 390, 1024]) + + self.assertEqual(layout_json["dims"], [1, 390, 512]) if config.get("max_chunks_per_folder") > 0: self.assertTrue("partition_count" in layout_json) self.assertEqual(layout_json["partition_count"], 10) + # link new dataset as 'chunktest' + name = "chunktest" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_uuid} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + # verify compression self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] @@ -1216,7 +1525,7 @@ def testCreationPropertiesContiguousDataset(self): req = self.endpoint + "/datasets" # Create ~1GB dataset - layout = {"class": "H5D_CONTIGUOUS"} + layout = {"class": "H5D_CHUNKED", "dims": [10, 20]} gzip_filter = { "class": "H5Z_FILTER_DEFLATE", "id": 1, @@ -1249,16 +1558,14 @@ def testCreationPropertiesContiguousDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) self.assertEqual(layout_json["dims"], [10, 20]) - # verify creation properties are preserved - self.assertTrue("creationProperties" in rspJson) - cpl = rspJson["creationProperties"] - self.assertTrue("layout" in cpl) def testCompressionFiltersDataset(self): # test Dataset with creation property list @@ -1283,11 +1590,11 @@ def testCompressionFiltersDataset(self): req = self.endpoint + "/datasets" payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]} - payload["creationProperties"] = { - "filters": [ - compressor, - ] - } + filter_item = getFilterItem(compressor) + filters = [filter_item, ] + layout = {"class": "H5D_CHUNKED", "dims": [10, 20]} + cpl = {"filters": filters, "layout": layout} + payload["creationProperties"] = cpl req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -1305,8 +1612,10 @@ def testCompressionFiltersDataset(self): rsp = self.session.get(req, headers=headers) 
self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") @@ -1342,14 +1651,15 @@ def testCompressionFilterOptionDataset(self): # create the dataset req = self.endpoint + "/datasets" - compressor = {"class": "H5Z_FILTER_USER", "name": "lz4", "level": 5} + filter_item = getFilterItem("lz4", options={"level": 4}) + print("filter_item:", filter_item) + filters = [filter_item, ] payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]} - payload["creationProperties"] = { - "filters": [ - compressor, - ] - } + layout = {"class": "H5D_CHUNKED", "dims": [10, 20]} + cpl = {"filters": filters, "layout": layout} + payload["creationProperties"] = cpl + req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -1367,10 +1677,13 @@ def testCompressionFilterOptionDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout_json) # verify compression self.assertTrue("creationProperties" in rspJson) @@ -1381,7 +1694,7 @@ def testCompressionFilterOptionDataset(self): filter = filters[0] self.assertTrue(isinstance(filter, dict)) self.assertTrue("class" in filter) - self.assertEqual(filter["class"], "H5Z_FILTER_USER") + self.assertEqual(filter["class"], "H5Z_FILTER_LZ4") self.assertTrue("id" in filter) self.assertTrue("name" in filter) self.assertEqual(filter["name"], "lz4") @@ -1400,25 +1713,15 @@ def testInvalidCompressionFilter(self): rspJson = json.loads(rsp.text) self.assertTrue("root" in rspJson) - bad_compressors = ("shrink-o-rama") - for compressor_name in bad_compressors: - # create the dataset - req = self.endpoint + "/datasets" - compressor = { - "class": "H5Z_FILTER_USER", - "name": compressor_name, - "level": 5, - } + filter_item = {'class': 'H5Z_FILTER_FOOBAR', 'id': 123, 'name': 'foobar'} + # create the dataset + req = self.endpoint + "/datasets" - payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]} - payload["creationProperties"] = { - "filters": [ - compressor, - ] - } - req = self.endpoint + "/datasets" - rsp = self.session.post(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 400) # create dataset + payload = {"type": "H5T_IEEE_F32LE", "shape": [40, 80]} + payload["creationProperties"] = {"filters": [filter_item, ]} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # create dataset def testInvalidFillValue(self): # test Dataset with simple type and fill value that is incompatible with the type @@ -1604,26 +1907,18 @@ def testAutoChunk1dDataset(self): req = self.endpoint + "/datasets" # 50K x 80K dataset extent = 1000 * 1000 * 1000 - dims = [ - extent, - ] + dims = [extent, ] fields = ( {"name": "x", 
"type": "H5T_IEEE_F64LE"}, {"name": "y", "type": "H5T_IEEE_F64LE"}, {"name": "z", "type": "H5T_IEEE_F64LE"}, ) datatype = {"class": "H5T_COMPOUND", "fields": fields} + item_size = 12 # 3 fields of 4 bytes each + cpl = {"fillValue": 3.12} # no layout given + + payload = {"type": datatype, "shape": dims, "creationProperties": cpl} - payload = {"type": datatype, "shape": dims} - # the following should get ignored as too small - payload["creationProperties"] = { - "layout": { - "class": "H5D_CHUNKED", - "dims": [ - 10, - ], - } - } req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -1644,17 +1939,23 @@ def testAutoChunk1dDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("fillValue" in cpl) + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) - self.assertTrue("partition_count" not in layout_json) - layout = layout_json["dims"] - self.assertEqual(len(layout), 1) - self.assertTrue(layout[0] < dims[0]) - chunk_size = layout[0] * 8 * 3 # three 64bit - # chunk size should be between chunk min and max + chunk_dims = layout_json["dims"] + self.assertEqual(len(chunk_dims), 1) + self.assertTrue(chunk_dims[0] < dims[0]) + + chunk_size = chunk_dims[0] * item_size + + # chunk size will be based on server config, but assume a min/max of 1MB to 1GB + CHUNK_MIN = 1024 * 1024 + CHUNK_MAX = 1024 * 1024 * 1024 self.assertTrue(chunk_size >= CHUNK_MIN) self.assertTrue(chunk_size <= CHUNK_MAX) @@ -1675,6 +1976,7 @@ def testAutoChunk2dDataset(self): req = self.endpoint + "/datasets" # 50K x 80K dataset dims = [50000, 80000] + item_size = 4 # 4 bytes per float32 payload = {"type": "H5T_IEEE_F32LE", "shape": dims} req = self.endpoint + "/datasets" @@ -1697,74 +1999,22 @@ def testAutoChunk2dDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) - layout = layout_json["dims"] - self.assertEqual(len(layout), 2) - self.assertTrue(layout[0] < dims[0]) - self.assertTrue(layout[1] < dims[1]) - chunk_size = layout[0] * layout[1] * 4 - # chunk size should be between chunk min and max - self.assertTrue(chunk_size >= CHUNK_MIN) - self.assertTrue(chunk_size <= CHUNK_MAX) - - def testMinChunkSizeDataset(self): - # test Dataset where chunk layout is adjusted if provided - # layout is too small - domain = self.base_domain + "/testMinChunkSizeDataset.h5" - helper.setupDomain(domain) - print("testMinChunkSizeDataset", domain) - headers = helper.getRequestHeaders(domain=domain) - # get domain - req = helper.getEndpoint() + "/" - rsp = self.session.get(req, headers=headers) - rspJson = json.loads(rsp.text) - self.assertTrue("root" in rspJson) - root_uuid = rspJson["root"] - - # create the dataset - 
req = self.endpoint + "/datasets" - # 50K x 80K dataset - dims = [50000, 80000] - payload = {"type": "H5T_IEEE_F32LE", "shape": dims} - # define a chunk layout with lots of small chunks - payload["creationProperties"] = { - "layout": {"class": "H5D_CHUNKED", "dims": [10, 10]} - } - - req = self.endpoint + "/datasets" - rsp = self.session.post(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) # create dataset - rspJson = json.loads(rsp.text) - dset_uuid = rspJson["id"] - self.assertTrue(helper.validateId(dset_uuid)) - - # link new dataset as 'dset' - name = "dset" + helper.getRandomName() - req = self.endpoint + "/groups/" + root_uuid + "/links/" + name - payload = {"id": dset_uuid} - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) + chunk_dims = layout_json["dims"] + self.assertEqual(len(chunk_dims), 2) + self.assertTrue(chunk_dims[0] < dims[0]) + self.assertTrue(chunk_dims[1] < dims[1]) + chunk_size = chunk_dims[0] * chunk_dims[1] * item_size - # verify layout - req = helper.getEndpoint() + "/datasets/" + dset_uuid - rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("dims" in layout_json) - layout = layout_json["dims"] - self.assertEqual(len(layout), 2) - self.assertTrue(layout[0] < dims[0]) - self.assertTrue(layout[1] < dims[1]) - chunk_size = layout[0] * layout[1] * 4 - # chunk size should be between chunk min and max + # chunk size will be based on server config, but assume a min/max of 1MB to 1GB + CHUNK_MIN = 1024 * 1024 + CHUNK_MAX = 1024 * 1024 * 1024 self.assertTrue(chunk_size >= CHUNK_MIN) self.assertTrue(chunk_size <= CHUNK_MAX) @@ -2149,17 +2399,13 @@ def testContiguousRefDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("dims" in layout_json) - chunk_dims = layout_json["dims"] - self.assertEqual(len(chunk_dims), 2) - chunk_size = chunk_dims[0] * chunk_dims[1] * 4 - # chunk size should be between chunk min and max - self.assertTrue(chunk_size >= CHUNK_MIN) - self.assertTrue(chunk_size <= CHUNK_MAX) + self.assertEqual(layout_json["class"], "H5D_CONTIGUOUS_REF") + self.assertFalse("dims" in layout_json) # verify cpl self.assertTrue("creationProperties" in rspJson) @@ -2222,23 +2468,13 @@ def testContiguousRefZeroDimDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) - # verify layout + # get dataset json req = helper.getEndpoint() + "/datasets/" + dset_uuid rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("dims" in layout_json) - chunk_dims = layout_json["dims"] - 
self.assertEqual(len(chunk_dims), 2) - # layout should be same as the dims - self.assertEqual(chunk_dims[0], dims[0]) - self.assertEqual(chunk_dims[1], dims[1]) - # verify cpl + # verify layout self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] self.assertTrue("layout" in cpl) @@ -2312,13 +2548,6 @@ def testChunkedRefDataset(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("dims" in layout_json) - chunk_dims = layout_json["dims"] - self.assertEqual(len(chunk_dims), 2) self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] self.assertTrue("layout" in cpl) @@ -2391,21 +2620,15 @@ def testChunkedRefIndirectDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) - # verify layout + # get dataset json req = helper.getEndpoint() + "/datasets/" + dset_uuid rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("chunks" not in layout_json) - chunk_dims = layout_json["dims"] - self.assertEqual(len(chunk_dims), 2) self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] + self.assertTrue("layout") self.assertTrue("layout" in cpl) cpl_layout = cpl["layout"] self.assertTrue("class" in cpl_layout) @@ -2487,19 +2710,11 @@ def testChunkedRefIndirectS3UriDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) - # verify layout + # fetch dataset json req = helper.getEndpoint() + "/datasets/" + dset_uuid rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] - self.assertTrue("class" in layout_json) - self.assertEqual(layout_json["class"], "H5D_CHUNKED") - self.assertTrue("chunks" not in layout_json) - self.assertTrue("dims" in layout_json) - chunk_dims = layout_json["dims"] - self.assertEqual(len(chunk_dims), 2) self.assertTrue("creationProperties" in rspJson) cpl = rspJson["creationProperties"] @@ -2553,8 +2768,10 @@ def testDatasetChunkPartitioning(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) @@ -2570,10 +2787,6 @@ def testDatasetChunkPartitioning(self): self.assertTrue(layout[0] < dims[0]) self.assertTrue(layout[1] < dims[1]) self.assertTrue(layout[2] < dims[2]) - chunk_size = layout[0] * layout[1] * layout[2] * 4 - # chunk size should be between chunk min and max - self.assertTrue(chunk_size >= CHUNK_MIN) - self.assertTrue(chunk_size <= CHUNK_MAX) def testExtendibleDatasetChunkPartitioning(self): # test Dataset 
partitioning logic for large datasets @@ -2592,9 +2805,17 @@ def testExtendibleDatasetChunkPartitioning(self): req = self.endpoint + "/datasets" # 50K x 80K x 90K dataset dims = [0, 80000, 90000] - # unlimited extend in dim 0, fixeed in dimension 2, extenbile by 10x in dim 3 + + # unlimited extend in dim 0, fixeed in dimension 2, extensible by 10x in dim 3 max_dims = [0, 80000, 900000] + chunk_shape = [1000, 1000, 1000] + layout = { + "class": "H5D_CHUNKED", + "dims": chunk_shape + } + cpl = {"layout": layout} payload = {"type": "H5T_IEEE_F32LE", "shape": dims, "maxdims": max_dims} + payload["creationProperties"] = cpl req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) @@ -2616,8 +2837,10 @@ def testExtendibleDatasetChunkPartitioning(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + cpl = rspJson["creationProperties"] + self.assertTrue("layout" in cpl) + layout_json = cpl["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) @@ -2627,10 +2850,6 @@ def testExtendibleDatasetChunkPartitioning(self): layout = layout_json["dims"] self.assertEqual(len(layout), 3) - chunk_size = layout[0] * layout[1] * layout[2] * 4 - # chunk size should be between chunk min and max - self.assertTrue(chunk_size >= CHUNK_MIN) - self.assertTrue(chunk_size <= CHUNK_MAX) def testDatasetEmptyChunkExtent(self): # Attempting to create 0-extent chunks should respond with Bad Request @@ -2656,6 +2875,87 @@ def testDatasetEmptyChunkExtent(self): # Should fail with Bad Request due to invalid layout value self.assertEqual(rsp.status_code, 400) # create dataset + def testDatasetPostMulti(self): + # test POST with multi-object creation + domain = self.base_domain + "/testDatasetPostMulti.h5" + helper.setupDomain(domain) + print("testDatasetPostMulti", domain) + headers = helper.getRequestHeaders(domain=domain) + req = helper.getEndpoint() + "/" + + # get root ids + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # get root group and verify link count is 0 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + + dataset_count = 3 + datatype = "H5T_STD_I32LE" + payload = [] + for _ in range(dataset_count): + dataset_args = {"type": datatype} + payload.append(dataset_args) + + req = helper.getEndpoint() + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), dataset_count) + + expected_keys = [ + "id", + "shape", + "attributeCount", + "created", + "lastModified", + "root", + ] + + for i in range(dataset_count): + obj_json = rsp_objs[i] + self.assertEqual(obj_json["attributeCount"], 0) + dset_id = obj_json["id"] + self.assertTrue(helper.validateId(dset_id)) + self.assertTrue(dset_id.startswith("d-")) + for key in expected_keys: + self.assertTrue(key in 
obj_json) + + # create a set of linked datasets + for i in range(dataset_count): + item = payload[i] + item["link"] = {"id": root_uuid, "name": f"dset_{i + 1}"} + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), dataset_count) + for i in range(dataset_count): + json_rsp = rsp_objs[i] + self.assertEqual(json_rsp["attributeCount"], 0) + dset_id = json_rsp["id"] + self.assertTrue(helper.validateId(dset_id)) + for key in expected_keys: + self.assertTrue(key in obj_json) + + # get root group and verify link count is dataset_count + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], dataset_count) + if __name__ == "__main__": # setup test files diff --git a/tests/integ/datatype_test.py b/tests/integ/datatype_test.py index f3f2d1a9..7bf90d09 100755 --- a/tests/integ/datatype_test.py +++ b/tests/integ/datatype_test.py @@ -11,6 +11,9 @@ ############################################################################## import unittest import json + +from h5json.objid import createObjId + import helper import config @@ -120,6 +123,108 @@ def testCommittedType(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 410) + def testPostTypeWithId(self): + # Test creation/deletion of datatype obj + + print("testPostTypeWithId", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create a datatype id + ctype_id = createObjId("datatypes", root_id=root_uuid) + + # try creating a committed type without a type in the body + req = self.endpoint + "/datatypes" + data = {"id": ctype_id} + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 400) # bad request + + # create a committed type obj + data = {"id": ctype_id, "type": "H5T_IEEE_F32LE"} + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(rspJson["id"], ctype_id) + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertEqual(type_json["base"], "H5T_IEEE_F32LE") + + # read back the obj + req = self.endpoint + "/datatypes/" + ctype_id + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("id" in rspJson) + self.assertEqual(rspJson["id"], ctype_id) + self.assertTrue("root" in rspJson) + self.assertEqual(rspJson["root"], root_uuid) + self.assertTrue("created" in rspJson) + self.assertTrue("lastModified" in rspJson) + self.assertTrue("attributeCount" in rspJson) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertEqual(type_json["base"], "H5T_IEEE_F32LE") + + def testPostWithAttributes(self): + # test POST with attribute 
initialization + print("testPostWithAttributes", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # setup some attributes to include + attr_count = 4 + attributes = {} + extent = 10 + for i in range(attr_count): + value = [i * 10 + j for j in range(extent)] + data = {"type": "H5T_STD_I32LE", "shape": extent, "value": value} + attr_name = f"attr{i + 1:04d}" + attributes[attr_name] = data + + # create new datatype + link = {"id": root_uuid, "name": "linked_datatype"} + payload = {"type": "H5T_IEEE_F32LE", "attributes": attributes, "link": link} + req = helper.getEndpoint() + "/datatypes" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + ctype_id = rspJson["id"] + self.assertTrue(helper.validateId(ctype_id)) + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertEqual(type_json["base"], "H5T_IEEE_F32LE") + self.assertEqual(rspJson["attributeCount"], attr_count) + + # fetch the attributes, check count + req = f"{helper.getEndpoint()}/datatypes/{ctype_id}/attributes" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertFalse("type" in rspJson) + self.assertFalse("shape" in rspJson) + self.assertTrue("attributes" in rspJson) + self.assertEqual(len(rspJson["attributes"]), attr_count) + def testPostTypes(self): # Test creation with all primitive types @@ -370,6 +475,7 @@ def testPostWithLink(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) # link doesn't exist yet rspJson = json.loads(rsp.text) + self.assertTrue("link" in rspJson) link_json = rspJson["link"] self.assertEqual(link_json["collection"], "datatypes") @@ -509,6 +615,80 @@ def testPostWithPath(self): rspJson = json.loads(rsp.text) self.assertEqual(rspJson["id"], new_datatype_id) + def testPostMulti(self): + # test POST with multi-object creation + print("testPostMulti", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # get root group and verify link count is 0 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + + str_type = { + "charSet": "H5T_CSET_ASCII", + "class": "H5T_STRING", + "length": 12, + "strPad": "H5T_STR_NULLPAD", + } + + float_type = "H5T_IEEE_F32LE" + + # create a set of anonymous ctypes + fields = ( + {"name": "temp", "type": "H5T_STD_I32LE"}, + {"name": "pressure", "type": "H5T_IEEE_F32LE"}, + ) + compound_type = {"class": "H5T_COMPOUND", "fields": fields} + + payload = [{"type": str_type}, {"type": float_type}, {"type": compound_type}] + req = helper.getEndpoint() + "/datatypes" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201)
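For reference, the multi-create requests exercised in these tests are plain JSON lists of per-object specs, and the reply carries an "objects" list. A minimal client sketch (endpoint, domain header, and auth are placeholders, not part of the patch):

import json
import requests

endpoint = "http://localhost:5101"  # placeholder HSDS endpoint
headers = {"X-Hdf-domain": "/home/test_user1/multi.h5"}  # plus whatever auth the deployment needs

# one spec per object to create
payload = [{"type": "H5T_IEEE_F32LE"}, {"type": "H5T_STD_I32LE"}, {"type": "H5T_STD_I8LE"}]
rsp = requests.post(endpoint + "/datatypes", data=json.dumps(payload), headers=headers)
assert rsp.status_code == 201
for obj in rsp.json()["objects"]:
    print(obj["id"], obj["attributeCount"])
# the same list-of-specs form is accepted by POST /groups and POST /datasets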
+ rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), 3) + + for i in range(3): + obj_json = rsp_objs[i] + self.assertEqual(obj_json["attributeCount"], 0) + ctype_id = obj_json["id"] + self.assertTrue(helper.validateId(ctype_id)) + + # create a set of linked ctypes + for i in range(3): + item = payload[i] + item["link"] = {"id": root_uuid, "name": f"ctype_{i + 1}"} + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), 3) + for i in range(3): + json_rsp = rsp_objs[i] + self.assertEqual(json_rsp["attributeCount"], 0) + ctype_id = json_rsp["id"] + self.assertTrue(helper.validateId(ctype_id)) + + # get root group and verify link count is 3 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 3) + if __name__ == "__main__": # setup test files diff --git a/tests/integ/domain_test.py b/tests/integ/domain_test.py index 4f21d44e..f01dcc93 100755 --- a/tests/integ/domain_test.py +++ b/tests/integ/domain_test.py @@ -13,6 +13,9 @@ import time import json from os import path as pp + +from h5json.objid import createObjId, getCollectionForId + import config import helper @@ -113,33 +116,21 @@ def testGetDomain(self): attr_count = 0 for objid in domain_objs: obj_json = domain_objs[objid] - self.assertTrue("id" in obj_json) - self.assertTrue("attributeCount" in obj_json) - attr_count += obj_json["attributeCount"] - self.assertFalse("attributes" in obj_json) + collection_type = getCollectionForId(objid) + if collection_type == "datasets": + self.assertTrue("attributes" in obj_json) + self.assertTrue("type" in obj_json) + self.assertTrue("shape" in obj_json) + self.assertTrue("creationProperties" in obj_json) + elif collection_type == "groups": + self.assertTrue("attributes" in obj_json) + self.assertTrue("links" in obj_json) + else: + self.assertTrue(False) # unexpected type + attr_count += len(obj_json["attributes"]) self.assertEqual(attr_count, 4) - # get a dict of all objects in the domain including any attributes - params["include_attrs"] = 1 - rsp = self.session.get(req, headers=headers, params=params) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("domain_objs" in rspJson) - domain_objs = rspJson["domain_objs"] - self.assertEqual(len(domain_objs), 10) - attr_count = 0 - for objid in domain_objs: - obj_json = domain_objs[objid] - self.assertTrue("attributeCount" in obj_json) - self.assertTrue("attributes" in obj_json) - attributes = obj_json["attributes"] - for attr_name in attributes: - # only the names "attr1" and "attr2" are used in this domain - self.assertTrue(attr_name in ("attr1", "attr2")) - attr_count += 1 - self.assertEqual(attr_count, 4) - # passing domain via the host header is deprecated # Previously his returned 200, now it is a 400 del headers["X-Hdf-domain"] @@ -489,6 +480,94 @@ def testCreateDomain(self): self.assertTrue(k in rspJson) # we should get the same value for root id self.assertEqual(root_id, rspJson["root"]) + + def testCreateDomainWithId(self): + domain = self.base_domain + "/newdomainwithid.h5" + print("testCreateDomainWithId", domain) + headers = helper.getRequestHeaders(domain=domain) + 
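Client-generated ids tie the id-based tests together: h5json's createObjId mints an id locally, and the id is then supplied either as "id" in an object POST or as "root_id" when the domain itself is created, as in the test beginning above. A condensed sketch (endpoint and domain names are placeholders):

import json
import requests
from h5json.objid import createObjId

endpoint = "http://localhost:5101"  # placeholder
headers = {"X-Hdf-domain": "/home/test_user1/withid.h5"}

# create the domain with a client-chosen root group id
root_id = createObjId("groups")
rsp = requests.put(endpoint + "/", data=json.dumps({"root_id": root_id}), headers=headers)
assert rsp.status_code == 201 and rsp.json()["root"] == root_id

# create a group under it, again with a client-minted id
grp_id = createObjId("groups", root_id=root_id)
rsp = requests.post(endpoint + "/groups", data=json.dumps({"id": grp_id}), headers=headers)
assert rsp.status_code == 201 and rsp.json()["id"] == grp_id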
+ root_id = createObjId("groups") + body = {"root_id": root_id} + req = helper.getEndpoint() + "/" + + rsp = self.session.put(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + for k in ( + "root", + "owner", + "acls", + "created", + "lastModified", + "version", + "limits", + "compressors", + ): + self.assertTrue(k in rspJson) + + self.assertEqual(rspJson["root"], root_id) + + limit_keys = ("min_chunk_size", "max_chunk_size", "max_request_size") + limits = rspJson["limits"] + for k in limit_keys: + self.assertTrue(k in limits) + limit = limits[k] + self.assertTrue(isinstance(limit, int)) + self.assertTrue(limit > 0) + compressors = rspJson["compressors"] + for compressor in EXPECTED_COMPRESSORS: + self.assertTrue(compressor in compressors) + + # do a get on the new domain + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + for k in ( + "root", + "owner", + "class", + "created", + "lastModified", + "limits", + "version", + ): + self.assertTrue(k in rspJson) + # we should get the same value for root id + self.assertEqual(root_id, rspJson["root"]) + # should get limits here too + limits = rspJson["limits"] + for k in limit_keys: + self.assertTrue(k in limits) + limit = limits[k] + self.assertTrue(isinstance(limit, int)) + self.assertTrue(limit > 0) + + # verify we can access root groups + root_req = helper.getEndpoint() + "/groups/" + root_id + headers = helper.getRequestHeaders(domain=domain) + rsp = self.session.get(root_req, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # verify that putting the same domain again fails with a 409 error + rsp = self.session.put(req, headers=headers) + self.assertEqual(rsp.status_code, 409) + + # PUT with a different domain name should also give a 409 + # (due to the root_id conflicting) + domain2 = self.base_domain + "/newdomainwithid2.h5" + headers2 = helper.getRequestHeaders(domain=domain2) + rsp = self.session.put(req, data=json.dumps(body), headers=headers2) + self.assertEqual(rsp.status_code, 409) + + # Delete the original domain + headers = helper.getRequestHeaders(domain=domain) + rsp = self.session.delete(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # re-create the domain with the same root id + rsp = self.session.put(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 201) + """ def testCreateDomainWithCustomClass(self): domain = self.base_domain + "/newclassdomain.h6" diff --git a/tests/integ/filter_test.py b/tests/integ/filter_test.py index ea2df637..0cd7fdb5 100755 --- a/tests/integ/filter_test.py +++ b/tests/integ/filter_test.py @@ -58,6 +58,8 @@ def testDeflateCompression(self): # Create ~1MB dataset payload = {"type": "H5T_STD_I8LE", "shape": [1024, 1024]} + # use a chunked layout for compression + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} # define deflate compression gzip_filter = { "class": "H5Z_FILTER_DEFLATE", @@ -65,7 +67,7 @@ def testDeflateCompression(self): "level": 9, "name": "deflate", } - payload["creationProperties"] = {"filters": [gzip_filter]} + payload["creationProperties"] = {"layout": layout, "filters": [gzip_filter]} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -120,7 +122,9 @@ def testShuffleFilter(self): payload = {"type": "H5T_STD_I32LE", "shape": [1024, 1024]} # define sshufle compression 
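As the filter-test updates show, a compression filter is now paired with an explicit chunked layout under creationProperties; the resulting dataset payload looks like this (chunk dims chosen for illustration):

gzip_filter = {"class": "H5Z_FILTER_DEFLATE", "id": 1, "level": 9, "name": "deflate"}
shuffle_filter = {"class": "H5Z_FILTER_SHUFFLE", "id": 2, "name": "shuffle"}
payload = {
    "type": "H5T_STD_I8LE",
    "shape": [1024, 1024],
    "creationProperties": {
        "layout": {"class": "H5D_CHUNKED", "dims": [64, 64]},  # chunk shape the filters act on
        "filters": [shuffle_filter, gzip_filter],              # applied in this order
    },
}
# POST payload to <endpoint>/datasets, as in the filter tests around this hunk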
shuffle_filter = {"class": "H5Z_FILTER_SHUFFLE", "id": 2, "name": "shuffle"} - payload["creationProperties"] = {"filters": [shuffle_filter]} + # use chunked layout for compression + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} + payload["creationProperties"] = {"filters": [shuffle_filter], "layout": layout} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -182,7 +186,11 @@ def testShuffleAndDeflate(self): } # and shuffle compression shuffle_filter = {"class": "H5Z_FILTER_SHUFFLE", "id": 2, "name": "shuffle"} - payload["creationProperties"] = {"filters": [shuffle_filter, gzip_filter]} + filters = [shuffle_filter, gzip_filter] + # use chunked layout + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} + payload["creationProperties"] = {"layout": layout, "filters": filters} + req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -237,7 +245,9 @@ def testBitShuffle(self): # bit shuffle bitshuffle_filter = {"class": "H5Z_FILTER_BITSHUFFLE", "id": 32008, "name": "bitshuffle"} - payload["creationProperties"] = {"filters": [bitshuffle_filter, ]} + # use chunked layout + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} + payload["creationProperties"] = {"filters": [bitshuffle_filter], "layout": layout} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset @@ -299,7 +309,10 @@ def testBitShuffleAndDeflate(self): } # and bit shuffle bitshuffle_filter = {"class": "H5Z_FILTER_BITSHUFFLE", "id": 32008, "name": "bitshuffle"} - payload["creationProperties"] = {"filters": [bitshuffle_filter, gzip_filter]} + filters = [bitshuffle_filter, gzip_filter] + # use chunked layout + layout = {"class": "H5D_CHUNKED", "dims": [64, 64]} + payload["creationProperties"] = {"filters": filters, "layout": layout} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # create dataset diff --git a/tests/integ/group_test.py b/tests/integ/group_test.py index 7a832271..ce617e4e 100755 --- a/tests/integ/group_test.py +++ b/tests/integ/group_test.py @@ -13,6 +13,9 @@ import time import json import uuid + +from h5json.objid import createObjId + import helper import config @@ -220,6 +223,22 @@ def testPost(self): self.assertTrue("alias" in rspJson) self.assertEqual(rspJson["alias"], []) + # try with an empty body + payload = {} + req = endpoint + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + group_id = rspJson["id"] + self.assertTrue(helper.validateId(group_id)) + + # try with a type in body (as if we were trying to create a committed type) + payload["type"] = "H5T_IEEE_F32LE" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) + # try POST with user who doesn't have create permission on this domain test_user2 = config.get("user2_name") # some tests will be skipped if not set if not test_user2: @@ -233,6 +252,39 @@ def testPost(self): rsp = self.session.post(req, headers=headers) self.assertEqual(rsp.status_code, 403) # forbidden + def 
testPostWithId(self): + # test POST group with a client-generated id + print("testPostWithId", self.base_domain) + endpoint = helper.getEndpoint() + headers = helper.getRequestHeaders(domain=self.base_domain) + req = endpoint + "/groups" + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create a group id + grp_id = createObjId("groups", root_id=root_uuid) + + # create a new group using the grp_id + payload = {"id": grp_id} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(grp_id, rspJson["id"]) + + # try sending the same request again + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) # bad request + def testPostWithLink(self): # test POST with link creation print("testPostWithLink", self.base_domain) @@ -279,7 +331,7 @@ def testPostWithLink(self): self.assertEqual(rspJson["linkCount"], 0) self.assertEqual(rspJson["attributeCount"], 0) new_group_id = rspJson["id"] - self.assertTrue(helper.validateId(rspJson["id"])) + self.assertTrue(helper.validateId(new_group_id)) self.assertTrue(new_group_id != root_uuid) # get root group and verify link count is 1 @@ -310,6 +362,145 @@ def testPostWithLink(self): self.assertTrue("alias" in rspJson) self.assertEqual(rspJson["alias"], ["/linked_group",]) + def testPostIdWithLink(self): + # test POST with link creation + print("testPostIdWithLink", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + grp_count = 3 + req = helper.getEndpoint() + "/groups" + + for i in range(grp_count): + # create a group id + grp_id = createObjId("groups", root_id=root_uuid) + + # create new group + payload = {"id": grp_id, "link": {"id": root_uuid, "name": f"g{i:04d}"}} + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(grp_id, rspJson["id"]) + + # get root group and verify number of links + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], grp_count) + + def testPostWithAttributes(self): + # test POST with attribute initialization + print("testPostWithAttributes", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # setup some attributes to include + attr_count = 4 + attributes = {} + extent = 10 + for i in range(attr_count): + value = [i * 10 + j for j in range(extent)] + data = 
{"type": "H5T_STD_I32LE", "shape": extent, "value": value} + attr_name = f"attr{i + 1:04d}" + attributes[attr_name] = data + + # create new group + payload = {"attributes": attributes, "link": {"id": root_uuid, "name": "linked_group"}} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], attr_count) + grp_id = rspJson["id"] + self.assertTrue(helper.validateId(grp_id)) + + # fetch the attributes, check count + req = f"{helper.getEndpoint()}/groups/{grp_id}/attributes" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertFalse("type" in rspJson) + self.assertFalse("shape" in rspJson) + self.assertTrue("attributes" in rspJson) + self.assertEqual(len(rspJson["attributes"]), attr_count) + + def testPostWithLinks(self): + # test POST with link initialization + print("testPostWithLinks", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # some objects to link + link_count = 4 + links = {} + req = helper.getEndpoint() + "/groups" + + for i in range(link_count): + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + group_id = rspJson["id"] + self.assertTrue(helper.validateId(group_id)) + links[f"obj_{i}"] = {"id": group_id} + + # create new group + payload = {"links": links, "link": {"id": root_uuid, "name": "g1"}} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], link_count) + self.assertEqual(rspJson["attributeCount"], 0) + grp_id = rspJson["id"] + helper.validateId(grp_id) + + # fetch all the links + req = helper.getEndpoint() + "/groups/" + grp_id + "/links" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + self.assertTrue("links" in rspJson) + links_rsp = rspJson["links"] + self.assertEqual(len(links_rsp), link_count) + for i in range(link_count): + link_rsp = links_rsp[i] + self.assertTrue("class" in link_rsp) + self.assertEqual(link_rsp["class"], "H5L_TYPE_HARD") + self.assertTrue("id" in link_rsp) + self.assertTrue("title" in link_rsp) + self.assertEqual(link_rsp["title"], f"obj_{i}") + self.assertTrue("collection" in link_rsp) + self.assertEqual(link_rsp["collection"], "groups") + self.assertTrue("target" in link_rsp) + self.assertTrue("href" in link_rsp) + def testPostWithPath(self): # test POST with implicit parent group creation print("testPostWithPath", self.base_domain) @@ -427,9 +618,92 @@ def testPostWithPath(self): rsp = self.session.get(req, headers=headers, params=params) self.assertEqual(rsp.status_code, 200) + def testPostIdWithPath(self): + # test POST with a client-supplied id and implicit parent group creation + print("testPostIdWithPath", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" +
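testPostIdWithPath, which begins above, combines a client-minted id with h5path-based placement; the implicit=1 query parameter is what allows missing parent groups to be created on the fly. A condensed client sketch (endpoint and domain are placeholders):

import json
import requests
from h5json.objid import createObjId

endpoint = "http://localhost:5101"  # placeholder
headers = {"X-Hdf-domain": "/home/test_user1/paths.h5"}
root_uuid = requests.get(endpoint + "/", headers=headers).json()["root"]

body = {"id": createObjId("groups", root_id=root_uuid), "h5path": "g2/g2.1"}
# without implicit=1 the missing parent "g2" yields 404
rsp = requests.post(endpoint + "/groups", data=json.dumps(body), headers=headers)
assert rsp.status_code == 404

body["id"] = createObjId("groups", root_id=root_uuid)  # fresh id for the retry, as the test does
rsp = requests.post(endpoint + "/groups", data=json.dumps(body),
                    params={"implicit": 1}, headers=headers)
assert rsp.status_code == 201  # g2 and g2.1 both created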
rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # get root group and verify link count is 0 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + + # create new group with link path: /g1 + g1_id = createObjId("groups", root_id=root_uuid) + payload = {"id": g1_id, "h5path": "g1"} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(rspJson["id"], g1_id) + + # get root group and verify link count is 1 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 1) + + # get the group at "g1" + req = helper.getEndpoint() + "/groups/" + params = {"h5path": "/g1"} + rsp = self.session.get(req, headers=headers, params=params) + self.assertEqual(rsp.status_code, 200) + + # try creating new group with link path: /g2/g2.1 + g21_id = createObjId("groups", root_id=root_uuid) + payload = {"id": g21_id, "h5path": "g2/g2.1"} + req = helper.getEndpoint() + "/groups" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 404) # g2 not found + + # try again with implicit creation set + params = {"implicit": 1} + g21_id = createObjId("groups", root_id=root_uuid) + payload = {"id": g21_id, "h5path": "g2/g2.1"} + rsp = self.session.post(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 201) # g2 and g2.1 created + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + self.assertEqual(rspJson["attributeCount"], 0) + self.assertEqual(rspJson["id"], g21_id) + + # get root group and verify link count is 2 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 2) + + # get the group at "/g2" + req = helper.getEndpoint() + "/groups/" + params = {"h5path": "/g2"} + rsp = self.session.get(req, headers=headers, params=params) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 1) # group g2.1 + + # get the group at "/g2/g2.1" + req = helper.getEndpoint() + "/groups/" + params = {"h5path": "/g2/g2.1"} + rsp = self.session.get(req, headers=headers, params=params) + self.assertEqual(rsp.status_code, 200) + def testPostWithCreationProps(self): # test POST group with creation properties - print("testPost", self.base_domain) + print("testPostWithCreationProps", self.base_domain) endpoint = helper.getEndpoint() headers = helper.getRequestHeaders(domain=self.base_domain) req = endpoint + "/groups" @@ -470,6 +744,69 @@ def testPostWithCreationProps(self): self.assertTrue("alias" in rspJson) self.assertEqual(rspJson["alias"], []) + def testPostMulti(self): + # test POST with multi-object creation + print("testPostMulti", self.base_domain) + headers = 
helper.getRequestHeaders(domain=self.base_domain) + + # get root id + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # get root group and verify link count is 0 + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], 0) + + # create a set of anonymous groups + grp_count = 3 + req = helper.getEndpoint() + "/groups" + + payload = [{},] * grp_count + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), grp_count) + for i in range(grp_count): + grp_rsp = rsp_objs[i] + self.assertEqual(grp_rsp["linkCount"], 0) + self.assertEqual(grp_rsp["attributeCount"], 0) + group_id = grp_rsp["id"] + self.assertTrue(helper.validateId(group_id)) + + # create a set of linked groups + grp_count = 3 + payload = [] + for i in range(grp_count): + payload.append({"link": {"id": root_uuid, "name": f"g{i}"}}) + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), grp_count) + for i in range(grp_count): + grp_rsp = rsp_objs[i] + self.assertEqual(grp_rsp["linkCount"], 0) + self.assertEqual(grp_rsp["attributeCount"], 0) + group_id = grp_rsp["id"] + self.assertTrue(helper.validateId(group_id)) + + # get root group and verify link count is grp_count + req = helper.getEndpoint() + "/groups/" + root_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["linkCount"], grp_count) + def testDelete(self): # test Delete print("testDelete", self.base_domain) diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index a6f72aeb..d95e6834 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -68,6 +68,14 @@ def testHardLink(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 404) # link doesn't exist yet + # try creating link with no body + rsp = self.session.put(req, headers=headers) + self.assertEqual(rsp.status_code, 400) + + # try creating link with no items + rsp = self.session.put(req, headers=headers, data=json.dumps({})) + self.assertEqual(rsp.status_code, 400) + # try creating a link with a different user (should fail) if test_user2: headers = helper.getRequestHeaders(domain=domain, username=test_user2) @@ -262,7 +270,7 @@ def testExternalLink(self): target_path = "somewhere" link_title = "external_link" req = helper.getEndpoint() + "/groups/" + root_id + "/links/" + link_title - payload = {"h5path": target_path, "h5domain": target_domain} + payload = {"h5path": target_path, "file": target_domain} rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) # created @@ -286,7 +294,7 @@ def testExternalLink(self): self.assertEqual(rspLink["title"], link_title) self.assertEqual(rspLink["class"], "H5L_TYPE_EXTERNAL") self.assertEqual(rspLink["h5path"], target_path) - self.assertEqual(rspLink["h5domain"], 
target_domain) + self.assertEqual(rspLink["file"], target_domain) def testGetLinks(self): domain = self.base_domain + "/testGetLinks.h5" @@ -516,10 +524,10 @@ def testGet(self): self.assertTrue(link["created"] < now - 10) else: self.assertEqual(link_class, "H5L_TYPE_EXTERNAL") - for name in ("created", "class", "h5domain", "h5path", "title", "href"): + for name in ("created", "class", "file", "h5path", "title", "href"): self.assertTrue(name in link) self.assertEqual(link["title"], "extlink") - extlink_file = link["h5domain"] + extlink_file = link["file"] self.assertEqual(extlink_file, "somefile") self.assertEqual(link["h5path"], "somepath") self.assertTrue(link["created"] < now - 10) @@ -547,7 +555,8 @@ def testGet(self): self.assertTrue(name in link) self.assertEqual(link["class"], "H5L_TYPE_SOFT") - self.assertFalse("h5domain" in link) # only for external links + self.assertFalse("h5domain" in link) # deprecated name + self.assertFalse("file" in link) # only for external links self.assertEqual(link["title"], "slink") self.assertEqual(link["h5path"], "somevalue") @@ -610,12 +619,14 @@ def testGetRecursive(self): softlink_count += 1 self.assertTrue("h5path" in link) self.assertFalse("h5domain" in link) + self.assertFalse("file" in link) self.assertFalse("id" in link) self.assertTrue(link_title in expected_soft_links) elif link_class == "H5L_TYPE_EXTERNAL": extlink_count += 1 self.assertTrue("h5path" in link) - self.assertTrue("h5domain" in link) + self.assertTrue("file" in link) + self.assertFalse("h5domain" in link) # deprecated name self.assertFalse("id" in link) self.assertTrue(link_title in expected_external_links) else: @@ -682,7 +693,7 @@ def testGetPattern(self): self.assertEqual(len(links), 1) # only extlink should be returned link = links[0] - for name in ("created", "class", "h5domain", "h5path", "title"): + for name in ("created", "class", "file", "h5path", "title"): self.assertTrue(name in link) if use_post: pass # no href with post @@ -690,7 +701,7 @@ def testGetPattern(self): self.assertTrue("href" in link) self.assertEqual(link["class"], "H5L_TYPE_EXTERNAL") self.assertEqual(link["title"], "extlink") - self.assertEqual(link["h5domain"], "somefile") + self.assertEqual(link["file"], "somefile") self.assertEqual(link["h5path"], "somepath") self.assertTrue(link["created"] < now - 10) @@ -918,7 +929,7 @@ def testExternalLinkTraversal(self): target_path = "/external_group" link_title = "external_link_to_group" req = helper.getEndpoint() + "/groups/" + root_id + "/links/" + link_title - payload = {"h5path": target_path, "h5domain": second_domain} + payload = {"h5path": target_path, "file": second_domain} headers = helper.getRequestHeaders(domain=domain) rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) @@ -951,7 +962,7 @@ def testExternalLinkTraversal(self): target_path = "/external_group" link_title = "external_link_to_group_prefix" req = helper.getEndpoint() + "/groups/" + root_id + "/links/" + link_title - payload = {"h5path": target_path, "h5domain": f"hdf5:/{second_domain}"} + payload = {"h5path": target_path, "file": f"hdf5:/{second_domain}"} headers = helper.getRequestHeaders(domain=domain) rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) @@ -1214,10 +1225,10 @@ def testPostLinkSingle(self): self.assertTrue(link["created"] < now - 10) else: self.assertEqual(link_class, "H5L_TYPE_EXTERNAL") - for name in ("created", "class", "h5domain", "h5path", "title"): 
+ for name in ("created", "class", "file", "h5path", "title"): self.assertTrue(name in link) self.assertEqual(link["title"], "extlink") - extlink_file = link["h5domain"] + extlink_file = link["file"] self.assertEqual(extlink_file, "somefile") self.assertEqual(link["h5path"], "somepath") self.assertTrue(link["created"] < now - 10) @@ -1288,7 +1299,7 @@ def testPostLinkMultiple(self): # soft or external link self.assertEqual(link["h5path"], expected["h5path"]) if link_class == "H5L_TYPE_EXTERNAL": - self.assertEqual(link["h5domain"], expected["h5domain"]) + self.assertEqual(link["file"], expected["file"]) # get just the requested links for each group req = helper.getEndpoint() + "/groups/" + root_id + "/links" @@ -1481,12 +1492,211 @@ def testPutLinkMultiple(self): links = {} for i in range(grp_count): title = grp_names[i] - links[title] = {"id": grp_ids[i]} + if i % 2 == 0: + # create a hardlink implicitly + links[title] = {"id": grp_ids[i]} + else: + # for variety, create a hardlink by providing full link json + links[title] = {"class": "H5L_TYPE_HARD", "id": grp_ids[i]} + + # add a soft and external link as well + links["softlink"] = {"h5path": "a_path"} + links["extlink"] = {"h5path": "another_path", "file": "/a_domain"} + link_count = len(links) + + # write links to the grpA + data = {"links": links} + req = self.endpoint + "/groups/" + grpA_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # do a get on the links + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + ret_links = rspJson["links"] + self.assertEqual(len(ret_links), link_count) + for link in ret_links: + self.assertTrue("title" in link) + title = link["title"] + self.assertTrue("class" in link) + link_class = link["class"] + if link_class == "H5L_TYPE_HARD": + self.assertTrue("id" in link) + self.assertTrue(link["id"] in grp_ids) + self.assertTrue(title in grp_names) + elif link_class == "H5L_TYPE_SOFT": + self.assertTrue("h5path" in link) + h5path = link["h5path"] + self.assertEqual(h5path, "a_path") + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertTrue("h5path" in link) + h5path = link["h5path"] + self.assertEqual(h5path, "another_path") + self.assertTrue("file" in link) + h5domain = link["file"] + self.assertEqual(h5domain, "/a_domain") + else: + self.assertTrue(False) # unexpected + + # try writing again, should get 200 (no new links) + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + + # write some links to three group objects + links = {} + links["hardlink_multicast"] = {"id": root_id} + links["softlink_multicast"] = {"h5path": "multi_path"} + links["extlink_multicast"] = {"h5path": "multi_path", "file": "/another_domain"} + link_count = len(links) + data = {"links": links, "grp_ids": grp_ids} + req = self.endpoint + "/groups/" + root_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # check that the links got created + for grp_id in grp_ids: + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + ret_links = rspJson["links"] + self.assertEqual(len(ret_links), 3) + for ret_link in ret_links: + self.assertTrue("class" in ret_link) + 
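Throughout the link tests the external-link key is now "file" rather than the deprecated "h5domain"; creating one from a client is a small PUT (target names below are placeholders):

import json
import requests

endpoint = "http://localhost:5101"  # placeholder
headers = {"X-Hdf-domain": "/home/test_user1/links.h5"}
root_uuid = requests.get(endpoint + "/", headers=headers).json()["root"]

# "h5path" is the path inside the target domain; "file" names the target domain
body = {"h5path": "/external_group", "file": "/home/test_user1/other.h5"}
req = f"{endpoint}/groups/{root_uuid}/links/external_link"
rsp = requests.put(req, data=json.dumps(body), headers=headers)
assert rsp.status_code == 201
link = requests.get(req, headers=headers).json()["link"]
assert link["file"] == body["file"]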
link_class = ret_link["class"] + if link_class == "H5L_TYPE_HARD": + self.assertTrue("id" in ret_link) + self.assertEqual(ret_link["id"], root_id) + elif link_class == "H5L_TYPE_SOFT": + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], "multi_path") + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], "multi_path") + self.assertTrue("file" in ret_link) + self.assertEqual(ret_link["file"], "/another_domain") + else: + self.assertTrue(False) # unexpected + + # write different links to three group objects + link_data = {} + for i in range(grp_count): + grp_id = grp_ids[i] + links = {} + links[f"hardlink_{i}"] = {"id": root_id} + links[f"softlink_{i}"] = {"h5path": f"multi_path_{i}"} + ext_link = {"h5path": f"multi_path_{i}", "file": f"/another_domain/{i}"} + links[f"extlink_{i}"] = ext_link + link_data[grp_id] = {"links": links} + + data = {"grp_ids": link_data} + req = self.endpoint + "/groups/" + root_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # check that the new links got created + for i in range(grp_count): + grp_id = grp_ids[i] + titles = [f"hardlink_{i}", f"softlink_{i}", f"extlink_{i}", ] + data = {"titles": titles} + # do a post to just return the links we are interested in + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + ret_links = rspJson["links"] + self.assertEqual(len(ret_links), len(titles)) + for j in range(len(titles)): + ret_link = ret_links[j] + self.assertTrue("class" in ret_link) + link_class = ret_link["class"] + self.assertTrue("title" in ret_link) + link_title = ret_link["title"] + if link_class == "H5L_TYPE_HARD": + self.assertEqual(link_title, f"hardlink_{i}") + self.assertTrue("id" in ret_link) + self.assertEqual(ret_link["id"], root_id) + elif link_class == "H5L_TYPE_SOFT": + self.assertEqual(link_title, f"softlink_{i}") + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], f"multi_path_{i}") + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertEqual(link_title, f"extlink_{i}") + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], f"multi_path_{i}") + self.assertTrue("file" in ret_link) + self.assertEqual(ret_link["file"], f"/another_domain/{i}") + else: + self.assertTrue(False) # unexpected + + def testPutLinkMultipleWithTimestamps(self): + domain = self.base_domain + "/testPutLinkMultipleWithTImestamps.h5" + helper.setupDomain(domain) + print("testPutLinkMultipleWithTimestamps", domain) + headers = helper.getRequestHeaders(domain=domain) + req = self.endpoint + "/" + + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_id = rspJson["root"] + + # create a group + req = self.endpoint + "/groups" + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + grpA_id = rspJson["id"] + self.assertTrue(helper.validateId(grpA_id)) + + # link new obj as '/grpA' + req = self.endpoint + "/groups/" + root_id + "/links/grpA" + payload = {"id": grpA_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # created + + # create some groups under grp1 + grp_count = 3 
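The timestamp variants that follow rely on the server accepting a client-supplied "created" value when it is close to the server's clock and ignoring it otherwise; in client terms (placeholders as before):

import json
import time
import requests

endpoint = "http://localhost:5101"  # placeholder
headers = {"X-Hdf-domain": "/home/test_user1/ts.h5"}
root_uuid = requests.get(endpoint + "/", headers=headers).json()["root"]

now = time.time()
body = {"h5path": "some_path", "created": now}  # client-assigned creation time
req = f"{endpoint}/groups/{root_uuid}/links/slink"
assert requests.put(req, data=json.dumps(body), headers=headers).status_code == 201

link = requests.get(req, headers=headers).json()["link"]
assert abs(link["created"] - now) < 1.0  # honored; a badly skewed value would be replaced by server time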
+ + grp_names = [f"grp{(i + 1):04d}" for i in range(grp_count)] + grp_ids = [] + + for grp_name in grp_names: + # create sub_groups + req = self.endpoint + "/groups" + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + grp_id = rspJson["id"] + self.assertTrue(helper.validateId(grp_id)) + grp_ids.append(grp_id) + + # create some links + links = {} + for i in range(grp_count): + title = grp_names[i] + if i % 2 == 0: + # create a hardlink implicitly + links[title] = {"id": grp_ids[i]} + else: + # for variety, create a hardlink by providing full link json + links[title] = {"class": "H5L_TYPE_HARD", "id": grp_ids[i]} # add a soft and external link as well links["softlink"] = {"h5path": "a_path"} - links["extlink"] = {"h5path": "another_path", "h5domain": "/a_domain"} + links["extlink"] = {"h5path": "another_path", "file": "/a_domain"} link_count = len(links) + # add timestamp + timestamps = set() + for title in links: + link = links[title] + now = time.time() + link["created"] = now + timestamps.add(now) # write links to the grpA data = {"links": links} @@ -1518,11 +1728,13 @@ def testPutLinkMultiple(self): self.assertTrue("h5path" in link) h5path = link["h5path"] self.assertEqual(h5path, "another_path") - self.assertTrue("h5domain" in link) - h5domain = link["h5domain"] + self.assertTrue("file" in link) + h5domain = link["file"] self.assertEqual(h5domain, "/a_domain") else: self.assertTrue(False) # unexpected + self.assertTrue("created" in link) + self.assertTrue(link["created"] in timestamps) # try writing again, should get 200 (no new links) rsp = self.session.put(req, data=json.dumps(data), headers=headers) @@ -1532,8 +1744,15 @@ def testPutLinkMultiple(self): links = {} links["hardlink_multicast"] = {"id": root_id} links["softlink_multicast"] = {"h5path": "multi_path"} - links["extlink_multicast"] = {"h5path": "multi_path", "h5domain": "/another_domain"} + links["extlink_multicast"] = {"h5path": "multi_path", "file": "/another_domain"} link_count = len(links) + timestamps = set() + for title in links: + link = links[title] + now = time.time() + link["created"] = now + timestamps.add(now) + data = {"links": links, "grp_ids": grp_ids} req = self.endpoint + "/groups/" + root_id + "/links" rsp = self.session.put(req, data=json.dumps(data), headers=headers) @@ -1560,20 +1779,28 @@ def testPutLinkMultiple(self): elif link_class == "H5L_TYPE_EXTERNAL": self.assertTrue("h5path" in ret_link) self.assertEqual(ret_link["h5path"], "multi_path") - self.assertTrue("h5domain" in ret_link) - self.assertEqual(ret_link["h5domain"], "/another_domain") + self.assertTrue("file" in ret_link) + self.assertEqual(ret_link["file"], "/another_domain") else: self.assertTrue(False) # unexpected + self.assertTrue("created" in ret_link) + self.assertTrue(ret_link["created"] in timestamps) # write different links to three group objects link_data = {} + timestamps = set() for i in range(grp_count): grp_id = grp_ids[i] links = {} links[f"hardlink_{i}"] = {"id": root_id} links[f"softlink_{i}"] = {"h5path": f"multi_path_{i}"} - ext_link = {"h5path": f"multi_path_{i}", "h5domain": f"/another_domain/{i}"} + ext_link = {"h5path": f"multi_path_{i}", "file": f"/another_domain/{i}"} links[f"extlink_{i}"] = ext_link + for title in links: + link = links[title] + now = time.time() + link["created"] = now + timestamps.add(now) link_data[grp_id] = {"links": links} data = {"grp_ids": link_data} @@ -1612,10 +1839,12 @@ def testPutLinkMultiple(self): 
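For reference, the multi-link PUT used in these tests accepts two body shapes: one "links" map fanned out to a list of "grp_ids", or a per-group map keyed by group id. Sketched as plain payloads (ids are placeholders):

# same set of links written to every group in the list
fanout = {
    "links": {
        "hard": {"id": "<target-object-id>"},
        "soft": {"h5path": "a_path"},
        "ext": {"h5path": "another_path", "file": "/a_domain"},
    },
    "grp_ids": ["<grp-id-1>", "<grp-id-2>"],
}

# different links per group
per_group = {
    "grp_ids": {
        "<grp-id-1>": {"links": {"hardlink_0": {"id": "<root-id>"}}},
        "<grp-id-2>": {"links": {"softlink_1": {"h5path": "multi_path_1"}}},
    }
}
# PUT either payload to <endpoint>/groups/<root-id>/links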
self.assertEqual(link_title, f"extlink_{i}") self.assertTrue("h5path" in ret_link) self.assertEqual(ret_link["h5path"], f"multi_path_{i}") - self.assertTrue("h5domain" in ret_link) - self.assertEqual(ret_link["h5domain"], f"/another_domain/{i}") + self.assertTrue("file" in ret_link) + self.assertEqual(ret_link["file"], f"/another_domain/{i}") else: self.assertTrue(False) # unexpected + self.assertTrue("created" in ret_link) + self.assertTrue(ret_link["created"] in timestamps) def testDeleteLinkMultiple(self): domain = self.base_domain + "/testDeleteLinkMultiple.h5" @@ -1660,7 +1889,7 @@ def testDeleteLinkMultiple(self): links[title] = {"h5path": "a_path"} titles.append(title) title = "extlink" - links[title] = {"h5path": "another_path", "h5domain": "/a_domain"} + links[title] = {"h5path": "another_path", "file": "/a_domain"} titles.append(title) link_count = len(links) @@ -1767,7 +1996,6 @@ def testLinkCreationOrder(self): self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) links_json = rspJson["links"] - print("params:", params) # verify the links are in order for i in range(link_count - 1): @@ -1793,6 +2021,58 @@ def testLinkCreationOrder(self): self.assertEqual(prev_link['title'], sorted(link_names)[i]) self.assertEqual(link['title'], sorted(link_names)[i + 1]) + def testUseTimestamp(self): + # Test PUT value for link with timestamp included + domain = self.base_domain + "/testLinkUseTimestamp.h5" + + helper.setupDomain(domain) + print("testUseTimestamp", domain) + headers = helper.getRequestHeaders(domain=domain) + req = helper.getEndpoint() + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + def _create_link(title, ts=None): + # create link + req = helper.getEndpoint() + f"/groups/{root_uuid}/links/{title}" + body = {"h5path": "some_path"} + if ts: + body["created"] = ts + rsp = self.session.put(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 201) + + def _check_link_ts(title, min_ts=None, max_ts=None): + # read link + req = helper.getEndpoint() + f"/groups/{root_uuid}/links/{title}" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("created" in rspJson) + if min_ts: + self.assertGreaterEqual(rspJson["created"], min_ts) + if max_ts: + self.assertLessEqual(rspJson["created"], max_ts) + + now = time.time() + # server-based timestamp + _create_link("a1", ts=None) + _check_link_ts("a1", min_ts=(now - 1), max_ts=(now + 1)) + # client assigned timestamp + _create_link("a2", ts=now) + _check_link_ts("a2", min_ts=now, max_ts=now) + # client assigned with small time-skew, ok + _create_link("a3", ts=int(now)) + _check_link_ts("a3", min_ts=int(now), max_ts=int(now)) + # client assigned with large time-skew, ignored + _create_link("a4", ts=999) + _check_link_ts("a4", min_ts=(now - 1), max_ts=(now + 1)) + if __name__ == "__main__": # setup test files diff --git a/tests/integ/pointsel_test.py b/tests/integ/pointsel_test.py index 194eb2ce..57949114 100755 --- a/tests/integ/pointsel_test.py +++ b/tests/integ/pointsel_test.py @@ -1370,8 +1370,10 @@ def testDatasetChunkPartitioning(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("layout" in rspJson) - layout_json = 
rspJson["layout"] + self.assertTrue("creationProperties" in rspJson) + creation_props = rspJson["creationProperties"] + self.assertTrue("layout" in creation_props) + layout_json = creation_props["layout"] self.assertTrue("class" in layout_json) self.assertEqual(layout_json["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout_json) diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index 66287caf..02a4f990 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -111,9 +111,7 @@ def testPut1DDataset(self): rspJson = json.loads(rsp.text) self.assertTrue("hrefs" in rspJson) self.assertTrue("value" in rspJson) - expect_value = [ - 0, - ] + expect_value = [0, ] expect_value *= data["shape"] self.assertEqual(rspJson["value"], expect_value) @@ -928,7 +926,7 @@ def testPutScalarDataset(self): rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) - # read unintialized value from dataset + # read uninitialized value from dataset req = self.endpoint + "/datasets/" + dset_id + "/value" rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) @@ -951,6 +949,145 @@ def testPutScalarDataset(self): self.assertTrue("value" in rspJson) self.assertEqual(rspJson["value"], "Hello, world") + def testScalarDatasetInitData(self): + # Test creation of a scalar dataset obj along with initial data + print("testScalarDatasetInitData", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create a dataset obj + data = {"type": "H5T_STD_I32LE", "shape": "H5S_SCALAR", "value": 42} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + self.assertEqual(rspJson["attributeCount"], 0) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # read back the obj + req = self.endpoint + "/datasets/" + dset_id + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + expected_keys = [ + "id", + "shape", + "hrefs", + "creationProperties", + "attributeCount", + "created", + "lastModified", + "root", + "domain", + ] + + for name in expected_keys: + self.assertTrue(name in rspJson) + self.assertEqual(rspJson["id"], dset_id) + self.assertEqual(rspJson["root"], root_uuid) + self.assertEqual(rspJson["domain"], self.base_domain) + self.assertEqual(rspJson["attributeCount"], 0) + shape_json = rspJson["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(rspJson["type"]["base"], "H5T_STD_I32LE") + + # read the data back + req += "/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], 42) + + def testScalarDatasetInitDataMulti(self): + # Test creation of multiple scalar dataset objs along with initial data + print("testScalarDatasetInitDataMulti", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) +
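testScalarDatasetInitData above (and the multi variant that follows) show that a dataset POST body may carry "value" next to "type" and "shape", both for a single object and for a list of specs; for example (endpoint and domain are placeholders):

import json
import requests

endpoint = "http://localhost:5101"  # placeholder
headers = {"X-Hdf-domain": "/home/test_user1/values.h5"}

# one scalar dataset initialized to 42
scalar = {"type": "H5T_STD_I32LE", "shape": "H5S_SCALAR", "value": 42}
assert requests.post(endpoint + "/datasets", data=json.dumps(scalar),
                     headers=headers).status_code == 201

# several at once, each with its own initial value
batch = [{"type": "H5T_STD_I32LE", "value": i} for i in range(3)]
assert requests.post(endpoint + "/datasets", data=json.dumps(batch),
                     headers=headers).status_code == 201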
self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + dataset_count = 3 + datatype = "H5T_STD_I32LE" + payload = [] + for i in range(dataset_count): + dataset_args = {"type": datatype} + dataset_args["value"] = i + payload.append(dataset_args) + + # create dataset objects + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + + self.assertTrue("objects" in rspJson) + rsp_objs = rspJson["objects"] + self.assertEqual(len(rsp_objs), dataset_count) + + for i in range(dataset_count): + obj_json = rsp_objs[i] + self.assertEqual(obj_json["attributeCount"], 0) + dset_id = obj_json["id"] + self.assertTrue(helper.validateId(dset_id)) + self.assertTrue(dset_id.startswith("d-")) + + # read back the obj + for i in range(dataset_count): + dset_id = rsp_objs[i]["id"] + req = self.endpoint + "/datasets/" + dset_id + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + expected_keys = [ + "id", + "shape", + "hrefs", + "creationProperties", + "attributeCount", + "created", + "lastModified", + "root", + "domain", + ] + + for name in expected_keys: + self.assertTrue(name in rspJson) + self.assertEqual(rspJson["id"], dset_id) + self.assertEqual(rspJson["root"], root_uuid) + self.assertEqual(rspJson["domain"], self.base_domain) + self.assertEqual(rspJson["attributeCount"], 0) + shape_json = rspJson["shape"] + self.assertTrue(shape_json["class"], "H5S_SCALAR") + self.assertTrue(rspJson["type"], "H5T_STD_I32LE") + + # read the data back + for i in range(dataset_count): + dset_id = rsp_objs[i]["id"] + req = self.endpoint + "/datasets/" + dset_id + "/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], i) + def testNullSpaceDataset(self): # Test attempted read/write to null space dataset print("testNullSpaceDataset", self.base_domain) @@ -1212,6 +1349,89 @@ def testPutCompound(self): self.assertEqual(len(item), 1) self.assertEqual(item[0], i * 10) + def testPutCompoundInitData(self): + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + str_type = { + "charSet": "H5T_CSET_ASCII", + "class": "H5T_STRING", + "strPad": "H5T_STR_NULLPAD", + "length": 5, + } + + fields = ( + {"name": "temp", "type": "H5T_STD_I32LE"}, + {"name": "unit", "type": str_type}, + ) + datatype = {"class": "H5T_COMPOUND", "fields": fields} + + # + # create compound scalar dataset + # + value = (42, 'C') + payload = {"type": datatype, "value": value} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # create dataset + + rspJson = json.loads(rsp.text) + dset0d_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset0d_uuid)) + + # verify the shape of the dataset + req = self.endpoint + "/datasets/" + dset0d_uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) # get dataset + rspJson = 
json.loads(rsp.text) + shape = rspJson["shape"] + self.assertEqual(shape["class"], "H5S_SCALAR") + + # read back the value + req = self.endpoint + "/datasets/" + dset0d_uuid + "/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], [42, 'C']) + + # + # create 1d dataset + # + + # make up some data + num_elements = 10 + value = [] + for i in range(num_elements): + item = (i * 10, chr(ord('A') + i)) + value.append(item) + payload = {"type": datatype, "shape": num_elements, "value": value} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # create dataset + + rspJson = json.loads(rsp.text) + dset1d_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset1d_uuid)) + + # read back the value + req = self.endpoint + "/datasets/" + dset1d_uuid + "/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(len(rspJson["value"]), num_elements) + self.assertEqual(rspJson["value"][2], [20, 'C']) + def testSimpleTypeFillValue(self): # test Dataset with simple type and fill value print("testSimpleTypeFillValue", self.base_domain) @@ -1911,7 +2131,7 @@ def testResizable1DValue(self): # read values from the extended region req = self.endpoint + "/datasets/" + dset_uuid + "/value" - params = {"select": "[{}:{}]".format(0, num_elements)} + params = {"select": f"[0:{num_elements}]"} rsp = self.session.get(req, params=params, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) @@ -3014,7 +3234,7 @@ def testARangeInitializerDataset(self): extent = 1_000_000_000 # one billion elements dset_dims = [extent, ] layout = {"class": "H5D_CHUNKED"} - layout["dims"] = dset_dims + layout["dims"] = [1_000, ] range_start = 0 # -0.25 range_step = 1 @@ -3064,6 +3284,7 @@ def testARangeInitializerDataset(self): def testIntelligentRangeGet1D(self): test_name = "testIntelligentRangeGet1D" + print(test_name, self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) diff --git a/tests/integ/vlen_test.py b/tests/integ/vlen_test.py index e45504e6..28bb3e90 100755 --- a/tests/integ/vlen_test.py +++ b/tests/integ/vlen_test.py @@ -13,18 +13,15 @@ import json import helper import numpy as np -import sys -sys.path.append("../..") -from hsds.util.arrayUtil import arrayToBytes, bytesToArray -from hsds.util.hdf5dtype import createDataType +from h5json.hdf5dtype import createDataType +from h5json.array_util import arrayToBytes, bytesToArray class VlenTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(VlenTest, self).__init__(*args, **kwargs) self.base_domain = helper.getTestDomainName(self.__class__.__name__) - print(self.base_domain) helper.setupDomain(self.base_domain) self.endpoint = helper.getEndpoint() @@ -38,7 +35,7 @@ def tearDown(self): # main def testPutVLenInt(self): - # Test PUT value for 1d attribute with variable length int types + # Test PUT value for 1d dataset with variable length int types print("testPutVLenInt", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) @@ -123,7 +120,7 @@ def testPutVLenInt(self): self.assertEqual(value[1], [1, 2, 3, 4]) def 
testPutVLenIntBinary(self): - # Test PUT value for 1d attribute with variable length int types using binary transfer + # Test PUT value for 1d dataset with variable length int types using binary transfer print("testPutVLenIntBinary", self.base_domain) count = 4 @@ -220,7 +217,7 @@ def testPutVLenIntBinary(self): self.assertEqual(value[0], [1, 2, 3]) def testPutVLen2DInt(self): - # Test PUT value for 1d attribute with variable length int types + # Test PUT value for 1d dataset with variable length int types print("testPutVLen2DInt", self.base_domain) nrow = 2 ncol = 2 @@ -297,7 +294,7 @@ def testPutVLen2DInt(self): self.assertEqual(value[0][1], [1, 2]) def testPutVLenString(self): - # Test PUT value for 1d attribute with variable length string types + # Test PUT value for 1d dataset with variable length string types print("testPutVLenString", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) @@ -367,7 +364,7 @@ def testPutVLenString(self): self.assertEqual(value[1], data[3]) def testPutVLenStringBinary(self): - # Test PUT value for 1d attribute with variable length string types + # Test PUT value for 1d dataset with variable length string types print("testPutVLenStringBinary", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) diff --git a/tests/unit/array_util_test.py b/tests/unit/array_util_test.py deleted file mode 100644 index 1a4f40e5..00000000 --- a/tests/unit/array_util_test.py +++ /dev/null @@ -1,1025 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. 
# -############################################################################## -import unittest -import json -import numpy as np - -import sys -import base64 - -sys.path.append("../..") -from hsds.util.arrayUtil import ( - bytesArrayToList, - toTuple, - getNumElements, - jsonToArray, - arrayToBytes, - bytesToArray, - getByteArraySize, - IndexIterator, - ndarray_compare, - getNumpyValue, - getBroadcastShape -) -from hsds.util.hdf5dtype import special_dtype -from hsds.util.hdf5dtype import check_dtype -from hsds.util.hdf5dtype import createDataType - - -class ArrayUtilTest(unittest.TestCase): - def __init__(self, *args, **kwargs): - super(ArrayUtilTest, self).__init__(*args, **kwargs) - # main - - def testByteArrayToList(self): - data_items = ( - 42, - "foo", - b"foo", - [1, 2, 3], - (1, 2, 3), - ["A", "B", "C"], - [b"A", b"B", b"C"], - [["A", "B"], [b"a", b"b", b"c"]], - ) - for data in data_items: - json_data = bytesArrayToList(data) - # will throw TypeError if not able to convert - json.dumps(json_data) - - def testToTuple(self): - data0d = 42 # scalar - data1d1 = [1] # one dimensional, one element list - data1d = [1, 2, 3, 4, 5] # list - data2d1 = [ - [1, 2], - ] # two dimensional, one element - data2d = [[1, 0.1], [2, 0.2], [3, 0.3], [4, 0.4]] # list of two-element lists - data3d = [[[0, 0.0], [1, 0.1]], [[2, 0.2], [3, 0.3]]] # list of list of lists - out = toTuple(0, data0d) - self.assertEqual(data0d, out) - out = toTuple(1, data1d1) - self.assertEqual(data1d1, out) - out = toTuple(1, data1d) - self.assertEqual(data1d, out) - out = toTuple(2, data2d) - self.assertEqual(data2d, out) - out = toTuple(1, data2d1) - self.assertEqual([(1, 2)], out) - out = toTuple(3, data3d) - self.assertEqual(data3d, out) - out = toTuple(1, data2d) # treat input as 1d array of two-field compound types - self.assertEqual([(1, 0.1), (2, 0.2), (3, 0.3), (4, 0.4)], out) - out = toTuple(2, data3d) # treat input as 2d array of two-field compound types - self.assertEqual([[(0, 0.0), (1, 0.1)], [(2, 0.2), (3, 0.3)]], out) - out = toTuple(1, data3d) # treat input a 1d array of compound type of compound types - self.assertEqual([((0, 0.0), (1, 0.1)), ((2, 0.2), (3, 0.3))], out) - - def testGetNumElements(self): - shape = (4,) - nelements = getNumElements(shape) - self.assertEqual(nelements, 4) - - shape = [10,] - nelements = getNumElements(shape) - self.assertEqual(nelements, 10) - - shape = (10, 8) - nelements = getNumElements(shape) - self.assertEqual(nelements, 80) - - def testJsonToArray(self): - dt = np.dtype("i4") - shape = [4, ] - data = [0, 2, 4, 6] - out = jsonToArray(shape, dt, data) - - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(out.shape, (4,)) - for i in range(4): - self.assertEqual(out[i], i * 2) - - # compound type - dt = np.dtype([("a", "i4"), ("b", "S5")]) - shape = [2, ] - data = [[4, "four"], [5, "five"]] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - - self.assertEqual(out.shape, (2,)) - self.assertTrue(isinstance(out[0], np.void)) - e0 = out[0].tolist() - self.assertEqual(e0, (4, b"four")) - self.assertTrue(isinstance(out[1], np.void)) - e1 = out[1].tolist() - self.assertEqual(e1, (5, b"five")) - - shape = [1, ] - data = [ - [6, "six"], - ] - out = jsonToArray(shape, dt, data) - e0 = out[0].tolist() - self.assertEqual(e0, (6, b"six")) - - data = [6, "six"] - out = jsonToArray(shape, dt, data) - e0 = out[0].tolist() - self.assertEqual(e0, (6, b"six")) - - # test ascii chars >127 - dt = np.dtype("S26") - data = "extended ascii char 241: 
" + chr(241) - out = jsonToArray(shape, dt, data) - self.assertEqual(out[0], b'extended ascii char 241: \xc3') - - dt = np.dtype("S12") - data = "eight: \u516b" - out = jsonToArray(shape, dt, data) - self.assertEqual(out[0], b'eight: \xe5\x85\xab') - - # VLEN ascii - dt = special_dtype(vlen=bytes) - data = [b"one", b"two", b"three", b"four", b"five"] - shape = [5, ] - out = jsonToArray(shape, dt, data) - self.assertTrue("vlen" in out.dtype.metadata) - self.assertEqual(out.dtype.metadata["vlen"], bytes) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out.shape, (5,)) - # TBD: code does not actually enforce use of bytes vs. str, - # probably not worth the effort to fix - self.assertEqual(out[2], b"three") - self.assertEqual(out[3], b"four") - - # VLEN str - dt = special_dtype(vlen=str) - data = [ - [b"part 1 - section A", b"part 1 - section B"], - [b"part 2 - section A", b"part 2 - section B"], - ] - shape = [2,] - out = jsonToArray(shape, dt, data) - self.assertTrue("vlen" in out.dtype.metadata) - self.assertEqual(out.dtype.metadata["vlen"], str) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out.shape, (2,)) - self.assertEqual(out[0], tuple(data[0])) - self.assertEqual(out[1], tuple(data[1])) - - # VLEN Scalar str - dt = special_dtype(vlen=str) - data = "I'm a string!" - shape = [1, ] - out = jsonToArray(shape, dt, data) - - # VLEN unicode - dt = special_dtype(vlen=bytes) - data = ["one", "two", "three", "four", "five"] - shape = [5, ] - out = jsonToArray(shape, dt, data) - self.assertTrue("vlen" in out.dtype.metadata) - self.assertEqual(out.dtype.metadata["vlen"], bytes) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out[2], b"three") - - # VLEN data - dt = special_dtype(vlen=np.dtype("int32")) - shape = [4, ] - data = [ - [1,], - [1, 2], - [1, 2, 3], - [1, 2, 3, 4], - ] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) - - self.assertEqual(out.shape, (4,)) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) - for i in range(4): - e = out[i] # .tolist() - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, tuple(range(1, i + 2))) - - # VLEN 2D data - dt = special_dtype(vlen=np.dtype("int32")) - shape = [2, 2] - data = [ - [ - [0,], - [1, 2], - ], - [ - [1,], - [2, 3], - ], - ] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) - - self.assertEqual(out.shape, (2, 2)) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) - for i in range(2): - for j in range(2): - e = out[i, j] # .tolist() - self.assertTrue(isinstance(e, tuple)) - - # create VLEN of obj ref's - ref_type = {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"} - vlen_type = {"class": "H5T_VLEN", "base": ref_type} - dt = createDataType(vlen_type) # np datatype - - id0 = b"g-a4f455b2-c8cf-11e7-8b73-0242ac110009" - id1 = b"g-a50af844-c8cf-11e7-8b73-0242ac110009" - id2 = b"g-a5236276-c8cf-11e7-8b73-0242ac110009" - - data = [ - [id0, ], - [id0, id1], - [id0, id1, id2], - ] - shape = [3, ] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - base_type = check_dtype(vlen=out.dtype) - self.assertEqual(base_type.kind, "S") - self.assertEqual(base_type.itemsize, 48) - - self.assertEqual(out.shape, (3,)) - self.assertEqual(out.dtype.kind, "O") - 
self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("S48")) - - e = out[0] - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, (id0,)) - e = out[1] - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, (id0, id1)) - e = out[2] - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, (id0, id1, id2)) - - # compound type with array field - dt = np.dtype([("a", ("i4", 3)), ("b", "S5")]) - shape = [2, ] - data = [[[4, 8, 12], "four"], [[5, 10, 15], "five"]] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - - self.assertEqual(out.shape, (2,)) - self.assertTrue(isinstance(out[0], np.void)) - e0 = out[0] - self.assertEqual(len(e0), 2) - e0a = e0[0] - self.assertTrue(isinstance(e0a, np.ndarray)) - self.assertEqual(e0a[0], 4) - self.assertEqual(e0a[1], 8) - self.assertEqual(e0a[2], 12) - e0b = e0[1] - self.assertEqual(e0b, b"four") - self.assertTrue(isinstance(out[1], np.void)) - e1 = out[1] - self.assertEqual(len(e1), 2) - e1a = e1[0] - self.assertTrue(isinstance(e1a, np.ndarray)) - self.assertEqual(e1a[0], 5) - self.assertEqual(e1a[1], 10) - self.assertEqual(e1a[2], 15) - e1b = e1[1] - self.assertEqual(e1b, b"five") - - def testToBytes(self): - # Simple array - dt = np.dtype(" expected_num_bytes) - - # convert buffer back to arr - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(np.array_equal(arr, arr_copy)) - - # fixed length string - dt = np.dtype("S8") - arr = np.asarray(("abcdefgh", "ABCDEFGH", "12345678"), dtype=dt) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (3,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # Compound non-vlen - dt = np.dtype([("x", "f8"), ("y", "i4")]) - arr = np.zeros((4,), dtype=dt) - arr[0] = (3.12, 42) - arr[3] = (1.28, 69) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # VLEN of int32's - dt = np.dtype("O", metadata={"vlen": np.dtype("int32")}) - arr = np.zeros((4,), dtype=dt) - arr[0] = np.int32([1, ]) - arr[1] = np.int32([1, 2]) - arr[2] = 0 # test un-intialized value - arr[3] = np.int32([1, 2, 3]) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # VLEN of strings - dt = np.dtype("O", metadata={"vlen": str}) - arr = np.zeros((5,), dtype=dt) - arr[0] = "one: \u4e00" - arr[1] = "two: \u4e8c" - arr[2] = "three: \u4e09" - arr[3] = "four: \u56db" - arr[4] = 0 - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (5,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - # VLEN of bytes - dt = np.dtype("O", metadata={"vlen": bytes}) - arr = np.zeros((5,), dtype=dt) - arr[0] = b"Parting" - arr[1] = b"is such" - arr[2] = b"sweet" - arr[3] = b"sorrow" - arr[4] = 0 - - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (5,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # - # Compound str vlen - # - dt_vstr = np.dtype("O", metadata={"vlen": str}) - dt = np.dtype([("x", "i4"), ("tag", dt_vstr), ("code", "S4")]) - arr = np.zeros((4,), dtype=dt) - arr[0] = (42, "Hello", "X1") - arr[3] = (84, "Bye", "XYZ") - count = getByteArraySize(arr) - 
buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # - # Compound int vlen - # - dt_vint = np.dtype("O", metadata={"vlen": "int32"}) - dt = np.dtype([("x", "int32"), ("tag", dt_vint)]) - arr = np.zeros((4,), dtype=dt) - arr[0] = (42, np.array((), dtype="int32")) - arr[3] = (84, np.array((1, 2, 3), dtype="int32")) - count = getByteArraySize(arr) - self.assertEqual(count, 44) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - # - # VLEN utf string with array type - # - dt_arr_str = np.dtype("(2,)O", metadata={"vlen": str}) - dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) - arr = np.zeros((4,), dtype=dt) - dt_str = np.dtype("O", metadata={"vlen": str}) - arr[0] = (42, np.asarray(["hi", "bye"], dtype=dt_str)) - arr[3] = (84, np.asarray(["hi-hi", "bye-bye"], dtype=dt_str)) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - - self.assertEqual(arr.dtype, arr_copy.dtype) - self.assertEqual(arr.shape, arr_copy.shape) - for i in range(4): - e = arr[i] - e_copy = arr_copy[i] - self.assertTrue(np.array_equal(e, e_copy)) - # - # VLEN ascii with array type - # - dt_arr_str = np.dtype("(2,)O", metadata={"vlen": bytes}) - dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) - arr = np.zeros((4,), dtype=dt) - dt_str = np.dtype("O", metadata={"vlen": bytes}) - arr[0] = (42, np.asarray([b"hi", b"bye"], dtype=dt_str)) - arr[3] = (84, np.asarray([b"hi-hi", b"bye-bye"], dtype=dt_str)) - buffer = arrayToBytes(arr, encoding="base64") - - # convert back to array - arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") - self.assertTrue(ndarray_compare(arr, arr_copy)) - - def testArrayCompareInt(self): - # Simple array - dt = np.dtype("= 1) - self.assertTrue(layout[i] <= 100) - - typesize = 8 - layout = guessChunk(shape, typesize) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 100) - - shape = {"class": "H5S_SIMPLE", "dims": [5]} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, (5,)) - - shape = {"class": "H5S_SIMPLE", "dims": [100, 100, 100]} - layout = guessChunk(shape, typesize) - print("layout:", layout) - self.assertTrue(len(layout), 3) - for i in range(3): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 100) - - shape = {"class": "H5S_SIMPLE", "dims": [100, 0], "maxdims": [100, 0]} - layout = guessChunk(shape, typesize) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 1024) - - shape = {"class": "H5S_SCALAR"} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, (1,)) - - shape = {"class": "H5S_NULL"} - layout = guessChunk(shape, typesize) - self.assertEqual(layout, None) - - def testShrinkChunk(self): - CHUNK_MIN = 500 - CHUNK_MAX = 5000 - typesize = 1 - layout = (1, 2, 3) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - self.assertEqual(shrunk, layout) - - layout = (100, 200, 300) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes > CHUNK_MAX) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - rank = len(layout) - for i in range(rank): - self.assertTrue(shrunk[i] >= 1) - self.assertTrue(shrunk[i] <= 
1000 * (i + 1)) - num_bytes = getChunkSize(shrunk, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - layout = (300, 200, 100) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes > CHUNK_MAX) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - rank = len(layout) - for i in range(rank): - self.assertTrue(shrunk[i] >= 1) - self.assertTrue(shrunk[i] <= 1000 * (3 - i)) - num_bytes = getChunkSize(shrunk, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - CHUNK_MIN = 1 * 1024 * 1024 - CHUNK_MAX = 4 * 1024 * 1024 - typesize = 4 - layout = (117, 201, 189, 1) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes > CHUNK_MAX) - shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) - self.assertEqual(shrunk, (59, 101, 95, 1)) - num_bytes = getChunkSize(shrunk, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - def testExpandChunk(self): - CHUNK_MIN = 5000 - CHUNK_MAX = 50000 - - typesize = 20 - shape = {"class": "H5S_SIMPLE", "dims": [12, ], "maxdims": [20, ]} - layout = (20,) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - # chunk layout can't be larger than dataspace - self.assertTrue(num_bytes < CHUNK_MIN) - self.assertEqual(expanded, (20,)) - - typesize = 1 - shape = {"class": "H5S_SIMPLE", "dims": [10, 10, 10]} - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - # chunk layout can't be larger than dataspace - self.assertTrue(num_bytes < CHUNK_MIN) - self.assertEqual(expanded, (10, 10, 10)) - - shape = {"class": "H5S_SIMPLE", "dims": [1000, 2000, 3000]} - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = {"class": "H5S_SIMPLE", "dims": [1000,]} - layout = (10,) - num_bytes = getChunkSize(layout, "H5T_VARIABLE") - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, "H5T_VARIABLE", shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, "H5T_VARIABLE") - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = { - "class": "H5S_SIMPLE", - "dims": [1000, 10, 1000], - "maxdims": [1000, 100, 1000], - } - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = { - "class": "H5S_SIMPLE", - "dims": [1000, 0, 1000], - "maxdims": [1000, 100, 1000], - } - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - shape = { - "class": "H5S_SIMPLE", - 
"dims": [1000, 10, 1000], - "maxdims": [1000, 0, 1000], - } - layout = (10, 10, 10) - num_bytes = getChunkSize(layout, typesize) - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, typesize) - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - - def testGetContiguiousLayout(self): - - typesize = 4 - chunk_min = 400 - chunk_max = 800 - - def get_num_bytes(dims): - num_bytes = typesize - for n in dims: - num_bytes *= n - return num_bytes - - try: - shape = {"class": "H5S_SIMPLE", "dims": [100, 100]} - layout = getContiguousLayout(shape, "H5T_VARIABLE") - self.assertTrue(False) - except ValueError: - pass # expected - - shape = {"class": "H5S_NULL"} - layout = getContiguousLayout(shape, typesize) - self.assertTrue(layout is None) - - shape = {"class": "H5S_SCALAR"} - layout = getContiguousLayout(shape, typesize) - self.assertEqual(layout, (1,)) - - for extent in (1, 100, 10000): - dims = [ - extent, - ] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout( - shape, typesize, chunk_min=chunk_min, chunk_max=chunk_max - ) - self.assertTrue(len(layout), 1) - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - - self.assertTrue(chunk_bytes <= chunk_max) - - for extent in (1, 9, 90): - dims = [extent, extent] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout( - shape, typesize, chunk_min=chunk_min, chunk_max=chunk_max - ) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= extent) - self.assertEqual(layout[1], extent) - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - self.assertTrue(chunk_bytes <= chunk_max) - - for extent in (1, 10, 100): - dims = [extent, extent, 50] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout( - shape, typesize, chunk_min=chunk_min, chunk_max=chunk_max - ) - self.assertTrue(len(layout), 3) - for i in range(3): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= dims[i]) - - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - self.assertTrue(chunk_bytes <= chunk_max) - - for extent in (1, 100, 1000): - dims = [extent, 4] - shape = {"class": "H5S_SIMPLE", "dims": dims} - layout = getContiguousLayout( - shape, typesize, chunk_min=chunk_min, chunk_max=chunk_max - ) - self.assertTrue(len(layout), 2) - for i in range(2): - self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= dims[i]) - - chunk_bytes = get_num_bytes(layout) - space_bytes = get_num_bytes(dims) - - if space_bytes > chunk_min: - self.assertTrue(chunk_bytes >= chunk_min) - self.assertTrue(chunk_bytes <= chunk_max) - def testGetNumChunks(self): - datashape = [ - 100, - ] + datashape = [100,] layout = (10,) selection = getHyperslabSelection(datashape) count = getNumChunks(selection, layout) @@ -400,11 +115,44 @@ def testGetNumChunks(self): selection = getHyperslabSelection(datashape, (0, 0), (100, 100), (20, 40)) count = getNumChunks(selection, layout) self.assertEqual(count, 15) + # test with scalar + datashape = () + layout = (1, ) + selection = getHyperslabSelection(datashape, 0, 1) + print("selection:", selection) + count = 
getNumChunks(selection, layout) + self.assertEqual(count, 1) def testGetChunkIds(self): # getChunkIds(dset_id, selection, layout, dim=0, prefix=None, chunk_ids=None): dset_id = "d-12345678-1234-1234-1234-1234567890ab" + datashape = [] + layout = (1,) + + selection = getHyperslabSelection(datashape, 0, 1) + num_chunks = getNumChunks(selection, layout) + + self.assertEqual(num_chunks, 1) + chunk_ids = getChunkIds(dset_id, selection, layout) + self.assertEqual(len(chunk_ids), 1) + chunk_id = chunk_ids[0] + self.assertTrue(chunk_id.startswith("c-")) + self.assertTrue(chunk_id.endswith("_0")) + self.assertEqual(chunk_id[2:-2], dset_id[2:]) + self.assertEqual(len(chunk_id), 2 + 36 + 2) + self.assertEqual(getDatasetId(chunk_id), dset_id) + + selection = getHyperslabSelection(datashape) + chunk_ids = getChunkIds(dset_id, selection, layout) + self.assertEqual(len(chunk_ids), 1) + chunk_id = chunk_ids[0] + self.assertTrue(chunk_id.startswith("c-")) + self.assertTrue(chunk_id.endswith("_0")) + self.assertEqual(chunk_id[2:-2], dset_id[2:]) + self.assertEqual(len(chunk_id), 2 + 36 + 2) + self.assertEqual(getDatasetId(chunk_id), dset_id) + datashape = [1,] layout = (1,) selection = getHyperslabSelection(datashape) diff --git a/tests/unit/dset_util_test.py b/tests/unit/dset_util_test.py index 0e77ab1b..f89690d9 100755 --- a/tests/unit/dset_util_test.py +++ b/tests/unit/dset_util_test.py @@ -14,7 +14,7 @@ import sys sys.path.append("../..") -from hsds.util.dsetUtil import getHyperslabSelection, getSelectionShape +from hsds.util.dsetUtil import getHyperslabSelection, getSelectionShape, get_slices from hsds.util.dsetUtil import getSelectionList, ItemIterator, getSelectionPagination @@ -25,8 +25,40 @@ def __init__(self, *args, **kwargs): self.logger = logging.getLogger() self.logger.setLevel(logging.WARNING) + def testGetSlices(self): + dset_json = {"id": "d-b4b3b3d6-94343adc-1727-28bebf-12caac"} + datashape = {"class": "H5S_SCALAR"} + cprops = {"layout": {"class": "H5D_CONTIGUOUS"}} + dtype_json = {"class": "H5T_INTEGER", "base": "H5T_STD_I32LE"} + dset_json["shape"] = datashape + dset_json["creationProperties"] = cprops + dset_json["type"] = dtype_json + + slices = get_slices("", dset_json) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + + slices = get_slices(None, dset_json) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + def testGetHyperslabSelection(self): # getHyperslabSelection(dsetshape, start, stop, step) + + # Scalar case + datashape = [] + slices = getHyperslabSelection(datashape) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + + slices = getHyperslabSelection(datashape, 0) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + + slices = getHyperslabSelection(datashape, 0, 1) + self.assertEqual(len(slices), 1) + self.assertEqual(slices[0], slice(0, 1, 1)) + # 1-D case datashape = [100,] slices = getHyperslabSelection(datashape) diff --git a/tests/unit/hdf5_dtype_test.py b/tests/unit/hdf5_dtype_test.py deleted file mode 100755 index e51913a6..00000000 --- a/tests/unit/hdf5_dtype_test.py +++ /dev/null @@ -1,717 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. 
The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## -import unittest -import logging -import numpy as np -import sys - -sys.path.append("../..") -from hsds.util import hdf5dtype -from hsds.util.hdf5dtype import special_dtype -from hsds.util.hdf5dtype import check_dtype -from hsds.util.hdf5dtype import Reference -from hsds.util.hdf5dtype import RegionReference - - -class Hdf5dtypeTest(unittest.TestCase): - def __init__(self, *args, **kwargs): - super(Hdf5dtypeTest, self).__init__(*args, **kwargs) - # main - self.logger = logging.getLogger() - self.logger.setLevel(logging.INFO) - - def testGetBaseTypeJson(self): - type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F64LE") - self.assertTrue("class" in type_json) - self.assertEqual(type_json["class"], "H5T_FLOAT") - self.assertTrue("base" in type_json) - self.assertEqual(type_json["base"], "H5T_IEEE_F64LE") - - type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F16LE") - self.assertTrue("class" in type_json) - self.assertEqual(type_json["class"], "H5T_FLOAT") - self.assertTrue("base" in type_json) - self.assertEqual(type_json["base"], "H5T_IEEE_F16LE") - - type_json = hdf5dtype.getBaseTypeJson("H5T_STD_I32LE") - self.assertTrue("class" in type_json) - self.assertEqual(type_json["class"], "H5T_INTEGER") - self.assertTrue("base" in type_json) - self.assertEqual(type_json["base"], "H5T_STD_I32LE") - - try: - hdf5dtype.getBaseTypeJson("foobar") - self.assertTrue(False) - except TypeError: - pass # expected - - def testBaseIntegerTypeItem(self): - dt = np.dtype("") - self.assertEqual(dt.kind, "u") - - dt = hdf5dtype.createDataType("H5T_STD_I16LE") - self.assertEqual(dt.name, "int16") - self.assertEqual(dt.kind, "i") - - dt = hdf5dtype.createDataType("H5T_IEEE_F64LE") - self.assertEqual(dt.name, "float64") - self.assertEqual(dt.kind, "f") - - dt = hdf5dtype.createDataType("H5T_IEEE_F32LE") - self.assertEqual(dt.name, "float32") - self.assertEqual(dt.kind, "f") - - typeItem = {"class": "H5T_INTEGER", "base": "H5T_STD_I32BE"} - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "int32") - self.assertEqual(dt.kind, "i") - self.assertEqual(typeSize, 4) - - def testCreateBaseStringType(self): - typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_ASCII", "length": 6} - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "bytes48") - self.assertEqual(dt.kind, "S") - self.assertEqual(typeSize, 6) - - def testCreateBaseUnicodeType(self): - typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_UTF8", "length": 6} - - dt = hdf5dtype.createDataType(typeItem) - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertTrue(dt is not None) - self.assertEqual(dt.name, "bytes48") - self.assertEqual(dt.kind, "S") # uses byte - self.assertEqual(typeSize, 6) - - def testCreateNullTermStringType(self): - typeItem = { - "class": "H5T_STRING", - "charSet": "H5T_CSET_ASCII", - "length": 6, - "strPad": "H5T_STR_NULLTERM", - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - - self.assertEqual(dt.name, "bytes48") - self.assertEqual(dt.kind, "S") - self.assertEqual(typeSize, 6) - - def 
testCreateVLenStringType(self): - typeItem = { - "class": "H5T_STRING", - "charSet": "H5T_CSET_ASCII", - "length": "H5T_VARIABLE", - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "object") - self.assertEqual(dt.kind, "O") - self.assertEqual(check_dtype(vlen=dt), bytes) - self.assertEqual(typeSize, "H5T_VARIABLE") - - def testCreateVLenUTF8Type(self): - typeItem = { - "class": "H5T_STRING", - "charSet": "H5T_CSET_UTF8", - "length": "H5T_VARIABLE", - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "object") - self.assertEqual(dt.kind, "O") - self.assertEqual(check_dtype(vlen=dt), str) - self.assertEqual(typeSize, "H5T_VARIABLE") - - def testCreateVLenDataType(self): - typeItem = {"class": "H5T_VLEN", "base": "H5T_STD_I32BE"} - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertEqual(typeSize, "H5T_VARIABLE") - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "object") - self.assertEqual(dt.kind, "O") - - def testCreateOpaqueType(self): - typeItem = {"class": "H5T_OPAQUE", "size": 200} - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void1600") - self.assertEqual(dt.kind, "V") - self.assertEqual(typeSize, 200) - - def testCreateEnumType(self): - typeItem = { - "class": "H5T_ENUM", - "base": {"base": "H5T_STD_I16LE", "class": "H5T_INTEGER"}, - "mapping": {"GAS": 2, "LIQUID": 1, "PLASMA": 3, "SOLID": 0}, - } - - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertEqual(typeSize, 2) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "int16") - self.assertEqual(dt.kind, "i") - mapping = check_dtype(enum=dt) - self.assertTrue(isinstance(mapping, dict)) - self.assertEqual(mapping["SOLID"], 0) - self.assertEqual(mapping["LIQUID"], 1) - self.assertEqual(mapping["GAS"], 2) - self.assertEqual(mapping["PLASMA"], 3) - - def testCreateBoolType(self): - typeItem = { - "class": "H5T_ENUM", - "base": {"base": "H5T_STD_I8LE", "class": "H5T_INTEGER"}, - "mapping": {"TRUE": 1, "FALSE": 0}, - } - - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertEqual(typeSize, 1) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "bool") - self.assertEqual(dt.kind, "b") - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - def testCreateCompoundType(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - {"name": "temp", "type": "H5T_IEEE_F32LE"}, - {"name": "pressure", "type": "H5T_IEEE_F32LE"}, - { - "name": "location", - "type": { - "length": "H5T_VARIABLE", - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - }, - }, - {"name": "wind", "type": "H5T_STD_I16LE"}, - ], - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void144") - self.assertEqual(dt.kind, "V") - self.assertEqual(len(dt.fields), 4) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - dtLocation = dt[2] - self.assertEqual(dtLocation.name, "object") - self.assertEqual(dtLocation.kind, "O") - self.assertEqual(check_dtype(vlen=dtLocation), bytes) - self.assertEqual(typeSize, "H5T_VARIABLE") - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dtLocation)) - - def testCreateCompoundInvalidFieldName(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - { - "name": "\u03b1", - "type": {"base": "H5T_STD_I32LE", "class": "H5T_INTEGER"}, - }, - 
{ - "name": "\u03c9", - "type": {"base": "H5T_STD_I32LE", "class": "H5T_INTEGER"}, - }, - ], - } - try: - hdf5dtype.createDataType(typeItem) - self.assertTrue(False) - except TypeError: - pass # expected - - def testCreateCompoundOfCompoundType(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - { - "name": "field1", - "type": { - "class": "H5T_COMPOUND", - "fields": [ - { - "name": "x", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - { - "name": "y", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - ], - }, - }, - { - "name": "field2", - "type": { - "class": "H5T_COMPOUND", - "fields": [ - { - "name": "a", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - { - "name": "b", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - { - "name": "c", - "type": { - "class": "H5T_FLOAT", - "base": "H5T_IEEE_F32LE", - }, - }, - ], - }, - }, - ], - } - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void160") - self.assertEqual(dt.kind, "V") - self.assertEqual(len(dt.fields), 2) - dt_field1 = dt[0] - self.assertEqual(dt_field1.name, "void64") - self.assertEqual(dt_field1.kind, "V") - self.assertEqual(len(dt_field1.fields), 2) - dt_field2 = dt[1] - self.assertEqual(dt_field2.name, "void96") - self.assertEqual(dt_field2.kind, "V") - self.assertEqual(len(dt_field2.fields), 3) - - def testCreateCompoundTypeUnicodeFields(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - {"name": u"temp", "type": "H5T_IEEE_F32LE"}, - {"name": u"pressure", "type": "H5T_IEEE_F32LE"}, - {"name": u"wind", "type": "H5T_STD_I16LE"}, - ], - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void80") - self.assertEqual(dt.kind, "V") - self.assertEqual(len(dt.fields), 3) - self.assertEqual(typeSize, 10) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - def testCreateArrayType(self): - typeItem = {"class": "H5T_ARRAY", "base": "H5T_STD_I64LE", "dims": (3, 5)} - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(dt.name, "void960") - self.assertEqual(dt.kind, "V") - self.assertEqual(typeSize, 120) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - def testCreateArrayIntegerType(self): - typeItem = {"class": "H5T_INTEGER", "base": "H5T_STD_I64LE", "dims": (3, 5)} - - try: - hdf5dtype.createDataType(typeItem) - self.assertTrue(False) # expected exception - dims used with non-array type - except TypeError: - pass # should get exception - - def testCreateCompoundArrayType(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - {"type": {"base": "H5T_STD_I8LE", "class": "H5T_INTEGER"}, "name": "a"}, - { - "type": { - "dims": [10], - "base": { - "length": 1, - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLPAD", - }, - "class": "H5T_ARRAY", - }, - "name": "b", - }, - ], - } - typeSize = hdf5dtype.getItemSize(typeItem) - dt = hdf5dtype.createDataType(typeItem) - self.assertEqual(len(dt.fields), 2) - self.assertTrue("a" in dt.fields.keys()) - self.assertTrue("b" in dt.fields.keys()) - self.assertEqual(typeSize, 11) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - def testCompoundArrayType(self): - typeItem = { - "class": "H5T_COMPOUND", - "fields": [ - { - "type": {"class": "H5T_INTEGER", "base": "H5T_STD_U64BE"}, - "name": "VALUE1", - }, - { - "type": {"class": "H5T_FLOAT", "base": 
"H5T_IEEE_F64BE"}, - "name": "VALUE2", - }, - { - "type": { - "class": "H5T_ARRAY", - "dims": [2], - "base": { - "class": "H5T_STRING", - "charSet": "H5T_CSET_ASCII", - "strPad": "H5T_STR_NULLTERM", - "length": "H5T_VARIABLE", - }, - }, - "name": "VALUE3", - }, - ], - } - dt = hdf5dtype.createDataType(typeItem) - typeSize = hdf5dtype.getItemSize(typeItem) - self.assertEqual(typeSize, "H5T_VARIABLE") - self.assertEqual(len(dt), 3) - self.assertTrue("VALUE1" in dt.fields.keys()) - self.assertTrue("VALUE2" in dt.fields.keys()) - self.assertTrue("VALUE3" in dt.fields.keys()) - self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) - - dt3 = dt["VALUE3"] - self.assertEqual(check_dtype(vlen=dt3), bytes) - - -if __name__ == "__main__": - # setup test files - - unittest.main() diff --git a/tests/unit/id_util_test.py b/tests/unit/id_util_test.py deleted file mode 100755 index 06f974c4..00000000 --- a/tests/unit/id_util_test.py +++ /dev/null @@ -1,212 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## -import unittest -import sys - -sys.path.append("../..") -from hsds.util.idUtil import getObjPartition, isValidUuid, validateUuid -from hsds.util.idUtil import createObjId, getCollectionForId -from hsds.util.idUtil import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id -from hsds.util.idUtil import isRootObjId, getRootObjId - - -class IdUtilTest(unittest.TestCase): - def __init__(self, *args, **kwargs): - super(IdUtilTest, self).__init__(*args, **kwargs) - # main - - def testCreateObjId(self): - id_len = 38 # 36 for uuid plus two for prefix ("g-", "d-") - ids = set() - for obj_class in ("groups", "datasets", "datatypes", "chunks"): - for i in range(100): - id = createObjId(obj_class) - self.assertEqual(len(id), id_len) - self.assertTrue(id[0] in ("g", "d", "t", "c")) - self.assertEqual(id[1], "-") - ids.add(id) - - self.assertEqual(len(ids), 400) - try: - createObjId("bad_class") - self.assertTrue(False) # should throw exception - except ValueError: - pass # expected - - def testIsValidUuid(self): - group1_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" # orig schema - group2_id = "g-314d61b8-995411e6-a733-3c15c2-da029e" - root_id = "g-f9aaa28e-d42e10e5-7122-2a065c-a6986d" - dataset1_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" # orig schema - dataset2_id = "d-4c48f3ae-995411e6-a3cd-3c15c2-da029e" - ctype1_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" # orig schema - ctype2_id = "t-8c785f1c-995311e6-9bc2-0242ac-110005" - chunk1_id = "c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2" # orig schema - chunk2_id = "c-8c785f1c-995311e6-9bc2-0242ac-110005_7_2" - domain_id = "mybucket/bob/mydata.h5" - s3_domain_id = "s3://mybucket/bob/mydata.h5" - file_domain_id = "file://mybucket/bob/mydata.h5" - azure_domain_id = "https://myaccount.blob.core.windows.net/mybucket/bob/mydata.h5" - valid_id_map = { - group1_id: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", - group2_id: "db/314d61b8-995411e6/g/a733-3c15c2-da029e/.group.json", - 
dataset1_id: "26928-d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e", - dataset2_id: "db/4c48f3ae-995411e6/d/a3cd-3c15c2-da029e/.dataset.json", - ctype1_id: "5a9cf-t-8c785f1c-9953-11e6-9bc2-0242ac110005", - ctype2_id: "db/8c785f1c-995311e6/t/9bc2-0242ac-110005/.datatype.json", - chunk1_id: "dc4ce-c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2", - chunk2_id: "db/8c785f1c-995311e6/d/9bc2-0242ac-110005/7_2", - domain_id: "bob/mydata.h5/.domain.json", - s3_domain_id: "bob/mydata.h5/.domain.json", - file_domain_id: "bob/mydata.h5/.domain.json", - azure_domain_id: "bob/mydata.h5/.domain.json", } - - bad_ids = ("g-1e76d862", "/bob/mydata.h5") - - self.assertTrue(isValidUuid(group1_id)) - self.assertFalse(isSchema2Id(group1_id)) - self.assertTrue(isValidUuid(group1_id, obj_class="Group")) - self.assertTrue(isValidUuid(group1_id, obj_class="group")) - self.assertTrue(isValidUuid(group1_id, obj_class="groups")) - self.assertTrue(isSchema2Id(root_id)) - self.assertTrue(isValidUuid(root_id, obj_class="Group")) - self.assertTrue(isValidUuid(root_id, obj_class="group")) - self.assertTrue(isValidUuid(root_id, obj_class="groups")) - self.assertTrue(isRootObjId(root_id)) - self.assertTrue(isValidUuid(dataset1_id, obj_class="datasets")) - self.assertFalse(isSchema2Id(dataset1_id)) - self.assertTrue(isValidUuid(ctype1_id, obj_class="datatypes")) - self.assertFalse(isSchema2Id(ctype1_id)) - self.assertTrue(isValidUuid(chunk1_id, obj_class="chunks")) - self.assertFalse(isSchema2Id(chunk1_id)) - self.assertTrue(isValidUuid(group2_id)) - self.assertTrue(isSchema2Id(group2_id)) - self.assertTrue(isValidUuid(group2_id, obj_class="Group")) - self.assertTrue(isValidUuid(group2_id, obj_class="group")) - self.assertTrue(isValidUuid(group2_id, obj_class="groups")) - self.assertFalse(isRootObjId(group2_id)) - self.assertTrue(isValidUuid(dataset2_id, obj_class="datasets")) - self.assertTrue(isSchema2Id(dataset2_id)) - self.assertTrue(isValidUuid(ctype2_id, obj_class="datatypes")) - self.assertTrue(isSchema2Id(ctype2_id)) - self.assertTrue(isValidUuid(chunk2_id, obj_class="chunks")) - self.assertTrue(isSchema2Id(chunk2_id)) - validateUuid(group1_id) - try: - isRootObjId(group1_id) - self.assertTrue(False) - except ValueError: - # only works for v2 schema - pass # expected - - for item in valid_id_map: - self.assertTrue(isObjId(item)) - s3key = getS3Key(item) - self.assertTrue(s3key[0] != "/") - self.assertTrue(isS3ObjKey(s3key)) - expected = valid_id_map[item] - self.assertEqual(s3key, expected) - if item.find("/") > 0: - continue # bucket name gets lost when domain ids get converted to s3keys - objid = getObjId(s3key) - self.assertEqual(objid, item) - for item in bad_ids: - self.assertFalse(isValidUuid(item)) - self.assertFalse(isObjId(item)) - - def testGetObjPartition(self): - node_count = 12 - for obj_class in ("groups", "datasets", "datatypes", "chunks"): - for i in range(100): - id = createObjId(obj_class) - node_number = getObjPartition(id, node_count) - self.assertTrue(node_number >= 0) - self.assertTrue(node_number < node_count) - # try a domain partition - node_number = getObjPartition("/home/test_user1", node_count) - self.assertTrue(node_number >= 0) - self.assertTrue(node_number < node_count) - - def testGetCollection(self): - group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" - dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" - ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" - bad_id = "x-59647858-9954-11e6-95d2-3c15c2da029e" - self.assertEqual(getCollectionForId(group_id), "groups") - 
self.assertEqual(getCollectionForId(dataset_id), "datasets") - self.assertEqual(getCollectionForId(ctype_id), "datatypes") - try: - getCollectionForId(bad_id) - self.assertTrue(False) - except ValueError: - pass # expected - try: - getCollectionForId(None) - self.assertTrue(False) - except ValueError: - pass # expected - - def testSchema2Id(self): - root_id = createObjId("roots") - group_id = createObjId("groups", rootid=root_id) - dataset_id = createObjId("datasets", rootid=root_id) - ctype_id = createObjId("datatypes", rootid=root_id) - - self.assertEqual(getCollectionForId(root_id), "groups") - self.assertEqual(getCollectionForId(group_id), "groups") - self.assertEqual(getCollectionForId(dataset_id), "datasets") - self.assertEqual(getCollectionForId(ctype_id), "datatypes") - chunk_id = "c" + dataset_id[1:] + "_1_2" - print(chunk_id) - chunk_partition_id = "c42-" + dataset_id[2:] + "_1_2" - - for id in (chunk_id, chunk_partition_id): - try: - getCollectionForId(id) - self.assertTrue(False) - except ValueError: - pass # expected - valid_ids = ( - group_id, - dataset_id, - ctype_id, - chunk_id, - chunk_partition_id, - root_id, - ) - s3prefix = getS3Key(root_id) - self.assertTrue(s3prefix.endswith("/.group.json")) - s3prefix = s3prefix[: -(len(".group.json"))] - for oid in valid_ids: - print("oid:", oid) - self.assertTrue(len(oid) >= 38) - parts = oid.split("-") - self.assertEqual(len(parts), 6) - self.assertTrue(oid[0] in ("g", "d", "t", "c")) - self.assertTrue(isSchema2Id(oid)) - if oid == root_id: - self.assertTrue(isRootObjId(oid)) - else: - self.assertFalse(isRootObjId(oid)) - self.assertEqual(getRootObjId(oid), root_id) - - s3key = getS3Key(oid) - print(s3key) - self.assertTrue(s3key.startswith(s3prefix)) - self.assertEqual(getObjId(s3key), oid) - self.assertTrue(isS3ObjKey(s3key)) - - -if __name__ == "__main__": - # setup test files - - unittest.main() diff --git a/tests/unit/lru_cache_test.py b/tests/unit/lru_cache_test.py index 5e747c92..002ca822 100755 --- a/tests/unit/lru_cache_test.py +++ b/tests/unit/lru_cache_test.py @@ -14,9 +14,16 @@ import sys import numpy as np +from h5json.objid import createObjId + sys.path.append("../..") from hsds.util.lruCache import LruCache -from hsds.util.idUtil import createObjId + + +def _createId(): + objid = createObjId("groups") + objid = 'c' + objid[1:] # fake a chunk id + return objid class LruCacheTest(unittest.TestCase): @@ -34,7 +41,7 @@ def testSimple(self): self.assertFalse("xyz" in cc) - id = createObjId("chunks") + id = _createId() try: # only dict objects can be added cc[id] = list(range(20)) @@ -42,7 +49,7 @@ def testSimple(self): except TypeError: pass # expected - rand_id = createObjId("chunks") + rand_id = _createId() np_arr = np.random.random((500, 500)) # smaller than our chunk cache size cc[rand_id] = np_arr # add to cache cc.consistencyCheck() @@ -104,7 +111,7 @@ def testLRU(self): ids = [] # add chunks to the cache for i in range(10): - id = createObjId("chunks") + id = _createId() ids.append(id) arr = np.empty((16, 16), dtype="i4") # 1024 bytes arr[...] = i @@ -165,7 +172,7 @@ def testClearCache(self): ids = [] # add chunks to the cache for i in range(10): - id = createObjId("chunks") + id = _createId() ids.append(id) arr = np.empty((16, 16), dtype="i4") # 1024 bytes arr[...] = i @@ -190,7 +197,7 @@ def testMemUtil(self): self.assertEqual(len(cc), 0) ids = set() for i in range(10): - id = createObjId("chunks") + id = _createId() ids.add(id) arr = np.empty((16, 16), dtype="i4") # 1024 bytes arr[...] 
= i @@ -208,7 +215,7 @@ def testMemUtil(self): # add 10 more chunks, but set dirty to true each time for i in range(10): - id = createObjId("chunks") + id = _createId() ids.add(id) arr = np.empty((16, 16), dtype="i4") # 1024 bytes arr[...] = i @@ -255,7 +262,7 @@ def testMetaDataCache(self): data = {"x": 123, "y": 456} - rand_id = createObjId("groups") + rand_id = _createId() data = {"foo": "bar"} cc[rand_id] = data # add to cache cc.consistencyCheck()
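Note on the lru_cache_test change above: since the tests now import createObjId from h5json.objid, chunk ids are faked by rewriting the prefix of an ordinary group id. A minimal sketch of that pattern, assuming only the createObjId("groups") call that appears in the diff; the helper name fake_chunk_id and the "_0" index suffix are illustrative additions, not part of the change:

    from h5json.objid import createObjId

    def fake_chunk_id(index_suffix="_0"):
        # createObjId("groups") returns a group id of the form "g-<36-char uuid>"
        objid = createObjId("groups")
        # rewrite the prefix so the id looks like a chunk id, then append a
        # per-dimension index, matching the "c-<uuid>_0" form checked in the
        # chunk_util_test additions earlier in this patch
        return "c" + objid[1:] + index_suffix

    # e.g. fake_chunk_id() -> "c-<36-char uuid>_0" (2 + 36 + 2 = 40 characters)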