diff --git a/mandible/metadata_mapper/format/__init__.py b/mandible/metadata_mapper/format/__init__.py index 3fee13d..7e9873f 100644 --- a/mandible/metadata_mapper/format/__init__.py +++ b/mandible/metadata_mapper/format/__init__.py @@ -1,5 +1,6 @@ from .format import ( FORMAT_REGISTRY, + Bzip2File, FileFormat, Format, FormatError, @@ -21,6 +22,7 @@ __all__ = ( "FORMAT_REGISTRY", + "Bzip2File", "FileFormat", "Format", "FormatError", diff --git a/mandible/metadata_mapper/format/format.py b/mandible/metadata_mapper/format/format.py index 8723c7a..bf47a47 100644 --- a/mandible/metadata_mapper/format/format.py +++ b/mandible/metadata_mapper/format/format.py @@ -1,3 +1,4 @@ +import bz2 import contextlib import inspect import json @@ -67,10 +68,7 @@ def get_values( """Get a list of values from a file""" with self.parse_data(file) as data: - return { - key: self._eval_key_wrapper(data, key) - for key in keys - } + return {key: self._eval_key_wrapper(data, key) for key in keys} def get_value(self, file: IO[bytes], key: Key) -> Any: """Convenience function for getting a single value""" @@ -150,7 +148,7 @@ class ZipMember(Format): """A member from a zip archive. 
def parse_key(key: str) -> tuple[str, Optional[str]]:
    """Parse a HDF5 key where '@' separates the group name from an attribute name.

    The special @ character can be escaped as @@ if the group or attribute
    name contains a literal '@'. Escapes are recognised left to right, so
    'foo@@@bar' is the group 'foo@' followed by the attribute 'bar'.

    :param key: the raw key string
    :returns: (str, str | None) -- the group name and the attribute name (if any)
    :raises ValueError: if the key contains more than one unescaped '@'
    """

    # Splitting on the escape sequence leaves chunks in which every remaining
    # '@' is a separator; joining chunks back together with '@' restores the
    # escaped characters. No character of the key is reserved as a sentinel,
    # so nothing already present in the input can be mistaken for an escape.
    chunks = key.split("@@")

    if sum(chunk.count("@") for chunk in chunks) > 1:
        raise ValueError(f"Invalid key: multiple '@' in '{key}'")

    for index, chunk in enumerate(chunks):
        if "@" in chunk:
            head, _, tail = chunk.partition("@")
            group = "@".join([*chunks[:index], head])
            attribute = "@".join([tail, *chunks[index + 1:]])
            return group, attribute

    # No separator anywhere: the whole key names a group/dataset.
    return "@".join(chunks), None
"complete": mapped("bzip2json", "meta.summary.complete"), + "null": mapped("bzip2json", "meta.null"), + # JSONPath only queries + "banana_price": mapped("bzip2json", "inventory[?name = 'Banana'].price"), + "oreo_price": mapped( + "bzip2json", + "inventory[?name = 'Oreo'].price", + default=4.49, + ), + "first_red_item": mapped( + "bzip2json", + "inventory[?attributes.color = 'red'].name", + return_first=True, + ), + "in_stock_items": mapped( + "bzip2json", + "inventory[?in_stock = true].name", + return_list=True, + ), + }, }) @@ -94,7 +131,7 @@ def context(data_path): "name": f"example.{ext}", "path": str(data_path / f"example.{ext}"), } - for ext in ("json", "xml", "h5") + for ext in ("json", "json.bz2", "xml", "h5") ], meta={ "json_file_name": r"example\.json", @@ -129,4 +166,14 @@ def test_full_example(context, sources, template): "first_red_item": "Apple", "in_stock_items": ["Apple", "Banana", "Tomato", "Scotch Tape", "Oreo"], }, + "Bzip2JsonMd": { + "description": "A store inventory", + "total": 5, + "complete": False, + "null": None, + "banana_price": 0.99, + "oreo_price": 4.49, + "first_red_item": "Apple", + "in_stock_items": ["Apple", "Banana", "Tomato", "Scotch Tape", "Oreo"], + }, } diff --git a/tests/test_format.py b/tests/test_format.py index 75fa3ce..100c41f 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -1,4 +1,6 @@ +import bz2 import io +import json import zipfile from unittest import mock @@ -7,6 +9,7 @@ from mandible.metadata_mapper.format import ( FORMAT_REGISTRY, H5, + Bzip2File, Format, FormatError, Json, @@ -24,6 +27,7 @@ def test_registry(): assert FORMAT_REGISTRY == { + "Bzip2File": Bzip2File, "H5": H5, "Json": Json, "Xml": Xml, @@ -94,6 +98,47 @@ def test_h5_empty_key(): format.get_value(file, Key("")) +@pytest.mark.h5 +def test_h5_attribute(): + file = io.BytesIO() + with h5py.File(file, "w") as f: + f["foo"] = "foo value" + f["bar"] = "bar value" + f["list"] = ["list", "value"] + f["foo@bar"] = "foo@bar value" + new_group = 
@pytest.mark.h5
def test_bzip2_h5py():
    """Read values from an HDF5 file that has been bzip2-compressed in memory."""
    contents = {
        "foo": "foo value",
        "bar": "bar value",
        "list": ["list", "value"],
    }

    raw_h5 = io.BytesIO()
    with h5py.File(raw_h5, "w") as h5_file:
        for name, value in contents.items():
            h5_file[name] = value

    compressed = io.BytesIO(bz2.compress(raw_h5.getvalue()))
    wrapped = Bzip2File(format=H5())

    for name, value in contents.items():
        assert wrapped.get_value(compressed, Key(name)) == value
        # Rewind the compressed buffer so the next lookup starts from the
        # beginning again.
        compressed.seek(0)

    assert wrapped.get_values(compressed, [Key("foo"), Key("bar")]) == {
        Key("foo"): "foo value",
        Key("bar"): "bar value",
    }
def test_parse_key():
    """Check parse_key on plain, escaped, and attribute-bearing keys."""
    from mandible.metadata_mapper.format.h5 import parse_key

    # Raw key -> expected (group name, attribute name or None).
    expected_by_key = {
        "foo": ("foo", None),
        "foo@@oo": ("foo@oo", None),
        "foo@bar": ("foo", "bar"),
        "@bar": ("", "bar"),
        "foo@": ("foo", ""),
        "fo@@o@bar": ("fo@o", "bar"),
        "foo@@@bar": ("foo@", "bar"),
        "@@@@": ("@@", None),
        "@@@foo@@": ("@", "foo@"),
    }
    for raw_key, expected in expected_by_key.items():
        assert parse_key(raw_key) == expected

    # More than one unescaped '@' is rejected.
    for bad_key in ("a@b@c", "@@a@b@c@@"):
        with pytest.raises(ValueError):
            parse_key(bad_key)