From 2cb71133f426c6255bb3323e094ee0bc22de0258 Mon Sep 17 00:00:00 2001 From: Matt Perry Date: Tue, 2 Sep 2025 12:17:38 -0600 Subject: [PATCH 1/3] feat: add h5 attribute syntax --- mandible/metadata_mapper/format/h5.py | 36 ++++++++++++++++++++--- tests/test_format.py | 41 +++++++++++++++++++++++++++ tests/test_h5.py | 19 +++++++++++++ 3 files changed, 92 insertions(+), 4 deletions(-) diff --git a/mandible/metadata_mapper/format/h5.py b/mandible/metadata_mapper/format/h5.py index 8538703..b2aa201 100644 --- a/mandible/metadata_mapper/format/h5.py +++ b/mandible/metadata_mapper/format/h5.py @@ -1,6 +1,6 @@ import contextlib from dataclasses import dataclass -from typing import IO, Any +from typing import IO, Any, Optional import h5py import numpy as np @@ -18,7 +18,36 @@ def parse_data(file: IO[bytes]) -> contextlib.AbstractContextManager[Any]: @staticmethod def eval_key(data: Any, key: Key) -> Any: - return normalize(data[key.key][()]) + group_key, attribute_key = parse_key(key.key) + if attribute_key is not None: + return normalize(data[group_key].attrs.get(attribute_key)) + return normalize(data[group_key][()]) + + +def parse_key(key: str) -> tuple[str, Optional[str]]: + """ + Parse a key where '@' is special. + '@@' means a literal '@'. + '@' can appear at most once as a separator. + + Returns: + (left, right) where right is None if no unescaped '@' + """ + + # HDF5 states null character is not a valid group name + # https://docs.hdfgroup.org/documentation/hdf5/latest/_l_b_grp_create_names.html + placeholder = "\0" + temp = key.replace("@@", placeholder) + + if temp.count("@") > 1: + raise ValueError(f"Invalid key: multiple '@' in '{key}'") + + if "@" not in temp: + return temp.replace(placeholder, "@"), None + + left, right = temp.split("@", 1) + + return left.replace(placeholder, "@"), right.replace(placeholder, "@") def normalize(node_val: Any) -> Any: @@ -30,8 +59,7 @@ def normalize(node_val: Any) -> Any: return float(node_val) if isinstance(node_val, np.ndarray): value = [ - x.decode("utf-8") if isinstance(x, bytes) else x - for x in node_val.tolist() + x.decode("utf-8") if isinstance(x, bytes) else x for x in node_val.tolist() ] return value if isinstance(node_val, bytes): diff --git a/tests/test_format.py b/tests/test_format.py index 75fa3ce..1ee0e63 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -94,6 +94,47 @@ def test_h5_empty_key(): format.get_value(file, Key("")) +@pytest.mark.h5 +def test_h5_attribute(): + file = io.BytesIO() + with h5py.File(file, "w") as f: + f["foo"] = "foo value" + f["bar"] = "bar value" + f["list"] = ["list", "value"] + f["foo@bar"] = "foo@bar value" + new_group = f.create_group("foo_with_attribute") + new_group.attrs["value"] = "foo_with_attribute value" + new_group.attrs["foo@bar"] = "foo_with_attribute @bar value" + new_group_with_at = f.create_group("bar@foo") + new_group_with_at.attrs["attr@ibute"] = "testing_attribute@_group@" + + format = H5() + + assert format.get_values( + file, + [ + Key("/foo"), + Key("bar"), + Key("list"), + Key("foo@@bar"), + Key("foo_with_attribute@value"), + Key("foo_with_attribute@foo@@bar"), + Key("bar@@foo@attr@@ibute"), + ], + ) == { + Key("/foo"): "foo value", + Key("bar"): "bar value", + Key("list"): ["list", "value"], + Key("foo@@bar"): "foo@bar value", + Key("foo_with_attribute@value"): "foo_with_attribute value", + Key("foo_with_attribute@foo@@bar"): "foo_with_attribute @bar value", + Key("bar@@foo@attr@@ibute"): "testing_attribute@_group@", + } + + with pytest.raises(FormatError, match="Invalid key: multiple '@'"): + format.get_values(file, [Key("test@test@test")]) + + @pytest.mark.h5 def test_h5_key_error(): file = io.BytesIO() diff --git a/tests/test_h5.py b/tests/test_h5.py index 6ab7b25..5e14a42 100644 --- a/tests/test_h5.py +++ b/tests/test_h5.py @@ -23,3 +23,22 @@ def test_normalize(): assert normalize(np.array(["A", "B"], dtype="|S1")) == ["A", "B"] assert normalize(np.array(["A", "B"], dtype="O")) == ["A", "B"] assert normalize(np.array([], dtype="|S1")) == [] + + +def test_parse_key(): + from mandible.metadata_mapper.format.h5 import parse_key + + assert parse_key("foo") == ("foo", None) + assert parse_key("foo@@oo") == ("foo@oo", None) + assert parse_key("foo@bar") == ("foo", "bar") + assert parse_key("@bar") == ("", "bar") + assert parse_key("foo@") == ("foo", "") + assert parse_key("fo@@o@bar") == ("fo@o", "bar") + assert parse_key("foo@@@bar") == ("foo@", "bar") + assert parse_key("@@@@") == ("@@", None) + assert parse_key("@@@foo@@") == ("@", "foo@") + + with pytest.raises(ValueError): + parse_key("a@b@c") + with pytest.raises(ValueError): + parse_key("@@a@b@c@@") From e0aeb9ae45f9b24f36f9c437d426e47be9c28448 Mon Sep 17 00:00:00 2001 From: Matt Perry Date: Tue, 2 Sep 2025 17:45:12 -0600 Subject: [PATCH 2/3] feat: add bzip2 support to mandible --- mandible/metadata_mapper/format/__init__.py | 2 + mandible/metadata_mapper/format/format.py | 37 +++++++++++--- mandible/metadata_mapper/format/h5.py | 9 ++-- tests/data/example.json.bz2 | Bin 0 -> 413 bytes tests/integration_tests/test_full_example.py | 49 +++++++++++++++++- tests/test_format.py | 50 +++++++++++++++++++ 6 files changed, 132 insertions(+), 15 deletions(-) create mode 100644 tests/data/example.json.bz2 diff --git a/mandible/metadata_mapper/format/__init__.py b/mandible/metadata_mapper/format/__init__.py index 3fee13d..7e9873f 100644 --- a/mandible/metadata_mapper/format/__init__.py +++ b/mandible/metadata_mapper/format/__init__.py @@ -1,5 +1,6 @@ from .format import ( FORMAT_REGISTRY, + Bzip2File, FileFormat, Format, FormatError, @@ -21,6 +22,7 @@ __all__ = ( "FORMAT_REGISTRY", + "Bzip2File", "FileFormat", "Format", "FormatError", diff --git a/mandible/metadata_mapper/format/format.py b/mandible/metadata_mapper/format/format.py index 8723c7a..25f86d1 100644 --- a/mandible/metadata_mapper/format/format.py +++ b/mandible/metadata_mapper/format/format.py @@ -1,3 +1,4 @@ +import bz2 import contextlib import inspect import json @@ -67,10 +68,7 @@ def get_values( """Get a list of values from a file""" with self.parse_data(file) as data: - return { - key: self._eval_key_wrapper(data, key) - for key in keys - } + return {key: self._eval_key_wrapper(data, key) for key in keys} def get_value(self, file: IO[bytes], key: Key) -> Any: """Convenience function for getting a single value""" @@ -235,10 +233,7 @@ def parse_data(file: IO[bytes]) -> Generator[dict]: with zipfile.ZipFile(file, "r") as zf: yield { "infolist": [ - { - k: getattr(info, k) - for k in ZIP_INFO_ATTRS - } + {k: getattr(info, k) for k in ZIP_INFO_ATTRS} for info in zf.infolist() ], "filename": zf.filename, @@ -250,3 +245,29 @@ def eval_key(data: dict, key: Key) -> Any: values = jsonpath.get(data, key.key) return key.resolve_list_match(values) + + +@dataclass +class Bzip2File(Format): + """A Bzip2 compressed file + + :param format: The `Format` of the compressed file + """ + + format: Format + + def get_values( + self, + file: IO[bytes], + keys: Iterable[Key], + ) -> dict[Key, Any]: + """Get a list of values from a file""" + + with bz2.BZ2File(file, mode="rb") as bz2f: + return self.format.get_values(bz2f, keys) + + def get_value(self, file: IO[bytes], key: Key) -> Any: + """Convenience function for getting a single value""" + + with bz2.BZ2File(file, mode="rb") as bz2f: + return self.format.get_value(bz2f, key) diff --git a/mandible/metadata_mapper/format/h5.py b/mandible/metadata_mapper/format/h5.py index b2aa201..482216d 100644 --- a/mandible/metadata_mapper/format/h5.py +++ b/mandible/metadata_mapper/format/h5.py @@ -25,13 +25,10 @@ def eval_key(data: Any, key: Key) -> Any: def parse_key(key: str) -> tuple[str, Optional[str]]: - """ - Parse a key where '@' is special. - '@@' means a literal '@'. - '@' can appear at most once as a separator. + """Parse a HDF5 key where '@' separates the group name from an attribute name. - Returns: - (left, right) where right is None if no unescaped '@' + The special @ character can be escaped as @@ if the group or attribute name contains a literal '@'. + :returns: (str, str | None) -- the group name and the attribute name (if any) """ # HDF5 states null character is not a valid group name diff --git a/tests/data/example.json.bz2 b/tests/data/example.json.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..dbc77862a54bd249c3dadd01e022544c2cd15df9 GIT binary patch literal 413 zcmV;O0b>3_T4*^jL0KkKStr9qg#ZFv-+)vQPz8UmFaV4Szwh6wKmoTz+H54EWj!IX zo})pa00Te-h(%9Ek)uFqkQxD|n1(VL4F-cofHVL!FhwF0OqnAS2xL7&CQUGukd@9I z(2hLq5DJ)>n6uW1prCBG!_|=iOMnAP%A!&rkpTTd&I}`DBR)RyO4-5waiE4R!Mq`D zdL8$Rbkrj?tS(fpV5d+mVC|`>6BxD2gg{9%1q1fdEh5S(@nF#mDWlBrWAgQrAiWmI z*iDHhBAnN^CS*!jpw$or518E=1DPe?$EWbcW9V)f4`7hlA0t9@-FY&A!wfrK7mOxK zOM=*nD|?j-pmURl!eW4#(5vwq3^rCox(pL$QW&I>2*9zxFcvP>&}9~EI^H}+T*l#R zndSwJ=I=P3@iD|`aYRyxI@tw`ghCiZ#Og28g99y_kY1V^U;9w0L&Z} Date: Fri, 5 Sep 2025 17:44:41 -0600 Subject: [PATCH 3/3] chore: improve docstring for ZipMember --- mandible/metadata_mapper/format/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mandible/metadata_mapper/format/format.py b/mandible/metadata_mapper/format/format.py index 25f86d1..bf47a47 100644 --- a/mandible/metadata_mapper/format/format.py +++ b/mandible/metadata_mapper/format/format.py @@ -148,7 +148,7 @@ class ZipMember(Format): """A member from a zip archive. :param filters: A set of filters used to select the desired archive member - :param format: The Format of the archive member + :param format: The `Format` of the archive member """ filters: dict[str, Any]