asfadmin · mattp0 · Sep 8, 2025 · Sep 2, 2025 · Sep 2, 2025 · Sep 5, 2025
diff --git a/mandible/metadata_mapper/format/__init__.py b/mandible/metadata_mapper/format/__init__.py
@@ -1,5 +1,6 @@
 from .format import (
     FORMAT_REGISTRY,
+    Bzip2File,
     FileFormat,
     Format,
     FormatError,
@@ -21,6 +22,7 @@
 
 __all__ = (
     "FORMAT_REGISTRY",
+    "Bzip2File",
     "FileFormat",
     "Format",
     "FormatError",

diff --git a/mandible/metadata_mapper/format/format.py b/mandible/metadata_mapper/format/format.py
@@ -1,3 +1,4 @@
+import bz2
 import contextlib
 import inspect
 import json
@@ -67,10 +68,7 @@ def get_values(
         """Get a list of values from a file"""
 
         with self.parse_data(file) as data:
-            return {
-                key: self._eval_key_wrapper(data, key)
-                for key in keys
-            }
+            return {key: self._eval_key_wrapper(data, key) for key in keys}
 
     def get_value(self, file: IO[bytes], key: Key) -> Any:
         """Convenience function for getting a single value"""
@@ -150,7 +148,7 @@ class ZipMember(Format):
     """A member from a zip archive.
 
     :param filters: A set of filters used to select the desired archive member
-    :param format: The Format of the archive member
+    :param format: The `Format` of the archive member
     """
 
     filters: dict[str, Any]
@@ -235,10 +233,7 @@ def parse_data(file: IO[bytes]) -> Generator[dict]:
         with zipfile.ZipFile(file, "r") as zf:
             yield {
                 "infolist": [
-                    {
-                        k: getattr(info, k)
-                        for k in ZIP_INFO_ATTRS
-                    }
+                    {k: getattr(info, k) for k in ZIP_INFO_ATTRS}
                     for info in zf.infolist()
                 ],
                 "filename": zf.filename,
@@ -250,3 +245,29 @@ def eval_key(data: dict, key: Key) -> Any:
         values = jsonpath.get(data, key.key)
 
         return key.resolve_list_match(values)
+
+
+@dataclass
+class Bzip2File(Format):
+    """A bzip2 compressed file whose contents are read by another format
+
+    :param format: The `Format` of the decompressed contents
+    """
+
+    format: Format
+
+    def get_values(
+        self,
+        file: IO[bytes],
+        keys: Iterable[Key],
+    ) -> dict[Key, Any]:
+        """Decompress `file` and get the value of each key from it"""
+
+        with bz2.open(file, "rb") as decompressed:
+            return self.format.get_values(decompressed, keys)
+
+    def get_value(self, file: IO[bytes], key: Key) -> Any:
+        """Decompress `file` and get the value of a single key from it"""
+
+        with bz2.open(file, "rb") as decompressed:
+            return self.format.get_value(decompressed, key)
diff --git a/mandible/metadata_mapper/format/h5.py b/mandible/metadata_mapper/format/h5.py
@@ -1,6 +1,6 @@
 import contextlib
 from dataclasses import dataclass
-from typing import IO, Any
+from typing import IO, Any, Optional
 
 import h5py
 import numpy as np
@@ -18,7 +18,33 @@ def parse_data(file: IO[bytes]) -> contextlib.AbstractContextManager[Any]:
 
     @staticmethod
     def eval_key(data: Any, key: Key) -> Any:
-        return normalize(data[key.key][()])
+        group_key, attribute_key = parse_key(key.key)  # attribute_key is None without '@'
+        if attribute_key is not None:
+            return normalize(data[group_key].attrs[attribute_key])  # KeyError if missing, like a dataset
+        return normalize(data[group_key][()])
+
+
+def parse_key(key: str) -> tuple[str, Optional[str]]:
+    """Split a HDF5 key into a group name and an optional attribute name.
+
+    A single '@' separates the group name from the attribute name. A literal
+    '@' in either name is written as '@@'.
+    :returns: (str, str | None) -- the group name and the attribute name (if any)
+    """
+
+    # Hide escaped '@@' pairs behind a null character, which HDF5 does not allow in names:
+    # https://docs.hdfgroup.org/documentation/hdf5/latest/_l_b_grp_create_names.html
+    escaped_at = "\0"
+    group, separator, attribute = key.replace("@@", escaped_at).partition("@")
+
+    if "@" in attribute:
+        raise ValueError(f"Invalid key: multiple '@' in '{key}'")
+
+    group = group.replace(escaped_at, "@")
+    if not separator:
+        return group, None
+
+    return group, attribute.replace(escaped_at, "@")
 
 
 def normalize(node_val: Any) -> Any:
@@ -30,8 +56,7 @@ def normalize(node_val: Any) -> Any:
         return float(node_val)
     if isinstance(node_val, np.ndarray):
         value = [
-            x.decode("utf-8") if isinstance(x, bytes) else x
-            for x in node_val.tolist()
+            x.decode("utf-8") if isinstance(x, bytes) else x for x in node_val.tolist()
         ]
         return value
     if isinstance(node_val, bytes):

diff --git a/tests/data/example.json.bz2 b/tests/data/example.json.bz2
diff --git a/tests/integration_tests/test_full_example.py b/tests/integration_tests/test_full_example.py
@@ -31,6 +31,20 @@ def sources():
                 "class": "Xml",
             },
         },
+        "bzip2json": {
+            "storage": {
+                "class": "LocalFile",
+                "filters": {
+                    "name": r"example\.json\.bz2",
+                },
+            },
+            "format": {
+                "class": "Bzip2File",
+                "format": {
+                    "class": "Json",
+                },
+            },
+        },
     }
 
 
@@ -83,6 +97,29 @@ def template():
                 return_list=True,
             ),
         },
+        "Bzip2JsonMd": {
+            "description": mapped("bzip2json", "description"),
+            "total": mapped("bzip2json", "meta.summary.total"),
+            "complete": mapped("bzip2json", "meta.summary.complete"),
+            "null": mapped("bzip2json", "meta.null"),
+            # JSONPath only queries
+            "banana_price": mapped("bzip2json", "inventory[?name = 'Banana'].price"),
+            "oreo_price": mapped(
+                "bzip2json",
+                "inventory[?name = 'Oreo'].price",
+                default=4.49,
+            ),
+            "first_red_item": mapped(
+                "bzip2json",
+                "inventory[?attributes.color = 'red'].name",
+                return_first=True,
+            ),
+            "in_stock_items": mapped(
+                "bzip2json",
+                "inventory[?in_stock = true].name",
+                return_list=True,
+            ),
+        },
     })
 
 
@@ -94,7 +131,7 @@ def context(data_path):
                 "name": f"example.{ext}",
                 "path": str(data_path / f"example.{ext}"),
             }
-            for ext in ("json", "xml", "h5")
+            for ext in ("json", "json.bz2", "xml", "h5")
         ],
         meta={
             "json_file_name": r"example\.json",
@@ -129,4 +166,14 @@ def test_full_example(context, sources, template):
             "first_red_item": "Apple",
             "in_stock_items": ["Apple", "Banana", "Tomato", "Scotch Tape", "Oreo"],
         },
+        "Bzip2JsonMd": {
+            "description": "A store inventory",
+            "total": 5,
+            "complete": False,
+            "null": None,
+            "banana_price": 0.99,
+            "oreo_price": 4.49,
+            "first_red_item": "Apple",
+            "in_stock_items": ["Apple", "Banana", "Tomato", "Scotch Tape", "Oreo"],
+        },
     }
diff --git a/tests/test_format.py b/tests/test_format.py
@@ -1,4 +1,6 @@
+import bz2
 import io
+import json
 import zipfile
 from unittest import mock
 
@@ -7,6 +9,7 @@
 from mandible.metadata_mapper.format import (
     FORMAT_REGISTRY,
     H5,
+    Bzip2File,
     Format,
     FormatError,
     Json,
@@ -24,6 +27,7 @@
 
 def test_registry():
     assert FORMAT_REGISTRY == {
+        "Bzip2File": Bzip2File,
         "H5": H5,
         "Json": Json,
         "Xml": Xml,
@@ -94,6 +98,47 @@ def test_h5_empty_key():
         format.get_value(file, Key(""))
 
 
+@pytest.mark.h5
+def test_h5_attribute():
+    """Read datasets and `group@attribute` keys, where '@@' is a literal '@'"""
+    datasets = {
+        "foo": "foo value",
+        "bar": "bar value",
+        "list": ["list", "value"],
+        "foo@bar": "foo@bar value",
+    }
+    attributes = {
+        "foo_with_attribute": {
+            "value": "foo_with_attribute value",
+            "foo@bar": "foo_with_attribute @bar value",
+        },
+        "bar@foo": {"attr@ibute": "testing_attribute@_group@"},
+    }
+    expected = {
+        "/foo": "foo value",
+        "bar": "bar value",
+        "list": ["list", "value"],
+        "foo@@bar": "foo@bar value",
+        "foo_with_attribute@value": "foo_with_attribute value",
+        "foo_with_attribute@foo@@bar": "foo_with_attribute @bar value",
+        "bar@@foo@attr@@ibute": "testing_attribute@_group@",
+    }
+
+    file = io.BytesIO()
+    with h5py.File(file, "w") as h5_file:
+        for name, value in datasets.items():
+            h5_file[name] = value
+        for group_name, group_attrs in attributes.items():
+            group = h5_file.create_group(group_name)
+            for attr_name, attr_value in group_attrs.items():
+                group.attrs[attr_name] = attr_value
+    h5_format = H5()
+    keys = [Key(raw_key) for raw_key in expected]
+    assert h5_format.get_values(file, keys) == {Key(k): v for k, v in expected.items()}
+    with pytest.raises(FormatError, match="Invalid key: multiple '@'"):
+        h5_format.get_values(file, [Key("test@test@test")])
+
+
 @pytest.mark.h5
 def test_h5_key_error():
     file = io.BytesIO()
@@ -430,3 +475,49 @@ def test_xml_key_error():
 
     with pytest.raises(FormatError, match="key not found 'foo'"):
         format.get_values(file, [Key("foo")])
+
+
+@pytest.mark.h5
+def test_bzip2_h5py():
+    """Values can be read from a bzip2 compressed HDF5 file"""
+    datasets = {"foo": "foo value", "bar": "bar value", "list": ["list", "value"]}
+    raw_h5 = io.BytesIO()
+    with h5py.File(raw_h5, "w") as h5_file:
+        for name, value in datasets.items():
+            h5_file[name] = value
+
+    compressed = io.BytesIO(bz2.compress(raw_h5.getvalue()))
+    bzip2_h5 = Bzip2File(format=H5())
+
+    # Rewind the compressed buffer before each read
+    for name, value in datasets.items():
+        compressed.seek(0)
+        assert bzip2_h5.get_value(compressed, Key(name)) == value
+    compressed.seek(0)
+    assert bzip2_h5.get_values(compressed, [Key("foo"), Key("bar")]) == {
+        Key("foo"): "foo value",
+        Key("bar"): "bar value",
+    }
+
+
+def test_bzip2_json():
+    """Values can be read from bzip2 compressed JSON"""
+    document = {
+        "foo": "foo value",
+        "bar": "bar value",
+    }
+    compressed = io.BytesIO(bz2.compress(json.dumps(document).encode("utf-8")))
+    bzip2_json = Bzip2File(format=Json())
+
+    # Rewind the compressed buffer before each subsequent read
+    single_value = bzip2_json.get_value(compressed, Key("$.foo"))
+    compressed.seek(0)
+    value_map = bzip2_json.get_values(compressed, [Key("$.foo")])
+    compressed.seek(0)
+    whole_document = bzip2_json.get_value(compressed, Key("$"))
+
+    assert single_value == "foo value"
+    assert value_map == {
+        Key("$.foo"): "foo value",
+    }
+    assert whole_document == document
diff --git a/tests/test_h5.py b/tests/test_h5.py
@@ -23,3 +23,22 @@ def test_normalize():
     assert normalize(np.array(["A", "B"], dtype="|S1")) == ["A", "B"]
     assert normalize(np.array(["A", "B"], dtype="O")) == ["A", "B"]
     assert normalize(np.array([], dtype="|S1")) == []
+
+
+def test_parse_key():
+    from mandible.metadata_mapper.format.h5 import parse_key
+    expected = {
+        "foo": ("foo", None),
+        "foo@@oo": ("foo@oo", None),
+        "foo@bar": ("foo", "bar"),
+        "@bar": ("", "bar"),
+        "foo@": ("foo", ""),
+        "fo@@o@bar": ("fo@o", "bar"),
+        "foo@@@bar": ("foo@", "bar"),
+        "@@@@": ("@@", None),
+        "@@@foo@@": ("@", "foo@"),
+    }
+    assert {key: parse_key(key) for key in expected} == expected
+    for invalid_key in ("a@b@c", "@@a@b@c@@"):
+        with pytest.raises(ValueError):
+            parse_key(invalid_key)