Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mandible/metadata_mapper/format/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .format import (
FORMAT_REGISTRY,
Bzip2File,
FileFormat,
Format,
FormatError,
Expand All @@ -21,6 +22,7 @@

__all__ = (
"FORMAT_REGISTRY",
"Bzip2File",
"FileFormat",
"Format",
"FormatError",
Expand Down
39 changes: 30 additions & 9 deletions mandible/metadata_mapper/format/format.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import bz2
import contextlib
import inspect
import json
Expand Down Expand Up @@ -67,10 +68,7 @@ def get_values(
"""Get a list of values from a file"""

with self.parse_data(file) as data:
return {
key: self._eval_key_wrapper(data, key)
for key in keys
}
return {key: self._eval_key_wrapper(data, key) for key in keys}

def get_value(self, file: IO[bytes], key: Key) -> Any:
"""Convenience function for getting a single value"""
Expand Down Expand Up @@ -150,7 +148,7 @@ class ZipMember(Format):
"""A member from a zip archive.

:param filters: A set of filters used to select the desired archive member
:param format: The Format of the archive member
:param format: The `Format` of the archive member
"""

filters: dict[str, Any]
Expand Down Expand Up @@ -235,10 +233,7 @@ def parse_data(file: IO[bytes]) -> Generator[dict]:
with zipfile.ZipFile(file, "r") as zf:
yield {
"infolist": [
{
k: getattr(info, k)
for k in ZIP_INFO_ATTRS
}
{k: getattr(info, k) for k in ZIP_INFO_ATTRS}
for info in zf.infolist()
],
"filename": zf.filename,
Expand All @@ -250,3 +245,29 @@ def eval_key(data: dict, key: Key) -> Any:
values = jsonpath.get(data, key.key)

return key.resolve_list_match(values)


@dataclass
class Bzip2File(Format):
    """A file compressed with Bzip2.

    The compressed stream is opened with `bz2.BZ2File` and the resulting
    file object is handed to the wrapped format, which performs the actual
    key lookups.

    :param format: The `Format` of the compressed file
    """

    format: Format

    def get_values(
        self,
        file: IO[bytes],
        keys: Iterable[Key],
    ) -> dict[Key, Any]:
        """Get the values for several keys from a Bzip2 compressed file"""

        decompressed = bz2.BZ2File(file, mode="rb")
        with decompressed:
            return self.format.get_values(decompressed, keys)

    def get_value(self, file: IO[bytes], key: Key) -> Any:
        """Get the value for a single key from a Bzip2 compressed file"""

        decompressed = bz2.BZ2File(file, mode="rb")
        with decompressed:
            return self.format.get_value(decompressed, key)
33 changes: 29 additions & 4 deletions mandible/metadata_mapper/format/h5.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import contextlib
from dataclasses import dataclass
from typing import IO, Any
from typing import IO, Any, Optional

import h5py
import numpy as np
Expand All @@ -18,7 +18,33 @@ def parse_data(file: IO[bytes]) -> contextlib.AbstractContextManager[Any]:

@staticmethod
def eval_key(data: Any, key: Key) -> Any:
    """Look up a dataset value or an attribute value in parsed HDF5 data.

    The key string is split by `parse_key` into a node name and an optional
    attribute name. Without an attribute name the node's value is read with
    `[()]`; with one, the value is read from the node's `attrs`.

    :param data: the parsed HDF5 data, indexable by node name
    :param key: the `Key` whose `key` string selects the value
    :returns: the selected value passed through `normalize`
    :raises ValueError: from `parse_key` when the key contains more than one
        unescaped '@'
    :raises KeyError: when the attribute lookup fails (and, presumably, when
        the node lookup fails -- that depends on the mapping behind `data`)
    """
    group_key, attribute_key = parse_key(key.key)
    node = data[group_key]
    if attribute_key is None:
        return normalize(node[()])
    # Index rather than `.get()`: a missing attribute must fail loudly, the
    # same way a missing node does, instead of silently yielding `None`.
    return normalize(node.attrs[attribute_key])


def parse_key(key: str) -> tuple[str, Optional[str]]:
    """Split a HDF5 key into a group name and an optional attribute name.

    A single '@' separates the group name from the attribute name. A literal
    '@' inside either name is written as '@@'.

    :param key: the raw key string
    :returns: (str, str | None) -- the group name and the attribute name (if any)
    :raises ValueError: if more than one unescaped '@' is present
    """

    # HDF5 states null character is not a valid group name, so it is safe to
    # use as a stand-in for escaped '@' characters while splitting.
    # https://docs.hdfgroup.org/documentation/hdf5/latest/_l_b_grp_create_names.html
    sentinel = "\0"
    pieces = key.replace("@@", sentinel).split("@")

    # N unescaped separators produce N + 1 pieces; at most one is allowed.
    if len(pieces) > 2:
        raise ValueError(f"Invalid key: multiple '@' in '{key}'")

    group, *attribute = (piece.replace(sentinel, "@") for piece in pieces)

    return group, (attribute[0] if attribute else None)


def normalize(node_val: Any) -> Any:
Expand All @@ -30,8 +56,7 @@ def normalize(node_val: Any) -> Any:
return float(node_val)
if isinstance(node_val, np.ndarray):
value = [
x.decode("utf-8") if isinstance(x, bytes) else x
for x in node_val.tolist()
x.decode("utf-8") if isinstance(x, bytes) else x for x in node_val.tolist()
]
return value
if isinstance(node_val, bytes):
Expand Down
Binary file added tests/data/example.json.bz2
Binary file not shown.
49 changes: 48 additions & 1 deletion tests/integration_tests/test_full_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ def sources():
"class": "Xml",
},
},
"bzip2json": {
"storage": {
"class": "LocalFile",
"filters": {
"name": r"example\.json\.bz2",
},
},
"format": {
"class": "Bzip2File",
"format": {
"class": "Json",
},
},
},
}


Expand Down Expand Up @@ -83,6 +97,29 @@ def template():
return_list=True,
),
},
"Bzip2JsonMd": {
"description": mapped("bzip2json", "description"),
"total": mapped("bzip2json", "meta.summary.total"),
"complete": mapped("bzip2json", "meta.summary.complete"),
"null": mapped("bzip2json", "meta.null"),
# JSONPath only queries
"banana_price": mapped("bzip2json", "inventory[?name = 'Banana'].price"),
"oreo_price": mapped(
"bzip2json",
"inventory[?name = 'Oreo'].price",
default=4.49,
),
"first_red_item": mapped(
"bzip2json",
"inventory[?attributes.color = 'red'].name",
return_first=True,
),
"in_stock_items": mapped(
"bzip2json",
"inventory[?in_stock = true].name",
return_list=True,
),
},
})


Expand All @@ -94,7 +131,7 @@ def context(data_path):
"name": f"example.{ext}",
"path": str(data_path / f"example.{ext}"),
}
for ext in ("json", "xml", "h5")
for ext in ("json", "json.bz2", "xml", "h5")
],
meta={
"json_file_name": r"example\.json",
Expand Down Expand Up @@ -129,4 +166,14 @@ def test_full_example(context, sources, template):
"first_red_item": "Apple",
"in_stock_items": ["Apple", "Banana", "Tomato", "Scotch Tape", "Oreo"],
},
"Bzip2JsonMd": {
"description": "A store inventory",
"total": 5,
"complete": False,
"null": None,
"banana_price": 0.99,
"oreo_price": 4.49,
"first_red_item": "Apple",
"in_stock_items": ["Apple", "Banana", "Tomato", "Scotch Tape", "Oreo"],
},
}
91 changes: 91 additions & 0 deletions tests/test_format.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import bz2
import io
import json
import zipfile
from unittest import mock

Expand All @@ -7,6 +9,7 @@
from mandible.metadata_mapper.format import (
FORMAT_REGISTRY,
H5,
Bzip2File,
Format,
FormatError,
Json,
Expand All @@ -24,6 +27,7 @@

def test_registry():
assert FORMAT_REGISTRY == {
"Bzip2File": Bzip2File,
"H5": H5,
"Json": Json,
"Xml": Xml,
Expand Down Expand Up @@ -94,6 +98,47 @@ def test_h5_empty_key():
format.get_value(file, Key(""))


@pytest.mark.h5
def test_h5_attribute():
    """Datasets and attributes are both reachable, including escaped '@'."""
    file = io.BytesIO()
    datasets = {
        "foo": "foo value",
        "bar": "bar value",
        "list": ["list", "value"],
        "foo@bar": "foo@bar value",
    }
    with h5py.File(file, "w") as h5_file:
        for name, value in datasets.items():
            h5_file[name] = value

        plain_group = h5_file.create_group("foo_with_attribute")
        plain_group.attrs["value"] = "foo_with_attribute value"
        plain_group.attrs["foo@bar"] = "foo_with_attribute @bar value"

        # Both the group name and the attribute name contain a literal '@'
        at_group = h5_file.create_group("bar@foo")
        at_group.attrs["attr@ibute"] = "testing_attribute@_group@"

    h5_format = H5()

    expected = {
        Key("/foo"): "foo value",
        Key("bar"): "bar value",
        Key("list"): ["list", "value"],
        Key("foo@@bar"): "foo@bar value",
        Key("foo_with_attribute@value"): "foo_with_attribute value",
        Key("foo_with_attribute@foo@@bar"): "foo_with_attribute @bar value",
        Key("bar@@foo@attr@@ibute"): "testing_attribute@_group@",
    }
    assert h5_format.get_values(file, list(expected)) == expected

    with pytest.raises(FormatError, match="Invalid key: multiple '@'"):
        h5_format.get_values(file, [Key("test@test@test")])


@pytest.mark.h5
def test_h5_key_error():
file = io.BytesIO()
Expand Down Expand Up @@ -430,3 +475,49 @@ def test_xml_key_error():

with pytest.raises(FormatError, match="key not found 'foo'"):
format.get_values(file, [Key("foo")])


@pytest.mark.h5
def test_bzip2_h5py():
    """An HDF5 file wrapped in Bzip2 can be read through `Bzip2File`."""
    datasets = {
        "foo": "foo value",
        "bar": "bar value",
        "list": ["list", "value"],
    }
    raw_h5 = io.BytesIO()
    with h5py.File(raw_h5, "w") as h5_file:
        for name, value in datasets.items():
            h5_file[name] = value

    compressed = io.BytesIO(bz2.compress(raw_h5.getvalue()))
    bzip2_format = Bzip2File(format=H5())

    # The same buffer is reused for every lookup, so rewind after each read.
    for name, expected in datasets.items():
        assert bzip2_format.get_value(compressed, Key(name)) == expected
        compressed.seek(0)

    assert bzip2_format.get_values(compressed, [Key("foo"), Key("bar")]) == {
        Key("foo"): "foo value",
        Key("bar"): "bar value",
    }


def test_bzip2_json():
    """A JSON document wrapped in Bzip2 can be read through `Bzip2File`."""
    document = {
        "foo": "foo value",
        "bar": "bar value",
    }
    payload = json.dumps(document).encode("utf-8")

    compressed = io.BytesIO(bz2.compress(payload))
    bzip2_format = Bzip2File(format=Json())
    foo_key = Key("$.foo")

    # The same buffer is reused for every lookup, so rewind between reads.
    assert bzip2_format.get_value(compressed, foo_key) == "foo value"
    compressed.seek(0)
    assert bzip2_format.get_values(compressed, [foo_key]) == {
        foo_key: "foo value",
    }
    compressed.seek(0)
    assert bzip2_format.get_value(compressed, Key("$")) == document
19 changes: 19 additions & 0 deletions tests/test_h5.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,22 @@ def test_normalize():
assert normalize(np.array(["A", "B"], dtype="|S1")) == ["A", "B"]
assert normalize(np.array(["A", "B"], dtype="O")) == ["A", "B"]
assert normalize(np.array([], dtype="|S1")) == []


def test_parse_key():
    """`parse_key` splits on a single '@' and treats '@@' as a literal '@'."""
    from mandible.metadata_mapper.format.h5 import parse_key

    valid_cases = {
        "foo": ("foo", None),
        "foo@@oo": ("foo@oo", None),
        "foo@bar": ("foo", "bar"),
        "@bar": ("", "bar"),
        "foo@": ("foo", ""),
        "fo@@o@bar": ("fo@o", "bar"),
        "foo@@@bar": ("foo@", "bar"),
        "@@@@": ("@@", None),
        "@@@foo@@": ("@", "foo@"),
    }
    for raw_key, expected in valid_cases.items():
        assert parse_key(raw_key) == expected

    # More than one unescaped '@' is rejected
    for invalid_key in ("a@b@c", "@@a@b@c@@"):
        with pytest.raises(ValueError):
            parse_key(invalid_key)