diff --git a/.gitignore b/.gitignore index d0f2d8d8f..a2de93108 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ sandbox/ venv/ venvs/ .DS_Store +uv.lock diff --git a/CLAUDE.md b/CLAUDE.md index ec4a22d19..e6d47b9f2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -18,7 +18,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Test Markers - When adding AI-generated tests, mark them with `@pytest.mark.ai_generated` -- Any new pytest markers must be registered in `tox.ini` under `[pytest]` section in the `markers` list +- Any new pytest markers must be registered in `pytest_configure` function of `dandi/pytest_plugin.py` ## Code Style - Code is formatted with Black (line length 100) @@ -31,6 +31,11 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co - Prefer specific exceptions over generic ones - For CLI, use click library patterns - Imports organized: stdlib, third-party, local (alphabetical within groups) +- **Imports must be at the top of the file** — do NOT place imports inside + functions or methods unless there is a concrete reason (circular dependency, + or heavy transitive imports like `pynwb`/`h5py`/`nwbinspector` that would + slow down module load for unrelated code paths). When deferring an import + for weight, add the comment `# Avoid heavy import by importing within function:`. 
 ## Documentation
 - Keep docstrings updated when changing function signatures
diff --git a/dandi/files/bases.py b/dandi/files/bases.py
index 15bc616c4..6ade4d09b 100644
--- a/dandi/files/bases.py
+++ b/dandi/files/bases.py
@@ -18,9 +18,25 @@
 from dandischema.digests.dandietag import DandiETag
 from dandischema.models import BareAsset, CommonModel
 from dandischema.models import Dandiset as DandisetMeta
-from dandischema.models import get_schema_version
+from dandischema.models import StandardsType, get_schema_version, nwb_standard
 from packaging.version import Version
 from pydantic import ValidationError
+
+import dandischema
+
+# True when the installed dandischema exposes the per-asset dataStandard field
+# and related StandardsType enhancements (version, extensions). All these
+# fields ship together starting with dandischema 0.12.2.
+# TODO: remove this guard (and all branches that check it) once the minimum
+# required dandischema version is >= 0.12.2.
+_SCHEMA_BAREASSET_HAS_DATASTANDARD = "dataStandard" in BareAsset.model_fields
+if not _SCHEMA_BAREASSET_HAS_DATASTANDARD and Version(
+    dandischema.__version__
+) >= Version("0.12.2"):
+    raise RuntimeError(
+        f"dandischema {dandischema.__version__} should have "
+        f"'dataStandard' field on BareAsset"
+    )
 from pydantic_core import ErrorDetails
 
 import requests
@@ -504,6 +518,32 @@ def get_metadata(
             else:
                 raise
         metadata.path = self.path
+        if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
+            kwargs: dict[str, Any] = dict(nwb_standard)
+            # Avoid heavy import by importing within function:
+            from dandi.pynwb_utils import get_nwb_extensions
+
+            try:
+                nwb_exts = get_nwb_extensions(self.filepath)
+            except Exception:
+                lgr.debug(
+                    "Failed to extract NWB extensions from %s",
+                    self.filepath,
+                    exc_info=True,
+                )
+                nwb_exts = {}
+            if nwb_exts:
+                kwargs["extensions"] = [
+                    StandardsType(name=name, version=ver).model_dump(
+                        mode="json", exclude_none=True
+                    )
+                    for name, ver in sorted(nwb_exts.items())
+                ]
+            nwb = StandardsType(**kwargs)
+            if 
metadata.dataStandard is None: + metadata.dataStandard = [nwb] + elif nwb not in metadata.dataStandard: + metadata.dataStandard.append(nwb) return metadata # TODO: @validate_cache.memoize_path diff --git a/dandi/files/bids.py b/dandi/files/bids.py index 7e43163ec..e9507aac8 100644 --- a/dandi/files/bids.py +++ b/dandi/files/bids.py @@ -3,15 +3,27 @@ from collections import defaultdict from dataclasses import dataclass, field from datetime import datetime +import json from pathlib import Path from threading import Lock import weakref -from dandischema.models import BareAsset +from dandischema.models import ( + BareAsset, + StandardsType, + bids_standard, + ome_ngff_standard, +) +import dandi from dandi.bids_validator_deno import bids_validate -from .bases import GenericAsset, LocalFileAsset, NWBAsset +from .bases import GenericAsset, LocalFileAsset, NWBAsset, _SCHEMA_BAREASSET_HAS_DATASTANDARD + +if _SCHEMA_BAREASSET_HAS_DATASTANDARD: + from dandischema.models import hed_standard +else: + hed_standard = None # type: ignore[assignment] from .zarr import ZarrAsset from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file from ..metadata.core import add_common_metadata, prepare_metadata @@ -23,10 +35,30 @@ ValidationResult, ) +lgr = dandi.get_logger() + BIDS_ASSET_ERRORS = ("BIDS.NON_BIDS_PATH_PLACEHOLDER",) BIDS_DATASET_ERRORS = ("BIDS.MANDATORY_FILE_MISSING_PLACEHOLDER",) +def _add_standard( + metadata, # type: ignore[no-untyped-def] + standard_dict: dict, + version: str | None = None, +) -> None: + """Add a data standard to asset metadata if the field is available.""" + if not _SCHEMA_BAREASSET_HAS_DATASTANDARD: + return + kwargs = dict(standard_dict) + if version: + kwargs["version"] = version + standard = StandardsType(**kwargs) + if metadata.dataStandard is None: + metadata.dataStandard = [standard] + elif standard not in metadata.dataStandard: + metadata.dataStandard.append(standard) + + @dataclass class BIDSDatasetDescriptionAsset(LocalFileAsset): """ @@ 
-192,7 +224,50 @@ def get_validation_errors( assert self._dataset_errors is not None return self._dataset_errors.copy() - # get_metadata(): inherit use of default metadata from LocalFileAsset + def get_metadata( + self, + digest: Digest | None = None, + ignore_errors: bool = True, + ) -> BareAsset: + metadata = super().get_metadata(digest=digest, ignore_errors=ignore_errors) + try: + with open(self.filepath) as f: + desc = json.load(f) + except (OSError, json.JSONDecodeError) as e: + lgr.warning("Failed to read %s: %s", self.filepath, e) + _add_standard(metadata, bids_standard) + return metadata + _add_standard(metadata, bids_standard, version=desc.get("BIDSVersion")) + if hed_standard and (hed_version := desc.get("HEDVersion")): + # HEDVersion can be a string or list. + # List form: ["8.2.0", "sc:1.0.0"] where first element is base + # HED version and subsequent "prefix:version" entries are library + # schemas recorded as extensions. + if isinstance(hed_version, list): + version = hed_version[0] if hed_version else None + library_entries = hed_version[1:] + else: + version = hed_version + library_entries = [] + kwargs: dict = dict(hed_standard) + if version: + kwargs["version"] = version + if library_entries: + extensions = [] + for entry in library_entries: + # Format is "prefix:version" (e.g. 
"sc:1.0.0")
+                    if ":" in str(entry):
+                        lib_name, lib_ver = str(entry).split(":", 1)
+                    else:
+                        lib_name, lib_ver = str(entry), None
+                    ext = StandardsType(name=lib_name, version=lib_ver)
+                    extensions.append(
+                        ext.model_dump(mode="json", exclude_none=True)
+                    )
+                if extensions:
+                    kwargs["extensions"] = extensions
+            _add_standard(metadata, kwargs)
+        return metadata
 
 
 @dataclass
@@ -312,6 +387,8 @@ def get_metadata(
         add_common_metadata(metadata, self.filepath, start_time, end_time, digest)
         metadata.path = self.path
         metadata.encodingFormat = ZARR_MIME_TYPE
+        if Path(self.path).suffixes[-2:] == [".ome", ".zarr"]:
+            _add_standard(metadata, ome_ngff_standard)
         return metadata
 
 
diff --git a/dandi/pynwb_utils.py b/dandi/pynwb_utils.py
index 7ffe552a8..ff41bbe6b 100644
--- a/dandi/pynwb_utils.py
+++ b/dandi/pynwb_utils.py
@@ -152,6 +152,49 @@ def _sanitize(v: Any) -> str:
     return None
 
 
+# Namespaces bundled with NWB/HDMF core — not extensions
+_NWB_CORE_NAMESPACES = frozenset({"core", "hdmf-common", "hdmf-experimental"})
+
+
+def get_nwb_extensions(filepath: str | Path | Readable) -> dict[str, str]:
+    """Return NWB extensions embedded in an HDF5 file.
+
+    Reads the ``specifications`` group of an NWB HDF5 file and returns a
+    mapping of extension namespace names to their latest embedded version,
+    excluding core NWB/HDMF namespaces.
+
+    Parameters
+    ----------
+    filepath
+        Path to an NWB ``.nwb`` HDF5 file, or a :class:`Readable`.
+
+    Returns
+    -------
+    dict[str, str]
+        ``{namespace_name: latest_version}`` for each non-core namespace
+        found in the file's ``specifications`` group. Empty dict if no
+        extensions are present or the group does not exist.
+ """ + extensions: dict[str, str] = {} + with open_readable(filepath) as fp, h5py.File(fp, "r") as h5file: + specs = h5file.get("specifications") + if specs is None: + return extensions + for name in specs: + if name in _NWB_CORE_NAMESPACES: + continue + ns_group = specs[name] + if not isinstance(ns_group, h5py.Group): + continue + try: + sorted_versions = sorted(ns_group, key=Version) + if sorted_versions: + extensions[name] = sorted_versions[-1] + except Exception: + lgr.debug("Failed to parse versions for NWB extension %s", name) + return extensions + + def get_neurodata_types_to_modalities_map() -> dict[str, str]: """Return a dict to map neurodata types known to pynwb to "modalities" diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index cef6c8ac3..b25d86ffe 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json from operator import attrgetter import os from pathlib import Path @@ -16,6 +17,7 @@ from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file from ..dandiapi import AssetType, RemoteZarrAsset from ..exceptions import UnknownAssetError +from ..files.bases import _SCHEMA_BAREASSET_HAS_DATASTANDARD from ..files import ( BIDSDatasetDescriptionAsset, DandisetMetadataFile, @@ -608,3 +610,88 @@ def test_validate_invalid_zarr3(path: str, expected_result_ids: set[str]) -> Non result_ids = {r.id for r in zf.get_validation_errors()} assert result_ids == expected_result_ids + + +@pytest.mark.ai_generated +class TestBIDSDatasetDescriptionDataStandard: + """Tests for per-asset dataStandard population from dataset_description.json""" + + @staticmethod + def _make_bids_dd(tmp_path: Path, content: dict) -> BIDSDatasetDescriptionAsset: + dd_path = tmp_path / "dataset_description.json" + dd_path.write_text(json.dumps(content)) + return BIDSDatasetDescriptionAsset( + filepath=dd_path, + path="dataset_description.json", + dandiset_path=tmp_path, + ) + + @staticmethod + def 
_standard_names(metadata): # type: ignore[no-untyped-def] + if not _SCHEMA_BAREASSET_HAS_DATASTANDARD: + pytest.skip("dandischema too old, no dataStandard on BareAsset") + return [s.name for s in (metadata.dataStandard or [])] + + def test_bids_always_set(self, tmp_path: Path) -> None: + asset = self._make_bids_dd( + tmp_path, + {"Name": "Test", "BIDSVersion": "1.9.0"}, + ) + names = self._standard_names(asset.get_metadata()) + assert "Brain Imaging Data Structure (BIDS)" in names + + def test_hed_detected_when_hedversion_present(self, tmp_path: Path) -> None: + asset = self._make_bids_dd( + tmp_path, + {"Name": "Test", "BIDSVersion": "1.9.0", "HEDVersion": "8.2.0"}, + ) + names = self._standard_names(asset.get_metadata()) + assert "Hierarchical Event Descriptors (HED)" in names + assert "Brain Imaging Data Structure (BIDS)" in names + + def test_hed_not_detected_when_hedversion_absent(self, tmp_path: Path) -> None: + asset = self._make_bids_dd( + tmp_path, + {"Name": "Test", "BIDSVersion": "1.9.0"}, + ) + names = self._standard_names(asset.get_metadata()) + assert "Hierarchical Event Descriptors (HED)" not in names + + def test_hed_detected_with_list_hedversion(self, tmp_path: Path) -> None: + """HEDVersion can be a list of strings per BIDS spec.""" + asset = self._make_bids_dd( + tmp_path, + { + "Name": "Test", + "BIDSVersion": "1.9.0", + "HEDVersion": ["8.2.0", "sc:1.0.0"], + }, + ) + names = self._standard_names(asset.get_metadata()) + assert "Hierarchical Event Descriptors (HED)" in names + + def test_hed_library_schemas_as_extensions(self, tmp_path: Path) -> None: + """HED library schemas in list HEDVersion populate extensions.""" + if not _SCHEMA_BAREASSET_HAS_DATASTANDARD: + pytest.skip("dandischema too old, no dataStandard on BareAsset") + asset = self._make_bids_dd( + tmp_path, + { + "Name": "Test", + "BIDSVersion": "1.9.0", + "HEDVersion": ["8.2.0", "sc:1.0.0", "lang:1.1.0"], + }, + ) + metadata = asset.get_metadata() + hed_standards = [ + s for s in 
(metadata.dataStandard or []) + if s.name == "Hierarchical Event Descriptors (HED)" + ] + assert len(hed_standards) == 1 + hed = hed_standards[0] + assert hed.version == "8.2.0" + assert hed.extensions is not None + ext_names = {e.name for e in hed.extensions} + assert ext_names == {"sc", "lang"} + ext_map = {e.name: e.version for e in hed.extensions} + assert ext_map == {"sc": "1.0.0", "lang": "1.1.0"} diff --git a/dandi/tests/test_pynwb_utils.py b/dandi/tests/test_pynwb_utils.py index 0de33d555..47bac6fa4 100644 --- a/dandi/tests/test_pynwb_utils.py +++ b/dandi/tests/test_pynwb_utils.py @@ -6,10 +6,11 @@ import re from typing import Any, NoReturn +import h5py import numpy as np from pynwb import NWBHDF5IO, NWBFile, TimeSeries -from ..pynwb_utils import _sanitize_nwb_version, nwb_has_external_links +from ..pynwb_utils import _sanitize_nwb_version, get_nwb_extensions, nwb_has_external_links def test_pynwb_io(simple1_nwb: Path) -> None: @@ -103,3 +104,34 @@ def test_nwb_has_external_links(tmp_path): assert not nwb_has_external_links(filename1) assert nwb_has_external_links(filename4) + + +def test_get_nwb_extensions(tmp_path: Path) -> None: + """Test extraction of NWB extensions from HDF5 specifications group.""" + h5path = tmp_path / "test.nwb" + with h5py.File(h5path, "w") as f: + specs = f.create_group("specifications") + # Core namespaces should be excluded + core_grp = specs.create_group("core") + core_grp.create_group("2.7.0") + hdmf_grp = specs.create_group("hdmf-common") + hdmf_grp.create_group("1.8.0") + # An extension namespace should be included, latest version used + ndx_ecog = specs.create_group("ndx-ecog") + ndx_ecog.create_group("0.1.0") + ndx_ecog.create_group("0.2.0") + # Another extension + ndx_events = specs.create_group("ndx-events") + ndx_events.create_group("0.3.0") + + result = get_nwb_extensions(h5path) + assert result == {"ndx-ecog": "0.2.0", "ndx-events": "0.3.0"} + + +def test_get_nwb_extensions_no_specs(tmp_path: Path) -> None: + """No 
specifications group returns empty dict.""" + h5path = tmp_path / "test.nwb" + with h5py.File(h5path, "w") as f: + f.attrs["nwb_version"] = "2.7.0" + + assert get_nwb_extensions(h5path) == {}