From 8b015297b4310254e8234dea546b4b3206f0857c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Feb 2026 22:30:14 -0500 Subject: [PATCH 1/2] feat: add HED standard, per-asset dataStandard, version and extensions on StandardsType - Bump DANDI_SCHEMA_VERSION to 0.8.0, add 0.7.0 to ALLOWED_INPUT_SCHEMAS - Add `dataStandard: Optional[List[StandardsType]]` to BareAsset for per-asset standard declarations (NWB, BIDS, HED, OME/NGFF) - Add `version: Optional[str]` to StandardsType - Add `extensions: Optional[List["StandardsType"]]` (self-referencing) to StandardsType for NWB ndx-* extensions, HED library schemas, etc. - Add `hed_standard` constant (RRID:SCR_014074) - Collect per-asset dataStandard in aggregate_assets_summary(), with deprecated path/encoding heuristic fallbacks (remove after 2026-12-01) Co-Authored-By: Claude Opus 4.6 --- dandischema/consts.py | 3 +- dandischema/metadata.py | 23 +++++++++++++- dandischema/models.py | 27 ++++++++++++++++ dandischema/tests/test_metadata.py | 49 ++++++++++++++++++++++++++++++ dandischema/tests/test_models.py | 7 ++++- 5 files changed, 106 insertions(+), 3 deletions(-) diff --git a/dandischema/consts.py b/dandischema/consts.py index cf02329d..c5091395 100644 --- a/dandischema/consts.py +++ b/dandischema/consts.py @@ -1,6 +1,6 @@ from packaging.version import Version as _Version -DANDI_SCHEMA_VERSION = "0.7.0" +DANDI_SCHEMA_VERSION = "0.8.0" ALLOWED_INPUT_SCHEMAS = [ "0.4.4", "0.5.1", @@ -16,6 +16,7 @@ "0.6.8", "0.6.9", "0.6.10", + "0.7.0", DANDI_SCHEMA_VERSION, ] diff --git a/dandischema/metadata.py b/dandischema/metadata.py index 19064262..e43c9fd4 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -459,6 +459,22 @@ def migrate( if "schemaKey" not in obj_migrated: obj_migrated["schemaKey"] = "Dandiset" + # Prune fields introduced in newer schema versions when target is older. + # Each entry: (version_introduced, list_of_top_level_field_paths_to_remove) + # + # Note: migrate() only processes Dandiset metadata (not Asset metadata). + # BareAsset.dataStandard (new in 0.7.0) does not need pruning here; + # AssetsSummary.dataStandard (nested in assetsSummary) pre-dates 0.7.0. + # StandardsType.version and .extensions are structural additions to an + # existing type — older consumers ignore unknown fields. + _FIELDS_INTRODUCED: list[tuple[str, list[str]]] = [ + # ("0.7.0", ["field_on_dandiset"]), + ] + for ver, fields in _FIELDS_INTRODUCED: + if target_ver_tuple < version2tuple(ver): + for field_path in fields: + obj_migrated.pop(field_path, None) + # Always update schemaVersion when migrating obj_migrated["schemaVersion"] = to_version return obj_migrated @@ -543,9 +559,14 @@ def add_if_missing(standard: dict) -> None: if standard not in stats["dataStandard"]: stats["dataStandard"].append(standard) + # Collect per-asset dataStandard declarations populated by dandi-cli + for standard in assetmeta.get("dataStandard") or []: + add_if_missing(standard) + + # DEPRECATED: path/encoding heuristic fallbacks for older clients that do not + # populate per-asset dataStandard. Remove after 2026-12-01. 
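+    # For example, an NWB asset from an older client that lacks per-asset
+    # dataStandard still gets models.nwb_standard via the encodingFormat
+    # check below.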
if "nwb" in assetmeta["encodingFormat"]: add_if_missing(models.nwb_standard) - # TODO: RF assumption that any .json implies BIDS if Path(assetmeta["path"]).name == "dataset_description.json": add_if_missing(models.bids_standard) if Path(assetmeta["path"]).suffixes == [".ome", ".zarr"]: diff --git a/dandischema/models.py b/dandischema/models.py index bb3b8f2e..38ee8dda 100644 --- a/dandischema/models.py +++ b/dandischema/models.py @@ -860,11 +860,27 @@ class MeasurementTechniqueType(BaseType): class StandardsType(BaseType): """Identifier for data standard used""" + version: Optional[str] = Field( + None, + description="Version of the standard used.", + json_schema_extra={"nskey": "schema"}, + ) + extensions: Optional[List["StandardsType"]] = Field( + None, + description="Extensions to the standard used in this dataset " + "(e.g. NWB extensions like ndx-*, HED library schemas).", + ) + # TODO: consider how to formalize BIDS extensions (BEPs) once BIDS + # has a machine-readable way to declare them. schemaKey: Literal["StandardsType"] = Field( "StandardsType", validate_default=True, json_schema_extra={"readOnly": True} ) +# Self-referencing model needs rebuild after class definition +# https://docs.pydantic.dev/latest/concepts/postponed_annotations/#self-referencing-or-recursive-models +StandardsType.model_rebuild() + nwb_standard = StandardsType( name="Neurodata Without Borders (NWB)", identifier="RRID:SCR_015242", @@ -880,6 +896,11 @@ class StandardsType(BaseType): identifier="DOI:10.25504/FAIRsharing.9af712", ).model_dump(mode="json", exclude_none=True) +hed_standard = StandardsType( + name="Hierarchical Event Descriptors (HED)", + identifier="RRID:SCR_014074", +).model_dump(mode="json", exclude_none=True) + class ContactPoint(DandiBaseModel): email: Optional[EmailStr] = Field( @@ -1841,6 +1862,12 @@ class BareAsset(CommonModel): json_schema_extra={"nskey": "prov"}, ) + dataStandard: Optional[List[StandardsType]] = Field( + None, + description="Data standard(s) applicable to this asset.", + json_schema_extra={"readOnly": True}, + ) + # Bare asset is to be just Asset. 
schemaKey: Literal["Asset"] = Field( "Asset", validate_default=True, json_schema_extra={"readOnly": True} diff --git a/dandischema/tests/test_metadata.py b/dandischema/tests/test_metadata.py index 6da7fff3..b5293682 100644 --- a/dandischema/tests/test_metadata.py +++ b/dandischema/tests/test_metadata.py @@ -758,6 +758,55 @@ def test_aggregation_bids() -> None: ) # only a single entry so we do not duplicate them +def test_hed_standard_structure() -> None: + from dandischema.models import hed_standard + + assert hed_standard["schemaKey"] == "StandardsType" + assert hed_standard["name"] == "Hierarchical Event Descriptors (HED)" + assert hed_standard["identifier"] == "RRID:SCR_014074" + + +def test_aggregate_per_asset_datastandard() -> None: + """Per-asset dataStandard entries are collected into the summary.""" + from dandischema.models import hed_standard + + data = [ + { + "schemaKey": "Asset", + "schemaVersion": "0.6.0", + "path": "dataset_description.json", + "encodingFormat": "application/json", + "contentSize": 512, + "digest": {"dandi:dandi-etag": "abc123-1"}, + "dataStandard": [hed_standard], + }, + ] + summary = aggregate_assets_summary(data) + assert hed_standard in summary["dataStandard"] + # dataset_description.json also triggers BIDS via deprecated heuristic + assert sum("BIDS" in s.get("name", "") for s in summary["dataStandard"]) == 1 + + +def test_aggregate_per_asset_datastandard_no_duplication() -> None: + """No duplication when a standard is declared both per-asset and via heuristic.""" + from dandischema.models import bids_standard + + data = [ + { + "schemaKey": "Asset", + "schemaVersion": "0.6.0", + "path": "dataset_description.json", + "encodingFormat": "application/json", + "contentSize": 512, + "digest": {"dandi:dandi-etag": "abc123-1"}, + "dataStandard": [bids_standard], + }, + ] + summary = aggregate_assets_summary(data) + bids_count = sum("BIDS" in s.get("name", "") for s in summary["dataStandard"]) + assert bids_count == 1, "BIDS should appear exactly once, not duplicated" + + class TestValidateObjJson: """ Tests for `_validate_obj_json()` diff --git a/dandischema/tests/test_models.py b/dandischema/tests/test_models.py index 2a50b174..4705c693 100644 --- a/dandischema/tests/test_models.py +++ b/dandischema/tests/test_models.py @@ -582,7 +582,12 @@ def check_qname(qname: str, klass: type) -> None: "RelatedParticipant", ): return - if qname in "dandi:approach" and klass.__name__ in ( + if qname == "dandi:approach" and klass.__name__ in ( + "Asset", + "AssetsSummary", + ): + return + if qname == "dandi:dataStandard" and klass.__name__ in ( "Asset", "AssetsSummary", ): From 4bb5f6fa1cb7eb231ab7387701a518ba9d021e30 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Feb 2026 22:30:19 -0500 Subject: [PATCH 2/2] docs: comprehensive CLAUDE.md with architecture, conventions, and checklist Co-Authored-By: Claude Opus 4.6 --- .gitignore | 2 + CLAUDE.md | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 CLAUDE.md diff --git a/.gitignore b/.gitignore index 14b9e168..14061b0d 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ sandbox/ venv/ venvs/ dandischema/_version.py +uv.lock +.cache/ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..05f01b15 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,138 @@ +# CLAUDE.md + +This file provides guidance to Claude Code when working with code in this +repository. 
+ +## Project Overview + +dandischema defines the Pydantic v2 metadata models for the DANDI +neurophysiology data archive. It is used by both the dandi-cli client and the +dandi-archive server. Key concerns: model definitions, JSON Schema generation, +metadata validation, schema migration between versions, and asset metadata +aggregation. + +## Build/Test Commands + +```bash +tox -e py3 # Run full test suite (preferred) +pytest dandischema/ # Run tests directly in active venv +pytest dandischema/tests/test_metadata.py -v -k "test_name" # Single test +tox -e lint # codespell + flake8 +tox -e typing # mypy (strict, with pydantic plugin) +``` + +- `filterwarnings = error` is active — new warnings will fail tests. +- Coverage is collected by default (`--cov=dandischema`). + +## Code Style + +- **Formatter**: Black (no explicit line-length override → default 88) +- **Import sorting**: isort with `profile = "black"`, `force_sort_within_sections`, + `reverse_relative` +- **Linting**: flake8 (max-line-length=100, ignores E203/W503) +- **Type checking**: mypy strict — `no_implicit_optional`, `warn_return_any`, + `warn_unreachable`, pydantic plugin enabled +- **Pre-commit hooks**: trailing-whitespace, end-of-file-fixer, check-yaml, + check-added-large-files, black, isort, codespell, flake8 +- Imports at top of file; avoid function-level imports unless there is a + concrete reason (circular deps, heavy transitive imports) + +## Architecture + +### Key Modules + +| File | Role | +|------|------| +| `models.py` | All Pydantic models (~2000 lines). Class hierarchy rooted at `DandiBaseModel`. | +| `metadata.py` | `validate()`, `migrate()`, `aggregate_assets_summary()`. | +| `consts.py` | `DANDI_SCHEMA_VERSION`, `ALLOWED_INPUT_SCHEMAS`, `ALLOWED_TARGET_SCHEMAS`. | +| `conf.py` | Instance configuration via env vars (`DANDI_INSTANCE_NAME`, etc.). | +| `types.py` | Custom Pydantic types (`ByteSizeJsonSchema`). | +| `utils.py` | JSON schema helpers, `version2tuple()`, `name2title()`. | +| `exceptions.py` | `ValidationError`, `JsonschemaValidationError`, `PydanticValidationError`. | +| `digests/` | `DandiETag` multipart-upload checksum calculation. | +| `datacite/` | DataCite DOI metadata conversion. | + +### Model Hierarchy (simplified) + +``` +DandiBaseModel +├── PropertyValue # recursive (self-referencing) +├── BaseType +│ ├── StandardsType # name, identifier, version, extensions (recursive) +│ ├── ApproachType, AssayType, SampleType, Anatomy, ... +│ └── MeasurementTechniqueType +├── Person, Organization # Contributor subclasses +├── BioSample # recursive (wasDerivedFrom) +├── AssetsSummary # aggregated stats +└── CommonModel + ├── Dandiset → PublishedDandiset + └── BareAsset → Asset → PublishedAsset +``` + +Several models are **self-referencing** (PropertyValue, BioSample, +StandardsType). These require `model_rebuild()` after the class definition. + +### Data Flow: Asset Metadata Aggregation + +1. dandi-cli calls `asset.get_metadata()` → populates `BareAsset` including + per-asset `dataStandard` list +2. Asset metadata is serialized via `model_dump(mode="json", exclude_none=True)` +3. Server calls `aggregate_assets_summary(assets)` → + `_add_asset_to_stats()` per asset → `AssetsSummary` +4. `_add_asset_to_stats()` collects: numberOfBytes, numberOfFiles, approach, + measurementTechnique, variableMeasured, species, subjects, dataStandard +5. 
`dataStandard` has deprecated path/encoding heuristic fallbacks for old + clients (remove after 2026-12-01) + +### Pre-instantiated Standard Constants + +```python +nwb_standard # RRID:SCR_015242 +bids_standard # RRID:SCR_016124 +ome_ngff_standard # DOI:10.25504/FAIRsharing.9af712 +hed_standard # RRID:SCR_014074 +``` + +These are dicts (`model_dump(mode="json", exclude_none=True)`) used by both +dandischema (heuristic fallbacks) and dandi-cli (per-asset population). + +### Vendorization + +The schema supports deployment for different DANDI instances. Environment +variables (`DANDI_INSTANCE_NAME`, `DANDI_INSTANCE_IDENTIFIER`, +`DANDI_DOI_PREFIX`, etc.) must be set **before** importing +`dandischema.models`. This dynamically adjusts identifier patterns, DOI +prefixes, license enums, and URL patterns. CI tests multiple vendored +configurations. + +## Schema Change Checklist + +When adding or removing fields from any model (BareAsset, Dandiset, +AssetsSummary, etc.): + +1. **Update `_FIELDS_INTRODUCED` in `metadata.py:migrate()`** if adding a new + **top-level field to Dandiset metadata** — `migrate()` only processes + Dandiset-level dicts (not Asset metadata). Fields on BareAsset or nested + inside existing structures (e.g. new fields on StandardsType) do not need + entries here. + +2. **Update `consts.py`** if bumping `DANDI_SCHEMA_VERSION` or adding to + `ALLOWED_INPUT_SCHEMAS`. + +3. **Add tests** covering migration/aggregation with the new field. + +4. **Coordinate with dandi-cli** — new fields that dandi-cli populates need + backward-compat guards there (check `"field" in Model.model_fields`) until + the minimum dandischema dependency is bumped. + +## Testing Notes + +- Tests use `filterwarnings = error` — any new deprecation warning will fail. +- The `clear_dandischema_modules_and_set_env_vars` fixture (conftest.py) + supports testing vendored configurations by clearing cached modules and + setting env vars. +- Network-dependent tests are skipped when `DANDI_TESTS_NONETWORK` is set. +- DataCite tests require `DATACITE_DEV_LOGIN` / `DATACITE_DEV_PASSWORD`. +- `test_models.py:test_duplicate_classes` checks for duplicate field qnames + across models; allowed duplicates are listed explicitly.
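+
+## Example: Declaring Per-Asset Standards
+
+A minimal sketch of the new `StandardsType` fields (`version`, `extensions`)
+and the backward-compat guard from the checklist above. The version string,
+the extension name, and the `asset_metadata` stub are illustrative, not
+canonical; everything else uses APIs defined in this repo:
+
+```python
+from dandischema.models import BareAsset, StandardsType
+
+# HED base schema plus a hypothetical library schema as an extension
+hed = StandardsType(
+    name="Hierarchical Event Descriptors (HED)",
+    identifier="RRID:SCR_014074",
+    version="8.2.0",  # illustrative version string
+    extensions=[StandardsType(name="example HED library schema")],
+).model_dump(mode="json", exclude_none=True)
+
+asset_metadata: dict = {}  # stub for the asset metadata dict being built
+
+# Only populate the field if the installed dandischema already defines it
+# (Schema Change Checklist, item 4).
+if "dataStandard" in BareAsset.model_fields:
+    asset_metadata["dataStandard"] = [hed]
+```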