Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ repos:
rev: v2.4.1
hooks:
- id: codespell
exclude: ^(dandi/_version\.py|dandi/due\.py|versioneer\.py|pyproject\.toml)$
exclude: ^(dandi/_version\.py|dandi/due\.py|versioneer\.py|pyproject\.toml|dandi/data/allen_ccf_structures\.json)$
additional_dependencies:
- tomli; python_version<'3.11'
- repo: https://github.com/PyCQA/flake8
Expand Down
Empty file added dandi/data/__init__.py
Empty file.
1 change: 1 addition & 0 deletions dandi/data/allen_ccf_structures.json

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions dandi/data/generate_allen_structures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""Regenerate allen_ccf_structures.json from Allen Brain Map API.

Run: python -m dandi.data.generate_allen_structures
"""

from __future__ import annotations

import json
from pathlib import Path

import requests


def _flatten(node: dict, out: list[dict]) -> None:
out.append({"id": node["id"], "acronym": node["acronym"], "name": node["name"]})
for child in node.get("children", []):
_flatten(child, out)


def main() -> None:
    """Fetch the Allen structure graph and write a compact JSON list next to this file.

    Downloads structure graph 1 from the Allen Brain Map API, flattens the
    tree into ``{id, acronym, name}`` records sorted by id, and writes them
    (minified) to ``allen_ccf_structures.json`` in this package directory.
    """
    url = "http://api.brain-map.org/api/v2/structure_graph_download/1.json"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    payload = response.json()

    flat: list[dict] = []
    # The API wraps the tree root in a one-element "msg" list.
    _flatten(payload["msg"][0], flat)
    flat.sort(key=lambda record: record["id"])

    target = Path(__file__).with_name("allen_ccf_structures.json")
    # Compact separators keep the bundled data file small.
    with open(target, "w") as fh:
        json.dump(flat, fh, separators=(",", ":"))
    print(f"Wrote {len(flat)} structures to {target}")


if __name__ == "__main__":
main()
252 changes: 252 additions & 0 deletions dandi/metadata/brain_areas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
from __future__ import annotations

import ast
from functools import lru_cache
import json
from pathlib import Path
import re
from typing import Any

from dandischema import models

from .. import get_logger

lgr = get_logger()

MBAO_URI_TEMPLATE = "http://purl.obolibrary.org/obo/MBA_{}"

# Values that should be treated as missing / uninformative
_TRIVIAL_VALUES = frozenset(
{
"",
"unknown",
"none",
"n/a",
"na",
"null",
"unspecified",
"not available",
"not applicable",
"brain",
}
)


@lru_cache(maxsize=1)
def _load_allen_structures() -> list[dict[str, Any]]:
    """Read and cache the bundled Allen CCF structures JSON.

    The file lives at ``dandi/data/allen_ccf_structures.json`` relative to
    this module; the single-slot cache means it is parsed at most once.
    """
    json_path = (
        Path(__file__).resolve().parent.parent / "data" / "allen_ccf_structures.json"
    )
    with open(json_path) as fh:
        loaded: list[dict[str, Any]] = json.load(fh)
    return loaded


@lru_cache(maxsize=1)
def _build_lookup_dicts() -> (
    tuple[dict[str, dict], dict[str, dict], dict[str, dict], dict[str, dict]]
):
    """Build lookup dictionaries for Allen CCF structures.

    Returns
    -------
    tuple of 4 dicts
        (acronym_exact, acronym_lower, name_exact, name_lower)
    """
    by_acronym: dict[str, dict] = {}
    by_acronym_folded: dict[str, dict] = {}
    by_name: dict[str, dict] = {}
    by_name_folded: dict[str, dict] = {}
    for entry in _load_allen_structures():
        # Structures are sorted by id, so setdefault keeps the lowest-id
        # structure on collisions ("first match wins").
        by_acronym.setdefault(entry["acronym"], entry)
        by_acronym_folded.setdefault(entry["acronym"].lower(), entry)
        by_name.setdefault(entry["name"], entry)
        by_name_folded.setdefault(entry["name"].lower(), entry)
    return by_acronym, by_acronym_folded, by_name, by_name_folded


def _is_numeric(val: str) -> bool:
"""Return True if *val* looks like a number (int or float)."""
try:
float(val)
return True
except ValueError:
return False


# Canonicalised area key names recognised in dict-style location strings.
# Keys are normalised by lowering, stripping whitespace, and removing hyphens
# and underscores so that "brain-area", "brain_area", "BrainArea" all match.
_AREA_KEYS = frozenset(
{
"area",
"areaname",
"brainarea",
"brainregion",
"location",
"name",
"region",
"regionname",
}
)


def _normalise_key(key: str) -> str:
"""Lower-case and strip spaces, hyphens, underscores from *key*."""
return re.sub(r"[\s_-]", "", key).lower()


def _extract_area_from_dict(d: dict) -> str | None:
    """Return the first non-trivial area value from *d*, matching keys flexibly.

    Keys are normalised (case/hyphen/underscore-insensitive) and compared
    against the recognised area-key set; values that are empty or in the
    trivial-value set are skipped.
    """
    for raw_key, raw_val in d.items():
        if _normalise_key(str(raw_key)) not in _AREA_KEYS:
            continue
        candidate = str(raw_val).strip()
        if candidate and candidate.lower() not in _TRIVIAL_VALUES:
            return candidate
    return None


def _parse_location_string(location: str) -> list[str]:
    """Parse a raw NWB location string into area tokens, ignoring numeric values.

    Handles:
    - Simple strings: ``"VISp"``
    - Dict literals: ``"{'area': 'VISp', 'depth': '20'}"``
    - Key-value pairs: ``"area: VISp, depth: 175"``
    - Comma-separated lists: ``"VISp,VISrl,VISlm"``

    In the examples above, numeric depth values are ignored.  False-positive
    tokens are harmless: every token is later matched against the Allen CCF
    and unmatched ones are dropped.
    """
    location = location.strip()
    if not location or location.lower() in _TRIVIAL_VALUES:
        return []

    # Try dict literal (e.g. "{'area': 'VISp', 'depth': 20}").  EAFP: it is
    # easier to attempt the parse and handle failure than to test up front
    # whether the string is a valid dict literal.
    if location.startswith("{"):
        try:
            d = ast.literal_eval(location)
            if isinstance(d, dict):
                val = _extract_area_from_dict(d)
                if val is not None:
                    return [val]
                # If no known key, return all non-trivial, non-numeric values
                tokens = []
                for v in d.values():
                    v = str(v).strip()
                    if v and v.lower() not in _TRIVIAL_VALUES and not _is_numeric(v):
                        tokens.append(v)
                return tokens
        except (ValueError, SyntaxError):
            # Not a parseable dict literal; log and fall through to the
            # other, more permissive parsers below.
            lgr.debug("Location %r looks like a dict but failed to parse", location)

    # Try key-value format (e.g. "area: VISp, depth: 175"); the "://" guard
    # avoids treating URLs as key-value pairs.
    if re.search(r"\w+\s*:", location) and "://" not in location:
        pairs = re.split(r",\s*", location)
        kv: dict[str, str] = {}
        for pair in pairs:
            m = re.match(r"(\w+)\s*:\s*(.+)", pair.strip())
            if m:
                kv[m.group(1).lower()] = m.group(2).strip()
        if kv:
            val = _extract_area_from_dict(kv)
            if val is not None:
                return [val]
            # Fall through — return non-trivial, non-numeric values
            tokens = []
            for v in kv.values():
                if v.lower() not in _TRIVIAL_VALUES and not _is_numeric(v):
                    tokens.append(v)
            if tokens:
                return tokens

    # Comma-separated list
    if "," in location:
        tokens = [t.strip() for t in location.split(",")]
        return [t for t in tokens if t and t.lower() not in _TRIVIAL_VALUES]

    # Simple string
    return [location]


def match_location_to_allen(token: str) -> models.Anatomy | None:
    """Match a single location token against Allen CCF structures.

    Tries exact acronym, case-insensitive acronym, exact name, and
    case-insensitive name, in that order.

    Returns
    -------
    models.Anatomy or None
    """
    acr_exact, acr_folded, nm_exact, nm_folded = _build_lookup_dicts()
    query = token.strip()
    if not query:
        return None

    # Most specific first: acronyms before full names, exact before folded.
    for table, key in (
        (acr_exact, query),
        (acr_folded, query.lower()),
        (nm_exact, query),
        (nm_folded, query.lower()),
    ):
        structure = table.get(key)
        if structure is not None:
            return _structure_to_anatomy(structure)

    lgr.debug("Could not match brain location %r to Allen CCF", query)
    return None


def _structure_to_anatomy(s: dict[str, Any]) -> models.Anatomy:
    """Build an Anatomy record with the MBA ontology URI for structure *s*."""
    uri = MBAO_URI_TEMPLATE.format(s["id"])
    return models.Anatomy(identifier=uri, name=s["name"])


def locations_to_anatomy(locations: list[str]) -> list[models.Anatomy]:
    """Convert raw NWB location strings to a deduplicated Anatomy list.

    Parameters
    ----------
    locations : list[str]
        Raw location strings from NWB file.

    Returns
    -------
    list[models.Anatomy]
        Matched and deduplicated anatomy entries, in first-seen order.
    """
    matched: list[models.Anatomy] = []
    seen: set[str] = set()
    for raw in locations:
        for token in _parse_location_string(raw):
            entry = match_location_to_allen(token)
            if entry is None:
                continue
            key = str(entry.identifier)
            if key not in seen:
                seen.add(key)
                matched.append(entry)
    return matched
59 changes: 56 additions & 3 deletions dandi/metadata/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,8 @@ def extract_assay_type(metadata: dict) -> list[models.AssayType] | None:


def extract_anatomy(metadata: dict) -> list[models.Anatomy] | None:
# Anatomy is now populated via wasDerivedFrom BioSample.
# This extractor is kept only as a fallback for the top-level "anatomy" field.
if "anatomy" in metadata:
return [models.Anatomy(identifier="anatomy", name=metadata["anatomy"])]
else:
Expand Down Expand Up @@ -552,22 +554,73 @@ def func(metadata: dict) -> list[M]:

def extract_wasDerivedFrom(metadata: dict) -> list[models.BioSample] | None:
    """Build the BioSample derivation chain from tissue/slice/cell identifiers.

    Samples are chained coarse-to-fine: each later sample points back at the
    previous one via ``wasDerivedFrom``, and the returned list holds the
    finest sample.  When mouse brain locations yield anatomy terms, they are
    attached to the deepest (first-created) sample in the chain, or to a
    synthetic tissue sample when no identifier fields are present.

    Returns None when no sample identifiers and no anatomy are found.
    """
    derived_from: list[models.BioSample] | None = None
    deepest: models.BioSample | None = None
    for field, sample_name in [
        ("tissue_sample_id", "tissuesample"),
        ("slice_id", "slice"),
        ("cell_id", "cell"),
    ]:
        if metadata.get(field) is not None:
            sample = models.BioSample(
                identifier=metadata[field],
                wasDerivedFrom=derived_from,
                sampleType=models.SampleType(name=sample_name),
            )
            derived_from = [sample]
            if deepest is None:
                deepest = sample

    # Compute anatomy from brain locations (mouse only)
    anatomy = _extract_brain_anatomy(metadata)
    if anatomy:
        if deepest is not None:
            # Add anatomy to the deepest (first created) BioSample
            deepest.anatomy = anatomy
        else:
            # No existing chain — create a new BioSample for the anatomy
            derived_from = [
                models.BioSample(
                    identifier="brain-region-sample",
                    sampleType=models.SampleType(name="tissuesample"),
                    anatomy=anatomy,
                )
            ]

    return derived_from


def _is_mouse(metadata: dict) -> bool:
    """Return True if the species in *metadata* is mouse.

    Reuses the canonical mouse entry from *species_map* to match against
    common names, the Latin prefix, and the NCBITaxon identifier.
    """
    species = metadata.get("species")
    if not species:
        return False
    needle = str(species).lower()
    # species_map[0] = (["mouse"], "mus", NCBITaxon_10090 URI, full name)
    common_names, prefix, uri, _name = species_map[0]
    if "10090" in needle or uri.lower() in needle:
        return True
    if prefix is not None and needle.startswith(prefix):
        return True
    return any(name in needle for name in common_names)


def _extract_brain_anatomy(metadata: dict) -> list[models.Anatomy]:
    """Extract brain anatomy from metadata, if the species is mouse."""
    # Imported locally to avoid a circular import at module load time.
    from .brain_areas import locations_to_anatomy

    locations = metadata.get("brain_locations")
    if not locations or not _is_mouse(metadata):
        return []
    return locations_to_anatomy(locations)


extract_wasAttributedTo = extract_model_list(
models.Participant, "identifier", "subject_id", id=None
)
Expand Down
Loading
Loading