Skip to content

Commit 5c9cde8

Browse files
Merge pull request #18 from forecast-bio/release/v0.3.4b1
release: v0.3.4b1
2 parents 16d267d + e32bf0a commit 5c9cde8

16 files changed

Lines changed: 1403 additions & 16 deletions

File tree

.chainlink/issues.db

0 Bytes
Binary file not shown.

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66

77
## [Unreleased]
88

9+
## [0.3.4b1] - 2026-02-04
10+
11+
### Added
12+
- **Content checksums**: Per-shard SHA-256 digests computed at write time across all storage backends (`LocalDiskStore`, `S3DataStore`, `PDSBlobStore`). Checksums are carried via `ShardWriteResult` and automatically merged into index entry metadata
13+
- **`verify_checksums()`**: Utility function to verify stored checksums against shard files on disk; remote URLs (`s3://`, `at://`, `http://`) are gracefully skipped
14+
- **`atdata verify` CLI command**: Verify content integrity of indexed datasets from the command line
15+
- **AT URI support in `load_dataset()`**: `load_dataset("at://did:plc:abc/.../rkey")` now fetches dataset records from ATProto and resolves storage (blobs, HTTP, S3) into streamable datasets with automatic schema decoding
16+
- **Lens composition operators**: `@` (compose) and `|` (pipe) operators for chaining lenses, plus `identity_lens()` factory for pass-through transforms
17+
918
## [0.3.3b2] - 2026-02-04
1019

1120
### Testing

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "atdata"
3-
version = "0.3.3b2"
3+
version = "0.3.4b1"
44
description = "A loose federation of distributed, typed datasets"
55
readme = "README.md"
66
authors = [

src/atdata/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@
106106
LocalDiskStore as LocalDiskStore,
107107
)
108108

109+
from ._helpers import (
110+
verify_checksums as verify_checksums,
111+
ShardWriteResult as ShardWriteResult,
112+
)
113+
109114
from ._cid import (
110115
generate_cid as generate_cid,
111116
verify_cid as verify_cid,

src/atdata/_helpers.py

Lines changed: 128 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,37 @@
1-
"""Helper utilities for numpy array serialization.
1+
"""Helper utilities for numpy array serialization and content checksums.
22
33
This module provides utility functions for converting numpy arrays to and from
4-
bytes for msgpack serialization.
4+
bytes for msgpack serialization, as well as SHA-256 checksum utilities for
5+
verifying dataset shard integrity.
56
67
Functions:
78
- ``array_to_bytes()``: Serialize numpy array to bytes
89
- ``bytes_to_array()``: Deserialize bytes to numpy array
10+
- ``sha256_file()``: Compute SHA-256 hex digest of a file
11+
- ``sha256_bytes()``: Compute SHA-256 hex digest of in-memory bytes
12+
- ``verify_checksums()``: Verify stored checksums against shard data
913
10-
These helpers are used internally by ``PackableSample`` to enable transparent
11-
handling of NDArray fields during msgpack packing/unpacking.
14+
Classes:
15+
- ``ShardWriteResult``: ``list[str]`` subclass carrying per-shard checksums
1216
"""
1317

18+
from __future__ import annotations
19+
1420
##
1521
# Imports
1622

23+
import hashlib
1724
import struct
1825
from io import BytesIO
26+
from typing import TYPE_CHECKING
1927

2028
import numpy as np
2129

30+
if TYPE_CHECKING:
31+
from pathlib import Path
32+
33+
from atdata._protocols import IndexEntry
34+
2235
# .npy format magic prefix (used for backward-compatible deserialization)
2336
_NPY_MAGIC = b"\x93NUMPY"
2437

@@ -84,3 +97,114 @@ def bytes_to_array(b: bytes) -> np.ndarray:
8497
shape = struct.unpack_from(f"<{ndim}q", b, offset)
8598
offset += ndim * 8
8699
return np.frombuffer(b, dtype=dtype, offset=offset).reshape(shape).copy()
100+
101+
102+
##
103+
# Checksum utilities
104+
105+
106+
def sha256_file(path: str | Path, *, chunk_size: int = 8192) -> str:
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is consumed in fixed-size chunks, so arbitrarily large shard
    files can be hashed with constant memory.

    Args:
        path: Path to the file.
        chunk_size: Read buffer size in bytes.

    Returns:
        Hex-encoded SHA-256 digest string (64 characters).

    Examples:
        >>> digest = sha256_file("/path/to/shard.tar")
        >>> len(digest)
        64
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter(callable, sentinel) yields chunks until read() returns b"".
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()
129+
130+
131+
def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 hex digest of in-memory ``data``.

    Args:
        data: Raw bytes to hash.

    Returns:
        Hex-encoded SHA-256 digest string (64 characters).

    Examples:
        >>> sha256_bytes(b"hello")
        '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824'
    """
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
145+
146+
147+
class ShardWriteResult(list):
    """List of shard URLs that also carries per-shard SHA-256 checksums.

    Subclassing ``list[str]`` keeps this compatible with the
    ``AbstractDataStore.write_shards()`` return type (``list[str]``) while
    attaching checksum metadata alongside the URLs.

    Attributes:
        checksums: Dict mapping each shard URL to its SHA-256 hex digest.

    Examples:
        >>> result = ShardWriteResult(["shard-0.tar"], {"shard-0.tar": "abcd..."})
        >>> result[0]
        'shard-0.tar'
        >>> result.checksums["shard-0.tar"]
        'abcd...'
    """

    # URL -> SHA-256 hex digest for every shard written
    checksums: dict[str, str]

    def __init__(self, urls: list[str], checksums: dict[str, str]) -> None:
        self.checksums = checksums
        super().__init__(urls)
169+
170+
171+
def verify_checksums(entry: "IndexEntry") -> dict[str, str]:
    """Check stored SHA-256 checksums for every shard of an index entry.

    Stored digests (from ``entry.metadata["checksums"]``) are compared
    against freshly computed ones. A shard with no stored checksum, or one
    living at a remote URL (``s3://``, ``at://``, ``http://``, ``https://``),
    is reported as ``"skipped"`` rather than verified — only local file
    paths can be hashed here.

    Args:
        entry: An IndexEntry with ``data_urls`` and optional metadata checksums.

    Returns:
        Dict mapping each shard URL to one of:
        ``"ok"``, ``"mismatch"``, ``"skipped"``, or ``"error:<message>"``.

    Examples:
        >>> results = verify_checksums(entry)
        >>> assert all(v == "ok" for v in results.values())
    """
    metadata = entry.metadata or {}
    stored: dict[str, str] = metadata.get("checksums", {})

    remote_prefixes = ("s3://", "at://", "http://", "https://")
    results: dict[str, str] = {}
    for url in entry.data_urls:
        if url not in stored:
            # Nothing recorded for this shard — nothing to compare against.
            results[url] = "skipped"
        elif url.startswith(remote_prefixes):
            # Remote shards cannot be hashed locally.
            results[url] = "skipped"
        else:
            try:
                actual = sha256_file(url)
            except Exception as e:
                # Unreadable/missing file: surface the reason, keep going.
                results[url] = f"error:{e}"
            else:
                results[url] = "ok" if actual == stored[url] else "mismatch"
    return results

src/atdata/_hf_api.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848

4949
if TYPE_CHECKING:
5050
from ._protocols import AbstractIndex
51+
from .atmosphere.client import Atmosphere
5152

5253
##
5354
# Type variables
@@ -478,6 +479,118 @@ def _group_shards_by_split(shards: list[str]) -> dict[str, list[str]]:
478479
# Index-based path resolution
479480

480481

482+
def _is_at_uri(path: str) -> bool:
483+
"""Check if path is an AT Protocol URI (at://...).
484+
485+
Examples:
486+
>>> _is_at_uri("at://did:plc:abc123/ac.foundation.dataset.record/my-ds")
487+
True
488+
>>> _is_at_uri("@local/my-dataset")
489+
False
490+
"""
491+
return path.startswith("at://")
492+
493+
494+
def _resolve_at_uri(
    path: str,
    sample_type: Type[ST] | None = None,
    client: "Atmosphere | None" = None,
) -> tuple[Dataset, Type]:
    """Resolve an AT URI to a Dataset by fetching the record from ATProto.

    Fetches the dataset record once, determines storage type (blobs, HTTP, S3),
    resolves shard URLs, and optionally decodes the schema to reconstruct
    the sample type.

    Args:
        path: AT URI pointing to a dataset record.
        sample_type: Optional sample type class. If None, the schema is
            decoded from the referenced schema record.
        client: Optional Atmosphere client. If None, an unauthenticated
            client is created for public record access.

    Returns:
        Tuple of (Dataset, resolved_type).

    Raises:
        ValueError: If the record is not a dataset record, has no
            resolvable storage, or uses an unknown storage type.
    """
    # Imported lazily — presumably to avoid import cycles at module load;
    # TODO confirm against the package's import graph.
    from .atmosphere.client import Atmosphere
    from .atmosphere._types import AtUri, LEXICON_NAMESPACE
    from ._sources import BlobSource

    # No client supplied: fall back to an unauthenticated client, which is
    # sufficient for fetching public records.
    if client is None:
        client = Atmosphere()

    # Single fetch — all routing derived from this dict
    record = client.get_record(path)
    expected_type = f"{LEXICON_NAMESPACE}.record"
    if record.get("$type") != expected_type:
        raise ValueError(
            f"Record at {path} is not a dataset record. "
            f"Expected $type='{expected_type}', got '{record.get('$type')}'"
        )

    storage = record.get("storage", {})
    storage_type = storage.get("$type", "")

    # Storage variants are matched by substring of the $type string
    # (e.g. "...storageBlobs"), not by exact equality.
    if "storageBlobs" in storage_type:
        # Blob storage: shards live in the repo owner's PDS, addressed by
        # (did, cid) pairs extracted from each blob ref.
        parsed = AtUri.parse(path)
        did = parsed.authority
        refs = []
        for entry in storage.get("blobs", []):
            # Each entry may be a {"blob": {...}} wrapper or the blob dict
            # itself; entry.get("blob", entry) accepts both shapes.
            blob = entry.get("blob", entry)
            ref = blob.get("ref", {})
            # Ref may be a dict ({"$link": cid}) or an object stringifiable
            # to the CID.
            cid = ref.get("$link") if isinstance(ref, dict) else str(ref)
            if cid:
                refs.append({"did": did, "cid": cid})
        # NOTE(review): unlike the other branches, an empty ``refs`` list
        # does not raise here — confirm whether that is intentional.
        # Uses a private client method to locate the PDS for this DID.
        pds_endpoint = client._resolve_pds_endpoint(did)
        source: DataSource = BlobSource(blob_refs=refs, pds_endpoint=pds_endpoint)
    elif "storageHttp" in storage_type:
        urls = [s["url"] for s in storage.get("shards", [])]
        if not urls:
            raise ValueError(f"Dataset record at {path} has no storage URLs")
        source = URLSource(_shards_to_wds_url(urls))
    elif "storageS3" in storage_type:
        bucket = storage.get("bucket", "")
        endpoint = storage.get("endpoint")
        urls = []
        for s in storage.get("shards", []):
            if endpoint:
                # Custom endpoint present: build an HTTP(S)-style URL.
                urls.append(f"{endpoint.rstrip('/')}/{bucket}/{s['key']}")
            else:
                # No endpoint: emit a plain s3:// URL.
                urls.append(f"s3://{bucket}/{s['key']}")
        if not urls:
            raise ValueError(f"Dataset record at {path} has no storage URLs")
        source = URLSource(_shards_to_wds_url(urls))
    elif "storageExternal" in storage_type:
        urls = storage.get("urls", [])
        if not urls:
            raise ValueError(f"Dataset record at {path} has no storage URLs")
        source = URLSource(_shards_to_wds_url(urls))
    else:
        raise ValueError(f"Unknown storage type in dataset record: {storage_type}")

    # Resolve sample type from the already-fetched record
    if sample_type is None:
        schema_ref = record.get("schemaRef")
        if schema_ref:
            # Deferred imports again; only needed when decoding a schema.
            from .atmosphere.schema import SchemaLoader
            from ._schema_codec import schema_to_type

            schema_loader = SchemaLoader(client)
            schema_record = schema_loader.get(schema_ref)
            resolved_type = schema_to_type(schema_record)
        else:
            # No schema reference: fall back to untyped dict samples.
            resolved_type = DictSample
    else:
        resolved_type = sample_type

    ds = Dataset[resolved_type](source)
    return ds, resolved_type
592+
593+
481594
def _is_indexed_path(path: str) -> bool:
482595
"""Check if path uses @handle/dataset notation for index lookup.
483596
@@ -680,6 +793,7 @@ def load_dataset(
680793
681794
Args:
682795
path: Path to dataset. Can be:
796+
- AT URI: "at://did:plc:abc/ac.foundation.dataset.record/rkey"
683797
- Index lookup: "@handle/dataset-name" or "@local/dataset-name"
684798
- WebDataset brace notation: "path/to/{train,test}-{000..099}.tar"
685799
- Local directory: "./data/" (scans for .tar files)
@@ -746,6 +860,18 @@ def load_dataset(
746860
sample_type.__name__ if sample_type is not None else "None",
747861
)
748862

863+
# Handle at:// AT Protocol URI resolution
864+
if _is_at_uri(path):
865+
log.debug("load_dataset: resolving AT URI %s", path)
866+
ds, resolved_type = _resolve_at_uri(path, sample_type)
867+
868+
if split is not None:
869+
return ds
870+
871+
return DatasetDict(
872+
{"train": ds}, sample_type=resolved_type, streaming=streaming
873+
)
874+
749875
# Handle @handle/dataset indexed path resolution
750876
if _is_indexed_path(path):
751877
if index is None:

0 commit comments

Comments
 (0)