import numpy as np
from numpy.typing import NDArray
import atdata
@@ -681,7 +681,7 @@ 1. Define a Sample Ty
2. Create and Write Samples
Use WebDataset’s standard TarWriter:
-
+
import webdataset as wds
samples = [
@@ -701,7 +701,7 @@ 2. Create and Wri
3. Load and Iterate with Type Safety
The generic Dataset[T] provides typed access:
-
+
dataset = atdata.Dataset[ImageSample]("data-000000.tar")
for batch in dataset.shuffled(batch_size=32):
@@ -716,7 +716,7 @@ Scaling Up
Team Storage with Redis + S3
When you’re ready to share with your team:
-
+
from atdata.local import LocalIndex, S3DataStore
# Connect to team infrastructure
@@ -740,7 +740,7 @@ Team Storage wi
Federation with ATProto
For public or cross-organization sharing:
-
+
from atdata.atmosphere import AtmosphereClient, AtmosphereIndex, PDSBlobStore
from atdata.promote import promote_to_atmosphere
@@ -762,7 +762,7 @@ Federation with AT
HuggingFace-Style Loading
For convenient access to datasets:
-
+
from atdata import load_dataset
# Load from local files
diff --git a/docs/reference/architecture.html b/docs/reference/architecture.html
index 18b2fcf..e7ea4d2 100644
--- a/docs/reference/architecture.html
+++ b/docs/reference/architecture.html
@@ -657,7 +657,7 @@ Core Components
PackableSample: The Foundation
Everything in atdata starts with PackableSample—a base class that makes Python dataclasses serializable with msgpack:
-
+
@atdata.packable
class ImageSample:
image: NDArray # Automatically converted to/from bytes
@@ -680,7 +680,7 @@ PackableSamp
Dataset: Typed Iteration
The Dataset[T] class wraps WebDataset tar archives with type information:
-
+
dataset = atdata.Dataset[ImageSample]("data-{000000..000009}.tar")
for batch in dataset.shuffled(batch_size=32):
@@ -704,7 +704,7 @@ Dataset: Typed Ite
SampleBatch: Automatic Aggregation
When iterating with batch_size, atdata returns SampleBatch[T] objects that aggregate sample attributes:
-
+
batch = SampleBatch[ImageSample](samples)
# NDArray fields → stacked numpy array with batch dimension
@@ -718,7 +718,7 @@ SampleBa
Lens: Schema Transformations
Lenses enable viewing datasets through different schemas without duplicating data:
-
+
@atdata.packable
class SimplifiedSample:
label: str
@@ -755,7 +755,7 @@ Local Index (Redis +
- WebDataset tar shards
- Any S3-compatible storage (AWS, MinIO, Cloudflare R2)
-
+
store = S3DataStore(credentials=creds, bucket="datasets")
index = LocalIndex(data_store=store)
@@ -783,7 +783,7 @@ Atmosphere Index
- Store actual data shards as ATProto blobs
- Fully decentralized—no external dependencies
-
+
client = AtmosphereClient()
client.login("handle.bsky.social", "app-password")
@@ -801,7 +801,7 @@ Protocol Abstraction
AbstractIndex
Common interface for both LocalIndex and AtmosphereIndex:
-
+
def process_dataset(index: AbstractIndex, name: str):
entry = index.get_dataset(name)
schema = index.decode_schema(entry.schema_ref)
@@ -817,7 +817,7 @@ AbstractIndex
AbstractDataStore
Common interface for S3DataStore and PDSBlobStore:
-
+
def write_to_store(store: AbstractDataStore, dataset: Dataset):
urls = store.write_shards(dataset, prefix="data/v1")
# Works with S3 or PDS blob storage
@@ -838,7 +838,7 @@ Data Flow: L
A typical workflow progresses through three stages:
Stage 1: Local Development
-
+
# Define type and create samples
@atdata.packable
class MySample:
@@ -856,7 +856,7 @@ Stage 1: Local D
Stage 2: Team Storage
-
+
# Set up team storage
store = S3DataStore(credentials=team_creds, bucket="team-datasets")
index = LocalIndex(data_store=store)
@@ -871,7 +871,7 @@ Stage 2: Team Storage
Stage 3: Federation
-
+
# Promote to atmosphere
client = AtmosphereClient()
client.login("handle.bsky.social", "app-password")
@@ -904,7 +904,7 @@ Extension Points
Custom DataSources
Implement the DataSource protocol to add new storage backends:
-
+
class MyCustomSource:
def list_shards(self) -> list[str]: ...
def open_shard(self, shard_id: str) -> IO[bytes]: ...
@@ -916,7 +916,7 @@ Custom DataSources
Custom Lenses
Register transformations between any PackableSample types:
-
+
@atdata.lens
def my_transform(src: SourceType) -> TargetType:
return TargetType(...)
@@ -929,7 +929,7 @@ Custom Lenses
Schema Extensions
The schema format supports custom metadata for domain-specific needs:
-
+
index.publish_schema(
MySample,
version="1.0.0",
diff --git a/docs/reference/atmosphere.html b/docs/reference/atmosphere.html
index 71c6ce9..11a6ce0 100644
--- a/docs/reference/atmosphere.html
+++ b/docs/reference/atmosphere.html
@@ -626,7 +626,7 @@ Overview
AtmosphereClient
The client handles authentication and record operations:
-
+
from atdata.atmosphere import AtmosphereClient
client = AtmosphereClient()
@@ -653,7 +653,7 @@ AtmosphereClient
Session Management
Save and restore sessions to avoid re-authentication:
-
+
# Export session for later
session_string = client.export_session()
@@ -665,7 +665,7 @@ Session Management
Custom PDS
Connect to a custom PDS instead of bsky.social:
-
+
client = AtmosphereClient(base_url="https://pds.example.com")
@@ -673,7 +673,7 @@ Custom PDS
PDSBlobStore
Store dataset shards as ATProto blobs for fully decentralized storage:
-
+
from atdata.atmosphere import AtmosphereClient, PDSBlobStore
client = AtmosphereClient()
@@ -696,7 +696,7 @@ PDSBlobStore
Size Limits
PDS blobs typically have size limits (often 50 MB–5 GB, depending on the PDS). Use the maxcount and maxsize parameters to control shard sizes:
-
+
urls = store.write_shards(
dataset,
prefix="large-data/v1",
@@ -709,7 +709,7 @@ Size Limits
BlobSource
Read datasets stored as PDS blobs:
-
+