Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ just test # Run all tests with coverage
just test tests/test_dataset.py # Run specific test file
just lint # Run ruff check + format check
just docs # Build documentation (runs quartodoc + quarto)
just sync-lexicons # Fetch latest lexicons from atdata-lexicon repo
just sync-lexicons ref=v1.0 # Pin to a specific tag/branch
just sync-lexicons-local # Sync vendored → package (offline, no fetch)
just bench # Run full benchmark suite
just bench-io # Run I/O benchmarks only
just bench-index # Run index provider benchmarks
Expand Down
47 changes: 44 additions & 3 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,55 @@ setup:
git config core.hooksPath .githooks
@echo "Git hooks activated from .githooks/"

# Copy top-level lexicon JSON files from the vendored lexicons/ directory
# into the Python package tree so they ship with the built wheel.
sync-lexicons:
    cp lexicons/*.json src/atdata/lexicons/
# Fetch latest lexicons from atdata-lexicon repo and sync to both local copies.
# Uses gh CLI to download a tarball — no clone needed.
# Pass ref=<branch/tag> to pin a specific version (default: main).
sync-lexicons ref="main":
    #!/usr/bin/env bash
    set -euo pipefail
    REPO="forecast-bio/atdata-lexicon"
    VENDOR="lexicons/science/alt/dataset"
    PKG="src/atdata/lexicons/science/alt/dataset"
    # Lowercase name on purpose: TMPDIR is a well-known environment variable
    # consulted by mktemp and other tools; shadowing it invites confusion.
    tmpdir=$(mktemp -d)
    trap 'rm -rf -- "$tmpdir"' EXIT
    echo "Fetching lexicons from $REPO@{{ref}}..."
    gh api "repos/$REPO/tarball/{{ref}}" > "$tmpdir/archive.tar.gz"
    tar xzf "$tmpdir/archive.tar.gz" -C "$tmpdir" --strip-components=1
    # Ensure destination directories exist so a fresh checkout doesn't fail.
    mkdir -p "$VENDOR" "$PKG"
    # Copy NSID lexicons to both vendor and package directories.
    # Guard the glob: with no matches the literal pattern would reach cp and
    # abort the recipe under `set -e` (the shim loop below already guards).
    for f in "$tmpdir/lexicons/science/alt/dataset/"*.json; do
        [ -f "$f" ] || continue
        name=$(basename "$f")
        cp -- "$f" "$VENDOR/$name"
        cp -- "$f" "$PKG/$name"
        echo "  synced $name"
    done
    echo "Lexicons synced from $REPO@{{ref}}"
    # Also sync top-level shim files if they exist upstream; only refresh
    # shims already present in the package tree (explicit `if` instead of a
    # trailing `&&` list, which leans on a subtle `set -e` exemption).
    for f in "$tmpdir/lexicons/"*.json; do
        [ -f "$f" ] || continue
        name=$(basename "$f")
        cp -- "$f" "lexicons/$name"
        if [ -f "src/atdata/lexicons/$name" ]; then
            cp -- "$f" "src/atdata/lexicons/$name"
        fi
    done

# Sync local vendored lexicons → package (no network, for offline use)
sync-lexicons-local:
    #!/usr/bin/env bash
    set -euo pipefail
    # Ensure the package destination exists on a fresh checkout.
    mkdir -p src/atdata/lexicons/science/alt/dataset
    # Guard the glob: under `set -e` an unmatched pattern would be passed
    # literally to cp and abort the recipe.
    for f in lexicons/science/alt/dataset/*.json; do
        [ -f "$f" ] || continue
        name=$(basename "$f")
        cp -- "$f" "src/atdata/lexicons/science/alt/dataset/$name"
    done
    # Top-level shims: only refresh files already present in the package tree
    # (explicit `if` rather than a trailing `&&` list under `set -e`).
    for f in lexicons/*.json; do
        [ -f "$f" ] || continue
        name=$(basename "$f")
        if [ -f "src/atdata/lexicons/$name" ]; then
            cp -- "$f" "src/atdata/lexicons/$name"
        fi
    done
    echo "Local lexicon sync complete"

# Regenerate the lexicon reference docs from the vendored lexicon JSON files.
gen-lexicon-docs:
    uv run python scripts/gen_lexicon_docs.py

# Run the test suite; refresh the packaged lexicons first using the offline
# sync so tests never require network access or the gh CLI.
test *args:
    just sync-lexicons-local
    uv run pytest {{args}}

lint:
Expand Down
34 changes: 34 additions & 0 deletions lexicons/science/alt/dataset/entry.json
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,44 @@
"contentMetadata": {
"type": "unknown",
"description": "Dataset-level content metadata (e.g., instrument settings, acquisition parameters). Structure is validated against the schema referenced by metadataSchemaRef when present. Stored as an open JSON object."
},
"manifests": {
"type": "array",
"description": "Per-shard manifest references for query-based access. Each entry pairs a shard with its manifest header blob and optional Parquet samples blob.",
"items": {
"type": "ref",
"ref": "#shardManifestRef"
},
"maxLength": 10000
}
}
}
},
"shardManifestRef": {
"type": "object",
"description": "References to manifest sidecar data for a single shard. The header contains schema info, sample count, and per-field aggregates. The samples file is a Parquet table with per-sample metadata for query-based access.",
"required": [
"header"
],
"properties": {
"header": {
"type": "blob",
"description": "Manifest JSON header blob containing shard-level metadata (schema, sample count, field aggregates)",
"accept": [
"application/json"
],
"maxSize": 1048576
},
"samples": {
"type": "blob",
"description": "Optional Parquet file with per-sample metadata for query-based filtering",
"accept": [
"application/octet-stream"
],
"maxSize": 104857600
}
}
},
"shardChecksum": {
"type": "object",
"description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",
Expand Down
34 changes: 34 additions & 0 deletions src/atdata/lexicons/science/alt/dataset/entry.json
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,44 @@
"contentMetadata": {
"type": "unknown",
"description": "Dataset-level content metadata (e.g., instrument settings, acquisition parameters). Structure is validated against the schema referenced by metadataSchemaRef when present. Stored as an open JSON object."
},
"manifests": {
"type": "array",
"description": "Per-shard manifest references for query-based access. Each entry pairs a shard with its manifest header blob and optional Parquet samples blob.",
"items": {
"type": "ref",
"ref": "#shardManifestRef"
},
"maxLength": 10000
}
}
}
},
"shardManifestRef": {
"type": "object",
"description": "References to manifest sidecar data for a single shard. The header contains schema info, sample count, and per-field aggregates. The samples file is a Parquet table with per-sample metadata for query-based access.",
"required": [
"header"
],
"properties": {
"header": {
"type": "blob",
"description": "Manifest JSON header blob containing shard-level metadata (schema, sample count, field aggregates)",
"accept": [
"application/json"
],
"maxSize": 1048576
},
"samples": {
"type": "blob",
"description": "Optional Parquet file with per-sample metadata for query-based filtering",
"accept": [
"application/octet-stream"
],
"maxSize": 104857600
}
}
},
"shardChecksum": {
"type": "object",
"description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",
Expand Down
Loading