Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ just test # Run all tests with coverage
just test tests/test_dataset.py # Run specific test file
just lint # Run ruff check + format check
just docs # Build documentation (runs quartodoc + quarto)
just sync-lexicons # Fetch latest lexicons from atdata-lexicon repo
just sync-lexicons ref=v1.0 # Pin to a specific tag/branch
just sync-lexicons-local # Sync vendored → package (offline, no fetch)
just bench # Run full benchmark suite
just bench-io # Run I/O benchmarks only
just bench-index # Run index provider benchmarks
Expand Down
47 changes: 44 additions & 3 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,55 @@ setup:
git config core.hooksPath .githooks
@echo "Git hooks activated from .githooks/"

# Copy top-level lexicon JSON files from the vendored lexicons/ directory
# into the Python package tree so they ship with the built wheel.
sync-lexicons:
    cp lexicons/*.json src/atdata/lexicons/
# Fetch latest lexicons from atdata-lexicon repo and sync to both local copies.
# Uses gh CLI to download a tarball — no clone needed.
# Pass ref=<branch/tag> to pin a specific version (default: main).
sync-lexicons ref="main":
    #!/usr/bin/env bash
    set -euo pipefail
    REPO="forecast-bio/atdata-lexicon"
    VENDOR="lexicons/science/alt/dataset"
    PKG="src/atdata/lexicons/science/alt/dataset"
    # Lowercase name on purpose: TMPDIR is a well-known environment variable
    # consulted by mktemp and other tools; shadowing it invites confusion.
    tmpdir=$(mktemp -d)
    trap 'rm -rf -- "$tmpdir"' EXIT
    echo "Fetching lexicons from $REPO@{{ref}}..."
    gh api "repos/$REPO/tarball/{{ref}}" > "$tmpdir/archive.tar.gz"
    tar xzf "$tmpdir/archive.tar.gz" -C "$tmpdir" --strip-components=1
    # Ensure destination directories exist so a fresh checkout doesn't fail.
    mkdir -p "$VENDOR" "$PKG"
    # Copy NSID lexicons to both vendor and package directories.
    # Guard the glob: with no matches the literal pattern would reach cp and
    # abort the recipe under `set -e` (the shim loop below already guards).
    for f in "$tmpdir/lexicons/science/alt/dataset/"*.json; do
        [ -f "$f" ] || continue
        name=$(basename "$f")
        cp -- "$f" "$VENDOR/$name"
        cp -- "$f" "$PKG/$name"
        echo "  synced $name"
    done
    echo "Lexicons synced from $REPO@{{ref}}"
    # Also sync top-level shim files if they exist upstream; only refresh
    # shims already present in the package tree (explicit `if` instead of a
    # trailing `&&` list, which leans on a subtle `set -e` exemption).
    for f in "$tmpdir/lexicons/"*.json; do
        [ -f "$f" ] || continue
        name=$(basename "$f")
        cp -- "$f" "lexicons/$name"
        if [ -f "src/atdata/lexicons/$name" ]; then
            cp -- "$f" "src/atdata/lexicons/$name"
        fi
    done

# Sync local vendored lexicons → package (no network, for offline use)
sync-lexicons-local:
    #!/usr/bin/env bash
    set -euo pipefail
    # Ensure the package destination exists on a fresh checkout.
    mkdir -p src/atdata/lexicons/science/alt/dataset
    # Guard the glob: under `set -e` an unmatched pattern would be passed
    # literally to cp and abort the recipe.
    for f in lexicons/science/alt/dataset/*.json; do
        [ -f "$f" ] || continue
        name=$(basename "$f")
        cp -- "$f" "src/atdata/lexicons/science/alt/dataset/$name"
    done
    # Top-level shims: only refresh files already present in the package tree
    # (explicit `if` rather than a trailing `&&` list under `set -e`).
    for f in lexicons/*.json; do
        [ -f "$f" ] || continue
        name=$(basename "$f")
        if [ -f "src/atdata/lexicons/$name" ]; then
            cp -- "$f" "src/atdata/lexicons/$name"
        fi
    done
    echo "Local lexicon sync complete"

# Regenerate the lexicon reference docs from the vendored lexicon JSON files.
gen-lexicon-docs:
    uv run python scripts/gen_lexicon_docs.py

# Run the test suite; refresh the packaged lexicons first using the offline
# sync so tests never require network access or the gh CLI.
test *args:
    just sync-lexicons-local
    uv run pytest {{args}}

lint:
Expand Down
34 changes: 34 additions & 0 deletions lexicons/science/alt/dataset/entry.json
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,44 @@
"contentMetadata": {
"type": "unknown",
"description": "Dataset-level content metadata (e.g., instrument settings, acquisition parameters). Structure is validated against the schema referenced by metadataSchemaRef when present. Stored as an open JSON object."
},
"manifests": {
"type": "array",
"description": "Per-shard manifest references for query-based access. Each entry pairs a shard with its manifest header blob and optional Parquet samples blob.",
"items": {
"type": "ref",
"ref": "#shardManifestRef"
},
"maxLength": 10000
}
}
}
},
"shardManifestRef": {
"type": "object",
"description": "References to manifest sidecar data for a single shard. The header contains schema info, sample count, and per-field aggregates. The samples file is a Parquet table with per-sample metadata for query-based access.",
"required": [
"header"
],
"properties": {
"header": {
"type": "blob",
"description": "Manifest JSON header blob containing shard-level metadata (schema, sample count, field aggregates)",
"accept": [
"application/json"
],
"maxSize": 1048576
},
"samples": {
"type": "blob",
"description": "Optional Parquet file with per-sample metadata for query-based filtering",
"accept": [
"application/octet-stream"
],
"maxSize": 104857600
}
}
},
"shardChecksum": {
"type": "object",
"description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",
Expand Down
34 changes: 34 additions & 0 deletions src/atdata/lexicons/science/alt/dataset/entry.json
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,44 @@
"contentMetadata": {
"type": "unknown",
"description": "Dataset-level content metadata (e.g., instrument settings, acquisition parameters). Structure is validated against the schema referenced by metadataSchemaRef when present. Stored as an open JSON object."
},
"manifests": {
"type": "array",
"description": "Per-shard manifest references for query-based access. Each entry pairs a shard with its manifest header blob and optional Parquet samples blob.",
"items": {
"type": "ref",
"ref": "#shardManifestRef"
},
"maxLength": 10000
}
}
}
},
"shardManifestRef": {
"type": "object",
"description": "References to manifest sidecar data for a single shard. The header contains schema info, sample count, and per-field aggregates. The samples file is a Parquet table with per-sample metadata for query-based access.",
"required": [
"header"
],
"properties": {
"header": {
"type": "blob",
"description": "Manifest JSON header blob containing shard-level metadata (schema, sample count, field aggregates)",
"accept": [
"application/json"
],
"maxSize": 1048576
},
"samples": {
"type": "blob",
"description": "Optional Parquet file with per-sample metadata for query-based filtering",
"accept": [
"application/octet-stream"
],
"maxSize": 104857600
}
}
},
"shardChecksum": {
"type": "object",
"description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",
Expand Down
Loading