From 2f80dcf9b7947ed9ce162f32b2c10b361699b0e9 Mon Sep 17 00:00:00 2001
From: mask
Date: Fri, 6 Feb 2026 16:32:58 -0600
Subject: [PATCH 01/58] docs(ray-docs): align API docs and examples with
current code
---
ray-docs/src/lib/docs.ts | 2 +-
ray-docs/src/routes/docs/$.tsx | 2 +-
ray-docs/src/routes/docs/api/$.tsx | 76 ++++--
.../src/routes/docs/getting-started/$.tsx | 8 +-
.../docs/getting-started/installation.tsx | 6 +-
ray-docs/src/routes/docs/guides/$.tsx | 252 +++++++-----------
.../routes/docs/internals/-performance.tsx | 64 +++--
ray-docs/src/routes/docs/internals/-wal.tsx | 12 +-
ray-docs/src/routes/index.tsx | 26 +-
9 files changed, 208 insertions(+), 240 deletions(-)
diff --git a/ray-docs/src/lib/docs.ts b/ray-docs/src/lib/docs.ts
index 6f29200..f67f94d 100644
--- a/ray-docs/src/lib/docs.ts
+++ b/ray-docs/src/lib/docs.ts
@@ -85,7 +85,7 @@ export const docsStructure: DocSection[] = [
},
{
title: "Low-Level API",
- description: "Direct storage access",
+ description: "Direct database primitives",
slug: "api/low-level",
},
{
diff --git a/ray-docs/src/routes/docs/$.tsx b/ray-docs/src/routes/docs/$.tsx
index 08ffcc4..ec68050 100644
--- a/ray-docs/src/routes/docs/$.tsx
+++ b/ray-docs/src/routes/docs/$.tsx
@@ -96,7 +96,7 @@ function DocPageContent(props: { slug: string }) {
traversals
- Vector search – HNSW-indexed similarity queries
+ Vector search – IVF-based similarity queries
Embedded – Runs in your process, no server needed
diff --git a/ray-docs/src/routes/docs/api/$.tsx b/ray-docs/src/routes/docs/api/$.tsx
index 5b8f41e..986780e 100644
--- a/ray-docs/src/routes/docs/api/$.tsx
+++ b/ray-docs/src/routes/docs/api/$.tsx
@@ -117,7 +117,7 @@ db.countEdges(follows)`}
Next Steps
@@ -128,46 +128,64 @@ db.countEdges(follows)`}
return (
- The low-level API provides direct access to the underlying storage
- engine for advanced use cases.
+ The low-level API uses the Database class for direct
+ graph operations, transaction control, and batched writes.
- Storage Access
+ Open and Write
Batch Operations
+db.addEdgesBatch(edges); // Array<{ src, etype, dst }>
+db.addEdgesWithPropsBatch(edgesWithProps);
+db.commit();
+
+// Optional maintenance checkpoint after ingest
+db.checkpoint();`}
language="typescript"
/>
- Iterators
+ Streaming and Pagination
diff --git a/ray-docs/src/routes/docs/getting-started/$.tsx b/ray-docs/src/routes/docs/getting-started/$.tsx
index a3dae51..61c6082 100644
--- a/ray-docs/src/routes/docs/getting-started/$.tsx
+++ b/ray-docs/src/routes/docs/getting-started/$.tsx
@@ -68,7 +68,7 @@ function DocPageContent(props: { slug: string }) {
typescript={`import { kite } from '@kitedb/core';
// Define schema inline when opening the database
-const db = kite('./social.kitedb', {
+const db = await kite('./social.kitedb', {
nodes: [
{
name: 'user',
@@ -151,7 +151,7 @@ let bob = db.insert("user")
.returning()?;
// Create a follow relationship
-db.link(alice.id, "follows", bob.id, Some(json!({
+db.link(alice.id(), "follows", bob.id(), Some(json!({
"followedAt": std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)?
.as_secs()
@@ -185,14 +185,14 @@ const followsBob = db.hasEdge(alice.id, 'follows', bob.id);
console.log('Alice follows Bob:', followsBob);`}
rust={`// Find all users Alice follows
let following = db
- .from(alice.id)
+ .from(alice.id())
.out(Some("follows"))
.nodes()?;
println!("Alice follows: {} users", following.len());
// Check if Alice follows Bob
-let follows_bob = db.has_edge(alice.id, "follows", bob.id)?;
+let follows_bob = db.has_edge(alice.id(), "follows", bob.id())?;
println!("Alice follows Bob: {}", follows_bob);`}
python={`# Find all users Alice follows
following = (db
diff --git a/ray-docs/src/routes/docs/getting-started/installation.tsx b/ray-docs/src/routes/docs/getting-started/installation.tsx
index 2eef324..d10105f 100644
--- a/ray-docs/src/routes/docs/getting-started/installation.tsx
+++ b/ray-docs/src/routes/docs/getting-started/installation.tsx
@@ -20,8 +20,8 @@ function InstallationPage() {
Requirements
- JavaScript/TypeScript: Bun 1.0+, Node.js 18+, or Deno
- Rust: Rust 1.70+
+ JavaScript/TypeScript: Bun 1.0+ or Node.js 16+
+ Rust: Stable Rust toolchain
Python: Python 3.9+
@@ -31,7 +31,7 @@ function InstallationPage() {
typescript={`import { kite } from '@kitedb/core';
// Open database with a simple schema
-const db = kite('./test.kitedb', {
+const db = await kite('./test.kitedb', {
nodes: [
{
name: 'user',
diff --git a/ray-docs/src/routes/docs/guides/$.tsx b/ray-docs/src/routes/docs/guides/$.tsx
index ae9dd14..5714007 100644
--- a/ray-docs/src/routes/docs/guides/$.tsx
+++ b/ray-docs/src/routes/docs/guides/$.tsx
@@ -67,7 +67,7 @@ function DocPageContent(props: { slug: string }) {
= db.all("user")?.collect();
// Count nodes
-let user_count = db.count_nodes(Some("user"))?;`}
+let user_count = db.count_nodes_by_type("user")?;`}
python={`# Get by key
user = db.get(user, "alice")
-# Get by node ID
-user_by_id = db.get_by_id(alice.id)
+# Get lightweight ref by key
+user_ref = db.get_ref(user, "alice")
# Check if exists
-exists = db.exists(alice.id)
+exists = alice is not None and db.exists(alice)
# List all nodes of a type
-all_users = db.all(user)
+all_users = list(db.all(user))
# Count nodes
-user_count = db.count_nodes("user")`}
+user_count = db.count(user)`}
/>
Updating Data
@@ -299,35 +301,33 @@ db.update(user, 'alice')
.unset('email')
.execute();`}
rust={`// Update by node ID
-db.update_by_id(alice.id)
- .set("name", "Alice C.")
+db.update_by_id(alice.id())
+ .set("name", PropValue::String("Alice C.".into()))
.execute()?;
// Update multiple properties
-db.update_by_id(alice.id)
- .set_all(json!({
- "name": "Alice Chen",
- "email": "newemail@example.com"
- }))
+db.update_by_id(alice.id())
+ .set("name", PropValue::String("Alice Chen".into()))
+ .set("email", PropValue::String("newemail@example.com".into()))
.execute()?;
// Remove a property
-db.update_by_id(alice.id)
+db.update_by_id(alice.id())
.unset("email")
.execute()?;`}
- python={`# Update by node ID
-(db.update_by_id(alice.id)
- .set("name", "Alice C.")
+ python={`# Update by node reference
+(db.update(alice)
+ .set(name="Alice C.")
.execute())
# Update multiple properties
-(db.update_by_id(alice.id)
- .set_all({"name": "Alice Chen", "email": "newemail@example.com"})
+(db.update(alice)
+ .set({"name": "Alice Chen", "email": "newemail@example.com"})
.execute())
-# Remove a property
-(db.update_by_id(alice.id)
- .unset("email")
+# Update another property
+(db.update(alice)
+ .set(email="newemail@example.com")
.execute())`}
/>
@@ -339,15 +339,19 @@ db.deleteById(alice.id);
// Delete by key
db.deleteByKey('user', 'alice');`}
rust={`// Delete by node ID
-db.delete_by_id(alice.id)?;
+db.delete_node(alice.id())?;
-// Delete by key
-db.delete_by_key("user", "alice")?;`}
- python={`# Delete by node ID
-db.delete_by_id(alice.id)
+// Delete by key (lookup then delete)
+if let Some(node) = db.get("user", "alice")? {
+ db.delete_node(node.id())?;
+}`}
+ python={`# Delete by node reference
+db.delete(alice)
-# Delete by key
-db.delete_by_key(user, "alice")`}
+# Delete by key (lookup then delete)
+node = db.get(user, "alice")
+if node is not None:
+ db.delete(node)`}
/>
Next Steps
@@ -388,19 +392,19 @@ const connections = db
.nodes();`}
rust={`// Find all users that Alice follows (outgoing edges)
let following = db
- .from(alice.id)
+ .from(alice.id())
.out(Some("follows"))
.nodes()?;
// Find all followers of Alice (incoming edges)
let followers = db
- .from(alice.id)
+ .from(alice.id())
.in_(Some("follows"))
.nodes()?;
// Follow edges in both directions
let connections = db
- .from(alice.id)
+ .from(alice.id())
.both(Some("knows"))
.nodes()?;`}
python={`# Find all users that Alice follows (outgoing edges)
@@ -442,14 +446,14 @@ const authorsOfLikedArticles = db
.nodes();`}
rust={`// Find friends of friends (2-hop)
let friends_of_friends = db
- .from(alice.id)
+ .from(alice.id())
.out(Some("follows"))
.out(Some("follows"))
.nodes()?;
// Chain different edge types
let authors_of_liked = db
- .from(alice.id)
+ .from(alice.id())
.out(Some("likes")) // Alice -> Articles
.in_(Some("authored")) // Articles <- Users
.nodes()?;`}
@@ -486,7 +490,7 @@ const topConnections = db
.nodes();`}
rust={`// Traverse 1-3 hops
let network = db
- .from(alice.id)
+ .from(alice.id())
.traverse(Some("follows"), TraverseOptions {
min_depth: Some(1),
max_depth: 3,
@@ -496,7 +500,7 @@ let network = db
// Limit results
let top_connections = db
- .from(alice.id)
+ .from(alice.id())
.out(Some("follows"))
.take(10)
.nodes()?;`}
@@ -574,7 +578,7 @@ index.set(doc.id, embedding);`}
let embedding: Vec = get_embedding("Your document content")?;
// Store the vector, associated with a node ID
-index.set(doc.id, &embedding)?;`}
+index.set(doc.id(), &embedding)?;`}
python={`# Generate embedding with your preferred provider
response = openai.embeddings.create(
model="text-embedding-ada-002",
@@ -641,13 +645,13 @@ index.buildIndex();
const stats = index.stats();
console.log(\`Total vectors: \${stats.totalVectors}\`);`}
rust={`// Check if a node has a vector
-let has_vector = index.has(doc.id)?;
+let has_vector = index.has(doc.id())?;
// Get a stored vector
-let vector = index.get(doc.id)?;
+let vector = index.get(doc.id())?;
// Delete a vector
-index.delete(doc.id)?;
+index.delete(doc.id())?;
// Build/rebuild the IVF index for faster search
index.build_index()?;
@@ -713,7 +717,7 @@ let mut db = Kite::open("./my.kitedb", options)?;
db.transaction(|ctx| {
let alice = ctx.create_node("user", "alice", HashMap::new())?;
let bob = ctx.create_node("user", "bob", HashMap::new())?;
- ctx.link(alice.id, "follows", bob.id)?;
+ ctx.link(alice.id(), "follows", bob.id())?;
Ok(())
})?;`}
python={`from kitedb import kite
@@ -793,7 +797,7 @@ db.commit()`}
Max throughput, single writer
- begin_bulk() + batch APIs
+ beginBulk() + batch APIs
Atomic ingest w/ MVCC
@@ -801,7 +805,7 @@ db.commit()`}
Multi-writer throughput
- sync_mode=Normal + group commit + chunked batches
+ syncMode: 'Normal' + group commit + chunked batches
@@ -955,7 +959,7 @@ if db.has_transaction():
Max ingest throughput, single writer
- begin_bulk() + batch APIs
+ beginBulk() + batch APIs
Atomic ingest with MVCC
@@ -963,15 +967,15 @@ if db.has_transaction():
Multi-writer throughput
- sync_mode=Normal + group commit (1-2ms)
+ syncMode: 'Normal' + group commit (1-2ms)
Strong durability per commit
- sync_mode=Full
+ syncMode: 'Full'
Throwaway or test data
- sync_mode=Off
+ syncMode: 'Off'
@@ -1020,32 +1024,32 @@ db.commit()`}
Single-writer ingest
- sync_mode=Normal, group_commit=false,
- WAL ≥ 256MB, auto_checkpoint=false
+ syncMode: 'Normal', groupCommitEnabled: false,
+ WAL ≥ 256MB, autoCheckpoint: false
Multi-writer throughput
- sync_mode=Normal, group_commit=true
+ syncMode: 'Normal', groupCommitEnabled: true
(1-2ms window), chunked batches
Max durability
- sync_mode=Full, smaller batches
+ syncMode: 'Full', smaller batches
Max speed (test)
- sync_mode=Off
+ syncMode: 'Off'
Checklist
- Use batch APIs: create_nodes_batch, add_edges_batch, add_edges_with_props_batch
- Prefer begin_bulk() for ingest; commit in chunks
+ Use batch APIs: createNodesBatch, addEdgesBatch, addEdgesWithPropsBatch
+ Prefer beginBulk() for ingest; commit in chunks
Increase WAL size for large ingest (256MB+)
Disable auto-checkpoint during ingest; checkpoint once at the end
Use low-level API for hot paths in JS/TS
@@ -1101,15 +1105,17 @@ const results = await Promise.all([
// Workers can read concurrently from the same database file`}
rust={`use std::sync::{Arc, RwLock};
use std::thread;
+use kitedb::api::kite::Kite;
-let db = Arc::new(RwLock::new(Kite::open("./data.kitedb")?));
+let db = Arc::new(RwLock::new(Kite::open("./data.kitedb", options)?));
let handles: Vec<_> = (0..4).map(|i| {
let db = Arc::clone(&db);
thread::spawn(move || {
// Multiple threads can acquire read locks simultaneously
+ let key = format!("user{}", i);
let guard = db.read().unwrap();
- guard.get_node(format!("user:{}", i))
+ guard.get("user", &key).ok().flatten()
})
}).collect();
@@ -1141,47 +1147,12 @@ for t in threads:
print(results)`}
/>
-
+
- Benchmarks show ~1.5-1.8x throughput improvement with 4-8 reader
- threads:
+ Read throughput typically improves with parallel readers, while write
+ throughput is constrained by serialized commit ordering. Measure with
+ your workload and tune batch sizes and sync mode accordingly.
-
-
-
- Threads
- Relative Throughput
- Notes
-
-
-
-
- 1
- 1.0x (baseline)
- Single-threaded
-
-
- 2
- ~1.3x
- Good scaling
-
-
- 4
- ~1.5-1.6x
- Sweet spot for most workloads
-
-
- 8
- ~1.6-1.8x
- Diminishing returns
-
-
- 16
- ~1.7-1.9x
- Lock contention increases
-
-
-
Best Practices
@@ -1203,63 +1174,44 @@ print(results)`}
- MVCC and Snapshot Isolation
+ MVCC and Transaction Semantics
- KiteDB uses Multi-Version Concurrency Control (MVCC) to provide
- snapshot isolation:
+ KiteDB uses Multi-Version Concurrency Control (MVCC) with serialized
+ writes:
- Readers never block writers
- Writers never block readers
+ Multiple readers can run concurrently
- Each transaction sees a consistent snapshot from its start time
+ A write waits for in-flight reads, then blocks new reads while it
+ commits
- Write conflicts are detected and one transaction is aborted
+ Each committed transaction is atomic
+ Write conflicts are detected at commit time
{
+ const alice = ctx.get(user, 'alice');
+ if (alice) {
+ ctx.update(user, 'alice')
+ .set('name', 'Alice Updated')
+ .execute();
+ }
+});`}
+ rust={`// Atomic transaction with TxContext
+db.transaction(|ctx| {
+ let alice = ctx.get("user", "alice")?;
+ if let Some(node) = alice {
+ ctx.set_prop(node.id(), "name", PropValue::String("Alice Updated".into()))?;
+ }
+ Ok(())
+})?;`}
+ python={`# Atomic transaction (context manager handles commit/rollback)
+with db.transaction():
+ alice = db.get(user, "alice")
+ if alice is not None:
+ db.update(user, "alice").set(name="Alice Updated").execute()`}
/>
Limitations
diff --git a/ray-docs/src/routes/docs/internals/-performance.tsx b/ray-docs/src/routes/docs/internals/-performance.tsx
index 4e583be..dbcf199 100644
--- a/ray-docs/src/routes/docs/internals/-performance.tsx
+++ b/ray-docs/src/routes/docs/internals/-performance.tsx
@@ -320,7 +320,7 @@ export function PerformancePage() {
Latest snapshot (single-file raw, Rust core, 10k nodes / 50k edges,
- edge types=3, edge props=10, sync_mode=Normal, group_commit=false,
+ edge types=3, edge props=10, syncMode=Normal, groupCommitEnabled=false,
February 4, 2026):
@@ -379,16 +379,16 @@ export function PerformancePage() {
Write Durability vs Throughput
- Defaults stay safe: sync_mode=Full,{" "}
- group_commit=false.
+ Defaults stay safe: syncMode=Full,{" "}
+ groupCommitEnabled=false.
Single-writer, low latency: {" "}
- sync_mode=Normal + group_commit=false.
+ syncMode=Normal + groupCommitEnabled=false.
Multi-writer throughput: {" "}
- sync_mode=Normal + group_commit=true (1-2ms).
+ syncMode=Normal + groupCommitEnabled=true (1-2ms).
{" "}
Scaling saturates quickly; prefer prep-parallel + single writer for max ingest. See{" "}
@@ -397,7 +397,7 @@ export function PerformancePage() {
Highest speed, weakest durability: {" "}
- sync_mode=Off (testing/throwaway only).
+ syncMode=Off (testing/throwaway only).
@@ -410,8 +410,8 @@ export function PerformancePage() {
Workload
- sync_mode
- group_commit
+ syncMode
+ groupCommitEnabled
Why
@@ -447,14 +447,14 @@ export function PerformancePage() {
Fastest ingest (single writer): {" "}
- begin_bulk() + create_nodes_batch() +{" "}
- add_edges_batch() / add_edges_with_props_batch(),{" "}
- sync_mode=Normal, group_commit=false, WAL ≥ 256MB,
+ beginBulk() + createNodesBatch() +{" "}
+ addEdgesBatch() / addEdgesWithPropsBatch(),{" "}
+ syncMode=Normal, groupCommitEnabled=false, WAL ≥ 256MB,
auto-checkpoint off during ingest, then checkpoint.
Multi-writer throughput: {" "}
- sync_mode=Normal + group_commit=true (1-2ms window),
+ syncMode=Normal + groupCommitEnabled=true (1-2ms window),
batched ops per transaction.
@@ -463,7 +463,7 @@ export function PerformancePage() {
Max speed, lowest durability: {" "}
- sync_mode=Off for testing only.
+ syncMode=Off for testing only.
@@ -473,10 +473,10 @@ export function PerformancePage() {
Bulk Ingest Example (Low-Level)
@@ -553,21 +553,19 @@ const stats = await db.stats();`}
diff --git a/ray-docs/src/routes/docs/internals/-wal.tsx b/ray-docs/src/routes/docs/internals/-wal.tsx
index d4af9b8..3f9b5e8 100644
--- a/ray-docs/src/routes/docs/internals/-wal.tsx
+++ b/ray-docs/src/routes/docs/internals/-wal.tsx
@@ -524,19 +524,19 @@ export function WALPage() {
- sync_mode = Normal
+ syncMode = Normal
- group_commit_enabled = true
+ groupCommitEnabled = true
- group_commit_window_ms = 2
+ groupCommitWindowMs = 2
- begin_bulk() + batch APIs for ingest (MVCC disabled)
+ beginBulk() + batch APIs for ingest (MVCC disabled)
- Optional: increase wal_size (e.g., 64MB) for heavy ingest to
+ Optional: increase walSizeMb (e.g., 64MB) for heavy ingest to
reduce checkpoints
@@ -564,7 +564,7 @@ export function WALPage() {
use resizeWal (offline) to grow it, or rebuild into a new
file. To prevent single transactions from overfilling the active WAL
region, split work into smaller commits (see bulkWrite or
- chunked begin_bulk() sessions) and consider disabling
+ chunked beginBulk() sessions) and consider disabling
background checkpoints during ingest.
diff --git a/ray-docs/src/routes/index.tsx b/ray-docs/src/routes/index.tsx
index 46f4904..a7c058b 100644
--- a/ray-docs/src/routes/index.tsx
+++ b/ray-docs/src/routes/index.tsx
@@ -107,7 +107,7 @@ function HomePage() {
typescript: `import { kite } from '@kitedb/core';
// Open database with schema
-const db = kite('./knowledge.kitedb', {
+const db = await kite('./knowledge.kitedb', {
nodes: [
{
name: 'document',
@@ -187,14 +187,14 @@ const results = db
.nodes();`,
rust: `// Find all topics discussed by Alice's documents
let topics = db
- .from(alice.id)
+ .from(alice.id())
.out(Some("wrote")) // Alice -> Document
.out(Some("discusses")) // Document -> Topic
.nodes()?;
// Multi-hop traversal
let results = db
- .from(start_node.id)
+ .from(start_node.id())
.out(Some("knows"))
.out(Some("worksAt"))
.take(10)
@@ -245,7 +245,7 @@ let mut index = VectorIndex::new(VectorIndexOptions {
})?;
// Add vectors for nodes
-index.set(doc.id, &embedding)?;
+index.set(doc.id(), &embedding)?;
// Find similar documents
let similar = index.search(&query_embedding, SimilarOptions {
@@ -300,12 +300,12 @@ let doc = db.insert("document")
.returning()?;
// Create relationships
-db.link(doc.id, "discusses", topic.id, Some(json!({
+db.link(doc.id(), "discusses", topic.id(), Some(json!({
"relevance": 0.95
})))?;
// Update properties
-db.update_by_id(doc.id)
+db.update_by_id(doc.id())
.set("title", "Updated Title")
.execute()?;`,
python: `# Insert with returning
@@ -317,8 +317,8 @@ doc = (db.insert(document)
db.link(doc, discusses, topic, relevance=0.95)
# Update properties
-(db.update_by_id(doc.id)
- .set("title", "Updated Title")
+(db.update(doc)
+ .set(title="Updated Title")
.execute())`,
};
@@ -614,8 +614,8 @@ db.link(doc, discusses, topic, relevance=0.95)
icon={ }
/>
}
/>
@@ -671,7 +671,7 @@ db.link(doc, discusses, topic, relevance=0.95)
/>
}
/>
@@ -838,10 +838,10 @@ db.link(doc, discusses, topic, relevance=0.95)
- HNSW_INDEX
+ IVF_INDEX
- O(log n) approximate nearest neighbor queries.
+ Approximate nearest-neighbor search with tunable probe count.
From fe8f1f23770d25eb87c64a5ed1048de5d00dc39a Mon Sep 17 00:00:00 2001
From: mask
Date: Sun, 8 Feb 2026 11:04:08 -0600
Subject: [PATCH 02/58] replication: harden host-runtime OTLP transport
---
docs/REPLICATION_PLAN.md | 392 +++++++++
docs/REPLICATION_RUNBOOK.md | 179 ++++
ray-rs/Cargo.toml | 3 +
ray-rs/README.md | 71 ++
ray-rs/index.d.ts | 22 +
ray-rs/index.js | 4 +
ray-rs/python/PARITY_MATRIX.md | 2 +-
ray-rs/python/README.md | 77 ++
ray-rs/python/kitedb/__init__.py | 6 +
ray-rs/python/kitedb/_kitedb.pyi | 12 +
ray-rs/src/metrics/mod.rs | 873 +++++++++++++++++++-
ray-rs/src/napi_bindings/database.rs | 484 +++++++++++
ray-rs/src/pyo3_bindings/database.rs | 277 +++++++
ray-rs/src/pyo3_bindings/mod.rs | 15 +
ray-rs/tests/replication_metrics_phase_d.rs | 410 +++++++++
ray-rs/ts/index.ts | 71 +-
16 files changed, 2895 insertions(+), 3 deletions(-)
create mode 100644 docs/REPLICATION_PLAN.md
create mode 100644 docs/REPLICATION_RUNBOOK.md
create mode 100644 ray-rs/tests/replication_metrics_phase_d.rs
diff --git a/docs/REPLICATION_PLAN.md b/docs/REPLICATION_PLAN.md
new file mode 100644
index 0000000..b0b5cca
--- /dev/null
+++ b/docs/REPLICATION_PLAN.md
@@ -0,0 +1,392 @@
+# KiteDB Replication V1 Plan (Feature + Code)
+
+Status: implementation-ready draft
+
+## 1) Goals
+
+- Single-writer primary, multiple read replicas.
+- Keep local embedded path default and fastest when replication is disabled.
+- Add optional read-your-writes on replicas via commit token wait.
+- Manual replica promotion to primary (no automatic election in V1).
+
+## 2) Non-Goals (V1)
+
+- Multi-primary / multi-writer.
+- Automatic leader election / consensus.
+- WAN topology optimization and geo-routing.
+- Replicating rebuildable derived indexes as required state.
+
+## 3) Scope
+
+- Engine: single-file `.kitedb` path only.
+- Topology target: `1 primary + up to 5 replicas`.
+- Transport target: pull-based replication first (HTTP contract), push later without format break.
+- API policy: additive only.
+
+## 4) Replication Invariants
+
+1. Exactly one writable primary per epoch.
+2. Replica apply order is commit order from primary.
+3. Replica apply is idempotent by log index.
+4. Commit token monotonicity per epoch.
+5. Checkpoint/compaction on primary must not break replica catch-up semantics.
+6. If replication is disabled, existing behavior and performance profile remain unchanged.
+
+## 5) Data Model: Source-of-Truth vs Derived
+
+### Authoritative replicated state
+
+- Committed transaction stream (logical mutation records).
+- Snapshot checkpoint image + metadata.
+- Replication epoch and monotonic log index.
+
+### Derived/rebuildable state (not required for correctness replication)
+
+- Caches (`cache::*`).
+- In-memory overlays reconstructed from snapshot + replicated tx stream.
+- Rebuildable vector/search side structures (unless explicitly marked authoritative in future phases).
+
+## 6) Consistency Model
+
+- Default replica reads: eventual/async.
+- Optional stronger read: provide commit token and wait until `applied_log_index >= token.log_index`.
+- Write ack policy: primary acks after local durability boundary only (replicas async).
+
+## 7) Durability and Crash Boundaries
+
+Commit must define explicit durability points:
+
+1. Primary WAL commit record persisted per current `sync_mode` rules.
+2. Replication log frame append persisted for the same commit.
+3. Commit token returned only after replication log append is durable.
+
+Crash model requirements:
+
+- Crash before token return: client may retry safely (idempotency via tx semantics/log index handling).
+- Crash after token return: token must correspond to durable replication log frame.
+- Replica restart resumes from persisted cursor with idempotent re-apply.
+
+## 8) Compatibility and Versioning
+
+- Keep `.kitedb` format backward compatible in V1.
+- Replication metadata lives in versioned sidecar manifest + segments.
+- Promotion increments epoch; stale writers must be fenced by epoch checks.
+
+## 9) Architecture (V1)
+
+### 9.1 Replication log sidecar
+
+- New sidecar directory adjacent to DB file.
+- Segment files: append-only, checksummed tx frames.
+- Manifest: current epoch, head index, retained floor, active segment metadata.
+- Cursor: `epoch:segment_id:offset:log_index`.
+
+### 9.2 Primary responsibilities
+
+- On commit, append committed tx frame to replication sidecar.
+- Expose snapshot + log pull interfaces.
+- Track replica progress (last acknowledged cursor/index) for retention decisions.
+
+### 9.3 Replica responsibilities
+
+- Bootstrap from latest snapshot bundle.
+- Catch up via log pull from snapshot start cursor.
+- Persist applied cursor atomically after apply batch.
+- Serve reads immediately or wait-for-token when requested.
+
+## 10) Code Touch Points
+
+Core engine:
+
+- `ray-rs/src/core/single_file/transaction.rs`
+ - Commit hook for replication append + token emission.
+- `ray-rs/src/core/single_file/open.rs`
+ - Role/config wiring (primary/replica settings).
+- `ray-rs/src/core/single_file/recovery.rs`
+ - Shared replay semantics reuse for replica apply path.
+- `ray-rs/src/metrics/mod.rs`
+ - Replication lag/apply metrics.
+
+New module tree:
+
+- `ray-rs/src/replication/mod.rs`
+- `ray-rs/src/replication/types.rs`
+- `ray-rs/src/replication/manifest.rs`
+- `ray-rs/src/replication/log_store.rs`
+- `ray-rs/src/replication/primary.rs`
+- `ray-rs/src/replication/replica.rs`
+- `ray-rs/src/replication/token.rs`
+- `ray-rs/src/replication/transport.rs`
+
+Binding surface (additive):
+
+- `ray-rs/src/napi_bindings/database.rs`
+- `ray-rs/src/pyo3_bindings/database.rs`
+
+## 11) API/Interface Additions (Additive)
+
+- Open options:
+ - replication role (`primary` | `replica` | `disabled`)
+ - replication sidecar path (optional default derived from DB path)
+ - pull/apply tuning (chunk bytes, poll interval, max batch)
+- Primary status:
+ - replication head index/epoch
+ - retained floor
+ - per-replica lag
+- Replica status:
+ - applied index/epoch
+ - last pull/apply error
+- Read wait:
+ - `wait_for_token(token, timeout_ms)` style helper.
+
+## 12) Transport Contract (Pull-First)
+
+- `GET /replication/snapshot/latest`
+ - Returns snapshot bytes + metadata (checksum, epoch, start cursor/index).
+- `GET /replication/log?cursor=...&max_bytes=...`
+ - Returns ordered tx frames + next cursor + eof marker.
+- `GET /replication/status`
+ - Primary/replica status for observability.
+- `POST /replication/promote`
+ - Manual promotion to next epoch (authenticated).
+
+Protocol requirement: all payloads versioned to allow push transport later with same frame/cursor model.
+
+## 13) Retention Policy
+
+- Segment rotation by size (default 64MB).
+- Retain at least:
+ - minimum time window (operator-configured), and
+ - min cursor needed by active replicas.
+- If replica falls behind retained floor:
+ - mark `needs_reseed`,
+ - force snapshot bootstrap.
+
+## 14) Failure Modes and Handling
+
+1. Corrupt segment/frame checksum:
+ - stop apply, surface hard error, require retry/reseed policy.
+2. Missing segment due to retention:
+ - deterministic `needs_reseed` status.
+3. Network interruption:
+ - retry with backoff, resume from durable cursor.
+4. Promotion race:
+ - epoch fencing rejects stale primary writes.
+5. Primary crash mid-commit:
+ - recovery ensures token/log durability invariant holds.
+
+## 15) Performance Constraints
+
+- Disabled replication path: <3% regression on write/read microbenchmarks.
+- Enabled replication:
+  - bounded p95 commit overhead gate: `P95_MAX_RATIO=1.03` (replication-on p95 / baseline p95; see §19).
+ - replica apply throughput >= primary sustained commit rate at target topology.
+- Keep commit hot path branch-light when replication disabled.
+
+## 16) Test-Driven Delivery Model (Red/Green First)
+
+### Phase workflow (mandatory)
+
+1. Red:
+ - Define phase contract/invariants.
+ - Add failing tests for that phase before implementation.
+2. Green:
+ - Implement only enough to pass the new failing tests.
+3. Refactor/Hardening:
+ - Cleanups, edge-case coverage, failure-path tests, perf checks.
+4. Phase gate:
+ - No phase is complete until all red tests are green and phase exit checks pass.
+
+### Test layout
+
+- Module-level tests in `ray-rs/src/replication/*` for parser/state invariants.
+- Cross-module integration tests in `ray-rs/tests/replication_*.rs`.
+- Fault-injection tests in dedicated `ray-rs/tests/replication_faults_*.rs`.
+- Perf checks in existing benchmark harnesses with replication-on/off variants.
+
+### Global test matrix
+
+- Unit:
+ - cursor/token encode/decode.
+ - frame checksum and parse validation.
+ - segment rotation and retention math.
+ - idempotent apply for duplicate/replayed chunks.
+- Integration:
+ - snapshot bootstrap + incremental catch-up.
+ - replica restart + resume cursor.
+ - background checkpoint during active replication.
+ - token wait semantics on replica.
+ - manual promotion and stale writer fencing.
+- Fault injection:
+ - crash before/after token return boundary.
+ - truncated frame/chunk.
+ - corrupt snapshot metadata.
+ - replica far behind retained floor.
+- Performance:
+ - baseline local mode (replication off).
+ - replication-on write latency/throughput.
+ - catch-up time for large backlog.
+
+## 17) Detailed Delivery Phases (Per-Phase Red/Green Gates)
+
+### Phase A: Invariants + sidecar primitives
+
+Objective:
+- Freeze wire/storage invariants and build deterministic sidecar primitives.
+
+Red tests first:
+- Invalid token/cursor strings are rejected.
+- Token/cursor ordering comparator is monotonic and epoch-aware.
+- Corrupt segment frame checksum fails read/scan.
+- Manifest interrupted-write simulation never yields partial-valid state.
+- Segment append/read roundtrip preserves frame boundaries and indices.
+
+Green implementation:
+- Add `replication` module skeleton and core types.
+- Implement versioned manifest read/write with atomic replace semantics.
+- Implement segment append/read and frame checksum verification.
+- Freeze token/cursor format and parser behavior.
+
+Robustness checks:
+- Fuzz/property-like tests on token/cursor parser.
+- Recovery tests for manifest reload after simulated interruption.
+
+Phase exit criteria:
+- All Phase A red tests green.
+- No API breakage.
+- Sidecar primitives deterministic across restart.
+
+### Phase B: Primary commit integration
+
+Objective:
+- Integrate replication append/token generation into primary commit path without regressing disabled mode.
+
+Red tests first:
+- Commit returns monotonic token (`epoch:log_index`) for successful writes.
+- Replication-disabled mode produces no sidecar append activity.
+- Sidecar append failure causes commit failure (no token emitted).
+- Commit ordering remains serialized and token order matches commit order under concurrent writers.
+- Crash boundary test: token is never returned for non-durable replication frame.
+
+Green implementation:
+- Hook replication append into `single_file::transaction::commit`.
+- Add replication config wiring in open options.
+- Emit token and expose primary replication status.
+- Add basic replication metrics counters/gauges.
+
+Robustness checks:
+- Regression benchmark: replication off path <3% overhead.
+- Negative-path tests for IO errors on sidecar append/fsync.
+
+Phase exit criteria:
+- All Phase B red tests green.
+- Disabled path performance gate passes.
+- Durability/token invariant verified by crash-boundary tests.
+
+### Phase C: Replica bootstrap + steady-state apply
+
+Objective:
+- Build replica bootstrap/catch-up/apply loop with idempotency and token-wait semantics.
+
+Red tests first:
+- Replica bootstrap from snapshot reaches exact primary state.
+- Incremental catch-up applies committed frames in order.
+- Duplicate chunk delivery is idempotent (no double-apply).
+- Replica restart resumes from durable cursor without divergence.
+- Token wait returns success on catch-up and timeout when lag persists.
+
+Green implementation:
+- Implement snapshot bootstrap flow and continuity validation.
+- Implement pull loop (`cursor`, `max_bytes`, retry/backoff).
+- Implement apply pipeline using replay semantics + applied-index persistence.
+- Add replica status surface (applied index, lag, last error).
+
+Robustness checks:
+- Checkpoint interleaving tests (primary background checkpoint while replica catches up).
+- Large backlog catch-up throughput and memory boundedness tests.
+
+Phase exit criteria:
+- All Phase C red tests green.
+- Replica apply remains deterministic across restart/retry scenarios.
+- Token-wait semantics validated end-to-end.
+
+### Phase D: Promotion + retention + hardening
+
+Objective:
+- Add manual promotion with fencing and finalize retention/failure behavior.
+
+Red tests first:
+- Promotion increments epoch and fences stale primary writes.
+- Retention respects min active replica cursor and configured minimum window.
+- Missing segment response deterministically marks replica `needs_reseed`.
+- Lagging replica beyond retention floor requires snapshot reseed and recovers.
+- Promotion race cases do not allow split-brain writes.
+
+Green implementation:
+- Implement manual promote flow and epoch fencing checks.
+- Implement replica progress tracking and retention pruning.
+- Add explicit reseed path/status when continuity is broken.
+- Finalize status/admin interfaces for ops visibility.
+
+Robustness checks:
+- Fault-injection sweep for corruption/network/partial transfer.
+- Soak tests at target topology (`1 + up to 5`) with lag churn.
+
+Phase exit criteria:
+- All Phase D red tests green.
+- No split-brain write acceptance in promotion tests.
+- Retention and reseed behavior deterministic and observable.
+
+## 18) Per-Phase Done Definition
+
+- Phase-specific red tests were added before implementation.
+- Green implementation passed with no skipped phase tests.
+- Failure-mode tests for that phase are green.
+- Metrics/status fields for that phase are present and documented.
+- Phase summary notes include known limits and next-phase carry-over items.
+
+## 19) Open Questions
+
+- Commit overhead budget is fixed for V1 gate: `P95_MAX_RATIO=1.03` (replication-on p95 / baseline p95).
+- Host-runtime TLS client-cert enforcement design (beyond playground proxy-header mTLS checks).
+- Whether any vector side data must be promoted to authoritative replicated state in a later phase.
+
+## 20) Phase D Summary (February 8, 2026)
+
+Implemented:
+- Manual promotion API with epoch fencing (`stale primary` rejected on stale writer commit).
+- Retention controls (segment rotation threshold + min retained entries) and primary retention execution.
+- Time-window retention control (`replication_retention_min_ms`) to avoid pruning very recent segments.
+- Replica progress reporting and per-replica lag visibility on primary status.
+- Deterministic reseed signaling (`needs_reseed`) for retained-floor/continuity breaks.
+- Explicit replica reseed API from snapshot.
+- Binding parity for replication admin/status in Node NAPI and Python PyO3 surfaces.
+- Host-runtime Prometheus replication exporter API in Rust core + Node NAPI + Python PyO3 (`collect_replication_metrics_prometheus*`).
+- Host-runtime OpenTelemetry OTLP-JSON replication exporter API in Rust core + Node NAPI + Python PyO3 (`collect_replication_metrics_otel_json*`).
+- Host-runtime OpenTelemetry collector push transport (HTTP OTLP-JSON) in Rust core + Node NAPI + Python PyO3 (`push_replication_metrics_otel_json_single_file`, `pushReplicationMetricsOtelJson`, `push_replication_metrics_otel_json`).
+- Host-runtime OTLP transport hardening for TLS/mTLS (HTTPS-only mode, custom CA trust, optional client cert/key auth).
+- Replica source transport hardening in host-runtime open path (required source DB path + source/local sidecar collision fencing).
+- Operator runbook for promotion/reseed/retention tuning (`docs/REPLICATION_RUNBOOK.md`).
+- Replication benchmark gate script (`ray-rs/scripts/replication-bench-gate.sh`) + benchmark doc wiring.
+- Replica catch-up throughput gate (`ray-rs/scripts/replication-catchup-gate.sh`) and combined perf gate (`ray-rs/scripts/replication-perf-gate.sh`).
+- HTTP transport/admin rollout in playground runtime:
+ - `GET /api/replication/status`
+ - `GET /api/replication/metrics` (Prometheus text export)
+ - `GET /api/replication/snapshot/latest`
+ - `GET /api/replication/log`
+ - `POST /api/replication/pull`
+ - `POST /api/replication/reseed`
+ - `POST /api/replication/promote`
+ - configurable admin auth via `REPLICATION_ADMIN_AUTH_MODE` (`token|mtls|token_or_mtls|token_and_mtls`).
+ - native HTTPS listener + TLS client-cert enforcement support for mTLS auth in playground runtime.
+
+Validated tests:
+- `ray-rs/tests/replication_phase_d.rs` (promotion, retention, reseed, split-brain race).
+- `ray-rs/tests/replication_faults_phase_d.rs` (corrupt/truncated segment fault paths + durable `last_error`).
+
+Known limits:
+- HTTP rollout currently targets playground runtime; broader host-runtime transport remains planned.
+- Host-runtime OTLP export currently targets HTTP OTLP-JSON payloads only (no protobuf/gRPC exporter path).
+
+Carry-over to next phase:
+- Host-runtime replication admin/status HTTP rollout beyond playground runtime (playground remains the only bundled HTTP surface).
diff --git a/docs/REPLICATION_RUNBOOK.md b/docs/REPLICATION_RUNBOOK.md
new file mode 100644
index 0000000..7bc3f7f
--- /dev/null
+++ b/docs/REPLICATION_RUNBOOK.md
@@ -0,0 +1,179 @@
+# Replication Operations Runbook (V1)
+
+Scope:
+- Single-file deployment mode (`.kitedb`) with sidecar replication.
+- Roles: one writable primary, one or more replicas.
+- APIs available in Rust core, Node NAPI, and Python bindings.
+
+## 1. Operational Signals
+
+Primary status fields:
+- `epoch`: current leadership epoch.
+- `head_log_index`: latest committed replication log index.
+- `retained_floor`: lowest retained index after pruning.
+- `replica_lags[]`: per-replica applied position.
+- `append_attempts|append_failures|append_successes`: commit-path replication health.
+
+Replica status fields:
+- `applied_epoch`, `applied_log_index`: durable apply cursor.
+- `last_error`: latest pull/apply failure detail.
+- `needs_reseed`: continuity break or floor violation; snapshot reseed required.
+
+Metrics surface:
+- `collect_metrics()` now includes `replication` with role (`primary|replica|disabled`) plus
+ role-specific replication counters/state for dashboards and alerting.
+- Host-runtime Prometheus text export is available via:
+ - Rust core: `collect_replication_metrics_prometheus_single_file(...)`
+ - Node NAPI: `collectReplicationMetricsPrometheus(db)`
+ - Python PyO3: `collect_replication_metrics_prometheus(db)`
+- Host-runtime OpenTelemetry OTLP-JSON export is available via:
+ - Rust core: `collect_replication_metrics_otel_json_single_file(...)`
+ - Node NAPI: `collectReplicationMetricsOtelJson(db)`
+ - Python PyO3: `collect_replication_metrics_otel_json(db)`
+- Host-runtime OpenTelemetry collector push is available via:
+ - Rust core: `push_replication_metrics_otel_json_single_file(db, endpoint, timeout_ms, bearer_token)`
+ - advanced TLS/mTLS: `push_replication_metrics_otel_json_*_with_options(...)` with
+ `https_only`, `ca_cert_pem_path`, `client_cert_pem_path`, `client_key_pem_path`.
+ - Node NAPI: `pushReplicationMetricsOtelJson(db, endpoint, timeoutMs, bearerToken?)`
+ - advanced TLS/mTLS: `pushReplicationMetricsOtelJsonWithOptions(db, endpoint, options)`.
+ - Python PyO3: `push_replication_metrics_otel_json(db, endpoint, timeout_ms=5000, bearer_token=None)`
+ - advanced TLS/mTLS kwargs:
+ `https_only`, `ca_cert_pem_path`, `client_cert_pem_path`, `client_key_pem_path`.
+
+Alert heuristics:
+- `append_failures > 0` growing: primary sidecar durability issue.
+- Replica lag growth over steady traffic: pull/apply bottleneck.
+- `needs_reseed == true`: force reseed, do not keep retrying catch-up.
+
+## 2. Bootstrap a New Replica
+
+1. Open replica with:
+ - `replication_role=replica`
+ - `replication_source_db_path`
+ - `replication_source_sidecar_path`
+ - Validation hardening:
+ - source DB path is required and must exist as a file,
+ - source DB path must differ from replica DB path,
+ - source sidecar path must differ from local replica sidecar path.
+2. Call `replica_bootstrap_from_snapshot()`.
+3. Start catch-up loop with `replica_catch_up_once(max_frames)`.
+4. Validate `needs_reseed == false` and `last_error == null`.
+
+## 3. Routine Catch-up + Retention
+
+Replica:
+- Poll `replica_catch_up_once(max_frames)` repeatedly.
+- Persist and monitor `applied_log_index`.
+
+Primary:
+- Report each replica cursor via `primary_report_replica_progress(replica_id, epoch, applied_log_index)`.
+- Run `primary_run_retention()` on an operator cadence.
+
+Tuning:
+- `replication_retention_min_entries`: set above worst-case expected replica lag.
+- `replication_retention_min_ms`: keep recent segments for at least this wall-clock window.
+- `replication_segment_max_bytes`: larger segments reduce file churn; smaller segments prune faster.
+
+## 4. Manual Promotion Procedure
+
+Goal: move write authority to a target node without split-brain writes.
+
+1. Quiesce writes on old primary (application-level write freeze).
+2. Promote target primary:
+ - `primary_promote_to_next_epoch()`.
+3. Verify:
+ - new primary status `epoch` incremented,
+ - new writes return tokens in the new epoch.
+4. Confirm stale fence:
+ - old primary write attempts fail with stale-primary error.
+5. Repoint replicas to the promoted primary source paths.
+
+## 5. Reseed Procedure (`needs_reseed`)
+
+Trigger:
+- Replica status sets `needs_reseed=true`, usually from retained-floor/continuity break.
+
+Steps:
+1. Stop normal catch-up loop for that replica.
+2. Execute `replica_reseed_from_snapshot()`.
+3. Resume `replica_catch_up_once(...)`.
+4. Verify:
+ - `needs_reseed=false`,
+ - `last_error` cleared,
+ - data parity checks (counts and spot checks) pass.
+
+## 6. Failure Handling
+
+Corrupt/truncated segment:
+- Symptom: catch-up error + replica `last_error` set.
+- Action: reseed replica from snapshot.
+
+Retention floor outran replica:
+- Symptom: catch-up error mentions reseed/floor; `needs_reseed=true`.
+- Action: reseed; increase `replication_retention_min_entries` if frequent.
+
+Promotion race / split-brain suspicion:
+- Symptom: concurrent promote/write attempts.
+- Expected: exactly one writer succeeds post-promotion.
+- Action: treat stale-writer failures as correct fencing; ensure client routing points to current epoch primary.
+
+## 7. Validation Checklist
+
+Before rollout:
+- `cargo test --no-default-features --test replication_phase_a --test replication_phase_b --test replication_phase_c --test replication_phase_d --test replication_faults_phase_d`
+- `cargo test --no-default-features replication::`
+
+Perf gate:
+- Run `ray-rs/scripts/replication-perf-gate.sh`.
+- Commit overhead gate: require median p95 ratio (replication-on / baseline) within `P95_MAX_RATIO` (default `1.03`, `ATTEMPTS=7`).
+- Catch-up gate: require replica throughput floors (`MIN_CATCHUP_FPS`, `MIN_THROUGHPUT_RATIO`).
+- Catch-up gate retries benchmark noise by default (`ATTEMPTS=3`); increase on busy dev machines.
+
+## 8. HTTP Admin Endpoints (Playground Runtime)
+
+Available endpoints in `playground/src/api/routes.ts`:
+- `GET /api/replication/status`
+- `GET /api/replication/metrics` (Prometheus text format)
+- `GET /api/replication/snapshot/latest`
+- `GET /api/replication/log`
+- `POST /api/replication/pull` (runs `replica_catch_up_once`)
+- `POST /api/replication/reseed` (runs `replica_reseed_from_snapshot`)
+- `POST /api/replication/promote` (runs `primary_promote_to_next_epoch`)
+
+Auth:
+- `REPLICATION_ADMIN_AUTH_MODE` controls admin auth:
+ - `none` (no admin auth)
+ - `token` (Bearer token)
+ - `mtls` (mTLS client-cert header)
+ - `token_or_mtls`
+ - `token_and_mtls`
+- Token modes use `REPLICATION_ADMIN_TOKEN`.
+- mTLS modes read `REPLICATION_MTLS_HEADER` (default `x-forwarded-client-cert`) and optional
+ subject filter `REPLICATION_MTLS_SUBJECT_REGEX`.
+- Native TLS mTLS mode can be enabled with `REPLICATION_MTLS_NATIVE_TLS=true` when the
+ playground listener is configured with:
+ - `PLAYGROUND_TLS_CERT_FILE`, `PLAYGROUND_TLS_KEY_FILE` (HTTPS enablement)
+ - `PLAYGROUND_TLS_REQUEST_CERT=true`
+ - `PLAYGROUND_TLS_REJECT_UNAUTHORIZED=true`
+ - optional `PLAYGROUND_TLS_CA_FILE` for custom client-cert trust roots
+- `REPLICATION_MTLS_SUBJECT_REGEX` applies to header-based mTLS values; native TLS mode
+ validates client cert handshake presence, not subject matching.
+- `metrics`, `snapshot`, `log`, `pull`, `reseed`, and `promote` enforce the selected mode.
+- `status` is read-only and does not require auth.
+
+Playground curl examples:
+- `export BASE="http://localhost:3000"`
+- `curl "$BASE/api/replication/status"`
+- `curl -H "Authorization: Bearer $REPLICATION_ADMIN_TOKEN" "$BASE/api/replication/metrics"`
+- `curl -H "Authorization: Bearer $REPLICATION_ADMIN_TOKEN" "$BASE/api/replication/log?maxFrames=128&maxBytes=1048576"`
+- `curl -X POST -H "Authorization: Bearer $REPLICATION_ADMIN_TOKEN" -H "Content-Type: application/json" -d '{"maxFrames":256}' "$BASE/api/replication/pull"`
+- `curl -X POST -H "Authorization: Bearer $REPLICATION_ADMIN_TOKEN" "$BASE/api/replication/reseed"`
+- `curl -X POST -H "Authorization: Bearer $REPLICATION_ADMIN_TOKEN" "$BASE/api/replication/promote"`
+- `curl -H "x-client-cert: CN=allowed-client,O=RayDB" "$BASE/api/replication/metrics"` (when `REPLICATION_ADMIN_AUTH_MODE=mtls`)
+
+## 9. Known V1 Limits
+
+- Retention policy supports entry-window + time-window floors, but not richer SLA-aware policies.
+- HTTP rollout currently targets the playground runtime API; host-runtime transport rollout remains planned.
+- Host-runtime OTLP export currently targets HTTP OTLP-JSON payloads only (no protobuf/gRPC exporter path).
+- `SyncMode::Normal` and `SyncMode::Off` optimize commit latency by batching sidecar frame writes in-memory and refreshing manifest fencing periodically (not every commit). For strict per-commit sidecar visibility/fencing, use `SyncMode::Full`.
diff --git a/ray-rs/Cargo.toml b/ray-rs/Cargo.toml
index 3ca524f..7c231b9 100644
--- a/ray-rs/Cargo.toml
+++ b/ray-rs/Cargo.toml
@@ -28,6 +28,9 @@ thiserror = "2.0"
# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
+ureq = "2.10"
+rustls-pemfile = "2.2"
+webpki-roots = "1.0"
# Binary encoding
byteorder = "1.5"
diff --git a/ray-rs/README.md b/ray-rs/README.md
index f3696f2..3df3e83 100644
--- a/ray-rs/README.md
+++ b/ray-rs/README.md
@@ -181,6 +181,77 @@ const [aliceFriends, bobFriends] = await Promise.all([
This is implemented using a read-write lock (RwLock) internally, providing good read scalability while maintaining data consistency.
+## Replication Admin (low-level API)
+
+Phase D replication controls are available on the low-level `Database` API.
+
+```ts
+import { Database } from 'kitedb'
+import {
+ collectReplicationMetricsOtelJson,
+ collectReplicationMetricsPrometheus,
+ pushReplicationMetricsOtelJson,
+ pushReplicationMetricsOtelJsonWithOptions,
+} from 'kitedb/native'
+
+const primary = Database.open('cluster-primary.kitedb', {
+ replicationRole: 'Primary',
+ replicationSidecarPath: './cluster-primary.sidecar',
+ replicationSegmentMaxBytes: 64 * 1024 * 1024,
+ replicationRetentionMinEntries: 1024,
+})
+
+primary.begin()
+primary.createNode('n:1')
+const token = primary.commitWithToken()
+
+primary.primaryReportReplicaProgress('replica-a', 1, 42)
+const retention = primary.primaryRunRetention()
+const primaryStatus = primary.primaryReplicationStatus()
+
+const replica = Database.open('cluster-replica.kitedb', {
+ replicationRole: 'Replica',
+ replicationSidecarPath: './cluster-replica.sidecar',
+ replicationSourceDbPath: 'cluster-primary.kitedb',
+ replicationSourceSidecarPath: './cluster-primary.sidecar',
+})
+
+replica.replicaBootstrapFromSnapshot()
+replica.replicaCatchUpOnce(256)
+if (token) replica.waitForToken(token, 2_000)
+const replicaStatus = replica.replicaReplicationStatus()
+if (replicaStatus?.needsReseed) replica.replicaReseedFromSnapshot()
+
+const prometheus = collectReplicationMetricsPrometheus(primary)
+console.log(prometheus)
+
+const otelJson = collectReplicationMetricsOtelJson(primary)
+console.log(otelJson)
+
+const exportResult = pushReplicationMetricsOtelJson(
+ primary,
+ 'http://127.0.0.1:4318/v1/metrics',
+ 5_000,
+)
+console.log(exportResult.statusCode, exportResult.responseBody)
+
+const secureExport = pushReplicationMetricsOtelJsonWithOptions(
+ primary,
+ 'https://collector.internal:4318/v1/metrics',
+ {
+ timeoutMs: 5_000,
+ httpsOnly: true,
+ caCertPemPath: './tls/collector-ca.pem',
+ clientCertPemPath: './tls/client.pem',
+ clientKeyPemPath: './tls/client-key.pem',
+ },
+)
+console.log(secureExport.statusCode, secureExport.responseBody)
+
+replica.close()
+primary.close()
+```
+
## API surface
The Node bindings expose both low-level graph primitives (`Database`) and higher-level APIs (Kite) for schema-driven workflows, plus metrics, backups, traversal, and vector search. For full API details and guides, see the docs:
diff --git a/ray-rs/index.d.ts b/ray-rs/index.d.ts
index 45f1dec..d2b3188 100644
--- a/ray-rs/index.d.ts
+++ b/ray-rs/index.d.ts
@@ -864,6 +864,28 @@ export interface CheckResult {
export declare function collectMetrics(db: Database): DatabaseMetrics
+export declare function collectReplicationMetricsOtelJson(db: Database): string
+
+export declare function collectReplicationMetricsPrometheus(db: Database): string
+
+export interface OtlpHttpExportResult {
+ statusCode: number
+ responseBody: string
+}
+
+export declare function pushReplicationMetricsOtelJson(db: Database, endpoint: string, timeoutMs: number, bearerToken?: string | undefined | null): OtlpHttpExportResult
+
+export interface PushReplicationMetricsOtelOptions {
+ timeoutMs?: number
+ bearerToken?: string
+ httpsOnly?: boolean
+ caCertPemPath?: string
+ clientCertPemPath?: string
+ clientKeyPemPath?: string
+}
+
+export declare function pushReplicationMetricsOtelJsonWithOptions(db: Database, endpoint: string, options?: PushReplicationMetricsOtelOptions | undefined | null): OtlpHttpExportResult
+
/** Compression options */
export interface CompressionOptions {
/** Enable compression (default false) */
diff --git a/ray-rs/index.js b/ray-rs/index.js
index 18c95c3..c051d27 100644
--- a/ray-rs/index.js
+++ b/ray-rs/index.js
@@ -597,6 +597,10 @@ module.exports.VectorIndex = nativeBinding.VectorIndex
module.exports.backupInfo = nativeBinding.backupInfo
module.exports.bruteForceSearch = nativeBinding.bruteForceSearch
module.exports.collectMetrics = nativeBinding.collectMetrics
+module.exports.collectReplicationMetricsOtelJson = nativeBinding.collectReplicationMetricsOtelJson
+module.exports.collectReplicationMetricsPrometheus = nativeBinding.collectReplicationMetricsPrometheus
+module.exports.pushReplicationMetricsOtelJson = nativeBinding.pushReplicationMetricsOtelJson
+module.exports.pushReplicationMetricsOtelJsonWithOptions = nativeBinding.pushReplicationMetricsOtelJsonWithOptions
module.exports.createBackup = nativeBinding.createBackup
module.exports.createOfflineBackup = nativeBinding.createOfflineBackup
module.exports.createVectorIndex = nativeBinding.createVectorIndex
diff --git a/ray-rs/python/PARITY_MATRIX.md b/ray-rs/python/PARITY_MATRIX.md
index c8af6ad..a61ce7c 100644
--- a/ray-rs/python/PARITY_MATRIX.md
+++ b/ray-rs/python/PARITY_MATRIX.md
@@ -62,7 +62,7 @@ Legend: parity = full feature match, partial = similar capability with API or be
| Export/Import | `export*`, `import*` | `export*`, `import*` | parity | Python exposes JSON object and file helpers. |
| Streaming | `stream*`, `get*Page` | `stream*`, `get*Page` | parity | Same batching/pagination behavior. |
| Backup/Restore | `createBackup`, `restoreBackup` | `create_backup`, `restore_backup` | parity | Naming differences only. |
-| Metrics/Health | `collectMetrics`, `healthCheck` | `collect_metrics`, `health_check` | parity | Naming differences only. |
+| Metrics/Health | `collectMetrics`, `collectReplicationMetricsPrometheus`, `collectReplicationMetricsOtelJson`, `pushReplicationMetricsOtelJson`, `healthCheck` | `collect_metrics`, `collect_replication_metrics_prometheus`, `collect_replication_metrics_otel_json`, `push_replication_metrics_otel_json`, `health_check` | parity | Naming differences only. |
## Vector Search
diff --git a/ray-rs/python/README.md b/ray-rs/python/README.md
index 9fa7f8f..585f8ad 100644
--- a/ray-rs/python/README.md
+++ b/ray-rs/python/README.md
@@ -188,6 +188,83 @@ for result in results:
print(result.node_id, result.distance)
```
+## Replication admin (low-level API)
+
+Phase D replication controls are available on `Database`:
+
+```python
+from kitedb import (
+ Database,
+ OpenOptions,
+ collect_replication_metrics_otel_json,
+ collect_replication_metrics_prometheus,
+ push_replication_metrics_otel_json,
+)
+
+primary = Database(
+ "cluster-primary.kitedb",
+ OpenOptions(
+ replication_role="primary",
+ replication_sidecar_path="./cluster-primary.sidecar",
+ replication_segment_max_bytes=64 * 1024 * 1024,
+ replication_retention_min_entries=1024,
+ ),
+)
+
+primary.begin()
+primary.create_node("n:1")
+token = primary.commit_with_token()
+
+primary.primary_report_replica_progress("replica-a", 1, 42)
+pruned_segments, retained_floor = primary.primary_run_retention()
+primary_status = primary.primary_replication_status()
+
+replica = Database(
+ "cluster-replica.kitedb",
+ OpenOptions(
+ replication_role="replica",
+ replication_sidecar_path="./cluster-replica.sidecar",
+ replication_source_db_path="cluster-primary.kitedb",
+ replication_source_sidecar_path="./cluster-primary.sidecar",
+ ),
+)
+
+replica.replica_bootstrap_from_snapshot()
+replica.replica_catch_up_once(256)
+if token:
+ replica.wait_for_token(token, 2000)
+replica_status = replica.replica_replication_status()
+if replica_status and replica_status["needs_reseed"]:
+ replica.replica_reseed_from_snapshot()
+
+prometheus = collect_replication_metrics_prometheus(primary)
+print(prometheus)
+
+otel_json = collect_replication_metrics_otel_json(primary)
+print(otel_json)
+
+status_code, response_body = push_replication_metrics_otel_json(
+ primary,
+ "http://127.0.0.1:4318/v1/metrics",
+ timeout_ms=5000,
+)
+print(status_code, response_body)
+
+secure_status, secure_body = push_replication_metrics_otel_json(
+ primary,
+ "https://collector.internal:4318/v1/metrics",
+ timeout_ms=5000,
+ https_only=True,
+ ca_cert_pem_path="./tls/collector-ca.pem",
+ client_cert_pem_path="./tls/client.pem",
+ client_key_pem_path="./tls/client-key.pem",
+)
+print(secure_status, secure_body)
+
+replica.close()
+primary.close()
+```
+
## Documentation
```text
diff --git a/ray-rs/python/kitedb/__init__.py b/ray-rs/python/kitedb/__init__.py
index a56bd64..4e5cb58 100644
--- a/ray-rs/python/kitedb/__init__.py
+++ b/ray-rs/python/kitedb/__init__.py
@@ -103,6 +103,9 @@
# Functions
open_database,
collect_metrics,
+ collect_replication_metrics_otel_json,
+ collect_replication_metrics_prometheus,
+ push_replication_metrics_otel_json,
health_check,
create_backup,
restore_backup,
@@ -270,6 +273,9 @@
# Functions
"open_database",
"collect_metrics",
+ "collect_replication_metrics_otel_json",
+ "collect_replication_metrics_prometheus",
+ "push_replication_metrics_otel_json",
"health_check",
"create_backup",
"restore_backup",
diff --git a/ray-rs/python/kitedb/_kitedb.pyi b/ray-rs/python/kitedb/_kitedb.pyi
index 6f3fa43..41c9c88 100644
--- a/ray-rs/python/kitedb/_kitedb.pyi
+++ b/ray-rs/python/kitedb/_kitedb.pyi
@@ -534,6 +534,18 @@ class Database:
def open_database(path: str, options: Optional[OpenOptions] = None) -> Database: ...
def collect_metrics(db: Database) -> DatabaseMetrics: ...
+def collect_replication_metrics_otel_json(db: Database) -> str: ...
+def collect_replication_metrics_prometheus(db: Database) -> str: ...
+def push_replication_metrics_otel_json(
+ db: Database,
+ endpoint: str,
+ timeout_ms: int = 5000,
+ bearer_token: Optional[str] = None,
+ https_only: bool = False,
+ ca_cert_pem_path: Optional[str] = None,
+ client_cert_pem_path: Optional[str] = None,
+ client_key_pem_path: Optional[str] = None,
+) -> Tuple[int, str]: ...
def health_check(db: Database) -> HealthCheckResult: ...
def create_backup(db: Database, backup_path: str, options: Optional[BackupOptions] = None) -> BackupResult: ...
def restore_backup(backup_path: str, restore_path: str, options: Optional[RestoreOptions] = None) -> str: ...
diff --git a/ray-rs/src/metrics/mod.rs b/ray-rs/src/metrics/mod.rs
index 25a0adf..7f7f4bb 100644
--- a/ray-rs/src/metrics/mod.rs
+++ b/ray-rs/src/metrics/mod.rs
@@ -2,10 +2,18 @@
//!
//! Core implementation used by bindings.
-use std::time::SystemTime;
+use std::fs::File;
+use std::io::BufReader;
+use std::sync::Arc;
+use std::time::{Duration, SystemTime};
+
+use serde_json::{json, Value};
use crate::cache::manager::CacheManagerStats;
use crate::core::single_file::SingleFileDB;
+use crate::error::{KiteError, Result};
+use crate::replication::primary::PrimaryReplicationStatus;
+use crate::replication::replica::ReplicaReplicationStatus;
use crate::types::DeltaState;
/// Cache layer metrics
@@ -56,6 +64,41 @@ pub struct MvccMetrics {
pub committed_writes_pruned: i64,
}
+/// Primary replication metrics
+#[derive(Debug, Clone)]
+pub struct PrimaryReplicationMetrics {
+ pub epoch: i64,
+ pub head_log_index: i64,
+ pub retained_floor: i64,
+ pub replica_count: i64,
+ pub stale_epoch_replica_count: i64,
+ pub max_replica_lag: i64,
+ pub min_replica_applied_log_index: Option<i64>,
+ pub sidecar_path: String,
+ pub last_token: Option<String>,
+ pub append_attempts: i64,
+ pub append_failures: i64,
+ pub append_successes: i64,
+}
+
+/// Replica replication metrics
+#[derive(Debug, Clone)]
+pub struct ReplicaReplicationMetrics {
+ pub applied_epoch: i64,
+ pub applied_log_index: i64,
+ pub needs_reseed: bool,
+ pub last_error: Option<String>,
+}
+
+/// Replication metrics
+#[derive(Debug, Clone)]
+pub struct ReplicationMetrics {
+ pub enabled: bool,
+ pub role: String,
+ pub primary: Option<PrimaryReplicationMetrics>,
+ pub replica: Option<ReplicaReplicationMetrics>,
+}
+
/// Memory metrics
#[derive(Debug, Clone)]
pub struct MemoryMetrics {
@@ -74,6 +117,7 @@ pub struct DatabaseMetrics {
pub data: DataMetrics,
pub cache: CacheMetrics,
pub mvcc: Option<MvccMetrics>,
+ pub replication: ReplicationMetrics,
pub memory: MemoryMetrics,
pub collected_at_ms: i64,
}
@@ -93,6 +137,40 @@ pub struct HealthCheckResult {
pub checks: Vec<CheckResult>,
}
+/// OTLP HTTP push result for replication metrics export.
+#[derive(Debug, Clone)]
+pub struct OtlpHttpExportResult {
+ pub status_code: i64,
+ pub response_body: String,
+}
+
+/// TLS/mTLS options for OTLP HTTP push.
+#[derive(Debug, Clone, Default)]
+pub struct OtlpHttpTlsOptions {
+ pub https_only: bool,
+ pub ca_cert_pem_path: Option<String>,
+ pub client_cert_pem_path: Option<String>,
+ pub client_key_pem_path: Option<String>,
+}
+
+/// OTLP HTTP push options for collector export.
+#[derive(Debug, Clone)]
+pub struct OtlpHttpPushOptions {
+ pub timeout_ms: u64,
+ pub bearer_token: Option<String>,
+ pub tls: OtlpHttpTlsOptions,
+}
+
+impl Default for OtlpHttpPushOptions {
+ fn default() -> Self {
+ Self {
+ timeout_ms: 5_000,
+ bearer_token: None,
+ tls: OtlpHttpTlsOptions::default(),
+ }
+ }
+}
+
pub fn collect_metrics_single_file(db: &SingleFileDB) -> DatabaseMetrics {
let stats = db.stats();
let delta = db.delta.read();
@@ -118,6 +196,10 @@ pub fn collect_metrics_single_file(db: &SingleFileDB) -> DatabaseMetrics {
};
let cache = build_cache_metrics(cache_stats.as_ref());
+ let replication = build_replication_metrics(
+ db.primary_replication_status(),
+ db.replica_replication_status(),
+ );
let delta_bytes = estimate_delta_memory(&delta);
let cache_bytes = estimate_cache_memory(cache_stats.as_ref());
let snapshot_bytes = (stats.snapshot_nodes as i64 * 50) + (stats.snapshot_edges as i64 * 20);
@@ -145,6 +227,7 @@ pub fn collect_metrics_single_file(db: &SingleFileDB) -> DatabaseMetrics {
data,
cache,
mvcc,
+ replication,
memory: MemoryMetrics {
delta_estimate_bytes: delta_bytes,
cache_estimate_bytes: cache_bytes,
@@ -155,6 +238,615 @@ pub fn collect_metrics_single_file(db: &SingleFileDB) -> DatabaseMetrics {
}
}
+/// Collect replication-only metrics and render them in Prometheus text format.
+pub fn collect_replication_metrics_prometheus_single_file(db: &SingleFileDB) -> String {
+ let metrics = collect_metrics_single_file(db);
+ render_replication_metrics_prometheus(&metrics)
+}
+
+/// Collect replication-only metrics and render them as OTLP JSON payload.
+pub fn collect_replication_metrics_otel_json_single_file(db: &SingleFileDB) -> String {
+ let metrics = collect_metrics_single_file(db);
+ render_replication_metrics_otel_json(&metrics)
+}
+
+/// Push replication OTLP-JSON payload to an OTLP collector endpoint.
+///
+/// Expects collector HTTP endpoint (for example `/v1/metrics`).
+/// Returns an error when collector responds with non-2xx status.
+pub fn push_replication_metrics_otel_json_single_file(
+ db: &SingleFileDB,
+ endpoint: &str,
+ timeout_ms: u64,
+ bearer_token: Option<&str>,
+) -> Result<OtlpHttpExportResult> {
+ let options = OtlpHttpPushOptions {
+ timeout_ms,
+ bearer_token: bearer_token.map(ToOwned::to_owned),
+ ..OtlpHttpPushOptions::default()
+ };
+ push_replication_metrics_otel_json_single_file_with_options(db, endpoint, &options)
+}
+
+/// Push replication OTLP-JSON payload using explicit push options.
+pub fn push_replication_metrics_otel_json_single_file_with_options(
+ db: &SingleFileDB,
+ endpoint: &str,
+ options: &OtlpHttpPushOptions,
+) -> Result<OtlpHttpExportResult> {
+ let payload = collect_replication_metrics_otel_json_single_file(db);
+ push_replication_metrics_otel_json_payload_with_options(&payload, endpoint, options)
+}
+
+/// Push pre-rendered replication OTLP-JSON payload to an OTLP collector endpoint.
+pub fn push_replication_metrics_otel_json_payload(
+ payload: &str,
+ endpoint: &str,
+ timeout_ms: u64,
+ bearer_token: Option<&str>,
+) -> Result<OtlpHttpExportResult> {
+ let options = OtlpHttpPushOptions {
+ timeout_ms,
+ bearer_token: bearer_token.map(ToOwned::to_owned),
+ ..OtlpHttpPushOptions::default()
+ };
+ push_replication_metrics_otel_json_payload_with_options(payload, endpoint, &options)
+}
+
+/// Push pre-rendered replication OTLP-JSON payload using explicit push options.
+pub fn push_replication_metrics_otel_json_payload_with_options(
+ payload: &str,
+ endpoint: &str,
+ options: &OtlpHttpPushOptions,
+) -> Result<OtlpHttpExportResult> {
+ let endpoint = endpoint.trim();
+ if endpoint.is_empty() {
+ return Err(KiteError::InvalidQuery(
+ "OTLP endpoint must not be empty".into(),
+ ));
+ }
+ if options.timeout_ms == 0 {
+ return Err(KiteError::InvalidQuery("timeout_ms must be > 0".into()));
+ }
+ if options.tls.https_only && !endpoint_uses_https(endpoint) {
+ return Err(KiteError::InvalidQuery(
+ "OTLP endpoint must use https when https_only is enabled".into(),
+ ));
+ }
+
+ let timeout = Duration::from_millis(options.timeout_ms);
+ let agent = build_otel_http_agent(endpoint, options, timeout)?;
+ let mut request = agent
+ .post(endpoint)
+ .set("content-type", "application/json")
+ .timeout(timeout);
+
+ if let Some(token) = options.bearer_token.as_deref() {
+ if !token.trim().is_empty() {
+ request = request.set("authorization", &format!("Bearer {token}"));
+ }
+ }
+
+ match request.send_string(payload) {
+ Ok(response) => {
+ let status_code = response.status() as i64;
+ let response_body = response.into_string().unwrap_or_default();
+ Ok(OtlpHttpExportResult {
+ status_code,
+ response_body,
+ })
+ }
+ Err(ureq::Error::Status(status_code, response)) => {
+ let body = response.into_string().unwrap_or_default();
+ Err(KiteError::Internal(format!(
+ "OTLP collector rejected replication metrics: status {status_code}, body: {body}"
+ )))
+ }
+ Err(ureq::Error::Transport(error)) => Err(KiteError::Io(std::io::Error::other(format!(
+ "OTLP collector transport error: {error}"
+ )))),
+ }
+}
+
+fn endpoint_uses_https(endpoint: &str) -> bool {
+ endpoint.to_ascii_lowercase().starts_with("https://")
+}
+
+fn build_otel_http_agent(
+ endpoint: &str,
+ options: &OtlpHttpPushOptions,
+ timeout: Duration,
+) -> Result<ureq::Agent> {
+ let ca_cert_pem_path = options
+ .tls
+ .ca_cert_pem_path
+ .as_deref()
+ .map(str::trim)
+ .filter(|path| !path.is_empty());
+ let client_cert_pem_path = options
+ .tls
+ .client_cert_pem_path
+ .as_deref()
+ .map(str::trim)
+ .filter(|path| !path.is_empty());
+ let client_key_pem_path = options
+ .tls
+ .client_key_pem_path
+ .as_deref()
+ .map(str::trim)
+ .filter(|path| !path.is_empty());
+
+ if client_cert_pem_path.is_some() ^ client_key_pem_path.is_some() {
+ return Err(KiteError::InvalidQuery(
+ "OTLP mTLS requires both client_cert_pem_path and client_key_pem_path".into(),
+ ));
+ }
+
+ let custom_tls_configured =
+ ca_cert_pem_path.is_some() || (client_cert_pem_path.is_some() && client_key_pem_path.is_some());
+ if custom_tls_configured && !endpoint_uses_https(endpoint) {
+ return Err(KiteError::InvalidQuery(
+ "OTLP custom TLS/mTLS configuration requires an https endpoint".into(),
+ ));
+ }
+
+ let mut builder = ureq::builder()
+ .https_only(options.tls.https_only)
+ .timeout_connect(timeout)
+ .timeout_read(timeout)
+ .timeout_write(timeout);
+
+ if custom_tls_configured {
+ let mut root_store = ureq::rustls::RootCertStore::empty();
+ root_store.extend(webpki_roots::TLS_SERVER_ROOTS.iter().cloned());
+
+ if let Some(path) = ca_cert_pem_path {
+ let certs = load_certificates_from_pem(path, "ca_cert_pem_path")?;
+ let (valid_count, _) = root_store.add_parsable_certificates(certs);
+ if valid_count == 0 {
+ return Err(KiteError::InvalidQuery(
+ format!("No valid CA certificates found in ca_cert_pem_path: {path}").into(),
+ ));
+ }
+ }
+
+ let client_config_builder =
+ ureq::rustls::ClientConfig::builder().with_root_certificates(root_store);
+ let client_config =
+ if let (Some(cert_path), Some(key_path)) = (client_cert_pem_path, client_key_pem_path) {
+ let certs = load_certificates_from_pem(cert_path, "client_cert_pem_path")?;
+ let key = load_private_key_from_pem(key_path, "client_key_pem_path")?;
+ client_config_builder
+ .with_client_auth_cert(certs, key)
+ .map_err(|error| {
+ KiteError::InvalidQuery(
+ format!("Invalid OTLP client certificate/key for mTLS: {error}").into(),
+ )
+ })?
+ } else {
+ client_config_builder.with_no_client_auth()
+ };
+
+ builder = builder.tls_config(Arc::new(client_config));
+ }
+
+ Ok(builder.build())
+}
+
+fn load_certificates_from_pem(
+ path: &str,
+ field_name: &str,
+) -> Result>> {
+ let file = File::open(path).map_err(|error| {
+ KiteError::InvalidQuery(format!("Failed opening {field_name} '{path}': {error}").into())
+ })?;
+ let mut reader = BufReader::new(file);
+ let certs = rustls_pemfile::certs(&mut reader)
+ .collect::<Result<Vec<_>, _>>()
+ .map_err(|error| {
+ KiteError::InvalidQuery(
+ format!("Failed parsing certificates from {field_name} '{path}': {error}").into(),
+ )
+ })?;
+ if certs.is_empty() {
+ return Err(KiteError::InvalidQuery(
+ format!("No certificates found in {field_name} '{path}'").into(),
+ ));
+ }
+ Ok(certs)
+}
+
+fn load_private_key_from_pem(
+ path: &str,
+ field_name: &str,
+) -> Result> {
+ let file = File::open(path).map_err(|error| {
+ KiteError::InvalidQuery(format!("Failed opening {field_name} '{path}': {error}").into())
+ })?;
+ let mut reader = BufReader::new(file);
+ rustls_pemfile::private_key(&mut reader)
+ .map_err(|error| {
+ KiteError::InvalidQuery(
+ format!("Failed parsing private key from {field_name} '{path}': {error}").into(),
+ )
+ })?
+ .ok_or_else(|| {
+ KiteError::InvalidQuery(format!("No private key found in {field_name} '{path}'").into())
+ })
+}
+
+/// Render replication metrics from a metrics snapshot using Prometheus exposition format.
+pub fn render_replication_metrics_prometheus(metrics: &DatabaseMetrics) -> String {
+ let mut lines = Vec::new();
+ let role = metrics.replication.role.as_str();
+ let enabled = if metrics.replication.enabled { 1 } else { 0 };
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_enabled",
+ "gauge",
+ "Whether replication is enabled for this database (1 enabled, 0 disabled).",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_enabled",
+ enabled,
+ &[("role", role)],
+ );
+
+ // Host-runtime export path is process-local and does not enforce HTTP auth.
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_auth_enabled",
+ "gauge",
+ "Whether replication admin auth is enabled for this metrics exporter.",
+ );
+ push_prometheus_sample(&mut lines, "kitedb_replication_auth_enabled", 0, &[]);
+
+ if let Some(primary) = metrics.replication.primary.as_ref() {
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_primary_epoch",
+ "gauge",
+ "Current primary replication epoch.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_primary_epoch",
+ primary.epoch,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_primary_head_log_index",
+ "gauge",
+ "Current primary head log index.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_primary_head_log_index",
+ primary.head_log_index,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_primary_retained_floor",
+ "gauge",
+ "Current primary retained floor log index.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_primary_retained_floor",
+ primary.retained_floor,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_primary_replica_count",
+ "gauge",
+ "Replica progress reporters known by this primary.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_primary_replica_count",
+ primary.replica_count,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_primary_stale_epoch_replica_count",
+ "gauge",
+ "Replica reporters currently on stale epochs.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_primary_stale_epoch_replica_count",
+ primary.stale_epoch_replica_count,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_primary_max_replica_lag",
+ "gauge",
+ "Maximum reported lag (log frames) across replicas.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_primary_max_replica_lag",
+ primary.max_replica_lag,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_primary_append_attempts_total",
+ "counter",
+ "Total replication append attempts on the primary commit path.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_primary_append_attempts_total",
+ primary.append_attempts,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_primary_append_failures_total",
+ "counter",
+ "Total replication append failures on the primary commit path.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_primary_append_failures_total",
+ primary.append_failures,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_primary_append_successes_total",
+ "counter",
+ "Total replication append successes on the primary commit path.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_primary_append_successes_total",
+ primary.append_successes,
+ &[],
+ );
+ }
+
+ if let Some(replica) = metrics.replication.replica.as_ref() {
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_replica_applied_epoch",
+ "gauge",
+ "Replica applied epoch.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_replica_applied_epoch",
+ replica.applied_epoch,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_replica_applied_log_index",
+ "gauge",
+ "Replica applied log index.",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_replica_applied_log_index",
+ replica.applied_log_index,
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_replica_needs_reseed",
+ "gauge",
+ "Whether replica currently requires snapshot reseed (1 yes, 0 no).",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_replica_needs_reseed",
+ if replica.needs_reseed { 1 } else { 0 },
+ &[],
+ );
+
+ push_prometheus_help(
+ &mut lines,
+ "kitedb_replication_replica_last_error_present",
+ "gauge",
+ "Whether replica currently has a non-empty last_error value (1 yes, 0 no).",
+ );
+ push_prometheus_sample(
+ &mut lines,
+ "kitedb_replication_replica_last_error_present",
+ if replica.last_error.is_some() { 1 } else { 0 },
+ &[],
+ );
+ }
+
+ let mut text = lines.join("\n");
+ text.push('\n');
+ text
+}
+
+/// Render replication metrics in OpenTelemetry OTLP JSON format.
+pub fn render_replication_metrics_otel_json(metrics: &DatabaseMetrics) -> String {
+ let role = metrics.replication.role.as_str();
+ let enabled = if metrics.replication.enabled { 1 } else { 0 };
+ let time_unix_nano = metric_time_unix_nano(metrics);
+ let mut otel_metrics: Vec<Value> = Vec::new();
+
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.enabled",
+ "Whether replication is enabled for this database (1 enabled, 0 disabled).",
+ "1",
+ enabled,
+ &[("role", role)],
+ &time_unix_nano,
+ ));
+
+ // Host-runtime export path is process-local and does not enforce HTTP auth.
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.auth.enabled",
+ "Whether replication admin auth is enabled for this metrics exporter.",
+ "1",
+ 0,
+ &[],
+ &time_unix_nano,
+ ));
+
+ if let Some(primary) = metrics.replication.primary.as_ref() {
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.primary.epoch",
+ "Current primary replication epoch.",
+ "1",
+ primary.epoch,
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.primary.head_log_index",
+ "Current primary head log index.",
+ "1",
+ primary.head_log_index,
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.primary.retained_floor",
+ "Current primary retained floor log index.",
+ "1",
+ primary.retained_floor,
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.primary.replica_count",
+ "Replica progress reporters known by this primary.",
+ "1",
+ primary.replica_count,
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.primary.stale_epoch_replica_count",
+ "Replica reporters currently on stale epochs.",
+ "1",
+ primary.stale_epoch_replica_count,
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.primary.max_replica_lag",
+ "Maximum reported lag (log frames) across replicas.",
+ "1",
+ primary.max_replica_lag,
+ &[],
+ &time_unix_nano,
+ ));
+
+ otel_metrics.push(otel_sum_metric(
+ "kitedb.replication.primary.append_attempts",
+ "Total replication append attempts on the primary commit path.",
+ "1",
+ primary.append_attempts,
+ true,
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_sum_metric(
+ "kitedb.replication.primary.append_failures",
+ "Total replication append failures on the primary commit path.",
+ "1",
+ primary.append_failures,
+ true,
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_sum_metric(
+ "kitedb.replication.primary.append_successes",
+ "Total replication append successes on the primary commit path.",
+ "1",
+ primary.append_successes,
+ true,
+ &[],
+ &time_unix_nano,
+ ));
+ }
+
+ if let Some(replica) = metrics.replication.replica.as_ref() {
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.replica.applied_epoch",
+ "Replica applied epoch.",
+ "1",
+ replica.applied_epoch,
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.replica.applied_log_index",
+ "Replica applied log index.",
+ "1",
+ replica.applied_log_index,
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.replica.needs_reseed",
+ "Whether replica currently requires snapshot reseed (1 yes, 0 no).",
+ "1",
+ if replica.needs_reseed { 1 } else { 0 },
+ &[],
+ &time_unix_nano,
+ ));
+ otel_metrics.push(otel_gauge_metric(
+ "kitedb.replication.replica.last_error_present",
+ "Whether replica currently has a non-empty last_error value (1 yes, 0 no).",
+ "1",
+ if replica.last_error.is_some() { 1 } else { 0 },
+ &[],
+ &time_unix_nano,
+ ));
+ }
+
+ let payload = json!({
+ "resourceMetrics": [
+ {
+ "resource": {
+ "attributes": [
+ otel_attr_string("service.name", "kitedb"),
+ otel_attr_string("kitedb.database.path", metrics.path.as_str()),
+ otel_attr_string("kitedb.metrics.scope", "replication"),
+ ]
+ },
+ "scopeMetrics": [
+ {
+ "scope": {
+ "name": "kitedb.metrics.replication",
+ "version": env!("CARGO_PKG_VERSION"),
+ },
+ "metrics": otel_metrics,
+ }
+ ]
+ }
+ ]
+ });
+
+ serde_json::to_string(&payload).unwrap_or_else(|_| "{\"resourceMetrics\":[]}".to_string())
+}
+
pub fn health_check_single_file(db: &SingleFileDB) -> HealthCheckResult {
let mut checks = Vec::new();
@@ -214,6 +906,77 @@ pub fn health_check_single_file(db: &SingleFileDB) -> HealthCheckResult {
HealthCheckResult { healthy, checks }
}
+fn build_replication_metrics(
+ primary: Option<PrimaryReplicationStatus>,
+ replica: Option<ReplicaReplicationStatus>,
+) -> ReplicationMetrics {
+ let role = if primary.is_some() {
+ "primary"
+ } else if replica.is_some() {
+ "replica"
+ } else {
+ "disabled"
+ };
+
+ ReplicationMetrics {
+ enabled: role != "disabled",
+ role: role.to_string(),
+ primary: primary.map(build_primary_replication_metrics),
+ replica: replica.map(build_replica_replication_metrics),
+ }
+}
+
+fn build_primary_replication_metrics(
+ status: PrimaryReplicationStatus,
+) -> PrimaryReplicationMetrics {
+ let mut max_replica_lag = 0u64;
+ let mut min_replica_applied_log_index: Option<u64> = None;
+ let mut stale_epoch_replica_count = 0u64;
+
+ for lag in &status.replica_lags {
+ if lag.epoch != status.epoch {
+ stale_epoch_replica_count = stale_epoch_replica_count.saturating_add(1);
+ }
+
+ if lag.epoch == status.epoch {
+ let lag_value = status.head_log_index.saturating_sub(lag.applied_log_index);
+ max_replica_lag = max_replica_lag.max(lag_value);
+ min_replica_applied_log_index = Some(match min_replica_applied_log_index {
+ Some(current) => current.min(lag.applied_log_index),
+ None => lag.applied_log_index,
+ });
+ } else if lag.epoch < status.epoch {
+ max_replica_lag = max_replica_lag.max(status.head_log_index);
+ }
+ }
+
+ PrimaryReplicationMetrics {
+ epoch: status.epoch as i64,
+ head_log_index: status.head_log_index as i64,
+ retained_floor: status.retained_floor as i64,
+ replica_count: status.replica_lags.len() as i64,
+ stale_epoch_replica_count: stale_epoch_replica_count as i64,
+ max_replica_lag: max_replica_lag as i64,
+ min_replica_applied_log_index: min_replica_applied_log_index.map(|value| value as i64),
+ sidecar_path: status.sidecar_path.to_string_lossy().to_string(),
+ last_token: status.last_token.map(|token| token.to_string()),
+ append_attempts: status.append_attempts as i64,
+ append_failures: status.append_failures as i64,
+ append_successes: status.append_successes as i64,
+ }
+}
+
+fn build_replica_replication_metrics(
+ status: ReplicaReplicationStatus,
+) -> ReplicaReplicationMetrics {
+ ReplicaReplicationMetrics {
+ applied_epoch: status.applied_epoch as i64,
+ applied_log_index: status.applied_log_index as i64,
+ needs_reseed: status.needs_reseed,
+ last_error: status.last_error,
+ }
+}
+
fn calc_hit_rate(hits: u64, misses: u64) -> f64 {
let total = hits + misses;
if total > 0 {
@@ -338,3 +1101,111 @@ fn system_time_to_millis(time: SystemTime) -> i64 {
.unwrap_or_default()
.as_millis() as i64
}
+
+fn escape_prometheus_label_value(value: &str) -> String {
+ value
+ .replace('\\', "\\\\")
+ .replace('"', "\\\"")
+ .replace('\n', "\\n")
+}
+
+fn format_prometheus_labels(labels: &[(&str, &str)]) -> String {
+ if labels.is_empty() {
+ return String::new();
+ }
+
+ let rendered = labels
+ .iter()
+ .map(|(key, value)| format!("{key}=\"{}\"", escape_prometheus_label_value(value)))
+ .collect::<Vec<_>>()
+ .join(",");
+ format!("{{{rendered}}}")
+}
+
+fn push_prometheus_help(lines: &mut Vec<String>, metric: &str, metric_type: &str, help: &str) {
+ lines.push(format!("# HELP {metric} {help}"));
+ lines.push(format!("# TYPE {metric} {metric_type}"));
+}
+
+fn push_prometheus_sample(
+ lines: &mut Vec<String>,
+ metric: &str,
+ value: i64,
+ labels: &[(&str, &str)],
+) {
+ lines.push(format!(
+ "{metric}{} {value}",
+ format_prometheus_labels(labels)
+ ));
+}
+
+fn metric_time_unix_nano(metrics: &DatabaseMetrics) -> String {
+ let millis = metrics.collected_at_ms.max(0) as u64;
+ millis.saturating_mul(1_000_000).to_string()
+}
+
+fn otel_attr_string(key: &str, value: &str) -> Value {
+ json!({
+ "key": key,
+ "value": { "stringValue": value }
+ })
+}
+
+fn otel_attributes(labels: &[(&str, &str)]) -> Vec<Value> {
+ labels
+ .iter()
+ .map(|(key, value)| otel_attr_string(key, value))
+ .collect()
+}
+
+fn otel_gauge_metric(
+ name: &str,
+ description: &str,
+ unit: &str,
+ value: i64,
+ labels: &[(&str, &str)],
+ time_unix_nano: &str,
+) -> Value {
+ json!({
+ "name": name,
+ "description": description,
+ "unit": unit,
+ "gauge": {
+ "dataPoints": [
+ {
+ "attributes": otel_attributes(labels),
+ "asInt": value,
+ "timeUnixNano": time_unix_nano,
+ }
+ ]
+ }
+ })
+}
+
+fn otel_sum_metric(
+ name: &str,
+ description: &str,
+ unit: &str,
+ value: i64,
+ is_monotonic: bool,
+ labels: &[(&str, &str)],
+ time_unix_nano: &str,
+) -> Value {
+ json!({
+ "name": name,
+ "description": description,
+ "unit": unit,
+ "sum": {
+ // CUMULATIVE
+ "aggregationTemporality": 2,
+ "isMonotonic": is_monotonic,
+ "dataPoints": [
+ {
+ "attributes": otel_attributes(labels),
+ "asInt": value,
+ "timeUnixNano": time_unix_nano,
+ }
+ ]
+ }
+ })
+}
diff --git a/ray-rs/src/napi_bindings/database.rs b/ray-rs/src/napi_bindings/database.rs
index f8b3596..f1eecec 100644
--- a/ray-rs/src/napi_bindings/database.rs
+++ b/ray-rs/src/napi_bindings/database.rs
@@ -5,6 +5,7 @@
use napi::bindgen_prelude::*;
use napi_derive::napi;
use std::path::PathBuf;
+use std::str::FromStr;
use super::traversal::{
JsPathConfig, JsPathResult, JsTraversalDirection, JsTraversalResult, JsTraversalStep,
@@ -25,6 +26,11 @@ use crate::core::single_file::{
};
use crate::export as ray_export;
use crate::metrics as core_metrics;
+use crate::replication::primary::{
+ PrimaryReplicationStatus, PrimaryRetentionOutcome, ReplicaLagStatus,
+};
+use crate::replication::replica::ReplicaReplicationStatus;
+use crate::replication::types::{CommitToken, ReplicationRole as RustReplicationRole};
use crate::streaming;
use crate::types::{
CheckResult as RustCheckResult, ETypeId, Edge, EdgeWithProps as CoreEdgeWithProps, NodeId,
@@ -83,6 +89,25 @@ impl From for RustSnapshotParseMode {
}
}
+/// Replication role for single-file open options
+#[napi(string_enum)]
+#[derive(Debug)]
+pub enum JsReplicationRole {
+ Disabled,
+ Primary,
+ Replica,
+}
+
+impl From<JsReplicationRole> for RustReplicationRole {
+ fn from(role: JsReplicationRole) -> Self {
+ match role {
+ JsReplicationRole::Disabled => RustReplicationRole::Disabled,
+ JsReplicationRole::Primary => RustReplicationRole::Primary,
+ JsReplicationRole::Replica => RustReplicationRole::Replica,
+ }
+ }
+}
+
// ============================================================================
// Open Options
// ============================================================================
@@ -135,6 +160,20 @@ pub struct OpenOptions {
pub group_commit_window_ms: Option,
/// Snapshot parse mode: "Strict" or "Salvage" (single-file only)
pub snapshot_parse_mode: Option,
+ /// Replication role: "Disabled", "Primary", or "Replica"
+ pub replication_role: Option<JsReplicationRole>,
+ /// Replication sidecar path override
+ pub replication_sidecar_path: Option<String>,
+ /// Source primary db path (replica role only)
+ pub replication_source_db_path: Option<String>,
+ /// Source primary sidecar path (replica role only)
+ pub replication_source_sidecar_path: Option<String>,
+ /// Segment rotation threshold in bytes (primary role only)
+ pub replication_segment_max_bytes: Option<i64>,
+ /// Minimum retained entries window (primary role only)
+ pub replication_retention_min_entries: Option<i64>,
+ /// Minimum retained segment age in milliseconds (primary role only)
+ pub replication_retention_min_ms: Option<i64>,
}
impl From for RustOpenOptions {
@@ -221,6 +260,33 @@ impl From for RustOpenOptions {
if let Some(mode) = opts.snapshot_parse_mode {
rust_opts = rust_opts.snapshot_parse_mode(mode.into());
}
+ if let Some(role) = opts.replication_role {
+ rust_opts = rust_opts.replication_role(role.into());
+ }
+ if let Some(path) = opts.replication_sidecar_path {
+ rust_opts = rust_opts.replication_sidecar_path(path);
+ }
+ if let Some(path) = opts.replication_source_db_path {
+ rust_opts = rust_opts.replication_source_db_path(path);
+ }
+ if let Some(path) = opts.replication_source_sidecar_path {
+ rust_opts = rust_opts.replication_source_sidecar_path(path);
+ }
+ if let Some(value) = opts.replication_segment_max_bytes {
+ if value >= 0 {
+ rust_opts = rust_opts.replication_segment_max_bytes(value as u64);
+ }
+ }
+ if let Some(value) = opts.replication_retention_min_entries {
+ if value >= 0 {
+ rust_opts = rust_opts.replication_retention_min_entries(value as u64);
+ }
+ }
+ if let Some(value) = opts.replication_retention_min_ms {
+ if value >= 0 {
+ rust_opts = rust_opts.replication_retention_min_ms(value as u64);
+ }
+ }
rust_opts
}
@@ -374,6 +440,102 @@ pub struct MvccStats {
pub committed_writes_pruned: i64,
}
+/// Per-replica lag entry on primary status
+#[napi(object)]
+pub struct JsReplicaLagStatus {
+ pub replica_id: String,
+ pub epoch: i64,
+ pub applied_log_index: i64,
+}
+
+/// Primary replication runtime status
+#[napi(object)]
+pub struct JsPrimaryReplicationStatus {
+ pub role: String,
+ pub epoch: i64,
+ pub head_log_index: i64,
+ pub retained_floor: i64,
+ pub replica_lags: Vec<JsReplicaLagStatus>,
+ pub sidecar_path: String,
+ pub last_token: Option<String>,
+ pub append_attempts: i64,
+ pub append_failures: i64,
+ pub append_successes: i64,
+}
+
+/// Replica replication runtime status
+#[napi(object)]
+pub struct JsReplicaReplicationStatus {
+ pub role: String,
+ pub source_db_path: Option<String>,
+ pub source_sidecar_path: Option<String>,
+ pub applied_epoch: i64,
+ pub applied_log_index: i64,
+ pub last_error: Option<String>,
+ pub needs_reseed: bool,
+}
+
+/// Retention run outcome
+#[napi(object)]
+pub struct JsPrimaryRetentionOutcome {
+ pub pruned_segments: i64,
+ pub retained_floor: i64,
+}
+
+impl From<ReplicaLagStatus> for JsReplicaLagStatus {
+ fn from(value: ReplicaLagStatus) -> Self {
+ Self {
+ replica_id: value.replica_id,
+ epoch: value.epoch as i64,
+ applied_log_index: value.applied_log_index as i64,
+ }
+ }
+}
+
+impl From<PrimaryReplicationStatus> for JsPrimaryReplicationStatus {
+ fn from(value: PrimaryReplicationStatus) -> Self {
+ Self {
+ role: value.role.to_string(),
+ epoch: value.epoch as i64,
+ head_log_index: value.head_log_index as i64,
+ retained_floor: value.retained_floor as i64,
+ replica_lags: value.replica_lags.into_iter().map(Into::into).collect(),
+ sidecar_path: value.sidecar_path.to_string_lossy().to_string(),
+ last_token: value.last_token.map(|token| token.to_string()),
+ append_attempts: value.append_attempts as i64,
+ append_failures: value.append_failures as i64,
+ append_successes: value.append_successes as i64,
+ }
+ }
+}
+
+impl From<ReplicaReplicationStatus> for JsReplicaReplicationStatus {
+ fn from(value: ReplicaReplicationStatus) -> Self {
+ Self {
+ role: value.role.to_string(),
+ source_db_path: value
+ .source_db_path
+ .map(|path| path.to_string_lossy().to_string()),
+ source_sidecar_path: value
+ .source_sidecar_path
+ .map(|path| path.to_string_lossy().to_string()),
+ applied_epoch: value.applied_epoch as i64,
+ applied_log_index: value.applied_log_index as i64,
+ last_error: value.last_error,
+ needs_reseed: value.needs_reseed,
+ }
+ }
+}
+
+impl From<PrimaryRetentionOutcome> for JsPrimaryRetentionOutcome {
+ fn from(value: PrimaryRetentionOutcome) -> Self {
+ Self {
+ pruned_segments: value.pruned_segments as i64,
+ retained_floor: value.retained_floor as i64,
+ }
+ }
+}
+
/// Options for export
#[napi(object)]
pub struct ExportOptions {
@@ -601,6 +763,41 @@ pub struct MvccMetrics {
pub committed_writes_pruned: i64,
}
+/// Primary replication metrics
+#[napi(object)]
+pub struct PrimaryReplicationMetrics {
+ pub epoch: i64,
+ pub head_log_index: i64,
+ pub retained_floor: i64,
+ pub replica_count: i64,
+ pub stale_epoch_replica_count: i64,
+ pub max_replica_lag: i64,
+ pub min_replica_applied_log_index: Option<i64>,
+ pub sidecar_path: String,
+ pub last_token: Option<String>,
+ pub append_attempts: i64,
+ pub append_failures: i64,
+ pub append_successes: i64,
+}
+
+/// Replica replication metrics
+#[napi(object)]
+pub struct ReplicaReplicationMetrics {
+ pub applied_epoch: i64,
+ pub applied_log_index: i64,
+ pub needs_reseed: bool,
+ pub last_error: Option<String>,
+}
+
+/// Replication metrics
+#[napi(object)]
+pub struct ReplicationMetrics {
+ pub enabled: bool,
+ pub role: String,
+ pub primary: Option<PrimaryReplicationMetrics>,
+ pub replica: Option<ReplicaReplicationMetrics>,
+}
+
/// Memory metrics
#[napi(object)]
pub struct MemoryMetrics {
@@ -619,6 +816,7 @@ pub struct DatabaseMetrics {
pub data: DataMetrics,
pub cache: CacheMetrics,
pub mvcc: Option<MvccMetrics>,
+ pub replication: ReplicationMetrics,
pub memory: MemoryMetrics,
/// Timestamp in milliseconds since epoch
pub collected_at: i64,
@@ -639,6 +837,25 @@ pub struct HealthCheckResult {
pub checks: Vec,
}
+/// OTLP HTTP metrics push result.
+#[napi(object)]
+pub struct OtlpHttpExportResult {
+ pub status_code: i64,
+ pub response_body: String,
+}
+
+/// OTLP collector push options (host runtime).
+#[napi(object)]
+#[derive(Default, Clone)]
+pub struct PushReplicationMetricsOtelOptions {
+ pub timeout_ms: Option<i64>,
+ pub bearer_token: Option<String>,
+ pub https_only: Option<bool>,
+ pub ca_cert_pem_path: Option<String>,
+ pub client_cert_pem_path: Option<String>,
+ pub client_key_pem_path: Option<String>,
+}
+
impl From<core_metrics::CacheLayerMetrics> for CacheLayerMetrics {
fn from(metrics: core_metrics::CacheLayerMetrics) -> Self {
CacheLayerMetrics {
@@ -695,6 +912,47 @@ impl From for MvccMetrics {
}
}
+impl From<core_metrics::PrimaryReplicationMetrics> for PrimaryReplicationMetrics {
+ fn from(metrics: core_metrics::PrimaryReplicationMetrics) -> Self {
+ PrimaryReplicationMetrics {
+ epoch: metrics.epoch,
+ head_log_index: metrics.head_log_index,
+ retained_floor: metrics.retained_floor,
+ replica_count: metrics.replica_count,
+ stale_epoch_replica_count: metrics.stale_epoch_replica_count,
+ max_replica_lag: metrics.max_replica_lag,
+ min_replica_applied_log_index: metrics.min_replica_applied_log_index,
+ sidecar_path: metrics.sidecar_path,
+ last_token: metrics.last_token,
+ append_attempts: metrics.append_attempts,
+ append_failures: metrics.append_failures,
+ append_successes: metrics.append_successes,
+ }
+ }
+}
+
+impl From<core_metrics::ReplicaReplicationMetrics> for ReplicaReplicationMetrics {
+ fn from(metrics: core_metrics::ReplicaReplicationMetrics) -> Self {
+ ReplicaReplicationMetrics {
+ applied_epoch: metrics.applied_epoch,
+ applied_log_index: metrics.applied_log_index,
+ needs_reseed: metrics.needs_reseed,
+ last_error: metrics.last_error,
+ }
+ }
+}
+
+impl From<core_metrics::ReplicationMetrics> for ReplicationMetrics {
+ fn from(metrics: core_metrics::ReplicationMetrics) -> Self {
+ ReplicationMetrics {
+ enabled: metrics.enabled,
+ role: metrics.role,
+ primary: metrics.primary.map(Into::into),
+ replica: metrics.replica.map(Into::into),
+ }
+ }
+}
+
impl From<core_metrics::MemoryMetrics> for MemoryMetrics {
fn from(metrics: core_metrics::MemoryMetrics) -> Self {
MemoryMetrics {
@@ -715,6 +973,7 @@ impl From for DatabaseMetrics {
data: metrics.data.into(),
cache: metrics.cache.into(),
mvcc: metrics.mvcc.map(Into::into),
+ replication: metrics.replication.into(),
memory: metrics.memory.into(),
collected_at: metrics.collected_at_ms,
}
@@ -740,6 +999,15 @@ impl From for HealthCheckResult {
}
}
+impl From<core_metrics::OtlpHttpExportResult> for OtlpHttpExportResult {
+ fn from(result: core_metrics::OtlpHttpExportResult) -> Self {
+ OtlpHttpExportResult {
+ status_code: result.status_code,
+ response_body: result.response_body,
+ }
+ }
+}
+
// ============================================================================
// Property Value (JS-compatible)
// ============================================================================
@@ -1013,6 +1281,18 @@ impl Database {
}
}
+ /// Commit the current transaction and return replication token when primary replication is enabled.
+ #[napi]
+ pub fn commit_with_token(&self) -> Result<Option<String>> {
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .commit_with_token()
+ .map(|token| token.map(|value| value.to_string()))
+ .map_err(|e| Error::from_reason(format!("Failed to commit with token: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
/// Rollback the current transaction
#[napi]
pub fn rollback(&self) -> Result<()> {
@@ -1033,6 +1313,127 @@ impl Database {
}
}
+ /// Wait until the DB has observed at least the provided commit token.
+ #[napi]
+ pub fn wait_for_token(&self, token: String, timeout_ms: i64) -> Result<bool> {
+ if timeout_ms < 0 {
+ return Err(Error::from_reason("timeoutMs must be non-negative"));
+ }
+ let token = CommitToken::from_str(&token)
+ .map_err(|e| Error::from_reason(format!("Invalid commit token: {e}")))?;
+
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .wait_for_token(token, timeout_ms as u64)
+ .map_err(|e| Error::from_reason(format!("Failed waiting for token: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
+ // ========================================================================
+ // Replication Methods
+ // ========================================================================
+
+ /// Primary replication status when role=primary, else null.
+ #[napi]
+ pub fn primary_replication_status(&self) -> Result<Option<PrimaryReplicationStatus>> {
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => Ok(db.primary_replication_status().map(Into::into)),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
+ /// Replica replication status when role=replica, else null.
+ #[napi]
+ pub fn replica_replication_status(&self) -> Result<Option<ReplicaReplicationStatus>> {
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => Ok(db.replica_replication_status().map(Into::into)),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
+ /// Promote this primary to the next replication epoch.
+ #[napi]
+ pub fn primary_promote_to_next_epoch(&self) -> Result<i64> {
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .primary_promote_to_next_epoch()
+ .map(|epoch| epoch as i64)
+ .map_err(|e| Error::from_reason(format!("Failed to promote primary: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
+ /// Report replica applied cursor to primary for retention decisions.
+ #[napi]
+ pub fn primary_report_replica_progress(
+ &self,
+ replica_id: String,
+ epoch: i64,
+ applied_log_index: i64,
+ ) -> Result<()> {
+ if epoch < 0 || applied_log_index < 0 {
+ return Err(Error::from_reason(
+ "epoch and appliedLogIndex must be non-negative",
+ ));
+ }
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .primary_report_replica_progress(&replica_id, epoch as u64, applied_log_index as u64)
+ .map_err(|e| Error::from_reason(format!("Failed to report replica progress: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
+ /// Execute replication retention on primary.
+ #[napi]
+ pub fn primary_run_retention(&self) -> Result {
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .primary_run_retention()
+ .map(Into::into)
+ .map_err(|e| Error::from_reason(format!("Failed to run retention: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
+ /// Bootstrap a replica from the primary snapshot.
+ #[napi]
+ pub fn replica_bootstrap_from_snapshot(&self) -> Result<()> {
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .replica_bootstrap_from_snapshot()
+ .map_err(|e| Error::from_reason(format!("Failed to bootstrap replica: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
+ /// Pull and apply up to maxFrames replication frames on replica.
+ #[napi]
+ pub fn replica_catch_up_once(&self, max_frames: i64) -> Result<i64> {
+ if max_frames < 0 {
+ return Err(Error::from_reason("maxFrames must be non-negative"));
+ }
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .replica_catch_up_once(max_frames as usize)
+ .map(|count| count as i64)
+ .map_err(|e| Error::from_reason(format!("Failed replica catch-up: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
+ /// Force a replica reseed from current primary snapshot.
+ #[napi]
+ pub fn replica_reseed_from_snapshot(&self) -> Result<()> {
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .replica_reseed_from_snapshot()
+ .map_err(|e| Error::from_reason(format!("Failed to reseed replica: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
// ========================================================================
// Node Operations
// ========================================================================
@@ -2891,6 +3292,89 @@ pub fn collect_metrics(db: &Database) -> Result<DatabaseMetrics> {
}
}
+#[napi]
+pub fn collect_replication_metrics_prometheus(db: &Database) -> Result<String> {
+ match db.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => {
+ Ok(core_metrics::collect_replication_metrics_prometheus_single_file(db))
+ }
+ None => Err(Error::from_reason("Database is closed")),
+ }
+}
+
+#[napi]
+pub fn collect_replication_metrics_otel_json(db: &Database) -> Result<String> {
+ match db.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => {
+ Ok(core_metrics::collect_replication_metrics_otel_json_single_file(db))
+ }
+ None => Err(Error::from_reason("Database is closed")),
+ }
+}
+
+#[napi]
+pub fn push_replication_metrics_otel_json(
+ db: &Database,
+ endpoint: String,
+ timeout_ms: i64,
+ bearer_token: Option<String>,
+) -> Result<OtlpHttpExportResult> {
+ if timeout_ms <= 0 {
+ return Err(Error::from_reason("timeoutMs must be positive"));
+ }
+
+ match db.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => {
+ core_metrics::push_replication_metrics_otel_json_single_file(
+ db,
+ &endpoint,
+ timeout_ms as u64,
+ bearer_token.as_deref(),
+ )
+ .map(Into::into)
+ .map_err(|e| Error::from_reason(format!("Failed to push replication metrics: {e}")))
+ }
+ None => Err(Error::from_reason("Database is closed")),
+ }
+}
+
+#[napi]
+pub fn push_replication_metrics_otel_json_with_options(
+ db: &Database,
+ endpoint: String,
+ options: Option<PushReplicationMetricsOtelOptions>,
+) -> Result<OtlpHttpExportResult> {
+ let options = options.unwrap_or_default();
+ let timeout_ms = options.timeout_ms.unwrap_or(5_000);
+ if timeout_ms <= 0 {
+ return Err(Error::from_reason("timeoutMs must be positive"));
+ }
+
+ let core_options = core_metrics::OtlpHttpPushOptions {
+ timeout_ms: timeout_ms as u64,
+ bearer_token: options.bearer_token,
+ tls: core_metrics::OtlpHttpTlsOptions {
+ https_only: options.https_only.unwrap_or(false),
+ ca_cert_pem_path: options.ca_cert_pem_path,
+ client_cert_pem_path: options.client_cert_pem_path,
+ client_key_pem_path: options.client_key_pem_path,
+ },
+ };
+
+ match db.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => {
+ core_metrics::push_replication_metrics_otel_json_single_file_with_options(
+ db,
+ &endpoint,
+ &core_options,
+ )
+ .map(Into::into)
+ .map_err(|e| Error::from_reason(format!("Failed to push replication metrics: {e}")))
+ }
+ None => Err(Error::from_reason("Database is closed")),
+ }
+}
+
#[napi]
pub fn health_check(db: &Database) -> Result {
match db.inner.as_ref() {
diff --git a/ray-rs/src/pyo3_bindings/database.rs b/ray-rs/src/pyo3_bindings/database.rs
index dbae208..128d122 100644
--- a/ray-rs/src/pyo3_bindings/database.rs
+++ b/ray-rs/src/pyo3_bindings/database.rs
@@ -5,7 +5,9 @@
use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::*;
+use pyo3::types::{PyDict, PyList};
use std::path::PathBuf;
+use std::str::FromStr;
use std::sync::RwLock;
use crate::backup as core_backup;
@@ -14,6 +16,7 @@ use crate::core::single_file::{
VacuumOptions as RustVacuumOptions,
};
use crate::metrics as core_metrics;
+use crate::replication::types::CommitToken;
use crate::types::{ETypeId, EdgeWithProps as CoreEdgeWithProps, NodeId, PropKeyId};
// Import from modular structure
@@ -268,6 +271,200 @@ impl PyDatabase {
dispatch_ok!(self, |db| db.has_transaction(), |_db| false)
}
+ /// Commit and return replication commit token (e.g. "2:41") when available.
+ fn commit_with_token(&self) -> PyResult<Option<String>> {
+ dispatch!(
+ self,
+ |db| db
+ .commit_with_token()
+ .map(|token| token.map(|value| value.to_string()))
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to commit: {e}"))),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
+ /// Wait until this DB has observed at least the provided commit token.
+ fn wait_for_token(&self, token: String, timeout_ms: i64) -> PyResult<bool> {
+ if timeout_ms < 0 {
+ return Err(PyRuntimeError::new_err("timeout_ms must be non-negative"));
+ }
+ let token = CommitToken::from_str(&token)
+ .map_err(|e| PyRuntimeError::new_err(format!("Invalid token: {e}")))?;
+ dispatch!(
+ self,
+ |db| db
+ .wait_for_token(token, timeout_ms as u64)
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed waiting for token: {e}"))),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
+ /// Primary replication status dictionary when role=primary, else None.
+ fn primary_replication_status(&self, py: Python<'_>) -> PyResult<Option<PyObject>> {
+ let guard = self
+ .inner
+ .read()
+ .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+ match guard.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => {
+ let Some(status) = db.primary_replication_status() else {
+ return Ok(None);
+ };
+
+ let out = PyDict::new_bound(py);
+ out.set_item("role", status.role.to_string())?;
+ out.set_item("epoch", status.epoch)?;
+ out.set_item("head_log_index", status.head_log_index)?;
+ out.set_item("retained_floor", status.retained_floor)?;
+ out.set_item(
+ "sidecar_path",
+ status.sidecar_path.to_string_lossy().to_string(),
+ )?;
+ out.set_item(
+ "last_token",
+ status.last_token.map(|token| token.to_string()),
+ )?;
+ out.set_item("append_attempts", status.append_attempts)?;
+ out.set_item("append_failures", status.append_failures)?;
+ out.set_item("append_successes", status.append_successes)?;
+
+ let lags = PyList::empty_bound(py);
+ for lag in status.replica_lags {
+ let lag_item = PyDict::new_bound(py);
+ lag_item.set_item("replica_id", lag.replica_id)?;
+ lag_item.set_item("epoch", lag.epoch)?;
+ lag_item.set_item("applied_log_index", lag.applied_log_index)?;
+ lags.append(lag_item)?;
+ }
+ out.set_item("replica_lags", lags)?;
+
+ Ok(Some(out.into_py(py)))
+ }
+ None => Err(PyRuntimeError::new_err("Database is closed")),
+ }
+ }
+
+ /// Replica replication status dictionary when role=replica, else None.
+ fn replica_replication_status(&self, py: Python<'_>) -> PyResult<Option<PyObject>> {
+ let guard = self
+ .inner
+ .read()
+ .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+ match guard.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => {
+ let Some(status) = db.replica_replication_status() else {
+ return Ok(None);
+ };
+
+ let out = PyDict::new_bound(py);
+ out.set_item("role", status.role.to_string())?;
+ out.set_item(
+ "source_db_path",
+ status
+ .source_db_path
+ .map(|path| path.to_string_lossy().to_string()),
+ )?;
+ out.set_item(
+ "source_sidecar_path",
+ status
+ .source_sidecar_path
+ .map(|path| path.to_string_lossy().to_string()),
+ )?;
+ out.set_item("applied_epoch", status.applied_epoch)?;
+ out.set_item("applied_log_index", status.applied_log_index)?;
+ out.set_item("last_error", status.last_error)?;
+ out.set_item("needs_reseed", status.needs_reseed)?;
+ Ok(Some(out.into_py(py)))
+ }
+ None => Err(PyRuntimeError::new_err("Database is closed")),
+ }
+ }
+
+ /// Promote this primary to the next replication epoch.
+ fn primary_promote_to_next_epoch(&self) -> PyResult<i64> {
+ dispatch!(
+ self,
+ |db| db
+ .primary_promote_to_next_epoch()
+ .map(|value| value as i64)
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to promote primary: {e}"))),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
+ /// Report replica progress cursor to primary.
+ fn primary_report_replica_progress(
+ &self,
+ replica_id: String,
+ epoch: i64,
+ applied_log_index: i64,
+ ) -> PyResult<()> {
+ if epoch < 0 || applied_log_index < 0 {
+ return Err(PyRuntimeError::new_err(
+ "epoch and applied_log_index must be non-negative",
+ ));
+ }
+ dispatch!(
+ self,
+ |db| db
+ .primary_report_replica_progress(&replica_id, epoch as u64, applied_log_index as u64)
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to report replica progress: {e}"))),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
+ /// Run primary retention and return (pruned_segments, retained_floor).
+ fn primary_run_retention(&self) -> PyResult<(i64, i64)> {
+ dispatch!(
+ self,
+ |db| db
+ .primary_run_retention()
+ .map(|outcome| (
+ outcome.pruned_segments as i64,
+ outcome.retained_floor as i64
+ ))
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to run retention: {e}"))),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
+ /// Bootstrap replica state from source snapshot.
+ fn replica_bootstrap_from_snapshot(&self) -> PyResult<()> {
+ dispatch!(
+ self,
+ |db| db
+ .replica_bootstrap_from_snapshot()
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to bootstrap replica: {e}"))),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
+ /// Pull and apply at most max_frames frames on replica.
+ fn replica_catch_up_once(&self, max_frames: i64) -> PyResult<i64> {
+ if max_frames < 0 {
+ return Err(PyRuntimeError::new_err("max_frames must be non-negative"));
+ }
+ dispatch!(
+ self,
+ |db| db
+ .replica_catch_up_once(max_frames as usize)
+ .map(|count| count as i64)
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed replica catch-up: {e}"))),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
+ /// Force a replica reseed from source snapshot.
+ fn replica_reseed_from_snapshot(&self) -> PyResult<()> {
+ dispatch!(
+ self,
+ |db| db
+ .replica_reseed_from_snapshot()
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to reseed replica: {e}"))),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
// ==========================================================================
// Node Operations
// ==========================================================================
@@ -1542,6 +1739,86 @@ pub fn collect_metrics(db: &PyDatabase) -> PyResult {
}
}
+#[pyfunction]
+pub fn collect_replication_metrics_prometheus(db: &PyDatabase) -> PyResult<String> {
+ let guard = db
+ .inner
+ .read()
+ .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+ match guard.as_ref() {
+ Some(DatabaseInner::SingleFile(d)) => {
+ Ok(core_metrics::collect_replication_metrics_prometheus_single_file(d))
+ }
+ None => Err(PyRuntimeError::new_err("Database is closed")),
+ }
+}
+
+#[pyfunction]
+pub fn collect_replication_metrics_otel_json(db: &PyDatabase) -> PyResult<String> {
+ let guard = db
+ .inner
+ .read()
+ .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+ match guard.as_ref() {
+ Some(DatabaseInner::SingleFile(d)) => {
+ Ok(core_metrics::collect_replication_metrics_otel_json_single_file(d))
+ }
+ None => Err(PyRuntimeError::new_err("Database is closed")),
+ }
+}
+
+#[pyfunction]
+#[pyo3(signature = (
+ db,
+ endpoint,
+ timeout_ms=5000,
+ bearer_token=None,
+ https_only=false,
+ ca_cert_pem_path=None,
+ client_cert_pem_path=None,
+ client_key_pem_path=None
+))]
+pub fn push_replication_metrics_otel_json(
+ db: &PyDatabase,
+ endpoint: String,
+ timeout_ms: i64,
+ bearer_token: Option<String>,
+ https_only: bool,
+ ca_cert_pem_path: Option<String>,
+ client_cert_pem_path: Option<String>,
+ client_key_pem_path: Option<String>,
+) -> PyResult<(i64, String)> {
+ if timeout_ms <= 0 {
+ return Err(PyRuntimeError::new_err("timeout_ms must be positive"));
+ }
+
+ let options = core_metrics::OtlpHttpPushOptions {
+ timeout_ms: timeout_ms as u64,
+ bearer_token,
+ tls: core_metrics::OtlpHttpTlsOptions {
+ https_only,
+ ca_cert_pem_path,
+ client_cert_pem_path,
+ client_key_pem_path,
+ },
+ };
+
+ let guard = db
+ .inner
+ .read()
+ .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+ match guard.as_ref() {
+ Some(DatabaseInner::SingleFile(d)) => {
+ let result = core_metrics::push_replication_metrics_otel_json_single_file_with_options(
+ d, &endpoint, &options,
+ )
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to push replication metrics: {e}")))?;
+ Ok((result.status_code, result.response_body))
+ }
+ None => Err(PyRuntimeError::new_err("Database is closed")),
+ }
+}
+
#[pyfunction]
pub fn health_check(db: &PyDatabase) -> PyResult {
let guard = db
diff --git a/ray-rs/src/pyo3_bindings/mod.rs b/ray-rs/src/pyo3_bindings/mod.rs
index fe385cb..3ec942e 100644
--- a/ray-rs/src/pyo3_bindings/mod.rs
+++ b/ray-rs/src/pyo3_bindings/mod.rs
@@ -76,6 +76,9 @@ pub fn kitedb(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::()?;
m.add_class::()?;
m.add_class::()?;
+ m.add_class::()?;
+ m.add_class::()?;
+ m.add_class::()?;
m.add_class::()?;
m.add_class::()?;
m.add_class::()?;
@@ -114,6 +117,18 @@ pub fn kitedb(m: &Bound<'_, PyModule>) -> PyResult<()> {
// Standalone functions
m.add_function(wrap_pyfunction!(database::open_database, m)?)?;
m.add_function(wrap_pyfunction!(database::collect_metrics, m)?)?;
+ m.add_function(wrap_pyfunction!(
+ database::collect_replication_metrics_prometheus,
+ m
+ )?)?;
+ m.add_function(wrap_pyfunction!(
+ database::collect_replication_metrics_otel_json,
+ m
+ )?)?;
+ m.add_function(wrap_pyfunction!(
+ database::push_replication_metrics_otel_json,
+ m
+ )?)?;
m.add_function(wrap_pyfunction!(database::health_check, m)?)?;
m.add_function(wrap_pyfunction!(database::create_backup, m)?)?;
m.add_function(wrap_pyfunction!(database::restore_backup, m)?)?;
diff --git a/ray-rs/tests/replication_metrics_phase_d.rs b/ray-rs/tests/replication_metrics_phase_d.rs
new file mode 100644
index 0000000..4cb8c43
--- /dev/null
+++ b/ray-rs/tests/replication_metrics_phase_d.rs
@@ -0,0 +1,410 @@
+use std::collections::HashMap;
+use std::io::{Read, Write};
+use std::net::TcpListener;
+use std::sync::mpsc;
+use std::thread;
+use std::time::Duration;
+
+use kitedb::core::single_file::{close_single_file, open_single_file, SingleFileOpenOptions};
+use kitedb::metrics::{
+ collect_metrics_single_file, collect_replication_metrics_otel_json_single_file,
+ collect_replication_metrics_prometheus_single_file, push_replication_metrics_otel_json_payload,
+ push_replication_metrics_otel_json_payload_with_options, render_replication_metrics_prometheus,
+ OtlpHttpPushOptions, OtlpHttpTlsOptions,
+};
+use kitedb::replication::types::ReplicationRole;
+
+fn open_primary(
+ path: &std::path::Path,
+ sidecar: &std::path::Path,
+ segment_max_bytes: u64,
+ retention_min_entries: u64,
+) -> kitedb::Result {
+ open_single_file(
+ path,
+ SingleFileOpenOptions::new()
+ .replication_role(ReplicationRole::Primary)
+ .replication_sidecar_path(sidecar)
+ .replication_segment_max_bytes(segment_max_bytes)
+ .replication_retention_min_entries(retention_min_entries),
+ )
+}
+
+fn open_replica(
+ replica_path: &std::path::Path,
+ source_db_path: &std::path::Path,
+ local_sidecar: &std::path::Path,
+ source_sidecar: &std::path::Path,
+) -> kitedb::Result {
+ open_single_file(
+ replica_path,
+ SingleFileOpenOptions::new()
+ .replication_role(ReplicationRole::Replica)
+ .replication_sidecar_path(local_sidecar)
+ .replication_source_db_path(source_db_path)
+ .replication_source_sidecar_path(source_sidecar),
+ )
+}
+
+#[derive(Debug)]
+struct CapturedHttpRequest {
+ request_line: String,
+ headers: HashMap<String, String>,
+ body: String,
+}
+
+fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
+ haystack
+ .windows(needle.len())
+ .position(|window| window == needle)
+}
+
+fn spawn_http_capture_server(
+ status_code: u16,
+ response_body: &str,
+) -> (
+ String,
+ mpsc::Receiver<CapturedHttpRequest>,
+ thread::JoinHandle<()>,
+) {
+ let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
+ let address = listener.local_addr().expect("local addr");
+ let endpoint = format!("http://{address}/v1/metrics");
+ let response_body = response_body.to_string();
+ let (tx, rx) = mpsc::channel::<CapturedHttpRequest>();
+
+ let handle = thread::spawn(move || {
+ let (mut stream, _) = listener.accept().expect("accept");
+ stream
+ .set_read_timeout(Some(Duration::from_secs(2)))
+ .expect("set read timeout");
+
+ let mut buffer = Vec::new();
+ let mut chunk = [0u8; 1024];
+ let mut header_end: Option<usize> = None;
+ let mut content_length = 0usize;
+
+ loop {
+ match stream.read(&mut chunk) {
+ Ok(0) => break,
+ Ok(read) => {
+ buffer.extend_from_slice(&chunk[..read]);
+
+ if header_end.is_none() {
+ if let Some(position) = find_subsequence(&buffer, b"\r\n\r\n") {
+ let end = position + 4;
+ header_end = Some(end);
+ let headers_text = String::from_utf8_lossy(&buffer[..end]);
+ for line in headers_text.lines().skip(1) {
+ let Some((name, value)) = line.split_once(':') else {
+ continue;
+ };
+ if name.eq_ignore_ascii_case("content-length") {
+ content_length = value.trim().parse::<usize>().unwrap_or(0);
+ }
+ }
+ }
+ }
+
+ if let Some(end) = header_end {
+ if buffer.len() >= end + content_length {
+ break;
+ }
+ }
+ }
+ Err(error) => panic!("read request failed: {error}"),
+ }
+ }
+
+ let end = header_end.expect("header terminator");
+ let headers_text = String::from_utf8_lossy(&buffer[..end]);
+ let mut lines = headers_text.lines();
+ let request_line = lines.next().unwrap_or_default().to_string();
+ let mut headers = HashMap::new();
+ for line in lines {
+ let Some((name, value)) = line.split_once(':') else {
+ continue;
+ };
+ headers.insert(name.trim().to_ascii_lowercase(), value.trim().to_string());
+ }
+
+ let body_end = (end + content_length).min(buffer.len());
+ let body = String::from_utf8_lossy(&buffer[end..body_end]).to_string();
+ tx.send(CapturedHttpRequest {
+ request_line,
+ headers,
+ body,
+ })
+ .expect("send captured request");
+
+ let reason = if status_code == 200 { "OK" } else { "ERR" };
+ let response = format!(
+ "HTTP/1.1 {status_code} {reason}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
+ response_body.len(),
+ response_body
+ );
+ stream
+ .write_all(response.as_bytes())
+ .expect("write response");
+ });
+
+ (endpoint, rx, handle)
+}
+
+#[test]
+fn collect_metrics_exposes_primary_replication_fields() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let db_path = dir.path().join("replication-metrics-primary.kitedb");
+ let sidecar = dir.path().join("replication-metrics-primary.sidecar");
+
+ let primary = open_primary(&db_path, &sidecar, 1, 2).expect("open primary");
+
+ for i in 0..4 {
+ primary.begin(false).expect("begin");
+ primary
+ .create_node(Some(&format!("p-{i}")))
+ .expect("create node");
+ let _ = primary.commit_with_token().expect("commit").expect("token");
+ }
+
+ primary
+ .primary_report_replica_progress("replica-a", 1, 2)
+ .expect("report replica progress");
+
+ let metrics = collect_metrics_single_file(&primary);
+ let otel = collect_replication_metrics_otel_json_single_file(&primary);
+ let prometheus = collect_replication_metrics_prometheus_single_file(&primary);
+ assert!(metrics.replication.enabled);
+ assert_eq!(metrics.replication.role, "primary");
+ assert!(metrics.replication.replica.is_none());
+
+ let repl = metrics
+ .replication
+ .primary
+ .as_ref()
+ .expect("primary replication metrics");
+ assert_eq!(repl.epoch, 1);
+ assert_eq!(repl.replica_count, 1);
+ assert_eq!(repl.stale_epoch_replica_count, 0);
+ assert_eq!(repl.min_replica_applied_log_index, Some(2));
+ assert_eq!(repl.max_replica_lag, repl.head_log_index.saturating_sub(2));
+ assert!(repl.append_attempts >= repl.append_successes);
+ assert_eq!(repl.append_failures, 0);
+ assert!(repl.append_successes >= 4);
+ assert!(repl.last_token.is_some());
+ assert!(repl
+ .sidecar_path
+ .ends_with("replication-metrics-primary.sidecar"));
+ assert!(prometheus.contains("# HELP kitedb_replication_enabled"));
+ assert!(prometheus.contains("kitedb_replication_enabled{role=\"primary\"} 1"));
+ assert!(prometheus.contains("kitedb_replication_primary_head_log_index"));
+ assert!(prometheus.contains("kitedb_replication_primary_append_attempts_total"));
+ assert!(otel.contains("\"kitedb.replication.enabled\""));
+ assert!(otel.contains("\"kitedb.replication.primary.head_log_index\""));
+ assert!(otel.contains("\"kitedb.replication.primary.append_attempts\""));
+ let otel_json: serde_json::Value = serde_json::from_str(&otel).expect("parse otel json");
+ assert!(otel_json["resourceMetrics"]
+ .as_array()
+ .map(|values| !values.is_empty())
+ .unwrap_or(false));
+
+ close_single_file(primary).expect("close primary");
+}
+
+#[test]
+fn collect_metrics_exposes_replica_reseed_error_state() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let primary_path = dir
+ .path()
+ .join("replication-metrics-replica-primary.kitedb");
+ let primary_sidecar = dir
+ .path()
+ .join("replication-metrics-replica-primary.sidecar");
+ let replica_path = dir.path().join("replication-metrics-replica.kitedb");
+ let replica_sidecar = dir.path().join("replication-metrics-replica.sidecar");
+
+ let primary = open_primary(&primary_path, &primary_sidecar, 1, 2).expect("open primary");
+
+ primary.begin(false).expect("begin base");
+ primary.create_node(Some("base")).expect("create base");
+ primary
+ .commit_with_token()
+ .expect("commit base")
+ .expect("token base");
+
+ let replica = open_replica(
+ &replica_path,
+ &primary_path,
+ &replica_sidecar,
+ &primary_sidecar,
+ )
+ .expect("open replica");
+ replica
+ .replica_bootstrap_from_snapshot()
+ .expect("bootstrap replica");
+
+ for i in 0..5 {
+ primary.begin(false).expect("begin");
+ primary
+ .create_node(Some(&format!("r-{i}")))
+ .expect("create");
+ primary.commit_with_token().expect("commit").expect("token");
+ }
+
+ primary
+ .primary_report_replica_progress("replica-r", 1, 1)
+ .expect("report lagging replica");
+ let _ = primary.primary_run_retention().expect("run retention");
+
+ let err = replica
+ .replica_catch_up_once(32)
+ .expect_err("must need reseed");
+ assert!(err.to_string().contains("reseed"));
+
+ let metrics = collect_metrics_single_file(&replica);
+ let otel = collect_replication_metrics_otel_json_single_file(&replica);
+ let prometheus = render_replication_metrics_prometheus(&metrics);
+ assert!(metrics.replication.enabled);
+ assert_eq!(metrics.replication.role, "replica");
+ assert!(metrics.replication.primary.is_none());
+
+ let repl = metrics
+ .replication
+ .replica
+ .as_ref()
+ .expect("replica replication metrics");
+ assert!(repl.needs_reseed);
+ assert!(
+ repl
+ .last_error
+ .as_deref()
+ .unwrap_or_default()
+ .contains("reseed"),
+ "unexpected last_error: {:?}",
+ repl.last_error
+ );
+ assert!(prometheus.contains("kitedb_replication_enabled{role=\"replica\"} 1"));
+ assert!(prometheus.contains("kitedb_replication_replica_needs_reseed 1"));
+ assert!(prometheus.contains("kitedb_replication_replica_last_error_present 1"));
+ assert!(otel.contains("\"kitedb.replication.replica.needs_reseed\""));
+ assert!(otel.contains("\"kitedb.replication.replica.last_error_present\""));
+
+ close_single_file(replica).expect("close replica");
+ close_single_file(primary).expect("close primary");
+}
+
+#[test]
+fn replication_prometheus_export_reports_disabled_role() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let db_path = dir.path().join("replication-metrics-disabled.kitedb");
+ let db = open_single_file(&db_path, SingleFileOpenOptions::new()).expect("open db");
+
+ let metrics = collect_metrics_single_file(&db);
+ let otel = collect_replication_metrics_otel_json_single_file(&db);
+ let prometheus = render_replication_metrics_prometheus(&metrics);
+ assert!(!metrics.replication.enabled);
+ assert_eq!(metrics.replication.role, "disabled");
+ assert!(prometheus.contains("kitedb_replication_enabled{role=\"disabled\"} 0"));
+ assert!(prometheus.contains("kitedb_replication_auth_enabled 0"));
+ assert!(otel.contains("\"kitedb.replication.enabled\""));
+ assert!(otel.contains("\"role\""));
+ assert!(otel.contains("\"disabled\""));
+
+ close_single_file(db).expect("close db");
+}
+
+#[test]
+fn otlp_push_payload_validates_endpoint_and_timeout() {
+ let endpoint_err = push_replication_metrics_otel_json_payload("{}", " ", 1000, None)
+ .expect_err("empty endpoint must fail");
+ assert!(endpoint_err.to_string().contains("endpoint"));
+
+ let timeout_err =
+ push_replication_metrics_otel_json_payload("{}", "http://127.0.0.1:1/v1/metrics", 0, None)
+ .expect_err("zero timeout must fail");
+ assert!(timeout_err.to_string().contains("timeout_ms"));
+}
+
+#[test]
+fn otlp_push_payload_posts_json_and_auth_header() {
+ let payload = "{\"resourceMetrics\":[]}";
+ let (endpoint, captured_rx, handle) = spawn_http_capture_server(200, "ok");
+
+ let result = push_replication_metrics_otel_json_payload(payload, &endpoint, 2_000, Some("token"))
+ .expect("otlp push must succeed");
+ assert_eq!(result.status_code, 200);
+ assert_eq!(result.response_body, "ok");
+
+ let captured = captured_rx
+ .recv_timeout(Duration::from_secs(2))
+ .expect("captured request");
+ assert_eq!(captured.request_line, "POST /v1/metrics HTTP/1.1");
+ assert_eq!(
+ captured.headers.get("content-type").map(String::as_str),
+ Some("application/json")
+ );
+ assert_eq!(
+ captured.headers.get("authorization").map(String::as_str),
+ Some("Bearer token")
+ );
+ assert_eq!(captured.body, payload);
+
+ handle.join().expect("server thread");
+}
+
+#[test]
+fn otlp_push_payload_returns_error_on_non_success_status() {
+ let payload = "{\"resourceMetrics\":[]}";
+ let (endpoint, _captured_rx, handle) = spawn_http_capture_server(401, "denied");
+
+ let error = push_replication_metrics_otel_json_payload(payload, &endpoint, 2_000, None)
+ .expect_err("non-2xx must fail");
+ let message = error.to_string();
+ assert!(
+ message.contains("status 401"),
+ "unexpected error: {message}"
+ );
+ assert!(message.contains("denied"), "unexpected error: {message}");
+
+ handle.join().expect("server thread");
+}
+
+#[test]
+fn otlp_push_payload_rejects_https_only_http_endpoint() {
+ let options = OtlpHttpPushOptions {
+ timeout_ms: 2_000,
+ bearer_token: None,
+ tls: OtlpHttpTlsOptions {
+ https_only: true,
+ ..OtlpHttpTlsOptions::default()
+ },
+ };
+ let error = push_replication_metrics_otel_json_payload_with_options(
+ "{}",
+ "http://127.0.0.1:4318/v1/metrics",
+ &options,
+ )
+ .expect_err("https_only should reject http endpoint");
+ assert!(error.to_string().contains("https"));
+}
+
+#[test]
+fn otlp_push_payload_rejects_partial_mtls_paths() {
+ let options = OtlpHttpPushOptions {
+ timeout_ms: 2_000,
+ bearer_token: None,
+ tls: OtlpHttpTlsOptions {
+ client_cert_pem_path: Some("/tmp/client.crt".to_string()),
+ client_key_pem_path: None,
+ ..OtlpHttpTlsOptions::default()
+ },
+ };
+ let error = push_replication_metrics_otel_json_payload_with_options(
+ "{}",
+ "https://127.0.0.1:4318/v1/metrics",
+ &options,
+ )
+ .expect_err("partial mTLS path configuration should fail");
+ assert!(error.to_string().contains("client_cert_pem_path"));
+ assert!(error.to_string().contains("client_key_pem_path"));
+}
diff --git a/ray-rs/ts/index.ts b/ray-rs/ts/index.ts
index c855178..dc42c09 100644
--- a/ray-rs/ts/index.ts
+++ b/ray-rs/ts/index.ts
@@ -126,6 +126,7 @@ type NodeObject = NodeRef & Record
type NodeIdLike = number | { id: number }
type NodePropsSelection = Array
type SyncMode = JsSyncMode
+type ReplicationRole = 'disabled' | 'primary' | 'replica'
type InsertExecutorSingle = Omit & {
returning(): InferNode
}
@@ -1031,6 +1032,10 @@ export {
backupInfo,
createOfflineBackup,
collectMetrics,
+ collectReplicationMetricsOtelJson,
+ collectReplicationMetricsPrometheus,
+ pushReplicationMetricsOtelJson,
+ pushReplicationMetricsOtelJsonWithOptions,
healthCheck,
createVectorIndex,
bruteForceSearch,
@@ -1072,6 +1077,8 @@ export type {
MvccStats,
HealthCheckResult,
HealthCheckEntry,
+ OtlpHttpExportResult,
+ PushReplicationMetricsOtelOptions,
// Traversal
JsTraverseOptions as TraverseOptions,
JsTraversalStep as TraversalStep,
@@ -1119,6 +1126,14 @@ export interface KiteOptions {
readOnly?: boolean
/** Create database if it doesn't exist (default: true) */
createIfMissing?: boolean
+ /** Enable MVCC (snapshot isolation + conflict detection) */
+ mvcc?: boolean
+ /** MVCC GC interval in ms */
+ mvccGcIntervalMs?: number
+ /** MVCC retention in ms */
+ mvccRetentionMs?: number
+ /** MVCC max version chain depth */
+ mvccMaxChainDepth?: number
/** Sync mode for durability (default: "Full") */
syncMode?: SyncMode
/** Enable group commit (coalesce WAL flushes across commits) */
@@ -1129,6 +1144,20 @@ export interface KiteOptions {
walSizeMb?: number
/** WAL usage threshold (0.0-1.0) to trigger auto-checkpoint */
checkpointThreshold?: number
+ /** Replication role */
+ replicationRole?: ReplicationRole
+ /** Replication sidecar path override */
+ replicationSidecarPath?: string
+ /** Source primary db path (replica role only) */
+ replicationSourceDbPath?: string
+ /** Source primary sidecar path override (replica role only) */
+ replicationSourceSidecarPath?: string
+ /** Segment rotation threshold in bytes (primary role only) */
+ replicationSegmentMaxBytes?: number
+ /** Minimum retained entries window (primary role only) */
+ replicationRetentionMinEntries?: number
+ /** Minimum retained segment age in milliseconds (primary role only) */
+ replicationRetentionMinMs?: number
}
// =============================================================================
@@ -1176,18 +1205,58 @@ function edgeSpecToNative(spec: EdgeSpec): JsEdgeSpec {
}
}
+function replicationRoleToNative(role: ReplicationRole): 'Disabled' | 'Primary' | 'Replica' {
+ switch (role) {
+ case 'disabled':
+ return 'Disabled'
+ case 'primary':
+ return 'Primary'
+ case 'replica':
+ return 'Replica'
+ }
+}
+
function optionsToNative(options: KiteOptions): JsKiteOptions {
- return {
+ const nativeOptions: JsKiteOptions = {
nodes: options.nodes.map(nodeSpecToNative),
edges: options.edges.map(edgeSpecToNative),
readOnly: options.readOnly,
createIfMissing: options.createIfMissing,
+ mvcc: options.mvcc,
+ mvccGcIntervalMs: options.mvccGcIntervalMs,
+ mvccRetentionMs: options.mvccRetentionMs,
+ mvccMaxChainDepth: options.mvccMaxChainDepth,
syncMode: options.syncMode,
groupCommitEnabled: options.groupCommitEnabled,
groupCommitWindowMs: options.groupCommitWindowMs,
walSizeMb: options.walSizeMb,
checkpointThreshold: options.checkpointThreshold,
}
+
+ const mutable = nativeOptions as unknown as Record<string, unknown>
+ if (options.replicationRole) {
+ mutable.replicationRole = replicationRoleToNative(options.replicationRole)
+ }
+ if (options.replicationSidecarPath) {
+ mutable.replicationSidecarPath = options.replicationSidecarPath
+ }
+ if (options.replicationSourceDbPath) {
+ mutable.replicationSourceDbPath = options.replicationSourceDbPath
+ }
+ if (options.replicationSourceSidecarPath) {
+ mutable.replicationSourceSidecarPath = options.replicationSourceSidecarPath
+ }
+ if (options.replicationSegmentMaxBytes !== undefined) {
+ mutable.replicationSegmentMaxBytes = options.replicationSegmentMaxBytes
+ }
+ if (options.replicationRetentionMinEntries !== undefined) {
+ mutable.replicationRetentionMinEntries = options.replicationRetentionMinEntries
+ }
+ if (options.replicationRetentionMinMs !== undefined) {
+ mutable.replicationRetentionMinMs = options.replicationRetentionMinMs
+ }
+
+ return nativeOptions
}
// =============================================================================
From 184e46d93492a59cd68d4c9d8f3bba864fe5868d Mon Sep 17 00:00:00 2001
From: mask
Date: Sun, 8 Feb 2026 11:10:40 -0600
Subject: [PATCH 03/58] replication: add host-runtime transport snapshot/log
exports
---
docs/REPLICATION_PLAN.md | 7 +-
docs/REPLICATION_RUNBOOK.md | 8 +-
ray-rs/Cargo.toml | 1 +
ray-rs/README.md | 8 +
ray-rs/index.d.ts | 4 +
ray-rs/index.js | 2 +
ray-rs/python/PARITY_MATRIX.md | 2 +-
ray-rs/python/README.md | 14 +
ray-rs/python/kitedb/__init__.py | 4 +
ray-rs/python/kitedb/_kitedb.pyi | 11 +
ray-rs/src/core/single_file/replication.rs | 709 +++++++++++++++++++++
ray-rs/src/napi_bindings/database.rs | 43 ++
ray-rs/src/pyo3_bindings/database.rs | 51 ++
ray-rs/src/pyo3_bindings/mod.rs | 8 +
ray-rs/tests/replication_phase_d.rs | 471 ++++++++++++++
ray-rs/ts/index.ts | 2 +
16 files changed, 1341 insertions(+), 4 deletions(-)
create mode 100644 ray-rs/src/core/single_file/replication.rs
create mode 100644 ray-rs/tests/replication_phase_d.rs
diff --git a/docs/REPLICATION_PLAN.md b/docs/REPLICATION_PLAN.md
index b0b5cca..78a4c55 100644
--- a/docs/REPLICATION_PLAN.md
+++ b/docs/REPLICATION_PLAN.md
@@ -365,6 +365,9 @@ Implemented:
- Host-runtime OpenTelemetry OTLP-JSON replication exporter API in Rust core + Node NAPI + Python PyO3 (`collect_replication_metrics_otel_json*`).
- Host-runtime OpenTelemetry collector push transport (HTTP OTLP-JSON) in Rust core + Node NAPI + Python PyO3 (`push_replication_metrics_otel_json_single_file`, `pushReplicationMetricsOtelJson`, `push_replication_metrics_otel_json`).
- Host-runtime OTLP transport hardening for TLS/mTLS (HTTPS-only mode, custom CA trust, optional client cert/key auth).
+- Host-runtime replication transport JSON export surfaces for embedding HTTP endpoints beyond playground runtime:
+ - snapshot export (`collectReplicationSnapshotTransportJson` / `collect_replication_snapshot_transport_json`)
+ - log page export with cursor/limits (`collectReplicationLogTransportJson` / `collect_replication_log_transport_json`).
- Replica source transport hardening in host-runtime open path (required source DB path + source/local sidecar collision fencing).
- Operator runbook for promotion/reseed/retention tuning (`docs/REPLICATION_RUNBOOK.md`).
- Replication benchmark gate script (`ray-rs/scripts/replication-bench-gate.sh`) + benchmark doc wiring.
@@ -385,8 +388,8 @@ Validated tests:
- `ray-rs/tests/replication_faults_phase_d.rs` (corrupt/truncated segment fault paths + durable `last_error`).
Known limits:
-- HTTP rollout currently targets playground runtime; broader host-runtime transport remains planned.
+- Bundled HTTP admin endpoints currently ship in playground runtime only; host runtime provides JSON export helpers for embedding custom endpoints.
- Host-runtime OTLP export currently targets HTTP OTLP-JSON payloads only (no protobuf/gRPC exporter path).
Carry-over to next phase:
-- Host-runtime replication admin/status HTTP rollout beyond playground runtime (playground remains the only bundled HTTP surface).
+- Standardized host-runtime HTTP adapter package/templates on top of transport JSON helpers.
diff --git a/docs/REPLICATION_RUNBOOK.md b/docs/REPLICATION_RUNBOOK.md
index 7bc3f7f..b1a9d4d 100644
--- a/docs/REPLICATION_RUNBOOK.md
+++ b/docs/REPLICATION_RUNBOOK.md
@@ -39,6 +39,12 @@ Metrics surface:
- Python PyO3: `push_replication_metrics_otel_json(db, endpoint, timeout_ms=5000, bearer_token=None)`
- advanced TLS/mTLS kwargs:
`https_only`, `ca_cert_pem_path`, `client_cert_pem_path`, `client_key_pem_path`.
+- Host-runtime replication transport JSON export helpers are available via:
+ - Node NAPI: `collectReplicationSnapshotTransportJson(db, includeData?)`,
+ `collectReplicationLogTransportJson(db, cursor?, maxFrames?, maxBytes?, includePayload?)`
+ - Python PyO3: `collect_replication_snapshot_transport_json(db, include_data=False)`,
+ `collect_replication_log_transport_json(db, cursor=None, max_frames=128, max_bytes=1048576, include_payload=True)`
+ - These are intended for embedding host-side HTTP endpoints beyond playground runtime.
Alert heuristics:
- `append_failures > 0` growing: primary sidecar durability issue.
@@ -174,6 +180,6 @@ Playground curl examples:
## 9. Known V1 Limits
- Retention policy supports entry-window + time-window floors, but not richer SLA-aware policies.
-- HTTP rollout currently targets the playground runtime API; host-runtime transport rollout remains planned.
+- Bundled HTTP admin endpoints still ship in playground runtime; host runtime now exposes transport JSON helpers for embedding custom HTTP surfaces.
- Host-runtime OTLP export currently targets HTTP OTLP-JSON payloads only (no protobuf/gRPC exporter path).
- `SyncMode::Normal` and `SyncMode::Off` optimize commit latency by batching sidecar frame writes in-memory and refreshing manifest fencing periodically (not every commit). For strict per-commit sidecar visibility/fencing, use `SyncMode::Full`.
diff --git a/ray-rs/Cargo.toml b/ray-rs/Cargo.toml
index 7c231b9..7a04b80 100644
--- a/ray-rs/Cargo.toml
+++ b/ray-rs/Cargo.toml
@@ -29,6 +29,7 @@ thiserror = "2.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
ureq = "2.10"
+base64 = "0.22"
rustls-pemfile = "2.2"
webpki-roots = "1.0"
diff --git a/ray-rs/README.md b/ray-rs/README.md
index 3df3e83..3fc48d9 100644
--- a/ray-rs/README.md
+++ b/ray-rs/README.md
@@ -188,8 +188,10 @@ Phase D replication controls are available on the low-level `Database` API.
```ts
import { Database } from 'kitedb'
import {
+ collectReplicationLogTransportJson,
collectReplicationMetricsOtelJson,
collectReplicationMetricsPrometheus,
+ collectReplicationSnapshotTransportJson,
pushReplicationMetricsOtelJson,
pushReplicationMetricsOtelJsonWithOptions,
} from 'kitedb/native'
@@ -248,6 +250,12 @@ const secureExport = pushReplicationMetricsOtelJsonWithOptions(
)
console.log(secureExport.statusCode, secureExport.responseBody)
+const snapshotJson = collectReplicationSnapshotTransportJson(primary, false)
+console.log(snapshotJson)
+
+const logPageJson = collectReplicationLogTransportJson(primary, null, 128, 1_048_576, false)
+console.log(logPageJson)
+
replica.close()
primary.close()
```
diff --git a/ray-rs/index.d.ts b/ray-rs/index.d.ts
index d2b3188..945e847 100644
--- a/ray-rs/index.d.ts
+++ b/ray-rs/index.d.ts
@@ -864,10 +864,14 @@ export interface CheckResult {
export declare function collectMetrics(db: Database): DatabaseMetrics
+export declare function collectReplicationLogTransportJson(db: Database, cursor?: string | undefined | null, maxFrames?: number | undefined | null, maxBytes?: number | undefined | null, includePayload?: boolean | undefined | null): string
+
export declare function collectReplicationMetricsOtelJson(db: Database): string
export declare function collectReplicationMetricsPrometheus(db: Database): string
+export declare function collectReplicationSnapshotTransportJson(db: Database, includeData?: boolean | undefined | null): string
+
export interface OtlpHttpExportResult {
statusCode: number
responseBody: string
diff --git a/ray-rs/index.js b/ray-rs/index.js
index c051d27..4892ac1 100644
--- a/ray-rs/index.js
+++ b/ray-rs/index.js
@@ -597,8 +597,10 @@ module.exports.VectorIndex = nativeBinding.VectorIndex
module.exports.backupInfo = nativeBinding.backupInfo
module.exports.bruteForceSearch = nativeBinding.bruteForceSearch
module.exports.collectMetrics = nativeBinding.collectMetrics
+module.exports.collectReplicationLogTransportJson = nativeBinding.collectReplicationLogTransportJson
module.exports.collectReplicationMetricsOtelJson = nativeBinding.collectReplicationMetricsOtelJson
module.exports.collectReplicationMetricsPrometheus = nativeBinding.collectReplicationMetricsPrometheus
+module.exports.collectReplicationSnapshotTransportJson = nativeBinding.collectReplicationSnapshotTransportJson
module.exports.pushReplicationMetricsOtelJson = nativeBinding.pushReplicationMetricsOtelJson
module.exports.pushReplicationMetricsOtelJsonWithOptions = nativeBinding.pushReplicationMetricsOtelJsonWithOptions
module.exports.createBackup = nativeBinding.createBackup
diff --git a/ray-rs/python/PARITY_MATRIX.md b/ray-rs/python/PARITY_MATRIX.md
index a61ce7c..443eb7c 100644
--- a/ray-rs/python/PARITY_MATRIX.md
+++ b/ray-rs/python/PARITY_MATRIX.md
@@ -62,7 +62,7 @@ Legend: parity = full feature match, partial = similar capability with API or be
| Export/Import | `export*`, `import*` | `export*`, `import*` | parity | Python exposes JSON object and file helpers. |
| Streaming | `stream*`, `get*Page` | `stream*`, `get*Page` | parity | Same batching/pagination behavior. |
| Backup/Restore | `createBackup`, `restoreBackup` | `create_backup`, `restore_backup` | parity | Naming differences only. |
-| Metrics/Health | `collectMetrics`, `collectReplicationMetricsPrometheus`, `collectReplicationMetricsOtelJson`, `pushReplicationMetricsOtelJson`, `healthCheck` | `collect_metrics`, `collect_replication_metrics_prometheus`, `collect_replication_metrics_otel_json`, `push_replication_metrics_otel_json`, `health_check` | parity | Naming differences only. |
+| Metrics/Health | `collectMetrics`, `collectReplicationMetricsPrometheus`, `collectReplicationMetricsOtelJson`, `pushReplicationMetricsOtelJson`, `collectReplicationSnapshotTransportJson`, `collectReplicationLogTransportJson`, `healthCheck` | `collect_metrics`, `collect_replication_metrics_prometheus`, `collect_replication_metrics_otel_json`, `push_replication_metrics_otel_json`, `collect_replication_snapshot_transport_json`, `collect_replication_log_transport_json`, `health_check` | parity | Naming differences only. |
## Vector Search
diff --git a/ray-rs/python/README.md b/ray-rs/python/README.md
index 585f8ad..c00c206 100644
--- a/ray-rs/python/README.md
+++ b/ray-rs/python/README.md
@@ -196,8 +196,10 @@ Phase D replication controls are available on `Database`:
from kitedb import (
Database,
OpenOptions,
+ collect_replication_log_transport_json,
collect_replication_metrics_otel_json,
collect_replication_metrics_prometheus,
+ collect_replication_snapshot_transport_json,
push_replication_metrics_otel_json,
)
@@ -261,6 +263,18 @@ secure_status, secure_body = push_replication_metrics_otel_json(
)
print(secure_status, secure_body)
+snapshot_json = collect_replication_snapshot_transport_json(primary, include_data=False)
+print(snapshot_json)
+
+log_json = collect_replication_log_transport_json(
+ primary,
+ cursor=None,
+ max_frames=128,
+ max_bytes=1024 * 1024,
+ include_payload=False,
+)
+print(log_json)
+
replica.close()
primary.close()
```
diff --git a/ray-rs/python/kitedb/__init__.py b/ray-rs/python/kitedb/__init__.py
index 4e5cb58..8736904 100644
--- a/ray-rs/python/kitedb/__init__.py
+++ b/ray-rs/python/kitedb/__init__.py
@@ -103,8 +103,10 @@
# Functions
open_database,
collect_metrics,
+ collect_replication_log_transport_json,
collect_replication_metrics_otel_json,
collect_replication_metrics_prometheus,
+ collect_replication_snapshot_transport_json,
push_replication_metrics_otel_json,
health_check,
create_backup,
@@ -273,8 +275,10 @@
# Functions
"open_database",
"collect_metrics",
+ "collect_replication_log_transport_json",
"collect_replication_metrics_otel_json",
"collect_replication_metrics_prometheus",
+ "collect_replication_snapshot_transport_json",
"push_replication_metrics_otel_json",
"health_check",
"create_backup",
diff --git a/ray-rs/python/kitedb/_kitedb.pyi b/ray-rs/python/kitedb/_kitedb.pyi
index 41c9c88..d29457d 100644
--- a/ray-rs/python/kitedb/_kitedb.pyi
+++ b/ray-rs/python/kitedb/_kitedb.pyi
@@ -534,6 +534,17 @@ class Database:
def open_database(path: str, options: Optional[OpenOptions] = None) -> Database: ...
def collect_metrics(db: Database) -> DatabaseMetrics: ...
+def collect_replication_snapshot_transport_json(
+ db: Database,
+ include_data: bool = False,
+) -> str: ...
+def collect_replication_log_transport_json(
+ db: Database,
+ cursor: Optional[str] = None,
+ max_frames: int = 128,
+ max_bytes: int = 1048576,
+ include_payload: bool = True,
+) -> str: ...
def collect_replication_metrics_otel_json(db: Database) -> str: ...
def collect_replication_metrics_prometheus(db: Database) -> str: ...
def push_replication_metrics_otel_json(
diff --git a/ray-rs/src/core/single_file/replication.rs b/ray-rs/src/core/single_file/replication.rs
new file mode 100644
index 0000000..eff2fe8
--- /dev/null
+++ b/ray-rs/src/core/single_file/replication.rs
@@ -0,0 +1,709 @@
+//! Replica-side operations and token wait helpers.
+
+use crate::core::wal::record::{
+ parse_add_edge_payload, parse_add_edge_props_payload, parse_add_edges_batch_payload,
+ parse_add_edges_props_batch_payload, parse_add_node_label_payload, parse_create_node_payload,
+ parse_create_nodes_batch_payload, parse_del_edge_prop_payload, parse_del_node_prop_payload,
+ parse_del_node_vector_payload, parse_delete_edge_payload, parse_delete_node_payload,
+ parse_remove_node_label_payload, parse_set_edge_prop_payload, parse_set_edge_props_payload,
+ parse_set_node_prop_payload, parse_set_node_vector_payload, parse_wal_record, ParsedWalRecord,
+};
+use crate::error::{KiteError, Result};
+use crate::replication::manifest::ManifestStore;
+use crate::replication::primary::PrimaryRetentionOutcome;
+use crate::replication::replica::ReplicaReplicationStatus;
+use crate::replication::transport::decode_commit_frame_payload;
+use crate::replication::types::{CommitToken, ReplicationCursor, ReplicationRole};
+use crate::types::WalRecordType;
+use crate::util::crc::crc32c;
+use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
+use base64::Engine;
+use serde_json::json;
+use std::collections::HashSet;
+use std::str::FromStr;
+use std::time::{Duration, Instant};
+
+use super::{close_single_file, open_single_file, SingleFileDB, SingleFileOpenOptions};
+
+const REPLICATION_MANIFEST_FILE: &str = "manifest.json";
+const REPLICATION_FRAME_MAGIC: u32 = 0x474F_4C52;
+const REPLICATION_FRAME_HEADER_BYTES: usize = 32;
+
+impl SingleFileDB {
+ /// Promote this primary instance to the next replication epoch.
+ pub fn primary_promote_to_next_epoch(&self) -> Result {
+ self
+ .primary_replication
+ .as_ref()
+ .ok_or_else(|| {
+ KiteError::InvalidReplication("database is not opened in primary role".to_string())
+ })?
+ .promote_to_next_epoch()
+ }
+
+ /// Report a replica's applied cursor to drive retention decisions.
+ pub fn primary_report_replica_progress(
+ &self,
+ replica_id: &str,
+ epoch: u64,
+ applied_log_index: u64,
+ ) -> Result<()> {
+ self
+ .primary_replication
+ .as_ref()
+ .ok_or_else(|| {
+ KiteError::InvalidReplication("database is not opened in primary role".to_string())
+ })?
+ .report_replica_progress(replica_id, epoch, applied_log_index)
+ }
+
+ /// Run retention pruning on primary replication segments.
+ pub fn primary_run_retention(&self) -> Result<PrimaryRetentionOutcome> {
+ self
+ .primary_replication
+ .as_ref()
+ .ok_or_else(|| {
+ KiteError::InvalidReplication("database is not opened in primary role".to_string())
+ })?
+ .run_retention()
+ }
+
+ /// Replica status surface.
+ pub fn replica_replication_status(&self) -> Option<ReplicaReplicationStatus> {
+ self
+ .replica_replication
+ .as_ref()
+ .map(|replication| replication.status())
+ }
+
+ /// Bootstrap replica state from source primary snapshot.
+ pub fn replica_bootstrap_from_snapshot(&self) -> Result<()> {
+ let runtime = self.replica_replication.as_ref().ok_or_else(|| {
+ KiteError::InvalidReplication("database is not opened in replica role".to_string())
+ })?;
+
+ let source_db_path = runtime.source_db_path().ok_or_else(|| {
+ KiteError::InvalidReplication("replica source db path is not configured".to_string())
+ })?;
+
+ let source = open_single_file(
+ &source_db_path,
+ SingleFileOpenOptions::new()
+ .read_only(true)
+ .create_if_missing(false)
+ .replication_role(ReplicationRole::Disabled),
+ )?;
+
+ sync_graph_state(self, &source)?;
+
+ let (epoch, head) = runtime.source_head_position()?;
+ runtime.mark_applied(epoch, head)?;
+ runtime.clear_error()?;
+
+ close_single_file(source)?;
+ Ok(())
+ }
+
+ /// Force snapshot reseed for replicas that lost log continuity.
+ pub fn replica_reseed_from_snapshot(&self) -> Result<()> {
+ self.replica_bootstrap_from_snapshot()
+ }
+
+ /// Pull and apply the next batch of replication frames.
+ pub fn replica_catch_up_once(&self, max_frames: usize) -> Result<usize> {
+ self.replica_catch_up_internal(max_frames, false)
+ }
+
+ /// Test helper: request a batch including last-applied frame to verify idempotency.
+ pub fn replica_catch_up_once_replaying_last_for_testing(
+ &self,
+ max_frames: usize,
+ ) -> Result<usize> {
+ self.replica_catch_up_internal(max_frames, true)
+ }
+
+ /// Wait until this DB has applied at least the given token.
+ pub fn wait_for_token(&self, token: CommitToken, timeout_ms: u64) -> Result<bool> {
+ let deadline = Instant::now() + Duration::from_millis(timeout_ms);
+
+ loop {
+ if self.has_token(token) {
+ return Ok(true);
+ }
+
+ if Instant::now() >= deadline {
+ return Ok(false);
+ }
+
+ std::thread::sleep(Duration::from_millis(10));
+ }
+ }
+
+ fn has_token(&self, token: CommitToken) -> bool {
+ if let Some(status) = self.primary_replication_status() {
+ if let Some(last_token) = status.last_token {
+ return last_token >= token;
+ }
+ }
+
+ if let Some(status) = self.replica_replication_status() {
+ let replica_token = CommitToken::new(status.applied_epoch, status.applied_log_index);
+ return replica_token >= token;
+ }
+
+ false
+ }
+
+ fn replica_catch_up_internal(&self, max_frames: usize, replay_last: bool) -> Result<usize> {
+ let runtime = self.replica_replication.as_ref().ok_or_else(|| {
+ KiteError::InvalidReplication("database is not opened in replica role".to_string())
+ })?;
+
+ let frames = match runtime.frames_after(max_frames.max(1), replay_last) {
+ Ok(frames) => frames,
+ Err(err) => {
+ if !runtime.status().needs_reseed {
+ let _ = runtime.mark_error(err.to_string(), false);
+ }
+ return Err(err);
+ }
+ };
+ if frames.is_empty() {
+ return Ok(0);
+ }
+
+ let mut applied = 0usize;
+ for frame in frames {
+ let (applied_epoch, applied_log_index) = runtime.applied_position();
+ let already_applied = applied_epoch > frame.epoch
+ || (applied_epoch == frame.epoch && applied_log_index >= frame.log_index);
+ if already_applied {
+ continue;
+ }
+
+ if let Err(err) = apply_replication_frame(self, &frame.payload) {
+ let _ = runtime.mark_error(
+ format!(
+ "replica apply failed at {}:{}: {err}",
+ frame.epoch, frame.log_index
+ ),
+ false,
+ );
+ return Err(err);
+ }
+
+ if let Err(err) = runtime.mark_applied(frame.epoch, frame.log_index) {
+ let _ = runtime.mark_error(
+ format!(
+ "replica cursor persist failed at {}:{}: {err}",
+ frame.epoch, frame.log_index
+ ),
+ false,
+ );
+ return Err(err);
+ }
+ applied = applied.saturating_add(1);
+ }
+
+ runtime.clear_error()?;
+ Ok(applied)
+ }
+
+ /// Export latest primary snapshot metadata and optional bytes as transport JSON.
+ pub fn primary_export_snapshot_transport_json(&self, include_data: bool) -> Result<String> {
+ let status = self.primary_replication_status().ok_or_else(|| {
+ KiteError::InvalidReplication("database is not opened in primary role".to_string())
+ })?;
+ let snapshot_bytes = std::fs::read(&self.path)?;
+ let checksum_crc32c = format!("{:08x}", crc32c(&snapshot_bytes));
+ let generated_at_ms = std::time::SystemTime::now()
+ .duration_since(std::time::UNIX_EPOCH)
+ .unwrap_or_default()
+ .as_millis() as u64;
+
+ let payload = json!({
+ "format": "single-file-db-copy",
+ "db_path": self.path.to_string_lossy().to_string(),
+ "byte_length": snapshot_bytes.len(),
+ "checksum_crc32c": checksum_crc32c,
+ "generated_at_ms": generated_at_ms,
+ "epoch": status.epoch,
+ "head_log_index": status.head_log_index,
+ "retained_floor": status.retained_floor,
+ "start_cursor": ReplicationCursor::new(status.epoch, 0, 0, status.retained_floor).to_string(),
+ "data_base64": if include_data {
+ Some(BASE64_STANDARD.encode(&snapshot_bytes))
+ } else {
+ None
+ },
+ });
+
+ serde_json::to_string(&payload).map_err(|error| {
+ KiteError::Serialization(format!("encode replication snapshot export: {error}"))
+ })
+ }
+
+ /// Export primary replication log frames with cursor paging as transport JSON.
+ pub fn primary_export_log_transport_json(
+ &self,
+ cursor: Option<&str>,
+ max_frames: usize,
+ max_bytes: usize,
+ include_payload: bool,
+ ) -> Result<String> {
+ if max_frames == 0 {
+ return Err(KiteError::InvalidQuery("max_frames must be > 0".into()));
+ }
+ if max_bytes == 0 {
+ return Err(KiteError::InvalidQuery("max_bytes must be > 0".into()));
+ }
+
+ let status = self.primary_replication_status().ok_or_else(|| {
+ KiteError::InvalidReplication("database is not opened in primary role".to_string())
+ })?;
+ let sidecar_path = status.sidecar_path;
+ let manifest = ManifestStore::new(sidecar_path.join(REPLICATION_MANIFEST_FILE)).read()?;
+ let parsed_cursor = match cursor {
+ Some(raw) if !raw.trim().is_empty() => Some(
+ ReplicationCursor::from_str(raw)
+ .map_err(|error| KiteError::InvalidReplication(format!("invalid cursor: {error}")))?,
+ ),
+ _ => None,
+ };
+
+ let mut segments = manifest.segments.clone();
+ segments.sort_by_key(|segment| segment.id);
+
+ let mut frames = Vec::new();
+ let mut total_bytes = 0usize;
+ let mut next_cursor: Option<String> = None;
+ let mut limited = false;
+
+ 'outer: for segment in segments {
+ let segment_path = sidecar_path.join(format_segment_file_name(segment.id));
+ if !segment_path.exists() {
+ continue;
+ }
+ let bytes = std::fs::read(&segment_path)?;
+ let mut offset = 0usize;
+
+ while offset + REPLICATION_FRAME_HEADER_BYTES <= bytes.len() {
+ let magic = le_u32(&bytes[offset..offset + 4])?;
+ if magic != REPLICATION_FRAME_MAGIC {
+ break;
+ }
+
+ let epoch = le_u64(&bytes[offset + 8..offset + 16])?;
+ let log_index = le_u64(&bytes[offset + 16..offset + 24])?;
+ let payload_len = le_u32(&bytes[offset + 24..offset + 28])? as usize;
+ let payload_start = offset + REPLICATION_FRAME_HEADER_BYTES;
+ let payload_end = payload_start.checked_add(payload_len).ok_or_else(|| {
+ KiteError::InvalidReplication("replication frame payload overflow".to_string())
+ })?;
+ if payload_end > bytes.len() {
+ return Err(KiteError::InvalidReplication(format!(
+ "replication frame truncated in segment {} at byte {}",
+ segment.id, offset
+ )));
+ }
+
+ let frame_bytes = payload_end - offset;
+ let frame_offset = offset as u64;
+ if frame_after_cursor(parsed_cursor, epoch, segment.id, frame_offset, log_index) {
+ if (total_bytes + frame_bytes > max_bytes && !frames.is_empty())
+ || frames.len() >= max_frames
+ {
+ limited = true;
+ break 'outer;
+ }
+
+ next_cursor = Some(
+ ReplicationCursor::new(epoch, segment.id, payload_end as u64, log_index).to_string(),
+ );
+ let payload_base64 = if include_payload {
+ Some(BASE64_STANDARD.encode(&bytes[payload_start..payload_end]))
+ } else {
+ None
+ };
+
+ frames.push(json!({
+ "epoch": epoch,
+ "log_index": log_index,
+ "segment_id": segment.id,
+ "segment_offset": frame_offset,
+ "bytes": frame_bytes,
+ "payload_base64": payload_base64,
+ }));
+ total_bytes += frame_bytes;
+ }
+
+ offset = payload_end;
+ }
+ }
+
+ let payload = json!({
+ "epoch": manifest.epoch,
+ "head_log_index": manifest.head_log_index,
+ "retained_floor": manifest.retained_floor,
+ "cursor": parsed_cursor.map(|value| value.to_string()),
+ "next_cursor": next_cursor,
+ "eof": !limited,
+ "frame_count": frames.len(),
+ "total_bytes": total_bytes,
+ "frames": frames,
+ });
+
+ serde_json::to_string(&payload)
+ .map_err(|error| KiteError::Serialization(format!("encode replication log export: {error}")))
+ }
+}
+
+fn frame_after_cursor(
+ cursor: Option<ReplicationCursor>,
+ epoch: u64,
+ segment_id: u64,
+ segment_offset: u64,
+ log_index: u64,
+) -> bool {
+ match cursor {
+ None => true,
+ Some(cursor) => {
+ (epoch, log_index, segment_id, segment_offset)
+ > (
+ cursor.epoch,
+ cursor.log_index,
+ cursor.segment_id,
+ cursor.segment_offset,
+ )
+ }
+ }
+}
+
+ fn le_u32(bytes: &[u8]) -> Result<u32> {
+ let value: [u8; 4] = bytes
+ .try_into()
+ .map_err(|_| KiteError::InvalidReplication("invalid frame u32 field".to_string()))?;
+ Ok(u32::from_le_bytes(value))
+}
+
+ fn le_u64(bytes: &[u8]) -> Result<u64> {
+ let value: [u8; 8] = bytes
+ .try_into()
+ .map_err(|_| KiteError::InvalidReplication("invalid frame u64 field".to_string()))?;
+ Ok(u64::from_le_bytes(value))
+}
+
+fn format_segment_file_name(id: u64) -> String {
+ format!("segment-{id:020}.rlog")
+}
+
+fn sync_graph_state(replica: &SingleFileDB, source: &SingleFileDB) -> Result<()> {
+ let tx_guard = replica.begin_guard(false)?;
+
+ let source_nodes = source.list_nodes();
+ let source_node_set: HashSet<_> = source_nodes.iter().copied().collect();
+
+ for node_id in source_nodes {
+ let source_key = source.node_key(node_id);
+ if replica.node_exists(node_id) {
+ if replica.node_key(node_id) != source_key {
+ let _ = replica.delete_node(node_id)?;
+ replica.create_node_with_id(node_id, source_key.as_deref())?;
+ }
+ } else {
+ replica.create_node_with_id(node_id, source_key.as_deref())?;
+ }
+ }
+
+ for node_id in replica.list_nodes() {
+ if !source_node_set.contains(&node_id) {
+ let _ = replica.delete_node(node_id)?;
+ }
+ }
+
+ let source_edges = source.list_edges(None);
+ let source_edge_set: HashSet<_> = source_edges
+ .iter()
+ .map(|edge| (edge.src, edge.etype, edge.dst))
+ .collect();
+
+ for edge in source_edges {
+ if !replica.edge_exists(edge.src, edge.etype, edge.dst) {
+ replica.add_edge(edge.src, edge.etype, edge.dst)?;
+ }
+ }
+
+ for edge in replica.list_edges(None) {
+ if !source_edge_set.contains(&(edge.src, edge.etype, edge.dst)) {
+ replica.delete_edge(edge.src, edge.etype, edge.dst)?;
+ }
+ }
+
+ tx_guard.commit()
+}
+
+fn apply_replication_frame(db: &SingleFileDB, payload: &[u8]) -> Result<()> {
+ let decoded = decode_commit_frame_payload(payload)?;
+ let records = parse_wal_records(&decoded.wal_bytes)?;
+
+ if records.is_empty() {
+ return Ok(());
+ }
+
+ let tx_guard = db.begin_guard(false)?;
+ for record in &records {
+ apply_wal_record_idempotent(db, record)?;
+ }
+
+ tx_guard.commit()
+}
+
+ fn parse_wal_records(wal_bytes: &[u8]) -> Result<Vec<ParsedWalRecord>> {
+ let mut offset = 0usize;
+ let mut records = Vec::new();
+
+ while offset < wal_bytes.len() {
+ let record = parse_wal_record(wal_bytes, offset).ok_or_else(|| {
+ KiteError::InvalidReplication(format!(
+ "invalid WAL payload in replication frame at offset {offset}"
+ ))
+ })?;
+
+ if record.record_end <= offset {
+ return Err(KiteError::InvalidReplication(
+ "non-progressing WAL record parse in replication payload".to_string(),
+ ));
+ }
+
+ offset = record.record_end;
+ records.push(record);
+ }
+
+ Ok(records)
+}
+
+fn apply_wal_record_idempotent(db: &SingleFileDB, record: &ParsedWalRecord) -> Result<()> {
+ match record.record_type {
+ WalRecordType::Begin | WalRecordType::Commit | WalRecordType::Rollback => Ok(()),
+ WalRecordType::CreateNode => {
+ let data = parse_create_node_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid CreateNode replication payload".to_string())
+ })?;
+
+ if db.node_exists(data.node_id) {
+ if db.node_key(data.node_id) == data.key {
+ return Ok(());
+ }
+ return Err(KiteError::InvalidReplication(format!(
+ "create-node replay key mismatch for node {}",
+ data.node_id
+ )));
+ }
+
+ db.create_node_with_id(data.node_id, data.key.as_deref())?;
+ Ok(())
+ }
+ WalRecordType::CreateNodesBatch => {
+ let entries = parse_create_nodes_batch_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid CreateNodesBatch replication payload".to_string())
+ })?;
+
+ for entry in entries {
+ if db.node_exists(entry.node_id) {
+ if db.node_key(entry.node_id) != entry.key {
+ return Err(KiteError::InvalidReplication(format!(
+ "create-nodes-batch replay key mismatch for node {}",
+ entry.node_id
+ )));
+ }
+ continue;
+ }
+
+ db.create_node_with_id(entry.node_id, entry.key.as_deref())?;
+ }
+
+ Ok(())
+ }
+ WalRecordType::DeleteNode => {
+ let data = parse_delete_node_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid DeleteNode replication payload".to_string())
+ })?;
+ if db.node_exists(data.node_id) {
+ let _ = db.delete_node(data.node_id)?;
+ }
+ Ok(())
+ }
+ WalRecordType::AddEdge => {
+ let data = parse_add_edge_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid AddEdge replication payload".to_string())
+ })?;
+ if !db.edge_exists(data.src, data.etype, data.dst) {
+ db.add_edge(data.src, data.etype, data.dst)?;
+ }
+ Ok(())
+ }
+ WalRecordType::DeleteEdge => {
+ let data = parse_delete_edge_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid DeleteEdge replication payload".to_string())
+ })?;
+ if db.edge_exists(data.src, data.etype, data.dst) {
+ db.delete_edge(data.src, data.etype, data.dst)?;
+ }
+ Ok(())
+ }
+ WalRecordType::AddEdgesBatch => {
+ let batch = parse_add_edges_batch_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid AddEdgesBatch replication payload".to_string())
+ })?;
+
+ for edge in batch {
+ if !db.edge_exists(edge.src, edge.etype, edge.dst) {
+ db.add_edge(edge.src, edge.etype, edge.dst)?;
+ }
+ }
+ Ok(())
+ }
+ WalRecordType::AddEdgeProps => {
+ let data = parse_add_edge_props_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid AddEdgeProps replication payload".to_string())
+ })?;
+
+ if !db.edge_exists(data.src, data.etype, data.dst) {
+ db.add_edge(data.src, data.etype, data.dst)?;
+ }
+
+ for (key_id, value) in data.props {
+ if db.edge_prop(data.src, data.etype, data.dst, key_id) != Some(value.clone()) {
+ db.set_edge_prop(data.src, data.etype, data.dst, key_id, value)?;
+ }
+ }
+ Ok(())
+ }
+ WalRecordType::AddEdgesPropsBatch => {
+ let batch = parse_add_edges_props_batch_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid AddEdgesPropsBatch replication payload".to_string())
+ })?;
+
+ for entry in batch {
+ if !db.edge_exists(entry.src, entry.etype, entry.dst) {
+ db.add_edge(entry.src, entry.etype, entry.dst)?;
+ }
+
+ for (key_id, value) in entry.props {
+ if db.edge_prop(entry.src, entry.etype, entry.dst, key_id) != Some(value.clone()) {
+ db.set_edge_prop(entry.src, entry.etype, entry.dst, key_id, value)?;
+ }
+ }
+ }
+
+ Ok(())
+ }
+ WalRecordType::SetNodeProp => {
+ let data = parse_set_node_prop_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid SetNodeProp replication payload".to_string())
+ })?;
+
+ if db.node_prop(data.node_id, data.key_id) != Some(data.value.clone()) {
+ db.set_node_prop(data.node_id, data.key_id, data.value)?;
+ }
+
+ Ok(())
+ }
+ WalRecordType::DelNodeProp => {
+ let data = parse_del_node_prop_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid DelNodeProp replication payload".to_string())
+ })?;
+
+ if db.node_prop(data.node_id, data.key_id).is_some() {
+ db.delete_node_prop(data.node_id, data.key_id)?;
+ }
+ Ok(())
+ }
+ WalRecordType::SetEdgeProp => {
+ let data = parse_set_edge_prop_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid SetEdgeProp replication payload".to_string())
+ })?;
+
+ if db.edge_prop(data.src, data.etype, data.dst, data.key_id) != Some(data.value.clone()) {
+ db.set_edge_prop(data.src, data.etype, data.dst, data.key_id, data.value)?;
+ }
+ Ok(())
+ }
+ WalRecordType::SetEdgeProps => {
+ let data = parse_set_edge_props_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid SetEdgeProps replication payload".to_string())
+ })?;
+
+ for (key_id, value) in data.props {
+ if db.edge_prop(data.src, data.etype, data.dst, key_id) != Some(value.clone()) {
+ db.set_edge_prop(data.src, data.etype, data.dst, key_id, value)?;
+ }
+ }
+ Ok(())
+ }
+ WalRecordType::DelEdgeProp => {
+ let data = parse_del_edge_prop_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid DelEdgeProp replication payload".to_string())
+ })?;
+
+ if db
+ .edge_prop(data.src, data.etype, data.dst, data.key_id)
+ .is_some()
+ {
+ db.delete_edge_prop(data.src, data.etype, data.dst, data.key_id)?;
+ }
+ Ok(())
+ }
+ WalRecordType::AddNodeLabel => {
+ let data = parse_add_node_label_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid AddNodeLabel replication payload".to_string())
+ })?;
+
+ if !db.node_has_label(data.node_id, data.label_id) {
+ db.add_node_label(data.node_id, data.label_id)?;
+ }
+ Ok(())
+ }
+ WalRecordType::RemoveNodeLabel => {
+ let data = parse_remove_node_label_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid RemoveNodeLabel replication payload".to_string())
+ })?;
+
+ if db.node_has_label(data.node_id, data.label_id) {
+ db.remove_node_label(data.node_id, data.label_id)?;
+ }
+ Ok(())
+ }
+ WalRecordType::SetNodeVector => {
+ let data = parse_set_node_vector_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid SetNodeVector replication payload".to_string())
+ })?;
+
+ let current = db.node_vector(data.node_id, data.prop_key_id);
+ if current.as_deref().map(|v| v.as_ref()) != Some(data.vector.as_slice()) {
+ db.set_node_vector(data.node_id, data.prop_key_id, &data.vector)?;
+ }
+ Ok(())
+ }
+ WalRecordType::DelNodeVector => {
+ let data = parse_del_node_vector_payload(&record.payload).ok_or_else(|| {
+ KiteError::InvalidReplication("invalid DelNodeVector replication payload".to_string())
+ })?;
+
+ if db.has_node_vector(data.node_id, data.prop_key_id) {
+ db.delete_node_vector(data.node_id, data.prop_key_id)?;
+ }
+ Ok(())
+ }
+ WalRecordType::DefineLabel | WalRecordType::DefineEtype | WalRecordType::DefinePropkey => {
+ // Definition records carry their numeric IDs inside every mutation payload, so
+ // replaying Define* records is a deliberate no-op for V1 replication apply.
+ Ok(())
+ }
+ WalRecordType::BatchVectors | WalRecordType::SealFragment | WalRecordType::CompactFragments => {
+ Err(KiteError::InvalidReplication(
+ "vector batch/maintenance WAL replay is not yet supported in replica apply".to_string(),
+ ))
+ }
+ }
+}
diff --git a/ray-rs/src/napi_bindings/database.rs b/ray-rs/src/napi_bindings/database.rs
index f1eecec..4a9185c 100644
--- a/ray-rs/src/napi_bindings/database.rs
+++ b/ray-rs/src/napi_bindings/database.rs
@@ -3312,6 +3312,49 @@ pub fn collect_replication_metrics_otel_json(db: &Database) -> Result {
}
}
+#[napi]
+pub fn collect_replication_snapshot_transport_json(
+ db: &Database,
+ include_data: Option,
+) -> Result {
+ match db.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .primary_export_snapshot_transport_json(include_data.unwrap_or(false))
+ .map_err(|e| Error::from_reason(format!("Failed to export replication snapshot: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+}
+
+#[napi]
+pub fn collect_replication_log_transport_json(
+ db: &Database,
+ cursor: Option,
+ max_frames: Option,
+ max_bytes: Option,
+ include_payload: Option,
+) -> Result {
+ let max_frames = max_frames.unwrap_or(128);
+ let max_bytes = max_bytes.unwrap_or(1_048_576);
+ if max_frames <= 0 {
+ return Err(Error::from_reason("maxFrames must be positive"));
+ }
+ if max_bytes <= 0 {
+ return Err(Error::from_reason("maxBytes must be positive"));
+ }
+
+ match db.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .primary_export_log_transport_json(
+ cursor.as_deref(),
+ max_frames as usize,
+ max_bytes as usize,
+ include_payload.unwrap_or(true),
+ )
+ .map_err(|e| Error::from_reason(format!("Failed to export replication log: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+}
+
#[napi]
pub fn push_replication_metrics_otel_json(
db: &Database,
diff --git a/ray-rs/src/pyo3_bindings/database.rs b/ray-rs/src/pyo3_bindings/database.rs
index 128d122..cfbc7f5 100644
--- a/ray-rs/src/pyo3_bindings/database.rs
+++ b/ray-rs/src/pyo3_bindings/database.rs
@@ -1767,6 +1767,57 @@ pub fn collect_replication_metrics_otel_json(db: &PyDatabase) -> PyResult PyResult {
+ let guard = db
+ .inner
+ .read()
+ .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+ match guard.as_ref() {
+ Some(DatabaseInner::SingleFile(d)) => d
+ .primary_export_snapshot_transport_json(include_data)
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to export replication snapshot: {e}"))),
+ None => Err(PyRuntimeError::new_err("Database is closed")),
+ }
+}
+
+#[pyfunction]
+#[pyo3(signature = (db, cursor=None, max_frames=128, max_bytes=1048576, include_payload=true))]
+pub fn collect_replication_log_transport_json(
+ db: &PyDatabase,
+ cursor: Option,
+ max_frames: i64,
+ max_bytes: i64,
+ include_payload: bool,
+) -> PyResult {
+ if max_frames <= 0 {
+ return Err(PyRuntimeError::new_err("max_frames must be positive"));
+ }
+ if max_bytes <= 0 {
+ return Err(PyRuntimeError::new_err("max_bytes must be positive"));
+ }
+
+ let guard = db
+ .inner
+ .read()
+ .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+ match guard.as_ref() {
+ Some(DatabaseInner::SingleFile(d)) => d
+ .primary_export_log_transport_json(
+ cursor.as_deref(),
+ max_frames as usize,
+ max_bytes as usize,
+ include_payload,
+ )
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to export replication log: {e}"))),
+ None => Err(PyRuntimeError::new_err("Database is closed")),
+ }
+}
+
#[pyfunction]
#[pyo3(signature = (
db,
diff --git a/ray-rs/src/pyo3_bindings/mod.rs b/ray-rs/src/pyo3_bindings/mod.rs
index 3ec942e..7110f2e 100644
--- a/ray-rs/src/pyo3_bindings/mod.rs
+++ b/ray-rs/src/pyo3_bindings/mod.rs
@@ -125,6 +125,14 @@ pub fn kitedb(m: &Bound<'_, PyModule>) -> PyResult<()> {
database::collect_replication_metrics_otel_json,
m
)?)?;
+ m.add_function(wrap_pyfunction!(
+ database::collect_replication_snapshot_transport_json,
+ m
+ )?)?;
+ m.add_function(wrap_pyfunction!(
+ database::collect_replication_log_transport_json,
+ m
+ )?)?;
m.add_function(wrap_pyfunction!(
database::push_replication_metrics_otel_json,
m
diff --git a/ray-rs/tests/replication_phase_d.rs b/ray-rs/tests/replication_phase_d.rs
new file mode 100644
index 0000000..728c22c
--- /dev/null
+++ b/ray-rs/tests/replication_phase_d.rs
@@ -0,0 +1,471 @@
+use std::sync::Arc;
+use std::time::Duration;
+
+use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
+use base64::Engine;
+use kitedb::core::single_file::{close_single_file, open_single_file, SingleFileOpenOptions};
+use kitedb::replication::types::ReplicationRole;
+
+fn open_primary(
+ path: &std::path::Path,
+ sidecar: &std::path::Path,
+ segment_max_bytes: u64,
+ retention_min_entries: u64,
+) -> kitedb::Result {
+ open_single_file(
+ path,
+ SingleFileOpenOptions::new()
+ .replication_role(ReplicationRole::Primary)
+ .replication_sidecar_path(sidecar)
+ .replication_segment_max_bytes(segment_max_bytes)
+ .replication_retention_min_entries(retention_min_entries),
+ )
+}
+
+fn open_replica(
+ replica_path: &std::path::Path,
+ source_db_path: &std::path::Path,
+ local_sidecar: &std::path::Path,
+ source_sidecar: &std::path::Path,
+) -> kitedb::Result {
+ open_single_file(
+ replica_path,
+ SingleFileOpenOptions::new()
+ .replication_role(ReplicationRole::Replica)
+ .replication_sidecar_path(local_sidecar)
+ .replication_source_db_path(source_db_path)
+ .replication_source_sidecar_path(source_sidecar),
+ )
+}
+
+#[test]
+fn promotion_increments_epoch_and_fences_stale_primary_writes() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let db_path = dir.path().join("phase-d-promote.kitedb");
+ let sidecar = dir.path().join("phase-d-promote.sidecar");
+
+ let primary_a = open_primary(&db_path, &sidecar, 256, 4).expect("open primary a");
+ let primary_b = open_primary(&db_path, &sidecar, 256, 4).expect("open primary b");
+
+ primary_a.begin(false).expect("begin a");
+ primary_a.create_node(Some("a0")).expect("create a0");
+ let t0 = primary_a
+ .commit_with_token()
+ .expect("commit a0")
+ .expect("token a0");
+ assert_eq!(t0.epoch, 1);
+
+ let new_epoch = primary_b.primary_promote_to_next_epoch().expect("promote");
+ assert_eq!(new_epoch, 2);
+
+ primary_b.begin(false).expect("begin b");
+ primary_b.create_node(Some("b0")).expect("create b0");
+ let t1 = primary_b
+ .commit_with_token()
+ .expect("commit b0")
+ .expect("token b0");
+ assert_eq!(t1.epoch, 2);
+
+ primary_a.begin(false).expect("begin stale");
+ primary_a.create_node(Some("stale")).expect("create stale");
+ let err = primary_a
+ .commit_with_token()
+ .expect_err("stale primary commit must fail");
+ assert!(
+ err.to_string().contains("stale primary"),
+ "unexpected stale commit error: {err}"
+ );
+
+ close_single_file(primary_b).expect("close b");
+ close_single_file(primary_a).expect("close a");
+}
+
+#[test]
+fn retention_respects_active_replica_cursor_and_minimum_window() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let db_path = dir.path().join("phase-d-retention.kitedb");
+ let sidecar = dir.path().join("phase-d-retention.sidecar");
+
+ let primary = open_primary(&db_path, &sidecar, 1, 2).expect("open primary");
+
+ for i in 0..6 {
+ primary.begin(false).expect("begin");
+ primary
+ .create_node(Some(&format!("n-{i}")))
+ .expect("create");
+ let _ = primary.commit_with_token().expect("commit").expect("token");
+ }
+
+ primary
+ .primary_report_replica_progress("replica-a", 1, 2)
+ .expect("report cursor");
+
+ let prune = primary.primary_run_retention().expect("run retention");
+ assert!(prune.pruned_segments > 0);
+
+ let status = primary.primary_replication_status().expect("status");
+ assert_eq!(status.retained_floor, 3);
+ assert!(status
+ .replica_lags
+ .iter()
+ .any(|lag| lag.replica_id == "replica-a" && lag.applied_log_index == 2));
+
+ close_single_file(primary).expect("close primary");
+}
+
+#[test]
+fn missing_segment_marks_replica_needs_reseed() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let primary_path = dir.path().join("phase-d-missing-primary.kitedb");
+ let primary_sidecar = dir.path().join("phase-d-missing-primary.sidecar");
+ let replica_path = dir.path().join("phase-d-missing-replica.kitedb");
+ let replica_sidecar = dir.path().join("phase-d-missing-replica.sidecar");
+
+ let primary = open_primary(&primary_path, &primary_sidecar, 1, 2).expect("open primary");
+
+ primary.begin(false).expect("begin base");
+ primary.create_node(Some("base")).expect("create base");
+ primary
+ .commit_with_token()
+ .expect("commit base")
+ .expect("token base");
+
+ let replica = open_replica(
+ &replica_path,
+ &primary_path,
+ &replica_sidecar,
+ &primary_sidecar,
+ )
+ .expect("open replica");
+ replica
+ .replica_bootstrap_from_snapshot()
+ .expect("bootstrap snapshot");
+
+ for i in 0..4 {
+ primary.begin(false).expect("begin");
+ primary
+ .create_node(Some(&format!("m-{i}")))
+ .expect("create");
+ primary.commit_with_token().expect("commit").expect("token");
+ }
+
+ primary
+ .primary_report_replica_progress("replica-m", 1, 1)
+ .expect("report lagging cursor");
+ let _ = primary.primary_run_retention().expect("run retention");
+
+ let err = replica
+ .replica_catch_up_once(32)
+ .expect_err("replica should require reseed");
+ assert!(err.to_string().contains("reseed"));
+
+ let status = replica
+ .replica_replication_status()
+ .expect("replica status");
+ assert!(status.needs_reseed);
+
+ close_single_file(replica).expect("close replica");
+ close_single_file(primary).expect("close primary");
+}
+
+#[test]
+fn lagging_replica_reseed_recovers_after_retention_gap() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let primary_path = dir.path().join("phase-d-reseed-primary.kitedb");
+ let primary_sidecar = dir.path().join("phase-d-reseed-primary.sidecar");
+ let replica_path = dir.path().join("phase-d-reseed-replica.kitedb");
+ let replica_sidecar = dir.path().join("phase-d-reseed-replica.sidecar");
+
+ let primary = open_primary(&primary_path, &primary_sidecar, 1, 2).expect("open primary");
+
+ primary.begin(false).expect("begin base");
+ primary.create_node(Some("base")).expect("create base");
+ primary
+ .commit_with_token()
+ .expect("commit base")
+ .expect("token base");
+
+ let replica = open_replica(
+ &replica_path,
+ &primary_path,
+ &replica_sidecar,
+ &primary_sidecar,
+ )
+ .expect("open replica");
+ replica
+ .replica_bootstrap_from_snapshot()
+ .expect("bootstrap snapshot");
+
+ for i in 0..5 {
+ primary.begin(false).expect("begin");
+ primary
+ .create_node(Some(&format!("r-{i}")))
+ .expect("create");
+ primary.commit_with_token().expect("commit").expect("token");
+ }
+
+ primary
+ .primary_report_replica_progress("replica-r", 1, 1)
+ .expect("report lagging cursor");
+ let _ = primary.primary_run_retention().expect("run retention");
+
+ let _ = replica
+ .replica_catch_up_once(32)
+ .expect_err("must need reseed");
+ assert!(
+ replica
+ .replica_replication_status()
+ .expect("status")
+ .needs_reseed
+ );
+
+ replica.replica_reseed_from_snapshot().expect("reseed");
+ assert!(
+ !replica
+ .replica_replication_status()
+ .expect("status post reseed")
+ .needs_reseed
+ );
+ assert_eq!(replica.count_nodes(), primary.count_nodes());
+
+ close_single_file(replica).expect("close replica");
+ close_single_file(primary).expect("close primary");
+}
+
+#[test]
+fn promotion_race_rejects_split_brain_writes() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let db_path = dir.path().join("phase-d-race.kitedb");
+ let sidecar = dir.path().join("phase-d-race.sidecar");
+
+ let left = Arc::new(open_primary(&db_path, &sidecar, 128, 8).expect("open left"));
+ let right = Arc::new(open_primary(&db_path, &sidecar, 128, 8).expect("open right"));
+
+ let l = Arc::clone(&left);
+ let h1 = std::thread::spawn(move || {
+ let promote = l.primary_promote_to_next_epoch();
+ l.begin(false).expect("left begin");
+ l.create_node(Some("left")).expect("left create");
+ let commit = l.commit_with_token();
+ (promote, commit)
+ });
+
+ let r = Arc::clone(&right);
+ let h2 = std::thread::spawn(move || {
+ let promote = r.primary_promote_to_next_epoch();
+ r.begin(false).expect("right begin");
+ r.create_node(Some("right")).expect("right create");
+ let commit = r.commit_with_token();
+ (promote, commit)
+ });
+
+ let (left_promote, left_result) = h1.join().expect("left join");
+ let (right_promote, right_result) = h2.join().expect("right join");
+ assert!(left_promote.is_ok());
+ assert!(right_promote.is_ok());
+
+ let left_ok = left_result.as_ref().is_ok_and(|token| token.is_some());
+ let right_ok = right_result.as_ref().is_ok_and(|token| token.is_some());
+ assert!(
+ left_ok ^ right_ok,
+ "exactly one writer should succeed after race"
+ );
+
+ let left = Arc::into_inner(left).expect("left unique");
+ let right = Arc::into_inner(right).expect("right unique");
+ close_single_file(left).expect("close left");
+ close_single_file(right).expect("close right");
+}
+
+#[test]
+fn retention_time_window_keeps_recent_segments() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let db_path = dir.path().join("phase-d-retention-window.kitedb");
+ let sidecar = dir.path().join("phase-d-retention-window.sidecar");
+
+ let primary = open_single_file(
+ &db_path,
+ SingleFileOpenOptions::new()
+ .replication_role(ReplicationRole::Primary)
+ .replication_sidecar_path(&sidecar)
+ .replication_segment_max_bytes(1)
+ .replication_retention_min_entries(0)
+ .replication_retention_min_ms(60_000),
+ )
+ .expect("open primary");
+
+ for i in 0..6 {
+ primary.begin(false).expect("begin");
+ primary
+ .create_node(Some(&format!("w-{i}")))
+ .expect("create");
+ primary.commit_with_token().expect("commit").expect("token");
+ }
+
+ let segments_before = std::fs::read_dir(&sidecar)
+ .expect("list sidecar")
+ .filter_map(|entry| entry.ok())
+ .filter(|entry| entry.file_name().to_string_lossy().starts_with("segment-"))
+ .count();
+ assert!(
+ segments_before > 1,
+ "expected multiple segments for retention"
+ );
+
+ let prune = primary.primary_run_retention().expect("run retention");
+ assert_eq!(prune.pruned_segments, 0);
+
+ // Brief pause to avoid filesystem timestamp-granularity races before re-listing segments.
+ std::thread::sleep(Duration::from_millis(5));
+
+ let segments_after = std::fs::read_dir(&sidecar)
+ .expect("list sidecar after retention")
+ .filter_map(|entry| entry.ok())
+ .filter(|entry| entry.file_name().to_string_lossy().starts_with("segment-"))
+ .count();
+ assert_eq!(segments_after, segments_before);
+
+ close_single_file(primary).expect("close primary");
+}
+
+#[test]
+fn replica_open_requires_source_db_path() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let replica_path = dir.path().join("phase-d-misconfig-no-source.kitedb");
+ let replica_sidecar = dir.path().join("phase-d-misconfig-no-source.sidecar");
+
+ let err = open_single_file(
+ &replica_path,
+ SingleFileOpenOptions::new()
+ .replication_role(ReplicationRole::Replica)
+ .replication_sidecar_path(&replica_sidecar),
+ )
+ .err()
+ .expect("replica open without source db path must fail");
+
+ assert!(
+ err.to_string().contains("source db path"),
+ "unexpected error: {err}"
+ );
+}
+
+#[test]
+fn replica_open_rejects_source_sidecar_equal_local_sidecar() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let primary_path = dir.path().join("phase-d-misconfig-primary.kitedb");
+ let primary_sidecar = dir.path().join("phase-d-misconfig-primary.sidecar");
+ let replica_path = dir.path().join("phase-d-misconfig-replica.kitedb");
+
+ let primary = open_primary(&primary_path, &primary_sidecar, 128, 8).expect("open primary");
+ primary.begin(false).expect("begin primary");
+ primary.create_node(Some("seed")).expect("create seed");
+ primary.commit_with_token().expect("commit primary");
+
+ let err = open_single_file(
+ &replica_path,
+ SingleFileOpenOptions::new()
+ .replication_role(ReplicationRole::Replica)
+ .replication_sidecar_path(&primary_sidecar)
+ .replication_source_db_path(&primary_path)
+ .replication_source_sidecar_path(&primary_sidecar),
+ )
+ .err()
+ .expect("replica local/source sidecar collision must fail");
+
+ assert!(
+ err.to_string().contains("source sidecar path must differ"),
+ "unexpected error: {err}"
+ );
+
+ close_single_file(primary).expect("close primary");
+}
+
+#[test]
+fn primary_snapshot_transport_export_includes_metadata_and_optional_data() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let db_path = dir.path().join("phase-d-transport-snapshot.kitedb");
+ let sidecar = dir.path().join("phase-d-transport-snapshot.sidecar");
+ let primary = open_primary(&db_path, &sidecar, 128, 8).expect("open primary");
+
+ primary.begin(false).expect("begin");
+ primary.create_node(Some("snap-1")).expect("create");
+ primary.commit_with_token().expect("commit");
+
+ let without_data = primary
+ .primary_export_snapshot_transport_json(false)
+ .expect("snapshot transport export");
+ let without_data_json: serde_json::Value =
+ serde_json::from_str(&without_data).expect("parse snapshot export");
+ assert_eq!(without_data_json["format"], "single-file-db-copy");
+ assert_eq!(without_data_json["epoch"], 1);
+ assert_eq!(without_data_json["data_base64"], serde_json::Value::Null);
+ assert!(without_data_json["checksum_crc32c"]
+ .as_str()
+ .map(|value| !value.is_empty())
+ .unwrap_or(false));
+
+ let with_data = primary
+ .primary_export_snapshot_transport_json(true)
+ .expect("snapshot export with data");
+ let with_data_json: serde_json::Value =
+ serde_json::from_str(&with_data).expect("parse snapshot export with data");
+ let encoded = with_data_json["data_base64"]
+ .as_str()
+ .expect("data_base64 must be present");
+ let decoded = BASE64_STANDARD
+ .decode(encoded)
+ .expect("decode snapshot base64");
+ assert_eq!(
+ decoded.len() as u64,
+ with_data_json["byte_length"]
+ .as_u64()
+ .expect("byte_length must be u64")
+ );
+
+ close_single_file(primary).expect("close primary");
+}
+
+#[test]
+fn primary_log_transport_export_pages_by_cursor() {
+ let dir = tempfile::tempdir().expect("tempdir");
+ let db_path = dir.path().join("phase-d-transport-log.kitedb");
+ let sidecar = dir.path().join("phase-d-transport-log.sidecar");
+ let primary = open_primary(&db_path, &sidecar, 1, 2).expect("open primary");
+
+ for i in 0..5 {
+ primary.begin(false).expect("begin");
+ primary
+ .create_node(Some(&format!("transport-{i}")))
+ .expect("create");
+ primary.commit_with_token().expect("commit");
+ }
+
+ let first = primary
+ .primary_export_log_transport_json(None, 2, 1024 * 1024, true)
+ .expect("first log export");
+ let first_json: serde_json::Value = serde_json::from_str(&first).expect("parse first page");
+ assert_eq!(first_json["frame_count"], 2);
+ assert_eq!(first_json["eof"], false);
+ assert!(first_json["frames"]
+ .as_array()
+ .expect("frames array")
+ .iter()
+ .all(|frame| frame["payload_base64"].as_str().is_some()));
+
+ let cursor = first_json["next_cursor"]
+ .as_str()
+ .expect("next_cursor")
+ .to_string();
+ let second = primary
+ .primary_export_log_transport_json(Some(&cursor), 4, 1024 * 1024, false)
+ .expect("second log export");
+ let second_json: serde_json::Value = serde_json::from_str(&second).expect("parse second page");
+ assert!(second_json["frame_count"].as_u64().unwrap_or_default() > 0);
+ assert!(second_json["frames"]
+ .as_array()
+ .expect("frames array")
+ .iter()
+ .all(|frame| frame["payload_base64"].is_null()));
+
+ close_single_file(primary).expect("close primary");
+}
diff --git a/ray-rs/ts/index.ts b/ray-rs/ts/index.ts
index dc42c09..71d32a9 100644
--- a/ray-rs/ts/index.ts
+++ b/ray-rs/ts/index.ts
@@ -1032,8 +1032,10 @@ export {
backupInfo,
createOfflineBackup,
collectMetrics,
+ collectReplicationLogTransportJson,
collectReplicationMetricsOtelJson,
collectReplicationMetricsPrometheus,
+ collectReplicationSnapshotTransportJson,
pushReplicationMetricsOtelJson,
pushReplicationMetricsOtelJsonWithOptions,
healthCheck,
From 9e97f1857c9ecf248eb363cb46c813140f18b118 Mon Sep 17 00:00:00 2001
From: mask
Date: Sun, 8 Feb 2026 11:13:48 -0600
Subject: [PATCH 04/58] replication: expose transport export APIs in bindings
---
docs/REPLICATION_PLAN.md | 2 ++
docs/REPLICATION_RUNBOOK.md | 2 ++
ray-rs/src/napi_bindings/database.rs | 45 ++++++++++++++++++++++++++++
ray-rs/src/pyo3_bindings/database.rs | 41 +++++++++++++++++++++++++
4 files changed, 90 insertions(+)
diff --git a/docs/REPLICATION_PLAN.md b/docs/REPLICATION_PLAN.md
index 78a4c55..b0134de 100644
--- a/docs/REPLICATION_PLAN.md
+++ b/docs/REPLICATION_PLAN.md
@@ -377,6 +377,8 @@ Implemented:
- `GET /api/replication/metrics` (Prometheus text export)
- `GET /api/replication/snapshot/latest`
- `GET /api/replication/log`
+ - `GET /api/replication/transport/snapshot` (host-runtime transport export passthrough)
+ - `GET /api/replication/transport/log` (host-runtime transport export passthrough)
- `POST /api/replication/pull`
- `POST /api/replication/reseed`
- `POST /api/replication/promote`
diff --git a/docs/REPLICATION_RUNBOOK.md b/docs/REPLICATION_RUNBOOK.md
index b1a9d4d..ff1b801 100644
--- a/docs/REPLICATION_RUNBOOK.md
+++ b/docs/REPLICATION_RUNBOOK.md
@@ -142,6 +142,8 @@ Available endpoints in `playground/src/api/routes.ts`:
- `GET /api/replication/metrics` (Prometheus text format)
- `GET /api/replication/snapshot/latest`
- `GET /api/replication/log`
+- `GET /api/replication/transport/snapshot` (host-runtime transport export passthrough)
+- `GET /api/replication/transport/log` (host-runtime transport export passthrough)
- `POST /api/replication/pull` (runs `replica_catch_up_once`)
- `POST /api/replication/reseed` (runs `replica_reseed_from_snapshot`)
- `POST /api/replication/promote` (runs `primary_promote_to_next_epoch`)
diff --git a/ray-rs/src/napi_bindings/database.rs b/ray-rs/src/napi_bindings/database.rs
index 4a9185c..2fc3a11 100644
--- a/ray-rs/src/napi_bindings/database.rs
+++ b/ray-rs/src/napi_bindings/database.rs
@@ -1397,6 +1397,51 @@ impl Database {
}
}
+ /// Export latest primary snapshot metadata and optional bytes as transport JSON.
+ #[napi]
+ pub fn export_replication_snapshot_transport_json(
+ &self,
+ include_data: Option,
+ ) -> Result {
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .primary_export_snapshot_transport_json(include_data.unwrap_or(false))
+ .map_err(|e| Error::from_reason(format!("Failed to export replication snapshot: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
+ /// Export primary replication log page (cursor + limits) as transport JSON.
+ #[napi]
+ pub fn export_replication_log_transport_json(
+ &self,
+ cursor: Option,
+ max_frames: Option,
+ max_bytes: Option,
+ include_payload: Option,
+ ) -> Result {
+ let max_frames = max_frames.unwrap_or(128);
+ let max_bytes = max_bytes.unwrap_or(1_048_576);
+ if max_frames <= 0 {
+ return Err(Error::from_reason("maxFrames must be positive"));
+ }
+ if max_bytes <= 0 {
+ return Err(Error::from_reason("maxBytes must be positive"));
+ }
+
+ match self.inner.as_ref() {
+ Some(DatabaseInner::SingleFile(db)) => db
+ .primary_export_log_transport_json(
+ cursor.as_deref(),
+ max_frames as usize,
+ max_bytes as usize,
+ include_payload.unwrap_or(true),
+ )
+ .map_err(|e| Error::from_reason(format!("Failed to export replication log: {e}"))),
+ None => Err(Error::from_reason("Database is closed")),
+ }
+ }
+
/// Bootstrap a replica from the primary snapshot.
#[napi]
pub fn replica_bootstrap_from_snapshot(&self) -> Result<()> {
diff --git a/ray-rs/src/pyo3_bindings/database.rs b/ray-rs/src/pyo3_bindings/database.rs
index cfbc7f5..f4d8cae 100644
--- a/ray-rs/src/pyo3_bindings/database.rs
+++ b/ray-rs/src/pyo3_bindings/database.rs
@@ -428,6 +428,47 @@ impl PyDatabase {
)
}
+ /// Export latest primary snapshot metadata and optional bytes as transport JSON.
+ #[pyo3(signature = (include_data=false))]
+ fn export_replication_snapshot_transport_json(&self, include_data: bool) -> PyResult {
+ dispatch!(
+ self,
+ |db| db.primary_export_snapshot_transport_json(include_data).map_err(|e| {
+ PyRuntimeError::new_err(format!("Failed to export replication snapshot: {e}"))
+ }),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
+ /// Export primary replication log page (cursor + limits) as transport JSON.
+ #[pyo3(signature = (cursor=None, max_frames=128, max_bytes=1048576, include_payload=true))]
+ fn export_replication_log_transport_json(
+ &self,
+ cursor: Option,
+ max_frames: i64,
+ max_bytes: i64,
+ include_payload: bool,
+ ) -> PyResult {
+ if max_frames <= 0 {
+ return Err(PyRuntimeError::new_err("max_frames must be positive"));
+ }
+ if max_bytes <= 0 {
+ return Err(PyRuntimeError::new_err("max_bytes must be positive"));
+ }
+ dispatch!(
+ self,
+ |db| db
+ .primary_export_log_transport_json(
+ cursor.as_deref(),
+ max_frames as usize,
+ max_bytes as usize,
+ include_payload,
+ )
+ .map_err(|e| PyRuntimeError::new_err(format!("Failed to export replication log: {e}"))),
+ |_db| { unreachable!("multi-file database support removed") }
+ )
+ }
+
/// Bootstrap replica state from source snapshot.
fn replica_bootstrap_from_snapshot(&self) -> PyResult<()> {
dispatch!(
From 902410b521c88a9d2b536fa513905e0d49c2f2a7 Mon Sep 17 00:00:00 2001
From: mask
Date: Sun, 8 Feb 2026 11:14:23 -0600
Subject: [PATCH 05/58] playground: add replication admin transport endpoints
---
playground/PLAN.md | 22 +-
playground/package.json | 3 +-
playground/src/api/db.ts | 8 +-
playground/src/api/routes.replication.test.ts | 1325 +++++++++++++++++
playground/src/api/routes.ts | 1233 ++++++++++++++-
playground/src/client/lib/api.ts | 160 +-
playground/src/client/lib/types.ts | 92 ++
playground/src/server.ts | 87 +-
8 files changed, 2921 insertions(+), 9 deletions(-)
create mode 100644 playground/src/api/routes.replication.test.ts
diff --git a/playground/PLAN.md b/playground/PLAN.md
index 3efb728..2afce0f 100644
--- a/playground/PLAN.md
+++ b/playground/PLAN.md
@@ -166,7 +166,14 @@ playground/
```typescript
// Database Management
GET /api/status → { connected: boolean, path?: string, nodeCount?: number, edgeCount?: number }
-POST /api/db/open ← { path: string } → { success: boolean, error?: string }
+GET /api/replication/status → { connected: boolean, role: "primary"|"replica"|"disabled", primary?: ..., replica?: ... }
+GET /api/replication/metrics → text/plain (Prometheus exposition format)
+GET /api/replication/snapshot/latest → { success: boolean, snapshot?: { byteLength, sha256, ... } }
+GET /api/replication/log?cursor=...&maxBytes=...&maxFrames=... → { success: boolean, frames: [...], nextCursor, eof }
+POST /api/replication/pull ← { maxFrames?: number } → { success: boolean, appliedFrames?: number, replica?: ... }
+POST /api/replication/reseed → { success: boolean, replica?: ... }
+POST /api/replication/promote → { success: boolean, epoch?: number, primary?: ... }
+POST /api/db/open ← { path: string, options?: { readOnly?, syncMode?, replicationRole?, ... } } → { success: boolean, error?: string }
POST /api/db/upload ← FormData (file) → { success: boolean, error?: string }
POST /api/db/demo → { success: boolean }
POST /api/db/close → { success: boolean }
@@ -193,6 +200,19 @@ POST /api/graph/path ← { startKey: string, endKey: string } → { pat
POST /api/graph/impact ← { nodeKey: string } → { impacted: string[], edges: string[] }
```
+Replication admin auth:
+- Auth mode envs:
+ - `REPLICATION_ADMIN_AUTH_MODE` = `none|token|mtls|token_or_mtls|token_and_mtls`
+ - `REPLICATION_ADMIN_TOKEN` for token modes
+ - `REPLICATION_MTLS_HEADER` (default `x-forwarded-client-cert`) for mTLS modes
+ - `REPLICATION_MTLS_SUBJECT_REGEX` optional subject filter for mTLS modes
+ - `REPLICATION_MTLS_NATIVE_TLS=true` to treat native HTTPS + client-cert verification as mTLS auth
+ - `PLAYGROUND_TLS_CERT_FILE` + `PLAYGROUND_TLS_KEY_FILE` enable HTTPS listener
+ - `PLAYGROUND_TLS_CA_FILE` optional custom client-cert CA bundle
+ - `PLAYGROUND_TLS_REQUEST_CERT` + `PLAYGROUND_TLS_REJECT_UNAUTHORIZED` for TLS client-cert enforcement
+- Admin endpoints (`/snapshot/latest`, `/metrics`, `/log`, `/pull`, `/reseed`, `/promote`) enforce the selected mode.
+- `/api/replication/status` remains readable without auth.
+
---
## Node/Edge Visualization Format
diff --git a/playground/package.json b/playground/package.json
index 9dbe234..441921a 100644
--- a/playground/package.json
+++ b/playground/package.json
@@ -6,7 +6,8 @@
"scripts": {
"dev": "bun run --watch src/server.ts",
"start": "bun run src/server.ts",
- "build": "bun run build.ts"
+ "build": "bun run build.ts",
+ "test": "bun test"
},
"dependencies": {
"elysia": "^1.2.0",
diff --git a/playground/src/api/db.ts b/playground/src/api/db.ts
index be6b00a..8155c77 100644
--- a/playground/src/api/db.ts
+++ b/playground/src/api/db.ts
@@ -8,12 +8,13 @@ import { tmpdir } from "node:os";
import { join } from "node:path";
import {
type Kite,
+ type KiteOptions,
defineEdge,
defineNode,
kite,
optional,
prop,
-} from "../../../src/index.ts";
+} from "../../../ray-rs/ts/index.ts";
import { createDemoGraph } from "./demo-data.ts";
import { mkdtemp, rm, writeFile } from "node:fs/promises";
@@ -75,16 +76,19 @@ interface DbState {
let currentDb: DbState | null = null;
+export type PlaygroundOpenOptions = Omit<KiteOptions, "nodes" | "edges">;
+
/**
* Open a database from a file path
*/
export async function openDatabase(
path: string,
+ options?: PlaygroundOpenOptions,
): Promise<{ success: boolean; error?: string }> {
try {
await closeDatabase();
- const db = await kite(path, { nodes, edges });
+ const db = await kite(path, { nodes, edges, ...(options ?? {}) });
currentDb = { db, path, isDemo: false };
return { success: true };
diff --git a/playground/src/api/routes.replication.test.ts b/playground/src/api/routes.replication.test.ts
new file mode 100644
index 0000000..b9270d3
--- /dev/null
+++ b/playground/src/api/routes.replication.test.ts
@@ -0,0 +1,1325 @@
+import { afterEach, beforeAll, describe, expect, test } from "bun:test";
+import { createHash } from "node:crypto";
+import { mkdtemp, readFile, rm } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+process.env.REPLICATION_ADMIN_TOKEN = "test-repl-admin-token";
+
+const { Elysia } = await import("elysia");
+const { apiRoutes } = await import("./routes.ts");
+const { closeDatabase, getDb, FileNode } = await import("./db.ts");
+
+const AUTH_HEADER = {
+ Authorization: `Bearer ${process.env.REPLICATION_ADMIN_TOKEN}`,
+};
+
+interface JsonResponse<T = Record<string, unknown>> {
+ status: number;
+ body: T;
+}
+
+interface TextResponse {
+ status: number;
+ body: string;
+}
+
+interface ManifestEnvelope {
+ version: number;
+ payload_crc32: number;
+ manifest: {
+ version: number;
+ epoch: number;
+ head_log_index: number;
+ retained_floor: number;
+ active_segment_id: number;
+ segments: Array<{
+ id: number;
+ start_log_index: number;
+ end_log_index: number;
+ size_bytes: number;
+ }>;
+ };
+}
+
+let app: InstanceType<typeof Elysia>;
+let tempDir: string;
+let dbPath: string;
+
+type ReplicationAuthEnvKey =
+ | "REPLICATION_ADMIN_TOKEN"
+ | "REPLICATION_ADMIN_AUTH_MODE"
+ | "REPLICATION_MTLS_HEADER"
+ | "REPLICATION_MTLS_SUBJECT_REGEX"
+ | "REPLICATION_MTLS_NATIVE_TLS"
+ | "PLAYGROUND_TLS_REQUEST_CERT"
+ | "PLAYGROUND_TLS_REJECT_UNAUTHORIZED";
+
+async function withReplicationAuthEnv<T>(
+ overrides: Partial<Record<ReplicationAuthEnvKey, string | null>>,
+ run: () => Promise<T>,
+): Promise<T> {
+ const keys: ReplicationAuthEnvKey[] = [
+ "REPLICATION_ADMIN_TOKEN",
+ "REPLICATION_ADMIN_AUTH_MODE",
+ "REPLICATION_MTLS_HEADER",
+ "REPLICATION_MTLS_SUBJECT_REGEX",
+ "REPLICATION_MTLS_NATIVE_TLS",
+ "PLAYGROUND_TLS_REQUEST_CERT",
+ "PLAYGROUND_TLS_REJECT_UNAUTHORIZED",
+ ];
+ const previous: Partial<Record<ReplicationAuthEnvKey, string | undefined>> = {};
+ for (const key of keys) {
+ previous[key] = process.env[key];
+ }
+
+ for (const [key, value] of Object.entries(overrides) as Array<
+ [ReplicationAuthEnvKey, string | null]
+ >) {
+ if (value === null) {
+ delete process.env[key];
+ } else {
+ process.env[key] = value;
+ }
+ }
+
+ try {
+ return await run();
+ } finally {
+ for (const key of keys) {
+ const value = previous[key];
+ if (value === undefined) {
+ delete process.env[key];
+ } else {
+ process.env[key] = value;
+ }
+ }
+ }
+}
+
+async function requestJson<T = Record<string, unknown>>(
+ method: string,
+ path: string,
+ body?: unknown,
+ headers?: Record<string, string>,
+ origin = "http://localhost",
+): Promise<JsonResponse<T>> {
+ const request = new Request(`${origin}${path}`, {
+ method,
+ headers: {
+ ...(body !== undefined ? { "content-type": "application/json" } : {}),
+ ...(headers ?? {}),
+ },
+ body: body !== undefined ? JSON.stringify(body) : undefined,
+ });
+
+ const response = await app.handle(request);
+ return {
+ status: response.status,
+ body: (await response.json()) as T,
+ };
+}
+
+async function requestText(
+ method: string,
+ path: string,
+ body?: unknown,
+ headers?: Record<string, string>,
+ origin = "http://localhost",
+): Promise<TextResponse> {
+ const request = new Request(`${origin}${path}`, {
+ method,
+ headers: {
+ ...(body !== undefined ? { "content-type": "application/json" } : {}),
+ ...(headers ?? {}),
+ },
+ body: body !== undefined ? JSON.stringify(body) : undefined,
+ });
+
+ const response = await app.handle(request);
+ return {
+ status: response.status,
+ body: await response.text(),
+ };
+}
+
+async function openPrimary(): Promise<void> {
+ tempDir = await mkdtemp(join(tmpdir(), "playground-repl-test-"));
+ dbPath = join(tempDir, "primary.kitedb");
+
+ const response = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/db/open",
+ {
+ path: dbPath,
+ options: {
+ replicationRole: "primary",
+ },
+ },
+ );
+
+ expect(response.status).toBe(200);
+ expect(response.body.success).toBe(true);
+}
+
+async function appendCommits(count: number): Promise<void> {
+ const db = getDb();
+ expect(db).not.toBeNull();
+ for (let i = 0; i < count; i++) {
+ await db!
+ .insert(FileNode)
+ .values({
+ key: `src/file-${i}.ts`,
+ path: `src/file-${i}.ts`,
+ language: "typescript",
+ })
+ .returning();
+ }
+}
+
+beforeAll(() => {
+ app = new Elysia().use(apiRoutes);
+});
+
+afterEach(async () => {
+ await closeDatabase();
+ if (tempDir) {
+ await rm(tempDir, { recursive: true, force: true });
+ }
+});
+
+describe("replication log endpoints", () => {
+ test("paginates log frames using maxFrames + nextCursor", async () => {
+ await openPrimary();
+ await appendCommits(5);
+
+ const first = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ eof: boolean;
+ nextCursor: string | null;
+ frames: Array<{ logIndex: string }>;
+ }>("GET", "/api/replication/log?maxFrames=2", undefined, AUTH_HEADER);
+
+ expect(first.status).toBe(200);
+ expect(first.body.success).toBe(true);
+ expect(first.body.frameCount).toBe(2);
+ expect(first.body.eof).toBe(false);
+ expect(first.body.nextCursor).toBeTruthy();
+ expect(first.body.frames.length).toBe(2);
+
+ const lastFirstLogIndex = BigInt(first.body.frames[1].logIndex);
+ const second = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ frames: Array<{ logIndex: string }>;
+ cursor: string | null;
+ }>(
+ "GET",
+ `/api/replication/log?maxFrames=2&cursor=${encodeURIComponent(first.body.nextCursor!)}`,
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(second.status).toBe(200);
+ expect(second.body.success).toBe(true);
+ expect(second.body.cursor).toBe(first.body.nextCursor);
+ expect(second.body.frameCount).toBeGreaterThan(0);
+ expect(BigInt(second.body.frames[0].logIndex) > lastFirstLogIndex).toBe(true);
+ });
+
+ test("respects maxBytes and returns one frame minimum", async () => {
+ await openPrimary();
+ await appendCommits(3);
+
+ const response = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ eof: boolean;
+ totalBytes: number;
+ nextCursor: string | null;
+ }>("GET", "/api/replication/log?maxBytes=1", undefined, AUTH_HEADER);
+
+ expect(response.status).toBe(200);
+ expect(response.body.success).toBe(true);
+ expect(response.body.frameCount).toBe(1);
+ expect(response.body.totalBytes).toBeGreaterThan(0);
+ expect(response.body.eof).toBe(false);
+ expect(response.body.nextCursor).toBeTruthy();
+ });
+
+ test("returns structured error on malformed cursor", async () => {
+ await openPrimary();
+ await appendCommits(1);
+
+ const response = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log?cursor=bad-cursor",
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(response.status).toBe(200);
+ expect(response.body.success).toBe(false);
+ expect(response.body.error).toBeTruthy();
+ });
+
+ test("returns structured error on malformed 4-part cursor with non-numeric components", async () => {
+ await openPrimary();
+ await appendCommits(2);
+
+ const response = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log?cursor=1:abc:def:ghi",
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(response.status).toBe(200);
+ expect(response.body.success).toBe(false);
+ expect(response.body.error).toBeTruthy();
+ });
+
+ test("returns structured error on cursor with too many components", async () => {
+ await openPrimary();
+ await appendCommits(2);
+
+ const response = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log?cursor=1:2:3:4:5",
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(response.status).toBe(200);
+ expect(response.body.success).toBe(false);
+ expect(response.body.error).toBeTruthy();
+ });
+
+ test("accepts cursors with empty numeric components as zero (current behavior)", async () => {
+ await openPrimary();
+ await appendCommits(2);
+
+ const emptySegmentId = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ cursor: string | null;
+ nextCursor: string | null;
+ }>(
+ "GET",
+ "/api/replication/log?cursor=1::3:4",
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(emptySegmentId.status).toBe(200);
+ expect(emptySegmentId.body.success).toBe(true);
+ expect(emptySegmentId.body.cursor).toBe("1::3:4");
+ expect(emptySegmentId.body.frameCount).toBe(0);
+ expect(emptySegmentId.body.nextCursor).toBe("1::3:4");
+
+ const emptyEpoch = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ cursor: string | null;
+ nextCursor: string | null;
+ }>(
+ "GET",
+ "/api/replication/log?cursor=:2",
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(emptyEpoch.status).toBe(200);
+ expect(emptyEpoch.body.success).toBe(true);
+ expect(emptyEpoch.body.cursor).toBe(":2");
+ expect(emptyEpoch.body.frameCount).toBe(2);
+ expect(emptyEpoch.body.nextCursor).toBeTruthy();
+ });
+
+ test("accepts 2-part cursor format epoch:logIndex", async () => {
+ await openPrimary();
+ await appendCommits(5);
+
+ const first = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ frames: Array<{ epoch: string; logIndex: string }>;
+ }>("GET", "/api/replication/log?maxFrames=2", undefined, AUTH_HEADER);
+ expect(first.status).toBe(200);
+ expect(first.body.success).toBe(true);
+ expect(first.body.frameCount).toBe(2);
+
+ const cursor = `${first.body.frames[0].epoch}:${first.body.frames[0].logIndex}`;
+ const second = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ frames: Array<{ logIndex: string }>;
+ }>(
+ "GET",
+ `/api/replication/log?maxFrames=4&cursor=${encodeURIComponent(cursor)}`,
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(second.status).toBe(200);
+ expect(second.body.success).toBe(true);
+ expect(second.body.frameCount).toBeGreaterThan(0);
+ expect(BigInt(second.body.frames[0].logIndex) > BigInt(first.body.frames[0].logIndex)).toBe(
+ true,
+ );
+ });
+
+ test("4-part cursor resumes consistently at frame start vs frame end offset", async () => {
+ await openPrimary();
+ await appendCommits(5);
+
+ const firstPage = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ nextCursor: string | null;
+ frames: Array<{
+ epoch: string;
+ segmentId: string;
+ segmentOffset: string;
+ logIndex: string;
+ payloadBase64: string;
+ }>;
+ }>(
+ "GET",
+ "/api/replication/log?maxFrames=1&includePayload=false",
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(firstPage.status).toBe(200);
+ expect(firstPage.body.success).toBe(true);
+ expect(firstPage.body.frameCount).toBe(1);
+ expect(firstPage.body.nextCursor).toBeTruthy();
+
+ const firstFrame = firstPage.body.frames[0];
+ const startCursor = `${firstFrame.epoch}:${firstFrame.segmentId}:${firstFrame.segmentOffset}:${firstFrame.logIndex}`;
+
+ const resumedFromStart = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ frames: Array<{ logIndex: string; payloadBase64: string }>;
+ }>(
+ "GET",
+ `/api/replication/log?maxFrames=3&includePayload=false&cursor=${encodeURIComponent(startCursor)}`,
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(resumedFromStart.status).toBe(200);
+ expect(resumedFromStart.body.success).toBe(true);
+ expect(resumedFromStart.body.frameCount).toBeGreaterThan(0);
+ expect(
+ BigInt(resumedFromStart.body.frames[0].logIndex) > BigInt(firstFrame.logIndex),
+ ).toBe(true);
+
+ const resumedFromEnd = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ frames: Array<{ logIndex: string; payloadBase64: string }>;
+ }>(
+ "GET",
+ `/api/replication/log?maxFrames=3&includePayload=false&cursor=${encodeURIComponent(firstPage.body.nextCursor!)}`,
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(resumedFromEnd.status).toBe(200);
+ expect(resumedFromEnd.body.success).toBe(true);
+ expect(resumedFromEnd.body.frameCount).toBeGreaterThan(0);
+
+ expect(resumedFromEnd.body.frames[0].logIndex).toBe(
+ resumedFromStart.body.frames[0].logIndex,
+ );
+ expect(resumedFromStart.body.frames[0].payloadBase64).toBe("");
+ expect(resumedFromEnd.body.frames[0].payloadBase64).toBe("");
+ });
+
+ test("supports includePayload=false while preserving paging cursors", async () => {
+ await openPrimary();
+ await appendCommits(4);
+
+ const first = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ nextCursor: string | null;
+ frames: Array<{ payloadBase64: string; logIndex: string }>;
+ }>(
+ "GET",
+ "/api/replication/log?maxFrames=2&includePayload=false",
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(first.status).toBe(200);
+ expect(first.body.success).toBe(true);
+ expect(first.body.frameCount).toBe(2);
+ expect(first.body.nextCursor).toBeTruthy();
+ for (const frame of first.body.frames) {
+ expect(frame.payloadBase64).toBe("");
+ }
+
+ const lastFirstLogIndex = BigInt(first.body.frames[1].logIndex);
+ const second = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ frames: Array<{ payloadBase64: string; logIndex: string }>;
+ }>(
+ "GET",
+ `/api/replication/log?maxFrames=2&includePayload=false&cursor=${encodeURIComponent(first.body.nextCursor!)}`,
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(second.status).toBe(200);
+ expect(second.body.success).toBe(true);
+ expect(second.body.frameCount).toBeGreaterThan(0);
+ for (const frame of second.body.frames) {
+ expect(frame.payloadBase64).toBe("");
+ }
+ expect(BigInt(second.body.frames[0].logIndex) > lastFirstLogIndex).toBe(true);
+ });
+
+ test("includePayload=false still honors maxBytes paging and cursor resume", async () => {
+ await openPrimary();
+ await appendCommits(4);
+
+ const first = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ totalBytes: number;
+ nextCursor: string | null;
+ eof: boolean;
+ frames: Array<{ payloadBase64: string; logIndex: string }>;
+ }>(
+ "GET",
+ "/api/replication/log?includePayload=false&maxBytes=1",
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(first.status).toBe(200);
+ expect(first.body.success).toBe(true);
+ expect(first.body.frameCount).toBe(1);
+ expect(first.body.totalBytes).toBeGreaterThan(0);
+ expect(first.body.eof).toBe(false);
+ expect(first.body.nextCursor).toBeTruthy();
+ expect(first.body.frames[0].payloadBase64).toBe("");
+
+ const firstLogIndex = BigInt(first.body.frames[0].logIndex);
+ const second = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ totalBytes: number;
+ nextCursor: string | null;
+ eof: boolean;
+ frames: Array<{ payloadBase64: string; logIndex: string }>;
+ }>(
+ "GET",
+ `/api/replication/log?includePayload=false&maxBytes=1&cursor=${encodeURIComponent(first.body.nextCursor!)}`,
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(second.status).toBe(200);
+ expect(second.body.success).toBe(true);
+ expect(second.body.frameCount).toBe(1);
+ expect(second.body.totalBytes).toBeGreaterThan(0);
+ expect(second.body.nextCursor).toBeTruthy();
+ expect(second.body.frames[0].payloadBase64).toBe("");
+ expect(BigInt(second.body.frames[0].logIndex) > firstLogIndex).toBe(true);
+ });
+
+ test("replication log uses sane defaults when query params are omitted", async () => {
+ await openPrimary();
+ await appendCommits(3);
+
+ const response = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ eof: boolean;
+ nextCursor: string | null;
+ frames: Array<{ payloadBase64: string }>;
+ }>("GET", "/api/replication/log", undefined, AUTH_HEADER);
+
+ expect(response.status).toBe(200);
+ expect(response.body.success).toBe(true);
+ expect(response.body.frameCount).toBeGreaterThan(0);
+ expect(response.body.frameCount).toBeLessThanOrEqual(256);
+ expect(response.body.eof).toBe(true);
+ expect(response.body.nextCursor).toBeTruthy();
+ for (const frame of response.body.frames) {
+ expect(frame.payloadBase64.length).toBeGreaterThan(0);
+ }
+ });
+
+ test("replication log clamps out-of-range maxFrames/maxBytes query values", async () => {
+ await openPrimary();
+ await appendCommits(5);
+
+ const zeroFrames = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ eof: boolean;
+ frames: Array<{ payloadBase64: string }>;
+ }>(
+ "GET",
+ "/api/replication/log?includePayload=false&maxFrames=0&maxBytes=999999999",
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(zeroFrames.status).toBe(200);
+ expect(zeroFrames.body.success).toBe(true);
+ expect(zeroFrames.body.frameCount).toBe(1);
+ expect(zeroFrames.body.eof).toBe(false);
+ expect(zeroFrames.body.frames[0].payloadBase64).toBe("");
+
+ const negativeFrames = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ eof: boolean;
+ frames: Array<{ payloadBase64: string }>;
+ }>(
+ "GET",
+ "/api/replication/log?includePayload=false&maxFrames=-10&maxBytes=999999999",
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(negativeFrames.status).toBe(200);
+ expect(negativeFrames.body.success).toBe(true);
+ expect(negativeFrames.body.frameCount).toBe(1);
+ expect(negativeFrames.body.eof).toBe(false);
+ expect(negativeFrames.body.frames[0].payloadBase64).toBe("");
+
+ const negativeBytes = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ eof: boolean;
+ totalBytes: number;
+ frames: Array<{ payloadBase64: string }>;
+ }>(
+ "GET",
+ "/api/replication/log?includePayload=false&maxFrames=999999&maxBytes=-7",
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(negativeBytes.status).toBe(200);
+ expect(negativeBytes.body.success).toBe(true);
+ expect(negativeBytes.body.frameCount).toBe(1);
+ expect(negativeBytes.body.totalBytes).toBeGreaterThan(0);
+ expect(negativeBytes.body.eof).toBe(false);
+ expect(negativeBytes.body.frames[0].payloadBase64).toBe("");
+ });
+
+ test("replication log falls back to defaults on invalid query values", async () => {
+ await openPrimary();
+ await appendCommits(10);
+
+ const response = await requestJson<{
+ success: boolean;
+ frameCount: number;
+ eof: boolean;
+ nextCursor: string | null;
+ frames: Array<{ payloadBase64: string }>;
+ }>(
+ "GET",
+ "/api/replication/log?maxFrames=abc&maxBytes=nan&includePayload=maybe",
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(response.status).toBe(200);
+ expect(response.body.success).toBe(true);
+ expect(response.body.frameCount).toBeGreaterThan(1);
+ expect(response.body.frameCount).toBeLessThanOrEqual(256);
+ expect(response.body.eof).toBe(true);
+ expect(response.body.nextCursor).toBeTruthy();
+ for (const frame of response.body.frames) {
+ expect(frame.payloadBase64.length).toBeGreaterThan(0);
+ }
+ });
+
+ test("snapshot includeData=true returns consistent bytes/hash metadata", async () => {
+ await openPrimary();
+ await appendCommits(3);
+
+ const response = await requestJson<{
+ success: boolean;
+ role?: string;
+ snapshot?: {
+ dbPath?: string;
+ byteLength?: number;
+ sha256?: string;
+ dataBase64?: string;
+ };
+ }>("GET", "/api/replication/snapshot/latest?includeData=true", undefined, AUTH_HEADER);
+
+ expect(response.status).toBe(200);
+ expect(response.body.success).toBe(true);
+ expect(response.body.role).toBe("primary");
+
+ const snapshot = response.body.snapshot;
+ expect(snapshot).toBeTruthy();
+ expect(snapshot?.dbPath).toBeTruthy();
+ expect(snapshot?.byteLength).toBeGreaterThan(0);
+ expect(snapshot?.sha256).toBeTruthy();
+ expect(snapshot?.dataBase64).toBeTruthy();
+
+ const decoded = Buffer.from(snapshot!.dataBase64!, "base64");
+ expect(decoded.byteLength).toBe(snapshot!.byteLength);
+
+ const fileBytes = await readFile(snapshot!.dbPath!);
+ expect(fileBytes.byteLength).toBe(snapshot!.byteLength);
+ expect(Buffer.compare(decoded, fileBytes)).toBe(0);
+
+ const computed = createHash("sha256").update(fileBytes).digest("hex");
+ expect(computed).toBe(snapshot!.sha256);
+ });
+
+ test("snapshot includeData=false omits payload but keeps valid metadata", async () => {
+ await openPrimary();
+ await appendCommits(2);
+
+ const response = await requestJson<{
+ success: boolean;
+ role?: string;
+ snapshot?: {
+ dbPath?: string;
+ byteLength?: number;
+ sha256?: string;
+ dataBase64?: string;
+ };
+ }>("GET", "/api/replication/snapshot/latest?includeData=false", undefined, AUTH_HEADER);
+
+ expect(response.status).toBe(200);
+ expect(response.body.success).toBe(true);
+ expect(response.body.role).toBe("primary");
+
+ const snapshot = response.body.snapshot;
+ expect(snapshot).toBeTruthy();
+ expect(snapshot?.dbPath).toBeTruthy();
+ expect(snapshot?.byteLength).toBeGreaterThan(0);
+ expect(snapshot?.sha256).toBeTruthy();
+ expect(snapshot?.dataBase64).toBeUndefined();
+
+ const fileBytes = await readFile(snapshot!.dbPath!);
+ expect(fileBytes.byteLength).toBe(snapshot!.byteLength);
+ const computed = createHash("sha256").update(fileBytes).digest("hex");
+ expect(computed).toBe(snapshot!.sha256);
+ });
+
+ test("enforces bearer token on protected endpoints", async () => {
+ await openPrimary();
+
+ const unauthorized = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log",
+ );
+
+ expect(unauthorized.status).toBe(401);
+ expect(unauthorized.body.success).toBe(false);
+ expect(unauthorized.body.error).toContain("Unauthorized");
+
+ const authorized = await requestJson<{ success: boolean }>(
+ "GET",
+ "/api/replication/log",
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(authorized.status).toBe(200);
+ expect(authorized.body.success).toBe(true);
+ });
+
+ test("replication status remains readable without bearer token", async () => {
+ await openPrimary();
+ await appendCommits(1);
+
+ const publicStatus = await requestJson<{
+ connected: boolean;
+ authEnabled: boolean;
+ role: string;
+ primary?: { headLogIndex?: number };
+ }>("GET", "/api/replication/status");
+ expect(publicStatus.status).toBe(200);
+ expect(publicStatus.body.connected).toBe(true);
+ expect(publicStatus.body.authEnabled).toBe(true);
+ expect(publicStatus.body.role).toBe("primary");
+ expect((publicStatus.body.primary?.headLogIndex ?? 0) > 0).toBe(true);
+
+ const adminBlocked = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log",
+ );
+ expect(adminBlocked.status).toBe(401);
+ expect(adminBlocked.body.success).toBe(false);
+ });
+
+ test("replication metrics endpoint exports Prometheus text when authorized", async () => {
+ await openPrimary();
+ await appendCommits(3);
+
+ const metrics = await requestText(
+ "GET",
+ "/api/replication/metrics",
+ undefined,
+ AUTH_HEADER,
+ );
+
+ expect(metrics.status).toBe(200);
+ expect(metrics.body).toContain("# HELP raydb_replication_enabled");
+ expect(metrics.body).toContain("# TYPE raydb_replication_enabled gauge");
+ expect(metrics.body).toContain('raydb_replication_enabled{role="primary"} 1');
+ expect(metrics.body).toContain("raydb_replication_primary_head_log_index");
+ expect(metrics.body).toContain("raydb_replication_primary_append_attempts_total");
+ });
+
+ test("replication metrics endpoint requires bearer token", async () => {
+ await openPrimary();
+
+ const unauthorized = await requestText("GET", "/api/replication/metrics");
+ expect(unauthorized.status).toBe(401);
+ expect(unauthorized.body).toContain("Unauthorized");
+ });
+
+ test("supports mTLS-only admin auth mode", async () => {
+ await openPrimary();
+ await appendCommits(1);
+
+ await withReplicationAuthEnv(
+ {
+ REPLICATION_ADMIN_AUTH_MODE: "mtls",
+ REPLICATION_MTLS_HEADER: "x-client-cert",
+ REPLICATION_MTLS_SUBJECT_REGEX: "^CN=allowed",
+ },
+ async () => {
+ const noMtls = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log",
+ );
+ expect(noMtls.status).toBe(401);
+ expect(noMtls.body.success).toBe(false);
+
+ const badSubject = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log",
+ undefined,
+ { "x-client-cert": "CN=denied-client" },
+ );
+ expect(badSubject.status).toBe(401);
+ expect(badSubject.body.success).toBe(false);
+
+ const goodSubject = await requestJson<{ success: boolean }>(
+ "GET",
+ "/api/replication/log",
+ undefined,
+ { "x-client-cert": "CN=allowed-client,O=RayDB" },
+ );
+ expect(goodSubject.status).toBe(200);
+ expect(goodSubject.body.success).toBe(true);
+ },
+ );
+ });
+
+ test("supports native TLS mTLS auth mode without proxy header", async () => {
+ await openPrimary();
+ await appendCommits(1);
+
+ await withReplicationAuthEnv(
+ {
+ REPLICATION_ADMIN_AUTH_MODE: "mtls",
+ REPLICATION_MTLS_NATIVE_TLS: "true",
+ PLAYGROUND_TLS_REQUEST_CERT: "true",
+ PLAYGROUND_TLS_REJECT_UNAUTHORIZED: "true",
+ REPLICATION_MTLS_HEADER: null,
+ REPLICATION_MTLS_SUBJECT_REGEX: null,
+ },
+ async () => {
+ const httpRequest = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log",
+ );
+ expect(httpRequest.status).toBe(401);
+ expect(httpRequest.body.success).toBe(false);
+
+ const httpsRequest = await requestJson<{ success: boolean }>(
+ "GET",
+ "/api/replication/log",
+ undefined,
+ undefined,
+ "https://localhost",
+ );
+ expect(httpsRequest.status).toBe(200);
+ expect(httpsRequest.body.success).toBe(true);
+ },
+ );
+ });
+
+ test("rejects invalid native TLS mTLS config", async () => {
+ await openPrimary();
+ await appendCommits(1);
+
+ await withReplicationAuthEnv(
+ {
+ REPLICATION_ADMIN_AUTH_MODE: "mtls",
+ REPLICATION_MTLS_NATIVE_TLS: "true",
+ PLAYGROUND_TLS_REQUEST_CERT: "false",
+ PLAYGROUND_TLS_REJECT_UNAUTHORIZED: "true",
+ },
+ async () => {
+ const response = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log",
+ );
+ expect(response.status).toBe(500);
+ expect(response.body.success).toBe(false);
+ expect(response.body.error).toContain("REPLICATION_MTLS_NATIVE_TLS requires");
+ },
+ );
+ });
+
+ test("supports token_and_mtls admin auth mode", async () => {
+ await openPrimary();
+ await appendCommits(1);
+
+ await withReplicationAuthEnv(
+ {
+ REPLICATION_ADMIN_TOKEN: "combo-token",
+ REPLICATION_ADMIN_AUTH_MODE: "token_and_mtls",
+ REPLICATION_MTLS_HEADER: "x-client-cert",
+ REPLICATION_MTLS_SUBJECT_REGEX: "^CN=combo$",
+ },
+ async () => {
+ const tokenOnly = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log",
+ undefined,
+ { Authorization: "Bearer combo-token" },
+ );
+ expect(tokenOnly.status).toBe(401);
+ expect(tokenOnly.body.success).toBe(false);
+
+ const mtlsOnly = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/log",
+ undefined,
+ { "x-client-cert": "CN=combo" },
+ );
+ expect(mtlsOnly.status).toBe(401);
+ expect(mtlsOnly.body.success).toBe(false);
+
+ const both = await requestJson<{ success: boolean }>(
+ "GET",
+ "/api/replication/log",
+ undefined,
+ {
+ Authorization: "Bearer combo-token",
+ "x-client-cert": "CN=combo",
+ },
+ );
+ expect(both.status).toBe(200);
+ expect(both.body.success).toBe(true);
+ },
+ );
+ });
+
+ test("rejects snapshot, pull, reseed, and promote without bearer token", async () => {
+ await openPrimary();
+
+ const snapshot = await requestJson<{ success: boolean; error?: string }>(
+ "GET",
+ "/api/replication/snapshot/latest",
+ );
+ expect(snapshot.status).toBe(401);
+ expect(snapshot.body.success).toBe(false);
+ expect(snapshot.body.error).toContain("Unauthorized");
+
+ const pull = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/replication/pull",
+ { maxFrames: 1 },
+ );
+ expect(pull.status).toBe(401);
+ expect(pull.body.success).toBe(false);
+ expect(pull.body.error).toContain("Unauthorized");
+
+ const reseed = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/replication/reseed",
+ );
+ expect(reseed.status).toBe(401);
+ expect(reseed.body.success).toBe(false);
+ expect(reseed.body.error).toContain("Unauthorized");
+
+ const promote = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/replication/promote",
+ );
+ expect(promote.status).toBe(401);
+ expect(promote.body.success).toBe(false);
+ expect(promote.body.error).toContain("Unauthorized");
+ });
+
+ test("reseed on primary role returns structured error", async () => {
+ await openPrimary();
+
+ const reseed = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/replication/reseed",
+ undefined,
+ AUTH_HEADER,
+ );
+ expect(reseed.status).toBe(200);
+ expect(reseed.body.success).toBe(false);
+ expect(reseed.body.error).toContain("replica role");
+ });
+
+ test("reseed is idempotent on healthy replica", async () => {
+ await openPrimary();
+ await appendCommits(4);
+
+ const replicaPath = join(tempDir, "replica-reseed-idempotent.kitedb");
+ const openReplica = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/db/open",
+ {
+ path: replicaPath,
+ options: {
+ replicationRole: "replica",
+ replicationSourceDbPath: dbPath,
+ },
+ },
+ );
+ expect(openReplica.status).toBe(200);
+ expect(openReplica.body.success).toBe(true);
+
+ const first = await requestJson<{
+ success: boolean;
+ role: string;
+ replica?: { needsReseed?: boolean; lastError?: string | null; appliedLogIndex?: number };
+ }>("POST", "/api/replication/reseed", undefined, AUTH_HEADER);
+ expect(first.status).toBe(200);
+ expect(first.body.success).toBe(true);
+ expect(first.body.role).toBe("replica");
+ expect(first.body.replica?.needsReseed).toBe(false);
+ expect(first.body.replica?.lastError ?? null).toBeNull();
+ expect((first.body.replica?.appliedLogIndex ?? 0) > 0).toBe(true);
+
+ const second = await requestJson<{
+ success: boolean;
+ role: string;
+ replica?: { needsReseed?: boolean; lastError?: string | null; appliedLogIndex?: number };
+ }>("POST", "/api/replication/reseed", undefined, AUTH_HEADER);
+ expect(second.status).toBe(200);
+ expect(second.body.success).toBe(true);
+ expect(second.body.role).toBe("replica");
+ expect(second.body.replica?.needsReseed).toBe(false);
+ expect(second.body.replica?.lastError ?? null).toBeNull();
+ expect(second.body.replica?.appliedLogIndex).toBe(first.body.replica?.appliedLogIndex);
+ });
+
+ test("reseed baseline allows later incremental pull after new primary commits", async () => {
+ await openPrimary();
+ await appendCommits(4);
+
+ const replicaPath = join(tempDir, "replica-reseed-continuity.kitedb");
+ const openReplica = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/db/open",
+ {
+ path: replicaPath,
+ options: {
+ replicationRole: "replica",
+ replicationSourceDbPath: dbPath,
+ },
+ },
+ );
+ expect(openReplica.status).toBe(200);
+ expect(openReplica.body.success).toBe(true);
+
+ const reseed = await requestJson<{
+ success: boolean;
+ role: string;
+ replica?: { needsReseed?: boolean; lastError?: string | null; appliedLogIndex?: number };
+ }>("POST", "/api/replication/reseed", undefined, AUTH_HEADER);
+ expect(reseed.status).toBe(200);
+ expect(reseed.body.success).toBe(true);
+ expect(reseed.body.role).toBe("replica");
+ expect(reseed.body.replica?.needsReseed).toBe(false);
+ expect(reseed.body.replica?.lastError ?? null).toBeNull();
+ const baselineApplied = reseed.body.replica?.appliedLogIndex ?? 0;
+ expect(baselineApplied > 0).toBe(true);
+
+ const reopenPrimary = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/db/open",
+ {
+ path: dbPath,
+ options: {
+ replicationRole: "primary",
+ },
+ },
+ );
+ expect(reopenPrimary.status).toBe(200);
+ expect(reopenPrimary.body.success).toBe(true);
+ await appendCommits(3);
+
+ const reopenReplica = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/db/open",
+ {
+ path: replicaPath,
+ options: {
+ replicationRole: "replica",
+ replicationSourceDbPath: dbPath,
+ },
+ },
+ );
+ expect(reopenReplica.status).toBe(200);
+ expect(reopenReplica.body.success).toBe(true);
+
+ const beforePull = await requestJson<{
+ connected: boolean;
+ role: string;
+ replica?: { appliedLogIndex?: number; needsReseed?: boolean };
+ }>("GET", "/api/replication/status");
+ expect(beforePull.status).toBe(200);
+ expect(beforePull.body.role).toBe("replica");
+ expect(beforePull.body.replica?.needsReseed).toBe(false);
+ expect(beforePull.body.replica?.appliedLogIndex).toBe(baselineApplied);
+
+ const pull = await requestJson<{
+ success: boolean;
+ appliedFrames?: number;
+ replica?: { appliedLogIndex?: number; needsReseed?: boolean };
+ }>("POST", "/api/replication/pull", { maxFrames: 128 }, AUTH_HEADER);
+ expect(pull.status).toBe(200);
+ expect(pull.body.success).toBe(true);
+ expect((pull.body.appliedFrames ?? 0) > 0).toBe(true);
+ expect(pull.body.replica?.needsReseed).toBe(false);
+ expect((pull.body.replica?.appliedLogIndex ?? 0) > baselineApplied).toBe(true);
+ });
+
+ test("replica pull advances appliedLogIndex after primary commits", async () => {
+ await openPrimary();
+ await appendCommits(4);
+
+ const replicaPath = join(tempDir, "replica.kitedb");
+ const openReplica = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/db/open",
+ {
+ path: replicaPath,
+ options: {
+ replicationRole: "replica",
+ replicationSourceDbPath: dbPath,
+ },
+ },
+ );
+ expect(openReplica.status).toBe(200);
+ expect(openReplica.body.success).toBe(true);
+
+ const before = await requestJson<{
+ connected: boolean;
+ role: string;
+ replica?: { appliedLogIndex?: number };
+ }>("GET", "/api/replication/status");
+ expect(before.status).toBe(200);
+ expect(before.body.connected).toBe(true);
+ expect(before.body.role).toBe("replica");
+ const beforeIndex = before.body.replica?.appliedLogIndex ?? 0;
+
+ const pull = await requestJson<{
+ success: boolean;
+ appliedFrames?: number;
+ replica?: { appliedLogIndex?: number };
+ }>("POST", "/api/replication/pull", { maxFrames: 64 }, AUTH_HEADER);
+ expect(pull.status).toBe(200);
+ expect(pull.body.success).toBe(true);
+ expect((pull.body.appliedFrames ?? 0) > 0).toBe(true);
+
+ const after = await requestJson<{
+ connected: boolean;
+ role: string;
+ replica?: { appliedLogIndex?: number };
+ }>("GET", "/api/replication/status");
+ expect(after.status).toBe(200);
+ expect(after.body.connected).toBe(true);
+ expect(after.body.role).toBe("replica");
+ const afterIndex = after.body.replica?.appliedLogIndex ?? 0;
+ expect(afterIndex > beforeIndex).toBe(true);
+ });
+
+ test("promote increments epoch and replica catches up from promoted primary", async () => {
+ await openPrimary();
+ await appendCommits(2);
+
+ const promote = await requestJson<{
+ success: boolean;
+ epoch?: number;
+ role?: string;
+ primary?: { epoch?: number };
+ }>("POST", "/api/replication/promote", undefined, AUTH_HEADER);
+ expect(promote.status).toBe(200);
+ expect(promote.body.success).toBe(true);
+ expect(promote.body.role).toBe("primary");
+ expect(promote.body.epoch).toBe(2);
+ expect(promote.body.primary?.epoch).toBe(2);
+
+ await appendCommits(3);
+
+ const replicaPath = join(tempDir, "replica-promoted.kitedb");
+ const openReplica = await requestJson<{ success: boolean }>("POST", "/api/db/open", {
+ path: replicaPath,
+ options: {
+ replicationRole: "replica",
+ replicationSourceDbPath: dbPath,
+ },
+ });
+ expect(openReplica.status).toBe(200);
+ expect(openReplica.body.success).toBe(true);
+
+ const pull = await requestJson<{
+ success: boolean;
+ appliedFrames?: number;
+ replica?: { appliedEpoch?: number; appliedLogIndex?: number };
+ }>("POST", "/api/replication/pull", { maxFrames: 128 }, AUTH_HEADER);
+ expect(pull.status).toBe(200);
+ expect(pull.body.success).toBe(true);
+ expect((pull.body.appliedFrames ?? 0) > 0).toBe(true);
+ expect((pull.body.replica?.appliedEpoch ?? 0) >= 2).toBe(true);
+ expect((pull.body.replica?.appliedLogIndex ?? 0) > 0).toBe(true);
+ });
+
+ test("reseed clears needsReseed after missing-segment failure", async () => {
+ await closeDatabase();
+ tempDir = await mkdtemp(join(tmpdir(), "playground-repl-test-"));
+ dbPath = join(tempDir, "primary-needs-reseed.kitedb");
+ const openPrimaryWithSmallSegments = await requestJson<{ success: boolean }>(
+ "POST",
+ "/api/db/open",
+ {
+ path: dbPath,
+ options: {
+ replicationRole: "primary",
+ replicationSegmentMaxBytes: 1,
+ },
+ },
+ );
+ expect(openPrimaryWithSmallSegments.status).toBe(200);
+ expect(openPrimaryWithSmallSegments.body.success).toBe(true);
+
+ await appendCommits(6);
+
+ const primaryStatus = await requestJson<{
+ connected: boolean;
+ role: string;
+ primary?: { sidecarPath?: string; headLogIndex?: number };
+ }>("GET", "/api/replication/status");
+ expect(primaryStatus.status).toBe(200);
+ expect(primaryStatus.body.connected).toBe(true);
+ expect(primaryStatus.body.role).toBe("primary");
+ const sidecarPath = primaryStatus.body.primary?.sidecarPath;
+ const headLogIndex = primaryStatus.body.primary?.headLogIndex ?? 0;
+ expect(sidecarPath).toBeTruthy();
+ expect(headLogIndex > 0).toBe(true);
+
+ const replicaPath = join(tempDir, "replica-needs-reseed.kitedb");
+ const openReplica = await requestJson<{ success: boolean }>("POST", "/api/db/open", {
+ path: replicaPath,
+ options: {
+ replicationRole: "replica",
+ replicationSourceDbPath: dbPath,
+ },
+ });
+ expect(openReplica.status).toBe(200);
+ expect(openReplica.body.success).toBe(true);
+
+ const initialPull = await requestJson<{ success: boolean; appliedFrames?: number }>(
+ "POST",
+ "/api/replication/pull",
+ { maxFrames: 1 },
+ AUTH_HEADER,
+ );
+ expect(initialPull.status).toBe(200);
+ expect(initialPull.body.success).toBe(true);
+ expect((initialPull.body.appliedFrames ?? 0) > 0).toBe(true);
+
+ const replicaStatusBefore = await requestJson<{
+ connected: boolean;
+ role: string;
+ replica?: { appliedLogIndex?: number };
+ }>("GET", "/api/replication/status");
+ expect(replicaStatusBefore.status).toBe(200);
+ expect(replicaStatusBefore.body.role).toBe("replica");
+ const appliedIndex = replicaStatusBefore.body.replica?.appliedLogIndex ?? 0;
+ expect(headLogIndex > appliedIndex).toBe(true);
+
+ const manifestPath = join(sidecarPath!, "manifest.json");
+ const envelope = JSON.parse(
+ await readFile(manifestPath, "utf8"),
+ ) as ManifestEnvelope;
+
+ const expectedNext = appliedIndex + 1;
+ const gapSegment = envelope.manifest.segments.find(
+ (segment) =>
+ segment.start_log_index <= expectedNext &&
+ segment.end_log_index >= expectedNext,
+ );
+ expect(gapSegment).toBeTruthy();
+ const segmentPath = join(
+ sidecarPath!,
+ `segment-${String(gapSegment!.id).padStart(20, "0")}.rlog`,
+ );
+ await rm(segmentPath, { force: true });
+
+ const pullAfterTamper = await requestJson<{ success: boolean; error?: string }>(
+ "POST",
+ "/api/replication/pull",
+ { maxFrames: 64 },
+ AUTH_HEADER,
+ );
+ expect(pullAfterTamper.status).toBe(200);
+ expect(pullAfterTamper.body.success).toBe(false);
+ expect(pullAfterTamper.body.error).toContain("needs reseed");
+
+ const replicaStatusAfter = await requestJson<{
+ connected: boolean;
+ role: string;
+ replica?: { needsReseed?: boolean; lastError?: string };
+ }>("GET", "/api/replication/status");
+ expect(replicaStatusAfter.status).toBe(200);
+ expect(replicaStatusAfter.body.role).toBe("replica");
+ expect(replicaStatusAfter.body.replica?.needsReseed).toBe(true);
+ expect(replicaStatusAfter.body.replica?.lastError).toContain("needs reseed");
+
+ const reseed = await requestJson<{
+ success: boolean;
+ role: string;
+ replica?: { needsReseed?: boolean; lastError?: string | null };
+ }>("POST", "/api/replication/reseed", undefined, AUTH_HEADER);
+ expect(reseed.status).toBe(200);
+ expect(reseed.body.success).toBe(true);
+ expect(reseed.body.role).toBe("replica");
+ expect(reseed.body.replica?.needsReseed).toBe(false);
+ expect(reseed.body.replica?.lastError ?? null).toBeNull();
+
+ const replicaStatusAfterReseed = await requestJson<{
+ connected: boolean;
+ role: string;
+ replica?: { needsReseed?: boolean; lastError?: string | null };
+ }>("GET", "/api/replication/status");
+ expect(replicaStatusAfterReseed.status).toBe(200);
+ expect(replicaStatusAfterReseed.body.role).toBe("replica");
+ expect(replicaStatusAfterReseed.body.replica?.needsReseed).toBe(false);
+ expect(replicaStatusAfterReseed.body.replica?.lastError ?? null).toBeNull();
+
+ const pullAfterReseed = await requestJson<{ success: boolean; appliedFrames?: number }>(
+ "POST",
+ "/api/replication/pull",
+ { maxFrames: 64 },
+ AUTH_HEADER,
+ );
+ expect(pullAfterReseed.status).toBe(200);
+ expect(pullAfterReseed.body.success).toBe(true);
+ });
+});
diff --git a/playground/src/api/routes.ts b/playground/src/api/routes.ts
index d42358b..7007490 100644
--- a/playground/src/api/routes.ts
+++ b/playground/src/api/routes.ts
@@ -5,10 +5,13 @@
*/
import { Elysia, t } from "elysia";
-import { getSnapshot } from "../../../src/ray/graph-db/snapshot-helper.ts";
+import { createHash } from "node:crypto";
+import { join } from "node:path";
import {
getDb,
+ getDbPath,
getStatus,
+ type PlaygroundOpenOptions,
openDatabase,
openFromBuffer,
createDemo,
@@ -29,6 +32,10 @@ import {
const MAX_NODES = 1000;
const MAX_FILE_SIZE = 10 * 1024 * 1024; // 10MB
+const REPLICATION_PULL_MAX_FRAMES_DEFAULT = 256;
+const REPLICATION_PULL_MAX_FRAMES_LIMIT = 10_000;
+const REPLICATION_LOG_MAX_BYTES_DEFAULT = 1024 * 1024;
+const REPLICATION_LOG_MAX_BYTES_LIMIT = 32 * 1024 * 1024;
// ============================================================================
// Types
@@ -48,6 +55,64 @@ interface VisEdge {
type: string;
}
+interface RawReplicationStatus {
+ role?: string;
+ epoch?: number;
+ headLogIndex?: number;
+ retainedFloor?: number;
+ replicaLags?: Array<{
+ replicaId: string;
+ epoch: number;
+ appliedLogIndex: number;
+ }>;
+ sidecarPath?: string;
+ lastToken?: string | null;
+ appendAttempts?: number;
+ appendFailures?: number;
+ appendSuccesses?: number;
+}
+
+interface RawReplicaStatus {
+ role?: string;
+ appliedEpoch?: number;
+ appliedLogIndex?: number;
+ needsReseed?: boolean;
+ lastError?: string | null;
+}
+
+interface ParsedReplicationCursor {
+ epoch: bigint;
+ segmentId: bigint;
+ segmentOffset: bigint;
+ logIndex: bigint;
+}
+
+interface ReplicationFrameResponse {
+ epoch: string;
+ logIndex: string;
+ segmentId: string;
+ segmentOffset: string;
+ payloadBase64: string;
+ bytes: number;
+}
+
+type ReplicationAdminAuthMode =
+ | "none"
+ | "token"
+ | "mtls"
+ | "token_or_mtls"
+ | "token_and_mtls";
+
+interface ReplicationAdminConfig {
+ mode: ReplicationAdminAuthMode;
+ authEnabled: boolean;
+ token: string | null;
+ mtlsHeader: string;
+ mtlsSubjectRegex: RegExp | null;
+ mtlsNativeTlsEnabled: boolean;
+ invalidConfigError: string | null;
+}
+
// ============================================================================
// Color scheme for node types
// ============================================================================
@@ -83,6 +148,668 @@ function getEdgeDef(type: string) {
}
}
+function getRawDb(): Record<string, unknown> | null {
+ const db = getDb() as unknown as (Record<string, unknown> & { $raw?: Record<string, unknown> }) | null;
+ if (!db) {
+ return null;
+ }
+ return db.$raw ?? db;
+}
+
+function callRawMethod<T>(
+ raw: Record<string, unknown>,
+ names: Array<string>,
+ ...args: Array<unknown>
+): T {
+ for (const name of names) {
+ const candidate = raw[name];
+ if (typeof candidate === "function") {
+ return (candidate as (...values: Array<unknown>) => T).call(raw, ...args);
+ }
+ }
+
+ throw new Error(`Replication method unavailable (${names.join(" | ")})`);
+}
+
+function parseBooleanEnv(raw: string | undefined, defaultValue: boolean): boolean | null {
+ if (raw === undefined) {
+ return defaultValue;
+ }
+
+ const normalized = raw.trim().toLowerCase();
+ if (normalized === "") {
+ return defaultValue;
+ }
+
+ if (normalized === "1" || normalized === "true" || normalized === "yes" || normalized === "on") {
+ return true;
+ }
+ if (normalized === "0" || normalized === "false" || normalized === "no" || normalized === "off") {
+ return false;
+ }
+ return null;
+}
+
+function resolveReplicationAdminConfig(): ReplicationAdminConfig {
+ const tokenRaw = process.env.REPLICATION_ADMIN_TOKEN?.trim();
+ const token = tokenRaw && tokenRaw.length > 0 ? tokenRaw : null;
+
+ const modeRaw = process.env.REPLICATION_ADMIN_AUTH_MODE?.trim().toLowerCase();
+ const mode: ReplicationAdminAuthMode = (() => {
+ if (!modeRaw || modeRaw === "") {
+ return token ? "token" : "none";
+ }
+
+ switch (modeRaw) {
+ case "none":
+ case "token":
+ case "mtls":
+ case "token_or_mtls":
+ case "token_and_mtls":
+ return modeRaw;
+ default:
+ return "none";
+ }
+ })();
+
+ if (modeRaw && mode === "none" && modeRaw !== "none") {
+ return {
+ mode,
+ authEnabled: true,
+ token,
+ mtlsHeader: "x-forwarded-client-cert",
+ mtlsSubjectRegex: null,
+ mtlsNativeTlsEnabled: false,
+ invalidConfigError:
+ "Invalid REPLICATION_ADMIN_AUTH_MODE; expected none|token|mtls|token_or_mtls|token_and_mtls",
+ };
+ }
+
+ const mtlsHeaderRaw = process.env.REPLICATION_MTLS_HEADER?.trim().toLowerCase();
+ const mtlsHeader = mtlsHeaderRaw && mtlsHeaderRaw.length > 0
+ ? mtlsHeaderRaw
+ : "x-forwarded-client-cert";
+
+ const nativeTlsMode = parseBooleanEnv(process.env.REPLICATION_MTLS_NATIVE_TLS, false);
+ if (nativeTlsMode === null) {
+ return {
+ mode,
+ authEnabled: true,
+ token,
+ mtlsHeader,
+ mtlsSubjectRegex: null,
+ mtlsNativeTlsEnabled: false,
+ invalidConfigError: "Invalid REPLICATION_MTLS_NATIVE_TLS (expected boolean)",
+ };
+ }
+
+ if (nativeTlsMode) {
+ const tlsRequestCert = parseBooleanEnv(process.env.PLAYGROUND_TLS_REQUEST_CERT, false);
+ if (tlsRequestCert === null) {
+ return {
+ mode,
+ authEnabled: true,
+ token,
+ mtlsHeader,
+ mtlsSubjectRegex: null,
+ mtlsNativeTlsEnabled: false,
+ invalidConfigError: "Invalid PLAYGROUND_TLS_REQUEST_CERT (expected boolean)",
+ };
+ }
+
+ const tlsRejectUnauthorized = parseBooleanEnv(process.env.PLAYGROUND_TLS_REJECT_UNAUTHORIZED, true);
+ if (tlsRejectUnauthorized === null) {
+ return {
+ mode,
+ authEnabled: true,
+ token,
+ mtlsHeader,
+ mtlsSubjectRegex: null,
+ mtlsNativeTlsEnabled: false,
+ invalidConfigError: "Invalid PLAYGROUND_TLS_REJECT_UNAUTHORIZED (expected boolean)",
+ };
+ }
+
+ if (!tlsRequestCert || !tlsRejectUnauthorized) {
+ return {
+ mode,
+ authEnabled: true,
+ token,
+ mtlsHeader,
+ mtlsSubjectRegex: null,
+ mtlsNativeTlsEnabled: false,
+ invalidConfigError:
+ "REPLICATION_MTLS_NATIVE_TLS requires PLAYGROUND_TLS_REQUEST_CERT=true and PLAYGROUND_TLS_REJECT_UNAUTHORIZED=true",
+ };
+ }
+ }
+
+ const regexRaw = process.env.REPLICATION_MTLS_SUBJECT_REGEX?.trim();
+ if (regexRaw && regexRaw.length > 0) {
+ try {
+ return {
+ mode,
+ authEnabled: mode !== "none",
+ token,
+ mtlsHeader,
+ mtlsSubjectRegex: new RegExp(regexRaw),
+ mtlsNativeTlsEnabled: nativeTlsMode,
+ invalidConfigError: null,
+ };
+ } catch {
+ return {
+ mode,
+ authEnabled: true,
+ token,
+ mtlsHeader,
+ mtlsSubjectRegex: null,
+ mtlsNativeTlsEnabled: nativeTlsMode,
+ invalidConfigError: "Invalid REPLICATION_MTLS_SUBJECT_REGEX",
+ };
+ }
+ }
+
+ return {
+ mode,
+ authEnabled: mode !== "none",
+ token,
+ mtlsHeader,
+ mtlsSubjectRegex: null,
+ mtlsNativeTlsEnabled: nativeTlsMode,
+ invalidConfigError: null,
+ };
+}
+
+function matchesMtlsRequest(request: Request, config: ReplicationAdminConfig): boolean {
+ const headerValue = request.headers.get(config.mtlsHeader);
+ if (headerValue && headerValue.trim() !== "") {
+ if (!config.mtlsSubjectRegex) {
+ return true;
+ }
+ return config.mtlsSubjectRegex.test(headerValue);
+ }
+
+ if (!config.mtlsNativeTlsEnabled || config.mtlsSubjectRegex) {
+ return false;
+ }
+
+ try {
+ return new URL(request.url).protocol === "https:";
+ } catch {
+ return false;
+ }
+}
+
+function requireReplicationAdmin(
+ request: Request,
+ set: { status?: number },
+): { ok: true } | { ok: false; error: string } {
+ const config = resolveReplicationAdminConfig();
+ if (config.invalidConfigError) {
+ set.status = 500;
+ return { ok: false, error: config.invalidConfigError };
+ }
+
+ if (config.mode === "none") {
+ return { ok: true };
+ }
+
+ const authHeader = request.headers.get("authorization");
+ const tokenOk = config.token ? authHeader === `Bearer ${config.token}` : false;
+ const mtlsOk = matchesMtlsRequest(request, config);
+
+ const authorized = (() => {
+ switch (config.mode) {
+ case "token":
+ return tokenOk;
+ case "mtls":
+ return mtlsOk;
+ case "token_or_mtls":
+ return tokenOk || mtlsOk;
+ case "token_and_mtls":
+ return tokenOk && mtlsOk;
+ case "none":
+ default:
+ return true;
+ }
+ })();
+
+ if (authorized) {
+ return { ok: true };
+ }
+
+ set.status = 401;
+ return {
+ ok: false,
+ error: `Unauthorized: replication admin auth mode '${config.mode}' not satisfied`,
+ };
+}
+
+function resolveReplicationStatus(
+ raw: Record<string, unknown>,
+): {
+ role: "primary" | "replica" | "disabled";
+ primary: RawReplicationStatus | null;
+ replica: RawReplicaStatus | null;
+} {
+ const primary = callRawMethod<RawReplicationStatus | null>(
+ raw,
+ ["primaryReplicationStatus", "primary_replication_status"],
+ );
+ const replica = callRawMethod<RawReplicaStatus | null>(
+ raw,
+ ["replicaReplicationStatus", "replica_replication_status"],
+ );
+
+ const role = primary
+ ? "primary"
+ : replica
+ ? "replica"
+ : "disabled";
+
+ return { role, primary, replica };
+}
+
+function getSnapshot(rawDb: Record<string, unknown>): Record<string, unknown> | null {
+ const direct = rawDb._snapshot;
+ if (direct && typeof direct === "object") {
+ return direct as Record;
+ }
+
+ const cached = rawDb._snapshotCache;
+ if (cached && typeof cached === "object") {
+ return cached as Record;
+ }
+
+ return null;
+}
+
+function parsePositiveInt(
+ value: unknown,
+ fallback: number,
+ min: number,
+ max: number,
+): number {
+ if (value === undefined || value === null || value === "") {
+ return fallback;
+ }
+
+ const parsed = Number(value);
+ if (!Number.isFinite(parsed)) {
+ return fallback;
+ }
+
+ return Math.min(Math.max(Math.floor(parsed), min), max);
+}
+
+function parseBoolean(value: unknown, fallback: boolean): boolean {
+ if (value === undefined || value === null || value === "") {
+ return fallback;
+ }
+
+ if (typeof value === "boolean") {
+ return value;
+ }
+
+ const text = String(value).toLowerCase().trim();
+ if (text === "1" || text === "true" || text === "yes") {
+ return true;
+ }
+ if (text === "0" || text === "false" || text === "no") {
+ return false;
+ }
+
+ return fallback;
+}
+
+function parseReplicationCursor(raw: unknown): ParsedReplicationCursor | null {
+ if (typeof raw !== "string" || raw.trim() === "") {
+ return null;
+ }
+
+ const token = raw.trim();
+ const parts = token.split(":");
+ if (parts.length === 2) {
+ const epoch = BigInt(parts[0]);
+ const logIndex = BigInt(parts[1]);
+ return {
+ epoch,
+ segmentId: 0n,
+ segmentOffset: 0n,
+ logIndex,
+ };
+ }
+
+ if (parts.length === 4) {
+ return {
+ epoch: BigInt(parts[0]),
+ segmentId: BigInt(parts[1]),
+ segmentOffset: BigInt(parts[2]),
+ logIndex: BigInt(parts[3]),
+ };
+ }
+
+ throw new Error(
+ "invalid cursor format; expected 'epoch:logIndex' or 'epoch:segmentId:segmentOffset:logIndex'",
+ );
+}
+
+function cursorAfterFrame(
+ cursor: ParsedReplicationCursor | null,
+ epoch: bigint,
+ segmentId: bigint,
+ segmentOffset: bigint,
+ logIndex: bigint,
+): boolean {
+ if (!cursor) {
+ return true;
+ }
+
+ if (epoch > cursor.epoch) {
+ return true;
+ }
+ if (epoch < cursor.epoch) {
+ return false;
+ }
+
+ if (logIndex > cursor.logIndex) {
+ return true;
+ }
+ if (logIndex < cursor.logIndex) {
+ return false;
+ }
+
+ if (cursor.segmentId === 0n) {
+ return false;
+ }
+ if (segmentId > cursor.segmentId) {
+ return true;
+ }
+ if (segmentId < cursor.segmentId) {
+ return false;
+ }
+
+ return segmentOffset > cursor.segmentOffset;
+}
+
+function formatSegmentFileName(id: bigint): string {
+ return `segment-${id.toString().padStart(20, "0")}.rlog`;
+}
+
+async function readFileBytes(path: string): Promise<Uint8Array> {
+ const arrayBuffer = await Bun.file(path).arrayBuffer();
+ return new Uint8Array(arrayBuffer);
+}
+
+async function readManifestEnvelope(sidecarPath: string): Promise<{
+ version: number;
+ payload_crc32: number;
+ manifest: {
+ epoch: number;
+ head_log_index: number;
+ retained_floor: number;
+ active_segment_id: number;
+ segments: Array<{
+ id: number;
+ start_log_index: number;
+ end_log_index: number;
+ size_bytes: number;
+ }>;
+ };
+}> {
+ const manifestPath = join(sidecarPath, "manifest.json");
+ const text = await Bun.file(manifestPath).text();
+ return JSON.parse(text);
+}
+
+function escapePrometheusLabelValue(value: string): string {
+ return value
+ .replaceAll("\\", "\\\\")
+ .replaceAll("\"", "\\\"")
+ .replaceAll("\n", "\\n");
+}
+
+function formatPrometheusLabels(labels: Record<string, string | number>): string {
+ const entries = Object.entries(labels);
+ if (entries.length === 0) {
+ return "";
+ }
+ const rendered = entries.map(
+ ([key, value]) => `${key}="${escapePrometheusLabelValue(String(value))}"`,
+ );
+ return `{${rendered.join(",")}}`;
+}
+
+function toMetricNumber(value: unknown, fallback = 0): number {
+ const parsed = Number(value);
+ if (!Number.isFinite(parsed)) {
+ return fallback;
+ }
+ return parsed;
+}
+
+function pushPrometheusMetricHelp(
+ lines: Array<string>,
+ metricName: string,
+ metricType: "gauge" | "counter",
+ helpText: string,
+): void {
+ lines.push(`# HELP ${metricName} ${helpText}`);
+ lines.push(`# TYPE ${metricName} ${metricType}`);
+}
+
+function pushPrometheusMetricSample(
+ lines: Array<string>,
+ metricName: string,
+ value: number,
+ labels: Record<string, string | number> = {},
+): void {
+ lines.push(`${metricName}${formatPrometheusLabels(labels)} ${value}`);
+}
+
+function renderReplicationPrometheusMetrics(
+ resolved: {
+ role: "primary" | "replica" | "disabled";
+ primary: RawReplicationStatus | null;
+ replica: RawReplicaStatus | null;
+ },
+ authEnabled: boolean,
+): string {
+ const lines: Array<string> = [];
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_enabled",
+ "gauge",
+ "Whether replication is enabled for the connected database (1 enabled, 0 disabled).",
+ );
+ pushPrometheusMetricSample(lines, "raydb_replication_enabled", resolved.role === "disabled" ? 0 : 1, {
+ role: resolved.role,
+ });
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_auth_enabled",
+ "gauge",
+ "Whether replication admin token auth is enabled for admin endpoints.",
+ );
+ pushPrometheusMetricSample(lines, "raydb_replication_auth_enabled", authEnabled ? 1 : 0);
+
+ if (resolved.primary) {
+ const epoch = toMetricNumber(resolved.primary.epoch, 0);
+ const headLogIndex = toMetricNumber(resolved.primary.headLogIndex, 0);
+ const retainedFloor = toMetricNumber(resolved.primary.retainedFloor, 0);
+ const replicaLags = resolved.primary.replicaLags ?? [];
+
+ let staleReplicaCount = 0;
+ let maxReplicaLag = 0;
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_epoch",
+ "gauge",
+ "Primary replication epoch.",
+ );
+ pushPrometheusMetricSample(lines, "raydb_replication_primary_epoch", epoch);
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_head_log_index",
+ "gauge",
+ "Primary replication head log index.",
+ );
+ pushPrometheusMetricSample(lines, "raydb_replication_primary_head_log_index", headLogIndex);
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_retained_floor",
+ "gauge",
+ "Primary replication retained floor log index.",
+ );
+ pushPrometheusMetricSample(lines, "raydb_replication_primary_retained_floor", retainedFloor);
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_replica_count",
+ "gauge",
+ "Number of replicas reporting progress to the primary.",
+ );
+ pushPrometheusMetricSample(lines, "raydb_replication_primary_replica_count", replicaLags.length);
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_replica_lag",
+ "gauge",
+ "Replica lag in frames relative to primary head index.",
+ );
+ for (const lag of replicaLags) {
+ const replicaEpoch = toMetricNumber(lag.epoch, 0);
+ const appliedLogIndex = toMetricNumber(lag.appliedLogIndex, 0);
+ const lagFrames = replicaEpoch === epoch
+ ? Math.max(0, headLogIndex - appliedLogIndex)
+ : Math.max(0, headLogIndex);
+ if (replicaEpoch !== epoch) {
+ staleReplicaCount += 1;
+ }
+ maxReplicaLag = Math.max(maxReplicaLag, lagFrames);
+ pushPrometheusMetricSample(
+ lines,
+ "raydb_replication_primary_replica_lag",
+ lagFrames,
+ {
+ replica_id: lag.replicaId,
+ replica_epoch: replicaEpoch,
+ },
+ );
+ }
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_stale_epoch_replica_count",
+ "gauge",
+ "Count of replicas reporting progress from a stale epoch.",
+ );
+ pushPrometheusMetricSample(
+ lines,
+ "raydb_replication_primary_stale_epoch_replica_count",
+ staleReplicaCount,
+ );
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_max_replica_lag",
+ "gauge",
+ "Maximum replica lag in frames among replicas reporting progress.",
+ );
+ pushPrometheusMetricSample(lines, "raydb_replication_primary_max_replica_lag", maxReplicaLag);
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_append_attempts_total",
+ "counter",
+ "Total replication append attempts on primary commit path.",
+ );
+ pushPrometheusMetricSample(
+ lines,
+ "raydb_replication_primary_append_attempts_total",
+ toMetricNumber(resolved.primary.appendAttempts, 0),
+ );
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_append_failures_total",
+ "counter",
+ "Total replication append failures on primary commit path.",
+ );
+ pushPrometheusMetricSample(
+ lines,
+ "raydb_replication_primary_append_failures_total",
+ toMetricNumber(resolved.primary.appendFailures, 0),
+ );
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_primary_append_successes_total",
+ "counter",
+ "Total replication append successes on primary commit path.",
+ );
+ pushPrometheusMetricSample(
+ lines,
+ "raydb_replication_primary_append_successes_total",
+ toMetricNumber(resolved.primary.appendSuccesses, 0),
+ );
+ }
+
+ if (resolved.replica) {
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_replica_applied_epoch",
+ "gauge",
+ "Replica applied epoch.",
+ );
+ pushPrometheusMetricSample(
+ lines,
+ "raydb_replication_replica_applied_epoch",
+ toMetricNumber(resolved.replica.appliedEpoch, 0),
+ );
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_replica_applied_log_index",
+ "gauge",
+ "Replica applied log index.",
+ );
+ pushPrometheusMetricSample(
+ lines,
+ "raydb_replication_replica_applied_log_index",
+ toMetricNumber(resolved.replica.appliedLogIndex, 0),
+ );
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_replica_needs_reseed",
+ "gauge",
+ "Whether replica currently requires reseed (1 yes, 0 no).",
+ );
+ pushPrometheusMetricSample(
+ lines,
+ "raydb_replication_replica_needs_reseed",
+ resolved.replica.needsReseed ? 1 : 0,
+ );
+
+ pushPrometheusMetricHelp(
+ lines,
+ "raydb_replication_replica_last_error_present",
+ "gauge",
+ "Whether replica has a non-empty last_error value (1 yes, 0 no).",
+ );
+ const hasError = resolved.replica.lastError ? 1 : 0;
+ pushPrometheusMetricSample(lines, "raydb_replication_replica_last_error_present", hasError);
+ }
+
+ return `${lines.join("\n")}\n`;
+}
+
// ============================================================================
// API Routes
// ============================================================================
@@ -95,17 +822,519 @@ export const apiRoutes = new Elysia({ prefix: "/api" })
return await getStatus();
})
+ // --------------------------------------------------------------------------
+ // Replication (status / pull / promote)
+ // --------------------------------------------------------------------------
+ .get("/replication/status", async () => {
+ const raw = getRawDb();
+ if (!raw) {
+ return {
+ connected: false,
+ error: "No database connected",
+ };
+ }
+
+ try {
+ const resolved = resolveReplicationStatus(raw);
+ return {
+ connected: true,
+ authEnabled: resolveReplicationAdminConfig().authEnabled,
+ role: resolved.role,
+ primary: resolved.primary,
+ replica: resolved.replica,
+ };
+ } catch (error) {
+ return {
+ connected: true,
+ error:
+ error instanceof Error
+ ? error.message
+ : "Failed to query replication status",
+ };
+ }
+ })
+
+ .get("/replication/metrics", async ({ request, set }) => {
+ const auth = requireReplicationAdmin(request, set);
+ if (!auth.ok) {
+ return new Response(auth.error, {
+ status: set.status ?? 401,
+ headers: { "Content-Type": "text/plain; charset=utf-8" },
+ });
+ }
+
+ const raw = getRawDb();
+ if (!raw) {
+ return new Response("No database connected", {
+ status: 503,
+ headers: { "Content-Type": "text/plain; charset=utf-8" },
+ });
+ }
+
+ try {
+ const resolved = resolveReplicationStatus(raw);
+ const text = renderReplicationPrometheusMetrics(
+ resolved,
+ resolveReplicationAdminConfig().authEnabled,
+ );
+ return new Response(text, {
+ headers: {
+ "Content-Type": "text/plain; version=0.0.4; charset=utf-8",
+ "Cache-Control": "no-store",
+ },
+ });
+ } catch (error) {
+ return new Response(
+ error instanceof Error ? error.message : "Failed to render replication metrics",
+ {
+ status: 500,
+ headers: { "Content-Type": "text/plain; charset=utf-8" },
+ },
+ );
+ }
+ })
+
+ .get("/replication/snapshot/latest", async ({ query, request, set }) => {
+ const auth = requireReplicationAdmin(request, set);
+ if (!auth.ok) {
+ return { success: false, error: auth.error };
+ }
+
+ const raw = getRawDb();
+ if (!raw) {
+ return { success: false, error: "No database connected" };
+ }
+
+ try {
+ const resolved = resolveReplicationStatus(raw);
+ if (resolved.role !== "primary" || !resolved.primary) {
+ return {
+ success: false,
+ error: "Replication snapshot endpoint requires primary role",
+ };
+ }
+
+ const dbPath = getDbPath();
+ if (!dbPath) {
+ return { success: false, error: "Database path unavailable" };
+ }
+
+ const includeData = parseBoolean((query as Record).includeData, false);
+ const bytes = await readFileBytes(dbPath);
+ const sha256 = createHash("sha256").update(bytes).digest("hex");
+
+ return {
+ success: true,
+ role: resolved.role,
+ epoch: resolved.primary.epoch ?? null,
+ headLogIndex: resolved.primary.headLogIndex ?? null,
+ snapshot: {
+ format: "single-file-db-copy",
+ dbPath,
+ byteLength: bytes.byteLength,
+ sha256,
+ generatedAt: new Date().toISOString(),
+ dataBase64: includeData ? Buffer.from(bytes).toString("base64") : undefined,
+ },
+ };
+ } catch (error) {
+ return {
+ success: false,
+ error:
+ error instanceof Error
+ ? error.message
+ : "Failed to prepare replication snapshot",
+ };
+ }
+ })
+
+ .get("/replication/log", async ({ query, request, set }) => {
+ const auth = requireReplicationAdmin(request, set);
+ if (!auth.ok) {
+ return { success: false, error: auth.error };
+ }
+
+ const raw = getRawDb();
+ if (!raw) {
+ return { success: false, error: "No database connected" };
+ }
+
+ try {
+ const resolved = resolveReplicationStatus(raw);
+ if (resolved.role !== "primary" || !resolved.primary?.sidecarPath) {
+ return {
+ success: false,
+ error: "Replication log endpoint requires primary role with sidecar",
+ };
+ }
+
+ const queryObject = query as Record;
+ const maxBytes = parsePositiveInt(
+ queryObject.maxBytes,
+ REPLICATION_LOG_MAX_BYTES_DEFAULT,
+ 1,
+ REPLICATION_LOG_MAX_BYTES_LIMIT,
+ );
+ const maxFrames = parsePositiveInt(
+ queryObject.maxFrames,
+ REPLICATION_PULL_MAX_FRAMES_DEFAULT,
+ 1,
+ REPLICATION_PULL_MAX_FRAMES_LIMIT,
+ );
+ const includePayload = parseBoolean(queryObject.includePayload, true);
+ const cursor = parseReplicationCursor(queryObject.cursor);
+
+ const envelope = await readManifestEnvelope(resolved.primary.sidecarPath);
+ const manifest = envelope.manifest;
+ const segments = [...manifest.segments].sort((left, right) => left.id - right.id);
+
+ const frames: Array = [];
+ let totalBytes = 0;
+ let nextCursor = typeof queryObject.cursor === "string" ? queryObject.cursor : null;
+ let limited = false;
+
+ outer: for (const segment of segments) {
+ const segmentId = BigInt(segment.id);
+ const segmentPath = join(
+ resolved.primary.sidecarPath,
+ formatSegmentFileName(segmentId),
+ );
+
+ const segmentBytes = await readFileBytes(segmentPath);
+ const view = new DataView(
+ segmentBytes.buffer,
+ segmentBytes.byteOffset,
+ segmentBytes.byteLength,
+ );
+
+ let offset = 0;
+ while (offset + 32 <= segmentBytes.byteLength) {
+ const magic = view.getUint32(offset, true);
+ if (magic !== 0x474f4c52) {
+ break;
+ }
+
+ const _version = view.getUint16(offset + 4, true);
+ const _flags = view.getUint16(offset + 6, true);
+ const epoch = view.getBigUint64(offset + 8, true);
+ const logIndex = view.getBigUint64(offset + 16, true);
+ const payloadLength = view.getUint32(offset + 24, true);
+ const payloadOffset = offset + 32;
+ const payloadEnd = payloadOffset + payloadLength;
+ if (payloadEnd > segmentBytes.byteLength) {
+ break;
+ }
+
+ const frameBytes = payloadEnd - offset;
+ const frameOffset = BigInt(offset);
+ const frameAfterCursor = cursorAfterFrame(
+ cursor,
+ epoch,
+ segmentId,
+ frameOffset,
+ logIndex,
+ );
+
+ if (frameAfterCursor) {
+ if ((totalBytes + frameBytes > maxBytes && frames.length > 0) || frames.length >= maxFrames) {
+ limited = true;
+ break outer;
+ }
+
+ const payload = segmentBytes.subarray(payloadOffset, payloadEnd);
+ const nextOffset = BigInt(payloadEnd);
+ nextCursor = `${epoch}:${segmentId}:${nextOffset}:${logIndex}`;
+
+ frames.push({
+ epoch: epoch.toString(),
+ logIndex: logIndex.toString(),
+ segmentId: segmentId.toString(),
+ segmentOffset: frameOffset.toString(),
+ payloadBase64: includePayload
+ ? Buffer.from(payload).toString("base64")
+ : "",
+ bytes: frameBytes,
+ });
+ totalBytes += frameBytes;
+ }
+
+ offset = payloadEnd;
+ }
+ }
+
+ return {
+ success: true,
+ role: resolved.role,
+ epoch: manifest.epoch,
+ headLogIndex: manifest.head_log_index,
+ retainedFloor: manifest.retained_floor,
+ cursor: typeof queryObject.cursor === "string" ? queryObject.cursor : null,
+ nextCursor,
+ eof: !limited,
+ frameCount: frames.length,
+ totalBytes,
+ frames,
+ };
+ } catch (error) {
+ return {
+ success: false,
+ error:
+ error instanceof Error
+ ? error.message
+ : "Failed to fetch replication log",
+ };
+ }
+ })
+
+ .get("/replication/transport/snapshot", async ({ query, request, set }) => {
+ const auth = requireReplicationAdmin(request, set);
+ if (!auth.ok) {
+ return { success: false, error: auth.error };
+ }
+
+ const raw = getRawDb();
+ if (!raw) {
+ return { success: false, error: "No database connected" };
+ }
+
+ try {
+ const includeData = parseBoolean((query as Record).includeData, false);
+ const exported = callRawMethod(
+ raw,
+ [
+ "exportReplicationSnapshotTransportJson",
+ "export_replication_snapshot_transport_json",
+ ],
+ includeData,
+ );
+ const snapshot = JSON.parse(exported) as Record;
+ return {
+ success: true,
+ snapshot,
+ };
+ } catch (error) {
+ return {
+ success: false,
+ error:
+ error instanceof Error
+ ? error.message
+ : "Failed to export replication transport snapshot",
+ };
+ }
+ })
+
+ .get("/replication/transport/log", async ({ query, request, set }) => {
+ const auth = requireReplicationAdmin(request, set);
+ if (!auth.ok) {
+ return { success: false, error: auth.error };
+ }
+
+ const raw = getRawDb();
+ if (!raw) {
+ return { success: false, error: "No database connected" };
+ }
+
+ try {
+ const queryObject = query as Record;
+ const maxBytes = parsePositiveInt(
+ queryObject.maxBytes,
+ REPLICATION_LOG_MAX_BYTES_DEFAULT,
+ 1,
+ REPLICATION_LOG_MAX_BYTES_LIMIT,
+ );
+ const maxFrames = parsePositiveInt(
+ queryObject.maxFrames,
+ REPLICATION_PULL_MAX_FRAMES_DEFAULT,
+ 1,
+ REPLICATION_PULL_MAX_FRAMES_LIMIT,
+ );
+ const includePayload = parseBoolean(queryObject.includePayload, true);
+ const cursor = typeof queryObject.cursor === "string" ? queryObject.cursor : null;
+
+ const exported = callRawMethod(
+ raw,
+ [
+ "exportReplicationLogTransportJson",
+ "export_replication_log_transport_json",
+ ],
+ cursor,
+ maxFrames,
+ maxBytes,
+ includePayload,
+ );
+ const payload = JSON.parse(exported) as Record;
+ return {
+ success: true,
+ ...(payload as object),
+ };
+ } catch (error) {
+ return {
+ success: false,
+ error:
+ error instanceof Error
+ ? error.message
+ : "Failed to export replication transport log",
+ };
+ }
+ })
+
+ .post(
+ "/replication/pull",
+ async ({ body, request, set }) => {
+ const auth = requireReplicationAdmin(request, set);
+ if (!auth.ok) {
+ return { success: false, error: auth.error };
+ }
+
+ const raw = getRawDb();
+ if (!raw) {
+ return { success: false, error: "No database connected" };
+ }
+
+ const maxFrames = Math.min(
+ Math.max(body.maxFrames ?? REPLICATION_PULL_MAX_FRAMES_DEFAULT, 1),
+ REPLICATION_PULL_MAX_FRAMES_LIMIT,
+ );
+
+ try {
+ const applied = callRawMethod(
+ raw,
+ ["replicaCatchUpOnce", "replica_catch_up_once"],
+ maxFrames,
+ );
+ const resolved = resolveReplicationStatus(raw);
+
+ return {
+ success: true,
+ appliedFrames: applied,
+ role: resolved.role,
+ replica: resolved.replica,
+ };
+ } catch (error) {
+ return {
+ success: false,
+ error:
+ error instanceof Error
+ ? error.message
+ : "Replication pull failed",
+ };
+ }
+ },
+ {
+ body: t.Object({
+ maxFrames: t.Optional(t.Number()),
+ }),
+ },
+ )
+
+ .post("/replication/reseed", async ({ request, set }) => {
+ const auth = requireReplicationAdmin(request, set);
+ if (!auth.ok) {
+ return { success: false, error: auth.error };
+ }
+
+ const raw = getRawDb();
+ if (!raw) {
+ return { success: false, error: "No database connected" };
+ }
+
+ try {
+ callRawMethod(
+ raw,
+ ["replicaReseedFromSnapshot", "replica_reseed_from_snapshot"],
+ );
+ const resolved = resolveReplicationStatus(raw);
+
+ return {
+ success: true,
+ role: resolved.role,
+ replica: resolved.replica,
+ };
+ } catch (error) {
+ return {
+ success: false,
+ error:
+ error instanceof Error
+ ? error.message
+ : "Replica reseed failed",
+ };
+ }
+ })
+
+ .post("/replication/promote", async ({ request, set }) => {
+ const auth = requireReplicationAdmin(request, set);
+ if (!auth.ok) {
+ return { success: false, error: auth.error };
+ }
+
+ const raw = getRawDb();
+ if (!raw) {
+ return { success: false, error: "No database connected" };
+ }
+
+ try {
+ const epoch = callRawMethod(
+ raw,
+ ["primaryPromoteToNextEpoch", "primary_promote_to_next_epoch"],
+ );
+ const resolved = resolveReplicationStatus(raw);
+
+ return {
+ success: true,
+ epoch,
+ role: resolved.role,
+ primary: resolved.primary,
+ };
+ } catch (error) {
+ return {
+ success: false,
+ error:
+ error instanceof Error
+ ? error.message
+ : "Primary promote failed",
+ };
+ }
+ })
+
// --------------------------------------------------------------------------
// Database Management
// --------------------------------------------------------------------------
.post(
"/db/open",
async ({ body }) => {
- return await openDatabase(body.path);
+ return await openDatabase(body.path, body.options as PlaygroundOpenOptions | undefined);
},
{
body: t.Object({
path: t.String(),
+ options: t.Optional(
+ t.Object({
+ readOnly: t.Optional(t.Boolean()),
+ createIfMissing: t.Optional(t.Boolean()),
+ mvcc: t.Optional(t.Boolean()),
+ mvccGcIntervalMs: t.Optional(t.Number()),
+ mvccRetentionMs: t.Optional(t.Number()),
+ mvccMaxChainDepth: t.Optional(t.Number()),
+ syncMode: t.Optional(t.Union([t.Literal("Full"), t.Literal("Normal"), t.Literal("Off")])),
+ groupCommitEnabled: t.Optional(t.Boolean()),
+ groupCommitWindowMs: t.Optional(t.Number()),
+ walSizeMb: t.Optional(t.Number()),
+ checkpointThreshold: t.Optional(t.Number()),
+ replicationRole: t.Optional(
+ t.Union([
+ t.Literal("disabled"),
+ t.Literal("primary"),
+ t.Literal("replica"),
+ ]),
+ ),
+ replicationSidecarPath: t.Optional(t.String()),
+ replicationSourceDbPath: t.Optional(t.String()),
+ replicationSourceSidecarPath: t.Optional(t.String()),
+ replicationSegmentMaxBytes: t.Optional(t.Number()),
+ replicationRetentionMinEntries: t.Optional(t.Number()),
+ replicationRetentionMinMs: t.Optional(t.Number()),
+ }),
+ ),
}),
}
)
diff --git a/playground/src/client/lib/api.ts b/playground/src/client/lib/api.ts
index f9e2272..783289c 100644
--- a/playground/src/client/lib/api.ts
+++ b/playground/src/client/lib/api.ts
@@ -11,10 +11,37 @@ import type {
PathResponse,
ImpactResponse,
ApiResult,
+ ReplicationStatusResponse,
+ ReplicationSnapshotResponse,
+ ReplicationLogResponse,
+ ReplicationPullResponse,
+ ReplicationReseedResponse,
+ ReplicationPromoteResponse,
} from "./types.ts";
const API_BASE = "/api";
+export interface DbOpenOptions {
+ readOnly?: boolean
+ createIfMissing?: boolean
+ mvcc?: boolean
+ mvccGcIntervalMs?: number
+ mvccRetentionMs?: number
+ mvccMaxChainDepth?: number
+ syncMode?: "Full" | "Normal" | "Off"
+ groupCommitEnabled?: boolean
+ groupCommitWindowMs?: number
+ walSizeMb?: number
+ checkpointThreshold?: number
+ replicationRole?: "disabled" | "primary" | "replica"
+ replicationSidecarPath?: string
+ replicationSourceDbPath?: string
+ replicationSourceSidecarPath?: string
+ replicationSegmentMaxBytes?: number
+ replicationRetentionMinEntries?: number
+ replicationRetentionMinMs?: number
+}
+
// ============================================================================
// Helper
// ============================================================================
@@ -35,6 +62,28 @@ async function fetchJson(url: string, options?: RequestInit): Promise {
return response.json();
}
+async function fetchText(url: string, options?: RequestInit): Promise {
+ const response = await fetch(`${API_BASE}${url}`, {
+ ...options,
+ headers: {
+ ...options?.headers,
+ },
+ });
+
+ if (!response.ok) {
+ throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+ }
+
+ return response.text();
+}
+
+function withAuthHeader(token?: string): HeadersInit | undefined {
+ if (!token || token.trim() === "") {
+ return undefined;
+ }
+ return { Authorization: `Bearer ${token}` };
+}
+
// ============================================================================
// Database Management
// ============================================================================
@@ -43,10 +92,13 @@ export async function getStatus(): Promise {
return fetchJson("/status");
}
-export async function openDatabase(path: string): Promise {
+export async function openDatabase(path: string, options?: DbOpenOptions): Promise {
return fetchJson("/db/open", {
method: "POST",
- body: JSON.stringify({ path }),
+ body: JSON.stringify({
+ path,
+ ...(options ? { options } : {}),
+ }),
});
}
@@ -74,6 +126,110 @@ export async function closeDatabase(): Promise {
});
}
+// ============================================================================
+// Replication
+// ============================================================================
+
+export interface ReplicationAuthOptions {
+ adminToken?: string
+}
+
+export interface ReplicationSnapshotOptions extends ReplicationAuthOptions {
+ includeData?: boolean
+}
+
+export interface ReplicationLogOptions extends ReplicationAuthOptions {
+ cursor?: string
+ maxBytes?: number
+ maxFrames?: number
+ includePayload?: boolean
+}
+
+export interface ReplicationPullOptions extends ReplicationAuthOptions {
+ maxFrames?: number
+}
+
+export async function getReplicationStatus(): Promise {
+ return fetchJson("/replication/status");
+}
+
+export async function getReplicationMetricsPrometheus(
+ options?: ReplicationAuthOptions,
+): Promise {
+ return fetchText("/replication/metrics", {
+ headers: withAuthHeader(options?.adminToken),
+ });
+}
+
+export async function getReplicationSnapshotLatest(
+ options?: ReplicationSnapshotOptions,
+): Promise {
+ const params = new URLSearchParams();
+ if (typeof options?.includeData === "boolean") {
+ params.set("includeData", options.includeData ? "true" : "false");
+ }
+ const query = params.size > 0 ? `?${params.toString()}` : "";
+
+ return fetchJson(`/replication/snapshot/latest${query}`, {
+ headers: withAuthHeader(options?.adminToken),
+ });
+}
+
+export async function getReplicationLog(
+ options?: ReplicationLogOptions,
+): Promise {
+ const params = new URLSearchParams();
+ if (options?.cursor) {
+ params.set("cursor", options.cursor);
+ }
+ if (typeof options?.maxBytes === "number") {
+ params.set("maxBytes", String(options.maxBytes));
+ }
+ if (typeof options?.maxFrames === "number") {
+ params.set("maxFrames", String(options.maxFrames));
+ }
+ if (typeof options?.includePayload === "boolean") {
+ params.set("includePayload", options.includePayload ? "true" : "false");
+ }
+ const query = params.size > 0 ? `?${params.toString()}` : "";
+
+ return fetchJson(`/replication/log${query}`, {
+ headers: withAuthHeader(options?.adminToken),
+ });
+}
+
+export async function pullReplicaOnce(
+ options?: ReplicationPullOptions,
+): Promise {
+ return fetchJson("/replication/pull", {
+ method: "POST",
+ headers: withAuthHeader(options?.adminToken),
+ body: JSON.stringify(
+ typeof options?.maxFrames === "number"
+ ? { maxFrames: options.maxFrames }
+ : {},
+ ),
+ });
+}
+
+export async function reseedReplica(
+ options?: ReplicationAuthOptions,
+): Promise {
+ return fetchJson("/replication/reseed", {
+ method: "POST",
+ headers: withAuthHeader(options?.adminToken),
+ });
+}
+
+export async function promotePrimary(
+ options?: ReplicationAuthOptions,
+): Promise {
+ return fetchJson("/replication/promote", {
+ method: "POST",
+ headers: withAuthHeader(options?.adminToken),
+ });
+}
+
// ============================================================================
// Stats
// ============================================================================
diff --git a/playground/src/client/lib/types.ts b/playground/src/client/lib/types.ts
index 97acbdf..26cb2d0 100644
--- a/playground/src/client/lib/types.ts
+++ b/playground/src/client/lib/types.ts
@@ -65,6 +65,98 @@ export interface ApiResult {
error?: string;
}
+export interface ReplicationReplicaLag {
+ replicaId: string;
+ epoch: number;
+ appliedLogIndex: number;
+}
+
+export interface PrimaryReplicationStatus {
+ role?: string;
+ epoch?: number;
+ headLogIndex?: number;
+ retainedFloor?: number;
+ replicaLags?: ReplicationReplicaLag[];
+ sidecarPath?: string;
+ lastToken?: string | null;
+ appendAttempts?: number;
+ appendFailures?: number;
+ appendSuccesses?: number;
+}
+
+export interface ReplicaReplicationStatus {
+ role?: string;
+ appliedEpoch?: number;
+ appliedLogIndex?: number;
+ needsReseed?: boolean;
+ lastError?: string | null;
+}
+
+export interface ReplicationStatusResponse {
+ connected: boolean;
+ authEnabled?: boolean;
+ role?: "primary" | "replica" | "disabled";
+ primary?: PrimaryReplicationStatus | null;
+ replica?: ReplicaReplicationStatus | null;
+ error?: string;
+}
+
+export interface ReplicationSnapshotResponse extends ApiResult {
+ role?: "primary" | "replica" | "disabled";
+ epoch?: number | null;
+ headLogIndex?: number | null;
+ snapshot?: {
+ format: string;
+ dbPath: string;
+ byteLength: number;
+ sha256: string;
+ generatedAt: string;
+ dataBase64?: string;
+ };
+}
+
+export interface ReplicationLogFrame {
+ epoch: string;
+ logIndex: string;
+ segmentId: string;
+ segmentOffset: string;
+ payloadBase64: string;
+ bytes: number;
+}
+
+export interface ReplicationLogResponse extends ApiResult {
+ role?: "primary" | "replica" | "disabled";
+ epoch?: number | null;
+ headLogIndex?: number | null;
+ retainedFloor?: number | null;
+ request?: {
+ maxBytes: number;
+ maxFrames: number;
+ includePayload: boolean;
+ cursor: string | null;
+ };
+ frames?: ReplicationLogFrame[];
+ nextCursor?: string | null;
+ eof?: boolean;
+}
+
+export interface ReplicationPullResponse extends ApiResult {
+ role?: "primary" | "replica" | "disabled";
+ appliedFrames?: number;
+ replica?: ReplicaReplicationStatus | null;
+}
+
+export interface ReplicationReseedResponse extends ApiResult {
+ role?: "primary" | "replica" | "disabled";
+ replica?: ReplicaReplicationStatus | null;
+}
+
+export interface ReplicationPromoteResponse extends ApiResult {
+ role?: "primary" | "replica" | "disabled";
+ epoch?: number | null;
+ primary?: PrimaryReplicationStatus | null;
+}
+
// ============================================================================
// UI State Types
// ============================================================================
diff --git a/playground/src/server.ts b/playground/src/server.ts
index dc5cdf6..d8543db 100644
--- a/playground/src/server.ts
+++ b/playground/src/server.ts
@@ -7,6 +7,7 @@
import { Elysia } from "elysia";
import { cors } from "@elysiajs/cors";
import { apiRoutes } from "./api/routes.ts";
+import { existsSync } from "node:fs";
import { join } from "node:path";
const PORT = process.env.PORT ? parseInt(process.env.PORT) : 3000;
@@ -24,6 +25,83 @@ const getContentType = (path: string): string => {
return "application/octet-stream";
};
+type TlsFile = ReturnType;
+
+interface PlaygroundTlsConfig {
+ enabled: boolean;
+ protocol: "http" | "https";
+ tls?: {
+ cert: TlsFile;
+ key: TlsFile;
+ ca?: TlsFile;
+ requestCert: boolean;
+ rejectUnauthorized: boolean;
+ };
+}
+
+function parseBooleanEnv(name: string, raw: string | undefined, defaultValue: boolean): boolean {
+ if (raw === undefined) {
+ return defaultValue;
+ }
+
+ const normalized = raw.trim().toLowerCase();
+ if (normalized === "") {
+ return defaultValue;
+ }
+ if (normalized === "1" || normalized === "true" || normalized === "yes" || normalized === "on") {
+ return true;
+ }
+ if (normalized === "0" || normalized === "false" || normalized === "no" || normalized === "off") {
+ return false;
+ }
+ throw new Error(`Invalid ${name} (expected boolean)`);
+}
+
+export function resolvePlaygroundTlsConfig(env: NodeJS.ProcessEnv = process.env): PlaygroundTlsConfig {
+ const certFile = env.PLAYGROUND_TLS_CERT_FILE?.trim();
+ const keyFile = env.PLAYGROUND_TLS_KEY_FILE?.trim();
+ const caFile = env.PLAYGROUND_TLS_CA_FILE?.trim();
+
+ const hasCert = Boolean(certFile && certFile.length > 0);
+ const hasKey = Boolean(keyFile && keyFile.length > 0);
+ if (hasCert !== hasKey) {
+ throw new Error("PLAYGROUND_TLS_CERT_FILE and PLAYGROUND_TLS_KEY_FILE must both be set for TLS");
+ }
+
+ if (!hasCert || !hasKey) {
+ return { enabled: false, protocol: "http" };
+ }
+
+ if (!existsSync(certFile!)) {
+ throw new Error(`PLAYGROUND_TLS_CERT_FILE does not exist: ${certFile}`);
+ }
+ if (!existsSync(keyFile!)) {
+ throw new Error(`PLAYGROUND_TLS_KEY_FILE does not exist: ${keyFile}`);
+ }
+ if (caFile && caFile.length > 0 && !existsSync(caFile)) {
+ throw new Error(`PLAYGROUND_TLS_CA_FILE does not exist: ${caFile}`);
+ }
+
+ const requestCert = parseBooleanEnv("PLAYGROUND_TLS_REQUEST_CERT", env.PLAYGROUND_TLS_REQUEST_CERT, false);
+ const rejectUnauthorized = parseBooleanEnv(
+ "PLAYGROUND_TLS_REJECT_UNAUTHORIZED",
+ env.PLAYGROUND_TLS_REJECT_UNAUTHORIZED,
+ true,
+ );
+
+ return {
+ enabled: true,
+ protocol: "https",
+ tls: {
+ cert: Bun.file(certFile!),
+ key: Bun.file(keyFile!),
+ ...(caFile && caFile.length > 0 ? { ca: Bun.file(caFile) } : {}),
+ requestCert,
+ rejectUnauthorized,
+ },
+ };
+}
+
export const app = new Elysia()
// Enable CORS for development
.use(cors({
@@ -59,12 +137,19 @@ let server: ReturnType | null = null;
if (import.meta.main) {
try {
+ const tlsConfig = resolvePlaygroundTlsConfig();
server = app.listen({
port: PORT,
hostname: "0.0.0.0",
+ ...(tlsConfig.tls ? { tls: tlsConfig.tls } : {}),
});
const actualPort = server.server?.port ?? PORT;
- console.log(`RayDB Playground running at http://localhost:${actualPort}`);
+ console.log(`RayDB Playground running at ${tlsConfig.protocol}://localhost:${actualPort}`);
+ if (tlsConfig.enabled) {
+ console.log(
+ `TLS enabled (requestCert=${tlsConfig.tls?.requestCert ? "true" : "false"}, rejectUnauthorized=${tlsConfig.tls?.rejectUnauthorized ? "true" : "false"})`,
+ );
+ }
} catch (err) {
console.error("Failed to start server", err);
process.exit(1);
From a397fd5877fd3068806f0e0b5bda213d9532ad65 Mon Sep 17 00:00:00 2001
From: mask
Date: Sun, 8 Feb 2026 11:15:14 -0600
Subject: [PATCH 06/58] replication: land phase A-D core + perf gates
---
docs/BENCHMARKS.md | 139 ++-
...-05-index-pipeline-hypothesis-embed200.txt | 44 +
...2-05-index-pipeline-hypothesis-embed50.txt | 44 +
...2-08-replication-catchup-gate.attempt1.txt | 14 +
...2-08-replication-catchup-gate.attempt2.txt | 14 +
...2-08-replication-catchup-gate.attempt3.txt | 14 +
.../2026-02-08-replication-catchup-gate.txt | 14 +
...-08-replication-gate-baseline.attempt1.txt | 60 ++
...-08-replication-gate-baseline.attempt2.txt | 60 ++
...-08-replication-gate-baseline.attempt3.txt | 60 ++
...-08-replication-gate-baseline.attempt4.txt | 60 ++
...-08-replication-gate-baseline.attempt5.txt | 60 ++
...-08-replication-gate-baseline.attempt6.txt | 60 ++
...-08-replication-gate-baseline.attempt7.txt | 60 ++
.../2026-02-08-replication-gate-baseline.txt | 60 ++
...2-08-replication-gate-primary.attempt1.txt | 60 ++
...2-08-replication-gate-primary.attempt2.txt | 60 ++
...2-08-replication-gate-primary.attempt3.txt | 60 ++
...2-08-replication-gate-primary.attempt4.txt | 60 ++
...2-08-replication-gate-primary.attempt5.txt | 60 ++
...2-08-replication-gate-primary.attempt6.txt | 60 ++
...2-08-replication-gate-primary.attempt7.txt | 60 ++
.../2026-02-08-replication-gate-primary.txt | 60 ++
docs/bindings-parity.md | 1 +
.../index_pipeline_hypothesis_bench.rs | 987 ++++++++++++++++++
ray-rs/examples/replication_catchup_bench.rs | 285 +++++
ray-rs/examples/single_file_raw_bench.rs | 11 +
ray-rs/scripts/replication-bench-gate.sh | 149 +++
ray-rs/scripts/replication-catchup-gate.sh | 107 ++
ray-rs/scripts/replication-perf-gate.sh | 14 +
ray-rs/src/api/kite.rs | 106 +-
ray-rs/src/core/single_file/mod.rs | 6 +
ray-rs/src/core/single_file/open.rs | 104 +-
ray-rs/src/core/single_file/transaction.rs | 43 +-
ray-rs/src/error.rs | 4 +
ray-rs/src/lib.rs | 3 +
ray-rs/src/napi_bindings/kite/mod.rs | 116 +-
ray-rs/src/napi_bindings/kite/types.rs | 16 +-
ray-rs/src/pyo3_bindings/options/open.rs | 86 +-
ray-rs/src/pyo3_bindings/stats/metrics.rs | 132 +++
ray-rs/src/pyo3_bindings/stats/mod.rs | 3 +-
ray-rs/src/replication/log_store.rs | 401 +++++++
ray-rs/src/replication/manifest.rs | 214 ++++
ray-rs/src/replication/mod.rs | 16 +
ray-rs/src/replication/primary.rs | 640 ++++++++++++
ray-rs/src/replication/replica.rs | 324 ++++++
ray-rs/src/replication/token.rs | 3 +
ray-rs/src/replication/transport.rs | 88 ++
ray-rs/src/replication/types.rs | 238 +++++
ray-rs/tests/replication_faults_phase_d.rs | 144 +++
ray-rs/tests/replication_phase_a.rs | 205 ++++
ray-rs/tests/replication_phase_b.rs | 137 +++
ray-rs/tests/replication_phase_c.rs | 262 +++++
53 files changed, 6076 insertions(+), 12 deletions(-)
create mode 100644 docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed200.txt
create mode 100644 docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed50.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt1.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt2.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt3.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-catchup-gate.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt1.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt2.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt3.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt4.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt5.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt6.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt7.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-baseline.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt1.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt2.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt3.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt4.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt5.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt6.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt7.txt
create mode 100644 docs/benchmarks/results/2026-02-08-replication-gate-primary.txt
create mode 100644 ray-rs/examples/index_pipeline_hypothesis_bench.rs
create mode 100644 ray-rs/examples/replication_catchup_bench.rs
create mode 100755 ray-rs/scripts/replication-bench-gate.sh
create mode 100755 ray-rs/scripts/replication-catchup-gate.sh
create mode 100755 ray-rs/scripts/replication-perf-gate.sh
create mode 100644 ray-rs/src/replication/log_store.rs
create mode 100644 ray-rs/src/replication/manifest.rs
create mode 100644 ray-rs/src/replication/mod.rs
create mode 100644 ray-rs/src/replication/primary.rs
create mode 100644 ray-rs/src/replication/replica.rs
create mode 100644 ray-rs/src/replication/token.rs
create mode 100644 ray-rs/src/replication/transport.rs
create mode 100644 ray-rs/src/replication/types.rs
create mode 100644 ray-rs/tests/replication_faults_phase_d.rs
create mode 100644 ray-rs/tests/replication_phase_a.rs
create mode 100644 ray-rs/tests/replication_phase_b.rs
create mode 100644 ray-rs/tests/replication_phase_c.rs
diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md
index 0c839c3..f25ea3e 100644
--- a/docs/BENCHMARKS.md
+++ b/docs/BENCHMARKS.md
@@ -3,7 +3,7 @@
This document summarizes **measured** benchmark results. Raw outputs live in
`docs/benchmarks/results/` so we can trace every number back to an actual run.
-> Latest numbers below were captured on **February 4, 2026**. Prior results
+> Latest numbers below were captured on **February 4-5, 2026**. Prior results
> from **February 3, 2026** are retained for comparison. If you need fresh
> numbers, rerun the commands in the next section and update this doc with the
> new output files.
@@ -34,6 +34,19 @@ Optional knobs (Rust):
- `--group-commit-enabled`
- `--group-commit-window-ms N` (default: 2)
+### Rust (replication catch-up throughput)
+
+```bash
+cd ray-rs
+cargo run --release --example replication_catchup_bench --no-default-features -- \
+ --seed-commits 1000 --backlog-commits 5000 --max-frames 256 --sync-mode normal
+```
+
+Key outputs:
+- `primary_frames_per_sec`
+- `catchup_frames_per_sec`
+- `throughput_ratio` (`catchup/primary`)
+
### Python bindings (single-file raw)
```bash
@@ -64,6 +77,24 @@ cargo run --release --example vector_bench --no-default-features -- \
--vectors 10000 --dimensions 768 --iterations 1000 --k 10 --n-probe 10
```
+### Index pipeline hypothesis (network-dominant)
+
+```bash
+cd ray-rs
+cargo run --release --example index_pipeline_hypothesis_bench --no-default-features -- \
+ --mode both --changes 200 --working-set 200 --vector-dims 128 \
+ --tree-sitter-latency-ms 2 --scip-latency-ms 6 --embed-latency-ms 200 \
+ --embed-batch-size 32 --embed-flush-ms 20 --embed-inflight 4 \
+ --vector-apply-batch-size 64 --sync-mode normal
+```
+
+Interpretation:
+- If `parallel` hot-path elapsed is much lower than `sequential`, async embed queueing is working.
+- If `parallel` hot-path p95 is lower than `sequential`, TS+SCIP parallel parse plus unified graph commit is working.
+- If `parallel` freshness p95 is too high, tune `--embed-batch-size`, `--embed-flush-ms`,
+ and `--embed-inflight` (or reduce overwrite churn with larger working set / dedupe rules).
+- Replacement ratio (`Queue ... replaced=...`) quantifies stale embed work eliminated by dedupe.
+
### SQLite baseline (single-file raw)
```bash
@@ -78,6 +109,83 @@ Notes (SQLite):
- WAL autocheckpoint disabled; `journal_size_limit` set to match WAL size
- Edge props stored in a separate table; edges use `INSERT OR IGNORE` and props use `INSERT OR REPLACE`
+### Replication performance gates (Phase D carry-over)
+
+Run both replication perf gates:
+
+```bash
+cd ray-rs
+./scripts/replication-perf-gate.sh
+```
+
+#### Gate A: primary commit overhead
+
+Compares write latency with replication disabled vs enabled (`role=primary`)
+using the same benchmark harness.
+
+```bash
+cd ray-rs
+./scripts/replication-bench-gate.sh
+```
+
+Defaults:
+- Dataset: `NODES=10000`, `EDGES=50000`, `EDGE_TYPES=3`, `EDGE_PROPS=10`
+- `ITERATIONS=20000`
+- `SYNC_MODE=normal`
+- `ATTEMPTS=7` (median ratio across attempts is used for pass/fail)
+- Pass threshold: `P95_MAX_RATIO=1.03` (replication-on p95 / baseline p95)
+- `ITERATIONS` must be `>= 100`
+
+Example override:
+
+```bash
+cd ray-rs
+ITERATIONS=2000 ATTEMPTS=5 P95_MAX_RATIO=1.05 ./scripts/replication-bench-gate.sh
+```
+
+Outputs:
+- `docs/benchmarks/results/YYYY-MM-DD-replication-gate-baseline.txt` (single-attempt mode)
+- `docs/benchmarks/results/YYYY-MM-DD-replication-gate-primary.txt` (single-attempt mode)
+- `docs/benchmarks/results/YYYY-MM-DD-replication-gate-{baseline,primary}.attemptN.txt` (multi-attempt mode)
+
+#### Gate B: replica catch-up throughput
+
+Ensures replica catch-up throughput stays healthy relative to primary commit
+throughput on the same workload.
+
+```bash
+cd ray-rs
+./scripts/replication-catchup-gate.sh
+```
+
+Defaults:
+- `SEED_COMMITS=1000`
+- `BACKLOG_COMMITS=5000`
+- `MAX_FRAMES=256`
+- `SYNC_MODE=normal`
+- `ATTEMPTS=3` (retry count for noisy host variance)
+- Pass threshold: `MIN_CATCHUP_FPS=3000`
+- Pass threshold: `MIN_THROUGHPUT_RATIO=0.13` (catch-up fps / primary fps)
+- `BACKLOG_COMMITS` must be `>= 100`
+
+Example override:
+
+```bash
+cd ray-rs
+BACKLOG_COMMITS=10000 ATTEMPTS=5 MIN_THROUGHPUT_RATIO=0.10 ./scripts/replication-catchup-gate.sh
+```
+
+Output:
+- `docs/benchmarks/results/YYYY-MM-DD-replication-catchup-gate.txt` (single-attempt mode)
+- `docs/benchmarks/results/YYYY-MM-DD-replication-catchup-gate.attemptN.txt` (multi-attempt mode)
+
+Notes:
+- Gate A = commit-path overhead.
+- Gate B = replica apply throughput.
+- Keep replication correctness suite green alongside perf gates:
+  - `cargo test --no-default-features --test replication_phase_a --test replication_phase_b --test replication_phase_c --test replication_faults_phase_d`
+ - `cargo test --no-default-features replication::`
+
## Latest Results (2026-02-04)
Sync-mode sweep logs (nodes-only + edges-heavy datasets):
@@ -336,6 +444,35 @@ Sync=Off, GC off:
| 10 | 313.67K/s |
| 16 | 296.99K/s |
+#### Index pipeline hypothesis notes (2026-02-05)
+
+Goal: validate whether remote embedding latency dominates enough that we should
+decouple graph hot path from vector persistence using async batching + dedupe.
+
+Harness:
+- `ray-rs/examples/index_pipeline_hypothesis_bench.rs`
+- Simulated tree-sitter + SCIP parse, graph writes, synthetic embed latency, batched vector apply.
+- `sequential`: TS parse -> TS graph commit -> SCIP parse -> SCIP graph commit -> embed -> vector apply.
+- `parallel`: TS+SCIP parse overlap -> unified graph commit -> async embed queue -> batched vector apply.
+
+Sample runs (200 events, working set=200, batch=32, flush=20ms, inflight=4, vector-apply-batch=64):
+
+| TS/SCIP parse | Embed latency | Mode | Hot path elapsed | Total elapsed | Hot p95 | Freshness p95 | Replaced jobs |
+|---------------|---------------|------|------------------|---------------|---------|----------------|---------------|
+| 1ms / 1ms | 50ms/batch | Sequential | 11.260s | 11.314s | 2.64ms | 55.09ms | n/a |
+| 1ms / 1ms | 50ms/batch | Parallel | 0.255s | 0.329s | 1.30ms | 168.43ms | 6.00% |
+| 2ms / 6ms | 200ms/batch | Sequential | 42.477s | 42.679s | 10.22ms | 205.11ms | n/a |
+| 2ms / 6ms | 200ms/batch | Parallel | 1.448s | 1.687s | 7.60ms | 775.61ms | 5.50% |
+
+Takeaway:
+- Hot path throughput improves dramatically with async pipeline.
+- Vector freshness depends on batching/queue pressure and overwrite churn; tune freshness separately
+ from hot-path latency target.
+
+Raw logs:
+- `docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed50.txt`
+- `docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed200.txt`
+
## Prior Results (2026-02-03)
Raw logs:
diff --git a/docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed200.txt b/docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed200.txt
new file mode 100644
index 0000000..c82ecb1
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed200.txt
@@ -0,0 +1,44 @@
+==================================================================
+Index Pipeline Hypothesis Benchmark
+==================================================================
+Mode: Both
+Changes: 200
+Working set: 200
+Vector dims: 128
+Parse latency: tree-sitter=2ms scip=6ms
+Embed latency: 200ms per batch
+Embed batching: size=32 flush=20ms inflight=4
+Vector apply batch size: 64
+WAL size: 1073741824 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Seed: 42
+==================================================================
+
+--- sequential ---
+Changes: 200
+Vectors applied: 200
+Hot path elapsed: 42.477s
+Total elapsed: 42.679s
+Hot path rate: 4.71/s
+End-to-end rate: 4.69/s
+Hot path latency: p50=10.04ms p95=10.22ms p99=10.98ms
+Vector freshness: p50=204.09ms p95=205.11ms p99=206.13ms
+
+--- parallel ---
+Changes: 200
+Vectors applied: 189
+Hot path elapsed: 1.448s
+Total elapsed: 1.687s
+Hot path rate: 138.14/s
+End-to-end rate: 118.56/s
+Hot path latency: p50=7.54ms p95=7.60ms p99=7.65ms
+Vector freshness: p50=520.38ms p95=775.61ms p99=845.95ms
+Queue: enqueued=200 replaced=11 (5.50%) max_depth=23 avg_depth=8.58
+
+=== Comparison (sequential vs parallel) ===
+Hot path elapsed speedup: 29.34x
+End-to-end elapsed speedup: 25.30x
+Hot p95: 10.22ms -> 7.60ms
+Freshness p95: 205.11ms -> 775.61ms
diff --git a/docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed50.txt b/docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed50.txt
new file mode 100644
index 0000000..18da4c3
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-05-index-pipeline-hypothesis-embed50.txt
@@ -0,0 +1,44 @@
+==================================================================
+Index Pipeline Hypothesis Benchmark
+==================================================================
+Mode: Both
+Changes: 200
+Working set: 200
+Vector dims: 128
+Parse latency: tree-sitter=1ms scip=1ms
+Embed latency: 50ms per batch
+Embed batching: size=32 flush=20ms inflight=4
+Vector apply batch size: 64
+WAL size: 1073741824 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Seed: 42
+==================================================================
+
+--- sequential ---
+Changes: 200
+Vectors applied: 200
+Hot path elapsed: 11.260s
+Total elapsed: 11.314s
+Hot path rate: 17.76/s
+End-to-end rate: 17.68/s
+Hot path latency: p50=2.57ms p95=2.64ms p99=2.71ms
+Vector freshness: p50=54.87ms p95=55.09ms p99=55.15ms
+
+--- parallel ---
+Changes: 200
+Vectors applied: 188
+Hot path elapsed: 0.255s
+Total elapsed: 0.329s
+Hot path rate: 783.55/s
+End-to-end rate: 607.46/s
+Hot path latency: p50=1.27ms p95=1.30ms p99=1.35ms
+Vector freshness: p50=123.01ms p95=168.43ms p99=181.80ms
+Queue: enqueued=200 replaced=12 (6.00%) max_depth=34 avg_depth=13.29
+
+=== Comparison (sequential vs parallel) ===
+Hot path elapsed speedup: 44.11x
+End-to-end elapsed speedup: 34.36x
+Hot p95: 2.64ms -> 1.30ms
+Freshness p95: 55.09ms -> 168.43ms
diff --git a/docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt1.txt b/docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt1.txt
new file mode 100644
index 0000000..665e0f4
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt1.txt
@@ -0,0 +1,14 @@
+replication_catchup_bench
+sync_mode: normal
+seed_commits: 1000
+backlog_commits: 5000
+max_frames: 256
+applied_frames: 5234
+catchup_loops: 21
+produce_elapsed_ms: 209.384
+catchup_elapsed_ms: 1189.310
+primary_frames_per_sec: 23879.53
+catchup_frames_per_sec: 4400.87
+throughput_ratio: 0.1843
+primary_head_log_index: 6000
+replica_applied: 1:6000
diff --git a/docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt2.txt b/docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt2.txt
new file mode 100644
index 0000000..ec03c80
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt2.txt
@@ -0,0 +1,14 @@
+replication_catchup_bench
+sync_mode: normal
+seed_commits: 1000
+backlog_commits: 5000
+max_frames: 256
+applied_frames: 5234
+catchup_loops: 21
+produce_elapsed_ms: 175.423
+catchup_elapsed_ms: 1392.363
+primary_frames_per_sec: 28502.51
+catchup_frames_per_sec: 3759.08
+throughput_ratio: 0.1319
+primary_head_log_index: 6000
+replica_applied: 1:6000
diff --git a/docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt3.txt b/docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt3.txt
new file mode 100644
index 0000000..2d7c144
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-catchup-gate.attempt3.txt
@@ -0,0 +1,14 @@
+replication_catchup_bench
+sync_mode: normal
+seed_commits: 1000
+backlog_commits: 5000
+max_frames: 256
+applied_frames: 5234
+catchup_loops: 21
+produce_elapsed_ms: 196.018
+catchup_elapsed_ms: 1498.115
+primary_frames_per_sec: 25507.88
+catchup_frames_per_sec: 3493.72
+throughput_ratio: 0.1370
+primary_head_log_index: 6000
+replica_applied: 1:6000
diff --git a/docs/benchmarks/results/2026-02-08-replication-catchup-gate.txt b/docs/benchmarks/results/2026-02-08-replication-catchup-gate.txt
new file mode 100644
index 0000000..c2c2f4d
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-catchup-gate.txt
@@ -0,0 +1,14 @@
+replication_catchup_bench
+sync_mode: normal
+seed_commits: 1000
+backlog_commits: 5000
+max_frames: 256
+applied_frames: 5234
+catchup_loops: 21
+produce_elapsed_ms: 285.311
+catchup_elapsed_ms: 1837.411
+primary_frames_per_sec: 17524.76
+catchup_frames_per_sec: 2848.57
+throughput_ratio: 0.1625
+primary_head_log_index: 6000
+replica_applied: 1:6000
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt1.txt b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt1.txt
new file mode 100644
index 0000000..1a67695
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt1.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: false
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 122ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 140.00us p95= 191.08us p99= 191.08us max= 191.08us (7010 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 117ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 166ns p99= 375ns max= 410.33us (8019722 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 167ns p95= 250ns p99= 333ns max= 548.71us (4624811 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 15.96us (11209568 ops/sec)
+node_vector() random p50= 125ns p95= 125ns p99= 333ns max= 28.21us (8741354 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 6.04us (16255273 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 31.75us p95= 38.33us p99= 97.83us max= 97.83us (29921 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 43.21us p95= 62.58us p99= 92.50us max= 92.50us (22061 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 166.42us p95= 208.50us p99= 283.62us max= 283.62us (5770 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt2.txt b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt2.txt
new file mode 100644
index 0000000..ff1479d
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt2.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: false
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 118ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 119.42us p95= 222.46us p99= 222.46us max= 222.46us (7603 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 121ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 208ns p99= 375ns max= 410.79us (7433366 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 333ns p99= 458ns max= 572.71us (4119218 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 36.33us (10751544 ops/sec)
+node_vector() random p50= 125ns p95= 125ns p99= 292ns max= 3.04us (8940697 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 6.08us (16207482 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 31.17us p95= 40.12us p99= 95.42us max= 95.42us (29564 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 35.21us p95= 47.29us p99= 89.21us max= 89.21us (26505 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 165.04us p95= 213.42us p99= 323.88us max= 323.88us (5727 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt3.txt b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt3.txt
new file mode 100644
index 0000000..be08c57
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt3.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: false
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 133ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 125.29us p95= 2.52ms p99= 2.52ms max= 2.52ms (2691 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 123ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 167ns p99= 375ns max= 439.42us (7536110 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 292ns p99= 375ns max= 525.75us (4366332 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 500ns (11050163 ops/sec)
+node_vector() random p50= 125ns p95= 208ns p99= 292ns max= 34.29us (8258136 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 334ns (16586650 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 35.08us p95= 45.38us p99= 95.75us max= 95.75us (27260 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 37.67us p95= 48.50us p99= 111.08us max= 111.08us (25065 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 174.12us p95= 191.71us p99= 276.71us max= 276.71us (5638 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt4.txt b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt4.txt
new file mode 100644
index 0000000..4752878
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt4.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: false
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 130ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 128.62us p95= 241.96us p99= 241.96us max= 241.96us (6979 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 119ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 208ns p99= 375ns max= 440.83us (7440872 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 292ns p99= 417ns max= 487.71us (4284940 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 625ns (11142862 ops/sec)
+node_vector() random p50= 125ns p95= 167ns p99= 333ns max= 6.75us (8539188 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 1.67us (16853914 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 35.67us p95= 45.88us p99= 107.33us max= 107.33us (26608 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 36.29us p95= 46.79us p99= 108.21us max= 108.21us (26008 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 174.00us p95= 204.71us p99= 274.62us max= 274.62us (5593 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt5.txt b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt5.txt
new file mode 100644
index 0000000..286e1b0
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt5.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: false
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 121ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 137.71us p95= 261.00us p99= 261.00us max= 261.00us (6679 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 119ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 167ns p99= 458ns max= 405.12us (7643650 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 291ns p99= 375ns max= 478.83us (4446965 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 30.29us (10672763 ops/sec)
+node_vector() random p50= 125ns p95= 125ns p99= 291ns max= 10.17us (9008473 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 37.42us (15804530 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 35.92us p95= 45.00us p99= 101.96us max= 101.96us (26753 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 36.00us p95= 48.29us p99= 120.08us max= 120.08us (25882 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 166.50us p95= 202.33us p99= 268.96us max= 268.96us (5810 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt6.txt b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt6.txt
new file mode 100644
index 0000000..2d7322d
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt6.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: false
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 117ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 135.62us p95= 241.17us p99= 241.17us max= 241.17us (6755 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 121ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 208ns p99= 375ns max= 425.29us (7313884 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 292ns p99= 375ns max= 537.58us (4401032 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 13.33us (11008803 ops/sec)
+node_vector() random p50= 125ns p95= 209ns p99= 333ns max= 12.38us (8192810 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 292ns (16664445 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 35.21us p95= 49.25us p99= 106.29us max= 106.29us (27062 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 42.38us p95= 52.79us p99= 97.79us max= 97.79us (22964 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 172.38us p95= 194.50us p99= 302.12us max= 302.12us (5661 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt7.txt b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt7.txt
new file mode 100644
index 0000000..33eb443
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.attempt7.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: false
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 123ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 125.79us p95= 311.12us p99= 311.12us max= 311.12us (6768 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 117ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 208ns p99= 417ns max= 452.42us (7332599 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 292ns p99= 417ns max= 510.08us (4274611 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 11.58us (11242069 ops/sec)
+node_vector() random p50= 125ns p95= 209ns p99= 333ns max= 13.62us (8355290 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 15.12us (15962660 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 37.04us p95= 51.96us p99= 108.04us max= 108.04us (25994 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 39.04us p95= 48.08us p99= 80.62us max= 80.62us (24608 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 173.21us p95= 195.38us p99= 301.79us max= 301.79us (5678 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-baseline.txt b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.txt
new file mode 100644
index 0000000..8595963
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-baseline.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 5,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: false
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 119ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 157.33us p95= 243.29us p99= 243.29us max= 243.29us (6109 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 124ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 125ns p95= 375ns p99= 583ns max= 439.12us (4247543 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 333ns p99= 459ns max= 516.17us (3109783 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 84ns p95= 208ns p99= 334ns max= 14.50us (8622459 ops/sec)
+node_vector() random p50= 125ns p95= 291ns p99= 417ns max= 1.92us (7523639 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 541ns (16609309 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 32.38us p95= 46.29us p99= 93.12us max= 93.12us (29030 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 43.92us p95= 106.08us p99= 119.50us max= 119.50us (19805 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 172.38us p95= 241.29us p99= 331.21us max= 331.21us (5485 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt1.txt b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt1.txt
new file mode 100644
index 0000000..d929cd8
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt1.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: true
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 121ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 121.21us p95= 247.21us p99= 247.21us max= 247.21us (6726 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 118ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 125ns p99= 333ns max= 400.79us (8299775 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 292ns p99= 459ns max= 503.96us (4187877 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 11.50us (11186314 ops/sec)
+node_vector() random p50= 125ns p95= 250ns p99= 417ns max= 18.21us (8077675 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 1.96us (16982052 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 31.71us p95= 39.29us p99= 96.38us max= 96.38us (29901 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 36.79us p95= 46.33us p99= 111.21us max= 111.21us (25630 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 175.79us p95= 207.83us p99= 274.54us max= 274.54us (5537 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt2.txt b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt2.txt
new file mode 100644
index 0000000..be132ba
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt2.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: true
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 135ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 120.54us p95= 217.79us p99= 217.79us max= 217.79us (7110 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 125ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 375ns p99= 583ns max= 427.08us (6034754 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 292ns p99= 459ns max= 531.04us (4184697 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 208ns max= 5.96us (10670360 ops/sec)
+node_vector() random p50= 125ns p95= 209ns p99= 334ns max= 34.75us (8338642 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 1.62us (16541940 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 34.88us p95= 45.62us p99= 103.75us max= 103.75us (27382 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 42.25us p95= 53.54us p99= 107.71us max= 107.71us (22478 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 175.92us p95= 262.46us p99= 404.67us max= 404.67us (5405 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt3.txt b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt3.txt
new file mode 100644
index 0000000..420e49e
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt3.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: true
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 123ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 124.75us p95= 243.96us p99= 243.96us max= 243.96us (6829 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 121ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 166ns p99= 334ns max= 388.25us (8119958 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 333ns p99= 542ns max= 540.71us (4065681 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 3.75us (11139815 ops/sec)
+node_vector() random p50= 125ns p95= 167ns p99= 458ns max= 13.12us (8167673 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 583ns (16475306 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 34.54us p95= 49.33us p99= 112.21us max= 112.21us (26552 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 43.00us p95= 55.71us p99= 109.67us max= 109.67us (22284 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 167.12us p95= 202.75us p99= 259.08us max= 259.08us (5804 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt4.txt b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt4.txt
new file mode 100644
index 0000000..ca94e66
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt4.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: true
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 124ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 143.92us p95= 268.12us p99= 268.12us max= 268.12us (6199 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 118ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 166ns p99= 375ns max= 388.83us (7801067 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 292ns p99= 375ns max= 761.21us (4074774 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 166ns max= 4.58us (11173752 ops/sec)
+node_vector() random p50= 125ns p95= 125ns p99= 250ns max= 29.42us (8860551 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 2.25us (16968595 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 35.75us p95= 42.92us p99= 108.54us max= 108.54us (27278 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 38.33us p95= 52.75us p99= 94.75us max= 94.75us (24428 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 173.46us p95= 198.42us p99= 283.46us max= 283.46us (5648 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt5.txt b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt5.txt
new file mode 100644
index 0000000..7aa52c5
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt5.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: true
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 126ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 144.71us p95= 238.04us p99= 238.04us max= 238.04us (6522 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 121ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 250ns p99= 417ns max= 396.42us (7318046 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 292ns p99= 375ns max= 599.25us (4275085 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 4.92us (11215704 ops/sec)
+node_vector() random p50= 125ns p95= 167ns p99= 292ns max= 6.00us (8766993 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 1.04us (16791244 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 32.54us p95= 38.75us p99= 105.38us max= 105.38us (29028 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 35.29us p95= 44.46us p99= 67.75us max= 67.75us (27413 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 166.33us p95= 206.75us p99= 297.29us max= 297.29us (5758 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt6.txt b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt6.txt
new file mode 100644
index 0000000..5e6a876
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt6.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: true
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 119ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 150.29us p95= 248.42us p99= 248.42us max= 248.42us (6196 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 121ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 125ns p95= 333ns p99= 750ns max= 446.88us (5825695 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 167ns p95= 292ns p99= 416ns max= 506.54us (4521430 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 875ns (11583264 ops/sec)
+node_vector() random p50= 84ns p95= 125ns p99= 250ns max= 2.96us (9288173 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 11.88us (17519210 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 33.62us p95= 39.17us p99= 102.12us max= 102.12us (28465 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 37.92us p95= 49.75us p99= 122.62us max= 122.62us (24472 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 170.79us p95= 236.29us p99= 327.79us max= 327.79us (5609 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt7.txt b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt7.txt
new file mode 100644
index 0000000..95a4e65
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-primary.attempt7.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 20,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: true
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 117ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 125.54us p95= 236.12us p99= 236.12us max= 236.12us (6663 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 121ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 125ns p99= 333ns max= 422.50us (8204694 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 333ns p99= 458ns max= 491.79us (3996020 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 83ns p95= 125ns p99= 125ns max= 27.62us (10775850 ops/sec)
+node_vector() random p50= 125ns p95= 250ns p99= 375ns max= 23.50us (7681138 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 583ns (16248010 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 37.12us p95= 51.17us p99= 97.25us max= 97.25us (25181 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 44.04us p95= 59.33us p99= 103.12us max= 103.12us (21945 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 176.75us p95= 228.33us p99= 299.38us max= 299.38us (5409 ops/sec)
diff --git a/docs/benchmarks/results/2026-02-08-replication-gate-primary.txt b/docs/benchmarks/results/2026-02-08-replication-gate-primary.txt
new file mode 100644
index 0000000..0e268e7
--- /dev/null
+++ b/docs/benchmarks/results/2026-02-08-replication-gate-primary.txt
@@ -0,0 +1,60 @@
+========================================================================================================================
+Single-file Raw Benchmark (Rust)
+========================================================================================================================
+Nodes: 10,000
+Edges: 50,000
+Edge types: 3
+Edge props: 10
+Iterations: 5,000
+WAL size: 67,108,864 bytes
+Sync mode: Normal
+Group commit: false (window 2ms)
+Auto-checkpoint: false
+Checkpoint threshold: 0.8
+Vector dims: 128
+Vector count: 1,000
+Replication primary: true
+Skip checkpoint: false
+Reopen read-only: false
+========================================================================================================================
+
+[1/6] Building graph...
+ Creating nodes...
+
Created 5000 / 10000 nodes
Created 10000 / 10000 nodes
+ Creating edges...
+
Created 5000 / 50000 edges
Created 10000 / 50000 edges
Created 15000 / 50000 edges
Created 20000 / 50000 edges
Created 25000 / 50000 edges
Created 30000 / 50000 edges
Created 35000 / 50000 edges
Created 40000 / 50000 edges
Created 45000 / 50000 edges
Created 50000 / 50000 edges
+ Built in 115ms
+
+[2/6] Vector setup...
+
+--- Vector Operations ---
+Set vectors (batch 100) p50= 146.83us p95= 238.42us p99= 238.42us max= 238.42us (6490 ops/sec)
+
+[3/6] Checkpointing...
+ Checkpointed in 125ms
+
+[4/6] Key lookup benchmarks...
+
+--- Key Lookups (node_by_key) ---
+Random existing keys p50= 84ns p95= 333ns p99= 459ns max= 447.21us (4546260 ops/sec)
+
+[5/6] Traversal and edge benchmarks...
+
+--- 1-Hop Traversals (out) ---
+Random nodes p50= 208ns p95= 334ns p99= 709ns max= 527.21us (3068479 ops/sec)
+
+--- Edge Exists ---
+Random edge exists p50= 84ns p95= 167ns p99= 292ns max= 11.38us (9021249 ops/sec)
+node_vector() random p50= 125ns p95= 250ns p99= 375ns max= 1.46us (7641054 ops/sec)
+has_node_vector() random p50= 42ns p95= 84ns p99= 84ns max= 334ns (16867161 ops/sec)
+
+[6/6] Write benchmarks...
+
+--- Batch Writes (100 nodes) ---
+Batch of 100 nodes p50= 35.17us p95= 55.12us p99= 106.67us max= 106.67us (25776 ops/sec)
+
+--- Batch Writes (100 edges) ---
+Batch of 100 edges p50= 40.79us p95= 56.33us p99= 103.50us max= 103.50us (23632 ops/sec)
+
+--- Batch Writes (100 edges + props) ---
+Batch of 100 edges + props p50= 173.88us p95= 285.83us p99= 310.67us max= 310.67us (5425 ops/sec)
diff --git a/docs/bindings-parity.md b/docs/bindings-parity.md
index 51fc0a1..6bb3002 100644
--- a/docs/bindings-parity.md
+++ b/docs/bindings-parity.md
@@ -29,6 +29,7 @@ Legend
| Vector PropValue | Full | Missing | Full | Python bindings do not expose PropValue VectorF32 |
| Schema IDs/labels | Full | Full | Full | Labels, edge types, prop keys |
| Cache API | Full | Full | Full | Python/NAPI include extra cache control |
+| Replication controls + status (Phase D) | Full | Full | Full | Promote, retention, reseed, token wait, primary/replica status |
| Integrity check | Full | Missing | Full | Single-file uses full snapshot check |
| Optimize/compact | Full | Partial | Full | Single-file checkpoint + vacuum/options exposed |
| Vector embeddings | Full | Full | Full | `set/get/del/has` node vectors |
diff --git a/ray-rs/examples/index_pipeline_hypothesis_bench.rs b/ray-rs/examples/index_pipeline_hypothesis_bench.rs
new file mode 100644
index 0000000..17ae3cb
--- /dev/null
+++ b/ray-rs/examples/index_pipeline_hypothesis_bench.rs
@@ -0,0 +1,987 @@
+//! Index pipeline hypothesis benchmark for code intelligence workloads.
+//!
+//! Tests two modes:
+//! 1) Sequential: tree-sitter parse -> TS graph write -> SCIP parse -> SCIP graph write ->
+//! embed (simulated network) -> vector write.
+//! 2) Parallel: tree-sitter + SCIP parse in parallel -> unified graph write -> enqueue;
+//! async embed workers batch results; vector writer applies batched writes.
+//!
+//! Goal: verify whether network latency dominates enough that async batching is the
+//! right architecture choice.
+//!
+//! Usage:
+//! cargo run --release --example index_pipeline_hypothesis_bench --no-default-features -- [options]
+//!
+//! Options:
+//! --mode MODE sequential|parallel|both (default: both)
+//! --changes N Number of change events (default: 20000)
+//! --working-set N Distinct chunk keys reused by events (default: 2000)
+//! --vector-dims N Vector dimensions (default: 128)
+//! --tree-sitter-latency-ms N Simulated tree-sitter parse latency per event (default: 0)
+//! --scip-latency-ms N Simulated SCIP parse latency per event (default: 0)
+//! --embed-latency-ms N Simulated remote embedding latency per batch (default: 200)
+//! --embed-batch-size N Embedding request batch size (default: 64)
+//! --embed-flush-ms N Max wait to fill embed batch (default: 25)
+//! --embed-inflight N Parallel embedding requests (default: 4)
+//! --vector-apply-batch-size N Vector writes per DB transaction (default: 256)
+//! --wal-size BYTES WAL size in bytes (default: 1073741824)
+//! --sync-mode MODE Sync mode: full|normal|off (default: normal)
+//! --group-commit-enabled Enable group commit (default: false)
+//! --group-commit-window-ms N Group commit window in ms (default: 2)
+//! --auto-checkpoint Enable auto-checkpoint (default: false)
+//! --seed N RNG seed for event generation (default: 42)
+//! --keep-db Keep generated DB files for inspection
+
+use std::collections::{HashMap, VecDeque};
+use std::env;
+use std::path::PathBuf;
+use std::sync::{Arc, Condvar, Mutex};
+use std::thread;
+use std::time::{Duration, Instant};
+
+use crossbeam_channel::{unbounded, Receiver, Sender};
+use rand::{rngs::StdRng, Rng, SeedableRng};
+use tempfile::tempdir;
+
+use kitedb::core::single_file::{
+ close_single_file, open_single_file, SingleFileDB, SingleFileOpenOptions, SyncMode,
+};
+use kitedb::types::{ETypeId, NodeId, PropKeyId, PropValue};
+
+/// Which pipeline architecture(s) to benchmark (see module docs above).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Mode {
+    /// Parse -> graph write -> embed -> vector write, one event at a time.
+    Sequential,
+    /// Parallel parsing with queued embedding and batched vector writes.
+    Parallel,
+    /// Run both modes in one invocation for comparison.
+    Both,
+}
+
+/// Tunable parameters for one benchmark run, populated from CLI flags.
+/// Defaults and flag names are documented in the usage block at the top
+/// of this file.
+#[derive(Debug, Clone)]
+struct BenchConfig {
+    /// Which pipeline mode(s) to run.
+    mode: Mode,
+    /// Number of simulated change events to feed through the pipeline.
+    changes: usize,
+    /// Distinct chunk keys reused by the events.
+    working_set: usize,
+    /// Dimensionality of the embedding vectors.
+    vector_dims: usize,
+    /// Simulated tree-sitter parse latency per event (ms).
+    tree_sitter_latency_ms: u64,
+    /// Simulated SCIP parse latency per event (ms).
+    scip_latency_ms: u64,
+    /// Simulated remote embedding latency per request batch (ms).
+    embed_latency_ms: u64,
+    /// Embedding request batch size (chunks per request).
+    embed_batch_size: usize,
+    /// Max wait to fill an embed batch before flushing (ms).
+    embed_flush_ms: u64,
+    /// Number of parallel in-flight embedding requests.
+    embed_inflight: usize,
+    /// Vector writes applied per DB transaction.
+    vector_apply_batch_size: usize,
+    /// WAL size in bytes passed to the single-file DB.
+    wal_size: usize,
+    /// Commit durability mode (full/normal/off).
+    sync_mode: SyncMode,
+    group_commit_enabled: bool,
+    group_commit_window_ms: u64,
+    auto_checkpoint: bool,
+    /// RNG seed for deterministic event generation.
+    seed: u64,
+    /// Keep the generated DB files for inspection instead of deleting them.
+    keep_db: bool,
+}
+
+impl Default for BenchConfig {
+    /// Defaults mirror the option list documented in the file header
+    /// (e.g. 20k changes over a 2k-chunk working set, 200ms embed latency).
+    fn default() -> Self {
+        Self {
+            mode: Mode::Both,
+            changes: 20_000,
+            working_set: 2_000,
+            vector_dims: 128,
+            tree_sitter_latency_ms: 0,
+            scip_latency_ms: 0,
+            embed_latency_ms: 200,
+            embed_batch_size: 64,
+            embed_flush_ms: 25,
+            embed_inflight: 4,
+            vector_apply_batch_size: 256,
+            wal_size: 1024 * 1024 * 1024, // 1 GiB
+            sync_mode: SyncMode::Normal,
+            group_commit_enabled: false,
+            group_commit_window_ms: 2,
+            auto_checkpoint: false,
+            seed: 42,
+            keep_db: false,
+        }
+    }
+}
+
+/// One simulated file-change event touching a single chunk.
+#[derive(Debug, Clone)]
+struct ChangeEvent {
+    /// Index into the working set of chunk keys.
+    chunk_idx: usize,
+    /// Per-chunk version, incremented on each event for the same chunk
+    /// (assigned in generate_events).
+    version: u64,
+}
+
+/// Unit of work handed to the embedding workers.
+#[derive(Debug, Clone)]
+struct EmbedJob {
+    /// Chunk this job embeds.
+    chunk_idx: usize,
+    /// Chunk version captured at enqueue time.
+    version: u64,
+    /// Instant the synchronous ("hot") part of the pipeline completed for
+    /// this event — presumably the baseline for vector-freshness timing
+    /// (cf. BenchResult::vector_freshness_ns); confirm at the consumer.
+    hot_done_at: Instant,
+}
+
+/// Counters describing embed-queue behavior over a run.
+#[derive(Debug, Default)]
+struct QueueStats {
+    /// Total jobs pushed into the queue.
+    enqueued_jobs: u64,
+    /// Jobs superseded before embedding — inferred from the per-chunk
+    /// pending map (a newer event replaces an older pending job); verify
+    /// against the enqueue site.
+    replaced_jobs: u64,
+    /// Largest queue depth observed by sample_depth.
+    max_depth: usize,
+    /// Running sum of sampled depths; divide by depth_samples for the mean.
+    depth_sum: u128,
+    /// Number of depth samples taken.
+    depth_samples: u64,
+}
+
+/// Shared state of the embedding work queue. Jobs are keyed by chunk so a
+/// newer change event can replace a still-pending job for the same chunk.
+///
+/// NOTE: the generic parameters below were lost to markup stripping in the
+/// original patch text and have been reconstructed: the map is keyed by
+/// chunk index (sample_depth uses pending_by_chunk.len() as queue depth)
+/// and holds EmbedJob values; `order` tracks chunk indices FIFO.
+#[derive(Debug)]
+struct EmbedQueueState {
+    /// Latest pending job per chunk index.
+    pending_by_chunk: HashMap<usize, EmbedJob>,
+    /// FIFO of chunk indices awaiting embedding.
+    order: VecDeque<usize>,
+    /// Set when producers finish; lets workers drain and exit.
+    closed: bool,
+    /// Depth/replacement counters for reporting.
+    stats: QueueStats,
+}
+
+impl EmbedQueueState {
+    /// Creates an empty queue with both containers pre-sized to `capacity`
+    /// (callers pass the working-set size to avoid rehash/regrow churn).
+    fn new(capacity: usize) -> Self {
+        Self {
+            pending_by_chunk: HashMap::with_capacity(capacity),
+            order: VecDeque::with_capacity(capacity),
+            closed: false,
+            stats: QueueStats::default(),
+        }
+    }
+
+    /// Records the current queue depth (number of distinct pending chunks)
+    /// into the running stats: max, sum, and sample count for a later mean.
+    fn sample_depth(&mut self) {
+        let depth = self.pending_by_chunk.len();
+        self.stats.max_depth = self.stats.max_depth.max(depth);
+        self.stats.depth_sum += depth as u128;
+        self.stats.depth_samples += 1;
+    }
+}
+
+/// A freshly built single-file DB plus the schema ids and node ids the
+/// benchmark needs. Holds the temp dir so the DB files outlive the run.
+///
+/// NOTE: the generic parameters on `db` and `node_ids` were lost to markup
+/// stripping in the original patch text; reconstructed from setup_fixture,
+/// which wraps open_single_file's SingleFileDB in Arc::new and collects
+/// create_nodes_batch ids.
+struct DbFixture {
+    /// Shared handle to the open database.
+    db: Arc<SingleFileDB>,
+    /// Node id per working-set chunk, index-aligned with chunk_idx.
+    node_ids: Vec<NodeId>,
+    /// Edge type for the synthetic "REL" edges.
+    etype_rel: ETypeId,
+    node_rev_key: PropKeyId,
+    node_scip_rev_key: PropKeyId,
+    edge_weight_key: PropKeyId,
+    /// Property key backing the node vector store.
+    vector_key: PropKeyId,
+    db_path: PathBuf,
+    /// Kept alive so the temp dir (and DB files) persist until drop.
+    temp_dir: tempfile::TempDir,
+}
+
+/// Aggregated measurements for one benchmark mode.
+///
+/// NOTE: the element types of the two sample vectors were lost to markup
+/// stripping in the original patch text; reconstructed as Vec<u128> to
+/// match percentile_ns(&[u128], ..) and format_latency_ns(u128).
+#[derive(Debug, Default)]
+struct BenchResult {
+    /// Mode label used for reporting.
+    mode: &'static str,
+    /// Events processed.
+    changes: usize,
+    /// Vector writes actually applied to the DB.
+    applied_vectors: usize,
+    /// Wall-clock time for the whole run.
+    total_elapsed: Duration,
+    /// Time spent in the synchronous ("hot") part of the pipeline.
+    hot_path_elapsed: Duration,
+    /// Per-event hot-path latency samples, in nanoseconds.
+    hot_path_ns: Vec<u128>,
+    /// Per-vector freshness samples (ns) — presumably the delay between
+    /// hot-path completion and the vector apply; confirm at the recorder.
+    vector_freshness_ns: Vec<u128>,
+    enqueued_jobs: u64,
+    replaced_jobs: u64,
+    queue_max_depth: usize,
+    /// Mean queue depth (depth_sum / depth_samples).
+    queue_avg_depth: f64,
+}
+
+/// Parses CLI flags into a BenchConfig.
+///
+/// Unknown flags are ignored; unparsable values silently keep the default
+/// (best-effort, appropriate for a benchmark harness). Size-like knobs are
+/// clamped to at least 1 afterwards so later division/batching can't hit
+/// zero.
+///
+/// NOTE: the generic parameter on `args` was lost to markup stripping in
+/// the original patch text; restored as Vec<String>.
+fn parse_args() -> BenchConfig {
+    let mut config = BenchConfig::default();
+    let args: Vec<String> = env::args().collect();
+    let mut i = 1;
+
+    while i < args.len() {
+        match args[i].as_str() {
+            "--mode" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.mode = match value.to_lowercase().as_str() {
+                        "sequential" => Mode::Sequential,
+                        "parallel" => Mode::Parallel,
+                        _ => Mode::Both,
+                    };
+                    i += 1; // consumed the value token
+                }
+            }
+            "--changes" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.changes = value.parse().unwrap_or(config.changes);
+                    i += 1;
+                }
+            }
+            "--working-set" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.working_set = value.parse().unwrap_or(config.working_set);
+                    i += 1;
+                }
+            }
+            "--vector-dims" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.vector_dims = value.parse().unwrap_or(config.vector_dims);
+                    i += 1;
+                }
+            }
+            "--tree-sitter-latency-ms" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.tree_sitter_latency_ms = value.parse().unwrap_or(config.tree_sitter_latency_ms);
+                    i += 1;
+                }
+            }
+            "--scip-latency-ms" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.scip_latency_ms = value.parse().unwrap_or(config.scip_latency_ms);
+                    i += 1;
+                }
+            }
+            "--embed-latency-ms" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.embed_latency_ms = value.parse().unwrap_or(config.embed_latency_ms);
+                    i += 1;
+                }
+            }
+            "--embed-batch-size" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.embed_batch_size = value.parse().unwrap_or(config.embed_batch_size);
+                    i += 1;
+                }
+            }
+            "--embed-flush-ms" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.embed_flush_ms = value.parse().unwrap_or(config.embed_flush_ms);
+                    i += 1;
+                }
+            }
+            "--embed-inflight" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.embed_inflight = value.parse().unwrap_or(config.embed_inflight);
+                    i += 1;
+                }
+            }
+            "--vector-apply-batch-size" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.vector_apply_batch_size = value.parse().unwrap_or(config.vector_apply_batch_size);
+                    i += 1;
+                }
+            }
+            "--wal-size" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.wal_size = value.parse().unwrap_or(config.wal_size);
+                    i += 1;
+                }
+            }
+            "--sync-mode" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.sync_mode = match value.to_lowercase().as_str() {
+                        "full" => SyncMode::Full,
+                        "off" => SyncMode::Off,
+                        _ => SyncMode::Normal,
+                    };
+                    i += 1;
+                }
+            }
+            // Boolean flags take no value token.
+            "--group-commit-enabled" => {
+                config.group_commit_enabled = true;
+            }
+            "--group-commit-window-ms" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.group_commit_window_ms = value.parse().unwrap_or(config.group_commit_window_ms);
+                    i += 1;
+                }
+            }
+            "--auto-checkpoint" => {
+                config.auto_checkpoint = true;
+            }
+            "--seed" => {
+                if let Some(value) = args.get(i + 1) {
+                    config.seed = value.parse().unwrap_or(config.seed);
+                    i += 1;
+                }
+            }
+            "--keep-db" => {
+                config.keep_db = true;
+            }
+            _ => {}
+        }
+        i += 1;
+    }
+
+    // Clamp size-like knobs so batching / indexing math never sees zero.
+    if config.changes == 0 {
+        config.changes = 1;
+    }
+    if config.working_set == 0 {
+        config.working_set = 1;
+    }
+    if config.vector_dims == 0 {
+        config.vector_dims = 1;
+    }
+    if config.embed_batch_size == 0 {
+        config.embed_batch_size = 1;
+    }
+    if config.embed_inflight == 0 {
+        config.embed_inflight = 1;
+    }
+    if config.vector_apply_batch_size == 0 {
+        config.vector_apply_batch_size = 1;
+    }
+
+    config
+}
+
+/// Produces `config.changes` change events over a working set of
+/// `config.working_set` chunks, deterministically from `config.seed`.
+/// Each event bumps a per-chunk version counter, so later events for the
+/// same chunk carry a strictly higher version.
+///
+/// NOTE: the return type's generic parameter was lost to markup stripping
+/// in the original patch text; restored as Vec<ChangeEvent>.
+fn generate_events(config: &BenchConfig) -> Vec<ChangeEvent> {
+    let mut rng = StdRng::seed_from_u64(config.seed);
+    let mut versions = vec![0u64; config.working_set];
+    let mut events = Vec::with_capacity(config.changes);
+
+    for _ in 0..config.changes {
+        let chunk_idx = rng.gen_range(0..config.working_set);
+        versions[chunk_idx] += 1;
+        events.push(ChangeEvent {
+            chunk_idx,
+            version: versions[chunk_idx],
+        });
+    }
+
+    events
+}
+
+/// Renders `count` operations over `elapsed` as a human-readable rate:
+/// "1.23M/s", "4.56K/s", or "7.89/s". Returns "n/a" when the duration is
+/// zero (or otherwise non-positive), avoiding a division by zero.
+fn format_rate(count: usize, elapsed: Duration) -> String {
+    let secs = elapsed.as_secs_f64();
+    if secs <= 0.0 {
+        return "n/a".to_string();
+    }
+    let per_sec = count as f64 / secs;
+    match per_sec {
+        r if r >= 1_000_000.0 => format!("{:.2}M/s", r / 1_000_000.0),
+        r if r >= 1_000.0 => format!("{:.2}K/s", r / 1_000.0),
+        r => format!("{r:.2}/s"),
+    }
+}
+
+/// Formats a nanosecond latency with an adaptive unit:
+/// <1us as "Nns", <1ms as "N.NNus", <1s as "N.NNms", else "N.NNs".
+fn format_latency_ns(ns: u128) -> String {
+    const US: u128 = 1_000;
+    const MS: u128 = 1_000_000;
+    const S: u128 = 1_000_000_000;
+    match ns {
+        n if n < US => format!("{n}ns"),
+        n if n < MS => format!("{:.2}us", n as f64 / US as f64),
+        n if n < S => format!("{:.2}ms", n as f64 / MS as f64),
+        n => format!("{:.2}s", n as f64 / S as f64),
+    }
+}
+
+/// Returns the sample at the given `percentile` (0.0..=1.0), or 0 for an
+/// empty slice. Uses the simple floor(len * p) index, clamped to the last
+/// element; sorts a private copy so the caller's slice order is untouched.
+fn percentile_ns(samples: &[u128], percentile: f64) -> u128 {
+    let mut sorted = samples.to_vec();
+    if sorted.is_empty() {
+        return 0;
+    }
+    sorted.sort_unstable();
+    let last = sorted.len() - 1;
+    let idx = ((sorted.len() as f64) * percentile).floor() as usize;
+    sorted[idx.min(last)]
+}
+
/// Build a fresh single-file DB populated with the benchmark working set.
///
/// Layout: `working_set` nodes keyed "chunk:{idx}", each linked to its
/// successor (wrapping at the end) by one REL edge, plus a vector store for
/// the "embedding" property. Nodes and edges are created in bulk batches so
/// setup stays fast for large working sets. The DB file lives in a tempdir;
/// `label` only disambiguates the file name between benchmark modes.
fn setup_fixture(config: &BenchConfig, label: &str) -> DbFixture {
    let temp_dir = tempdir().expect("expected value");
    let db_path = temp_dir
        .path()
        .join(format!("index-pipeline-{label}.kitedb"));

    // WAL/commit knobs come straight from the CLI config so both modes run
    // under identical durability settings.
    let open_opts = SingleFileOpenOptions::new()
        .wal_size(config.wal_size)
        .sync_mode(config.sync_mode)
        .group_commit_enabled(config.group_commit_enabled)
        .group_commit_window_ms(config.group_commit_window_ms)
        .auto_checkpoint(config.auto_checkpoint);

    let db = open_single_file(&db_path, open_opts).expect("expected value");
    let db = Arc::new(db);

    // Schema: one edge type plus the property keys the pipeline writes.
    db.begin(false).expect("expected value");
    let etype_rel = db.define_etype("REL").expect("expected value");
    let node_rev_key = db.define_propkey("rev").expect("expected value");
    let node_scip_rev_key = db.define_propkey("scip_rev").expect("expected value");
    let edge_weight_key = db.define_propkey("weight").expect("expected value");
    let vector_key = db.define_propkey("embedding").expect("expected value");
    db.commit().expect("expected value");

    // Create nodes in bulk transactions of 5k keys each.
    let mut node_ids = Vec::with_capacity(config.working_set);
    let create_batch = 5000usize;
    for start in (0..config.working_set).step_by(create_batch) {
        let end = (start + create_batch).min(config.working_set);
        db.begin_bulk().expect("expected value");
        let mut keys = Vec::with_capacity(end - start);
        for idx in start..end {
            keys.push(format!("chunk:{idx}"));
        }
        let key_refs: Vec<Option<&str>> = keys.iter().map(|k| Some(k.as_str())).collect();
        let ids = db.create_nodes_batch(&key_refs).expect("expected value");
        node_ids.extend(ids);
        db.commit().expect("expected value");
    }

    // Ring topology: node i -> node (i + 1) % n, 10k edges per transaction.
    let edge_batch = 10_000usize;
    for start in (0..config.working_set).step_by(edge_batch) {
        let end = (start + edge_batch).min(config.working_set);
        db.begin_bulk().expect("expected value");
        let mut edges = Vec::with_capacity(end - start);
        for idx in start..end {
            let src = node_ids[idx];
            let dst = node_ids[(idx + 1) % node_ids.len()];
            edges.push((src, etype_rel, dst));
        }
        db.add_edges_batch(&edges).expect("expected value");
        db.commit().expect("expected value");
    }

    // Ensure a vector store exists at the configured dimensionality before
    // any set_node_vector calls.
    db.vector_store_or_create(vector_key, config.vector_dims)
        .expect("expected value");

    DbFixture {
        db,
        node_ids,
        etype_rel,
        node_rev_key,
        node_scip_rev_key,
        edge_weight_key,
        vector_key,
        db_path,
        // Held so the tempdir outlives the fixture (unless keep_db leaks it).
        temp_dir,
    }
}
+
+fn apply_graph_change_ts_tx(fixture: &DbFixture, event: &ChangeEvent) {
+ let src = fixture.node_ids[event.chunk_idx];
+ let dst = fixture.node_ids[(event.chunk_idx + 1) % fixture.node_ids.len()];
+
+ fixture.db.begin(false).expect("expected value");
+ fixture
+ .db
+ .set_node_prop(
+ src,
+ fixture.node_rev_key,
+ PropValue::I64(event.version as i64),
+ )
+ .expect("expected value");
+ fixture
+ .db
+ .set_edge_prop(
+ src,
+ fixture.etype_rel,
+ dst,
+ fixture.edge_weight_key,
+ PropValue::F64((event.version % 1024) as f64 / 1024.0),
+ )
+ .expect("expected value");
+ fixture.db.commit().expect("expected value");
+}
+
+fn apply_graph_change_scip_tx(fixture: &DbFixture, event: &ChangeEvent) {
+ let src = fixture.node_ids[event.chunk_idx];
+
+ fixture.db.begin(false).expect("expected value");
+ fixture
+ .db
+ .set_node_prop(
+ src,
+ fixture.node_scip_rev_key,
+ PropValue::I64(event.version as i64),
+ )
+ .expect("expected value");
+ fixture.db.commit().expect("expected value");
+}
+
+fn apply_graph_change_unified_tx(fixture: &DbFixture, event: &ChangeEvent) {
+ let src = fixture.node_ids[event.chunk_idx];
+ let dst = fixture.node_ids[(event.chunk_idx + 1) % fixture.node_ids.len()];
+
+ fixture.db.begin(false).expect("expected value");
+ fixture
+ .db
+ .set_node_prop(
+ src,
+ fixture.node_rev_key,
+ PropValue::I64(event.version as i64),
+ )
+ .expect("expected value");
+ fixture
+ .db
+ .set_node_prop(
+ src,
+ fixture.node_scip_rev_key,
+ PropValue::I64(event.version as i64),
+ )
+ .expect("expected value");
+ fixture
+ .db
+ .set_edge_prop(
+ src,
+ fixture.etype_rel,
+ dst,
+ fixture.edge_weight_key,
+ PropValue::F64((event.version % 1024) as f64 / 1024.0),
+ )
+ .expect("expected value");
+ fixture.db.commit().expect("expected value");
+}
+
+fn apply_vector_batch(
+ fixture: &DbFixture,
+ dims: usize,
+ jobs: &[EmbedJob],
+ freshness_samples: &mut Vec,
+) {
+ if jobs.is_empty() {
+ return;
+ }
+
+ fixture.db.begin(false).expect("expected value");
+ for job in jobs {
+ let node_id = fixture.node_ids[job.chunk_idx];
+ let value = (job.version % 1024) as f32 / 1024.0;
+ let vector = vec![value; dims];
+ fixture
+ .db
+ .set_node_vector(node_id, fixture.vector_key, &vector)
+ .expect("expected value");
+ }
+ fixture.db.commit().expect("expected value");
+
+ let now = Instant::now();
+ for job in jobs {
+ freshness_samples.push(now.duration_since(job.hot_done_at).as_nanos());
+ }
+}
+
/// Baseline mode: every pipeline stage runs inline on a single thread.
///
/// Per change event: simulated tree-sitter parse, ts graph tx, simulated
/// SCIP pass, scip graph tx (hot path ends here), then simulated embedding
/// and the vector write, all back-to-back. Vector freshness is therefore
/// bounded below by the embed latency, and the hot path absorbs every
/// upstream stage's cost.
fn run_sequential(config: &BenchConfig, events: &[ChangeEvent]) -> BenchResult {
    let fixture = setup_fixture(config, "sequential");
    let run_start = Instant::now();
    let mut hot_path_ns = Vec::with_capacity(events.len());
    let mut vector_freshness_ns = Vec::with_capacity(events.len());
    let ts_sleep = Duration::from_millis(config.tree_sitter_latency_ms);
    let scip_sleep = Duration::from_millis(config.scip_latency_ms);
    let embed_sleep = Duration::from_millis(config.embed_latency_ms);
    // When the last hot-path write finished; lets hot_path_elapsed exclude
    // the trailing embed/vector work of the final event.
    let mut last_hot_done = run_start;

    for event in events {
        let op_start = Instant::now();
        // Simulated parser latencies run one after the other in this mode.
        if config.tree_sitter_latency_ms > 0 {
            thread::sleep(ts_sleep);
        }
        apply_graph_change_ts_tx(&fixture, event);
        if config.scip_latency_ms > 0 {
            thread::sleep(scip_sleep);
        }
        apply_graph_change_scip_tx(&fixture, event);
        let hot_done = Instant::now();
        last_hot_done = hot_done;
        hot_path_ns.push(hot_done.duration_since(op_start).as_nanos());

        // Embedding + vector apply happen inline, one job per event.
        if config.embed_latency_ms > 0 {
            thread::sleep(embed_sleep);
        }
        let job = EmbedJob {
            chunk_idx: event.chunk_idx,
            version: event.version,
            hot_done_at: hot_done,
        };
        apply_vector_batch(
            &fixture,
            config.vector_dims,
            &[job],
            &mut vector_freshness_ns,
        );
    }

    let total_elapsed = run_start.elapsed();
    let hot_path_elapsed = last_hot_done.duration_since(run_start);

    if config.keep_db {
        println!("Sequential DB kept at: {}", fixture.db_path.display());
        // Leak the tempdir guard so the DB file survives for inspection.
        std::mem::forget(fixture.temp_dir);
    }

    // Close cleanly only if we hold the last Arc reference — we should,
    // since this mode spawns no worker threads.
    if let Ok(db) = Arc::try_unwrap(fixture.db) {
        close_single_file(db).expect("expected value");
    } else {
        println!("Warning: failed to unwrap DB Arc; skipping explicit close");
    }

    BenchResult {
        mode: "sequential",
        changes: events.len(),
        applied_vectors: vector_freshness_ns.len(),
        total_elapsed,
        hot_path_elapsed,
        hot_path_ns,
        vector_freshness_ns,
        // Queue stats stay at their defaults: no queue exists in this mode.
        ..BenchResult::default()
    }
}
+
+fn enqueue_job(
+ queue: &Arc<(Mutex, Condvar)>,
+ chunk_capacity: usize,
+ job: EmbedJob,
+) {
+ let (lock, cv) = &**queue;
+ let mut state = lock.lock().expect("expected value");
+
+ if state.pending_by_chunk.capacity() == 0 {
+ state.pending_by_chunk.reserve(chunk_capacity);
+ }
+
+ state.stats.enqueued_jobs += 1;
+ let chunk_idx = job.chunk_idx;
+ if state.pending_by_chunk.insert(chunk_idx, job).is_some() {
+ state.stats.replaced_jobs += 1;
+ } else {
+ state.order.push_back(chunk_idx);
+ }
+ state.sample_depth();
+ cv.notify_one();
+}
+
/// Blocking batch pop used by embed worker threads.
///
/// Waits until at least one job is queued (or the queue is closed); then, if
/// a flush window is configured and fewer than `batch_size` jobs are
/// pending, waits up to `flush_window` more to let a fuller batch
/// accumulate. Drains up to `batch_size` coalesced jobs in FIFO chunk
/// order. Returns `None` once the queue is closed and fully drained.
fn take_embed_batch(
    queue: &Arc<(Mutex<EmbedQueueState>, Condvar)>,
    batch_size: usize,
    flush_window: Duration,
) -> Option<Vec<EmbedJob>> {
    let (lock, cv) = &**queue;
    let mut state = lock.lock().expect("expected value");

    loop {
        // Sleep until there is work or the producer signals shutdown.
        while state.order.is_empty() && !state.closed {
            state = cv.wait(state).expect("expected value");
        }

        if state.order.is_empty() && state.closed {
            return None;
        }

        // Batch-building grace period: give producers a chance to top up a
        // partial batch before we commit to draining it. A single timed wait
        // only — spurious wakeups just shorten the window.
        if !flush_window.is_zero() && state.order.len() < batch_size && !state.closed {
            let (next_state, _) = cv
                .wait_timeout(state, flush_window)
                .expect("expected value");
            state = next_state;
            if state.order.is_empty() && state.closed {
                return None;
            }
        }

        let mut batch = Vec::with_capacity(batch_size);
        while batch.len() < batch_size {
            let Some(chunk_idx) = state.order.pop_front() else {
                break;
            };
            // Defensive: every queued chunk should have a pending entry; a
            // missing one is skipped rather than panicking.
            if let Some(job) = state.pending_by_chunk.remove(&chunk_idx) {
                batch.push(job);
                state.sample_depth();
            }
        }

        if !batch.is_empty() {
            return Some(batch);
        }

        if state.closed {
            return None;
        }
    }
}
+
/// Pipelined mode: parse, graph write, embedding, and vector apply overlap.
///
/// Thread layout:
///   - this thread: per event, sleep max(ts, scip) latency (the two parsers
///     are modeled as running concurrently), run one unified graph tx (the
///     hot path), then enqueue the embed job without blocking on it;
///   - `embed_inflight` workers: pop coalesced batches from the queue, sleep
///     the simulated embed latency, forward finished batches on a channel;
///   - one writer thread: accumulates finished batches and applies vectors
///     in transactions of `vector_apply_batch_size`, recording freshness.
fn run_parallel(config: &BenchConfig, events: &[ChangeEvent]) -> BenchResult {
    let fixture = setup_fixture(config, "parallel");
    let run_start = Instant::now();
    let mut hot_path_ns = Vec::with_capacity(events.len());
    let ts_sleep = Duration::from_millis(config.tree_sitter_latency_ms);
    let scip_sleep = Duration::from_millis(config.scip_latency_ms);
    let embed_sleep = Duration::from_millis(config.embed_latency_ms);
    let embed_flush = Duration::from_millis(config.embed_flush_ms);
    let mut last_hot_done = run_start;

    // Coalescing job queue shared between this thread and the embed workers.
    let queue = Arc::new((
        Mutex::new(EmbedQueueState::new(config.working_set)),
        Condvar::new(),
    ));
    let (result_tx, result_rx): (Sender<Vec<EmbedJob>>, Receiver<Vec<EmbedJob>>) = unbounded();

    // Embed workers: simulate embedding latency per batch, then hand the
    // batch to the writer thread.
    let mut embed_handles = Vec::with_capacity(config.embed_inflight);
    for _ in 0..config.embed_inflight {
        let queue = Arc::clone(&queue);
        let tx = result_tx.clone();
        let batch_size = config.embed_batch_size;
        let embed_sleep = embed_sleep;
        let embed_flush = embed_flush;
        embed_handles.push(thread::spawn(move || {
            while let Some(batch) = take_embed_batch(&queue, batch_size, embed_flush) {
                if !embed_sleep.is_zero() {
                    thread::sleep(embed_sleep);
                }
                // Writer gone means shutdown; just exit.
                if tx.send(batch).is_err() {
                    return;
                }
            }
        }));
    }
    // Drop the original sender so the channel closes (and the writer's loop
    // ends) once every worker has exited.
    drop(result_tx);

    // Writer thread: batches vector writes into fixed-size transactions and
    // measures freshness at commit time. Returns (samples, applied count).
    let writer_db = Arc::clone(&fixture.db);
    let writer_node_ids = fixture.node_ids.clone();
    let vector_key = fixture.vector_key;
    let dims = config.vector_dims;
    let apply_batch_size = config.vector_apply_batch_size;
    let writer_handle = thread::spawn(move || {
        let mut apply_buffer: Vec<EmbedJob> = Vec::with_capacity(apply_batch_size * 2);
        let mut freshness = Vec::new();
        let mut applied = 0usize;

        for mut batch in result_rx {
            apply_buffer.append(&mut batch);
            // Flush in full apply-batch-size transactions while we can.
            while apply_buffer.len() >= apply_batch_size {
                let chunk: Vec<EmbedJob> = apply_buffer.drain(..apply_batch_size).collect();
                writer_db.begin(false).expect("expected value");
                for job in &chunk {
                    let node_id = writer_node_ids[job.chunk_idx];
                    let value = (job.version % 1024) as f32 / 1024.0;
                    let vector = vec![value; dims];
                    writer_db
                        .set_node_vector(node_id, vector_key, &vector)
                        .expect("expected value");
                }
                writer_db.commit().expect("expected value");
                let now = Instant::now();
                for job in &chunk {
                    freshness.push(now.duration_since(job.hot_done_at).as_nanos());
                }
                applied += chunk.len();
            }
        }

        // Channel closed: flush whatever partial batch remains.
        if !apply_buffer.is_empty() {
            writer_db.begin(false).expect("expected value");
            for job in &apply_buffer {
                let node_id = writer_node_ids[job.chunk_idx];
                let value = (job.version % 1024) as f32 / 1024.0;
                let vector = vec![value; dims];
                writer_db
                    .set_node_vector(node_id, vector_key, &vector)
                    .expect("expected value");
            }
            writer_db.commit().expect("expected value");
            let now = Instant::now();
            for job in &apply_buffer {
                freshness.push(now.duration_since(job.hot_done_at).as_nanos());
            }
            applied += apply_buffer.len();
        }

        (freshness, applied)
    });

    // Hot-path loop on this thread.
    for event in events {
        let op_start = Instant::now();
        if config.tree_sitter_latency_ms > 0 || config.scip_latency_ms > 0 {
            // Parsers are modeled as concurrent: pay only the slower one.
            let parse_parallel_sleep = ts_sleep.max(scip_sleep);
            thread::sleep(parse_parallel_sleep);
        }
        apply_graph_change_unified_tx(&fixture, event);
        let hot_done = Instant::now();
        last_hot_done = hot_done;
        hot_path_ns.push(hot_done.duration_since(op_start).as_nanos());

        // Hand the embedding work off; the hot path does not wait for it.
        enqueue_job(
            &queue,
            config.working_set,
            EmbedJob {
                chunk_idx: event.chunk_idx,
                version: event.version,
                hot_done_at: hot_done,
            },
        );
    }

    // Shutdown: close the queue, wake all workers so they drain and exit.
    {
        let (lock, cv) = &*queue;
        let mut state = lock.lock().expect("expected value");
        state.closed = true;
        cv.notify_all();
    }

    for handle in embed_handles {
        handle.join().expect("expected value");
    }

    // Writer finishes once all worker senders are dropped.
    let (vector_freshness_ns, applied_vectors) = writer_handle.join().expect("expected value");
    let total_elapsed = run_start.elapsed();
    let hot_path_elapsed = last_hot_done.duration_since(run_start);

    // Snapshot queue statistics now that all threads are done.
    let (enqueued_jobs, replaced_jobs, queue_max_depth, queue_avg_depth) = {
        let (lock, _) = &*queue;
        let state = lock.lock().expect("expected value");
        // max(1) guards the division when no depth samples were taken.
        let samples = state.stats.depth_samples.max(1);
        (
            state.stats.enqueued_jobs,
            state.stats.replaced_jobs,
            state.stats.max_depth,
            state.stats.depth_sum as f64 / samples as f64,
        )
    };

    if config.keep_db {
        println!("Parallel DB kept at: {}", fixture.db_path.display());
        // Leak the tempdir guard so the DB file survives for inspection.
        std::mem::forget(fixture.temp_dir);
    }

    // All threads have joined, so this should be the last Arc reference.
    if let Ok(db) = Arc::try_unwrap(fixture.db) {
        close_single_file(db).expect("expected value");
    } else {
        println!("Warning: failed to unwrap DB Arc; skipping explicit close");
    }

    BenchResult {
        mode: "parallel",
        changes: events.len(),
        applied_vectors,
        total_elapsed,
        hot_path_elapsed,
        hot_path_ns,
        vector_freshness_ns,
        enqueued_jobs,
        replaced_jobs,
        queue_max_depth,
        queue_avg_depth,
    }
}
+
+fn print_result(result: &BenchResult) {
+ let hot_p50 = percentile_ns(&result.hot_path_ns, 0.50);
+ let hot_p95 = percentile_ns(&result.hot_path_ns, 0.95);
+ let hot_p99 = percentile_ns(&result.hot_path_ns, 0.99);
+ let fresh_p50 = percentile_ns(&result.vector_freshness_ns, 0.50);
+ let fresh_p95 = percentile_ns(&result.vector_freshness_ns, 0.95);
+ let fresh_p99 = percentile_ns(&result.vector_freshness_ns, 0.99);
+ let hot_rate = format_rate(result.changes, result.hot_path_elapsed);
+ let end_to_end_rate = format_rate(result.changes, result.total_elapsed);
+
+ println!("\n--- {} ---", result.mode);
+ println!("Changes: {}", result.changes);
+ println!("Vectors applied: {}", result.applied_vectors);
+ println!(
+ "Hot path elapsed: {:.3}s",
+ result.hot_path_elapsed.as_secs_f64()
+ );
+ println!("Total elapsed: {:.3}s", result.total_elapsed.as_secs_f64());
+ println!("Hot path rate: {hot_rate}");
+ println!("End-to-end rate: {end_to_end_rate}");
+ println!(
+ "Hot path latency: p50={} p95={} p99={}",
+ format_latency_ns(hot_p50),
+ format_latency_ns(hot_p95),
+ format_latency_ns(hot_p99)
+ );
+ println!(
+ "Vector freshness: p50={} p95={} p99={}",
+ format_latency_ns(fresh_p50),
+ format_latency_ns(fresh_p95),
+ format_latency_ns(fresh_p99)
+ );
+
+ if result.mode == "parallel" {
+ let replace_rate = if result.enqueued_jobs > 0 {
+ (result.replaced_jobs as f64 / result.enqueued_jobs as f64) * 100.0
+ } else {
+ 0.0
+ };
+ println!(
+ "Queue: enqueued={} replaced={} ({replace_rate:.2}%) max_depth={} avg_depth={:.2}",
+ result.enqueued_jobs, result.replaced_jobs, result.queue_max_depth, result.queue_avg_depth
+ );
+ }
+}
+
+fn print_comparison(seq: &BenchResult, par: &BenchResult) {
+ let seq_hot_p95 = percentile_ns(&seq.hot_path_ns, 0.95);
+ let par_hot_p95 = percentile_ns(&par.hot_path_ns, 0.95);
+ let seq_fresh_p95 = percentile_ns(&seq.vector_freshness_ns, 0.95);
+ let par_fresh_p95 = percentile_ns(&par.vector_freshness_ns, 0.95);
+
+ let hot_gain = if par.hot_path_elapsed.as_nanos() > 0 {
+ seq.hot_path_elapsed.as_secs_f64() / par.hot_path_elapsed.as_secs_f64()
+ } else {
+ 0.0
+ };
+ let end_to_end_gain = if par.total_elapsed.as_nanos() > 0 {
+ seq.total_elapsed.as_secs_f64() / par.total_elapsed.as_secs_f64()
+ } else {
+ 0.0
+ };
+
+ println!("\n=== Comparison (sequential vs parallel) ===");
+ println!("Hot path elapsed speedup: {hot_gain:.2}x");
+ println!("End-to-end elapsed speedup: {end_to_end_gain:.2}x");
+ println!(
+ "Hot p95: {} -> {}",
+ format_latency_ns(seq_hot_p95),
+ format_latency_ns(par_hot_p95)
+ );
+ println!(
+ "Freshness p95: {} -> {}",
+ format_latency_ns(seq_fresh_p95),
+ format_latency_ns(par_fresh_p95)
+ );
+}
+
+fn main() {
+ let config = parse_args();
+ let events = generate_events(&config);
+
+ println!("==================================================================");
+ println!("Index Pipeline Hypothesis Benchmark");
+ println!("==================================================================");
+ println!("Mode: {:?}", config.mode);
+ println!("Changes: {}", config.changes);
+ println!("Working set: {}", config.working_set);
+ println!("Vector dims: {}", config.vector_dims);
+ println!(
+ "Parse latency: tree-sitter={}ms scip={}ms",
+ config.tree_sitter_latency_ms, config.scip_latency_ms
+ );
+ println!("Embed latency: {}ms per batch", config.embed_latency_ms);
+ println!(
+ "Embed batching: size={} flush={}ms inflight={}",
+ config.embed_batch_size, config.embed_flush_ms, config.embed_inflight
+ );
+ println!(
+ "Vector apply batch size: {}",
+ config.vector_apply_batch_size
+ );
+ println!("WAL size: {} bytes", config.wal_size);
+ println!("Sync mode: {:?}", config.sync_mode);
+ println!(
+ "Group commit: {} (window {}ms)",
+ config.group_commit_enabled, config.group_commit_window_ms
+ );
+ println!("Auto-checkpoint: {}", config.auto_checkpoint);
+ println!("Seed: {}", config.seed);
+ println!("==================================================================");
+
+ let mut seq_result: Option