diff --git a/.cargo/.package-cache b/.cargo/.package-cache new file mode 100644 index 0000000..e69de29 diff --git a/.cargo/git/CACHEDIR.TAG b/.cargo/git/CACHEDIR.TAG new file mode 100644 index 0000000..20d7c31 --- /dev/null +++ b/.cargo/git/CACHEDIR.TAG @@ -0,0 +1,3 @@ +Signature: 8a477f597d28d172789f06886806bc55 +# This file is a cache directory tag created by cargo. +# For information about cache directory tags see https://bford.info/cachedir/ diff --git a/.cargo/git/db/malachite-ea1aa1be87e998cc/HEAD b/.cargo/git/db/malachite-ea1aa1be87e998cc/HEAD new file mode 100644 index 0000000..cb089cd --- /dev/null +++ b/.cargo/git/db/malachite-ea1aa1be87e998cc/HEAD @@ -0,0 +1 @@ +ref: refs/heads/master diff --git a/.cargo/git/db/malachite-ea1aa1be87e998cc/config b/.cargo/git/db/malachite-ea1aa1be87e998cc/config new file mode 100644 index 0000000..aa612a5 --- /dev/null +++ b/.cargo/git/db/malachite-ea1aa1be87e998cc/config @@ -0,0 +1,6 @@ +[core] + bare = true + repositoryformatversion = 0 + filemode = true + ignorecase = true + precomposeunicode = true diff --git a/.cargo/git/db/malachite-ea1aa1be87e998cc/description b/.cargo/git/db/malachite-ea1aa1be87e998cc/description new file mode 100644 index 0000000..498b267 --- /dev/null +++ b/.cargo/git/db/malachite-ea1aa1be87e998cc/description @@ -0,0 +1 @@ +Unnamed repository; edit this file 'description' to name the repository. diff --git a/.cargo/git/db/malachite-ea1aa1be87e998cc/hooks/README.sample b/.cargo/git/db/malachite-ea1aa1be87e998cc/hooks/README.sample new file mode 100755 index 0000000..d125ec8 --- /dev/null +++ b/.cargo/git/db/malachite-ea1aa1be87e998cc/hooks/README.sample @@ -0,0 +1,5 @@ +#!/bin/sh +# +# Place appropriately named executable hook scripts into this directory +# to intercept various actions that git takes. See `git help hooks` for +# more information. 
diff --git a/.cargo/git/db/malachite-ea1aa1be87e998cc/info/exclude b/.cargo/git/db/malachite-ea1aa1be87e998cc/info/exclude new file mode 100644 index 0000000..6d05881 --- /dev/null +++ b/.cargo/git/db/malachite-ea1aa1be87e998cc/info/exclude @@ -0,0 +1,2 @@ +# File patterns to ignore; see `git help ignore` for more information. +# Lines that start with '#' are comments. diff --git a/.cargo/registry/CACHEDIR.TAG b/.cargo/registry/CACHEDIR.TAG new file mode 100644 index 0000000..20d7c31 --- /dev/null +++ b/.cargo/registry/CACHEDIR.TAG @@ -0,0 +1,3 @@ +Signature: 8a477f597d28d172789f06886806bc55 +# This file is a cache directory tag created by cargo. +# For information about cache directory tags see https://bford.info/cachedir/ diff --git a/.clippy.toml b/.clippy.toml index 11be318..97a2599 100644 --- a/.clippy.toml +++ b/.clippy.toml @@ -1,4 +1,4 @@ avoid-breaking-exported-api = false -msrv = "1.88.0" +msrv = "1.89" warn-on-all-wildcard-imports = true allow-unwrap-in-tests = true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8c51f3c..a8505f7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,6 +10,7 @@ on: env: CARGO_TERM_COLOR: always + CARGO_SORT_VERSION: 2.0.1 permissions: contents: read @@ -68,7 +69,7 @@ jobs: - name: Install cargo-sort (pinned) uses: taiki-e/install-action@v2 with: - tool: cargo-sort@2.0.1 + tool: cargo-sort@${{ env.CARGO_SORT_VERSION }} - name: Show cargo-sort version run: cargo sort --version @@ -271,3 +272,38 @@ jobs: name: coverage-report path: target/llvm-cov/html/ retention-days: 30 + + # ---------------------------------------------------------------------------- + # Security policy checks (advisories + deny) + # ---------------------------------------------------------------------------- + security: + name: Security + runs-on: ubuntu-latest + needs: [fmt, sort, clippy] + if: ${{ github.event_name != 'pull_request' || github.event.pull_request.draft == false }} + steps: + - uses: 
actions/checkout@v4 + with: { fetch-depth: 0 } + + - name: Setup Rust 1.89 + uses: dtolnay/rust-toolchain@stable + with: + toolchain: 1.89 + + - uses: Swatinem/rust-cache@v2 + + - name: Install cargo-audit + uses: taiki-e/install-action@v2 + with: + tool: cargo-audit + + - name: Install cargo-deny + uses: taiki-e/install-action@v2 + with: + tool: cargo-deny + + - name: Advisory audit (online) + run: make audit-online + + - name: Deny policy checks + run: make deny diff --git a/.gitignore b/.gitignore index 35f57e8..90d9894 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ debug/ ipc/ assets/genesis.json assets/jwtsecret +assets/p2p-keys/ .vscode .idea diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..94ce214 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,251 @@ +# Ultramarine Development Notes + +## Knowledge Base + +Technical reference documents in `docs/knowledge_base/`: + +| Document | Topic | +| ------------------------------------------------------------------ | ------------------------------------------- | +| [block-timing.md](docs/knowledge_base/block-timing.md) | Timestamp invariants, 1s minimum block time | +| [cl-el-head-gating.md](docs/knowledge_base/cl-el-head-gating.md) | FCU gate, CL/EL alignment, Engine API rules | +| [cl-runtime.md](docs/knowledge_base/cl-runtime.md) | Runtime flavor, logging, consensus timeouts | +| [el-gas-limits.md](docs/knowledge_base/el-gas-limits.md) | Builder/txpool config, gas limits | +| [el-persistence.md](docs/knowledge_base/el-persistence.md) | 1-slot finality, persistence threshold | +| [execution-genesis.md](docs/knowledge_base/execution-genesis.md) | Genesis bootstrap without HTTP RPC | +| [itest-node-harness.md](docs/knowledge_base/itest-node-harness.md) | Integration test harness | +| [p2p-sync-limits.md](docs/knowledge_base/p2p-sync-limits.md) | **P2P/sync size limits for load tests** | + +--- + +## DevOps Operations - MANDATORY RULES + +**CRITICAL**: Before performing ANY DevOps or operations 
tasks (deploy, wipe, restart, etc.), you MUST: + +1. **Read the documentation first**: + - `infra/README.md` - main operations guide + - `infra/Makefile` - all available targets and parameters + - Network manifest in `infra/manifests/<network>.yaml` + +2. **Use existing Makefile targets** - NEVER write custom SSH commands for operations: + ```bash + # Network operations (from infra/ directory) + make net-wipe NET=<network> WIPE_CONFIRM=YES # Wipe all state + make net-gen NET=<network> SECRETS_FILE=... # Generate artifacts + make net-deploy NET=<network> # Deploy to servers + make net-up NET=<network> # Start services + make net-down NET=<network> # Stop services + make net-roll NET=<network> ROLL_CONFIRM=YES # Rolling restart + make net-health NET=<network> # Health check + make net-redeploy NET=<network> # Gen + deploy + restart + ``` + +3. **Key parameters**: + - `NET=fibernet` - target network + - `WIPE_CONFIRM=YES` - required for destructive operations + - `SECRETS_FILE=infra/networks/<network>/secrets.sops.yaml` - for generation + - `LIMIT=<host>` - run only on specific host + - `WIPE_NODES=node-0,node-1` - wipe specific nodes only + +4. **Full network restart sequence**: + ```bash + cd infra + make net-wipe NET=fibernet WIPE_CONFIRM=YES + make net-gen NET=fibernet SECRETS_FILE=networks/fibernet/secrets.sops.yaml + make net-deploy NET=fibernet + make net-up NET=fibernet + make net-health NET=fibernet + ``` + +--- + +## Building Docker Images on macOS + +### Recommended Approach: `docker buildx` + +On macOS (especially Apple Silicon), use `docker buildx` with QEMU emulation instead of the Makefile's `cross` toolchain: + +```bash +# Build for linux/amd64 and push to Docker Hub +docker buildx build --platform linux/amd64 -t loadnetwork/ultramarine:TAG --push . + +# Example with fibernet tag +docker buildx build --platform linux/amd64 -t loadnetwork/ultramarine:fibernet --push .
+``` + +### Why NOT use `make docker-build-push` + +The Makefile target `docker-build-push` uses `cross` for cross-compilation which requires pulling the `ghcr.io/cross-rs/x86_64-unknown-linux-gnu:main` image. On Apple Silicon Macs, this can fail with: + +- "no match for platform in manifest" error +- QEMU emulation segfaults (exit code 139) + +### Alternative: Build on server + +If buildx is too slow or unreliable, transfer source to the target server and build natively: + +```bash +# Create tarball (excluding .git and target) +tar --exclude='.git' --exclude='target' --exclude='dist' -czf /tmp/ultramarine-src.tar.gz . + +# Transfer to server +scp /tmp/ultramarine-src.tar.gz user@server:/tmp/ + +# On server: extract and build +cd /tmp && tar -xzf ultramarine-src.tar.gz +docker build -t loadnetwork/ultramarine:TAG . +docker push loadnetwork/ultramarine:TAG +``` + +## ValueSync and Blob Pruning + +### Archive-Based Pruning Policy (Load Network) + +**IMPORTANT**: Load Network uses the **archive event as the boundary for blob pruning**, NOT the Ethereum DA window. + +Key differences from Ethereum: + +- **Ethereum**: Blobs pruned based on a time-based DA window (~18 days / 4096 epochs) +- **Load Network**: Blobs pruned only after successful archival to external storage + finality + +### What Gets Pruned vs. Retained Forever + +| Data Type | Retention Policy | +| ------------------------------ | ------------------------------------- | +| Blob bytes | Pruned after archive event + finality | +| Decided values | **Retained forever** | +| Certificates | **Retained forever** | +| Block data (execution payload) | **Retained forever** | +| BlobMetadata | **Retained forever** | +| Archive records/locators | **Retained forever** | + +### history_min_height Invariant + +The `get_earliest_height()` function returns `Height(0)` when genesis metadata exists. 
This is an invariant: + +- `history_min_height == 0` for all validators +- Ensures fullnodes can sync from genesis +- Validators advertise they can serve the complete chain history + +### MetadataOnly Sync Pattern for Archived Blobs + +When blobs have been pruned (archived), the sync mechanism uses `SyncedValuePackage::MetadataOnly`: + +1. `GetDecidedValue` returns `MetadataOnly` with execution payload when blobs are pruned +2. `process_synced_package` imports blocks from `MetadataOnly` if execution payload is present +3. Archive notices with locators are included so external consumers can fetch blob bytes from the archive provider + +This pattern ensures fullnodes can sync the complete chain even when blob bytes are no longer available locally. + +### Reference: Lighthouse Pattern + +The design follows Lighthouse's pattern where beacon blocks are kept forever, only blob sidecars are pruned: + +- `data_availability_checker.rs:518-536` - `blobs_required_for_epoch()` +- `network_context.rs:1367-1382` - Request type selection based on DA window +- `block_sidecar_coupling.rs:573-577` - Empty blob response handling + +In Load Network, we apply the same principle: consensus data (decided values, certificates, block data) is retained indefinitely, while blob bytes are pruned after archive verification. + +--- + +## P2P and Sync Size Limits (Critical for Load Tests) + +> **Full documentation**: [docs/knowledge_base/p2p-sync-limits.md](docs/knowledge_base/p2p-sync-limits.md) + +### Problem: Sync Stalls with Large Blocks + +During high-throughput load tests, blocks can grow to **~5-12 MB** (2026-02-10 baseline hot segment peaked near **11.6 MB**). 
Default P2P/sync limits are ~1-10 MB, causing: + +- Sync requests timeout (responses rejected at P2P layer) +- Nodes fall behind and can't catch up +- `WARN: Beacon client online, but no consensus updates received` + +### Required Config Values for Load Tests + +In `manifests/.yaml`: + +```yaml +sync: + enabled: true + max_request_size: "50 MiB" # Default 1 MiB - TOO SMALL for load tests + max_response_size: "500 MiB" # Default 10 MiB - increase for large blocks + request_timeout: "60s" # Default 30s - increase for large payloads + parallel_requests: 100 # For fast sync catch-up + +# P2P message size limits - CRITICAL for large blocks +p2p: + pubsub_max_size: "50 MiB" # Default 4 MiB - blocks can be 12+ MB + rpc_max_size: "100 MiB" # Default 10 MiB - must be > block size +``` + +### Manual Fix (without regenerating configs) + +If nodes are stuck, update configs on ALL hosts: + +```bash +# Create update script +cat > /tmp/update-sync-sizes.sh << 'EOF' +#!/bin/bash +for config in /var/lib/ultramarine/*/config/config.toml; do + sed -i.bak \ + -e 's/pubsub_max_size = ".*"/pubsub_max_size = "50 MiB"/g' \ + -e 's/rpc_max_size = ".*"/rpc_max_size = "100 MiB"/g' \ + -e 's/max_request_size = ".*"/max_request_size = "50 MiB"/g' \ + -e 's/max_response_size = ".*"/max_response_size = "500 MiB"/g' \ + "$config" +done +EOF + +# Run on ALL hosts, then restart +for host in LON2 AMS FRA2 RPC; do + scp /tmp/update-sync-sizes.sh ubuntu@$host:/tmp/ + ssh ubuntu@$host 'sudo bash /tmp/update-sync-sizes.sh' + ssh ubuntu@$host 'sudo systemctl restart ultramarine@*' +done +``` + +### Key Insight + +**ALL nodes** must have large limits - not just the receiver. The SENDER also needs high `rpc_max_size` to send large sync responses. + +### Validation + +Historical (2026-02-06): + +- P2P/sync sizing changes removed sync-stall class and enabled clean sharded runs in captured logs. 
+ +Latest canonical baseline (2026-02-10, PERF-SUMMARY): + +- 10k/host validation (60s): `1,472,250` submitted (`~24,537 TPS`), `0` errors. +- 20k/host probe (60s): `1,823,713` submitted (`~30,395 TPS`), `0` errors. +- 20k probe on-chain window (65 blocks): `1,824,625` tx / `109s` = `16,739.68 TPS`. +- Post-run health: `txpool pending=0, queued=0`, `eth_syncing=false` on all endpoints. + +See `docs/journal/PERF-SUMMARY-fibernet-throughput-journey.md` for baseline details and consolidated phase metrics. + +--- + +## Engine API Design Decisions (Consensus Correctness) + +These rules are consensus-critical for Ultramarine: + +1. **Engine API is the oracle; HTTP RPC is not.** + - `engine_forkchoiceUpdated` status (`VALID/INVALID/SYNCING`) is the only readiness signal. + - `eth_getBlockByNumber` must not be used for consensus gating. + +2. **Gate proposals and votes on FCU status.** + - Before proposing or voting: `FCU(head=CL decided, attrs=None)` must be `VALID`. + - If `SYNCING/INVALID` → treat proposal as invalid and **vote nil**. + +3. **Proposer flow (build)** + - `FCU(head, attrs=PayloadAttrs)` to start build. + - `engine_getPayload` after valid FCU with attrs. + - `ACCEPTED` is **not** a valid FCU status; treat it as an error. + +4. **Post-decision execution** + - After `Decided`, call `engine_newPayload` + `FCU` to drive EL. + - EL lag does not invalidate consensus decision; it only delays execution finalization. + +5. **Tendermint re-proposal requirement** + - Proposer must be able to re-serve the same payload for the same height/round. + - Store proposal payloads until height is decided. 
diff --git a/Cargo.lock b/Cargo.lock index 9efdae5..7d756f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1422,12 +1422,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bytesize" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bd91ee7b2422bcb158d90ef4d14f75ef67f340943fc4149891dcce8f8b972a3" - [[package]] name = "bzip2-sys" version = "0.1.13+1.0.8" @@ -2320,6 +2314,12 @@ dependencies = [ "syn 2.0.111", ] +[[package]] +name = "env_home" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe" + [[package]] name = "equivalent" version = "1.0.2" @@ -3352,7 +3352,7 @@ name = "informalsystems-malachitebft-config" version = "0.6.0-pre" source = "git+https://github.com/circlefin/malachite.git?rev=b205f4252f3064d9a74716056f63834ff33f2de9#b205f4252f3064d9a74716056f63834ff33f2de9" dependencies = [ - "bytesize 1.3.3", + "bytesize", "config", "humantime-serde", "informalsystems-malachitebft-core-types", @@ -6010,6 +6010,19 @@ dependencies = [ "syn 2.0.111", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.12.1", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "serdect" version = "0.2.0" @@ -6907,7 +6920,6 @@ name = "ultramarine-cli" version = "0.1.0" dependencies = [ "axum", - "bytesize 2.3.1", "clap", "color-eyre", "directories", @@ -6963,7 +6975,6 @@ dependencies = [ "tracing", "tree_hash", "ultramarine-blob-engine", - "ultramarine-cli", "ultramarine-execution", "ultramarine-test-support", "ultramarine-types", @@ -6977,6 +6988,7 @@ dependencies = [ "alloy-consensus", "alloy-eips", "alloy-network", + "alloy-primitives", "alloy-provider", "alloy-rpc-client", "alloy-rpc-types", @@ -6991,12 +7003,10 @@ 
dependencies = [ "ethereum_serde_utils", "ethereum_ssz", "hex", - "informalsystems-malachitebft-app-channel", "informalsystems-malachitebft-proto", "jsonwebtoken", "prost", "rand 0.8.5", - "redb", "reqwest", "serde", "serde_json", @@ -7004,19 +7014,57 @@ dependencies = [ "thiserror 2.0.17", "tokio", "tracing", - "ultramarine-cli", "ultramarine-types", "url", ] +[[package]] +name = "ultramarine-genesis" +version = "0.1.0" +dependencies = [ + "alloy-genesis", + "alloy-primitives", + "alloy-signer-local", + "bytes", + "chrono", + "color-eyre", + "serde_json", + "ultramarine-types", +] + +[[package]] +name = "ultramarine-netgen" +version = "0.1.0" +dependencies = [ + "bytesize", + "clap", + "color-eyre", + "hex", + "humantime", + "k256", + "rand 0.8.5", + "serde", + "serde_json", + "serde_yaml", + "sha2 0.10.9", + "toml 0.9.8", + "ultramarine-cli", + "ultramarine-genesis", + "ultramarine-types", + "which", +] + [[package]] name = "ultramarine-node" version = "0.1.0" dependencies = [ "alloy-consensus", + "alloy-eips", + "alloy-genesis", "alloy-primitives", "alloy-rpc-types-engine", "alloy-rpc-types-eth", + "alloy-trie", "async-trait", "bytes", "color-eyre", @@ -7036,6 +7084,7 @@ dependencies = [ "sha3", "thiserror 2.0.17", "tokio", + "tokio-util", "tracing", "ultramarine-blob-engine", "ultramarine-cli", @@ -7077,6 +7126,7 @@ dependencies = [ "ultramarine-cli", "ultramarine-consensus", "ultramarine-execution", + "ultramarine-genesis", "ultramarine-node", "ultramarine-test-support", "ultramarine-types", @@ -7102,7 +7152,6 @@ dependencies = [ "alloy-rpc-types", "alloy-rpc-types-engine", "alloy-rpc-types-eth", - "alloy-rpc-types-txpool", "async-trait", "bytes", "ethereum_hashing", @@ -7146,7 +7195,6 @@ dependencies = [ "alloy-rpc-types-txpool", "alloy-signer", "alloy-signer-local", - "alloy-transport-http", "c-kzg", "chrono", "clap", @@ -7159,6 +7207,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "ultramarine-genesis", "ultramarine-types", ] @@ -7196,6 
+7245,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "unsigned-varint" version = "0.7.2" @@ -7411,6 +7466,17 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "which" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3fabb953106c3c8eea8306e4393700d7657561cb43122571b172bbfb7c7ba1d" +dependencies = [ + "env_home", + "rustix", + "winsafe", +] + [[package]] name = "widestring" version = "1.2.1" @@ -7768,6 +7834,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winsafe" +version = "0.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" + [[package]] name = "wit-bindgen" version = "0.46.0" diff --git a/Cargo.toml b/Cargo.toml index 2b573e3..10d6ba4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,11 +7,13 @@ members = [ "crates/cli", "crates/consensus", "crates/execution", + "crates/genesis", "crates/node", "crates/test", "crates/test_support", "crates/types", "crates/utils", + "infra/gen/netgen", ] default-members = ["bin/ultramarine", "crates/utils"] @@ -53,11 +55,12 @@ alloy-rlp = "0.3.1" alloy-rpc-client = { version = "1.0.23", default-features = false, features = ["reqwest"] } alloy-rpc-types = { version = "1.0.23", features = ["eth"], default-features = false } alloy-rpc-types-engine = { version = "1.0.23", features = ["serde", "ssz"], default-features = false } -alloy-rpc-types-eth = { version = "1.0.23", default-features = false } +alloy-rpc-types-eth = { version = "1.0.23", features = ["serde"], default-features = false } alloy-rpc-types-txpool = { version = "1.0.23" } alloy-signer = { version = "1.0.23" } alloy-signer-local = { version = "1.0.23", features = ["keystore", 
"mnemonic"] } alloy-transport-http = { version = "1.0.23", default-features = false, features = ["reqwest", "reqwest-rustls-tls"] } +alloy-trie = { version = "0.9.0", features = ["ethereum"] } async-trait = "0.1.85" axum = "0.8.4" @@ -99,6 +102,7 @@ signature = "2.2.0" tempfile = "3.13.0" thiserror = { version = "2.0.14", default-features = false } tokio = "1.47.1" +tokio-util = "0.7" toml = "0.9.5" tracing = "0.1.41" tracing-appender = "0.2.3" @@ -110,6 +114,7 @@ ultramarine-blob-engine = { path = "crates/blob_engine" } ultramarine-cli = { path = "crates/cli" } ultramarine-consensus = { path = "crates/consensus" } ultramarine-execution = { path = "crates/execution" } +ultramarine-genesis = { path = "crates/genesis" } ultramarine-node = { path = "crates/node" } ultramarine-test-support = { path = "crates/test_support" } ultramarine-types = { path = "crates/types" } diff --git a/Cross.toml b/Cross.toml index 716e09a..9044a47 100644 --- a/Cross.toml +++ b/Cross.toml @@ -1,15 +1,16 @@ [build] pre-build = [ - # Install certificates and configure APT retries/timeouts for robustness. + # Install certificates and enforce HTTPS for package sources. "apt-get update && apt-get install --assume-yes --no-install-recommends ca-certificates", + "find /etc/apt/ -type f \\( -name '*.list' -o -name '*.sources' \\) -exec sed -i 's|http://|https://|g' {} +", + + # Configure APT retries/timeouts for transient network issues. "echo 'Acquire::Retries \"3\";' > /etc/apt/apt.conf.d/80-retries", "echo 'Acquire::http::Timeout \"60\";' >> /etc/apt/apt.conf.d/80-retries", "echo 'Acquire::ftp::Timeout \"60\";' >> /etc/apt/apt.conf.d/80-retries", - # rust-bindgen dependencies: llvm-dev libclang-dev (>= 10) clang (>= 10) - # See: https://github.com/cross-rs/cross/wiki/FAQ#using-clang--bindgen for - # recommended clang versions for the given cross and bindgen version. 
- "apt-get update && apt-get install --assume-yes --no-install-recommends llvm-dev libclang-dev clang libc6-dev protobuf-compiler pkg-config", + # rust-bindgen + TLS native build deps required across crates. + "apt-get update && apt-get install --assume-yes --no-install-recommends llvm-dev libclang-dev clang libc6-dev protobuf-compiler libssl-dev pkg-config", ] [target.x86_64-unknown-linux-gnu] @@ -17,3 +18,6 @@ image = "ghcr.io/cross-rs/x86_64-unknown-linux-gnu:main" [target.aarch64-unknown-linux-gnu] image = "ghcr.io/cross-rs/aarch64-unknown-linux-gnu:main" + +[build.env] +passthrough = ["JEMALLOC_SYS_WITH_LG_PAGE"] diff --git a/Dockerfile b/Dockerfile index 366174f..6ff60d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,6 +79,12 @@ RUN apt-get update && \ apt-get install -y --no-install-recommends ca-certificates && \ rm -rf /var/lib/apt/lists/* +# Create non-root user for runtime (fixed UID/GID for host volume ownership). +ARG ULTRAMARINE_UID=10002 +ARG ULTRAMARINE_GID=10002 +RUN groupadd -r -g "${ULTRAMARINE_GID}" ultramarine && \ + useradd -r -u "${ULTRAMARINE_UID}" -g ultramarine -d /home/ultramarine -m ultramarine + # Copy the built binary from the previous stage. COPY --from=builder /usr/src/ultramarine/ultramarine /usr/local/bin/ultramarine @@ -86,4 +92,6 @@ COPY --from=builder /usr/src/ultramarine/ultramarine /usr/local/bin/ultramarine # as the ultramarine API evolves. EXPOSE 30303 9001 +USER ultramarine + ENTRYPOINT ["/usr/local/bin/ultramarine"] diff --git a/GEMINI.md b/GEMINI.md deleted file mode 100644 index 2800329..0000000 --- a/GEMINI.md +++ /dev/null @@ -1,303 +0,0 @@ -# Ultramarine — AI Contributor Guide (Rust & Performance Friendly) - -This guide is optimized for models (and humans!) contributing to the **Ultramarine** repository. It explains the repo layout, the development workflow, Makefile targets, CI/CD, and preserves **Rust‑specific engineering and performance guidance** so your contributions are correct, fast, and easy to review. 
- ---- - -## What Ultramarine is (today) - -Ultramarine is a **Rust workspace** intended to evolve into a consensus client. The workspace currently contains a CLI binary and four library crates that are ready to be filled with real logic: - -``` -bin/ultramarine # binary crate (default member) -crates/consensus # consensus logic (stub) -crates/execution # execution layer integration (stub) -crates/node # orchestration / bootstrapping (stub) -crates/types # shared types, serialization (stub) -crates/cli # cli -``` - -The root `Cargo.toml` unifies metadata (edition 2024, version, license, MSRV 1.88), centralizes lints, and specifies build profiles (e.g., release uses thin-LTO and a single codegen unit for better performance). - ---- - -## Local setup & prerequisites - -- **Rust**: 1.88+ (toolchain pinned in CI). - -- **GNU Make**: 4.x. - -- **Optional**: Docker and [`cross`](https://github.com/cross-rs/cross) for multi‑arch builds. - -- **One‑time**: install dev tools the Makefile expects: - -```bash -make tools -# installs: cargo-watch, cargo-deny, cargo-audit, cargo-outdated, cargo-sort, -# cargo-udeps (nightly), cargo-nextest, cargo-llvm-cov, typos-cli, dprint -``` - ---- - -## Everyday development workflow - -Use the Makefile to get consistent flags and the same checks CI runs: - -```bash -# Build & run -make build # optimized build -make build-debug # debug build -make run ARGS="--help" # pass args to the CLI -make dev # autoreload loop via cargo-watch - -# Quality gates -make fmt-check # nightly rustfmt --check -RUSTFLAGS="-D warnings" make clippy -make sort-check # cargo-sort sanity for Cargo.toml -make lint # fmt + clippy + typos + dprint/toml - -# Tests & coverage -make test # unit + doc tests -make test-nextest # faster runs using nextest -make cov-report-html # generate + open HTML coverage - -# Docs -make doc # public API docs -make rustdocs # exhaustive docs incl. 
private items - -# Cross & Docker -make build-aarch64-unknown-linux-gnu -make docker-build -make docker-build-push-latest - -# Pre-PR / CI parity -make pr # lint → tests → audit -make ci # mirrors CI checks locally -``` - ---- - -## Makefile cheatsheet - -| Target | Purpose | -| -------------------------------------------------------- | ------------------------------------ | -| `build`, `build-debug`, `run`, `dev` | Compile & run workflows (with watch) | -| `fmt`, `fmt-check`, `clippy`, `sort`, `lint`, `fix-lint` | Formatting & linting suite | -| `test`, `test-nextest`, `bench` | Testing & benchmarks | -| `cov`, `cov-report-html` | Coverage (lcov + HTML) | -| `deps-check`, `deps-outdated`, `audit`, `deny` | Dependency hygiene & security | -| `docker-build`, `docker-build-push(-latest)` | Local / multi‑arch images | -| `tools`, `tools-check` | Install/verify helper tools | -| `pr`, `ci`, `pre-commit` | Local equivalents of CI/PR gates | - -> Pro tip: run `make help` to see all targets with short descriptions. - ---- - -## CI/CD overview - -- **CI** (push/PR): format (nightly rustfmt), `cargo-sort`, clippy (warnings = errors), tests, docs, optional coverage (on `main` or when PR title includes `[coverage]`). Artifacts (docs/coverage) are uploaded. - -- **Conventional Commits**: PRs must pass commit‑message validation (`feat:`, `fix:`, `refactor:`, `docs:`, etc.). - -- **Docker**: PRs (non‑draft) do a smoke build; pushes/tags build **multi‑arch images** (amd64/arm64) and can push when configured. - -- **Release**: pushing a `v*` tag (or dispatching with a `tag` input) creates a GitHub Release with generated notes. - -- **Dependabot**: weekly updates for Cargo deps and GitHub Actions. - -Keep PRs **small and focused**. The pipeline is fast when you are. - ---- - -## Rust style & correctness guidelines - -- **Edition/MSRV**: target **edition 2024**, MSRV **1.88** (same as CI). - -- **Formatting**: `cargo +nightly fmt --all`. 
Use the repo’s `rustfmt.toml` (import granularity, line widths, etc.) to avoid churn. - -- **Lints**: - - - Workspace **Rust lints** include `unused-must-use = "deny"`, `missing-docs = "warn"`, `rust-2018-idioms = "deny"`. - - - **Clippy** is part of the gates; aim for zero warnings. Prefer idiomatic refactoring over suppressing lints. - -- **Docs**: public APIs should have `///` docs; keep examples compile‑checked with doctests where helpful. - ---- - -## Performance playbook (preserved & expanded) - -**Design for hot paths** - -- Avoid heap work in tight loops. Use stack where practical. Pre‑size collections (`with_capacity`) when the bound is known or well‑estimated. - -- Prefer **borrowing over owning**. Accept `&[u8]`/`&str`/`&T` rather than `Vec`/`String`/`T` when you don’t need ownership. - -- Minimize `clone()`/`to_vec()`/`to_string()`. If needed, comment why the copy is required. - -**Data layout & types** - -- Favor **small, copyable types** in hot structs (e.g., `u64`, `u32`, fixed‑size arrays). Keep layout compact; avoid nested `Option>` and large enums with hot/cold variants mixed. - -- Use `Option` etc. to leverage niche optimizations. - -- For cheap shared buffers, consider `bytes::Bytes` (cheap clone) or `Arc<[u8]>` when many readers need the same data. - -**Generics & dispatch** - -- Prefer **static dispatch** (generics / trait bounds) for hot paths; use **dyn Trait** only when you need runtime polymorphism and the call isn’t performance critical. - -- Reserve `#[inline]`/`#[inline(always)]` for **measured** wins; over‑inlining can bloat code and hurt i‑cache. - -**Branching & error paths** - -- Mark unlikely paths with `#[cold]` (and optionally `#[inline(never)]`) on error/slow functions to improve code locality. - -- Use `Result` and early `?` returns for straight‑line happy paths. - -**Concurrency** - -- **CPU‑bound**: use a thread‑pool or crates like `rayon` for data‑parallel transforms; don’t block async executors with heavy compute. 
- -- **I/O‑bound**: use `tokio` where you need async networking or disk I/O; keep blocking work behind `spawn_blocking` or dedicated threads. - -- Consider `parking_lot` mutexes for lower overhead in highly contended sections (measure!). - -**Collections** - -- Use `HashMap/HashSet` judiciously; pre‑reserve when size is known. For small fixed sizes, arrays or `SmallVec` can beat heap‑allocated `Vec`. - -- For performance‑critical hashing, specialized hashers (e.g., `ahash`) can be faster—balance this against DoS resilience if input is adversarial. - -**I/O & serialization** - -- Batch I/O; avoid per‑item syscalls. Use buffered readers/writers. - -- Serialize with `serde` (derive early), prefer zero‑copy deserialization when possible. - -**Logging & metrics** - -- Cheap logging in hot paths: guard with level checks or use spans with care. Avoid building large strings eagerly; use structured fields. - -- Instrument with counters/gauges/timers for feedback loops during performance work. - -**Build settings** - -- **Release** profile is already tuned: `lto = "thin"`, `codegen-units = 1`. For local profiling, try `RUSTFLAGS="-C target-cpu=native"`; don’t hard‑code it in CI (keeps portability). - -- If you add optional features later (e.g., `jemalloc`, `asm-*`), keep them **opt‑in** and measured. - -**Benchmarking & profiling** - -- Add Criterion benchmarks (`benches/`) for hot code; run via `cargo bench` or `make bench`. - -- Use `cargo-llvm-cov` for coverage; prefer benchmarking in **release** with stable inputs. - -- For deep dives: `perf`/`dtrace`/`vtune` (system‑dependent). Consider `cargo-flamegraph` locally (not required by CI). - ---- - -## Testing strategy - -- **Unit tests** near code (`#[cfg(test)]` modules). - -- **Integration tests** in a `tests/` directory per crate once public surfaces exist. - -- **Property tests** (e.g., `proptest`) for critical invariants and encoding/decoding. 
- -- **Fuzzing** (later): good for network parsing and state machines (e.g., `cargo fuzz`). - -- **Benchmarks** (Criterion) for hot algorithms and data transformations. - -**Example unit test skeleton** - -```rust -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn add_two_numbers() { - assert_eq!(add(2, 2), 4); - } -} -``` - ---- - -## Contribution patterns that work well here - -- **Small targeted fixes**: prefer surgical PRs over sweeping refactors. - -- **API surfacing**: start by defining traits/structs in `types` and `consensus`; keep crates decoupled. - -- **Make code generic** where reuse across modules is expected; document trait bounds and add minimal examples to docs. - -- **Add tests** alongside new code; include at least one failure case. - -> If you propose multiple changes, break them into multiple PRs: (1) prep/refactor (no behavior change), (2) feature, (3) follow‑up cleanup. - ---- - -## PR etiquette (agent‑ready) - -1. **Branch**: `feat-…` / `fix-…` / `refactor-…`. - -2. **Diffs**: show minimal unified diffs per file; keep imports tidy and follow formatting. - -3. **Validation**: include an executable plan: - - ```bash - make fmt-check && make sort-check - RUSTFLAGS="-D warnings" make clippy - make test - ``` - -4. **Performance‑sensitive changes**: explain expected impact; if possible, include a micro‑benchmark + result table or a before/after `perf` snippet. - -5. **Commit messages**: Conventional Commits (`feat:`, `fix:`, `chore:`, `docs:`, etc.). The bot will check this. 
- ---- - ---- - -## Quick reference — commands - -```bash -# Formatting & linting -make fmt-check -RUSTFLAGS="-D warnings" make clippy -make sort-check -make lint - -# Tests & coverage -make test -make test-nextest -make cov-report-html - -# Build & run -make build -make run ARGS="--help" -make dev - -# Cross & container -make build-aarch64-unknown-linux-gnu -make docker-build -make docker-build-push-latest - -# Dependency & security -make deps-outdated -make deps-check -make audit -make deny - -# Docs -make doc -make rustdocs - -# Pre-PR -make pr -``` - ---- diff --git a/Makefile b/Makefile index f111406..8b30f99 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,10 @@ CARGO_INSTALL_EXTRA_FLAGS ?= CARGO_TARGET_DIR ?= target BINARY_NAME ?= ultramarine BIN_DIR ?= dist/bin +CARGO_SORT_VERSION ?= 2.0.1 +CARGO_SORT ?= cargo sort +CARGO_SORT_FLAGS ?= --grouped --workspace +CARGO_AUDIT ?= cargo audit PROMETHEUS_CONFIG_DIR := monitoring PROMETHEUS_ACTIVE_CONFIG := $(PROMETHEUS_CONFIG_DIR)/prometheus.yml PROMETHEUS_HOST_CONFIG := $(PROMETHEUS_CONFIG_DIR)/prometheus.host.yml @@ -179,11 +183,11 @@ clippy-fix: ## Run clippy and automatically fix warnings. .PHONY: sort sort: ## Sort dependencies in Cargo.toml files. @echo "$(GREEN)Sorting dependencies$(NC)" - cargo sort --grouped --workspace + $(CARGO_SORT) $(CARGO_SORT_FLAGS) .PHONY: sort-check sort-check: ## Check if dependencies are sorted. - cargo sort --grouped --workspace --check + $(CARGO_SORT) $(CARGO_SORT_FLAGS) --check # Typos and TOML formatting linting. We wrap them here to ensure these tools # exist before running. @@ -197,8 +201,8 @@ ensure-typos: echo "typos not found. Please install it via \`cargo install typos-cli\`"; exit 1; fi .PHONY: lint-toml -lint-toml: ensure-dprint ## Format all TOML files using dprint. - dprint fmt +lint-toml: ensure-dprint ## Check TOML formatting using dprint. + dprint check .PHONY: ensure-dprint ensure-dprint: @@ -206,7 +210,7 @@ ensure-dprint: echo "dprint not found. 
Please install it via \`cargo install --locked dprint\`"; exit 1; fi .PHONY: lint -lint: fmt clippy sort lint-typos lint-toml ## Run all linters. +lint: fmt-check clippy sort-check lint-typos lint-toml ## Run all linters without mutating files. @echo "$(GREEN)✓ All lints passed$(NC)" .PHONY: fix-lint @@ -263,25 +267,37 @@ deps-outdated: ## Check for outdated dependencies (requires cargo-outdated). .PHONY: audit audit-online AUDIT_DB_DIR ?= target/advisory-db +AUDIT_IGNORES ?= RUSTSEC-2024-0388 RUSTSEC-2024-0436 RUSTSEC-2025-0137 RUSTSEC-2026-0002 RUSTSEC-2026-0007 RUSTSEC-2026-0009 + +AUDIT_FLAGS := +ifneq ($(strip $(AUDIT_IGNORES)),) +AUDIT_FLAGS += $(foreach advisory,$(AUDIT_IGNORES),--ignore $(advisory)) +endif audit: ## Run security audit on dependencies. @echo "$(YELLOW)Running security audit$(NC)" @if [ -n "$$CI" ]; then \ - cargo audit --db $(AUDIT_DB_DIR); \ + $(CARGO_AUDIT) --db $(AUDIT_DB_DIR) $(AUDIT_FLAGS); \ elif [ -d "$(AUDIT_DB_DIR)/.git" ]; then \ - cargo audit --db $(AUDIT_DB_DIR) --offline; \ + if $(CARGO_AUDIT) --help 2>/dev/null | grep -q -- '--offline'; then \ + $(CARGO_AUDIT) --db $(AUDIT_DB_DIR) --offline $(AUDIT_FLAGS); \ + else \ + echo "$(YELLOW)cargo-audit does not support --offline; using local DB without offline mode.$(NC)"; \ + $(CARGO_AUDIT) --db $(AUDIT_DB_DIR) $(AUDIT_FLAGS); \ + fi; \ else \ echo "$(YELLOW)Skipping cargo audit (no local advisory DB at $(AUDIT_DB_DIR)). Run 'make audit-online' to fetch it.$(NC)"; \ fi audit-online: ## Fetch advisory DB and run cargo-audit (requires network). @mkdir -p $(AUDIT_DB_DIR) - cargo audit --db $(AUDIT_DB_DIR) + $(CARGO_AUDIT) --db $(AUDIT_DB_DIR) $(AUDIT_FLAGS) .PHONY: deny +DENY_CHECKS ?= advisories bans sources deny: ## Check dependencies against deny rules (requires cargo-deny). 
@echo "$(YELLOW)Checking dependency policies$(NC)" - cargo deny check + cargo deny check $(DENY_CHECKS) # ----------------------------------------------------------------------------- # CI/CD @@ -319,7 +335,7 @@ tools: ## Install common development tools used by the Makefile. @echo "Installing cargo-deny..."; cargo install cargo-deny --locked @echo "Installing cargo-audit..."; cargo install cargo-audit --locked @echo "Installing cargo-outdated..."; cargo install cargo-outdated --locked - @echo "Installing cargo-sort..."; cargo install cargo-sort --locked + @echo "Installing cargo-sort $(CARGO_SORT_VERSION)..."; cargo install cargo-sort --version $(CARGO_SORT_VERSION) --locked @echo "Installing cargo-udeps..."; cargo install cargo-udeps --locked @echo "Installing cargo-nextest..."; cargo install cargo-nextest --locked @echo "Installing cargo-llvm-cov..."; cargo install cargo-llvm-cov --locked @@ -565,7 +581,18 @@ stop-ipc: ## Stop the docker-compose stack for IPC. docker compose -f compose.ipc.yaml down -v .PHONY: clean-net -clean-net: stop ## Clean local testnet data (genesis, nodes, EL data, monitoring data). +clean-net: stop ## Clean local testnet data. Requires CONFIRM=YES. + @if [ "$(CONFIRM)" != "YES" ]; then \ + echo "$(RED)WARNING: This will delete all local testnet data!$(NC)"; \ + echo " - ./assets/genesis.json"; \ + echo " - ./nodes/"; \ + echo " - ./rethdata/"; \ + echo " - ./monitoring/data-*"; \ + echo " - ./ipc/"; \ + echo ""; \ + echo "To proceed, run: $(YELLOW)make clean-net CONFIRM=YES$(NC)"; \ + exit 1; \ + fi rm -rf ./assets/genesis.json rm -rf ./nodes rm -rf ./rethdata @@ -574,7 +601,18 @@ clean-net: stop ## Clean local testnet data (genesis, nodes, EL data, monitoring rm -rf ./ipc .PHONY: clean-net-ipc -clean-net-ipc: stop-ipc ## Clean local testnet data for IPC. +clean-net-ipc: stop-ipc ## Clean local testnet data (IPC). Requires CONFIRM=YES. 
+ @if [ "$(CONFIRM)" != "YES" ]; then \ + echo "$(RED)WARNING: This will delete all local testnet data!$(NC)"; \ + echo " - ./assets/genesis.json"; \ + echo " - ./nodes/"; \ + echo " - ./rethdata/"; \ + echo " - ./monitoring/data-*"; \ + echo " - ./ipc/"; \ + echo ""; \ + echo "To proceed, run: $(YELLOW)make clean-net-ipc CONFIRM=YES$(NC)"; \ + exit 1; \ + fi rm -rf ./assets/genesis.json rm -rf ./nodes rm -rf ./rethdata @@ -590,19 +628,41 @@ spam: ## Spam the EL with transactions (60s @ 500 tps against default RPC). spam-blobs: ## Spam all three EL nodes with blob transactions (60s @ 50 tps per EL, 6 blobs per tx). @echo "⚙️ Building ultramarine-utils spammer binary..." @cargo build --quiet --bin ultramarine-utils - @echo "🚀 Spamming blob txs against load-reth RPCs on 8545, 18545, and 28545" + @echo "🚀 Spamming blob txs against load-reth RPCs" @set -e; \ + SPAM_TIME="$${SPAM_TIME:-60}"; \ + SPAM_RATE="$${SPAM_RATE:-50}"; \ + SPAM_BLOBS_PER_TX="$${SPAM_BLOBS_PER_TX:-6}"; \ + SPAM_RPC_URLS="$${SPAM_RPC_URLS:-http://127.0.0.1:8545 http://127.0.0.1:18545 http://127.0.0.1:28545}"; \ + if [ -n "$${SPAM_PRIVATE_KEY:-}" ]; then \ + set -- $$SPAM_RPC_URLS; \ + if [ "$$#" -ne 1 ]; then \ + echo "error: SPAM_PRIVATE_KEY requires exactly one RPC URL (set SPAM_RPC_URLS to a single URL)" >&2; \ + exit 2; \ + fi; \ + fi; \ i=0; \ - for rpc in 8545 18545 28545; do \ - echo "→ blasting http://127.0.0.1:$$rpc"; \ - target/debug/ultramarine-utils spam \ - --time=60 \ - --rate=50 \ - --rpc-url=http://127.0.0.1:$$rpc \ - --blobs \ - --blobs-per-tx=6 \ - --signer-index=$$i \ - & \ + for url in $$SPAM_RPC_URLS; do \ + echo "→ blasting $$url"; \ + if [ -n "$${SPAM_PRIVATE_KEY:-}" ]; then \ + target/debug/ultramarine-utils spam \ + --time=$$SPAM_TIME \ + --rate=$$SPAM_RATE \ + --rpc-url=$$url \ + --blobs \ + --blobs-per-tx=$$SPAM_BLOBS_PER_TX \ + --private-key=$${SPAM_PRIVATE_KEY} \ + & \ + else \ + target/debug/ultramarine-utils spam \ + --time=$$SPAM_TIME \ + --rate=$$SPAM_RATE \ + 
--rpc-url=$$url \ + --blobs \ + --blobs-per-tx=$$SPAM_BLOBS_PER_TX \ + --signer-index=$$i \ + & \ + fi; \ i=$$((i+1)); \ done; \ wait @@ -612,7 +672,7 @@ spam-blobs: ## Spam all three EL nodes with blob transactions (60s @ 50 tps per # * blob_roundtrip: Happy path baseline # * blob_sync_commitment_mismatch: Negative path validation # * blob_pruning: Retention logic -# - Tier1 (14 tests, via itest-node): Full integration with networking/WAL/libp2p +# - Tier1 (17 tests, via itest-node): Full integration with networking/WAL/libp2p # To run all tests: make itest && make itest-node # Allow overriding offline mode (CI can set CARGO_NET_OFFLINE=false on cold caches). @@ -645,7 +705,7 @@ itest-list: ## List Tier 0 component tests. .PHONY: itest-node itest-node: ## Run full-node (Tier 1) integration tests (process-isolated for determinism). - @echo "$(GREEN)Running Tier 1 full-node integration tests (14 tests, process-isolated)...$(NC)" + @echo "$(GREEN)Running Tier 1 full-node integration tests (20 tests, process-isolated)...$(NC)" @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_blob_quorum_roundtrip -- --ignored @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_validator_restart_recovers -- --ignored @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_restart_mid_height -- --ignored @@ -657,10 +717,16 @@ itest-node: ## Run full-node (Tier 1) integration tests (process-isolated for de @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_restream_multi_validator -- --ignored @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_value_sync_inclusion_proof_failure -- --ignored @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node 
node_harness::full_node_blob_blobless_sequence_behaves -- --ignored - @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_store_pruning_retains_recent_heights -- --ignored + @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_store_pruning_preserves_decided_history -- --ignored @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_sync_package_roundtrip -- --ignored + @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_sync_rejects_invalid_execution_requests -- --ignored + @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_rejects_invalid_execution_requests_from_el -- --ignored + @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_rejects_payload_without_blobs_bundle -- --ignored @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_value_sync_proof_failure -- --ignored - @echo "$(GREEN)✅ All 14 Tier 1 tests passed!$(NC)" + @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_fcu_gate_does_not_require_http_latest -- --ignored + @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_split_head_recovery -- --ignored + @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_fcu_accepted_rejected -- --ignored + @echo "$(GREEN)✅ All 20 Tier 1 tests passed!$(NC)" .PHONY: itest-node-archiver itest-node-archiver: ## Run Tier 1 archiver/prune full-node integration tests. @@ -671,3 +737,296 @@ itest-node-archiver: ## Run Tier 1 archiver/prune full-node integration tests. 
@CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_archiver_recover_pending_jobs_api -- --ignored @CARGO_NET_OFFLINE=$(CARGO_NET_OFFLINE) cargo test -p ultramarine-test --test full_node node_harness::full_node_archiver_auth_token_transmitted -- --ignored @echo "$(GREEN)✅ All 5 Tier 1 archiver/prune tests passed!$(NC)" + +# ----------------------------------------------------------------------------- +# Infra (multi-host) helpers + +NET_DEFAULT_FILE ?= infra/.net +NET ?= $(shell test -f "$(NET_DEFAULT_FILE)" && cat "$(NET_DEFAULT_FILE)") +SECRETS_FILE ?= +LINES ?= 200 +LIMIT ?= +SSH_KEY ?= +NET_DIR ?= $(abspath infra/networks/$(NET)) +NET_CONFIG ?= $(NET_DIR)/net.mk +ANSIBLE_CONFIG_PATH ?= infra/ansible/ansible.cfg +ANSIBLE_INVENTORY ?= $(NET_DIR)/inventory.yml +ANSIBLE_PLAYBOOKS ?= infra/ansible/playbooks +RESTART_ON_DEPLOY ?= false +APPLY_FIREWALL ?= false +PROMETHEUS_BIND ?= 127.0.0.1 +PROMETHEUS_PORT ?= 9090 +PROMETHEUS_SCRAPE_INTERVAL ?= 5s +GRAFANA_BIND ?= 127.0.0.1 +GRAFANA_PORT ?= 3000 +GRAFANA_ADMIN_PASSWORD ?= +EL_HTTP_BIND ?= 0.0.0.0 +ROLL_CONFIRM ?= +STORAGE_WIPE ?= false +APT_DISABLE_PROXY ?= false +DATA_MOUNTPOINT ?= /var/lib/loadnet +DATA_SOURCE_MOUNTPOINT ?= /home +DATA_SOURCE_DIR ?= $(DATA_SOURCE_MOUNTPOINT)/loadnet +DATA_DEVICES ?= +DATA_RAID_LEVEL ?= 1 +MOVE_DOCKER_DATAROOT ?= false +DOCKER_DATAROOT ?= $(DATA_MOUNTPOINT)/docker +BIND_VAR_LOG ?= false +LOG_DIR ?= $(DATA_MOUNTPOINT)/log +JOURNAL_VACUUM_SIZE ?= 1G +SOPS_AGE_KEY_FILE ?= $(HOME)/.config/sops/age/keys.txt +SOPS_AGE_RECIPIENT ?= +SECRETS_PLAINTEXT ?= $(NET_DIR)/secrets.yaml +SECRETS_ENCRYPTED ?= $(NET_DIR)/secrets.sops.yaml +REMOVE_PLAINTEXT ?= false +WIPE_CONFIRM ?= +WIPE_STATE ?= true +WIPE_MONITORING ?= true +WIPE_CONTAINERS ?= true +WIPE_FIREWALL ?= false +WIPE_NODES ?= +EXTRA_VARS ?= + +-include $(NET_CONFIG) + +ifeq ($(strip $(SECRETS_FILE)),) +ifeq ($(origin SECRETS_FILE),file) +ifneq ("$(wildcard 
$(SECRETS_ENCRYPTED))","") +SECRETS_FILE := $(SECRETS_ENCRYPTED) +endif +endif +endif + +.PHONY: require-net +require-net: + @if [ -z "$(NET)" ]; then \ + echo "NET is required (e.g. NET=fibernet or run 'make net-use NET=fibernet')."; \ + exit 1; \ + fi + +.PHONY: net-use +net-use: ## Set the default network for infra targets (writes infra/.net). + @if [ -z "$(NET)" ]; then \ + echo "NET is required (e.g. NET=fibernet)."; \ + exit 1; \ + fi + @printf "%s\n" "$(NET)" > "$(NET_DEFAULT_FILE)" + @echo "$(GREEN)Default net set to $(NET) ($(NET_DEFAULT_FILE))$(NC)" + +.PHONY: net-unset +net-unset: ## Clear the default network selection (removes infra/.net). + @rm -f "$(NET_DEFAULT_FILE)" + @echo "$(YELLOW)Default net cleared ($(NET_DEFAULT_FILE) removed)$(NC)" + +.PHONY: net-show +net-show: ## Show the active infra defaults (NET, NET_DIR, SECRETS_FILE, NET_CONFIG). + @echo "NET=$(if $(NET),$(NET),)" + @echo "NET_DIR=$(NET_DIR)" + @echo "NET_CONFIG=$(NET_CONFIG) $$(test -f "$(NET_CONFIG)" && echo '[loaded]' || echo '[missing]')" + @echo "SECRETS_FILE=$(if $(SECRETS_FILE),$(SECRETS_FILE),)" + +.PHONY: net-validate +net-validate: require-net ## Validate infra manifest (NET=). + cargo run --quiet -p ultramarine-netgen --bin netgen -- validate --manifest infra/manifests/$(NET).yaml + +.PHONY: net-plan +net-plan: require-net ## Generate infra lockfile + inventory without secrets (dry-run for bootstrap) (NET=). + cargo run --quiet -p ultramarine-netgen --bin netgen -- gen --manifest infra/manifests/$(NET).yaml --out-dir infra/networks/$(NET) --allow-missing-archiver-tokens + +.PHONY: net-gen +net-gen: require-net ## Generate infra lockfile + bundle (NET=, optional SECRETS_FILE=). 
+ @if [ -n "$(SECRETS_FILE)" ]; then \ + SOPS_AGE_KEY_FILE="$(SOPS_AGE_KEY_FILE)" cargo run --quiet -p ultramarine-netgen --bin netgen -- gen --manifest infra/manifests/$(NET).yaml --out-dir infra/networks/$(NET) --secrets-file "$(SECRETS_FILE)"; \ + else \ + cargo run --quiet -p ultramarine-netgen --bin netgen -- gen --manifest infra/manifests/$(NET).yaml --out-dir infra/networks/$(NET); \ + fi + +.PHONY: net-secrets-encrypt +net-secrets-encrypt: ## Encrypt plaintext secrets.yaml -> secrets.sops.yaml (NET=, SOPS_AGE_RECIPIENT=). + @if [ -z "$(SOPS_AGE_RECIPIENT)" ]; then \ + echo "SOPS_AGE_RECIPIENT is required (age public key)"; \ + exit 1; \ + fi + @if [ ! -f "$(SECRETS_PLAINTEXT)" ]; then \ + echo "Missing plaintext secrets file: $(SECRETS_PLAINTEXT)"; \ + exit 1; \ + fi + @if ! command -v sops >/dev/null 2>&1; then \ + echo "sops not found; install it first"; \ + exit 1; \ + fi + @if [ ! -f "$(SOPS_AGE_KEY_FILE)" ]; then \ + echo "Age key file not found: $(SOPS_AGE_KEY_FILE)"; \ + exit 1; \ + fi + cp "$(SECRETS_PLAINTEXT)" "$(SECRETS_ENCRYPTED)" + SOPS_AGE_KEY_FILE="$(SOPS_AGE_KEY_FILE)" sops --encrypt --age "$(SOPS_AGE_RECIPIENT)" -i "$(SECRETS_ENCRYPTED)" + @if [ "$(REMOVE_PLAINTEXT)" = "true" ]; then \ + rm -f "$(SECRETS_PLAINTEXT)"; \ + echo "Removed plaintext secrets file."; \ + else \ + echo "Plaintext secrets left at $(SECRETS_PLAINTEXT); delete it when done."; \ + fi + +.PHONY: net-bootstrap +net-bootstrap: require-net ## Bootstrap (no secrets): plan + storage + doctor (NET=, optional LIMIT=). + @$(MAKE) net-plan NET=$(NET) + @$(MAKE) net-storage NET=$(NET) LIMIT=$(LIMIT) MOVE_DOCKER_DATAROOT=$(MOVE_DOCKER_DATAROOT) + @$(MAKE) net-doctor-pre NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) + +.PHONY: net-launch +net-launch: require-net ## Go-live from scratch: bootstrap + gen + deploy + health (NET=, SECRETS_FILE=, optional LIMIT=). 
+ @if [ -z "$(SECRETS_FILE)" ]; then \ + echo "SECRETS_FILE is required for net-launch (validators need archiver bearer tokens)."; \ + echo "Use 'make net-bootstrap NET=$(NET)' if you want to prep hosts before secrets exist."; \ + exit 1; \ + fi + @$(MAKE) net-bootstrap NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) MOVE_DOCKER_DATAROOT=$(MOVE_DOCKER_DATAROOT) + @$(MAKE) net-gen NET=$(NET) SECRETS_FILE=$(SECRETS_FILE) + @$(MAKE) net-deploy NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) APPLY_FIREWALL=$(APPLY_FIREWALL) RESTART_ON_DEPLOY=true + @$(MAKE) net-doctor-post NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) + @$(MAKE) net-health NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) + +.PHONY: net-update +net-update: require-net ## One-command update: gen + deploy + roll + health (NET=, SECRETS_FILE=, optional LIMIT=). + @if [ -z "$(SECRETS_FILE)" ]; then \ + echo "SECRETS_FILE is required for net-update (validators need archiver bearer tokens)."; \ + exit 1; \ + fi + @$(MAKE) net-gen NET=$(NET) SECRETS_FILE=$(SECRETS_FILE) + @$(MAKE) net-apply NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) APPLY_FIREWALL=$(APPLY_FIREWALL) + @$(MAKE) net-roll NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) + @$(MAKE) net-health NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) + +.PHONY: net-update-secrets +net-update-secrets: require-net ## Rotate archiver secrets only (NET=, SECRETS_FILE=). Keys are preserved. 
+ @if [ -z "$(SECRETS_FILE)" ]; then \ + echo "$(RED)SECRETS_FILE is required$(NC)"; \ + echo " Example: make net-update-secrets NET=$(NET) SECRETS_FILE=infra/networks/$(NET)/secrets.sops.yaml"; \ + exit 1; \ + fi + @echo "$(YELLOW)Regenerating network bundle with updated secrets...$(NC)" + @echo "$(GREEN)Note: Validator keys and P2P keys are preserved (netgen reuses existing keys).$(NC)" + @$(MAKE) net-gen NET=$(NET) SECRETS_FILE=$(SECRETS_FILE) + @echo "" + @echo "$(GREEN)Secrets updated successfully.$(NC)" + @echo "To apply changes, run: $(YELLOW)make net-deploy NET=$(NET)$(NC)" + +.PHONY: net-apply +net-apply: require-net ## Deploy artifacts + units without restarts (NET=, NET_DIR=). + @$(MAKE) net-deploy NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) APPLY_FIREWALL=$(APPLY_FIREWALL) RESTART_ON_DEPLOY=false + +.PHONY: net-redeploy +net-redeploy: require-net ## Deploy artifacts + restart services (NET=, NET_DIR=). + @$(MAKE) net-deploy NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) APPLY_FIREWALL=$(APPLY_FIREWALL) RESTART_ON_DEPLOY=true + +.PHONY: net-deploy +net-deploy: require-net ## Deploy artifacts + systemd units via Ansible (NET=, NET_DIR=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/deploy.yml \ + -e "net=$(NET) net_dir=$(NET_DIR) restart_on_deploy=$(RESTART_ON_DEPLOY) apply_firewall=$(APPLY_FIREWALL) loadnet_apt_disable_proxy=$(APT_DISABLE_PROXY) loadnet_prometheus_bind=$(PROMETHEUS_BIND) loadnet_prometheus_port=$(PROMETHEUS_PORT) loadnet_prometheus_scrape_interval=$(PROMETHEUS_SCRAPE_INTERVAL) loadnet_grafana_bind=$(GRAFANA_BIND) loadnet_grafana_port=$(GRAFANA_PORT) loadnet_grafana_admin_password=$(GRAFANA_ADMIN_PASSWORD) loadnet_el_http_bind=$(EL_HTTP_BIND)" + +.PHONY: net-roll +net-roll: require-net ## Rolling restart via Ansible (serial=1) (NET=, NET_DIR=, ROLL_CONFIRM=YES if small validator set). 
+ ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/roll.yml \ + -e "net=$(NET) net_dir=$(NET_DIR) roll_confirm=$(ROLL_CONFIRM)" + +.PHONY: net-up +net-up: require-net ## Start services via systemd (NET=, NET_DIR=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/up.yml \ + -e "net=$(NET) net_dir=$(NET_DIR)" + +.PHONY: net-down +net-down: require-net ## Stop services via systemd (NET=, NET_DIR=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/down.yml \ + -e "net=$(NET) net_dir=$(NET_DIR)" + +.PHONY: net-status +net-status: require-net ## Show systemd status for services (NET=, NET_DIR=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/status.yml \ + -e "net=$(NET) net_dir=$(NET_DIR)" + +.PHONY: net-logs +net-logs: ## Tail systemd logs (NET=, NET_DIR=, LINES=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/logs.yml \ + -e "net=$(NET) net_dir=$(NET_DIR) lines=$(LINES)" + +.PHONY: net-clean-logs +net-clean-logs: require-net ## Vacuum journald + rotate/truncate syslog and restart EL/CL (NET=, NET_DIR=, optional JOURNAL_VACUUM_SIZE=). 
+ ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/clean_logs.yml \ + -e "net=$(NET) net_dir=$(NET_DIR) loadnet_journal_vacuum_size=$(JOURNAL_VACUUM_SIZE)" + +.PHONY: net-health +net-health: require-net ## Health check: services active + Engine IPC socket + height moving (NET=, NET_DIR=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/health.yml \ + -e "net=$(NET) net_dir=$(NET_DIR)" + +.PHONY: net-doctor +net-doctor: require-net ## Post-deploy diagnostics (units/current/layout/listeners) (NET=, NET_DIR=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/doctor.yml \ + -e "net=$(NET) net_dir=$(NET_DIR) loadnet_el_http_bind=$(EL_HTTP_BIND)" + +.PHONY: net-doctor-pre +net-doctor-pre: require-net ## Pre-deploy checks (OS/storage/docker) (NET=, NET_DIR=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/doctor_pre.yml \ + -e "net=$(NET) net_dir=$(NET_DIR)" + +.PHONY: net-doctor-post +net-doctor-post: ## Post-deploy checks (units/current/layout/listeners) (NET=, NET_DIR=). + @$(MAKE) net-doctor NET=$(NET) NET_DIR=$(NET_DIR) LIMIT=$(LIMIT) +.PHONY: net-firewall +net-firewall: require-net ## Apply host firewall (UFW) rules for required P2P ports (NET=, NET_DIR=). 
+ ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/firewall.yml \ + -e "net=$(NET) net_dir=$(NET_DIR) loadnet_apt_disable_proxy=$(APT_DISABLE_PROXY) loadnet_prometheus_bind=$(PROMETHEUS_BIND) loadnet_prometheus_port=$(PROMETHEUS_PORT) loadnet_grafana_bind=$(GRAFANA_BIND) loadnet_grafana_port=$(GRAFANA_PORT) loadnet_el_http_bind=$(EL_HTTP_BIND)" + +.PHONY: net-storage +net-storage: require-net ## Storage bootstrap (non-destructive by default; set STORAGE_WIPE=true + DATA_DEVICES=/dev/disk/by-id/...) (NET=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/storage.yml \ + -e "loadnet_storage_wipe=$(STORAGE_WIPE) loadnet_data_mountpoint=$(DATA_MOUNTPOINT) loadnet_data_source_mountpoint=$(DATA_SOURCE_MOUNTPOINT) loadnet_data_source_dir=$(DATA_SOURCE_DIR) loadnet_data_devices=$(DATA_DEVICES) loadnet_data_raid_level=$(DATA_RAID_LEVEL) loadnet_move_docker_dataroot=$(MOVE_DOCKER_DATAROOT) loadnet_docker_dataroot=$(DOCKER_DATAROOT) loadnet_apt_disable_proxy=$(APT_DISABLE_PROXY) loadnet_bind_var_log=$(BIND_VAR_LOG) loadnet_log_dir=$(LOG_DIR)" + +.PHONY: net-wipe +net-wipe: require-net ## Destroy+clean hosts (destructive): WIPE_CONFIRM=YES (WIPE_STATE=true|false, WIPE_MONITORING=true|false, WIPE_CONTAINERS=true|false, WIPE_FIREWALL=true|false, EXTRA_VARS="k=v ...", LIMIT=). 
+ ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) $(if $(LIMIT),-l $(LIMIT),) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/wipe.yml \ + -e "net=$(NET) net_dir=$(NET_DIR) wipe_confirm=$(WIPE_CONFIRM) wipe_state=$(WIPE_STATE) wipe_monitoring=$(WIPE_MONITORING) wipe_containers=$(WIPE_CONTAINERS) wipe_firewall=$(WIPE_FIREWALL) wipe_nodes=$(WIPE_NODES)" \ + $(if $(EXTRA_VARS),-e "$(EXTRA_VARS)",) + +# ----------------------------------------------------------------------------- +# Blockscout + +MANIFEST_DIR ?= infra/manifests + +.PHONY: net-blockscout +net-blockscout: require-net ## Deploy Blockscout explorer (NET=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/blockscout.yml \ + -e "net=$(NET) net_dir=$(NET_DIR)" + +.PHONY: net-blockscout-status +net-blockscout-status: require-net ## Check Blockscout status (NET=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) -i $(ANSIBLE_INVENTORY) all -m shell \ + -a "systemctl status blockscout nginx-blockscout --no-pager" \ + --limit $$(yq -r '.blockscout.host' $(MANIFEST_DIR)/$(NET).yaml) + +.PHONY: net-blockscout-logs +net-blockscout-logs: require-net ## View Blockscout logs (NET=, LINES=). + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible $(if $(SSH_KEY),--private-key "$(SSH_KEY)",) -i $(ANSIBLE_INVENTORY) all -m shell \ + -a "journalctl -u blockscout -n $(LINES) --no-pager" \ + --limit $$(yq -r '.blockscout.host' $(MANIFEST_DIR)/$(NET).yaml) + +.PHONY: infra-checks +infra-checks: ## Run infra checks (netgen build + ansible syntax-checks if available). 
+ cargo fmt --all + cargo build -q -p ultramarine-netgen + @if command -v ansible-playbook >/dev/null 2>&1; then \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/deploy.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/roll.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/up.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/down.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/firewall.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/storage.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/wipe.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/health.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/doctor.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/clean_logs.yml -e "net=$(NET) net_dir=$(NET_DIR)" && \ + ANSIBLE_CONFIG=$(ANSIBLE_CONFIG_PATH) ansible-playbook --syntax-check -i $(ANSIBLE_INVENTORY) $(ANSIBLE_PLAYBOOKS)/doctor_pre.yml -e "net=$(NET) net_dir=$(NET_DIR)"; \ + else \ + echo "ansible-playbook not 
found; skipping syntax-checks (install ansible-core on the controller)"; \ + fi diff --git a/bin/ultramarine/src/main.rs b/bin/ultramarine/src/main.rs index a9f578f..d96096c 100644 --- a/bin/ultramarine/src/main.rs +++ b/bin/ultramarine/src/main.rs @@ -112,6 +112,7 @@ fn start(args: &Args, cmd: &StartCmd, logging: config::LoggingConfig) -> Result< genesis_file: args.get_genesis_file_path()?, private_key_file: args.get_priv_validator_key_file_path()?, start_height: cmd.start_height.map(Height::new), + execution_genesis_file: cmd.execution_genesis_path.clone(), engine_http_url: cmd.engine_http_url.clone(), engine_ipc_path: cmd.engine_ipc_path.clone(), eth1_rpc_url: cmd.eth1_rpc_url.clone(), @@ -130,6 +131,7 @@ fn init(args: &Args, cmd: &InitCmd, logging: config::LoggingConfig) -> Result<() genesis_file: args.get_genesis_file_path()?, private_key_file: args.get_priv_validator_key_file_path()?, start_height: Some(Height::new(1)), // We always start at height 1 + execution_genesis_file: None, engine_http_url: None, engine_ipc_path: None, eth1_rpc_url: None, @@ -154,6 +156,7 @@ fn testnet(args: &Args, cmd: &TestnetCmd, logging: config::LoggingConfig) -> Res genesis_file: args.get_genesis_file_path()?, private_key_file: args.get_priv_validator_key_file_path()?, start_height: Some(Height::new(1)), // We always start at height 1 + execution_genesis_file: None, engine_http_url: None, engine_ipc_path: None, eth1_rpc_url: None, @@ -176,6 +179,7 @@ fn distributed_testnet( genesis_file: args.get_genesis_file_path()?, private_key_file: args.get_priv_validator_key_file_path()?, start_height: Some(Height::new(1)), // We always start at height 1 + execution_genesis_file: None, engine_http_url: None, engine_ipc_path: None, eth1_rpc_url: None, diff --git a/compose.ipc.yaml b/compose.ipc.yaml index 8310afc..605d46b 100644 --- a/compose.ipc.yaml +++ b/compose.ipc.yaml @@ -22,6 +22,7 @@ services: - "-d" - "--datadir=/data/load-reth" - "--chain=/assets/genesis.json" + - 
"--engine.persistence-threshold=0" - "--http" - "--http.port=8545" - "--http.addr=0.0.0.0" @@ -65,6 +66,7 @@ services: - "-d" - "--datadir=/data/load-reth" - "--chain=/assets/genesis.json" + - "--engine.persistence-threshold=0" - "--http" - "--http.port=8545" - "--http.addr=0.0.0.0" @@ -108,6 +110,7 @@ services: - "-d" - "--datadir=/data/load-reth" - "--chain=/assets/genesis.json" + - "--engine.persistence-threshold=0" - "--http" - "--http.port=8545" - "--http.addr=0.0.0.0" @@ -144,6 +147,7 @@ services: volumes: - ./nodes/0:/nodes/0 - ./ipc/0:/ipc + - ./assets:/assets:ro env_file: - ./.env ports: @@ -157,6 +161,7 @@ services: - start - --home=/nodes/0 - --engine-ipc-path=/ipc/engine.ipc + - --execution-genesis-path=/assets/genesis.json - --eth1-rpc-url=http://load-reth0:8545 ultramarine1: @@ -168,6 +173,7 @@ services: volumes: - ./nodes/1:/nodes/1 - ./ipc/1:/ipc + - ./assets:/assets:ro env_file: - ./.env ports: @@ -181,6 +187,7 @@ services: - start - --home=/nodes/1 - --engine-ipc-path=/ipc/engine.ipc + - --execution-genesis-path=/assets/genesis.json - --eth1-rpc-url=http://load-reth1:8545 ultramarine2: @@ -192,6 +199,7 @@ services: volumes: - ./nodes/2:/nodes/2 - ./ipc/2:/ipc + - ./assets:/assets:ro env_file: - ./.env ports: @@ -205,6 +213,7 @@ services: - start - --home=/nodes/2 - --engine-ipc-path=/ipc/engine.ipc + - --execution-genesis-path=/assets/genesis.json - --eth1-rpc-url=http://load-reth2:8545 prometheus: diff --git a/compose.yaml b/compose.yaml index 1b2a696..da17fb4 100644 --- a/compose.yaml +++ b/compose.yaml @@ -24,6 +24,7 @@ services: - "-d" - "--datadir=/data/load-reth" - "--chain=/assets/genesis.json" + - "--engine.persistence-threshold=0" - "--http" - "--http.port=8545" - "--http.addr=0.0.0.0" @@ -70,6 +71,7 @@ services: - "-d" - "--datadir=/data/load-reth" - "--chain=/assets/genesis.json" + - "--engine.persistence-threshold=0" - "--http" - "--http.port=8545" - "--http.addr=0.0.0.0" @@ -116,6 +118,7 @@ services: - "-d" - 
"--datadir=/data/load-reth" - "--chain=/assets/genesis.json" + - "--engine.persistence-threshold=0" - "--http" - "--http.port=8545" - "--http.addr=0.0.0.0" diff --git a/crates/blob_engine/src/engine.rs b/crates/blob_engine/src/engine.rs index 665b24c..9848e45 100644 --- a/crates/blob_engine/src/engine.rs +++ b/crates/blob_engine/src/engine.rs @@ -3,7 +3,11 @@ use async_trait::async_trait; use tracing::{debug, info, warn}; use ultramarine_types::{blob::BYTES_PER_BLOB, height::Height, proposal_part::BlobSidecar}; -use crate::{error::BlobEngineError, store::BlobStore, verifier::BlobVerifier}; +use crate::{ + error::BlobEngineError, + store::{BlobStore, rocksdb::RocksDbBlobStore}, + verifier::BlobVerifier, +}; /// Blob lifecycle management: verification, storage, and archival /// @@ -200,6 +204,21 @@ where } } +impl BlobEngineImpl { + /// Flush all pending writes to disk synchronously + /// + /// This method forces all pending RocksDB writes to be persisted to disk. + /// It should be called during graceful shutdown to ensure data durability. + /// + /// # Errors + /// + /// Returns an error if the flush operation fails. 
+ pub fn flush_sync(&self) -> Result<(), BlobEngineError> { + self.store.flush()?; + Ok(()) + } +} + #[async_trait] impl BlobEngine for BlobEngineImpl where diff --git a/crates/blob_engine/src/metrics.rs b/crates/blob_engine/src/metrics.rs index 0e921eb..cb17840 100644 --- a/crates/blob_engine/src/metrics.rs +++ b/crates/blob_engine/src/metrics.rs @@ -42,10 +42,16 @@ pub struct Inner { lifecycle_promoted: Counter, lifecycle_dropped: Counter, lifecycle_pruned: Counter, + stale_round_cleanup: Counter, // Restream/Sync metrics (counters) restream_rebuilds: Counter, sync_failures: Counter, + + // Sync health metrics (FIX-007: additional sync health visibility) + sync_packages_rejected: Counter, + cleanup_failures: Counter, + orphaned_blobs_dropped: Counter, } impl Inner { @@ -64,9 +70,14 @@ impl Inner { lifecycle_promoted: Counter::default(), lifecycle_dropped: Counter::default(), lifecycle_pruned: Counter::default(), + stale_round_cleanup: Counter::default(), restream_rebuilds: Counter::default(), sync_failures: Counter::default(), + + sync_packages_rejected: Counter::default(), + cleanup_failures: Counter::default(), + orphaned_blobs_dropped: Counter::default(), } } } @@ -95,8 +106,12 @@ impl BlobEngineMetrics { lifecycle_promoted: self.lifecycle_promoted.get(), lifecycle_dropped: self.lifecycle_dropped.get(), lifecycle_pruned: self.lifecycle_pruned.get(), + stale_round_cleanup: self.stale_round_cleanup.get(), restream_rebuilds: self.restream_rebuilds.get(), sync_failures: self.sync_failures.get(), + sync_packages_rejected: self.sync_packages_rejected.get(), + cleanup_failures: self.cleanup_failures.get(), + orphaned_blobs_dropped: self.orphaned_blobs_dropped.get(), } } @@ -168,6 +183,12 @@ impl BlobEngineMetrics { metrics.lifecycle_pruned.clone(), ); + registry.register( + "stale_round_cleanup", + "Rounds cleaned due to stale round blob cleanup", + metrics.stale_round_cleanup.clone(), + ); + // Restream/Sync metrics registry.register( "restream_rebuilds", @@ -180,6 
+201,25 @@ impl BlobEngineMetrics { "Blob sync/fetch failures", metrics.sync_failures.clone(), ); + + // Sync health metrics (FIX-007) + registry.register( + "sync_packages_rejected", + "Sync packages rejected due to validation failures", + metrics.sync_packages_rejected.clone(), + ); + + registry.register( + "cleanup_failures", + "Failures during blob cleanup operations", + metrics.cleanup_failures.clone(), + ); + + registry.register( + "orphaned_blobs_dropped", + "Orphaned blobs dropped during reorg/fork handling", + metrics.orphaned_blobs_dropped.clone(), + ); }); metrics @@ -236,6 +276,11 @@ impl BlobEngineMetrics { self.lifecycle_pruned.inc_by(blob_count as u64); } + /// Record stale round cleanup + pub fn record_stale_round_cleanup(&self, rounds: usize) { + self.stale_round_cleanup.inc_by(rounds as u64); + } + /// Set blobs per finalized block pub fn set_blobs_per_block(&self, count: usize) { self.blobs_per_block.set(count as i64); @@ -250,6 +295,21 @@ impl BlobEngineMetrics { pub fn record_sync_failure(&self) { self.sync_failures.inc(); } + + /// Record rejected sync package (FIX-007) + pub fn record_sync_package_rejected(&self) { + self.sync_packages_rejected.inc(); + } + + /// Record cleanup failure (FIX-007) + pub fn record_cleanup_failure(&self) { + self.cleanup_failures.inc(); + } + + /// Record orphaned blobs dropped during reorg/fork handling (FIX-007) + pub fn record_orphaned_blobs_dropped(&self, count: usize) { + self.orphaned_blobs_dropped.inc_by(count as u64); + } } impl Default for BlobEngineMetrics { @@ -279,8 +339,16 @@ pub struct MetricsSnapshot { pub lifecycle_dropped: u64, /// Total blobs pruned or archived. pub lifecycle_pruned: u64, + /// Total rounds cleaned due to stale round cleanup. + pub stale_round_cleanup: u64, /// Total restream rebuild operations performed. pub restream_rebuilds: u64, /// Total sync failures recorded during blob processing. pub sync_failures: u64, + /// Total sync packages rejected due to validation failures. 
+ pub sync_packages_rejected: u64, + /// Total cleanup failures during blob operations. + pub cleanup_failures: u64, + /// Total orphaned blobs dropped during reorg/fork handling. + pub orphaned_blobs_dropped: u64, } diff --git a/crates/blob_engine/src/store/rocksdb.rs b/crates/blob_engine/src/store/rocksdb.rs index 5fec7b2..b9ade78 100644 --- a/crates/blob_engine/src/store/rocksdb.rs +++ b/crates/blob_engine/src/store/rocksdb.rs @@ -60,6 +60,18 @@ impl RocksDbBlobStore { Ok(Self { db: Arc::new(db) }) } + /// Flush all writes to disk + /// + /// Forces a sync of all pending writes to ensure durability. + /// This is useful during graceful shutdown to ensure all data is persisted. + /// + /// # Errors + /// + /// Returns an error if the flush operation fails. + pub fn flush(&self) -> Result<(), BlobStoreError> { + self.db.flush().map_err(BlobStoreError::from) + } + /// Open the blob store in read-only mode without taking a lock. /// /// This is primarily used by integration tests that inspect on-disk state diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 0cdfb22..f270cd6 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -20,7 +20,6 @@ malachitebft-metrics.workspace = true ultramarine-types.workspace = true axum = { workspace = true } -bytesize = { workspace = true } clap = { workspace = true, features = ["derive", "env"] } color-eyre = { workspace = true } directories = { workspace = true } diff --git a/crates/cli/src/cmd/start.rs b/crates/cli/src/cmd/start.rs index 4e2b3f5..0c85612 100644 --- a/crates/cli/src/cmd/start.rs +++ b/crates/cli/src/cmd/start.rs @@ -27,6 +27,10 @@ pub struct StartCmd { #[clap(long)] pub eth1_rpc_url: Option, + /// Path to execution-layer genesis.json (same file used by load-reth --chain) + #[clap(long)] + pub execution_genesis_path: Option, + /// Override JWT secret path used for Engine API authentication #[clap(long)] pub jwt_path: Option, diff --git a/crates/consensus/Cargo.toml 
b/crates/consensus/Cargo.toml index e373589..42874ac 100644 --- a/crates/consensus/Cargo.toml +++ b/crates/consensus/Cargo.toml @@ -11,7 +11,6 @@ rust-version.workspace = true [dependencies] ultramarine-blob-engine.workspace = true -ultramarine-cli.workspace = true ultramarine-execution.workspace = true ultramarine-types.workspace = true diff --git a/crates/consensus/src/state.rs b/crates/consensus/src/state.rs index 9b13763..c22b24c 100644 --- a/crates/consensus/src/state.rs +++ b/crates/consensus/src/state.rs @@ -6,9 +6,10 @@ use std::{ collections::{BTreeSet, HashMap, HashSet}, convert::TryFrom, sync::Arc, + time::{Duration, SystemTime, UNIX_EPOCH}, }; -use alloy_rpc_types_engine::{ExecutionPayloadV3, PayloadStatus}; +use alloy_rpc_types_engine::{ExecutionPayloadV3, PayloadStatus, PayloadStatusEnum}; use bytes::Bytes; use color_eyre::eyre; use ethereum_hashing::hash32_concat; @@ -24,13 +25,13 @@ use malachitebft_app_channel::app::{ use rand::{Rng, SeedableRng, rngs::StdRng}; use sha2::{Digest as Sha2Digest, Sha256 as Sha2Sha256}; use ssz::Decode; -use tokio::time::Instant; +use tokio::time::{Instant, sleep}; use tracing::{debug, error, info, warn}; use tree_hash::TreeHash; use ultramarine_blob_engine::{ BlobEngine, BlobEngineError, BlobEngineImpl, store::rocksdb::RocksDbBlobStore, }; -use ultramarine_execution::notifier::ExecutionNotifier; +use ultramarine_execution::{error::ExecutionError, notifier::ExecutionNotifier}; use ultramarine_types::{ address::Address, // Phase 3: Import blob types for streaming @@ -41,6 +42,7 @@ use ultramarine_types::{ blob_metadata::BlobMetadata, codec::proto::ProtobufCodec, consensus_block_metadata::ConsensusBlockMetadata, + constants::{LOAD_MAX_FUTURE_DRIFT_SECS, LOAD_MIN_BLOCK_TIME_SECS}, context::LoadContext, engine_api::{ExecutionBlock, ExecutionPayloadHeader, load_prev_randao}, ethereum_compat::{ @@ -105,6 +107,13 @@ where pub peers: HashSet, pub latest_block: Option, + pub executed_height: Height, + pub el_degraded: bool, + 
pub el_degraded_since: Option, + pub el_last_error: Option, + pub last_fcu_head: Option, + pub last_fcu_success: Option, + execution_retry: ExecutionRetryConfig, // Track rounds with blobs for cleanup // Key: height, Value: set of rounds that have blobs @@ -114,18 +123,24 @@ where last_blob_sidecar_root: B256, last_blob_sidecar_height: Height, - /// Retention window (in heights) for decided blobs. + /// Retention window (in heights) for undecided data cleanup. + /// + /// NOTE: This does NOT control blob byte pruning. Load Network uses archive events + /// as the boundary for blob pruning (NOT Ethereum's time-based DA window). + /// Blob bytes are only pruned after verified archival + finality. + /// Decided values, certificates, and block data are retained forever. blob_retention_window: u64, /// Heights with blobs that are still awaiting archive notices. pending_archive_heights: BTreeSet, /// Heights that have fully archived blobs but are waiting for finalization before pruning. + /// Only blob bytes are pruned; consensus data (decided values, certs) is retained forever. pending_prune_heights: BTreeSet, /// Highest height finalized/committed locally. /// /// NOTE: In V0, "finalized" equals "decided" - there is no additional finality delay. - /// This means pruning happens immediately after a height is decided AND all blobs - /// at that height have verified archive notices. Future versions may add a finality + /// This means blob byte pruning happens immediately after a height is decided AND all + /// blobs at that height have verified archive notices. Future versions may add a finality /// lag (e.g., 2/3 acks, on-chain anchoring) before allowing prune. latest_finalized_height: Height, /// Whether the archiver worker is enabled. @@ -146,6 +161,8 @@ pub struct DecidedOutcome { pub execution_block: ExecutionBlock, /// Payload status returned by the execution layer. 
pub payload_status: PayloadStatus, + /// Whether execution finalization is pending due to EL backpressure. + pub execution_pending: bool, /// Number of transactions in the execution payload. pub tx_count: usize, /// Size in bytes of the SSZ-encoded execution payload. @@ -160,6 +177,30 @@ pub struct DecidedOutcome { pub archive_job: Option, } +#[derive(Debug, Clone)] +pub struct ExecutionRetryConfig { + /// Timeout for newPayload during normal consensus (block production) + pub new_payload_timeout: Duration, + /// Timeout for newPayload during sync mode (shorter, EL knows target via FCU) + pub new_payload_sync_timeout: Duration, + pub forkchoice_timeout: Duration, + pub initial_backoff: Duration, + pub max_backoff: Duration, +} + +impl Default for ExecutionRetryConfig { + fn default() -> Self { + Self { + new_payload_timeout: Duration::from_secs(30), + // Shorter timeout during sync: EL already knows target from FCU + new_payload_sync_timeout: Duration::from_secs(2), + forkchoice_timeout: Duration::from_secs(30), + initial_backoff: Duration::from_millis(250), + max_backoff: Duration::from_secs(2), + } + } +} + /// Represents errors that can occur during the verification of a proposal's signature. #[derive(Debug)] enum SignatureVerificationError { @@ -226,6 +267,13 @@ where peers: HashSet::new(), latest_block: None, + executed_height: Height::new(0), + el_degraded: false, + el_degraded_since: None, + el_last_error: None, + last_fcu_head: None, + last_fcu_success: None, + execution_retry: ExecutionRetryConfig::default(), blob_rounds: HashMap::new(), last_blob_sidecar_root: B256::ZERO, last_blob_sidecar_height: Height::new(0), @@ -245,9 +293,47 @@ where self.archiver_enabled = enabled; } - /// Returns the earliest height available in the state. - /// If no values have been decided yet (empty store), returns the current height. 
+ pub fn set_execution_retry_config(&mut self, config: ExecutionRetryConfig) { + self.execution_retry = config; + } + + pub fn is_el_degraded(&self) -> bool { + self.el_degraded + } + + pub fn mark_el_degraded(&mut self, error: impl Into) { + if !self.el_degraded { + self.el_degraded_since = Some(Instant::now()); + } + self.el_degraded = true; + self.el_last_error = Some(error.into()); + self.last_fcu_head = None; + self.last_fcu_success = None; + } + + pub fn clear_el_degraded(&mut self) { + self.el_degraded = false; + self.el_degraded_since = None; + self.el_last_error = None; + } + + /// Returns the earliest height available in the state for sync purposes. + /// + /// Load Network context: Returns genesis height (0) if genesis metadata exists, + /// since validators must be able to serve historical data for fullnode sync. + /// This is required because we no longer prune decided values, certificates, + /// or block data - only blob bytes are pruned after archival. + /// + /// Following the Lighthouse pattern where beacon blocks are kept forever, + /// we return Height(0) when genesis metadata is seeded, allowing peers + /// to sync the complete chain history. pub async fn get_earliest_height(&self) -> Height { + // Check if genesis metadata exists (seeded at bootstrap) + // If present, we can serve historical data from height 0 + if self.store.get_blob_metadata(Height::new(0)).await.ok().flatten().is_some() { + return Height::new(0); + } + // Fallback to min decided value height (for nodes without genesis metadata) self.store.min_decided_value_height().await.unwrap_or(self.current_height) } @@ -569,6 +655,150 @@ where Ok(()) } + /// Number of stale rounds to retain blobs for before cleanup. + /// Set to 3 to allow for valid_round re-proposals (POL locking). + /// Rounds older than (current_round - STALE_ROUND_BLOB_RETENTION) will be cleaned up. + pub const STALE_ROUND_BLOB_RETENTION: u32 = 3; + + /// Runtime cleanup of stale round blobs at the current height. 
+ /// + /// Called when a new round starts to clean up blobs from rounds that are too old + /// to be relevant for re-proposal (via valid_round/POL). + /// + /// # Policy + /// + /// Drops blobs from rounds where: `round < (current_round - STALE_ROUND_BLOB_RETENTION)` + /// + /// # Safety + /// + /// This is safe because: + /// - Tendermint/Malachite POL (Proof-of-Lock) only references recent rounds + /// - After N rounds without commit, old proposals are unlikely to be re-proposed + /// - valid_round is always <= the round where lock was acquired (recent) + /// + /// # Arguments + /// + /// * `height` - The height to clean up stale rounds for + /// * `current_round` - The round we just entered + /// + /// # Returns + /// + /// Number of rounds cleaned up, or error + pub async fn cleanup_stale_round_blobs( + &mut self, + height: Height, + current_round: Round, + ) -> eyre::Result { + let retention = Self::STALE_ROUND_BLOB_RETENTION; + let current_round_i64 = current_round.as_i64(); + + // Calculate the cutoff: rounds below this will be cleaned + let cutoff_round = current_round_i64.saturating_sub(retention as i64); + + if cutoff_round <= 0 { + // No rounds old enough to clean + return Ok(0); + } + + // Get the rounds tracked for this height + let Some(rounds) = self.blob_rounds.get(&height) else { + return Ok(0); + }; + + // Find rounds that are stale (below cutoff) + let stale_rounds: Vec = rounds.iter().copied().filter(|&r| r < cutoff_round).collect(); + + if stale_rounds.is_empty() { + return Ok(0); + } + + info!( + height = %height, + current_round = %current_round, + cutoff_round = cutoff_round, + stale_count = stale_rounds.len(), + "Cleaning up stale round blobs" + ); + + let mut cleaned = 0; + + for round_i64 in stale_rounds.iter().copied() { + let Some(round_u32) = u32::try_from(round_i64).ok() else { + warn!( + height = %height, + round = round_i64, + "Invalid negative round encountered during stale cleanup" + ); + continue; + }; + let round = 
Round::new(round_u32); + + // Clean up blob metadata from store (RocksDB) + if let Err(e) = self.store.delete_blob_metadata_undecided(height, round).await { + warn!( + height = %height, + round = %round, + error = %e, + "Failed to delete stale undecided BlobMetadata" + ); + // Continue with blob engine cleanup even if metadata delete fails + } + + if let Err(e) = self.store.delete_undecided_proposal(height, round).await { + warn!( + height = %height, + round = %round, + error = %e, + "Failed to delete stale undecided proposal" + ); + } + + if let Err(e) = self.store.delete_undecided_block_data(height, round).await { + warn!( + height = %height, + round = %round, + error = %e, + "Failed to delete stale undecided block data" + ); + } + + // Clean up blobs from blob engine (RocksDB CF_UNDECIDED) + if let Err(e) = self.blob_engine.drop_round(height, round_i64).await { + error!( + height = %height, + round = round_i64, + error = %e, + "Failed to drop stale blobs from blob engine" + ); + // Continue to next round + } else { + debug!( + height = %height, + round = round_i64, + "Dropped stale round blobs" + ); + cleaned += 1; + } + } + + // Remove cleaned rounds from in-memory tracking + if let Some(rounds) = self.blob_rounds.get_mut(&height) { + rounds.retain(|&r| r >= cutoff_round); + } + + if cleaned > 0 { + self.blob_metrics.record_stale_round_cleanup(cleaned); + info!( + height = %height, + cleaned = cleaned, + retention_window = retention, + "Completed stale round blob cleanup" + ); + } + + Ok(cleaned) + } + /// Recover pending archive jobs on startup. 
/// /// Scans decided heights for blobs that have not been archived yet and returns @@ -721,10 +951,8 @@ where proposer: &Address, metadata: &ValueMetadata, body_root: B256, + parent_root: B256, ) -> Result { - let parent_root = - if height.as_u64() == 0 { B256::ZERO } else { self.last_blob_sidecar_root }; - let proposer_index = self .validator_index(proposer) .ok_or_else(|| format!("Proposer {} not found in validator set", proposer))?; @@ -738,8 +966,26 @@ where )) } - pub fn prepare_blob_sidecar_parts( - &self, + /// Resolve the parent root for a given height without mutating the cache. + /// + /// Cache updates are intentionally handled only after values become decided + /// (commit/WAL replay/sync promotion) or on startup hydration. + async fn resolve_parent_root_for_height(&self, height: Height) -> eyre::Result { + if height.as_u64() == 0 { + return Ok(B256::ZERO); + } + + let prev_height = Height::new(height.as_u64() - 1); + if let Some(prev_metadata) = self.store.get_blob_metadata(prev_height).await? 
{ + let root = prev_metadata.to_beacon_header().hash_tree_root(); + return Ok(root); + } + + eyre::bail!("Missing decided BlobMetadata for parent height {}", prev_height.as_u64()) + } + + pub async fn prepare_blob_sidecar_parts( + &mut self, value: &LocallyProposedValue, bundle: Option<&BlobsBundle>, ) -> eyre::Result<(SignedBeaconBlockHeader, Vec)> { @@ -773,8 +1019,16 @@ where ); let body_root = body.compute_body_root(); + let parent_root = self.resolve_parent_root_for_height(value.height).await?; + let header_message = self - .build_sidecar_header_message(value.height, &self.address, metadata, body_root) + .build_sidecar_header_message( + value.height, + &self.address, + metadata, + body_root, + parent_root, + ) .map_err(|e| eyre::eyre!(e))?; let signing_root = header_message.hash_tree_root(); @@ -949,23 +1203,21 @@ where } // Re-assemble the proposal from its parts with KZG verification and storage - let (value, data, execution_requests, has_blobs) = match self - .assemble_and_store_blobs(parts.clone()) - .await - { - Ok((value, data, execution_requests, has_blobs)) => { - (value, data, execution_requests, has_blobs) - } - Err(e) => { - error!( - height = %self.current_height, - round = %self.current_round, - error = %e, - "Received proposal with invalid blob KZG proofs or storage failure, rejecting" - ); - return Ok(None); - } - }; + let (value, data, execution_requests, has_blobs) = + match self.assemble_and_store_blobs(parts.clone()).await { + Ok((value, data, execution_requests, has_blobs)) => { + (value, data, execution_requests, has_blobs) + } + Err(e) => { + error!( + height = %self.current_height, + round = %self.current_round, + error = %e, + "Received proposal rejected during assembly" + ); + return Ok(None); + } + }; // Track blob rounds for cleanup if has_blobs { @@ -1065,11 +1317,26 @@ where )); } - if metadata.blob_keccak_hashes()[blob_index] != notice.body.blob_keccak { + // Validate or learn keccak hash from notice + // B256::ZERO means we don't 
have the hash yet (synced from pruned peer) + let stored_hash = metadata.blob_keccak_hashes()[blob_index]; + if stored_hash == B256::ZERO { + // Learn the keccak hash from this notice (synced from pruned peer) + if metadata.update_keccak_hash(blob_index, notice.body.blob_keccak) { + debug!( + height = %height, + index = %blob_index, + hash = ?notice.body.blob_keccak, + "Learned keccak hash from archive notice (was placeholder)" + ); + } + } else if stored_hash != notice.body.blob_keccak { return Err(eyre::eyre!( - "ArchiveNotice blob hash mismatch at height {} index {}", + "ArchiveNotice blob hash mismatch at height {} index {}: stored={:?} notice={:?}", height, - blob_index + blob_index, + stored_hash, + notice.body.blob_keccak )); } @@ -1284,7 +1551,7 @@ where ) .await?; - if !blob_sidecars.is_empty() { + let validated_signed_header = if !blob_sidecars.is_empty() { let round_i64 = round.as_i64(); if let Err(e) = @@ -1308,11 +1575,51 @@ where sidecar_count = blob_sidecars.len(), "Commitment count mismatch between metadata and sidecars" ); - self.blob_engine.drop_round(height, round_i64).await.ok(); + // FIX-004: Log cleanup errors instead of silently ignoring with .ok() + if let Err(cleanup_err) = + self.blob_engine.drop_round(height, round_i64).await + { + warn!( + height = %height, + round = %round, + error = %cleanup_err, + "Failed to cleanup blobs after commitment count mismatch" + ); + self.blob_metrics.record_cleanup_failure(); + } self.record_sync_failure(); + self.blob_metrics.record_sync_package_rejected(); return Ok(None); } + // FIX-003: Check for duplicate blob indices - malicious peers could send + // sidecars with the same index to bypass commitment count checks. 
+ let mut seen_indices = HashSet::new(); + for sidecar in &blob_sidecars { + if !seen_indices.insert(sidecar.index) { + error!( + height = %height, + round = %round, + blob_index = %sidecar.index, + "Duplicate blob index detected in sidecars" + ); + if let Err(cleanup_err) = + self.blob_engine.drop_round(height, round_i64).await + { + warn!( + height = %height, + round = %round, + error = %cleanup_err, + "Failed to cleanup blobs after duplicate index detection" + ); + self.blob_metrics.record_cleanup_failure(); + } + self.record_sync_failure(); + self.blob_metrics.record_sync_package_rejected(); + return Ok(None); + } + } + let mut mismatch = false; for sidecar in &blob_sidecars { let index = usize::from(sidecar.index); @@ -1331,40 +1638,137 @@ where } if mismatch { - self.blob_engine.drop_round(height, round_i64).await.ok(); - self.store.delete_blob_metadata_undecided(height, round).await.ok(); + // FIX-002: Log cleanup errors instead of silently ignoring with .ok() + if let Err(cleanup_err) = + self.blob_engine.drop_round(height, round_i64).await + { + warn!( + height = %height, + round = %round, + error = %cleanup_err, + "Failed to cleanup blobs after commitment content mismatch" + ); + self.blob_metrics.record_cleanup_failure(); + } + if let Err(cleanup_err) = + self.store.delete_blob_metadata_undecided(height, round).await + { + warn!( + height = %height, + round = %round, + error = %cleanup_err, + "Failed to delete blob metadata after commitment content mismatch" + ); + self.blob_metrics.record_cleanup_failure(); + } self.record_sync_failure(); + self.blob_metrics.record_sync_package_rejected(); return Ok(None); } - if let Err(e) = self + let header = match self .verify_blob_sidecars(height, &proposer, &value_metadata, &blob_sidecars) .await { + Ok(h) => h, + Err(e) => { + error!( + height = %height, + round = %round, + error = %e, + "Blob sidecar verification failed during sync" + ); + // FIX-002: Log cleanup errors instead of silently ignoring with .ok() + 
if let Err(cleanup_err) = + self.blob_engine.drop_round(height, round_i64).await + { + warn!( + height = %height, + round = %round, + error = %cleanup_err, + "Failed to cleanup blobs after verification failure" + ); + self.blob_metrics.record_cleanup_failure(); + } + self.record_sync_failure(); + self.blob_metrics.record_sync_package_rejected(); + return Ok(None); + } + }; + + if let Err(e) = self.blob_engine.mark_decided(height, round_i64).await { error!( height = %height, round = %round, error = %e, - "Blob sidecar verification failed during sync" + "Failed to mark synced blobs as decided" ); - self.blob_engine.drop_round(height, round_i64).await.ok(); - self.record_sync_failure(); - return Ok(None); + // Continue despite the error – blobs are verified and stored. } - if let Err(e) = self.blob_engine.mark_decided(height, round_i64).await { + Some(header) + } else { + None + }; + + let parent_blob_root = match self.resolve_parent_root_for_height(height).await { + Ok(root) => root, + Err(e) => { error!( height = %height, round = %round, error = %e, - "Failed to mark synced blobs as decided" + "Failed to resolve parent root for synced value" ); - // Continue despite the error – blobs are verified and stored. + self.record_sync_failure(); + return Ok(None); } - } + }; - let parent_blob_root = - if height.as_u64() == 0 { B256::ZERO } else { self.blob_parent_root() }; + // Hardening: Verify that validated signed header's parent_root matches + // store-derived root. This invariant must always hold: the + // proposer's parent_root (from blob sidecars) should equal what we + // derive from our decided store. + // FIX-001: Use hard check instead of debug_assert (doesn't run in release builds). 
+ if let Some(ref signed_header) = validated_signed_header && + signed_header.message.parent_root != parent_blob_root + { + error!( + height = %height, + round = %round, + sidecar_parent_root = ?signed_header.message.parent_root, + store_parent_root = ?parent_blob_root, + "INVARIANT VIOLATION: Validated blob sidecar parent_root does not match \ + store-derived parent_root" + ); + // FIX-001: Cleanup blobs that were already marked decided before rejecting. + // mark_decided was called earlier, so we must drop them now. + if let Err(cleanup_err) = + self.blob_engine.drop_round(height, round.as_i64()).await + { + warn!( + height = %height, + round = %round, + error = %cleanup_err, + "Failed to cleanup blobs after parent_root mismatch" + ); + self.blob_metrics.record_cleanup_failure(); + } + if let Err(cleanup_err) = + self.store.delete_blob_metadata_undecided(height, round).await + { + warn!( + height = %height, + round = %round, + error = %cleanup_err, + "Failed to cleanup blob metadata after parent_root mismatch" + ); + self.blob_metrics.record_cleanup_failure(); + } + self.record_sync_failure(); + self.blob_metrics.record_sync_package_rejected(); + return Ok(None); + } let proposer_index = self .get_validator_set() @@ -1419,32 +1823,192 @@ where self.store_synced_proposal(proposed_value.clone()).await?; + // Process archive notices - continue even if some fail (BUG-006 fix) + // Load Network context: Individual archive notice failures should not + // block sync. The notices are informational for keccak hash learning. 
for notice in archive_notices { - self.handle_archive_notice(notice).await?; + if let Err(e) = self.handle_archive_notice(notice).await { + warn!( + height = %height, + round = %round, + error = %e, + "Archive notice processing failed, continuing sync" + ); + } } Ok(Some(proposed_value)) } - SyncedValuePackage::MetadataOnly { value: _, archive_notices } => { + SyncedValuePackage::MetadataOnly { + value, + archive_notices, + execution_payload_ssz, + execution_requests, + } => { // MetadataOnly is received when blobs have been pruned on the sending peer. - // We process archive notices so we know where to fetch blobs from external - // archives, but we cannot complete the sync from this peer alone. - warn!( - height = %height, - round = %round, - notice_count = archive_notices.len(), - "Received MetadataOnly sync package (blobs pruned on peer), processing archive notices" - ); + // Following the Lighthouse pattern: if execution_payload_ssz is available, + // we can import the block WITHOUT blob sidecars. + // + // This is the key fix for syncing when all validators have pruned blobs. - // Process archive notices so we have the locators stored - for notice in archive_notices { - self.handle_archive_notice(notice).await?; - } + if let Some(payload) = execution_payload_ssz { + // We have payload - can import block without blobs! + info!( + height = %height, + round = %round, + payload_size = payload.len(), + notice_count = archive_notices.len(), + "🔵 SYNC: Processing MetadataOnly WITH payload (blobs pruned, importing without sidecars)" + ); + + let value_metadata = value.metadata.clone(); - // Return None - the syncing peer cannot import this height from us, - // but they now have archive locators to fetch blobs externally. - // Malachite will try another peer or the EL will sync independently. 
- Ok(None) + // Store the execution payload (same as Full path) + self.store_synced_block_data( + height, + round, + payload.clone(), + execution_requests.clone(), + ) + .await?; + + // Resolve parent root for blob metadata + let parent_blob_root = match self.resolve_parent_root_for_height(height).await { + Ok(root) => root, + Err(e) => { + error!( + height = %height, + round = %round, + error = %e, + "Failed to resolve parent root for synced MetadataOnly value" + ); + self.record_sync_failure(); + return Ok(None); + } + }; + + let proposer_index = self + .get_validator_set() + .validators + .iter() + .position(|v| v.address == proposer) + .map(|idx| idx as u64); + + // Create BlobMetadata without actual blob data (pruned) + // For pruned heights, extract keccak hashes from archive notices if available + let blob_metadata = if value_metadata.blob_kzg_commitments.is_empty() { + BlobMetadata::blobless( + height, + parent_blob_root, + &value_metadata.execution_payload_header, + proposer_index, + ) + } else { + // We have commitments but no actual blobs - extract keccak hashes + // from archive notices. Use B256::ZERO for any missing indices. 
+ let blob_count = value_metadata.blob_kzg_commitments.len(); + let mut blob_keccak_hashes = vec![B256::ZERO; blob_count]; + + // Populate keccak hashes from archive notices + for notice in &archive_notices { + let idx = usize::from(notice.body.blob_index); + if idx < blob_count { + blob_keccak_hashes[idx] = notice.body.blob_keccak; + } + } + + BlobMetadata::new( + height, + parent_blob_root, + value_metadata.blob_kzg_commitments.clone(), + blob_keccak_hashes, + value_metadata.execution_payload_header.clone(), + proposer_index, + ) + }; + + // Store metadata and mark as decided + self.put_blob_metadata_undecided(height, round, &blob_metadata).await?; + self.store.mark_blob_metadata_decided(height, round).await?; + + // Update parent root cache + let header = blob_metadata.to_beacon_header(); + let new_root = header.hash_tree_root(); + info!( + height = %height, + old_cache_height = %self.last_blob_sidecar_height, + old_cache_root = ?self.last_blob_sidecar_root, + new_cache_root = ?new_root, + "✅ VALUESYNC: Updated blob parent root cache from MetadataOnly (pruned blobs)" + ); + self.last_blob_sidecar_root = new_root; + self.last_blob_sidecar_height = height; + + // Keep latest_block aligned so parent validation doesn't regress after sync. 
+ let header = value_metadata.execution_payload_header.clone(); + let execution_block = ExecutionBlock { + block_hash: header.block_hash, + block_number: header.block_number, + parent_hash: header.parent_hash, + timestamp: header.timestamp, + prev_randao: load_prev_randao(), + }; + let should_update = self + .latest_block + .map(|blk| blk.block_number < execution_block.block_number) + .unwrap_or(true); + if should_update { + self.latest_block = Some(execution_block); + } + + // Process archive notices - continue even if some fail (BUG-006 fix) + for notice in archive_notices { + if let Err(e) = self.handle_archive_notice(notice).await { + warn!( + height = %height, + round = %round, + error = %e, + "Archive notice processing failed in MetadataOnly, continuing sync" + ); + } + } + + // Build and store proposed value + let proposed_value = ProposedValue { + height, + round, + valid_round: Round::Nil, + proposer, + value, + validity: Validity::Valid, + }; + self.store_synced_proposal(proposed_value.clone()).await?; + + Ok(Some(proposed_value)) + } else { + // No payload - cannot import, just process archive notices + warn!( + height = %height, + round = %round, + notice_count = archive_notices.len(), + "Received MetadataOnly WITHOUT payload (cannot import), processing archive notices" + ); + + // Process archive notices - continue even if some fail (BUG-006 fix) + for notice in archive_notices { + if let Err(e) = self.handle_archive_notice(notice).await { + warn!( + height = %height, + round = %round, + error = %e, + "Archive notice processing failed (no payload), continuing" + ); + } + } + + // Return None - Malachite will try another peer + Ok(None) + } } } } @@ -1579,6 +2143,75 @@ where ); } + // FIX: Send FCU BEFORE newPayload during sync to give EL sync target. + // Without this, EL returns SYNCING indefinitely because it doesn't know where to sync. + // Industry standard (Lighthouse, Prysm, Teku): FCU first, then newPayload. 
+ // Only do this in sync mode (blobs_already_decided) to avoid extra latency in normal + // consensus. FIX-005: Add retry logic for FCU during sync - EL may need time to + // process parent blocks. + let block_hash = execution_payload.payload_inner.payload_inner.block_hash; + #[allow(clippy::collapsible_if)] // Nested if is more readable here + if blobs_already_decided || self.el_degraded { + let fcu_start = Instant::now(); + let mut fcu_backoff = self.execution_retry.initial_backoff; + let fcu_timeout = self.execution_retry.new_payload_sync_timeout; // Use sync timeout for FCU too + + loop { + match notifier.set_latest_forkchoice_state(block_hash).await { + Ok(_) => { + debug!( + height = %height, + block_hash = ?block_hash, + "FCU before newPayload succeeded" + ); + break; + } + Err(e) => { + // Check if it's a SYNCING error - may need retry as EL catches up + let is_syncing = e + .downcast_ref::() + .map(|ee| { + matches!( + ee, + ultramarine_execution::ExecutionError::SyncingForkchoice { .. } + ) + }) + .unwrap_or(false); + + if is_syncing && fcu_start.elapsed() < fcu_timeout { + debug!( + height = %height, + block_hash = ?block_hash, + elapsed = ?fcu_start.elapsed(), + "FCU before newPayload returned SYNCING, retrying..." 
+ ); + sleep(fcu_backoff).await; + fcu_backoff = (fcu_backoff * 2).min(self.execution_retry.max_backoff); + continue; + } + + // Log appropriately based on error type + if is_syncing { + debug!( + height = %height, + block_hash = ?block_hash, + elapsed = ?fcu_start.elapsed(), + "FCU before newPayload timed out with SYNCING, proceeding to newPayload" + ); + } else { + warn!( + height = %height, + block_hash = ?block_hash, + error = %e, + "FCU before newPayload failed with unexpected error" + ); + } + break; + } + } + } + } + let payload_status = notifier .notify_new_block( execution_payload.clone(), @@ -1587,6 +2220,55 @@ where ) .await .map_err(|e| eyre::eyre!("Execution layer new_payload failed: {}", e))?; + let mut execution_pending = false; + // After EL restarts (or heavy load), Engine API can return SYNCING while it catches up on + // internal validation. Treat SYNCING as transient and retry instead of forcing a full + // consensus restart loop. + // Use shorter timeout during sync mode (blobs_already_decided) since FCU already gave EL + // the target. 
+ let sync_timeout = if blobs_already_decided { + self.execution_retry.new_payload_sync_timeout + } else { + self.execution_retry.new_payload_timeout + }; + let payload_status = if matches!(payload_status.status, PayloadStatusEnum::Syncing) { + let start = Instant::now(); + let mut last = payload_status; + let mut backoff = self.execution_retry.initial_backoff; + loop { + if !matches!(last.status, PayloadStatusEnum::Syncing) { + break last; + } + if start.elapsed() > sync_timeout { + self.mark_el_degraded(format!( + "new_payload SYNCING for {:?} at height {} (block_hash={:?}, sync_mode={})", + sync_timeout, + height, + execution_payload.payload_inner.payload_inner.block_hash, + blobs_already_decided + )); + execution_pending = true; + break PayloadStatus::from_status(PayloadStatusEnum::Syncing); + } + warn!( + height = %height, + block_hash = ?execution_payload.payload_inner.payload_inner.block_hash, + "Execution layer new_payload returned SYNCING; retrying" + ); + sleep(backoff).await; + backoff = (backoff * 2).min(self.execution_retry.max_backoff); + last = notifier + .notify_new_block( + execution_payload.clone(), + execution_requests.clone(), + versioned_hashes.clone(), + ) + .await + .map_err(|e| eyre::eyre!("Execution layer new_payload failed: {}", e))?; + } + } else { + payload_status + }; if payload_status.is_invalid() { return Err(eyre::eyre!("Invalid payload status: {}", payload_status.status)); } @@ -1617,7 +2299,9 @@ where let tx_count = payload_inner.transactions.len(); let expected_parent = self.latest_block.as_ref().map(|block| block.block_hash); - if let Some(expected_parent_hash) = expected_parent && + let executed_is_parent = self.executed_height.as_u64().saturating_add(1) == height.as_u64(); + if executed_is_parent && + let Some(expected_parent_hash) = expected_parent && expected_parent_hash != parent_block_hash { return Err(eyre::eyre!( @@ -1628,10 +2312,48 @@ where )); } - let _latest_valid_hash = notifier - 
.set_latest_forkchoice_state(block_hash) - .await - .map_err(|e| eyre::eyre!("Failed to update forkchoice: {}", e))?; + let mut forkchoice_applied = false; + if !execution_pending { + let start = Instant::now(); + let mut backoff = self.execution_retry.initial_backoff; + loop { + match notifier.set_latest_forkchoice_state(block_hash).await { + Ok(_v) => { + forkchoice_applied = true; + break; + } + Err(e) => { + let msg = e.to_string(); + let looks_transient = matches!( + e.downcast_ref::(), + Some(ExecutionError::SyncingForkchoice { .. }) + ); + if looks_transient && + start.elapsed() <= self.execution_retry.forkchoice_timeout + { + warn!( + height = %height, + ?block_hash, + %msg, + "Forkchoice update returned SYNCING/unknown-payload; retrying" + ); + sleep(backoff).await; + backoff = (backoff * 2).min(self.execution_retry.max_backoff); + continue; + } + if looks_transient { + self.mark_el_degraded(format!( + "forkchoice SYNCING for {:?} at height {} (block_hash={:?})", + self.execution_retry.forkchoice_timeout, height, block_hash + )); + execution_pending = true; + break; + } + return Err(eyre::eyre!("Failed to update forkchoice: {}", e)); + } + } + } + } debug!( height = %height, @@ -1667,7 +2389,13 @@ where timestamp: execution_payload.timestamp(), prev_randao: expected_prev_randao, }; + // Always advance consensus-visible head so validators can validate parent links + // even when the execution layer is lagging. 
self.latest_block = Some(execution_block); + if forkchoice_applied { + self.executed_height = height; + self.clear_el_degraded(); + } let archive_notices = Vec::new(); @@ -1691,9 +2419,16 @@ where self.pending_archive_heights.insert(height); } + let outcome_status = if execution_pending { + PayloadStatus::from_status(PayloadStatusEnum::Syncing) + } else { + payload_status + }; + Ok(DecidedOutcome { execution_block, - payload_status, + payload_status: outcome_status, + execution_pending, tx_count, block_bytes: execution_payload_ssz.len(), blob_count, @@ -1830,6 +2565,12 @@ where // Only perform persistence and metadata operations on first commit (not WAL replay) if !is_idempotent_replay { self.store.store_decided_value(&certificate, proposal.value.clone()).await?; + if self.store.get_decided_value(certificate.height).await?.is_none() { + return Err(eyre::eyre!( + "Decided value not persisted at height {}", + certificate.height + )); + } // Phase 4: Three-layer metadata promotion (Layer 1 → Layer 2 → Layer 3) // This follows the architectural principle: Consensus → Ethereum → Blobs @@ -2011,6 +2752,26 @@ where continue; }; + // FIX-007: Get blob count from metadata BEFORE deleting so we can + // record the correct orphaned blob count (not just 1 per round). 
+ let blob_count = match self + .store + .get_blob_metadata_undecided(certificate.height, Round::new(round_u32)) + .await + { + Ok(Some(metadata)) => usize::from(metadata.blob_count()), + Ok(None) => 0, + Err(e) => { + warn!( + height = %certificate.height, + round = round, + error = %e, + "Failed to get blob metadata for orphaned round" + ); + 0 + } + }; + if let Err(e) = self .store .delete_blob_metadata_undecided( @@ -2035,13 +2796,16 @@ where error = %e, "Failed to drop orphaned blobs for failed round" ); + self.blob_metrics.record_cleanup_failure(); // Don't fail commit - this is cleanup - } else { + } else if blob_count > 0 { debug!( height = %certificate.height, round = round, + blob_count = blob_count, "Dropped orphaned blobs for failed round" ); + self.blob_metrics.record_orphaned_blobs_dropped(blob_count); } } } @@ -2259,8 +3023,7 @@ where // Phase 4: Build and store BlobMetadata (Layer 2) as undecided // This MUST be stored before commit can promote it to decided let proposer_index = self.validator_index(&self.address); - let parent_blob_root = - if height.as_u64() == 0 { B256::ZERO } else { self.last_blob_sidecar_root }; + let parent_blob_root = self.resolve_parent_root_for_height(height).await?; let blob_metadata = if commitments.is_empty() { // Blobless block - still need metadata for parent-root chaining @@ -2539,6 +3302,18 @@ where Value::from_bytes(data.clone()) }; + if let Some(metadata) = metadata_opt.as_ref() { + // Validator-side protocol rules for proposal timestamps. 
+ self.validate_proposed_timestamp( + metadata.execution_payload_header.timestamp, + metadata.execution_payload_header.parent_hash, + parts.height, + parts.round, + )?; + } else { + return Err("Missing execution payload metadata for timestamp validation".into()); + } + if has_blobs { let metadata = metadata_opt .as_ref() @@ -2551,7 +3326,7 @@ where parts.round ); - let _signed_header = self + let signed_header = self .verify_blob_sidecars(parts.height, &parts.proposer, metadata, &blob_sidecars) .await?; @@ -2564,8 +3339,7 @@ where // Phase 4: Store BlobMetadata (Layer 2) as undecided after verification let proposer_index = self.validator_index(&parts.proposer); - let parent_blob_root = - if parts.height.as_u64() == 0 { B256::ZERO } else { self.last_blob_sidecar_root }; + let parent_blob_root = signed_header.message.parent_root; let blob_keccak_hashes: Vec = blob_sidecars.iter().map(|sidecar| sidecar.blob_keccak()).collect(); @@ -2594,8 +3368,27 @@ where if !has_blobs && let Some(metadata) = metadata_opt.as_ref() { // Store blobless BlobMetadata (Layer 2) for parent-root chaining let proposer_index = self.validator_index(&parts.proposer); - let parent_blob_root = - if parts.height.as_u64() == 0 { B256::ZERO } else { self.last_blob_sidecar_root }; + let parent_blob_root = if parts.height.as_u64() == 0 { + B256::ZERO + } else { + let prev_height = Height::new(parts.height.as_u64() - 1); + match self.store.get_blob_metadata(prev_height).await { + Ok(Some(prev_metadata)) => prev_metadata.to_beacon_header().hash_tree_root(), + Ok(None) => { + return Err(format!( + "Missing decided BlobMetadata for parent height {}", + prev_height.as_u64() + )); + } + Err(e) => { + return Err(format!( + "Failed to load BlobMetadata for parent height {}: {}", + prev_height.as_u64(), + e + )); + } + } + }; let blob_metadata = BlobMetadata::blobless( parts.height, @@ -2626,6 +3419,54 @@ where Ok((proposed_value, data, execution_requests, has_blobs)) } + fn validate_proposed_timestamp( + 
&self, + proposed_ts: u64, + proposed_parent_hash: BlockHash, + height: Height, + round: Round, + ) -> Result<(), String> { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|_| format!("System time unavailable at height {} round {}", height, round))? + .as_secs(); + + if proposed_ts > now + LOAD_MAX_FUTURE_DRIFT_SECS { + return Err(format!( + "Timestamp too far in future at height {} round {}: proposed_ts={} now={} max_drift={}", + height, round, proposed_ts, now, LOAD_MAX_FUTURE_DRIFT_SECS + )); + } + + let latest_block = self + .latest_block + .as_ref() + .ok_or_else(|| "latest_block is None after initialization".to_string())?; + if proposed_parent_hash != latest_block.block_hash { + return Err(format!( + "Parent hash mismatch at height {} round {}: proposed_parent={} latest_parent={}", + height, round, proposed_parent_hash, latest_block.block_hash + )); + } + let parent_ts = latest_block.timestamp; + + if proposed_ts <= parent_ts { + return Err(format!( + "Timestamp not > parent at height {} round {}: proposed_ts={} parent_ts={}", + height, round, proposed_ts, parent_ts + )); + } + + if proposed_ts < parent_ts + LOAD_MIN_BLOCK_TIME_SECS { + return Err(format!( + "Timestamp violates min block time at height {} round {}: proposed_ts={} parent_ts={} min={}", + height, round, proposed_ts, parent_ts, LOAD_MIN_BLOCK_TIME_SECS + )); + } + + Ok(()) + } + async fn verify_blob_sidecars( &self, height: Height, @@ -2715,30 +3556,18 @@ where parent_root } Ok(None) => { - if prev_height == self.last_blob_sidecar_height { - warn!( - height = %height, - parent_height = %prev_height, - cached_root = ?self.last_blob_sidecar_root, - "⚠️ VALIDATION: Missing decided BlobMetadata for parent height {}; using cached blob root", - prev_height.as_u64() - ); - self.last_blob_sidecar_root - } else { - error!( - height = %height, - parent_height = %prev_height, - cache_height = %self.last_blob_sidecar_height, - cache_root = ?self.last_blob_sidecar_root, - "❌ 
VALIDATION: Cache mismatch - cache points to height {} but need height {}", - self.last_blob_sidecar_height.as_u64(), - prev_height.as_u64() - ); - return Err(format!( - "Missing decided BlobMetadata for parent height {}", - prev_height.as_u64() - )); - } + error!( + height = %height, + parent_height = %prev_height, + cache_height = %self.last_blob_sidecar_height, + cache_root = ?self.last_blob_sidecar_root, + "❌ VALIDATION: Missing decided BlobMetadata for parent height {}", + prev_height.as_u64() + ); + return Err(format!( + "Missing decided BlobMetadata for parent height {}", + prev_height.as_u64() + )); } Err(e) => { error!( @@ -2757,7 +3586,10 @@ where }; if signed_header.message.parent_root != expected_parent_root { - return Err("Beacon header parent_root mismatch".to_string()); + return Err(format!( + "Beacon header parent_root mismatch: expected {:?}, got {:?}", + expected_parent_root, signed_header.message.parent_root + )); } for sidecar in sidecars { diff --git a/crates/consensus/src/state/tests/mod.rs b/crates/consensus/src/state/tests/mod.rs index 4ffea6f..f8035e3 100644 --- a/crates/consensus/src/state/tests/mod.rs +++ b/crates/consensus/src/state/tests/mod.rs @@ -1,22 +1,23 @@ mod support; -use std::collections::HashSet; +use std::{collections::HashSet, str::FromStr, time::Duration}; use alloy_primitives::B256; use bytes::Bytes as NetworkBytes; use malachitebft_app_channel::app::types::{ - LocallyProposedValue, + LocallyProposedValue, PeerId, core::{CommitCertificate, Round, Validity}, }; use ssz::Encode; use support::*; use ultramarine_blob_engine::{BlobEngine, BlobEngineError}; use ultramarine_types::{ - aliases::Bytes as BlobBytes, + aliases::{Bytes as AlloyBytes, Bytes as BlobBytes}, archive::{ArchiveNotice, ArchiveNoticeBody}, blob::{BYTES_PER_BLOB, Blob, BlobsBundle, KzgCommitment, KzgProof}, blob_metadata::BlobMetadata, - engine_api::ExecutionPayloadHeader, + constants::{LOAD_MAX_FUTURE_DRIFT_SECS, LOAD_MIN_BLOCK_TIME_SECS}, + 
engine_api::{ExecutionBlock, ExecutionPayloadHeader, load_prev_randao}, height::Height, signing::Ed25519Provider, sync::SyncedValuePackage, @@ -89,6 +90,7 @@ async fn verify_blob_sidecars_roundtrip_canonical_proof() { let (_signed_header, sidecars) = state .prepare_blob_sidecar_parts(&locally_proposed, Some(&bundle)) + .await .expect("prepare sidecars"); assert_eq!(sidecars.len(), commitments.len()); @@ -216,6 +218,8 @@ async fn load_blob_metadata_for_round_falls_back_to_decided() { async fn propose_value_with_blobs_stores_blob_metadata() { let mock_engine = MockBlobEngine::default(); let (mut state, _tmp) = build_state(mock_engine, Height::new(1)); + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); let payload = sample_execution_payload_v3(); let requests_hash = Some(ExecutionPayloadHeader::compute_requests_hash(&[] as &[BlobBytes])); @@ -225,7 +229,14 @@ async fn propose_value_with_blobs_stores_blob_metadata() { let metadata_before = state.store.get_blob_metadata_undecided(Height::new(1), Round::new(0)).await.expect("get"); assert!(metadata_before.is_none()); - assert_eq!(state.last_blob_sidecar_root, B256::ZERO); + let genesis = state + .store + .get_blob_metadata(Height::new(0)) + .await + .expect("get genesis") + .expect("genesis metadata"); + let expected_parent_root = genesis.to_beacon_header().hash_tree_root(); + assert_eq!(state.last_blob_sidecar_root, expected_parent_root); state .propose_value_with_blobs( @@ -245,28 +256,36 @@ async fn propose_value_with_blobs_stores_blob_metadata() { .await .expect("get") .expect("metadata"); - assert_eq!(stored.height(), Height::new(1)); assert_eq!(stored.blob_count(), 1); - assert_eq!(stored.parent_blob_root(), B256::ZERO); + assert_eq!(stored.parent_blob_root(), expected_parent_root); assert_eq!(stored.blob_kzg_commitments(), bundle.commitments.as_slice()); assert_eq!(stored.execution_payload_header(), 
&expected_header); assert_eq!(stored.proposer_index_hint(), Some(0)); - assert_eq!(state.last_blob_sidecar_root, B256::ZERO); + assert_eq!(state.last_blob_sidecar_root, expected_parent_root); } #[tokio::test] async fn propose_blobless_value_uses_parent_root_hint() { let mock_engine = MockBlobEngine::default(); let (mut state, _tmp) = build_state(mock_engine, Height::new(2)); - let parent_root = B256::from([7u8; 32]); - state.last_blob_sidecar_root = parent_root; + let prev_height = Height::new(1); + let parent_metadata = sample_blob_metadata(prev_height, B256::from([7u8; 32])); + state + .store + .put_blob_metadata_undecided(prev_height, Round::new(0), &parent_metadata) + .await + .expect("store parent metadata"); + state.store.mark_blob_metadata_decided(prev_height, Round::new(0)).await.expect("mark decided"); + let parent_root = parent_metadata.to_beacon_header().hash_tree_root(); let payload = sample_execution_payload_v3(); let requests_hash = Some(ExecutionPayloadHeader::compute_requests_hash(&[] as &[BlobBytes])); let expected_header = ExecutionPayloadHeader::from_payload(&payload, requests_hash).expect("build header"); + let initial_cache_root = state.last_blob_sidecar_root; + state .propose_value_with_blobs( Height::new(2), @@ -291,7 +310,7 @@ async fn propose_blobless_value_uses_parent_root_hint() { assert_eq!(stored.parent_blob_root(), parent_root); assert_eq!(stored.execution_payload_header(), &expected_header); assert_eq!(stored.proposer_index_hint(), Some(0)); - assert_eq!(state.last_blob_sidecar_root, parent_root); + assert_eq!(state.last_blob_sidecar_root, initial_cache_root); } #[tokio::test] @@ -447,10 +466,74 @@ async fn commit_promotes_blobless_metadata_updates_parent_root() { assert!(mock_engine.verify_calls().is_empty()); } +#[tokio::test] +async fn process_decided_certificate_marks_el_degraded_on_syncing() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(10)); + let height = 
Height::new(10); + let round = Round::new(0); + + state.set_execution_retry_config(ExecutionRetryConfig { + new_payload_timeout: Duration::from_millis(2), + new_payload_sync_timeout: Duration::from_millis(1), + forkchoice_timeout: Duration::from_millis(2), + initial_backoff: Duration::from_millis(1), + max_backoff: Duration::from_millis(1), + }); + + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); + seed_decided_blob_metadata(&mut state, Height::new(9), B256::ZERO) + .await + .expect("seed parent metadata"); + + let (proposed, _metadata, sidecars, _bundle, payload_bytes) = + propose_blobbed_value(&mut state, height, round, 1).await; + + mock_engine + .verify_and_store(height, round.as_i64(), &sidecars) + .await + .expect("seed undecided blobs"); + + state + .store + .store_undecided_block_data(height, round, payload_bytes.clone(), Vec::new()) + .await + .expect("store block bytes"); + + let certificate = CommitCertificate { + height, + round, + value_id: proposed.value.id(), + commit_signatures: Vec::new(), + }; + + let syncing_status = PayloadStatus::from_status(PayloadStatusEnum::Syncing); + let mut notifier = MockExecutionNotifier::with_payload_statuses(vec![ + syncing_status.clone(), + syncing_status.clone(), + syncing_status, + ]); + + let outcome = state + .process_decided_certificate(&certificate, payload_bytes, &mut notifier) + .await + .expect("process decided certificate"); + + assert!(outcome.execution_pending, "execution should be pending while EL syncs"); + assert!(state.is_el_degraded(), "state should be marked EL-degraded"); + assert!( + state.store.get_decided_value(height).await.expect("decided value").is_some(), + "decided value should be persisted even when execution is pending" + ); +} + #[tokio::test] async fn rebuild_blob_sidecars_for_restream_reconstructs_headers() { let mock_engine = MockBlobEngine::default(); - let (state, _tmp) = 
build_state(mock_engine, Height::new(1)); + let (mut state, _tmp) = build_state(mock_engine, Height::new(1)); + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); let height = Height::new(1); let round = Round::new(0); @@ -463,7 +546,7 @@ async fn rebuild_blob_sidecars_for_restream_reconstructs_headers() { let locally_proposed = LocallyProposedValue::new(height, round, value); let (_signed_header, sidecars) = - state.prepare_blob_sidecar_parts(&locally_proposed, Some(&bundle)).expect("prepare"); + state.prepare_blob_sidecar_parts(&locally_proposed, Some(&bundle)).await.expect("prepare"); let blob_hashes = bundle.blob_keccak_hashes(); let blob_metadata = BlobMetadata::new( @@ -549,6 +632,33 @@ async fn process_decided_certificate_rejects_mismatched_prev_randao() { assert!(err.to_string().contains("prev_randao mismatch"), "unexpected error: {err}"); } +#[tokio::test] +async fn propose_value_rejects_invalid_execution_requests() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine, Height::new(1)); + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); + + let height = Height::new(1); + let round = Round::new(0); + state.current_height = height; + state.current_round = round; + + let payload = sample_execution_payload_v3(); + let payload_bytes = NetworkBytes::from(payload.as_ssz_bytes()); + let invalid_requests = vec![ + AlloyBytes::copy_from_slice(&[0x05, 0xAA]), + AlloyBytes::copy_from_slice(&[0x04, 0xBB]), + ]; + + let err = state + .propose_value_with_blobs(height, round, payload_bytes, &payload, &invalid_requests, None) + .await + .expect_err("invalid execution requests must be rejected"); + + assert!(err.to_string().contains("Invalid execution requests"), "unexpected error: {err}"); +} + #[tokio::test] async fn 
commit_cleans_failed_round_blob_metadata() { let mock_engine = MockBlobEngine::default(); @@ -630,6 +740,8 @@ async fn multi_round_proposal_isolation_and_commit() { let mock_engine = MockBlobEngine::default(); let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(1)); let height = Height::new(1); + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); // Propose at round 0 state.current_height = height; @@ -878,6 +990,8 @@ async fn commit_fails_fast_if_blob_metadata_missing() { async fn parent_root_chain_continuity_across_mixed_blocks() { let mock_engine = MockBlobEngine::default(); let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(1)); + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); // Height 1: Blobbed block let payload_h1 = sample_execution_payload_v3(); @@ -1032,8 +1146,9 @@ async fn parent_root_chain_continuity_across_mixed_blocks() { // Verify full chain: h1 → h2 (blobless) → h3 let decided_h1 = state.store.get_blob_metadata(Height::new(1)).await.expect("d1").expect("m1"); let decided_h2 = state.store.get_blob_metadata(Height::new(2)).await.expect("d2").expect("m2"); + let decided_h0 = state.store.get_blob_metadata(Height::new(0)).await.expect("d0").expect("m0"); - assert_eq!(decided_h1.parent_blob_root(), B256::ZERO); + assert_eq!(decided_h1.parent_blob_root(), decided_h0.to_beacon_header().hash_tree_root()); assert_eq!(decided_h2.parent_blob_root(), decided_h1.to_beacon_header().hash_tree_root()); assert_eq!(meta_h3.parent_blob_root(), decided_h2.to_beacon_header().hash_tree_root()); } @@ -1164,6 +1279,212 @@ async fn proposer_rotation_updates_metadata_hint() { } } +#[tokio::test] +async fn cleanup_stale_round_blobs_removes_old_rounds() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = 
build_state(mock_engine.clone(), Height::new(1)); + let height = Height::new(1); + + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); + + // Create blobs at rounds 0, 1, 2, 3 + for round_num in 0..=3u32 { + state.current_height = height; + state.current_round = Round::new(round_num); + let payload = sample_execution_payload_v3(); + let bundle = sample_blob_bundle(1); + state + .propose_value_with_blobs( + height, + Round::new(round_num), + NetworkBytes::new(), + &payload, + &[], + Some(&bundle), + ) + .await + .expect("propose"); + + state + .store + .store_undecided_block_data( + height, + Round::new(round_num), + NetworkBytes::from_static(b"block"), + Vec::new(), + ) + .await + .expect("store block data"); + } + + // Verify all rounds are tracked in blob_rounds + assert_eq!(state.blob_rounds.get(&height).unwrap().len(), 4); + + // At round 6 with default retention=3, cutoff = 6-3 = 3 + // Should cleanup rounds < 3, i.e., rounds 0, 1, 2 + let cleaned = state.cleanup_stale_round_blobs(height, Round::new(6)).await.expect("cleanup"); + + assert_eq!(cleaned, 3, "Should clean 3 stale rounds"); + + // Only round 3 should remain in tracking + let remaining = state.blob_rounds.get(&height).unwrap(); + assert_eq!(remaining.len(), 1); + assert!(remaining.contains(&3)); + + // Verify drop_round was called for rounds 0, 1, 2 + let drop_calls = mock_engine.drop_calls(); + assert!(drop_calls.contains(&(height, 0)), "Should drop round 0"); + assert!(drop_calls.contains(&(height, 1)), "Should drop round 1"); + assert!(drop_calls.contains(&(height, 2)), "Should drop round 2"); + assert!(!drop_calls.contains(&(height, 3)), "Should NOT drop round 3"); + + for round_num in 0..=2u32 { + let round = Round::new(round_num); + assert!( + state + .store + .get_undecided_proposal(height, round) + .await + .expect("load proposal") + .is_none(), + "proposal should be removed for stale round 
{}", + round_num + ); + assert!( + state.store.get_block_data(height, round).await.expect("load block").is_none(), + "block data should be removed for stale round {}", + round_num + ); + assert!( + state + .store + .get_blob_metadata_undecided(height, round) + .await + .expect("load metadata") + .is_none(), + "metadata should be removed for stale round {}", + round_num + ); + } + + let retained_round = Round::new(3); + assert!( + state + .store + .get_undecided_proposal(height, retained_round) + .await + .expect("load proposal") + .is_some(), + "proposal should remain for retained round" + ); + assert!( + state.store.get_block_data(height, retained_round).await.expect("load block").is_some(), + "block data should remain for retained round" + ); + assert!( + state + .store + .get_blob_metadata_undecided(height, retained_round) + .await + .expect("load metadata") + .is_some(), + "metadata should remain for retained round" + ); +} + +#[tokio::test] +async fn cleanup_stale_round_blobs_no_cleanup_when_retention_window_covers_all() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(1)); + let height = Height::new(1); + + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); + + // Create blobs at rounds 0, 1 + for round_num in 0..=1u32 { + state.current_height = height; + state.current_round = Round::new(round_num); + let payload = sample_execution_payload_v3(); + let bundle = sample_blob_bundle(1); + state + .propose_value_with_blobs( + height, + Round::new(round_num), + NetworkBytes::new(), + &payload, + &[], + Some(&bundle), + ) + .await + .expect("propose"); + + state + .store + .store_undecided_block_data( + height, + Round::new(round_num), + NetworkBytes::from_static(b"block"), + Vec::new(), + ) + .await + .expect("store block data"); + } + + // At round 3 with retention=3, cutoff = 3-3 = 0 + // 
Nothing should be cleaned since rounds 0, 1 are >= cutoff(0) is false for round 0 + // Actually cutoff < 0 would return early, and cutoff = 0 means rounds < 0 cleaned + // Let's test with round 2: cutoff = 2-3 = -1, so nothing cleaned + let cleaned = state.cleanup_stale_round_blobs(height, Round::new(2)).await.expect("cleanup"); + + assert_eq!(cleaned, 0, "No rounds should be cleaned"); + assert_eq!(state.blob_rounds.get(&height).unwrap().len(), 2); + assert!(mock_engine.drop_calls().is_empty()); + + for round_num in 0..=1u32 { + let round = Round::new(round_num); + assert!( + state + .store + .get_undecided_proposal(height, round) + .await + .expect("load proposal") + .is_some(), + "proposal should remain for round {}", + round_num + ); + assert!( + state.store.get_block_data(height, round).await.expect("load block").is_some(), + "block data should remain for round {}", + round_num + ); + assert!( + state + .store + .get_blob_metadata_undecided(height, round) + .await + .expect("load metadata") + .is_some(), + "metadata should remain for round {}", + round_num + ); + } +} + +#[tokio::test] +async fn cleanup_stale_round_blobs_handles_empty_state() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(1)); + + // No blobs created, should not panic + let cleaned = + state.cleanup_stale_round_blobs(Height::new(1), Round::new(10)).await.expect("cleanup"); + + assert_eq!(cleaned, 0); + assert!(mock_engine.drop_calls().is_empty()); +} + #[tokio::test] async fn get_blobs_with_status_check_returns_pruned_error() { let mock_engine = MockBlobEngine::default(); @@ -1225,3 +1546,624 @@ async fn get_blobs_with_status_check_returns_pruned_error() { other => panic!("expected BlobsPruned error, got {:?}", other), } } + +/// Regression test: Ensure `process_synced_package` uses the store-derived parent_root, +/// NOT the in-memory cache. 
This test sets the cache to a WRONG value and verifies +/// that the written BlobMetadata.parent_blob_root matches the store, not the corrupted cache. +/// +/// Background: A bug existed where `process_synced_package` used `self.blob_parent_root()` +/// (the cache) instead of loading from the store. This caused nodes to diverge when +/// the cache was stale, leading to chain halts during high-traffic scenarios. +/// +/// This test uses a BLOBLESS sync to isolate the parent_root logic without blob verification +/// complexity. +#[tokio::test] +async fn sync_path_uses_store_not_cache_for_parent_root() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(0)); + + // Seed genesis metadata + state.store.seed_genesis_blob_metadata().await.expect("seed genesis"); + state.hydrate_blob_parent_root().await.expect("hydrate"); + + // Commit height 1 (blobless) to establish a known parent + let height_1 = Height::new(1); + let round = Round::new(0); + + // Use helper to create decided blobless metadata at h1 + let genesis_root = state.blob_parent_root(); + let correct_parent_root = + seed_decided_blob_metadata(&mut state, height_1, genesis_root).await.expect("seed h1"); + + // Update state to point to h1 + state.last_blob_sidecar_root = correct_parent_root; + state.last_blob_sidecar_height = height_1; + + // CORRUPT the in-memory cache to a WRONG value + let wrong_root = B256::from([0xDE; 32]); + state.last_blob_sidecar_root = wrong_root; + + // Verify cache is corrupted + assert_eq!(state.blob_parent_root(), wrong_root); + assert_ne!(wrong_root, correct_parent_root, "test setup: roots must differ"); + + // Now sync height 2 (blobless) via process_synced_package + let height_2 = Height::new(2); + state.current_height = height_2; + state.current_round = round; + + // Build a blobless value for h2 + let payload_h2 = sample_execution_payload_v3(); + let payload_bytes_h2 = 
NetworkBytes::from(payload_h2.as_ssz_bytes()); + let metadata_h2 = ValueMetadata::new(sample_execution_payload_header(), vec![]); // No blobs + let value_h2 = ultramarine_types::value::Value::new(metadata_h2.clone()); + + // Store the payload for sync + state + .store + .store_undecided_block_data(height_2, round, payload_bytes_h2.clone(), Vec::new()) + .await + .expect("store h2 payload"); + + // Process the synced value - this should use STORE, not CACHE + let package = SyncedValuePackage::Full { + value: value_h2, + execution_payload_ssz: payload_bytes_h2.clone(), + blob_sidecars: vec![], // No blobs + execution_requests: Vec::new(), + archive_notices: Vec::new(), + }; + + let result = state + .process_synced_package(height_2, round, state.address, package) + .await + .expect("process sync") + .expect("sync succeeded"); + + assert_eq!(result.height, height_2); + + // THE KEY ASSERTION: The written BlobMetadata must use the STORE-derived parent_root, + // NOT the corrupted cache value + let synced_metadata = + state.store.get_blob_metadata(height_2).await.expect("load h2").expect("h2 metadata"); + + assert_eq!( + synced_metadata.parent_blob_root(), + correct_parent_root, + "BUG: process_synced_package used cache ({:?}) instead of store ({:?})", + wrong_root, + correct_parent_root + ); + assert_ne!( + synced_metadata.parent_blob_root(), + wrong_root, + "INVARIANT VIOLATED: parent_root must NOT match corrupted cache" + ); +} + +/// Regression test for BUG-002: get_earliest_height() was returning the pruned minimum +/// instead of genesis height. The fix makes get_earliest_height() return Height(0) if +/// genesis metadata exists in BLOB_METADATA_DECIDED_TABLE. +/// +/// This ensures that peers can sync from genesis even after pruning operations have +/// removed older blob data, following the Lighthouse pattern where beacon blocks +/// are kept forever. 
+#[tokio::test] +async fn test_history_min_height_returns_genesis_after_pruning() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine, Height::new(0)); + + // Step 1: Seed genesis metadata + state.store.seed_genesis_blob_metadata().await.expect("seed genesis blob metadata"); + + // Step 2: Add blocks up to height 5 + let mut parent_root = BlobMetadata::genesis().to_beacon_header().hash_tree_root(); + for h in 1..=5u64 { + parent_root = seed_decided_blob_metadata(&mut state, Height::new(h), parent_root) + .await + .unwrap_or_else(|_| panic!("seed decided blob metadata at height {}", h)); + } + + // Step 3: Verify get_earliest_height() returns Height(0) even with blocks up to height 5 + let earliest = state.get_earliest_height().await; + assert_eq!( + earliest, + Height::new(0), + "BUG-002: get_earliest_height() should return genesis (Height 0) when genesis metadata exists" + ); +} + +/// Test that reorg/fork handling correctly drops orphaned blobs. +/// When a different round is committed, all other rounds' blobs at that height should be cleaned. 
+#[tokio::test] +async fn test_reorg_drops_orphaned_blobs() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(1)); + let height = Height::new(1); + + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); + + // Create blobs at multiple rounds (simulating a fork scenario) + let rounds = [Round::new(0), Round::new(1), Round::new(2)]; + for &round in &rounds { + state.current_height = height; + state.current_round = round; + let payload = sample_execution_payload_v3(); + let bundle = sample_blob_bundle(1); + state + .propose_value_with_blobs( + height, + round, + NetworkBytes::new(), + &payload, + &[], + Some(&bundle), + ) + .await + .expect("propose"); + + state + .store + .store_undecided_block_data( + height, + round, + NetworkBytes::from_static(b"block"), + Vec::new(), + ) + .await + .expect("store block data"); + + // Register round in blob_rounds for tracking + state.blob_rounds.entry(height).or_insert_with(HashSet::new).insert(round.as_i64()); + } + + // Verify all rounds have metadata + for &round in &rounds { + assert!( + state.store.get_blob_metadata_undecided(height, round).await.expect("get").is_some(), + "Round {:?} should have metadata", + round + ); + } + + // Commit round 1 (middle round wins the fork) + let winning_round = Round::new(1); + let value_metadata = sample_value_metadata(1); + let value = Value::new(value_metadata); + let proposal = ProposedValue { + height, + round: winning_round, + valid_round: Round::Nil, + proposer: state.address, + value: value.clone(), + validity: Validity::Valid, + }; + + state.store.store_undecided_proposal(proposal.clone()).await.expect("store proposal"); + + let certificate = CommitCertificate { + height, + round: winning_round, + value_id: proposal.value.id(), + commit_signatures: Vec::new(), + }; + + 
state.commit(certificate).await.expect("commit"); + + // Winning round should be promoted to decided + assert!( + state.store.get_blob_metadata(height).await.expect("get").is_some(), + "Winning round metadata should be promoted to decided" + ); + + // Orphaned rounds (0 and 2) should be cleaned up + assert!( + state + .store + .get_blob_metadata_undecided(height, Round::new(0)) + .await + .expect("get") + .is_none(), + "Round 0 (orphaned) should be cleaned up" + ); + assert!( + state + .store + .get_blob_metadata_undecided(height, Round::new(2)) + .await + .expect("get") + .is_none(), + "Round 2 (orphaned) should be cleaned up" + ); + + // Verify blob engine received drop calls for orphaned rounds + let drop_calls = mock_engine.drop_calls(); + assert!(drop_calls.contains(&(height, 0)), "Blob engine should drop round 0"); + assert!(drop_calls.contains(&(height, 2)), "Blob engine should drop round 2"); + assert!(!drop_calls.contains(&(height, 1)), "Blob engine should NOT drop winning round 1"); + + // FIX-007: Verify orphaned_blobs_dropped metric is recorded correctly + // Each orphaned round (0 and 2) had 1 blob each, so total should be 2 + let metrics = state.blob_metrics.snapshot(); + assert_eq!( + metrics.orphaned_blobs_dropped, 2, + "Should record 2 orphaned blobs dropped (1 from round 0 + 1 from round 2)" + ); +} + +/// Test that sync rejects packages with fewer sidecars than claimed in metadata. 
+#[tokio::test] +async fn test_sync_rejects_partial_sidecars() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(1)); + + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); + + let height = Height::new(1); + let round = Round::new(0); + state.current_height = height; + state.current_round = round; + + // Create a valid package with 3 blobs, then truncate sidecars to 1 + let payload = sample_execution_payload_v3(); + let payload_bytes = NetworkBytes::from(payload.as_ssz_bytes()); + let bundle = sample_blob_bundle(3); // Create 3 blobs + + let header = ExecutionPayloadHeader::from_payload(&payload, None).expect("build header"); + let value_metadata = ValueMetadata::new(header.clone(), bundle.commitments.clone()); + let value = Value::new(value_metadata); + + // Store payload for sync + state + .store + .store_undecided_block_data(height, round, payload_bytes.clone(), Vec::new()) + .await + .expect("store payload"); + + // Prepare valid sidecars first, then truncate to create mismatch + let locally_proposed = LocallyProposedValue::new(height, round, value.clone()); + let (_signed_header, mut full_sidecars) = state + .prepare_blob_sidecar_parts(&locally_proposed, Some(&bundle)) + .await + .expect("prepare sidecars"); + + assert_eq!(full_sidecars.len(), 3, "Should have 3 sidecars"); + + // Truncate to only 1 sidecar - simulating partial sidecar receipt + full_sidecars.truncate(1); + let partial_sidecars = full_sidecars; + + // The package has mismatched sidecar count vs metadata + let package = SyncedValuePackage::Full { + value, + execution_payload_ssz: payload_bytes, + blob_sidecars: partial_sidecars, // Only 1 sidecar but metadata claims 3 + execution_requests: Vec::new(), + archive_notices: Vec::new(), + }; + + // Process should return None (rejection) due to count mismatch + let result = state + 
.process_synced_package(height, round, state.address, package) + .await + .expect("process should not error"); + + assert!( + result.is_none(), + "Sync should reject package with partial sidecars (fewer than metadata claims)" + ); + + // FIX-007: Verify sync_packages_rejected metric is recorded + let metrics = state.blob_metrics.snapshot(); + assert_eq!( + metrics.sync_packages_rejected, 1, + "Should record 1 rejected sync package due to partial sidecars" + ); +} + +/// Test that sync rejects packages with duplicate blob indices. +/// This tests the FIX-003 duplicate index check. +#[tokio::test] +async fn test_sync_rejects_duplicate_indices() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(1)); + + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); + + let height = Height::new(1); + let round = Round::new(0); + state.current_height = height; + state.current_round = round; + + // Create a valid package first, then tamper with indices + let payload = sample_execution_payload_v3(); + let payload_bytes = NetworkBytes::from(payload.as_ssz_bytes()); + let bundle = sample_blob_bundle(2); + + let header = ExecutionPayloadHeader::from_payload(&payload, None).expect("build header"); + let value_metadata = ValueMetadata::new(header, bundle.commitments.clone()); + let value = Value::new(value_metadata); + + // Store payload for sync + state + .store + .store_undecided_block_data(height, round, payload_bytes.clone(), Vec::new()) + .await + .expect("store payload"); + + // Prepare valid sidecars first + let locally_proposed = LocallyProposedValue::new(height, round, value.clone()); + let (_signed_header, mut sidecars) = state + .prepare_blob_sidecar_parts(&locally_proposed, Some(&bundle)) + .await + .expect("prepare sidecars"); + + assert_eq!(sidecars.len(), 2, "Should have 2 sidecars"); + + // Tamper: set 
both sidecars to have the same index (index 0) + sidecars[1].index = 0; // Duplicate index! + + let package = SyncedValuePackage::Full { + value, + execution_payload_ssz: payload_bytes, + blob_sidecars: sidecars, + execution_requests: Vec::new(), + archive_notices: Vec::new(), + }; + + // Process should return None (rejection) due to duplicate indices + let result = state + .process_synced_package(height, round, state.address, package) + .await + .expect("process should not error"); + + assert!(result.is_none(), "Sync should reject package with duplicate blob indices"); + + // FIX-007: Verify sync_packages_rejected metric is recorded + let metrics = state.blob_metrics.snapshot(); + assert_eq!( + metrics.sync_packages_rejected, 1, + "Should record 1 rejected sync package due to duplicate indices" + ); +} + +/// Test sequential multi-height sync chain continuity - verifies that sync operations +/// maintain parent root chain integrity across multiple heights from different proposers. +/// Note: This tests sequential processing, not true concurrency (which would require +/// Arc>). +/// +/// FIXME: This test has structural issues - process_synced_package validation +/// rejects packages that don't match EL verification. Needs refactoring to +/// properly mock the execution layer or use a different approach. 
+#[ignore = "Needs refactoring to properly handle EL verification in sync flow"] +#[tokio::test] +async fn test_sequential_multi_height_sync_chain_continuity() { + use ultramarine_types::signing::PrivateKey; + + // Setup multiple validator keys to simulate different peers + let private_keys = + [PrivateKey::from([1u8; 32]), PrivateKey::from([2u8; 32]), PrivateKey::from([3u8; 32])]; + let validators: Vec = + private_keys.iter().map(|key| Validator::new(key.public_key(), 1)).collect(); + + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine.clone(), Height::new(0)); + + state.store.seed_genesis_blob_metadata().await.expect("seed genesis metadata"); + state.hydrate_blob_parent_root().await.expect("hydrate parent root"); + + // Process each height sequentially - must do one at a time because preparing + // sidecars for height N+1 requires decided metadata for height N to exist. + for h in 1..=3u64 { + let height = Height::new(h); + let round = Round::new(0); + let peer_idx = (h as usize - 1) % validators.len(); + let proposer = validators[peer_idx].address; + + // Set state to current height + state.current_height = height; + state.current_round = round; + + // Prepare sync package + let payload = sample_execution_payload_v3(); + let payload_bytes = NetworkBytes::from(payload.as_ssz_bytes()); + let bundle = sample_blob_bundle(1); + + let header = ExecutionPayloadHeader::from_payload(&payload, None).expect("build header"); + let value_metadata = ValueMetadata::new(header, bundle.commitments.clone()); + let value = Value::new(value_metadata); + + // Store payload for this height + state + .store + .store_undecided_block_data(height, round, payload_bytes.clone(), Vec::new()) + .await + .expect("store payload"); + + // Prepare sidecars - this needs parent height's decided metadata to exist + let locally_proposed = LocallyProposedValue::new(height, round, value.clone()); + let (_signed_header, sidecars) = state + 
.prepare_blob_sidecar_parts(&locally_proposed, Some(&bundle)) + .await + .expect("prepare sidecars"); + + // Create and process sync package + let package = SyncedValuePackage::Full { + value, + execution_payload_ssz: payload_bytes, + blob_sidecars: sidecars, + execution_requests: Vec::new(), + archive_notices: Vec::new(), + }; + + let result = state + .process_synced_package(height, round, proposer, package) + .await + .expect("process should succeed") + .expect("sync should return value"); + + assert_eq!(result.height, height, "Synced height should match"); + assert_eq!(result.round, round, "Synced round should match"); + assert_eq!(result.proposer, proposer, "Proposer should match"); + } + + // Verify all heights were synced correctly + for h in 1..=3u64 { + let height = Height::new(h); + assert!( + state.store.get_blob_metadata(height).await.expect("load").is_some(), + "Height {} should have decided metadata", + h + ); + } + + // Verify parent root chain is continuous + let genesis_root = BlobMetadata::genesis().to_beacon_header().hash_tree_root(); + let h1_meta = state.store.get_blob_metadata(Height::new(1)).await.expect("load").expect("h1"); + let h2_meta = state.store.get_blob_metadata(Height::new(2)).await.expect("load").expect("h2"); + let h3_meta = state.store.get_blob_metadata(Height::new(3)).await.expect("load").expect("h3"); + + assert_eq!(h1_meta.parent_blob_root(), genesis_root, "Height 1 should chain from genesis"); + assert_eq!( + h2_meta.parent_blob_root(), + h1_meta.to_beacon_header().hash_tree_root(), + "Height 2 should chain from height 1" + ); + assert_eq!( + h3_meta.parent_blob_root(), + h2_meta.to_beacon_header().hash_tree_root(), + "Height 3 should chain from height 2" + ); +} + +// ============================================================================ +// Timestamp validation tests (BUG-011 fix) +// ============================================================================ + +fn test_peer_id() -> PeerId { + 
PeerId::from_str("12D3KooWHRyfTBKcjkqjNk5UZarJhzT7rXZYfr4DmaCWJgen62Xk").expect("valid peer id") +} + +fn set_latest_block(state: &mut State, block_hash: B256, timestamp: u64) { + state.latest_block = Some(ExecutionBlock { + block_hash, + block_number: 0, + parent_hash: B256::ZERO, + timestamp, + prev_randao: load_prev_randao(), + }); +} + +async fn send_payload_as_proposal( + state: &mut State, + payload: alloy_rpc_types_engine::ExecutionPayloadV3, +) -> bool { + let height = state.current_height; + let round = state.current_round; + let payload_bytes = NetworkBytes::from(payload.as_ssz_bytes()); + let proposed = state + .propose_value_with_blobs(height, round, payload_bytes.clone(), &payload, &[], None) + .await + .expect("propose value"); + + let msgs: Vec<_> = state.stream_proposal(proposed, payload_bytes, None, &[], None).collect(); + let peer_id = test_peer_id(); + + for msg in msgs { + if state.received_proposal_part(peer_id, msg).await.expect("received proposal").is_some() { + return true; + } + } + false +} + +#[tokio::test] +async fn timestamp_validation_rejects_future_drift() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine, Height::new(1)); + state.store.seed_genesis_blob_metadata().await.expect("seed genesis"); + state.hydrate_blob_parent_root().await.expect("hydrate"); + + let now = + std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).expect("time").as_secs(); + let parent_hash = B256::from([2u8; 32]); + let parent_ts = now.saturating_sub(1); + set_latest_block(&mut state, parent_hash, parent_ts); + + let mut payload = sample_execution_payload_v3(); + payload.payload_inner.payload_inner.parent_hash = parent_hash; + payload.payload_inner.payload_inner.block_number = 1; + payload.payload_inner.payload_inner.timestamp = now + LOAD_MAX_FUTURE_DRIFT_SECS + 5; + + let accepted = send_payload_as_proposal(&mut state, payload).await; + assert!(!accepted, "proposal should be rejected for future 
drift"); +} + +#[tokio::test] +async fn timestamp_validation_rejects_parent_hash_mismatch() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine, Height::new(1)); + state.store.seed_genesis_blob_metadata().await.expect("seed genesis"); + state.hydrate_blob_parent_root().await.expect("hydrate"); + + let now = + std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).expect("time").as_secs(); + let parent_hash = B256::from([2u8; 32]); + let parent_ts = now.saturating_sub(1); + set_latest_block(&mut state, parent_hash, parent_ts); + + let mut payload = sample_execution_payload_v3(); + payload.payload_inner.payload_inner.parent_hash = B256::from([3u8; 32]); // mismatch + payload.payload_inner.payload_inner.block_number = 1; + payload.payload_inner.payload_inner.timestamp = parent_ts + LOAD_MIN_BLOCK_TIME_SECS; + + let accepted = send_payload_as_proposal(&mut state, payload).await; + assert!(!accepted, "proposal should be rejected for parent hash mismatch"); +} + +#[tokio::test] +async fn timestamp_validation_rejects_not_strictly_increasing() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine, Height::new(1)); + state.store.seed_genesis_blob_metadata().await.expect("seed genesis"); + state.hydrate_blob_parent_root().await.expect("hydrate"); + + let now = + std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).expect("time").as_secs(); + let parent_hash = B256::from([2u8; 32]); + let parent_ts = now; + set_latest_block(&mut state, parent_hash, parent_ts); + + let mut payload = sample_execution_payload_v3(); + payload.payload_inner.payload_inner.parent_hash = parent_hash; + payload.payload_inner.payload_inner.block_number = 1; + payload.payload_inner.payload_inner.timestamp = parent_ts; + + let accepted = send_payload_as_proposal(&mut state, payload).await; + assert!(!accepted, "proposal should be rejected for non-increasing timestamp"); +} + 
+#[tokio::test] +async fn timestamp_validation_accepts_valid_timestamp() { + let mock_engine = MockBlobEngine::default(); + let (mut state, _tmp) = build_state(mock_engine, Height::new(1)); + state.store.seed_genesis_blob_metadata().await.expect("seed genesis"); + state.hydrate_blob_parent_root().await.expect("hydrate"); + + let now = + std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).expect("time").as_secs(); + let parent_hash = B256::from([2u8; 32]); + let parent_ts = now.saturating_sub(1); + set_latest_block(&mut state, parent_hash, parent_ts); + + let mut payload = sample_execution_payload_v3(); + payload.payload_inner.payload_inner.parent_hash = parent_hash; + payload.payload_inner.payload_inner.block_number = 1; + payload.payload_inner.payload_inner.timestamp = parent_ts + LOAD_MIN_BLOCK_TIME_SECS; + + let accepted = send_payload_as_proposal(&mut state, payload).await; + assert!(accepted, "proposal should be accepted with valid timestamp"); +} diff --git a/crates/consensus/src/state/tests/support.rs b/crates/consensus/src/state/tests/support.rs index ffe5260..fac5760 100644 --- a/crates/consensus/src/state/tests/support.rs +++ b/crates/consensus/src/state/tests/support.rs @@ -1,5 +1,5 @@ use std::{ - collections::HashMap, + collections::{HashMap, VecDeque}, sync::{Arc, Mutex}, }; @@ -13,7 +13,7 @@ use malachitebft_app_channel::app::types::LocallyProposedValue; use ssz::Encode; use tempfile::{TempDir, tempdir}; use ultramarine_blob_engine::error::BlobEngineError; -use ultramarine_execution::notifier::ExecutionNotifier; +use ultramarine_execution::{error::ExecutionError, notifier::ExecutionNotifier}; use ultramarine_types::{ address::Address, blob::{BYTES_PER_BLOB, Blob, BlobsBundle, KzgCommitment, KzgProof}, @@ -148,11 +148,23 @@ pub async fn propose_blobbed_value( .expect("metadata stored"); let (_signed_header, sidecars) = - state.prepare_blob_sidecar_parts(&proposed, Some(&bundle)).expect("sidecars"); + 
state.prepare_blob_sidecar_parts(&proposed, Some(&bundle)).await.expect("sidecars"); (proposed, metadata, sidecars, bundle, payload_bytes) } +pub async fn seed_decided_blob_metadata( + state: &mut State, + height: Height, + parent_root: B256, +) -> eyre::Result { + let header = sample_execution_payload_header(); + let metadata = BlobMetadata::blobless(height, parent_root, &header, Some(0)); + state.store.put_blob_metadata_undecided(height, Round::new(0), &metadata).await?; + state.store.mark_blob_metadata_decided(height, Round::new(0)).await?; + Ok(metadata.to_beacon_header().hash_tree_root()) +} + pub fn build_state( mock_engine: MockBlobEngine, start_height: Height, @@ -286,6 +298,8 @@ struct MockExecutionNotifierState { new_block_calls: Vec<(ExecutionPayloadV3, Vec, Vec)>, forkchoice_calls: Vec, payload_status: PayloadStatus, + payload_statuses: VecDeque, + forkchoice_errors: VecDeque, } impl Default for MockExecutionNotifierState { @@ -294,6 +308,8 @@ impl Default for MockExecutionNotifierState { new_block_calls: Vec::new(), forkchoice_calls: Vec::new(), payload_status: PayloadStatus::from_status(PayloadStatusEnum::Valid), + payload_statuses: VecDeque::new(), + forkchoice_errors: VecDeque::new(), } } } @@ -302,6 +318,23 @@ impl MockExecutionNotifier { pub fn new() -> Self { Self::default() } + + pub fn with_payload_statuses(statuses: Vec) -> Self { + let inner = MockExecutionNotifierState { + payload_statuses: statuses.into(), + ..MockExecutionNotifierState::default() + }; + Self { inner: Arc::new(Mutex::new(inner)) } + } + + #[allow(dead_code)] + pub fn with_forkchoice_errors(errors: Vec) -> Self { + let inner = MockExecutionNotifierState { + forkchoice_errors: errors.into(), + ..MockExecutionNotifierState::default() + }; + Self { inner: Arc::new(Mutex::new(inner)) } + } } #[async_trait] @@ -314,6 +347,9 @@ impl ExecutionNotifier for MockExecutionNotifier { ) -> color_eyre::Result { let mut inner = self.inner.lock().unwrap(); 
inner.new_block_calls.push((payload, execution_requests, versioned_hashes)); + if let Some(status) = inner.payload_statuses.pop_front() { + return Ok(status); + } Ok(inner.payload_status.clone()) } @@ -323,6 +359,9 @@ impl ExecutionNotifier for MockExecutionNotifier { ) -> color_eyre::Result { let mut inner = self.inner.lock().unwrap(); inner.forkchoice_calls.push(block_hash); + if let Some(error) = inner.forkchoice_errors.pop_front() { + return Err(color_eyre::Report::new(error)); + } Ok(block_hash) } } diff --git a/crates/consensus/src/store.rs b/crates/consensus/src/store.rs index d5c2213..a5ff505 100644 --- a/crates/consensus/src/store.rs +++ b/crates/consensus/src/store.rs @@ -284,17 +284,26 @@ impl Db { Ok(()) } - fn height_range( - &self, - table: &Table, - range: impl RangeBounds, - ) -> Result, StoreError> - where - Table: redb::ReadableTable>, - { - Ok(table.range(range)?.flatten().map(|(key, _)| key.value()).collect::>()) + fn delete_undecided_proposal(&self, height: Height, round: Round) -> Result<(), StoreError> { + let start = Instant::now(); + + let key = (height, round); + let tx = self.db.begin_write()?; + { + let mut table = tx.open_table(UNDECIDED_PROPOSALS_TABLE)?; + table.remove(&key)?; + } + tx.commit()?; + + self.metrics.observe_write_time(start.elapsed()); + + Ok(()) } + // NOTE: height_range() was removed as part of FIX-001. + // It was only used for pruning decided data, which we no longer do. + // See store.prune() documentation for details. + fn undecided_proposals_range
( &self, table: &Table, @@ -317,18 +326,30 @@ impl Db { Ok(table.range(range)?.flatten().map(|(key, _)| key.value()).collect::>()) } + /// Prune undecided data (temporary proposals from failed rounds). + /// + /// NOTE: This function intentionally does NOT delete decided data + /// (DECIDED_VALUES_TABLE, CERTIFICATES_TABLE, DECIDED_BLOCK_DATA_TABLE). + /// Following the Lighthouse pattern, only blob bytes are pruned after archival. + /// Decided values, certificates, and block data must be retained forever + /// to allow fullnodes to sync the complete chain history. + /// + /// Load Network context: Validators prune blob bytes via blob_engine after + /// archival to S3, but must serve historical block data for ValueSync. fn prune(&self, retain_height: Height) -> Result, StoreError> { let start = Instant::now(); let tx = self.db.begin_write()?; - let pruned = { + { + // Only prune undecided proposals (temp data from failed consensus rounds) let mut undecided = tx.open_table(UNDECIDED_PROPOSALS_TABLE)?; let keys = self.undecided_proposals_range(&undecided, ..(retain_height, Round::Nil))?; for key in keys { undecided.remove(key)?; } + // Only prune undecided block data (temp data from failed consensus rounds) let mut undecided_block_data = tx.open_table(UNDECIDED_BLOCK_DATA_TABLE)?; let keys = self.block_data_range(&undecided_block_data, ..(retain_height, Round::Nil))?; @@ -336,25 +357,17 @@ impl Db { undecided_block_data.remove(key)?; } - let mut decided = tx.open_table(DECIDED_VALUES_TABLE)?; - let mut certificates = tx.open_table(CERTIFICATES_TABLE)?; - let mut decided_block_data = tx.open_table(DECIDED_BLOCK_DATA_TABLE)?; - - let keys = self.height_range(&decided, ..retain_height)?; - for key in &keys { - decided.remove(key)?; - certificates.remove(key)?; - decided_block_data.remove(key)?; - } - - keys - }; + // DO NOT touch: DECIDED_VALUES_TABLE, CERTIFICATES_TABLE, DECIDED_BLOCK_DATA_TABLE + // These are historical records required for fullnode sync and must be 
retained forever. + // Blob bytes are pruned separately via blob_engine.mark_archived() after S3 archival. + } tx.commit()?; self.metrics.observe_delete_time(start.elapsed()); - Ok(pruned) + // Return empty vec - we no longer prune decided heights + Ok(vec![]) } fn min_decided_value_height(&self) -> Option { @@ -477,6 +490,22 @@ impl Db { Ok(()) } + fn delete_undecided_block_data(&self, height: Height, round: Round) -> Result<(), StoreError> { + let start = Instant::now(); + + let key = (height, round); + let tx = self.db.begin_write()?; + { + let mut table = tx.open_table(UNDECIDED_BLOCK_DATA_TABLE)?; + table.remove(&key)?; + } + tx.commit()?; + + self.metrics.observe_write_time(start.elapsed()); + + Ok(()) + } + fn insert_decided_block_data( &self, height: Height, @@ -593,14 +622,13 @@ impl Db { { let mut table = tx.open_table(BLOB_METADATA_UNDECIDED_TABLE)?; - // Idempotent write: only insert if value doesn't exist or differs - let should_write = if let Some(existing) = table.get(&key)? { - existing.value() != bytes.as_slice() - } else { - true - }; - - if should_write { + // BUG-014 fix: Only insert if no value exists at this key. + // Previous "overwrite if different" guard allowed WAL replay race + // conditions to corrupt blob metadata when the proposer rebuilds a + // block with a different timestamp during crash recovery. + // This now matches insert_undecided_block_data and + // insert_undecided_proposal which both use is_none() guards. + if table.get(&key)?.is_none() { table.insert(key, bytes)?; } } @@ -1051,6 +1079,15 @@ impl Store { tokio::task::spawn_blocking(move || db.get_undecided_proposal(height, round)).await? } + pub async fn delete_undecided_proposal( + &self, + height: Height, + round: Round, + ) -> Result<(), StoreError> { + let db = Arc::clone(&self.db); + tokio::task::spawn_blocking(move || db.delete_undecided_proposal(height, round)).await? 
+ } + pub async fn prune(&self, retain_height: Height) -> Result, StoreError> { let db = Arc::clone(&self.db); tokio::task::spawn_blocking(move || db.prune(retain_height)).await? @@ -1092,6 +1129,15 @@ impl Store { .await? } + pub async fn delete_undecided_block_data( + &self, + height: Height, + round: Round, + ) -> Result<(), StoreError> { + let db = Arc::clone(&self.db); + tokio::task::spawn_blocking(move || db.delete_undecided_block_data(height, round)).await? + } + pub async fn store_decided_block_data( &self, height: Height, diff --git a/crates/consensus/tests/blob_sync_parent_root.rs b/crates/consensus/tests/blob_sync_parent_root.rs new file mode 100644 index 0000000..9a62e64 --- /dev/null +++ b/crates/consensus/tests/blob_sync_parent_root.rs @@ -0,0 +1,98 @@ +//! Sync parent root integration tests. +//! +//! Ensures sync verification uses store-derived parent roots, even when +//! the in-memory cache is stale (e.g., after restart without hydration). + +mod common; + +#[tokio::test] +async fn blob_sync_uses_store_parent_root_over_cache() -> color_eyre::Result<()> { + use common::{ + TestDirs, build_seeded_state, build_state, make_genesis, propose_with_optional_blobs, + sample_blob_bundle, sample_execution_payload_v3_for_height, + }; + use malachitebft_app_channel::app::types::core::{CommitCertificate, Round}; + use ultramarine_types::{height::Height, sync::SyncedValuePackage}; + + let (genesis, validators) = make_genesis(1); + let validator = &validators[0]; + let dirs = TestDirs::new(); + let round = Round::new(0); + + let (package, expected_parent_root) = { + let mut node = build_seeded_state(&dirs, &genesis, validator, Height::new(0)).await?; + + // Commit height 1 (blobless) so parent metadata exists in the store. 
+ let height_1 = Height::new(1); + let payload_h1 = sample_execution_payload_v3_for_height(height_1, None); + let (proposed_h1, payload_bytes_h1, _sidecars_h1) = + propose_with_optional_blobs(&mut node.state, height_1, round, &payload_h1, None) + .await?; + + node.state + .store_undecided_block_data(height_1, round, payload_bytes_h1.clone(), Vec::new()) + .await?; + + let certificate_h1 = CommitCertificate { + height: height_1, + round, + value_id: proposed_h1.value.id(), + commit_signatures: Vec::new(), + }; + let mut notifier = common::mocks::MockExecutionNotifier::default(); + node.state + .process_decided_certificate(&certificate_h1, payload_bytes_h1, &mut notifier) + .await?; + + let parent_metadata = + node.state.get_blob_metadata(height_1).await?.expect("parent metadata"); + let expected_parent_root = parent_metadata.to_beacon_header().hash_tree_root(); + + // Build a blobbed sync package for height 2. + let height_2 = Height::new(2); + let bundle = sample_blob_bundle(1); + let payload_h2 = sample_execution_payload_v3_for_height(height_2, Some(&bundle)); + let (proposed_h2, payload_bytes_h2, maybe_sidecars_h2) = propose_with_optional_blobs( + &mut node.state, + height_2, + round, + &payload_h2, + Some(&bundle), + ) + .await?; + let sidecars = maybe_sidecars_h2.expect("sidecars expected"); + + let package = SyncedValuePackage::Full { + value: proposed_h2.value.clone(), + execution_payload_ssz: payload_bytes_h2, + blob_sidecars: sidecars, + execution_requests: Vec::new(), + archive_notices: Vec::new(), + }; + + (package, expected_parent_root) + }; + + // Simulate restart: open the same store without hydrating the cache. 
+ let mut harness = build_state(&dirs, &genesis, validator, Height::new(0))?; + let state = &mut harness.state; + + assert_ne!(state.blob_parent_root(), expected_parent_root, "cache should be stale before sync"); + + let encoded = package.encode().map_err(|e| color_eyre::eyre::eyre!(e))?; + let decoded = SyncedValuePackage::decode(&encoded).map_err(|e| color_eyre::eyre::eyre!(e))?; + + let height_2 = Height::new(2); + let result = + state.process_synced_package(height_2, round, validator.address(), decoded).await?; + assert!(result.is_some(), "sync should succeed"); + + let synced_metadata = state.get_blob_metadata(height_2).await?.expect("synced metadata"); + assert_eq!( + synced_metadata.parent_blob_root(), + expected_parent_root, + "sync should use store-derived parent root" + ); + + Ok(()) +} diff --git a/crates/consensus/tests/common/mod.rs b/crates/consensus/tests/common/mod.rs index 6f884c5..5679b74 100644 --- a/crates/consensus/tests/common/mod.rs +++ b/crates/consensus/tests/common/mod.rs @@ -196,7 +196,7 @@ pub(crate) async fn propose_with_optional_blobs( state.propose_value_with_blobs(height, round, bytes.clone(), payload, &[], bundle).await?; let sidecars = if let Some(bundle) = bundle { - let (_header, sidecars) = state.prepare_blob_sidecar_parts(&proposed, Some(bundle))?; + let (_header, sidecars) = state.prepare_blob_sidecar_parts(&proposed, Some(bundle)).await?; Some(sidecars) } else { None diff --git a/crates/execution/Cargo.toml b/crates/execution/Cargo.toml index 618595b..5840147 100644 --- a/crates/execution/Cargo.toml +++ b/crates/execution/Cargo.toml @@ -10,16 +10,14 @@ repository.workspace = true rust-version.workspace = true [dependencies] -ultramarine-cli.workspace = true - ultramarine-types.workspace = true -malachitebft-app-channel.workspace = true malachitebft-proto.workspace = true alloy-consensus = { workspace = true } alloy-eips.workspace = true alloy-network.workspace = true +alloy-primitives.workspace = true alloy-provider.workspace 
= true alloy-rpc-client.workspace = true alloy-rpc-types = { workspace = true } @@ -39,7 +37,6 @@ hex.workspace = true jsonwebtoken.workspace = true prost.workspace = true rand.workspace = true -redb.workspace = true reqwest.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/crates/execution/README.md b/crates/execution/README.md index 4a70eee..852a688 100644 --- a/crates/execution/README.md +++ b/crates/execution/README.md @@ -22,13 +22,15 @@ The crate is designed to be modular, flexible, and resilient. When Ultramarine calls `engine_forkchoiceUpdatedV3`, it supplies these payload attributes: -| Field | Value | Implementation | -| -------------------------- | ---------------------------- | ------------------------------------------------------ | -| `timestamp` | `latest_block.timestamp + 1` | Monotonically increasing | -| **`prev_randao`** | **Constant `0x01`** | [`load_prev_randao()`](../types/src/engine_api.rs#L21) | -| `suggested_fee_recipient` | Placeholder `0x2a...2a` | TODO: Make validator-configurable | -| `withdrawals` | Empty array `[]` | Load Network has no withdrawals | -| `parent_beacon_block_root` | Previous `block_hash` | EIP-4788 compatibility | +| Field | Value | Implementation | +| -------------------------- | ----------------------------------------------- | ------------------------------------------------------ | +| `timestamp` | `max(now(), parent + LOAD_MIN_BLOCK_TIME_SECS)` | Wall-clock aligned, validator-enforced | +| **`prev_randao`** | **Constant `0x01`** | [`load_prev_randao()`](../types/src/engine_api.rs#L21) | +| `suggested_fee_recipient` | Placeholder `0x2a...2a` | TODO: Make validator-configurable | +| `withdrawals` | Empty array `[]` | Load Network has no withdrawals | +| `parent_beacon_block_root` | Previous `block_hash` | EIP-4788 compatibility | + +**Note**: With 1 block/sec (EVM timestamp granularity), throughput requires high gas limit (2B+). 
### prevRandao: Constant Value Design @@ -49,7 +51,7 @@ Load Network uses **constant `0x01`** for `prev_randao` (Arbitrum pattern): - **Normalization**: [`alloy_impl.rs:95`](./src/eth_rpc/alloy_impl.rs#L95) - RPC client returns constant - **Testing**: - Unit: [`state/tests/mod.rs:490`](../consensus/src/state/tests/mod.rs#L490) - `process_decided_certificate_rejects_mismatched_prev_randao` - - Integration: [`node_harness.rs:1803`](../../test/tests/full_node/node_harness.rs#L1803) - `assert_prev_randao_constant()` + - Integration: [`node_harness.rs:1803`](../test/tests/full_node/node_harness.rs#L1803) - `assert_prev_randao_constant()` **For dApp developers**: Do not use `block.prevrandao` for security-critical randomness. Use VRF oracles (Chainlink VRF, API3 QRNG) or commit-reveal schemes. diff --git a/crates/execution/src/client.rs b/crates/execution/src/client.rs index 1ad3c29..2887098 100644 --- a/crates/execution/src/client.rs +++ b/crates/execution/src/client.rs @@ -16,16 +16,32 @@ use ultramarine_types::{ address::Address, aliases::{B256, BlockHash, Bytes}, blob::BlobsBundle, + constants::LOAD_MIN_BLOCK_TIME_SECS, engine_api::load_prev_randao, }; use crate::{ config::{EngineApiEndpoint, ExecutionConfig}, engine_api::{EngineApi, ExecutionPayloadResult, client::EngineApiClient}, + error::ExecutionError, eth_rpc::{EthRpc, alloy_impl::AlloyEthRpc}, transport::{http::HttpTransport, ipc::IpcTransport}, }; +// Pluggable time source to keep tests deterministic without global state. +trait TimeProvider: Send + Sync { + fn now_secs(&self) -> u64; +} + +#[derive(Debug, Default)] +struct SystemTimeProvider; + +impl TimeProvider for SystemTimeProvider { + fn now_secs(&self) -> u64 { + SystemTime::now().duration_since(UNIX_EPOCH).map(|d| d.as_secs()).unwrap_or(0) + } +} + // TODO: USE GENERICS instead of dyn /// The main client for interacting with an execution layer node. 
@@ -39,6 +55,12 @@ pub struct ExecutionClient { pub engine: Arc, /// The standard Eth1 JSON-RPC client, used for things like fetching logs. pub eth: Arc, + /// Injected clock for deterministic testing and controlled timestamping. + time_provider: Arc, + forkchoice_with_attrs_max_attempts: usize, + forkchoice_with_attrs_delay: Duration, + /// Delay before calling get_payload to give the builder time to fill the payload. + get_payload_delay: Duration, } impl fmt::Debug for ExecutionClient { @@ -82,7 +104,20 @@ impl ExecutionClient { }; info!("ExecutionClient created"); - Ok(Self { engine: engine_client, eth: eth_client }) + let forkchoice_with_attrs_max_attempts = + config.forkchoice_with_attrs_max_attempts.unwrap_or(10).max(1); + let forkchoice_with_attrs_delay = + Duration::from_millis(config.forkchoice_with_attrs_delay_ms.unwrap_or(200)); + let get_payload_delay = Duration::from_millis(config.get_payload_delay_ms.unwrap_or(0)); + + Ok(Self { + engine: engine_client, + eth: eth_client, + time_provider: Arc::new(SystemTimeProvider), + forkchoice_with_attrs_max_attempts, + forkchoice_with_attrs_delay, + get_payload_delay, + }) } pub fn engine(&self) -> &dyn EngineApi { @@ -93,6 +128,16 @@ impl ExecutionClient { self.eth.as_ref() } + fn current_unix_time(&self) -> u64 { + self.time_provider.now_secs() + } + + /// Compute next block timestamp. Must be called AFTER throttling ensures now >= parent + + /// min_block_time. 
+ fn next_block_timestamp(&self, parent_timestamp: u64) -> u64 { + std::cmp::max(self.current_unix_time(), parent_timestamp + LOAD_MIN_BLOCK_TIME_SECS) + } + pub async fn check_capabilities(&self) -> eyre::Result<()> { match self.engine.exchange_capabilities().await { Ok(cap) => { @@ -138,31 +183,39 @@ impl ExecutionClient { match payload_status.status { PayloadStatusEnum::Valid => { if payload_status.latest_valid_hash != Some(head_block_hash) { - tracing::warn!( + tracing::error!( latest_valid_hash = ?payload_status.latest_valid_hash, head_block_hash = ?head_block_hash, "VALID status but latest_valid_hash does not match head" ); + return Err(eyre::eyre!( + "Engine API violation: latest_valid_hash mismatch for head {:?}", + head_block_hash + )); } payload_status.latest_valid_hash.ok_or_else(|| { eyre::eyre!("Engine API spec violation: VALID status without latestValidHash") }) } - PayloadStatusEnum::Syncing if payload_status.latest_valid_hash.is_none() => { - // From the Engine API spec: - // 8. Client software MUST respond to this method call in the following way: - // * {payloadStatus: {status: SYNCING, latestValidHash: null, - // * validationError: null}, payloadId: null} if forkchoiceState.headBlockHash - // references an unknown payload or a payload that can't be validated because - // requisite data for the validation is missing + PayloadStatusEnum::Syncing => { + // Engine API spec: `engine_forkchoiceUpdatedV3` can return SYNCING with + // `latestValidHash: null` if `headBlockHash` references an unknown payload or one + // that can't be validated because requisite data is missing. + // + // We cannot treat this as success (forkchoice may not have been applied), so we + // return an error that callers can treat as transient backpressure. 
+ if payload_status.latest_valid_hash.is_some() { + tracing::warn!( + head_block_hash = ?head_block_hash, + latest_valid_hash = ?payload_status.latest_valid_hash, + "engine_forkchoiceUpdatedV3 returned SYNCING with non-null latestValidHash" + ); + } tracing::warn!( head_block_hash = ?head_block_hash, - "forkchoiceUpdated returned SYNCING with latest_valid_hash = None; EL not ready" + "forkchoiceUpdated returned SYNCING; EL not ready" ); - Err(eyre::eyre!( - "headBlockHash={:?} references an unknown payload or a payload that can't be validated", - head_block_hash - )) + Err(eyre::Report::new(ExecutionError::SyncingForkchoice { head: head_block_hash })) } status => { tracing::error!( @@ -180,10 +233,14 @@ impl ExecutionClient { ) -> eyre::Result { debug!("🟠 generate_block on top of {:?}", latest_block); let block_hash = latest_block.block_hash; + + // Compute expected timestamp ONCE, use for request AND validation + let expected_timestamp = self.next_block_timestamp(latest_block.timestamp); + let payload_attributes = PayloadAttributes { // Unix timestamp for when the payload is expected to be executed. - // It should be greater than that of forkchoiceState.headBlockHash. - timestamp: latest_block.timestamp + 1, + // Wall-clock aligned: max(now, parent + LOAD_MIN_BLOCK_TIME_SECS) + timestamp: expected_timestamp, // Load fixes PREVRANDAO to the canonical constant (Arbitrum-style) so no // application assumes entropy from it; the consensus doc captures this contract. 
@@ -207,8 +264,9 @@ impl ExecutionClient { safe_block_hash: block_hash, }; - let ForkchoiceUpdated { payload_status, payload_id } = - self.engine.forkchoice_updated(forkchoice_state, Some(payload_attributes)).await?; + let ForkchoiceUpdated { payload_status, payload_id } = self + .forkchoice_updated_with_attributes_retry(forkchoice_state, payload_attributes) + .await?; tracing::debug!( status = ?payload_status.status, @@ -218,30 +276,47 @@ impl ExecutionClient { "forkchoiceUpdated (with attributes) response" ); - if payload_status.latest_valid_hash != Some(block_hash) { - tracing::error!( - latest_valid_hash = ?payload_status.latest_valid_hash, - head = ?block_hash, - "engine_forkchoiceUpdatedV3 returned mismatched latest_valid_hash" - ); - return Err(eyre::eyre!( - "engine_forkchoiceUpdatedV3 returned latestValidHash={:?} not matching head={:?}", - payload_status.latest_valid_hash, - block_hash - )); - } + // Note: Engine API can return SYNCING with latestValidHash=null while it is not ready. + // We handle that via retry in `forkchoice_updated_with_attributes_retry`. match payload_status.status { PayloadStatusEnum::Valid => { let Some(payload_id) = payload_id else { - tracing::error!("VALID status but payload_id is None after attributes"); + tracing::error!( + status = ?payload_status.status, + "forkchoiceUpdated status requires payloadId but payload_id is None after attributes" + ); return Err(eyre::eyre!( - "Engine API spec violation: VALID status with payload attributes must include payloadId" + "Engine API spec violation: forkchoiceUpdated with payload attributes must include payloadId (status={})", + payload_status.status )); }; + // Give the builder time to fill the payload with transactions. + // The builder works incrementally, polling txpool every `interval` (e.g., 25ms). + // Without this delay, we may get an empty or nearly-empty payload. 
+ if !self.get_payload_delay.is_zero() { + tracing::debug!( + delay_ms = self.get_payload_delay.as_millis(), + "Waiting before get_payload to allow builder to fill payload" + ); + tokio::time::sleep(self.get_payload_delay).await; + } // See how payload is constructed: https://github.com/ethereum/consensus-specs/blob/v1.1.5/specs/merge/validator.md#block-proposal let payload_result = self.engine.get_payload(payload_id).await?; let payload_inner = &payload_result.payload.payload_inner.payload_inner; + // Safety: ensure the payload we got is actually built on top of the requested head + // with OUR expected timestamp. This protects against EL quirks. + if payload_inner.parent_hash != block_hash || + payload_inner.block_number != latest_block.block_number + 1 || + payload_inner.timestamp != expected_timestamp + { + return Err(eyre::Report::new(ExecutionError::BuiltPayloadMismatch { + head: block_hash, + parent: payload_inner.parent_hash, + block_number: payload_inner.block_number, + timestamp: payload_inner.timestamp, + })); + } tracing::info!( block_hash = ?payload_inner.block_hash, parent_hash = ?payload_inner.parent_hash, @@ -271,6 +346,14 @@ impl ExecutionClient { // Additionally, the CRITICAL TODO for `suggested_fee_recipient` in this function // must be addressed before any real-world use to ensure transaction fees are // collected. 
+ PayloadStatusEnum::Accepted => { + tracing::error!( + head = ?block_hash, + latest_valid_hash = ?payload_status.latest_valid_hash, + "Engine API spec violation: forkchoiceUpdated returned ACCEPTED" + ); + Err(eyre::eyre!("Engine API spec violation: forkchoiceUpdated returned ACCEPTED")) + } status => { tracing::error!( ?status, @@ -351,10 +434,14 @@ impl ExecutionClient { debug!("🟠 generate_block_with_blobs on top of {:?}", latest_block); let block_hash = latest_block.block_hash; + + // Compute expected timestamp ONCE, use for request AND validation + let expected_timestamp = self.next_block_timestamp(latest_block.timestamp); + let payload_attributes = PayloadAttributes { // Unix timestamp for when the payload is expected to be executed. - // It should be greater than that of forkchoiceState.headBlockHash. - timestamp: latest_block.timestamp + 1, + // Wall-clock aligned: max(now, parent + LOAD_MIN_BLOCK_TIME_SECS) + timestamp: expected_timestamp, // Load fixes PREVRANDAO to the canonical constant (Arbitrum-style) so no // application assumes entropy from it; the consensus doc captures this contract. 
@@ -379,8 +466,9 @@ impl ExecutionClient { }; // Step 1: Call forkchoiceUpdatedV3 to start block production - let ForkchoiceUpdated { payload_status, payload_id } = - self.engine.forkchoice_updated(forkchoice_state, Some(payload_attributes)).await?; + let ForkchoiceUpdated { payload_status, payload_id } = self + .forkchoice_updated_with_attributes_retry(forkchoice_state, payload_attributes) + .await?; tracing::debug!( status = ?payload_status.status, @@ -390,28 +478,31 @@ impl ExecutionClient { "forkchoiceUpdated (with attributes) response for blob block" ); - if payload_status.latest_valid_hash != Some(block_hash) { - tracing::error!( - latest_valid_hash = ?payload_status.latest_valid_hash, - head = ?block_hash, - "engine_forkchoiceUpdatedV3 returned mismatched latest_valid_hash" - ); - return Err(eyre::eyre!( - "engine_forkchoiceUpdatedV3 returned latestValidHash={:?} not matching head={:?}", - payload_status.latest_valid_hash, - block_hash - )); - } + // Note: Engine API can return SYNCING with latestValidHash=null while it is not ready. + // We handle that via retry in `forkchoice_updated_with_attributes_retry`. match payload_status.status { PayloadStatusEnum::Valid => { let Some(payload_id) = payload_id else { - tracing::error!("VALID status but payload_id is None after attributes"); + tracing::error!( + status = ?payload_status.status, + "forkchoiceUpdated status requires payloadId but payload_id is None after attributes" + ); return Err(eyre::eyre!( - "Engine API spec violation: VALID status with payload attributes must include payloadId" + "Engine API spec violation: forkchoiceUpdated with payload attributes must include payloadId (status={})", + payload_status.status )); }; - + // Give the builder time to fill the payload with transactions. + // The builder works incrementally, polling txpool every `interval` (e.g., 25ms). + // Without this delay, we may get an empty or nearly-empty payload. 
+ if !self.get_payload_delay.is_zero() { + tracing::debug!( + delay_ms = self.get_payload_delay.as_millis(), + "Waiting before get_payload to allow builder to fill payload" + ); + tokio::time::sleep(self.get_payload_delay).await; + } // Step 2: Call getPayloadV3 to retrieve the block and blob bundle // // This uses the new get_payload_with_blobs() method which: @@ -424,10 +515,37 @@ impl ExecutionClient { let blob_count = blob_bundle.as_ref().map(|b| b.len()).unwrap_or(0); let payload_inner = &payload_result.payload.payload_inner.payload_inner; + let has_blob_gas = payload_result.payload.blob_gas_used > 0; + if has_blob_gas && blob_count == 0 { + return Err(eyre::eyre!( + "Engine API spec violation: payload has blob_gas_used={} but blobs bundle is empty", + payload_result.payload.blob_gas_used + )); + } + if !has_blob_gas && blob_count > 0 { + return Err(eyre::eyre!( + "Engine API spec violation: blobs bundle has {} blobs but payload blob_gas_used=0", + blob_count + )); + } + // Safety: ensure the payload we got is actually built on top of the requested head + // with OUR expected timestamp. This protects against EL quirks. + if payload_inner.parent_hash != block_hash || + payload_inner.block_number != latest_block.block_number + 1 || + payload_inner.timestamp != expected_timestamp + { + return Err(eyre::Report::new(ExecutionError::BuiltPayloadMismatch { + head: block_hash, + parent: payload_inner.parent_hash, + block_number: payload_inner.block_number, + timestamp: payload_inner.timestamp, + })); + } tracing::info!( block_hash = ?payload_inner.block_hash, parent_hash = ?payload_inner.parent_hash, block_number = payload_inner.block_number, + timestamp = payload_inner.timestamp, txs = payload_inner.transactions.len(), blob_gas_used = payload_result.payload.blob_gas_used, excess_blob_gas = payload_result.payload.excess_blob_gas, @@ -440,6 +558,14 @@ impl ExecutionClient { } // TODO: A production-ready client must handle all possible statuses gracefully. 
// See comments in generate_block() for full status handling requirements. + PayloadStatusEnum::Accepted => { + tracing::error!( + head = ?block_hash, + latest_valid_hash = ?payload_status.latest_valid_hash, + "Engine API spec violation: forkchoiceUpdated returned ACCEPTED" + ); + Err(eyre::eyre!("Engine API spec violation: forkchoiceUpdated returned ACCEPTED")) + } status => { tracing::error!( ?status, @@ -450,6 +576,100 @@ impl ExecutionClient { } } + async fn forkchoice_updated_with_attributes_retry( + &self, + forkchoice_state: ForkchoiceState, + payload_attributes: PayloadAttributes, + ) -> eyre::Result { + let head = forkchoice_state.head_block_hash; + // Keep retries short: the consensus round timer should stay in control (Tendermint-style). + // This just smooths transient "EL restarting" windows. + let delay = self.forkchoice_with_attrs_delay; + let max_attempts = self.forkchoice_with_attrs_max_attempts; + let mut attempts = 0usize; + + loop { + attempts += 1; + let res = self + .engine + .forkchoice_updated(forkchoice_state, Some(payload_attributes.clone())) + .await?; + + match res.payload_status.status { + PayloadStatusEnum::Valid => { + if res.payload_status.latest_valid_hash != Some(head) { + return Err(eyre::eyre!( + "engine_forkchoiceUpdatedV3 returned latestValidHash={:?} not matching head={:?}", + res.payload_status.latest_valid_hash, + head + )); + } + // The spec allows returning `payloadId: null` if the EL does not begin a build + // process (e.g. head is a VALID ancestor, or EL is temporarily unable to + // build). Treat this as transient backpressure and retry briefly. 
+ if res.payload_id.is_none() { + if attempts >= max_attempts { + return Err(eyre::Report::new(ExecutionError::NoPayloadIdForBuild { + head, + })); + } + tracing::warn!( + head = ?head, + "engine_forkchoiceUpdatedV3 returned VALID but payloadId is null; retrying" + ); + tokio::time::sleep(delay).await; + continue; + } + return Ok(res); + } + PayloadStatusEnum::Syncing => { + // Engine API spec: SYNCING commonly returns latestValidHash=null and + // payloadId=null. See `execution-apis/src/engine/paris.md` + // (forkchoiceUpdated spec point 8). + if res.payload_id.is_some() { + return Err(eyre::eyre!( + "Engine API spec violation: engine_forkchoiceUpdatedV3 returned SYNCING with non-null payloadId (head={:?})", + head + )); + } + if res.payload_status.latest_valid_hash.is_some() { + tracing::warn!( + head = ?head, + latest_valid_hash = ?res.payload_status.latest_valid_hash, + "engine_forkchoiceUpdatedV3 returned SYNCING with non-null latestValidHash" + ); + } + if attempts >= max_attempts { + return Err(eyre::eyre!( + "engine_forkchoiceUpdatedV3 still SYNCING after {} attempts (head={:?})", + attempts, + head + )); + } + tracing::warn!( + head = ?head, + latest_valid_hash = ?res.payload_status.latest_valid_hash, + "engine_forkchoiceUpdatedV3 returned SYNCING; retrying" + ); + tokio::time::sleep(delay).await; + continue; + } + PayloadStatusEnum::Accepted => { + return Err(eyre::eyre!( + "Engine API spec violation: forkchoiceUpdated returned ACCEPTED (head={:?})", + head + )); + } + status => { + return Err(eyre::eyre!( + "engine_forkchoiceUpdatedV3 returned non-VALID status: {}", + status + )); + } + } + } + } + pub async fn notify_new_block( &self, execution_payload: ExecutionPayloadV3, @@ -501,3 +721,539 @@ impl<'a> crate::notifier::ExecutionNotifier for ExecutionClientNotifier<'a> { self.client.set_latest_forkchoice_state(block_hash).await } } + +#[cfg(test)] +mod tests { + use std::sync::Mutex; + + use alloy_eips::eip7685::Requests; + use alloy_primitives::{ + 
Address as AlloyAddress, B256 as AB256, Bloom, Bytes as AlloyBytes, FixedBytes, U256, + }; + use alloy_rpc_types::{BlockNumberOrTag, Filter, Log, SyncStatus}; + use alloy_rpc_types_engine::{ + BlobsBundleV1, ExecutionPayloadEnvelopeV3, ExecutionPayloadEnvelopeV4, ExecutionPayloadV1, + ExecutionPayloadV2, PayloadId, + }; + use alloy_rpc_types_txpool::{TxpoolInspect, TxpoolStatus}; + use serde_json::json; + + use super::*; + use crate::transport::{JsonRpcRequest, JsonRpcResponse, Transport}; + + struct NoopEthRpc; + + #[async_trait] + impl EthRpc for NoopEthRpc { + async fn get_chain_id(&self) -> eyre::Result { + Err(eyre::eyre!("not implemented")) + } + async fn syncing(&self) -> eyre::Result { + Err(eyre::eyre!("not implemented")) + } + async fn get_logs(&self, _filter: &Filter) -> eyre::Result> { + Err(eyre::eyre!("not implemented")) + } + async fn get_block_by_number( + &self, + _block_number: BlockNumberOrTag, + _full_transactions: bool, + ) -> eyre::Result> { + Err(eyre::eyre!("not implemented")) + } + async fn txpool_status(&self) -> eyre::Result { + Err(eyre::eyre!("not implemented")) + } + async fn txpool_inspect(&self) -> eyre::Result { + Err(eyre::eyre!("not implemented")) + } + } + + const TEST_NOW: u64 = 1_700_000_001; + + struct FixedTimeProvider { + time: u64, + } + + impl TimeProvider for FixedTimeProvider { + fn now_secs(&self) -> u64 { + self.time + } + } + + #[derive(Clone)] + struct ScriptedEngineApi { + responses: Arc>>, + payloads: Arc>>, + } + + impl ScriptedEngineApi { + fn new(responses: Vec, payload: ExecutionPayloadResult) -> Self { + Self { + responses: Arc::new(Mutex::new(responses.into_iter().rev().collect())), + payloads: Arc::new(Mutex::new(vec![payload])), + } + } + } + + #[async_trait] + impl EngineApi for ScriptedEngineApi { + async fn forkchoice_updated( + &self, + _state: ForkchoiceState, + _payload_attributes: Option, + ) -> eyre::Result { + let mut guard = self.responses.lock().unwrap(); + guard + .pop() + .ok_or_else(|| 
eyre::eyre!("scripted forkchoice_updated responses exhausted")) + } + + async fn get_payload( + &self, + _payload_id: PayloadId, + ) -> eyre::Result { + self.payloads + .lock() + .unwrap() + .last() + .cloned() + .ok_or_else(|| eyre::eyre!("scripted payload missing")) + } + + async fn get_payload_with_blobs( + &self, + payload_id: PayloadId, + ) -> eyre::Result<(ExecutionPayloadResult, Option)> { + Ok((self.get_payload(payload_id).await?, None)) + } + + async fn new_payload( + &self, + _execution_payload: ExecutionPayloadV3, + _versioned_hashes: Vec, + _parent_block_hash: BlockHash, + _execution_requests: Vec, + ) -> eyre::Result { + Ok(PayloadStatus::from_status(PayloadStatusEnum::Valid)) + } + + async fn exchange_capabilities( + &self, + ) -> eyre::Result { + Err(eyre::eyre!("not implemented")) + } + } + + fn sample_payload_result( + block_hash: AB256, + parent_hash: AB256, + block_number: u64, + ) -> ExecutionPayloadResult { + let payload = ExecutionPayloadV3 { + blob_gas_used: 0, + excess_blob_gas: 0, + payload_inner: ExecutionPayloadV2 { + payload_inner: ExecutionPayloadV1 { + parent_hash, + fee_recipient: AlloyAddress::from([6u8; 20]), + state_root: AB256::from([3u8; 32]), + receipts_root: AB256::from([4u8; 32]), + logs_bloom: Bloom::ZERO, + prev_randao: load_prev_randao(), + block_number, + gas_limit: ultramarine_types::constants::LOAD_EXECUTION_GAS_LIMIT, + gas_used: ultramarine_types::constants::LOAD_EXECUTION_GAS_LIMIT / 2, + timestamp: 1_700_000_000 + block_number, + extra_data: AlloyBytes::new(), + base_fee_per_gas: U256::from(1), + block_hash, + transactions: Vec::new(), + }, + withdrawals: Vec::new(), + }, + }; + ExecutionPayloadResult { payload, execution_requests: Vec::new() } + } + + fn make_latest_block( + block_hash: AB256, + number: u64, + parent_hash: AB256, + timestamp: u64, + ) -> ultramarine_types::engine_api::ExecutionBlock { + ultramarine_types::engine_api::ExecutionBlock { + block_hash, + block_number: number, + parent_hash, + timestamp, 
+ prev_randao: load_prev_randao(), + } + } + + #[derive(Clone)] + struct StaticTransport { + result: serde_json::Value, + } + + #[async_trait] + impl Transport for StaticTransport { + async fn send(&self, req: &JsonRpcRequest) -> eyre::Result { + Ok(JsonRpcResponse { + jsonrpc: "2.0".to_string(), + result: Some(self.result.clone()), + error: None, + id: req.id, + }) + } + } + + #[tokio::test] + async fn accepted_with_payload_id_is_rejected() { + let head = AB256::from([9u8; 32]); + let payload_id = PayloadId::from(FixedBytes::<8>::from([1u8; 8])); + let payload_result = sample_payload_result(AB256::from([7u8; 32]), head, 2); + + let engine = ScriptedEngineApi::new( + vec![ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Accepted, None), + payload_id: Some(payload_id), + }], + payload_result.clone(), + ); + + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 1, + forkchoice_with_attrs_delay: Duration::from_millis(0), + get_payload_delay: Duration::from_millis(0), + }; + + let latest_block = make_latest_block(head, 1, AB256::from([8u8; 32]), 1_700_000_001); + let err = client.generate_block(&latest_block).await.unwrap_err(); + assert!(err.to_string().contains("ACCEPTED"), "unexpected error: {err:?}"); + } + + #[tokio::test] + async fn syncing_retries_then_recovers() { + let head = AB256::from([9u8; 32]); + let payload_id = PayloadId::from(FixedBytes::<8>::from([2u8; 8])); + let payload_result = sample_payload_result(AB256::from([7u8; 32]), head, 2); + + let mut scripted = Vec::new(); + for _ in 0..3 { + scripted.push(ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Syncing, None), + payload_id: None, + }); + } + scripted.push(ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Valid, Some(head)), + payload_id: Some(payload_id), + }); + + let engine = 
ScriptedEngineApi::new(scripted, payload_result.clone()); + + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 10, + forkchoice_with_attrs_delay: Duration::from_millis(1), + get_payload_delay: Duration::from_millis(0), + }; + + let latest_block = make_latest_block(head, 1, AB256::from([8u8; 32]), 1_700_000_001); + let res = client.generate_block(&latest_block).await.unwrap(); + assert_eq!(res.payload.payload_inner.payload_inner.parent_hash, head); + } + + #[tokio::test] + async fn syncing_with_payload_id_is_error() { + let head = AB256::from([9u8; 32]); + let payload_id = PayloadId::from(FixedBytes::<8>::from([5u8; 8])); + let payload_result = sample_payload_result(AB256::from([7u8; 32]), head, 2); + + let engine = ScriptedEngineApi::new( + vec![ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Syncing, None), + payload_id: Some(payload_id), + }], + payload_result, + ); + + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 1, + forkchoice_with_attrs_delay: Duration::from_millis(0), + get_payload_delay: Duration::from_millis(0), + }; + + let latest_block = make_latest_block(head, 1, AB256::from([8u8; 32]), 1_700_000_001); + let err = client.generate_block(&latest_block).await.unwrap_err(); + assert!( + err.to_string().contains("SYNCING with non-null payloadId"), + "unexpected error: {err:?}" + ); + } + + #[tokio::test] + async fn latest_valid_hash_mismatch_is_error() { + let head = AB256::from([9u8; 32]); + let payload_id = PayloadId::from(FixedBytes::<8>::from([6u8; 8])); + let payload_result = sample_payload_result(AB256::from([7u8; 32]), head, 2); + let other = AB256::from([8u8; 32]); + + let engine = ScriptedEngineApi::new( + vec![ForkchoiceUpdated { + 
payload_status: PayloadStatus::new(PayloadStatusEnum::Valid, Some(other)), + payload_id: Some(payload_id), + }], + payload_result, + ); + + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 1, + forkchoice_with_attrs_delay: Duration::from_millis(0), + get_payload_delay: Duration::from_millis(0), + }; + + let latest_block = make_latest_block(head, 1, AB256::from([8u8; 32]), 1_700_000_001); + let err = client.generate_block(&latest_block).await.unwrap_err(); + assert!(err.to_string().contains("latestValidHash"), "unexpected error: {err:?}"); + } + + #[tokio::test] + async fn state_update_latest_valid_hash_mismatch_is_error() { + let head = AB256::from([9u8; 32]); + let payload_result = sample_payload_result(AB256::from([7u8; 32]), head, 2); + let other = AB256::from([8u8; 32]); + + let engine = ScriptedEngineApi::new( + vec![ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Valid, Some(other)), + payload_id: None, + }], + payload_result, + ); + + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 1, + forkchoice_with_attrs_delay: Duration::from_millis(0), + get_payload_delay: Duration::from_millis(0), + }; + + let err = client.set_latest_forkchoice_state(head).await.unwrap_err(); + assert!(err.to_string().contains("latest_valid_hash"), "unexpected error: {err:?}"); + } + + #[tokio::test] + async fn state_update_valid_with_mismatched_latest_valid_hash_rejected() { + let head = AB256::from([9u8; 32]); + let other = AB256::from([7u8; 32]); + let payload_result = sample_payload_result(AB256::from([1u8; 32]), head, 2); + + let engine = ScriptedEngineApi::new( + vec![ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Valid, Some(other)), + 
payload_id: None, + }], + payload_result, + ); + + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 1, + forkchoice_with_attrs_delay: Duration::from_millis(0), + get_payload_delay: Duration::from_millis(0), + }; + + let err = client.set_latest_forkchoice_state(head).await.unwrap_err(); + assert!( + err.to_string().contains("latest_valid_hash mismatch"), + "unexpected error: {err:?}" + ); + } + + #[tokio::test] + async fn state_update_accepted_is_rejected() { + let head = AB256::from([9u8; 32]); + let payload_result = sample_payload_result(AB256::from([2u8; 32]), head, 2); + + let engine = ScriptedEngineApi::new( + vec![ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Accepted, None), + payload_id: None, + }], + payload_result, + ); + + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 1, + forkchoice_with_attrs_delay: Duration::from_millis(0), + get_payload_delay: Duration::from_millis(0), + }; + + let err = client.set_latest_forkchoice_state(head).await.unwrap_err(); + assert!( + err.to_string().contains("Invalid payload status: ACCEPTED"), + "unexpected error: {err:?}" + ); + } + + #[tokio::test] + async fn rejects_payload_built_on_wrong_parent() { + let head = AB256::from([9u8; 32]); + let payload_id = PayloadId::from(FixedBytes::<8>::from([3u8; 8])); + // Return a payload that claims a different parent than the requested head. 
+ let payload_result = + sample_payload_result(AB256::from([7u8; 32]), AB256::from([8u8; 32]), 2); + + let engine = ScriptedEngineApi::new( + vec![ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Valid, Some(head)), + payload_id: Some(payload_id), + }], + payload_result, + ); + + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 1, + forkchoice_with_attrs_delay: Duration::from_millis(0), + get_payload_delay: Duration::from_millis(0), + }; + + let latest_block = make_latest_block(head, 1, AB256::from([1u8; 32]), 1_700_000_001); + let err = client.generate_block(&latest_block).await.unwrap_err(); + let mismatch = err.downcast_ref::(); + assert!( + matches!(mismatch, Some(ExecutionError::BuiltPayloadMismatch { head: h, .. }) if *h == head), + "expected typed BuiltPayloadMismatch error, got: {err:?}" + ); + } + + #[tokio::test] + async fn valid_without_payload_id_retries_then_succeeds() { + let head = AB256::from([9u8; 32]); + let payload_id = PayloadId::from(FixedBytes::<8>::from([4u8; 8])); + let payload_result = sample_payload_result(AB256::from([7u8; 32]), head, 2); + + let scripted = vec![ + ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Valid, Some(head)), + payload_id: None, + }, + ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Valid, Some(head)), + payload_id: Some(payload_id), + }, + ]; + + let engine = ScriptedEngineApi::new(scripted, payload_result.clone()); + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 5, + forkchoice_with_attrs_delay: Duration::from_millis(1), + get_payload_delay: Duration::from_millis(0), + }; + + let latest_block = make_latest_block(head, 1, AB256::from([8u8; 32]), 
1_700_000_001); + let res = client.generate_block(&latest_block).await.unwrap(); + assert_eq!(res.payload.payload_inner.payload_inner.parent_hash, head); + } + + #[tokio::test] + async fn valid_without_payload_id_exhausts_retry_budget() { + let head = AB256::from([9u8; 32]); + let payload_result = sample_payload_result(AB256::from([7u8; 32]), head, 2); + + let scripted = vec![ + ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Valid, Some(head)), + payload_id: None, + }, + ForkchoiceUpdated { + payload_status: PayloadStatus::new(PayloadStatusEnum::Valid, Some(head)), + payload_id: None, + }, + ]; + + let engine = ScriptedEngineApi::new(scripted, payload_result.clone()); + let client = ExecutionClient { + engine: Arc::new(engine), + eth: Arc::new(NoopEthRpc), + time_provider: Arc::new(FixedTimeProvider { time: TEST_NOW }), + forkchoice_with_attrs_max_attempts: 2, + forkchoice_with_attrs_delay: Duration::from_millis(0), + get_payload_delay: Duration::from_millis(0), + }; + + let latest_block = make_latest_block(head, 1, AB256::from([8u8; 32]), 1_700_000_001); + let err = client.generate_block(&latest_block).await.unwrap_err(); + assert!( + matches!( + err.downcast_ref::(), + Some(ExecutionError::NoPayloadIdForBuild { head: h }) if *h == head + ), + "expected NoPayloadIdForBuild, got: {err:?}" + ); + } + + #[tokio::test] + async fn get_payload_accepts_extra_fields() { + let payload_result = + sample_payload_result(AB256::from([7u8; 32]), AB256::from([6u8; 32]), 2); + + let envelope_v3 = ExecutionPayloadEnvelopeV3 { + execution_payload: payload_result.payload.clone(), + block_value: U256::ZERO, + blobs_bundle: BlobsBundleV1 { commitments: vec![], proofs: vec![], blobs: vec![] }, + should_override_builder: false, + }; + + let envelope_v4 = ExecutionPayloadEnvelopeV4 { + envelope_inner: envelope_v3, + execution_requests: Requests::new(Vec::new()), + }; + + let mut value = serde_json::to_value(&envelope_v4).expect("serialize envelope"); + if let 
Some(obj) = value.as_object_mut() { + obj.insert("extraField".to_string(), json!("0x01")); + } + if let Some(payload_obj) = + value.get_mut("executionPayload").and_then(|inner| inner.as_object_mut()) + { + payload_obj.insert("extraPayloadField".to_string(), json!("0x02")); + } + + let transport = StaticTransport { result: value }; + let client = EngineApiClient::new(transport); + let payload_id = PayloadId::from(FixedBytes::<8>::from([9u8; 8])); + let result = client.get_payload(payload_id).await.expect("getPayload with extras"); + + assert_eq!( + result.payload.payload_inner.payload_inner.block_number, + payload_result.payload.payload_inner.payload_inner.block_number + ); + } +} diff --git a/crates/execution/src/config.rs b/crates/execution/src/config.rs index 651d59f..2c78b3f 100644 --- a/crates/execution/src/config.rs +++ b/crates/execution/src/config.rs @@ -21,4 +21,25 @@ pub struct ExecutionConfig { pub eth1_rpc_url: Url, /// The JWT secret for authenticating the Engine API connection. pub jwt_secret: [u8; 32], + /// Max attempts for `engine_forkchoiceUpdatedV3` calls that include payload attributes. + /// + /// This is used during block production (proposal) and is intentionally bounded so the + /// consensus round timer stays in control (Tendermint/Malachite-style). + /// + /// If unset, defaults are applied by `ExecutionClient`. + pub forkchoice_with_attrs_max_attempts: Option, + /// Delay in milliseconds between retry attempts for `forkchoiceUpdated` with attributes. + /// + /// If unset, defaults are applied by `ExecutionClient`. + pub forkchoice_with_attrs_delay_ms: Option, + /// Delay in milliseconds before calling `get_payload` after `forkchoice_updated` returns. + /// + /// This gives the EL builder time to poll the transaction pool and include transactions + /// in the payload. The builder works incrementally - each `interval` (e.g., 25ms) it + /// rebuilds the payload with more transactions. 
Without this delay, getPayload may + /// return an empty or nearly-empty payload. + /// + /// Recommended: 500-800ms for high-throughput scenarios. + /// If unset, defaults to 0 (no delay). + pub get_payload_delay_ms: Option, } diff --git a/crates/execution/src/error.rs b/crates/execution/src/error.rs index f8a22e9..4fdb8d2 100644 --- a/crates/execution/src/error.rs +++ b/crates/execution/src/error.rs @@ -1,6 +1,7 @@ #![allow(missing_docs)] use thiserror::Error; +use ultramarine_types::aliases::BlockHash; /// Defines the specific error types for the execution client. /// @@ -26,4 +27,15 @@ pub enum ExecutionError { #[error("Serialization error: {0}")] Serialization(String), + + #[error("Execution layer is syncing (forkchoiceUpdated): head={head:?}")] + SyncingForkchoice { head: BlockHash }, + + #[error("Execution layer did not start payload build (no payloadId) for head={head:?}")] + NoPayloadIdForBuild { head: BlockHash }, + + #[error( + "Built payload does not match requested head={head:?}: parent={parent:?} number={block_number} timestamp={timestamp}" + )] + BuiltPayloadMismatch { head: BlockHash, parent: BlockHash, block_number: u64, timestamp: u64 }, } diff --git a/crates/execution/src/transport/ipc.rs b/crates/execution/src/transport/ipc.rs index 013f8f5..7e4118b 100644 --- a/crates/execution/src/transport/ipc.rs +++ b/crates/execution/src/transport/ipc.rs @@ -34,7 +34,7 @@ use tokio::{ io::{AsyncBufReadExt, AsyncWriteExt, BufReader}, net::UnixStream, }; -use tracing::{debug, info}; +use tracing::{debug, trace}; use super::{JsonRpcRequest, JsonRpcResponse, Transport}; @@ -52,10 +52,10 @@ impl IpcTransport { } async fn connect(&self) -> eyre::Result { - info!("Connecting to IPC socket at {:?}", &self.path); + debug!(path = %self.path.display(), "Connecting to IPC socket"); let stream_future = UnixStream::connect(&self.path); let stream = tokio::time::timeout(REQUEST_TIMEOUT, stream_future).await??; - info!("Successfully connected to IPC socket"); + debug!("IPC 
socket connected"); Ok(stream) } } @@ -63,11 +63,13 @@ impl IpcTransport { #[async_trait] impl Transport for IpcTransport { async fn send(&self, req: &JsonRpcRequest) -> eyre::Result { - info!("Sending IPC request"); + let span = tracing::debug_span!("ipc_request", method = %req.method, id = req.id); + let _enter = span.enter(); + let start = std::time::Instant::now(); + // Establish a fresh connection per request and half-close after write so the server // can terminate its side, allowing us to read a single full JSON response to EOF. let mut stream = self.connect().await?; - info!("IPC stream connected"); // Quick fix: many Engine API IPC servers (e.g., reth) expect newline-delimited // JSON-RPC frames and keep the connection open for multiple requests. Without @@ -78,17 +80,19 @@ impl Transport for IpcTransport { // or a length-delimited codec) and optional connection pooling to support // multiple requests per connection. let mut req_bytes = serde_json::to_vec(req)?; + let request_len = req_bytes.len(); req_bytes.push(b'\n'); - debug!("Request bytes: {}", String::from_utf8_lossy(&req_bytes)); + debug!(request_len, "Sending IPC request"); tokio::time::timeout(REQUEST_TIMEOUT, stream.write_all(&req_bytes)).await??; tokio::time::timeout(REQUEST_TIMEOUT, stream.flush()).await??; - debug!("Request bytes sent"); + trace!("IPC request sent"); // Read a single newline-delimited JSON response frame. 
let mut reader = BufReader::new(stream); let mut resp_bytes = Vec::new(); tokio::time::timeout(REQUEST_TIMEOUT, reader.read_until(b'\n', &mut resp_bytes)).await??; - debug!("Response bytes received: {}", String::from_utf8_lossy(&resp_bytes)); + let response_len = resp_bytes.len(); + debug!(response_len, elapsed_ms = start.elapsed().as_millis(), "IPC response received"); serde_json::from_slice(&resp_bytes).map_err(|e| e.into()) } diff --git a/crates/genesis/Cargo.toml b/crates/genesis/Cargo.toml new file mode 100644 index 0000000..eaaa888 --- /dev/null +++ b/crates/genesis/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "ultramarine-genesis" +version.workspace = true +edition.workspace = true +license.workspace = true + +[dependencies] +alloy-genesis = { workspace = true } +alloy-primitives = { workspace = true } +alloy-signer-local = { workspace = true } +bytes = { workspace = true } +chrono = { workspace = true } +color-eyre = { workspace = true } +serde_json = { workspace = true } +ultramarine-types = { path = "../types" } diff --git a/crates/genesis/src/lib.rs b/crates/genesis/src/lib.rs new file mode 100644 index 0000000..0a4415e --- /dev/null +++ b/crates/genesis/src/lib.rs @@ -0,0 +1,138 @@ +use std::{collections::BTreeMap, str::FromStr}; + +use alloy_genesis::{ChainConfig, Genesis, GenesisAccount}; +use alloy_primitives::{Address, B256, Bytes, U256}; +use alloy_signer_local::{MnemonicBuilder, PrivateKeySigner, coins_bip39::English}; +use chrono::NaiveDate; +use color_eyre::eyre::Result; +use ultramarine_types::constants::LOAD_EXECUTION_GAS_LIMIT; + +/// Test mnemonics for wallet generation. +/// +/// This is intended for dev/testnet genesis generation workflows. 
+const TEST_MNEMONICS: [&str; 3] = [ + "test test test test test test test test test test test junk", + "abandon abandon abandon abandon abandon abandon abandon abandon abandon abandon abandon about", + "zero zero zero zero zero zero zero zero zero zero zero zoo", +]; + +pub fn make_signer(mnemonic: &str) -> PrivateKeySigner { + MnemonicBuilder::::default().phrase(mnemonic).build().expect("failed to create wallet") +} + +pub fn make_signers() -> Vec { + TEST_MNEMONICS.iter().map(|&mnemonic| make_signer(mnemonic)).collect() +} + +pub fn build_dev_genesis(chain_id: u64) -> Result { + let signers = make_signers(); + let signer_addresses: Vec
= signers.iter().map(|signer| signer.address()).collect(); + + let mut alloc = BTreeMap::new(); + for addr in &signer_addresses { + alloc.insert( + *addr, + GenesisAccount { + balance: U256::from_str("15000000000000000000000").unwrap(), // 15000 ETH + ..Default::default() + }, + ); + } + + build_genesis_from_alloc(chain_id, alloc) +} + +pub fn build_genesis(chain_id: u64, alloc: BTreeMap) -> Result { + build_genesis_from_alloc(chain_id, alloc) +} + +pub fn build_genesis_from_alloc_strings( + chain_id: u64, + alloc: Vec<(String, String)>, +) -> Result { + let mut map = BTreeMap::new(); + for (address, balance_wei) in alloc { + let addr = Address::from_str(&address)?; + let balance = U256::from_str(&balance_wei)?; + map.insert(addr, GenesisAccount { balance, ..Default::default() }); + } + build_genesis_from_alloc(chain_id, map) +} + +fn build_genesis_from_alloc( + chain_id: u64, + alloc: BTreeMap, +) -> Result { + // The Ethereum Cancun-Deneb (Dencun) upgrade was activated on the mainnet on March 13, 2024. + // We keep the timestamp reference handy for future policy, but Load activates forks at genesis. 
+ let date = NaiveDate::from_ymd_opt(2024, 3, 14).unwrap(); + let datetime = date.and_hms_opt(0, 0, 0).unwrap(); + let _valid_cancun_timestamp = datetime.and_utc().timestamp() as u64; + + let genesis = Genesis { + config: ChainConfig { + chain_id, + homestead_block: Some(0), + eip150_block: Some(0), + eip155_block: Some(0), + eip158_block: Some(0), + byzantium_block: Some(0), + constantinople_block: Some(0), + petersburg_block: Some(0), + istanbul_block: Some(0), + berlin_block: Some(0), + london_block: Some(0), + shanghai_time: Some(0), + cancun_time: Some(0), + prague_time: Some(0), + merge_netsplit_block: Some(0), + terminal_total_difficulty: Some(U256::ZERO), + terminal_total_difficulty_passed: true, + ..Default::default() + }, + alloc, + ..Default::default() + } + .with_gas_limit(LOAD_EXECUTION_GAS_LIMIT) + .with_timestamp( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .expect("Time went backwards") + .as_secs(), + ) + .with_extra_data(Bytes::from_static(b"Load Network Dev")) + .with_difficulty(U256::ZERO) + .with_mix_hash(B256::ZERO) + .with_coinbase(Address::ZERO) + .with_base_fee(Some(7)); + + let mut genesis = genesis; + genesis.parent_hash = Some(B256::ZERO); + genesis.number = Some(0); + + Ok(genesis) +} + +pub fn write_genesis(path: &std::path::Path, genesis: &Genesis) -> Result<()> { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent)?; + } + let mut genesis_value = serde_json::to_value(genesis)?; + if let Some(root) = genesis_value.as_object_mut() { + root.insert("gasUsed".to_string(), serde_json::Value::String("0x0".to_string())); + root.insert( + "parentHash".to_string(), + serde_json::Value::String( + "0x0000000000000000000000000000000000000000000000000000000000000000".to_string(), + ), + ); + } + if let Some(config) = genesis_value.get_mut("config").and_then(serde_json::Value::as_object_mut) && + matches!(config.get("daoForkSupport"), Some(serde_json::Value::Bool(false))) + { + 
config.remove("daoForkSupport"); + } + let genesis_json = serde_json::to_string_pretty(&genesis_value)?; + std::fs::write(path, genesis_json)?; + Ok(()) +} diff --git a/crates/node/Cargo.toml b/crates/node/Cargo.toml index 18614a3..58c0a0e 100644 --- a/crates/node/Cargo.toml +++ b/crates/node/Cargo.toml @@ -27,9 +27,12 @@ malachitebft-engine.workspace = true malachitebft-proto.workspace = true alloy-consensus = { workspace = true } +alloy-eips = { workspace = true } +alloy-genesis = { workspace = true } alloy-primitives = { workspace = true } alloy-rpc-types-engine = { workspace = true } alloy-rpc-types-eth = { workspace = true } +alloy-trie = { workspace = true } async-trait.workspace = true bytes.workspace = true color-eyre.workspace = true @@ -47,6 +50,7 @@ sha2.workspace = true sha3.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-util.workspace = true tracing.workspace = true url.workspace = true diff --git a/crates/node/src/app.rs b/crates/node/src/app.rs index 81e7285..ef03cb2 100644 --- a/crates/node/src/app.rs +++ b/crates/node/src/app.rs @@ -1,5 +1,6 @@ #![allow(missing_docs)] -use alloy_rpc_types_eth::BlockNumberOrTag; +use std::time::{Duration, Instant}; + use bytes::Bytes; use color_eyre::eyre::{self, eyre}; use malachitebft_app_channel::{ @@ -12,10 +13,11 @@ use malachitebft_app_channel::{ use malachitebft_engine::host::Next; use ssz::Encode; use tokio::sync::{mpsc, oneshot}; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; use ultramarine_blob_engine::BlobEngine; use ultramarine_consensus::state::State; -use ultramarine_execution::client::ExecutionClient; +use ultramarine_execution::{ExecutionError, ExecutionNotifier, client::ExecutionClient}; use ultramarine_types::{ archive::ArchiveNotice, context::LoadContext, @@ -33,17 +35,24 @@ pub async fn run( execution_layer: ExecutionClient, archiver_job_tx: Option, mut archive_notice_rx: Option>, + shutdown: CancellationToken, ) -> eyre::Result<()> 
{ info!("🚀 App message loop starting"); state.rehydrate_pending_prunes().await?; loop { - // Use tokio::select! to poll both consensus channel and archive notice channel + // Use tokio::select! to poll shutdown, consensus channel, and archive notice channel + // The shutdown branch ensures we exit immediately when signaled, not just at loop start let msg = tokio::select! { + // Check for shutdown signal - this allows immediate exit when signaled + _ = shutdown.cancelled() => { + info!("Shutdown signal detected, exiting main loop..."); + None + } // Poll consensus channel consensus_msg = channels.consensus.recv() => { match consensus_msg { - Some(msg) => msg, + Some(msg) => Some(msg), None => { // Consensus channel closed - exit the loop return Err(eyre!("Consensus channel closed unexpectedly")); @@ -107,6 +116,11 @@ pub async fn run( } }; + // Exit loop if shutdown was signaled + let Some(msg) = msg else { + break; + }; + debug!("📨 Received message: {:?}", std::mem::discriminant(&msg)); match msg { // The first message to handle is the `ConsensusReady` message, signaling to the app @@ -122,27 +136,22 @@ pub async fn run( info!("✅ Execution client capabilities check passed."); - // Try to get the latest block from the execution engine. - // If this fails, we'll lazy-fetch it later in GetValue instead of crashing. - match execution_layer - .eth - .get_block_by_number(alloy_rpc_types_eth::BlockNumberOrTag::Latest, false) - .await - { - Ok(Some(latest_block)) => { - debug!(block_hash = %latest_block.block_hash, "Fetched latest block from execution client"); - state.latest_block = Some(latest_block); - } - Ok(None) => { - warn!( - "Execution client returned no block for 'latest'; will lazy-fetch in GetValue" - ); - state.latest_block = None; - } - Err(e) => { - warn!(%e, "Failed to fetch latest block from execution client; will lazy-fetch in GetValue"); - state.latest_block = None; - } + // Establish latest_block for timestamp/parent validation. 
+ // Prefer consensus store (decided metadata) so restarts don't regress + // to an EL that is lagging or restarting. + if let Some(decided_block) = load_decided_block(state).await? { + info!( + height = %decided_block.block_number, + block_hash = %decided_block.block_hash, + "Initialized latest_block from consensus store" + ); + state.latest_block = Some(decided_block); + } + + if state.latest_block.is_none() { + return Err(eyre!( + "latest_block is None after startup; execution genesis not initialized" + )); } // Calculate start_height following Malachite's pattern: @@ -152,6 +161,17 @@ pub async fn run( let start_height = max_decided.map(|h| h.increment()).unwrap_or_else(|| Height::new(1)); + // Best-effort alignment: apply FCU to the CL decided head. If EL reports + // SYNCING/INVALID, enter observer-only mode until EL catches up. + if let Err(e) = + ensure_el_matches_cl_head(state, &execution_layer, "consensus_ready").await + { + warn!( + error = %e, + "EL not aligned with CL head at startup; proposals/votes will be gated until alignment" + ); + } + info!(?max_decided, %start_height, "Sending StartHeight to consensus engine."); if reply.send((start_height, state.get_validator_set().clone())).is_err() { @@ -167,6 +187,22 @@ pub async fn run( AppMsg::StartedRound { height, round, proposer, role, reply_value } => { info!(%height, %round, %proposer, ?role, "🟢🟢 Started round"); + // Cleanup stale round blobs to prevent memory/storage leak when consensus is stuck. + // This handles the case where rounds keep timing out but no commit happens. + // Only cleanup after round 0 (we need at least one previous round to have stale + // data). 
+ if round.as_u32().is_some_and(|r| r > 0) && + let Err(e) = state.cleanup_stale_round_blobs(height, round).await + { + warn!( + %height, + %round, + error = %e, + "Failed to cleanup stale round blobs" + ); + // Don't fail the round start - this is best-effort cleanup + } + // We can use that opportunity to update our internal state state.current_height = height; state.current_round = round; @@ -180,16 +216,29 @@ pub async fn run( } // At some point, we may end up being the proposer for that round, and the consensus // engine will then ask us for a value to propose to the other validators. - AppMsg::GetValue { height, round, timeout: _, reply } => { - if let Err(e) = - handle_get_value(state, channels, &execution_layer, height, round, reply).await + AppMsg::GetValue { height, round, timeout, reply } => { + match handle_get_value( + state, + channels, + &execution_layer, + height, + round, + timeout, + reply, + ) + .await { - error!( - %height, - %round, - error = ?e, - "GetValue handler failed; letting consensus timeout drive prevote-nil" - ); + Ok(()) => {} + Err(e) => { + error!( + %height, + %round, + error = ?e, + "GetValue handler failed; timeout will drive prevote-nil" + ); + // NOTE: Not sending reply here - Malachite will timeout and do prevote-nil. + // LocallyProposedValue doesn't have a "nil" representation. + } } } AppMsg::ExtendVote { reply, .. } => { @@ -535,6 +584,24 @@ pub async fn run( // This is the success path if let Some(ref complete_proposal) = proposed_value { debug!("✅ Received complete proposal: {:?}", complete_proposal); + + // Safety gate: if EL head doesn't match CL decided head, vote nil. 
+ if let Err(e) = ensure_el_matches_cl_head( + state, + &execution_layer, + "received_proposal_part", + ) + .await + { + warn!( + error = %e, + "EL not aligned with CL head; refusing to vote on proposal" + ); + if reply.send(None).is_err() { + error!("Failed to send ReceivedProposalPart reply"); + } + continue; + } } if reply.send(proposed_value).is_err() { error!("Failed to send ReceivedProposalPart reply"); @@ -565,17 +632,11 @@ pub async fn run( // Realign latest execution block from disk so parent-hash checks // stay in sync after restarts/replays. + // Uses load_execution_block which prefers DecidedValue (immune to + // BUG-014 blob metadata corruption) over blob metadata. if let Some(prev_height) = height.decrement() && - let Ok(Some(prev_meta)) = state.get_blob_metadata(prev_height).await + let Ok(Some(prev_block)) = load_execution_block(state, prev_height).await { - let prev_header = prev_meta.execution_payload_header(); - let prev_block = ExecutionBlock { - block_hash: prev_header.block_hash, - block_number: prev_header.block_number, - parent_hash: prev_header.parent_hash, - timestamp: prev_header.timestamp, - prev_randao: load_prev_randao(), - }; let needs_realignment = state .latest_block .map(|blk| blk.block_number != prev_block.block_number) @@ -668,6 +729,12 @@ pub async fn run( "[DIAG] ✅ process_decided_certificate succeeded: {} txs, {} blobs, current_height now={}", outcome.tx_count, outcome.blob_count, state.current_height ); + if outcome.execution_pending { + warn!( + height = %height, + "Execution layer still syncing; execution finalization deferred" + ); + } outcome } Err(e) => { @@ -891,24 +958,33 @@ pub async fn run( } } ( - Some(_payload), + Some(payload), Err(ultramarine_blob_engine::BlobEngineError::BlobsPruned { locators, .. 
}), ) => { - // Blobs have been pruned - send MetadataOnly with archive notices - // The receiving peer can use the locators to fetch from external - // archive NOTE: We do NOT send Full - // with empty sidecars as that causes - // process_synced_package to panic (empty hashes vs non-empty - // commitments) - warn!( + // Blobs have been pruned but payload is available - send + // MetadataOnly WITH the payload so + // the receiving peer can import the block without + // blob sidecars. + // + // Load Network pruning policy: Archive event is the boundary + // for blob pruning (NOT Ethereum's time-based DA window). + // This follows the Lighthouse pattern where blocks can be + // imported without blob sidecars once archived. + // + // The receiving peer will: + // 1. Import the execution payload to EL + // 2. Store consensus metadata (commitments from Value) + // 3. Mark blobs as pruned using archive notices + info!( %height, %round, + payload_size = payload.len(), locator_count = locators.len(), notice_count = archive_notices.len(), - "Blobs pruned, sending MetadataOnly with archive notices" + "Blobs pruned, sending MetadataOnly WITH payload for import" ); if !locators.is_empty() { debug!( @@ -921,18 +997,24 @@ pub async fn run( SyncedValuePackage::MetadataOnly { value: decided_value.value.clone(), archive_notices: archive_notices.clone(), + execution_payload_ssz: Some(payload), + execution_requests: execution_requests.clone(), } } _ => { - // Payload missing or other error + // Payload missing or other error - send MetadataOnly without + // payload The receiving peer cannot + // import this block, will try another peer error!( %height, %round, - "Payload or blobs missing/error, sending MetadataOnly" + "Payload or blobs missing/error, sending MetadataOnly without payload" ); SyncedValuePackage::MetadataOnly { value: decided_value.value.clone(), archive_notices: archive_notices.clone(), + execution_payload_ssz: None, + execution_requests: vec![], } } }; @@ -961,6 
+1043,19 @@ pub async fn run( } } } + + // Graceful shutdown cleanup + info!("Performing graceful shutdown cleanup..."); + + // Flush blob engine to ensure all data is persisted + if let Err(e) = state.blob_engine().flush_sync() { + warn!("Failed to flush blob engine during shutdown: {}", e); + } else { + info!("Blob engine flushed successfully"); + } + + info!("Graceful shutdown complete"); + Ok(()) } async fn restream_archive_notices( @@ -985,11 +1080,71 @@ async fn handle_get_value( execution_layer: &ExecutionClient, height: Height, round: Round, + timeout: Duration, reply: oneshot::Sender>, ) -> eyre::Result<()> { info!(%height, %round, "🟢🟢 Consensus is requesting a value to propose"); + let started_at = Instant::now(); + + // BUG-014 fix: During WAL recovery, the consensus engine replays the + // proposal from the WAL but still fires GetValue because the app returned + // empty reply_value in StartedRound. If we build a NEW block here, it gets + // a different timestamp → different block_hash → corrupts blob metadata → + // permanent parent-hash-mismatch stall. + // + // Guard: if block data already exists for this (height, round), another code + // path (WAL replay or sync) already stored the correct payload. Skip the EL + // build entirely. Malachite already has the value from WAL and does not wait + // for our reply. + if let Ok(Some(_)) = state.get_block_data(height, round).await { + warn!( + %height, %round, + "Block data already exists for this height/round (WAL recovery); skipping EL build" + ); + // Don't send reply — Malachite already has the value from WAL replay + // and will proceed to decision without waiting for this response. 
+ return Ok(()); + } + + if state.is_el_degraded() { + warn!( + %height, + %round, + "Execution layer marked degraded; attempting to realign before proposing" + ); + } + + let latest_block = ensure_el_matches_cl_head(state, execution_layer, "get_value").await?; + + // Slot throttle: wait until we can propose with valid timestamp + // IMPORTANT: Use Duration with subsecond precision to avoid ~1s jitter + use ultramarine_types::constants::LOAD_MIN_BLOCK_TIME_SECS; + + let next_allowed_ts = latest_block.timestamp + LOAD_MIN_BLOCK_TIME_SECS; + let target_time = Duration::from_secs(next_allowed_ts); + + let now_duration = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or(Duration::ZERO); // Fallback to 0 on time error - will wait full slot + + if now_duration < target_time { + // Precise wait with subseconds: e.g., if now is 1000.8s and target is 1001s, wait only 0.2s + let wait = target_time.saturating_sub(now_duration); + let remaining = timeout.saturating_sub(started_at.elapsed()); + let max_wait = remaining.saturating_sub(Duration::from_millis(200)); // margin for EL call + + if wait > max_wait { + return Err(eyre!( + "Slot throttle {:?} exceeds remaining GetValue timeout {:?}; refusing to propose", + wait, + remaining + )); + } + + debug!(%height, %round, ?wait, "Throttling: waiting for slot boundary"); + tokio::time::sleep(wait).await; + } - let latest_block = ensure_latest_block(state, execution_layer).await?; debug!("Requesting EL to build payload with blobs on top of head"); let (execution_payload, blobs_bundle) = @@ -1028,6 +1183,7 @@ async fn handle_get_value( let (_signed_header, sidecars) = state .prepare_blob_sidecar_parts(&proposal, blobs_bundle.as_ref()) + .await .map_err(|e| eyre!("Failed to prepare blob sidecars: {}", e))?; let round_i64 = round.as_i64(); @@ -1100,21 +1256,110 @@ async fn handle_get_value( Ok(()) } -async fn ensure_latest_block( +/// Loads the [`ExecutionBlock`] for a given decided height. 
+/// +/// Prefers the consensus-authoritative [`DecidedValue`] (immune to BUG-014 +/// blob-metadata corruption) over blob metadata. Falls back to blob metadata +/// only when the decided value is unavailable (e.g. pruned). +async fn load_execution_block( + state: &State, + height: Height, +) -> eyre::Result> { + // Primary: DecidedValue — stored from the undecided *proposal* which uses + // an is_none() guard and therefore cannot be overwritten during WAL replay. + if let Ok(Some(decided_value)) = state.get_decided_value(height).await { + let header = &decided_value.value.metadata.execution_payload_header; + return Ok(Some(ExecutionBlock { + block_hash: header.block_hash, + block_number: header.block_number, + parent_hash: header.parent_hash, + timestamp: header.timestamp, + prev_randao: load_prev_randao(), + })); + } + + // Fallback: blob metadata (may be corrupted by BUG-014 but useful when + // decided values have been pruned). + match state.get_blob_metadata(height).await? { + Some(meta) => { + let header = meta.execution_payload_header(); + Ok(Some(ExecutionBlock { + block_hash: header.block_hash, + block_number: header.block_number, + parent_hash: header.parent_hash, + timestamp: header.timestamp, + prev_randao: load_prev_randao(), + })) + } + None => Ok(None), + } +} + +async fn load_decided_block(state: &State) -> eyre::Result> { + let Some(height) = state.get_latest_decided_height().await else { + return Ok(None); + }; + let block = load_execution_block(state, height).await?; + if block.is_none() { + warn!( + %height, + "Missing both DecidedValue and BlobMetadata for latest decided height; cannot derive CL head" + ); + } + Ok(block) +} + +async fn resolve_cl_head(state: &mut State) -> eyre::Result { + if let Some(block) = load_decided_block(state).await? 
{ + state.latest_block = Some(block); + return Ok(block); + } + state.latest_block.ok_or_else(|| eyre!("latest_block is None; CL head unavailable")) +} + +async fn ensure_el_matches_cl_head( state: &mut State, execution_layer: &ExecutionClient, + context: &str, ) -> eyre::Result { - if let Some(block) = state.latest_block { - return Ok(block); + let cl_head = resolve_cl_head(state).await?; + + if state.is_el_degraded() && + let Some(since) = state.el_degraded_since + { + const EL_GATE_COOLDOWN: Duration = Duration::from_millis(500); + if since.elapsed() < EL_GATE_COOLDOWN { + return Err(eyre!("EL degraded; skipping FCU during cooldown (context={})", context)); + } } - warn!("latest execution block missing; refetching from EL"); - let block = execution_layer - .eth - .get_block_by_number(BlockNumberOrTag::Latest, false) - .await - .map_err(|e| eyre!("Failed to fetch latest block: {}", e))? - .ok_or_else(|| eyre!("Execution client returned no block for 'latest'"))?; - state.latest_block = Some(block); - Ok(block) + if let (Some(last_head), Some(last_success)) = (state.last_fcu_head, state.last_fcu_success) { + const EL_FCU_SUCCESS_CACHE: Duration = Duration::from_millis(500); + if last_head == cl_head.block_hash && last_success.elapsed() < EL_FCU_SUCCESS_CACHE { + return Ok(cl_head); + } + } + + let mut notifier = execution_layer.as_notifier(); + match notifier.set_latest_forkchoice_state(cl_head.block_hash).await { + Ok(_) => { + state.clear_el_degraded(); + state.last_fcu_head = Some(cl_head.block_hash); + state.last_fcu_success = Some(tokio::time::Instant::now()); + Ok(cl_head) + } + Err(e) => { + let is_syncing = matches!( + e.downcast_ref::(), + Some(ExecutionError::SyncingForkchoice { .. 
}) + ); + let reason = if is_syncing { + format!("EL syncing for head {}", cl_head.block_hash) + } else { + format!("EL rejected forkchoice for head {}: {}", cl_head.block_hash, e) + }; + state.mark_el_degraded(reason); + Err(e) + } + } } diff --git a/crates/node/src/archiver.rs b/crates/node/src/archiver.rs index 532e6df..a25fb57 100644 --- a/crates/node/src/archiver.rs +++ b/crates/node/src/archiver.rs @@ -575,9 +575,9 @@ impl ArchiverWorker { let parsed = serde_json::from_slice::(&bytes).map_err(|e| { color_eyre::eyre::eyre!( - "Provider {} response was not valid JSON ({e}). Raw body: {}", + "Provider {} response was not valid JSON ({e}). Body length: {} bytes", self.config.provider_url, - String::from_utf8_lossy(&bytes) + bytes.len() ) })?; diff --git a/crates/node/src/node.rs b/crates/node/src/node.rs index 8dbaa62..5c3b9db 100644 --- a/crates/node/src/node.rs +++ b/crates/node/src/node.rs @@ -1,8 +1,17 @@ //! The Application (or Node) definition. The Node trait implements the Consensus context and the //! cryptographic library used for signing. 
#![allow(missing_docs)] -use std::{path::PathBuf, str::FromStr, sync::Arc}; +use std::{ + path::{Path, PathBuf}, + str::FromStr, + sync::Arc, +}; +use alloy_consensus::{Header, constants::EMPTY_WITHDRAWALS}; +use alloy_eips::{eip1559::INITIAL_BASE_FEE, eip7685::EMPTY_REQUESTS_HASH}; +use alloy_genesis::Genesis as ExecutionGenesis; +use alloy_primitives::{B64, B256}; +use alloy_trie::root::state_root_ref_unhashed; use async_trait::async_trait; use color_eyre::eyre; use malachitebft_app_channel::app::{ @@ -13,6 +22,7 @@ use malachitebft_app_channel::app::{ }; use rand::{CryptoRng, RngCore}; use tokio::{sync::mpsc, task::JoinHandle}; +use tokio_util::sync::CancellationToken; use ultramarine_blob_engine::{BlobEngineImpl, store::rocksdb::RocksDbBlobStore}; use ultramarine_cli::{config::Config, metrics}; use ultramarine_consensus::{metrics::DbMetrics, state::State, store::Store}; @@ -25,6 +35,7 @@ use ultramarine_types::{ archive::ArchiveNotice, codec::proto::ProtobufCodec, context::LoadContext, + engine_api::{ExecutionBlock, load_prev_randao}, genesis::Genesis, height::Height, signing::{Ed25519Provider, PrivateKey, PublicKey}, @@ -90,6 +101,8 @@ pub struct App { pub genesis_file: PathBuf, pub private_key_file: PathBuf, pub start_height: Option, + /// Execution-layer genesis file (same JSON used by load-reth --chain). + pub execution_genesis_file: Option, // Optional execution-layer configuration overrides pub engine_http_url: Option, @@ -98,11 +111,89 @@ pub struct App { pub jwt_path: Option, } +fn fork_block_active_at_genesis(block: Option) -> bool { + matches!(block, Some(0)) +} + +fn fork_time_active_at_genesis(time: Option, genesis_ts: u64) -> bool { + time.map(|t| t <= genesis_ts).unwrap_or(false) +} + +fn build_execution_genesis_header(genesis: &ExecutionGenesis) -> eyre::Result
{ + let london_active = fork_block_active_at_genesis(genesis.config.london_block); + let base_fee_per_gas = if london_active { + let base_fee = genesis.base_fee_per_gas.unwrap_or(u128::from(INITIAL_BASE_FEE)); + let base_fee_u64 = u64::try_from(base_fee).map_err(|_| { + eyre::eyre!("genesis base_fee_per_gas {} does not fit in u64", base_fee) + })?; + Some(base_fee_u64) + } else { + None + }; + + let shanghai_active = + fork_time_active_at_genesis(genesis.config.shanghai_time, genesis.timestamp); + let withdrawals_root = if shanghai_active { Some(EMPTY_WITHDRAWALS) } else { None }; + + let cancun_active = fork_time_active_at_genesis(genesis.config.cancun_time, genesis.timestamp); + let (parent_beacon_block_root, blob_gas_used, excess_blob_gas) = if cancun_active { + ( + Some(B256::ZERO), + Some(genesis.blob_gas_used.unwrap_or(0)), + Some(genesis.excess_blob_gas.unwrap_or(0)), + ) + } else { + (None, None, None) + }; + + let prague_active = fork_time_active_at_genesis(genesis.config.prague_time, genesis.timestamp); + let requests_hash = if prague_active { Some(EMPTY_REQUESTS_HASH) } else { None }; + + Ok(Header { + parent_hash: genesis.parent_hash.unwrap_or_default(), + number: genesis.number.unwrap_or_default(), + gas_limit: genesis.gas_limit, + difficulty: genesis.difficulty, + nonce: B64::from(genesis.nonce), + extra_data: genesis.extra_data.clone(), + timestamp: genesis.timestamp, + mix_hash: genesis.mix_hash, + beneficiary: genesis.coinbase, + state_root: state_root_ref_unhashed(&genesis.alloc), + base_fee_per_gas, + withdrawals_root, + parent_beacon_block_root, + blob_gas_used, + excess_blob_gas, + requests_hash, + ..Default::default() + }) +} + +fn execution_genesis_block_from_file(path: &Path) -> eyre::Result { + let raw = std::fs::read_to_string(path).map_err(|e| { + eyre::eyre!("Failed to read execution genesis at {}: {}", path.display(), e) + })?; + let genesis: ExecutionGenesis = serde_json::from_str(&raw).map_err(|e| { + eyre::eyre!("Failed to parse 
execution genesis at {}: {}", path.display(), e) + })?; + let header = build_execution_genesis_header(&genesis)?; + let block_hash = header.hash_slow(); + Ok(ExecutionBlock { + block_hash, + block_number: header.number, + parent_hash: header.parent_hash, + timestamp: header.timestamp, + prev_randao: load_prev_randao(), + }) +} + pub struct Handle { pub app: JoinHandle<()>, pub engine: EngineHandle, pub tx_event: TxEvent, pub archiver: Option, + pub shutdown: CancellationToken, } impl std::fmt::Debug for Handle { @@ -271,6 +362,18 @@ impl Node for App { archive_metrics, ); + let execution_genesis_path = self + .execution_genesis_file + .clone() + .or_else(|| std::env::var("ULTRAMARINE_EL_GENESIS_JSON").ok().map(PathBuf::from)) + .ok_or_else(|| { + eyre::eyre!( + "execution genesis path missing; set --execution-genesis-path or ULTRAMARINE_EL_GENESIS_JSON" + ) + })?; + let execution_genesis = execution_genesis_block_from_file(&execution_genesis_path)?; + state.latest_block = Some(execution_genesis); + // Phase 4: Hydrate blob parent root from BlobMetadata (Layer 2) state.hydrate_blob_parent_root().await?; @@ -468,6 +571,12 @@ impl Node for App { engine_api_endpoint: engine_endpoint, eth1_rpc_url: eth_url, jwt_secret, + forkchoice_with_attrs_max_attempts: Some(20), + forkchoice_with_attrs_delay_ms: Some(500), + // Delay before get_payload to let builder fill payload with transactions. + // Builder polls txpool every 25ms, so 500ms ≈ 20 iterations. + // Reduced from 700ms to leave more time for block validation/consensus. 
+ get_payload_delay_ms: Some(500), }; let execution_client = ExecutionClient::new(execution_config).await?; @@ -486,21 +595,7 @@ impl Node for App { )); } - // 2) Eth RPC reachability and basic sanity (latest block must exist) - match execution_client - .eth() - .get_block_by_number(alloy_rpc_types_eth::BlockNumberOrTag::Latest, false) - .await - { - Ok(Some(_)) => {} - Ok(None) => { - return Err(eyre::eyre!( - "Eth RPC at {} returned no 'latest' block. Check genesis/chain is initialized.", - eth_url_str - )) - } - Err(e) => return Err(eyre::eyre!("Failed to reach Eth RPC at {}: {}", eth_url_str, e)), - } + // 2) No Eth RPC preflight here: Engine API is the consensus oracle. // Extract job_tx and notice_rx from archiver_channels let (archiver_handle, archiver_job_tx, archive_notice_rx) = match archiver_channels { @@ -510,6 +605,37 @@ impl Node for App { None => (None, None, None), }; + // Create shutdown token for graceful shutdown coordination + let shutdown = CancellationToken::new(); + let shutdown_for_signal = shutdown.clone(); + let shutdown_for_app = shutdown.clone(); + + // Spawn signal handler for graceful shutdown + // Unix: handle both SIGTERM and SIGINT (Ctrl+C) + // Windows: handle only Ctrl+C + tokio::spawn(async move { + #[cfg(unix)] + { + use tokio::signal::unix::SignalKind; + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()) + .expect("Failed to register SIGTERM handler"); + tokio::select! 
{ + _ = tokio::signal::ctrl_c() => { + tracing::info!("Received SIGINT (Ctrl+C), initiating graceful shutdown..."); + } + _ = sigterm.recv() => { + tracing::info!("Received SIGTERM, initiating graceful shutdown..."); + } + } + } + #[cfg(not(unix))] + { + let _ = tokio::signal::ctrl_c().await; + tracing::info!("Received Ctrl+C, initiating graceful shutdown..."); + } + shutdown_for_signal.cancel(); + }); + let app_handle = tokio::spawn(async move { if let Err(e) = crate::app::run( &mut state, @@ -517,6 +643,7 @@ impl Node for App { execution_client, archiver_job_tx, archive_notice_rx, + shutdown_for_app, ) .await { @@ -524,7 +651,13 @@ impl Node for App { } }); - Ok(Handle { app: app_handle, engine: engine_handle, tx_event, archiver: archiver_handle }) + Ok(Handle { + app: app_handle, + engine: engine_handle, + tx_event, + archiver: archiver_handle, + shutdown, + }) } async fn run(self) -> eyre::Result<()> { diff --git a/crates/test/Cargo.toml b/crates/test/Cargo.toml index 9602da8..b3d9b55 100644 --- a/crates/test/Cargo.toml +++ b/crates/test/Cargo.toml @@ -2,6 +2,7 @@ name = "ultramarine-test" version.workspace = true edition.workspace = true +license.workspace = true publish = false [lib] @@ -15,6 +16,7 @@ ultramarine-blob-engine.workspace = true ultramarine-cli.workspace = true ultramarine-consensus.workspace = true ultramarine-execution.workspace = true +ultramarine-genesis.workspace = true ultramarine-node = { workspace = true, features = ["test-harness"] } ultramarine-test-support.workspace = true ultramarine-types.workspace = true diff --git a/crates/test/tests/common/mod.rs b/crates/test/tests/common/mod.rs index c9c2ed7..0b6a790 100644 --- a/crates/test/tests/common/mod.rs +++ b/crates/test/tests/common/mod.rs @@ -197,7 +197,7 @@ pub(crate) async fn propose_with_optional_blobs( state.propose_value_with_blobs(height, round, bytes.clone(), payload, &[], bundle).await?; let sidecars = if let Some(bundle) = bundle { - let (_header, sidecars) = 
state.prepare_blob_sidecar_parts(&proposed, Some(bundle))?; + let (_header, sidecars) = state.prepare_blob_sidecar_parts(&proposed, Some(bundle)).await?; Some(sidecars) } else { None diff --git a/crates/test/tests/full_node/node_harness.rs b/crates/test/tests/full_node/node_harness.rs index 846906e..942ad95 100644 --- a/crates/test/tests/full_node/node_harness.rs +++ b/crates/test/tests/full_node/node_harness.rs @@ -13,7 +13,10 @@ use std::{ net::SocketAddr, path::{Path, PathBuf}, pin::Pin, - sync::{Arc, Mutex, Once}, + sync::{ + Arc, Mutex, Once, + atomic::{AtomicUsize, Ordering}, + }, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -89,6 +92,7 @@ use ultramarine_cli::{config::Config, new::generate_config}; use ultramarine_consensus::{ archive_metrics::ArchiveMetrics, metrics::DbMetrics, state::State, store::Store, }; +use ultramarine_genesis::build_dev_genesis; use ultramarine_node::node::{App, Handle}; use ultramarine_types::{ address::Address, @@ -123,6 +127,12 @@ use ultramarine_execution::EngineApi; type NodeConfigHook = Arc; type PayloadPlan = Arc usize + Send + Sync>; +type SyncPlan = Arc bool + Send + Sync>; +type ParentOverridePlan = Arc Option + Send + Sync>; +type StubStateHook = Arc; +type ExecutionRequestsPlan = Arc Vec + Send + Sync>; +type BundleOmitPlan = Arc bool + Send + Sync>; +type AcceptedPlan = Arc bool + Send + Sync>; #[derive(Clone)] struct HarnessConfig { @@ -130,6 +140,7 @@ struct HarnessConfig { start_height: Option, node_config_hook: Option, payload_plan: Option, + stub_state_hook: Option, } impl HarnessConfig { @@ -139,6 +150,12 @@ impl HarnessConfig { } } + fn apply_stub_state(&self, index: usize, state: &mut StubState) { + if let Some(hook) = &self.stub_state_hook { + (hook)(index, state); + } + } + fn payload_plan(&self) -> Option { self.payload_plan.clone() } @@ -151,11 +168,18 @@ struct FullNodeTestBuilder { start_height: Option, node_config_hook: Option, payload_plan: Option, + stub_state_hook: Option, } impl Default for 
FullNodeTestBuilder { fn default() -> Self { - Self { node_count: 3, start_height: None, node_config_hook: None, payload_plan: None } + Self { + node_count: 3, + start_height: None, + node_config_hook: None, + payload_plan: None, + stub_state_hook: None, + } } } @@ -190,6 +214,98 @@ impl FullNodeTestBuilder { self } + #[allow(dead_code)] + fn with_stub_state_hook(mut self, hook: F) -> Self + where + F: Fn(usize, &mut StubState) + Send + Sync + 'static, + { + let hook: StubStateHook = Arc::new(hook); + self.stub_state_hook = Some(match self.stub_state_hook.take() { + Some(previous) => Arc::new(move |index, state| { + (previous)(index, state); + (hook)(index, state); + }), + None => hook, + }); + self + } + + #[allow(dead_code)] + fn with_el_sync_plan(self, plan: F) -> Self + where + F: Fn(usize, Height) -> bool + Send + Sync + 'static, + { + let plan = Arc::new(plan); + self.with_stub_state_hook(move |index, state| { + let plan = Arc::clone(&plan); + let per_node: SyncPlan = Arc::new(move |height| plan(index, height)); + state.set_sync_plan(per_node); + }) + } + + #[allow(dead_code)] + fn with_el_http_get_block_failure(self, plan: F) -> Self + where + F: Fn(usize) -> bool + Send + Sync + 'static, + { + let plan = Arc::new(plan); + self.with_stub_state_hook(move |index, state| { + state.set_fail_get_block(plan(index)); + }) + } + + #[allow(dead_code)] + fn with_el_parent_override_plan(self, plan: F) -> Self + where + F: Fn(usize, Height) -> Option + Send + Sync + 'static, + { + let plan = Arc::new(plan); + self.with_stub_state_hook(move |index, state| { + let plan = Arc::clone(&plan); + let per_node: ParentOverridePlan = Arc::new(move |height| plan(index, height)); + state.set_parent_override_plan(per_node); + }) + } + + #[allow(dead_code)] + fn with_el_fcu_accepted_plan(self, plan: F) -> Self + where + F: Fn(usize, Height) -> bool + Send + Sync + 'static, + { + let plan = Arc::new(plan); + self.with_stub_state_hook(move |index, state| { + let plan = 
Arc::clone(&plan); + let per_node: AcceptedPlan = Arc::new(move |height| plan(index, height)); + state.set_accepted_plan(per_node); + }) + } + + #[allow(dead_code)] + fn with_el_execution_requests_plan(self, plan: F) -> Self + where + F: Fn(usize, Height) -> Vec + Send + Sync + 'static, + { + let plan = Arc::new(plan); + self.with_stub_state_hook(move |index, state| { + let plan = Arc::clone(&plan); + let per_node: ExecutionRequestsPlan = Arc::new(move |height| plan(index, height)); + state.set_execution_requests_plan(per_node); + }) + } + + #[allow(dead_code)] + fn with_el_omit_blobs_bundle_plan(self, plan: F) -> Self + where + F: Fn(usize, Height) -> bool + Send + Sync + 'static, + { + let plan = Arc::new(plan); + self.with_stub_state_hook(move |index, state| { + let plan = Arc::clone(&plan); + let per_node: BundleOmitPlan = Arc::new(move |height| plan(index, height)); + state.set_omit_bundle_plan(per_node); + }) + } + /// Enable the archiver worker and configure provider + auth for the test harness. 
/// /// Most Tier‑1 tests should keep `archiver.enabled=false` to avoid pruning blob bytes @@ -238,6 +354,7 @@ impl FullNodeTestBuilder { start_height: self.start_height, node_config_hook: self.node_config_hook.clone(), payload_plan: self.payload_plan.clone(), + stub_state_hook: self.stub_state_hook.clone(), }; let mut network = NetworkHarness::start(&config).await?; @@ -1740,7 +1857,7 @@ async fn full_node_execution_requests_signature_protection() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 4)] #[ignore = "requires full-node harness; run via make itest-node"] #[serial(full_node)] -async fn full_node_store_pruning_retains_recent_heights() -> Result<()> { +async fn full_node_store_pruning_preserves_decided_history() -> Result<()> { init_test_logging(); const TOTAL_HEIGHTS: usize = 8; const RETENTION: u64 = 5; @@ -1786,8 +1903,6 @@ async fn full_node_store_pruning_retains_recent_heights() -> Result<()> { } let metrics_snapshot = metrics.snapshot(); - let retention_window = RETENTION as usize; - let expected_pruned = TOTAL_HEIGHTS.saturating_sub(retention_window); assert_eq!(metrics_snapshot.lifecycle_promoted, TOTAL_HEIGHTS as u64); assert_eq!( metrics_snapshot.lifecycle_pruned, 0, @@ -1798,18 +1913,21 @@ async fn full_node_store_pruning_retains_recent_heights() -> Result<()> { (TOTAL_HEIGHTS * BYTES_PER_BLOB) as i64 ); - // The retention window applies to the consensus store (`Store::prune()`), not blob - // bytes. Old decided values should be removed from the store... - for height in 0..expected_pruned { - let height = Height::new(height as u64); - let decided = state.get_decided_value(height).await?; - assert!(decided.is_none(), "decided value at height {height} should be pruned"); - } - // ...while the most recent decided values are retained. - for height in expected_pruned..TOTAL_HEIGHTS { + // The retention window must NOT prune decided history. All decided values, + // certificates, and block data are retained for genesis sync. 
+ for height in 0..TOTAL_HEIGHTS { let height = Height::new(height as u64); - let decided = state.get_decided_value(height).await?; - assert!(decided.is_some(), "expected decided value at height {height}"); + let decided = state + .get_decided_value(height) + .await? + .expect("decided value should be retained"); + let round = decided.certificate.round; + let block_data = state.get_block_data(height, round).await?; + assert!( + block_data.is_some(), + "block data at height {} should be retained", + height + ); } // Blob bytes themselves remain available locally unless an archive notice triggers // pruning. @@ -1853,7 +1971,7 @@ async fn full_node_sync_package_roundtrip() -> Result<()> { let locally_proposed = LocallyProposedValue::new(height, round, value.clone()); let (_signed_header, sidecars) = - state.prepare_blob_sidecar_parts(&locally_proposed, Some(&bundle))?; + state.prepare_blob_sidecar_parts(&locally_proposed, Some(&bundle)).await?; let package = SyncedValuePackage::Full { value: value.clone(), @@ -1901,6 +2019,296 @@ async fn full_node_sync_package_roundtrip() -> Result<()> { .await } +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_el_syncing_degrades_node() -> Result<()> { + init_test_logging(); + FullNodeTestBuilder::new() + .node_count(4) + .with_el_sync_plan(|index, height| index == 0 && height.as_u64() >= 2) + .run(|network| { + Box::pin(async move { + network.wait_for_nodes_at(&[1, 2, 3], Height::new(3)).await?; + + let guard = network.nodes[0].stub_state.lock().await; + eyre::ensure!( + guard.latest_block.block_number == 1, + "expected node 0 EL stub to remain at height 1 while syncing, got {}", + guard.latest_block.block_number + ); + Ok(()) + }) + }) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn 
full_node_el_syncing_still_sends_fcu() -> Result<()> { + init_test_logging(); + FullNodeTestBuilder::new() + .node_count(4) + .with_el_sync_plan(|index, height| index == 0 && height.as_u64() >= 1) + .run(|network| { + Box::pin(async move { + // Let healthy nodes advance; node 0 should still attempt FCU while syncing. + network.wait_for_nodes_at(&[1, 2, 3], Height::new(2)).await?; + sleep(Duration::from_millis(200)).await; + + let guard = network.nodes[0].stub_state.lock().await; + eyre::ensure!( + guard.forkchoice_calls > 0, + "expected FCU calls while EL syncing; recorded heads={:?}", + guard.forkchoice_heads + ); + Ok(()) + }) + }) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_el_syncing_blocks_payload_build() -> Result<()> { + init_test_logging(); + FullNodeTestBuilder::new() + .node_count(1) + .with_el_sync_plan(|_index, height| height.as_u64() >= 1) + .run(|network| { + Box::pin(async move { + // Let the node attempt to propose for a couple of rounds. + sleep(Duration::from_secs(2)).await; + + let guard = network.nodes[0].stub_state.lock().await; + eyre::ensure!( + guard.get_payload_requests.is_empty(), + "expected no getPayload calls while EL is syncing; got {:?}", + guard.get_payload_requests + ); + eyre::ensure!( + guard.new_payload_requests.is_empty(), + "expected no newPayload calls while EL is syncing; got {:?}", + guard.new_payload_requests + ); + Ok(()) + }) + }) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_fcu_gate_does_not_require_http_latest() -> Result<()> { + init_test_logging(); + FullNodeTestBuilder::new() + .node_count(1) + .run(|network| { + Box::pin(async move { + // Bootstrap normally, then force eth_getBlockByNumber to fail before restart. 
+ network.wait_for_nodes_at(&[0], Height::new(2)).await?; + network.stop_node(0).await?; + { + let mut guard = network.nodes[0].stub_state.lock().await; + guard.set_fail_get_block(true); + } + network.start_node(0).await?; + // Restart should succeed because CL head is restored from store, + // and FCU gate should not depend on eth_getBlockByNumber. + network.wait_for_nodes_at(&[0], Height::new(3)).await?; + Ok(()) + }) + }) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_el_transient_syncing_recovers() -> Result<()> { + init_test_logging(); + let attempts = Arc::new(AtomicUsize::new(0)); + FullNodeTestBuilder::new() + .node_count(2) + .with_el_sync_plan({ + let attempts = Arc::clone(&attempts); + move |index, height| { + if index != 0 || height.as_u64() < 2 { + return false; + } + attempts.fetch_add(1, Ordering::SeqCst) < 5 + } + }) + .run(|network| { + Box::pin(async move { + network.wait_for_nodes_at(&[0, 1], Height::new(3)).await?; + Ok(()) + }) + }) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_el_transient_bad_payload_parent_recovers() -> Result<()> { + init_test_logging(); + let attempts = Arc::new(AtomicUsize::new(0)); + FullNodeTestBuilder::new() + .node_count(2) + .with_el_parent_override_plan({ + let attempts = Arc::clone(&attempts); + move |index, height| { + if index != 0 || height.as_u64() != 2 { + return None; + } + if attempts.fetch_add(1, Ordering::SeqCst) == 0 { + // Return a parent that is guaranteed to not equal the requested head. 
+ return Some(B256::from([0xAAu8; 32])); + } + None + } + }) + .run(|network| { + Box::pin(async move { + network.wait_for_nodes_at(&[0, 1], Height::new(3)).await?; + Ok(()) + }) + }) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_split_head_recovery() -> Result<()> { + init_test_logging(); + FullNodeTestBuilder::new() + .node_count(4) + .with_el_parent_override_plan(|index, height| { + if height.as_u64() == 2 && index <= 1 { + // Force two nodes to build on a stale/invalid parent for height 2. + return Some(B256::from([0xBBu8; 32])); + } + None + }) + .run(|network| { + Box::pin(async move { + // Split proposals should be rejected; consensus must still recover and advance. + network.wait_for_nodes_at(&[0, 1, 2, 3], Height::new(3)).await?; + Ok(()) + }) + }) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_fcu_accepted_rejected() -> Result<()> { + init_test_logging(); + FullNodeTestBuilder::new() + .node_count(4) + .with_el_fcu_accepted_plan(|index, height| index == 0 && height.as_u64() == 2) + .run(|network| { + Box::pin(async move { + // Node 0 should refuse to propose when EL returns ACCEPTED, but the cluster + // should still advance via other proposers. 
+ network.wait_for_nodes_at(&[0, 1, 2, 3], Height::new(3)).await?; + Ok(()) + }) + }) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_rejects_invalid_execution_requests_from_el() -> Result<()> { + init_test_logging(); + FullNodeTestBuilder::new() + .node_count(1) + .with_el_execution_requests_plan(|_, height| { + if height.as_u64() == 1 { + invalid_execution_requests_out_of_order() + } else { + sample_execution_requests_for_height(height) + } + }) + .run(|network| { + Box::pin(async move { + sleep(Duration::from_secs(2)).await; + + let guard = network.nodes[0].stub_state.lock().await; + eyre::ensure!( + guard.new_payload_requests.is_empty(), + "unexpected newPayload calls despite invalid execution requests" + ); + eyre::ensure!( + guard.latest_block.block_number == 0, + "stub head advanced despite invalid execution requests" + ); + drop(guard); + + network.stop_node(0).await?; + let node = network.node_ref(0)?; + let store_path = node.home.path().join("store.db"); + let store = open_store_read_only_retry(&store_path, Duration::from_secs(5)).await?; + let decided = store.get_decided_value(Height::new(1)).await?; + assert!( + decided.is_none(), + "height 1 should not be decided with invalid execution requests" + ); + + Ok(()) + }) + }) + .await +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_rejects_payload_without_blobs_bundle() -> Result<()> { + init_test_logging(); + FullNodeTestBuilder::new() + .node_count(1) + .with_payload_plan(|_| 1) + .with_el_omit_blobs_bundle_plan(|_, height| height.as_u64() == 1) + .run(|network| { + Box::pin(async move { + sleep(Duration::from_secs(2)).await; + + let guard = network.nodes[0].stub_state.lock().await; + eyre::ensure!( + guard.new_payload_requests.is_empty(), + 
"unexpected newPayload calls despite missing blobs bundle" + ); + eyre::ensure!( + guard.latest_block.block_number == 0, + "stub head advanced despite missing blobs bundle" + ); + drop(guard); + + network.stop_node(0).await?; + let node = network.node_ref(0)?; + let store_path = node.home.path().join("store.db"); + let store = open_store_read_only_retry(&store_path, Duration::from_secs(5)).await?; + let decided = store.get_decided_value(Height::new(1)).await?; + assert!( + decided.is_none(), + "height 1 should not be decided with missing blobs bundle" + ); + + Ok(()) + }) + }) + .await +} + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] #[ignore = "requires full-node harness; run via make itest-node"] #[serial(full_node)] @@ -1974,6 +2382,7 @@ struct NodeProcess { event_rx: TokioMutex>, config: Config, genesis_path: PathBuf, + execution_genesis_path: PathBuf, key_path: PathBuf, jwt_path: PathBuf, base_start_height: Option, @@ -2012,6 +2421,7 @@ impl NodeProcess { genesis_file: self.genesis_path.clone(), private_key_file: self.key_path.clone(), start_height: app_start_height, + execution_genesis_file: Some(self.execution_genesis_path.clone()), engine_http_url: Some(engine_url), engine_ipc_path: None, eth1_rpc_url: Some(eth_url), @@ -2127,6 +2537,8 @@ impl NetworkHarness { async fn start(config: &HarnessConfig) -> Result { eyre::ensure!(config.node_count > 0, "at least one node required"); let (genesis, validator_keys) = make_genesis(config.node_count); + let execution_genesis = + build_dev_genesis(1).wrap_err("build execution genesis for harness")?; // Port selection: // - Avoid the TOCTOU race of "bind :0, read port, drop listener". 
@@ -2146,7 +2558,9 @@ impl NetworkHarness { for index in 0..config.node_count { let payload_plan = config.payload_plan(); - let stub_state = Arc::new(TokioMutex::new(StubState::new(payload_plan.clone()))); + let mut local_stub = StubState::new(payload_plan.clone()); + config.apply_stub_state(index, &mut local_stub); + let stub_state = Arc::new(TokioMutex::new(local_stub)); let engine_stub = match EngineRpcStub::start(stub_state.clone()).await { Ok(stub) => stub, Err(e) => { @@ -2172,9 +2586,9 @@ impl NetworkHarness { break; } }; - let key_path = match write_json( - home.path().join("validator_key.json"), - &validator.private_key(), + let execution_genesis_path = match write_json( + home.path().join("execution_genesis.json"), + &execution_genesis, ) { Ok(path) => path, Err(e) => { @@ -2183,9 +2597,20 @@ impl NetworkHarness { break; } }; - let jwt_path = match write_jwt(home.path().join("jwt.hex")) { - Ok(path) => path, - Err(e) => { + let key_path = match write_json( + home.path().join("validator_key.json"), + &validator.private_key(), + ) { + Ok(path) => path, + Err(e) => { + engine_stub.shutdown().await; + start_error = Some(e); + break; + } + }; + let jwt_path = match write_jwt(home.path().join("jwt.hex")) { + Ok(path) => path, + Err(e) => { engine_stub.shutdown().await; start_error = Some(e); break; @@ -2241,6 +2666,7 @@ impl NetworkHarness { genesis_file: genesis_path, private_key_file: key_path, start_height: config.start_height, + execution_genesis_file: Some(execution_genesis_path.clone()), engine_http_url: Some(engine_http_url.clone()), engine_ipc_path: None, eth1_rpc_url: Some(eth1_rpc_url), @@ -2265,6 +2691,7 @@ impl NetworkHarness { event_rx, config: stored_config, genesis_path: stored_genesis, + execution_genesis_path, key_path: stored_key, jwt_path: stored_jwt, base_start_height: config.start_height, @@ -2747,6 +3174,14 @@ async fn open_state_ready(node: &NodeProcess) -> Result, Vec)>, payload_plan: Option, + sync_plan: Option, + parent_override_plan: 
Option, + execution_requests_plan: Option, + omit_bundle_plan: Option, + accepted_plan: Option, + new_payload_requests: Vec<(u64, Vec)>, + get_payload_requests: Vec, + forkchoice_calls: u64, + forkchoice_heads: Vec, + forkchoice_with_attrs: Vec, + fail_get_block: bool, } impl StubState { @@ -3330,12 +3784,70 @@ impl StubState { next_payload_id: 0, pending: HashMap::new(), payload_plan, + sync_plan: None, + parent_override_plan: None, + execution_requests_plan: None, + omit_bundle_plan: None, + accepted_plan: None, + new_payload_requests: Vec::new(), + get_payload_requests: Vec::new(), + forkchoice_calls: 0, + forkchoice_heads: Vec::new(), + forkchoice_with_attrs: Vec::new(), + fail_get_block: false, } } fn blob_count_for_height(&self, height: Height) -> usize { self.payload_plan.as_ref().map(|plan| plan(height)).unwrap_or(1) } + + fn set_sync_plan(&mut self, plan: SyncPlan) { + self.sync_plan = Some(plan); + } + + fn is_syncing_for(&self, height: Height) -> bool { + self.sync_plan.as_ref().map(|plan| plan(height)).unwrap_or(false) + } + + fn set_parent_override_plan(&mut self, plan: ParentOverridePlan) { + self.parent_override_plan = Some(plan); + } + + fn parent_override_for(&self, height: Height) -> Option { + self.parent_override_plan.as_ref().and_then(|plan| plan(height)) + } + + fn set_execution_requests_plan(&mut self, plan: ExecutionRequestsPlan) { + self.execution_requests_plan = Some(plan); + } + + fn execution_requests_for(&self, height: Height) -> Vec { + self.execution_requests_plan + .as_ref() + .map(|plan| plan(height)) + .unwrap_or_else(|| sample_execution_requests_for_height(height)) + } + + fn set_omit_bundle_plan(&mut self, plan: BundleOmitPlan) { + self.omit_bundle_plan = Some(plan); + } + + fn omit_bundle_for(&self, height: Height) -> bool { + self.omit_bundle_plan.as_ref().map(|plan| plan(height)).unwrap_or(false) + } + + fn set_accepted_plan(&mut self, plan: AcceptedPlan) { + self.accepted_plan = Some(plan); + } + + fn is_accepted_for(&self, 
height: Height) -> bool { + self.accepted_plan.as_ref().map(|plan| plan(height)).unwrap_or(false) + } + + fn set_fail_get_block(&mut self, enabled: bool) { + self.fail_get_block = enabled; + } } fn default_execution_block() -> ExecutionBlock { @@ -3359,7 +3871,11 @@ fn timestamp_for_height(height: u64) -> u64 { } fn height_from_block_hash(hash: B256) -> Option { - if hash == B256::ZERO { None } else { Some(u64::from(hash.0[0])) } + if hash == B256::ZERO { + return None; + } + let first = hash.0[0]; + if hash.0.iter().all(|byte| *byte == first) { Some(u64::from(first)) } else { None } } #[derive(Deserialize)] @@ -3469,37 +3985,82 @@ async fn handle_forkchoice(req: &RpcRequest, state: Arc>) serde_json::from_value(Value::Array(params)).wrap_err("parse forkchoice params")?; let mut guard = state.lock().await; - - if let Some(head_height) = height_from_block_hash(forkchoice.head_block_hash) && - head_height > guard.latest_block.block_number - { - let parent_hash = - if head_height == 0 { B256::ZERO } else { block_hash_for_height(head_height - 1) }; - guard.latest_block.block_number = head_height; - guard.latest_block.block_hash = forkchoice.head_block_hash; - guard.latest_block.parent_hash = parent_hash; - guard.latest_block.timestamp = timestamp_for_height(head_height); - debug_log!( - "engine_forkchoiceUpdatedV3: updated latest block to height {} from forkchoice head", - head_height - ); + guard.forkchoice_calls += 1; + + if let Some(head_height) = height_from_block_hash(forkchoice.head_block_hash) { + guard.forkchoice_heads.push(head_height); + let head = Height::new(head_height); + if guard.is_syncing_for(head) { + return Ok(json!({ + "payloadStatus": { + "status": "SYNCING", + "latestValidHash": Value::Null, + "validationError": Value::Null + }, + "payloadId": Value::Null + })); + } + if head_height > guard.latest_block.block_number { + let parent_hash = + if head_height == 0 { B256::ZERO } else { block_hash_for_height(head_height - 1) }; + 
guard.latest_block.block_number = head_height; + guard.latest_block.block_hash = forkchoice.head_block_hash; + guard.latest_block.parent_hash = parent_hash; + guard.latest_block.timestamp = timestamp_for_height(head_height); + debug_log!( + "engine_forkchoiceUpdatedV3: updated latest block to height {} from forkchoice head", + head_height + ); + } } - if let Some(_attrs) = payload_attrs { + if let Some(attrs) = payload_attrs { // Generate payload for next height (latest + 1) let next_height = Height::new(guard.latest_block.block_number + 1); + guard.forkchoice_with_attrs.push(next_height.as_u64()); + + if guard.is_accepted_for(next_height) { + return Ok(json!({ + "payloadStatus": { + "status": "ACCEPTED", + "latestValidHash": Value::Null, + "validationError": Value::Null + }, + "payloadId": Value::Null + })); + } + + if guard.is_syncing_for(next_height) { + return Ok(json!({ + "payloadStatus": { + "status": "SYNCING", + "latestValidHash": Value::Null, + "validationError": Value::Null + }, + "payloadId": Value::Null + })); + } debug_log!("engine_forkchoiceUpdatedV3: generating payload for height {}", next_height); let blob_count = guard.blob_count_for_height(next_height); let bundle = if blob_count == 0 { None } else { Some(sample_blob_bundle(blob_count)) }; - let payload = sample_execution_payload_v3_for_height(next_height, bundle.as_ref()); - let execution_requests = sample_execution_requests_for_height(next_height); + let mut payload = sample_execution_payload_v3_for_height(next_height, bundle.as_ref()); + let parent_hash = + guard.parent_override_for(next_height).unwrap_or(forkchoice.head_block_hash); + payload.payload_inner.payload_inner.parent_hash = parent_hash; + payload.payload_inner.payload_inner.timestamp = attrs.timestamp; + payload.payload_inner.payload_inner.prev_randao = attrs.prev_randao; + payload.payload_inner.payload_inner.fee_recipient = attrs.suggested_fee_recipient; + payload.payload_inner.withdrawals = attrs.withdrawals.unwrap_or_default(); + 
let response_bundle = + if guard.omit_bundle_for(next_height) { None } else { bundle.clone() }; + let execution_requests = guard.execution_requests_for(next_height); let payload_id = guard.next_payload_id; guard.next_payload_id += 1; guard.pending.insert( payload_id.to_be_bytes(), - (payload.clone(), bundle.clone(), execution_requests), + (payload.clone(), response_bundle, execution_requests), ); Ok(json!({ @@ -3531,8 +4092,10 @@ async fn handle_get_payload( let payload_id_hex = params.first().and_then(Value::as_str).ok_or_else(|| eyre::eyre!("missing payload id"))?; let id_bytes = parse_payload_id(payload_id_hex)?; + let payload_id = u64::from_be_bytes(id_bytes); let mut guard = state.lock().await; + guard.get_payload_requests.push(payload_id); let (payload, bundle, execution_requests) = guard.pending.remove(&id_bytes).ok_or_else(|| eyre::eyre!("unknown payload id"))?; @@ -3567,6 +4130,21 @@ async fn handle_new_payload( } let mut guard = state.lock().await; + let height = Height::new(payload.block_number); + if guard.is_syncing_for(height) { + return Ok(json!({ + "status": "SYNCING", + "latestValidHash": Value::Null, + "validationError": Value::Null + })); + } + if is_v4 { + let requests: Vec = + serde_json::from_value(params[3].clone()).wrap_err("decode execution requests")?; + validate_execution_request_strings(&requests)?; + guard.new_payload_requests.push((payload.block_number, requests)); + } + guard.latest_block = ExecutionBlock { block_hash: payload.block_hash, block_number: payload.block_number, @@ -3584,6 +4162,9 @@ async fn handle_new_payload( async fn handle_get_block(state: Arc>) -> Result { let guard = state.lock().await; + if guard.fail_get_block { + return Err(eyre::eyre!("eth_getBlockByNumber disabled by test")); + } let block = &guard.latest_block; Ok(json!({ "number": format_hex_u64(block.block_number), @@ -3641,6 +4222,49 @@ fn format_zero_bytes(bytes: usize) -> String { format!("0x{}", "00".repeat(bytes)) } +fn 
format_execution_requests(requests: &[AlloyBytes]) -> Vec { + requests.iter().map(|request| format!("0x{}", hex::encode(request.as_ref()))).collect() +} + +fn invalid_execution_requests_out_of_order() -> Vec { + vec![AlloyBytes::copy_from_slice(&[0x05, 0xAA]), AlloyBytes::copy_from_slice(&[0x04, 0xBB])] +} + +fn parse_execution_request_hex(request: &str, index: usize) -> Result> { + let trimmed = request + .strip_prefix("0x") + .ok_or_else(|| eyre::eyre!("execution request {} missing 0x prefix", index))?; + hex::decode(trimmed).wrap_err_with(|| format!("execution request {} has invalid hex", index)) +} + +fn validate_execution_request_strings(requests: &[String]) -> Result<()> { + if requests.is_empty() { + return Ok(()); + } + + let mut prev_type: Option = None; + for (idx, request) in requests.iter().enumerate() { + let bytes = parse_execution_request_hex(request, idx)?; + if bytes.len() <= 1 { + return Err(eyre::eyre!("execution request {} must include type byte and payload", idx)); + } + let request_type = bytes[0]; + if let Some(prev) = prev_type && + request_type <= prev + { + return Err(eyre::eyre!( + "execution requests must be strictly increasing by type (index {}, prev {}, current {})", + idx, + prev, + request_type + )); + } + prev_type = Some(request_type); + } + + Ok(()) +} + fn zero_address() -> String { format!("0x{:040}", 0) } @@ -3817,3 +4441,571 @@ fn build_error(id: &Value, code: i64, message: String) -> Value { "id": id, }) } + +async fn open_store_read_only_retry(path: &Path, deadline: Duration) -> Result { + timeout(deadline, async { + loop { + match Store::open_read_only(path, DbMetrics::new()) { + Ok(store) => return Ok::(store), + Err(_) => sleep(Duration::from_millis(100)).await, + } + } + }) + .await + .wrap_err("timed out opening store")? 
+} + +async fn wait_for_pruned_metadata(store: &Store, height: Height, deadline: Duration) -> Result<()> { + timeout(deadline, async { + loop { + if let Some(metadata) = store.get_blob_metadata(height).await? && + metadata.is_pruned() + { + return Ok::<(), eyre::Report>(()); + } + sleep(Duration::from_millis(100)).await; + } + }) + .await + .wrap_err("timed out waiting for archived height to be pruned")? +} + +// ============================================================================ +// Sync Fix Integration Tests (FIX-001, FIX-002, FIX-003) +// ============================================================================ + +/// Tests that fullnodes can sync from genesis even when validators have archived blob data. +/// +/// This test validates the fix for FIX-001 (decided data not pruned) and FIX-002 +/// (history_min_height returns 0). +/// +/// ## Background (BUG-001, BUG-002) +/// +/// **BUG-001**: `store.prune()` was removing decided values, certificates, and block data, +/// making it impossible for peers to serve historical block data to syncing nodes. +/// +/// **BUG-002**: `get_earliest_height()` returned the minimum height from `DECIDED_VALUES_TABLE`, +/// which after pruning would be high (e.g., 6718092). This caused Malachite's peer filtering +/// to reject all peers because `history_min_height > sync_target_height`. +/// +/// ## What this test validates +/// +/// 1. Validators produce blobbed blocks and archive them (blobs are pruned) +/// 2. A new node (fullnode) joins and attempts to sync from genesis +/// 3. Despite blob bytes being pruned, the fullnode receives `MetadataOnly` sync packages with +/// execution payloads +/// 4. 
The fullnode successfully imports blocks and reaches the same height as validators +/// +/// ## Expected behavior after fixes +/// +/// - `history_min_height == 0` for all validators (FIX-002) +/// - Decided values, certificates, and block data are retained forever (FIX-001) +/// - `GetDecidedValue` returns `MetadataOnly` with payload for archived heights (FIX-003) +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_genesis_sync_with_archived_blobs() -> Result<()> { + init_test_logging(); + let provider = Arc::new(MockBlobProvider::start().await?); + let provider_url = provider.url(); + let provider_clone = provider.clone(); + + FullNodeTestBuilder::new() + .node_count(4) + .with_payload_plan(|_| 1) // 1 blob per block + .with_archiver(provider_url.clone(), "mock-provider", "test-token") + .run(move |network| { + let provider = provider_clone.clone(); + Box::pin(async move { + // Step 1: Validators produce and archive several blocks + // Take node 3 offline BEFORE any consensus happens - it will sync from genesis + network.stop_node(3).await?; + + // Wait for validators (0, 1, 2) to reach height 5 + let target_height = Height::new(5); + network.wait_for_nodes_at(&[0, 1, 2], target_height).await?; + + // Step 2: Wait for at least one blob to be archived + let uploads = provider.wait_for_uploads(1, Duration::from_secs(30)).await?; + assert!(!uploads.is_empty(), "expected at least one archived upload"); + + // Step 3: Wait for pruning to occur on all validators for the archived height + let archived_height = Height::new(uploads[0].height); + for idx in 0..3 { + let node = network.node_ref(idx)?; + let store_path = node.home.path().join("store.db"); + let store = + open_store_read_only_retry(&store_path, Duration::from_secs(5)).await?; + wait_for_pruned_metadata(&store, archived_height, Duration::from_secs(30)) + .await?; + } + + // Stop node 0 to 
safely inspect its state + network.stop_node(0).await?; + let validator_node = network.node_ref(0)?; + + // Open state (with genesis seeded) to check invariants + let state = open_state_ready(validator_node).await?; + + // Verify that decided data is still present (FIX-001) + let decided_value = state.get_decided_value(archived_height).await?; + assert!( + decided_value.is_some(), + "BUG-001 REGRESSION: decided value at height {} was pruned", + archived_height + ); + + // Verify history_min_height returns 0 (FIX-002) + let earliest = state.get_earliest_height().await; + assert_eq!( + earliest, + Height::new(0), + "BUG-002 REGRESSION: history_min_height should be 0, got {}", + earliest + ); + drop(state); + + // Restart node 0 so it can serve sync requests + network.start_node(0).await?; + + // Step 4: Bring node 3 online - it needs to sync from genesis + network.start_node(3).await?; + + // Step 5: Wait for node 3 to sync up to the target height + // This validates that sync works despite archived blobs + network.wait_for_height(3, target_height).await?; + + // Step 6: Verify node 3 executed archived heights via newPayloadV4 + let expected_requests = format_execution_requests( + &sample_execution_requests_for_height(archived_height), + ); + let stub_guard = network.nodes[3].stub_state.lock().await; + let archived_call = stub_guard + .new_payload_requests + .iter() + .find(|(height, _)| *height == archived_height.as_u64()) + .cloned(); + assert!( + archived_call.is_some(), + "expected engine newPayloadV4 for archived height {}", + archived_height + ); + let archived_call = archived_call.expect("archived call present"); + assert_eq!( + archived_call.1, expected_requests, + "execution requests mismatch for archived height {}", + archived_height + ); + drop(stub_guard); + + // Step 7: Verify node 3 has the correct blob metadata for all heights + // It should have metadata even for archived heights + network.stop_node(3).await?; + let synced_node = network.node_ref(3)?; 
+ let synced_store = Store::open_read_only( + synced_node.home.path().join("store.db"), + DbMetrics::new(), + )?; + + for h in 1..=target_height.as_u64() { + let height = Height::new(h); + let metadata = synced_store.get_blob_metadata(height).await?; + assert!( + metadata.is_some(), + "synced node missing BlobMetadata for height {}", + height + ); + } + let decided = synced_store + .get_decided_value(archived_height) + .await? + .expect("synced node missing decided value for archived height"); + let block_data = + synced_store.get_block_data(archived_height, decided.certificate.round).await?; + assert!( + block_data.is_some(), + "synced node missing block data for archived height {}", + archived_height + ); + let execution_requests = synced_store + .get_execution_requests(archived_height, decided.certificate.round) + .await?; + assert!( + execution_requests.is_some(), + "synced node missing execution requests for archived height {}", + archived_height + ); + let metadata = synced_store + .get_blob_metadata(archived_height) + .await? + .expect("synced node missing BlobMetadata for archived height"); + assert!( + metadata.is_pruned(), + "synced node should mark archived height {} as pruned", + archived_height + ); + + Ok(()) + }) + }) + .await?; + + provider.shutdown().await; + Ok(()) +} + +/// Tests that MetadataOnly sync packages with payload can be processed successfully. +/// +/// This test validates FIX-003: when blobs have been pruned (archived), the sync mechanism +/// should handle `SyncedValuePackage::MetadataOnly` containing the execution payload, +/// allowing the syncing peer to import the block without blob sidecars. +/// +/// ## Background (MetadataOnly sync) +/// +/// Following the Lighthouse pattern, blocks outside the blob-retention boundary can be +/// imported without blob sidecars. In Load Network, that boundary is defined by +/// successful archive notices rather than a fixed DA window. 
The key requirement is +/// that the execution payload must still be available. +/// +/// ## What this test validates +/// +/// 1. A validator produces blobbed blocks and archives them +/// 2. Blob bytes are pruned after archival +/// 3. Decided value, block data, and metadata are still available (FIX-001) +/// 4. `process_synced_package()` can import `MetadataOnly` packages with payload +/// 5. Block metadata is correctly stored on the syncing node +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_metadataonly_sync_returns_payload() -> Result<()> { + init_test_logging(); + let provider = Arc::new(MockBlobProvider::start().await?); + let provider_url = provider.url(); + let provider_clone = provider.clone(); + + FullNodeTestBuilder::new() + .node_count(3) // Need 3 for quorum + .with_payload_plan(|_| 1) + .with_archiver(provider_url.clone(), "mock-provider", "test-token") + .run(move |network| { + let provider = provider_clone.clone(); + Box::pin(async move { + // Step 1: Wait for validators to produce and archive blocks + let target_height = Height::new(3); + network.wait_for_nodes_at(&[0, 1, 2], target_height).await?; + + // Wait for at least one block to be archived + let uploads = provider.wait_for_uploads(1, Duration::from_secs(30)).await?; + assert!(!uploads.is_empty(), "expected at least one archived upload"); + + // Wait for pruning + sleep(Duration::from_secs(2)).await; + + let archived_height = Height::new(uploads[0].height); + + // Step 2: Verify the archived height has been marked as pruned + network.stop_node(0).await?; + let node = network.node_ref(0)?; + let state = open_state_read_only(node).await?; + + // Check that blob bytes are pruned for the archived height + let metadata = state + .get_blob_metadata(archived_height) + .await? 
+ .expect("metadata should exist for archived height"); + + assert!( + metadata.is_pruned(), + "expected metadata at height {} to be marked as pruned after archival", + archived_height + ); + + // Step 3: Verify that block data is still available (FIX-001) + let decided = state + .get_decided_value(archived_height) + .await? + .expect("decided value should exist even after blob pruning"); + let round = decided.certificate.round; + + let block_data = state.get_block_data(archived_height, round).await?; + assert!( + block_data.is_some(), + "BUG: block data was pruned at height {} (FIX-001 regression)", + archived_height + ); + let payload_bytes = block_data.unwrap(); + assert!(!payload_bytes.is_empty(), "execution payload bytes should not be empty"); + let execution_requests = + state.get_execution_requests(archived_height, round).await?.unwrap_or_default(); + assert!( + !execution_requests.is_empty(), + "execution requests should be retained for archived height {}", + archived_height + ); + + // Step 4: Verify archive notices exist + let archive_notices = state.load_archive_notices(archived_height).await?; + assert!( + !archive_notices.is_empty(), + "expected archive notices for pruned height {}", + archived_height + ); + + // Step 5: Verify blobs are reported as pruned + match state.get_blobs_with_status_check(archived_height).await { + Err(BlobEngineError::BlobsPruned { locators, blob_count, .. 
}) => { + assert!( + blob_count > 0, + "expected at least one pruned blob at height {}", + archived_height + ); + assert!( + !locators.is_empty(), + "expected locators for pruned blobs at height {}", + archived_height + ); + } + Ok(blobs) if blobs.is_empty() => { + // Acceptable if blob count was 0 for this height + } + Ok(_blobs) => { + // Blobs still available, archival/pruning not complete yet + // This is acceptable - we've verified the data is retained + } + Err(other) => { + return Err(eyre::eyre!( + "unexpected blob engine error for height {}: {:?}", + archived_height, + other + )); + } + } + + Ok(()) + }) + }) + .await?; + + provider.shutdown().await; + Ok(()) +} + +/// Tests that sync retries with another peer when receiving MetadataOnly without payload. +/// +/// This test validates the retry mechanism: if a peer returns `MetadataOnly` WITHOUT an +/// execution payload (indicating data is truly unavailable), the syncing node should +/// retry with another peer rather than fail immediately. +/// +/// ## Background +/// +/// In a mixed network where some validators have pruned data and others haven't, or where +/// different validators have different retention policies, a syncing node might receive +/// incomplete sync responses. The correct behavior is to: +/// +/// 1. Detect when a sync response lacks required data +/// 2. Mark the peer as unable to serve that height +/// 3. Retry with another peer that might have the data +/// +/// ## What this test validates +/// +/// 1. Node receives sync response without execution payload +/// 2. Node correctly identifies this as an incomplete response +/// 3. Node retries sync from a different peer +/// 4. Sync eventually succeeds when a peer with full data responds +/// +/// ## Note +/// +/// This test exercises the peer scoring and retry logic in the Malachite sync layer, +/// combined with Ultramarine's handling of `MetadataOnly` packages. 
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_sync_retry_on_no_payload() -> Result<()> { + init_test_logging(); + + // This test requires manual state manipulation to simulate a peer returning + // MetadataOnly without payload. We'll use the unit test harness approach + // to validate the retry logic. + FullNodeTestBuilder::new() + .node_count(1) + .start_height(Some(Height::new(0))) + .run(|network| { + Box::pin(async move { + network.stop_node(0).await?; + let validator_address = network.node_address(0)?; + let (mut node_state, _node_metrics) = { + let node = network.node_ref(0)?; + open_state_ready_with_metrics(node).await? + }; + + let height = Height::new(0); + let round = Round::new(0); + let bundle = sample_blob_bundle(1); + let payload = sample_execution_payload_v3_for_height(height, Some(&bundle)); + + // Step 1: Create a valid proposal first (to have correct metadata) + let (_proposed, _payload_bytes, maybe_sidecars) = propose_with_optional_blobs( + &mut node_state, + height, + round, + &payload, + Some(&bundle), + ) + .await?; + let _sidecars = maybe_sidecars.expect("sidecars expected"); + + // Step 2: Create a MetadataOnly package WITHOUT execution payload + // This simulates a peer that has pruned both blobs AND block data + let header = ExecutionPayloadHeader::from_payload(&payload, None)?; + let metadata = ValueMetadata::new(header, bundle.commitments.clone()); + let value = StateValue::new(metadata); + + let package_without_payload = SyncedValuePackage::MetadataOnly { + value, + archive_notices: vec![], // No archive notices either + execution_payload_ssz: None, // KEY: No payload + execution_requests: vec![], + }; + + // Step 3: Process the package - should return None (cannot import) + let encoded = package_without_payload.encode().map_err(|e| eyre::eyre!(e))?; + let decoded = SyncedValuePackage::decode(&encoded).map_err(|e| 
eyre::eyre!(e))?; + + let result = node_state + .process_synced_package(height, round, validator_address, decoded) + .await?; + + // The package should be rejected because we can't import without payload + assert!( + result.is_none(), + "MetadataOnly without payload should be rejected (returns None for retry)" + ); + + // Step 4: Verify that the height was NOT marked as synced + // (so the node will retry with another peer) + let decided = node_state.get_decided_value(height).await?; + assert!( + decided.is_none(), + "Height should not be marked as decided after failed sync" + ); + + // Step 5: Now process a valid Full package (simulating successful retry) + let bundle_retry = sample_blob_bundle(1); + let payload_retry = + sample_execution_payload_v3_for_height(height, Some(&bundle_retry)); + let (proposed_retry, payload_bytes_retry, maybe_sidecars_retry) = + propose_with_optional_blobs( + &mut node_state, + height, + round, + &payload_retry, + Some(&bundle_retry), + ) + .await?; + let sidecars_retry = maybe_sidecars_retry.expect("sidecars expected"); + + let header_retry = ExecutionPayloadHeader::from_payload(&payload_retry, None)?; + let metadata_retry = + ValueMetadata::new(header_retry, bundle_retry.commitments.clone()); + let value_retry = StateValue::new(metadata_retry); + + let package_full = SyncedValuePackage::Full { + value: value_retry, + execution_payload_ssz: payload_bytes_retry.clone(), + blob_sidecars: sidecars_retry, + execution_requests: Vec::new(), + archive_notices: Vec::new(), + }; + + let encoded_full = package_full.encode().map_err(|e| eyre::eyre!(e))?; + let decoded_full = + SyncedValuePackage::decode(&encoded_full).map_err(|e| eyre::eyre!(e))?; + + let result_retry = node_state + .process_synced_package(height, round, validator_address, decoded_full) + .await?; + + // The Full package should succeed + assert!( + result_retry.is_some(), + "Full package should be accepted after failed MetadataOnly" + ); + + // Verify the value was correctly 
stored + let synced_value = result_retry.expect("sync succeeded"); + assert_eq!(synced_value.height, height); + assert_eq!(synced_value.value.id(), proposed_retry.value.id()); + + drop(node_state); + Ok(()) + }) + }) + .await +} + +/// Reject invalid execution requests during sync. +/// +/// Engine API v4 requires execution requests to be strictly ordered by type and +/// to include a non-empty payload. Invalid requests must be rejected. +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[ignore = "requires full-node harness; run via make itest-node"] +#[serial(full_node)] +async fn full_node_sync_rejects_invalid_execution_requests() -> Result<()> { + init_test_logging(); + + FullNodeTestBuilder::new() + .node_count(1) + .start_height(Some(Height::new(0))) + .run(|network| { + Box::pin(async move { + network.stop_node(0).await?; + let validator_address = network.node_address(0)?; + let mut node_state = { + let node = network.node_ref(0)?; + open_state_ready(node).await? + }; + + let height = Height::new(0); + let round = Round::new(0); + let bundle = sample_blob_bundle(1); + let payload = sample_execution_payload_v3_for_height(height, Some(&bundle)); + let payload_bytes = Bytes::from(payload.as_ssz_bytes()); + let invalid_requests = invalid_execution_requests_out_of_order(); + + let requests_hash = + Some(ExecutionPayloadHeader::compute_requests_hash(&invalid_requests)); + let header = ExecutionPayloadHeader::from_payload(&payload, requests_hash)?; + let metadata = ValueMetadata::new(header, bundle.commitments.clone()); + let value = StateValue::new(metadata); + + let package = SyncedValuePackage::MetadataOnly { + value, + archive_notices: vec![], + execution_payload_ssz: Some(payload_bytes), + execution_requests: invalid_requests, + }; + + let encoded = package.encode().map_err(|e| eyre::eyre!(e))?; + let decoded = SyncedValuePackage::decode(&encoded).map_err(|e| eyre::eyre!(e))?; + let result = node_state + .process_synced_package(height, round, 
validator_address, decoded) + .await; + + assert!(result.is_err(), "invalid execution requests should fail validation"); + let err = result.expect_err("invalid execution requests should error"); + assert!( + err.to_string().contains("Invalid execution requests"), + "unexpected error: {err}" + ); + + let decided = node_state.get_decided_value(height).await?; + assert!( + decided.is_none(), + "invalid execution requests should not mark height as decided" + ); + + Ok(()) + }) + }) + .await +} diff --git a/crates/test_support/Cargo.toml b/crates/test_support/Cargo.toml index 72dcc20..868971a 100644 --- a/crates/test_support/Cargo.toml +++ b/crates/test_support/Cargo.toml @@ -2,6 +2,7 @@ name = "ultramarine-test-support" version.workspace = true edition.workspace = true +license.workspace = true publish = false [dependencies] diff --git a/crates/types/Cargo.toml b/crates/types/Cargo.toml index 597c622..da58296 100644 --- a/crates/types/Cargo.toml +++ b/crates/types/Cargo.toml @@ -15,7 +15,6 @@ alloy-eips.workspace = true # Added for EIP-4844 kzg_to_versioned_hash (Phase 1) alloy-rpc-types = { workspace = true } alloy-rpc-types-engine = { workspace = true } alloy-rpc-types-eth = { workspace = true } -alloy-rpc-types-txpool = { workspace = true } ethereum_hashing.workspace = true ethereum_serde_utils = { workspace = true } fixed_bytes.workspace = true diff --git a/crates/types/proto/sync.proto b/crates/types/proto/sync.proto index f22b68a..56a25c3 100644 --- a/crates/types/proto/sync.proto +++ b/crates/types/proto/sync.proto @@ -85,7 +85,13 @@ message FullPackage { } // Metadata-only fallback (for pruned data) +// When blobs have been pruned but execution payload is still available, +// we include the payload so the receiving peer can import the block. +// This follows the Lighthouse pattern: blocks outside the DA window +// can be imported without blob sidecars. 
message MetadataOnlyPackage { Value value = 1; // Just the Value metadata repeated ArchiveNotice archive_notices = 2; // Archive notices with locators for pruned blobs + bytes execution_payload_ssz = 3; // Execution payload (optional, for import without blobs) + repeated bytes execution_requests = 4; // Execution requests (EIP-7685, optional) } diff --git a/crates/types/src/blob_metadata.rs b/crates/types/src/blob_metadata.rs index c378fdd..424b0ac 100644 --- a/crates/types/src/blob_metadata.rs +++ b/crates/types/src/blob_metadata.rs @@ -299,6 +299,17 @@ impl BlobMetadata { &self.blob_keccak_hashes } + /// Update keccak hash at the given index. + /// Used when syncing from pruned peers where we learn the hash from archive notices. + /// Returns true if the hash was updated, false if it was already set to a non-zero value. + pub fn update_keccak_hash(&mut self, index: usize, hash: B256) -> bool { + if index < self.blob_keccak_hashes.len() && self.blob_keccak_hashes[index] == B256::ZERO { + self.blob_keccak_hashes[index] = hash; + return true; + } + false + } + pub fn archival_status(&self, index: usize) -> BlobArchivalStatus { self.archival_records .get(index) diff --git a/crates/types/src/constants.rs b/crates/types/src/constants.rs index a9bb278..25ee985 100644 --- a/crates/types/src/constants.rs +++ b/crates/types/src/constants.rs @@ -2,3 +2,28 @@ /// Default execution gas limit for Load Network blocks. pub const LOAD_EXECUTION_GAS_LIMIT: u64 = 2_000_000_000; + +/// Minimum time between blocks in seconds (slot duration). +/// Protocol rule: validators reject proposals violating this. +pub const LOAD_MIN_BLOCK_TIME_SECS: u64 = 1; + +/// Maximum allowed clock drift in seconds (geth/ETH canonical value). +/// Protocol rule: validators reject proposals with timestamp > now + drift. +pub const LOAD_MAX_FUTURE_DRIFT_SECS: u64 = 15; + +#[cfg(test)] +mod tests { + use super::*; + + /// Documents and verifies protocol constants. 
+ /// These values are critical for network consensus: + /// - LOAD_MIN_BLOCK_TIME_SECS=1: EVM timestamp granularity (1 block/sec max) + /// - LOAD_MAX_FUTURE_DRIFT_SECS=15: geth canonical value for clock tolerance + /// - LOAD_EXECUTION_GAS_LIMIT=2B: high throughput at 1 block/sec + #[test] + fn constants_have_expected_values() { + assert_eq!(LOAD_EXECUTION_GAS_LIMIT, 2_000_000_000); + assert_eq!(LOAD_MIN_BLOCK_TIME_SECS, 1); + assert_eq!(LOAD_MAX_FUTURE_DRIFT_SECS, 15); + } +} diff --git a/crates/types/src/lib.rs b/crates/types/src/lib.rs index 0122d3c..cfc5ff1 100644 --- a/crates/types/src/lib.rs +++ b/crates/types/src/lib.rs @@ -46,13 +46,13 @@ pub mod ethereum_compat; pub mod sync; // Phase 4.1: Three-layer architecture - Layer 1 (Pure BFT Consensus) -// Added as part of blob header persistence redesign (PHASE4_PROGRESS.md) +// Added as part of blob header persistence redesign (FINAL_PLAN.md Phase 4) // Pure consensus-layer metadata using Tendermint/Malachite terminology. // Contains NO Ethereum types for technology neutrality. pub mod consensus_block_metadata; // Phase 4.1: Three-layer architecture - Layer 2 (Ethereum Compatibility) -// Added as part of blob header persistence redesign (PHASE4_PROGRESS.md) +// Added as part of blob header persistence redesign (FINAL_PLAN.md Phase 4) // Ethereum EIP-4844 compatibility bridge that converts to BeaconBlockHeader // only when needed for BlobSidecar construction. pub mod blob_metadata; diff --git a/crates/types/src/sync.rs b/crates/types/src/sync.rs index 41082fb..5c1bc8b 100644 --- a/crates/types/src/sync.rs +++ b/crates/types/src/sync.rs @@ -134,27 +134,44 @@ pub enum SyncedValuePackage { archive_notices: Vec, }, - /// Metadata-only (blobs not available) + /// Metadata-only (blobs pruned but payload available) /// - /// Fallback when: Execution payload or blobs are missing, or blobs have been pruned. + /// Used when: Blob sidecars have been pruned (archived), but the execution payload + /// is still available. 
Following the Lighthouse pattern, blocks outside the Data + /// Availability window can be imported without blob sidecars. /// - /// Used for pruned heights where actual blob data is no longer available locally. - /// The syncing peer will receive the Value metadata and archive notices containing - /// locators for fetching blobs from external archives. + /// This variant enables sync to succeed even when all validators have pruned + /// their blob data. The receiving peer can: + /// 1. Import the execution payload to EL + /// 2. Store the consensus metadata (commitments are in Value) + /// 3. Mark blobs as pruned/archived using archive_notices /// - /// **Size**: ~2KB (Value metadata) + archive notices + /// **Size**: ~execution payload size + metadata + archive notices MetadataOnly { - /// Just the Value (metadata: header + commitments) + /// Value metadata (header + commitments) /// /// Contains `ValueMetadata` which includes: /// - ExecutionPayloadHeader (lightweight, no transactions) /// - KZG commitments (48 bytes each) value: Value, + /// Archive notices with locators for pruned blobs /// /// When blobs have been pruned, these notices provide the storage /// locators where the blobs can be fetched from external archives. archive_notices: Vec, + + /// Raw execution payload bytes (optional) + /// + /// When available, allows the receiving peer to import the block + /// without waiting for blob sidecars. This is the key enabler for + /// sync when blobs are pruned. + execution_payload_ssz: Option, + + /// Execution requests (EIP-7685) required for Prague hashing + /// + /// Stored as opaque byte arrays with the request type prepended. 
+ execution_requests: Vec, }, } @@ -183,23 +200,44 @@ impl SyncedValuePackage { /// /// # Returns /// - /// - `Some(&Bytes)` if this is `Full` variant - /// - `None` if this is `MetadataOnly` variant + /// - `Some(&Bytes)` if this is `Full` variant or `MetadataOnly` with payload + /// - `None` if this is `MetadataOnly` variant without payload pub fn execution_payload(&self) -> Option<&Bytes> { match self { Self::Full { execution_payload_ssz, .. } => Some(execution_payload_ssz), - Self::MetadataOnly { .. } => None, + Self::MetadataOnly { execution_payload_ssz, .. } => execution_payload_ssz.as_ref(), } } /// Get execution requests if available + /// + /// # Returns + /// + /// - `Some(&[AlloyBytes])` if requests are available + /// - `None` only if this is `MetadataOnly` without any requests pub fn execution_requests(&self) -> Option<&[AlloyBytes]> { match self { Self::Full { execution_requests, .. } => Some(execution_requests), - Self::MetadataOnly { .. } => None, + Self::MetadataOnly { execution_requests, .. } => { + if execution_requests.is_empty() { + None + } else { + Some(execution_requests) + } + } } } + /// Check if this package has execution payload (can be imported without blobs) + /// + /// # Returns + /// + /// - `true` if execution payload is available for import + /// - `false` if only metadata is present (cannot import block) + pub fn can_import_without_blobs(&self) -> bool { + self.execution_payload().is_some() + } + /// Get blob sidecars if available /// /// # Returns @@ -267,7 +305,7 @@ impl SyncedValuePackage { /// /// Approximate size in bytes: /// - `Full`: value.size() + execution_payload.len() + (blob_count * 131KB) + overhead - /// - `MetadataOnly`: ~2KB + /// - `MetadataOnly`: ~2KB + optional payload size pub fn estimated_size(&self) -> usize { match self { Self::Full { @@ -283,9 +321,16 @@ impl SyncedValuePackage { blob_sidecars.iter().map(|b| b.size_bytes()).sum::() + 100 // Overhead for enum tag, lengths, etc. 
} - Self::MetadataOnly { value, archive_notices } => { + Self::MetadataOnly { + value, + archive_notices, + execution_payload_ssz, + execution_requests, + } => { value.size_bytes() + archive_notices.len() * 200 + // Approximate size per notice + execution_payload_ssz.as_ref().map(|p| p.len()).unwrap_or(0) + + execution_requests.iter().map(|r| r.len()).sum::() + 50 // Overhead } } @@ -341,7 +386,22 @@ impl Protobuf for SyncedValuePackage { .map(ArchiveNotice::from_proto) .collect::, _>>()?; - Ok(SyncedValuePackage::MetadataOnly { value, archive_notices }) + // execution_payload_ssz is optional - empty bytes means no payload + let execution_payload_ssz = if metadata.execution_payload_ssz.is_empty() { + None + } else { + Some(metadata.execution_payload_ssz) + }; + + let execution_requests = + metadata.execution_requests.into_iter().map(AlloyBytes::from).collect(); + + Ok(SyncedValuePackage::MetadataOnly { + value, + archive_notices, + execution_payload_ssz, + execution_requests, + }) } None => Err(ProtoError::missing_field::("package")), } @@ -378,7 +438,12 @@ impl Protobuf for SyncedValuePackage { archive_notices: proto_archive_notices, }) } - SyncedValuePackage::MetadataOnly { value, archive_notices } => { + SyncedValuePackage::MetadataOnly { + value, + archive_notices, + execution_payload_ssz, + execution_requests, + } => { let proto_archive_notices = archive_notices .iter() .map(|notice| notice.to_proto()) @@ -387,6 +452,12 @@ impl Protobuf for SyncedValuePackage { proto::synced_value_package::Package::MetadataOnly(proto::MetadataOnlyPackage { value: Some(value.to_proto()?), archive_notices: proto_archive_notices, + execution_payload_ssz: execution_payload_ssz.clone().unwrap_or_else(Bytes::new), + execution_requests: execution_requests + .iter() + .cloned() + .map(|req| req.0) + .collect(), }) } }; @@ -421,14 +492,37 @@ mod tests { fn test_synced_value_package_metadata_only_is_not_full() { #[allow(deprecated)] let value = Value::from_bytes(Bytes::from(vec![0u8; 
32])); - let package = - SyncedValuePackage::MetadataOnly { value: value.clone(), archive_notices: vec![] }; + let package = SyncedValuePackage::MetadataOnly { + value: value.clone(), + archive_notices: vec![], + execution_payload_ssz: None, + execution_requests: vec![], + }; assert!(!package.is_full()); assert!(package.execution_payload().is_none()); assert!(package.blob_sidecars().is_none()); } + #[test] + fn test_metadata_only_with_payload() { + #[allow(deprecated)] + let value = Value::from_bytes(Bytes::from(vec![0u8; 32])); + let payload = Bytes::from(vec![1u8; 1024]); + let package = SyncedValuePackage::MetadataOnly { + value: value.clone(), + archive_notices: vec![], + execution_payload_ssz: Some(payload.clone()), + execution_requests: vec![], + }; + + assert!(!package.is_full()); + assert!(package.execution_payload().is_some()); + assert_eq!(package.execution_payload().unwrap(), &payload); + assert!(package.blob_sidecars().is_none()); + assert!(package.can_import_without_blobs()); + } + #[test] fn test_encode_decode_roundtrip_full() { #[allow(deprecated)] @@ -462,8 +556,12 @@ mod tests { fn test_encode_decode_roundtrip_metadata_only() { #[allow(deprecated)] let value = Value::from_bytes(Bytes::from(vec![0u8; 32])); - let package = - SyncedValuePackage::MetadataOnly { value: value.clone(), archive_notices: vec![] }; + let package = SyncedValuePackage::MetadataOnly { + value: value.clone(), + archive_notices: vec![], + execution_payload_ssz: None, + execution_requests: vec![], + }; // Encode let encoded = package.encode().expect("Failed to encode"); @@ -477,6 +575,32 @@ mod tests { assert!(!decoded.is_full()); } + #[test] + fn test_encode_decode_roundtrip_metadata_only_with_payload() { + #[allow(deprecated)] + let value = Value::from_bytes(Bytes::from(vec![0u8; 32])); + let payload = Bytes::from(vec![1u8; 2048]); + let package = SyncedValuePackage::MetadataOnly { + value: value.clone(), + archive_notices: vec![], + execution_payload_ssz: 
Some(payload.clone()), + execution_requests: vec![], + }; + + // Encode + let encoded = package.encode().expect("Failed to encode"); + assert!(!encoded.is_empty()); + + // Decode + let decoded = SyncedValuePackage::decode(&encoded).expect("Failed to decode"); + + // Verify + assert_eq!(package, decoded); + assert!(!decoded.is_full()); + assert!(decoded.can_import_without_blobs()); + assert_eq!(decoded.execution_payload().unwrap(), &payload); + } + #[test] fn test_encode_decode_roundtrip_multiple_blobs() { #[allow(deprecated)] @@ -546,7 +670,12 @@ mod tests { fn test_estimated_size_metadata_only() { #[allow(deprecated)] let value = Value::from_bytes(Bytes::from(vec![0u8; 32])); - let package = SyncedValuePackage::MetadataOnly { value, archive_notices: vec![] }; + let package = SyncedValuePackage::MetadataOnly { + value, + archive_notices: vec![], + execution_payload_ssz: None, + execution_requests: vec![], + }; let size = package.estimated_size(); @@ -578,7 +707,12 @@ mod tests { fn test_protobuf_roundtrip_metadata_only() { #[allow(deprecated)] let value = Value::from_bytes(Bytes::from(vec![0u8; 32])); - let package = SyncedValuePackage::MetadataOnly { value, archive_notices: vec![] }; + let package = SyncedValuePackage::MetadataOnly { + value, + archive_notices: vec![], + execution_payload_ssz: None, + execution_requests: vec![], + }; // Encode using protobuf let encoded = package.encode().expect("Failed to encode"); diff --git a/crates/utils/Cargo.toml b/crates/utils/Cargo.toml index 6280a2f..6b566a3 100644 --- a/crates/utils/Cargo.toml +++ b/crates/utils/Cargo.toml @@ -2,12 +2,14 @@ name = "ultramarine-utils" version.workspace = true edition.workspace = true +license.workspace = true [[bin]] name = "ultramarine-utils" path = "src/main.rs" [dependencies] +ultramarine-genesis = { path = "../genesis" } ultramarine-types = { path = "../types" } # Core dependencies @@ -33,7 +35,6 @@ alloy-rpc-types = { workspace = true } alloy-rpc-types-txpool = { workspace = true } 
alloy-signer = { workspace = true } alloy-signer-local = { workspace = true } -alloy-transport-http = { workspace = true } c-kzg = { workspace = true } k256 = "0.13" diff --git a/crates/utils/src/commands/genesis.rs b/crates/utils/src/commands/genesis.rs index 57c288f..613ca3b 100644 --- a/crates/utils/src/commands/genesis.rs +++ b/crates/utils/src/commands/genesis.rs @@ -1,19 +1,6 @@ -use std::{collections::BTreeMap, str::FromStr}; - -use alloy_genesis::{ChainConfig, Genesis, GenesisAccount}; -use alloy_primitives::{Address, B256, Bytes, U256}; -use alloy_signer_local::{MnemonicBuilder, PrivateKeySigner, coins_bip39::English}; -use chrono::NaiveDate; use clap::Parser; use color_eyre::eyre::Result; -use ultramarine_types::constants::LOAD_EXECUTION_GAS_LIMIT; - -/// Test mnemonics for wallet generation -const TEST_MNEMONICS: [&str; 3] = [ - "test test test test test test test test test test test junk", - "abandon abandon abandon abandon abandon abandon abandon abandon abandon abandon abandon about", - "zero zero zero zero zero zero zero zero zero zero zero zoo", -]; +use ultramarine_genesis::{build_dev_genesis, write_genesis}; #[derive(Parser, Debug, Clone, PartialEq)] pub struct GenesisCmd { @@ -27,94 +14,9 @@ pub struct GenesisCmd { impl GenesisCmd { pub async fn run(&self) -> Result<()> { - generate_genesis(&self.output, self.chain_id) - } -} - -/// Create a signer from a mnemonic. -pub(crate) fn make_signer(mnemonic: &str) -> PrivateKeySigner { - MnemonicBuilder::::default().phrase(mnemonic).build().expect("Failed to create wallet") -} - -pub(crate) fn make_signers() -> Vec { - TEST_MNEMONICS.iter().map(|&mnemonic| make_signer(mnemonic)).collect() -} - -pub(crate) fn generate_genesis(genesis_file: &str, chain_id: u64) -> Result<()> { - // Create signers and get their addresses - let signers = make_signers(); - let signer_addresses: Vec
= signers.iter().map(|signer| signer.address()).collect(); - - println!("Using signer addresses:"); - for (i, addr) in signer_addresses.iter().enumerate() { - println!(" Signer {}: {}", i, addr); + let genesis = build_dev_genesis(self.chain_id)?; + write_genesis(std::path::Path::new(&self.output), &genesis)?; + println!("\n✅ Genesis configuration written to {}", self.output); + Ok(()) } - - // Create genesis configuration with pre-funded accounts - let mut alloc = BTreeMap::new(); - for addr in &signer_addresses { - alloc.insert( - *addr, - GenesisAccount { - balance: U256::from_str("15000000000000000000000").unwrap(), // 15000 ETH - ..Default::default() - }, - ); - } - - // The Ethereum Cancun-Deneb (Dencun) upgrade was activated on the mainnet - // on March 13, 2024, at epoch 269,568. - let date = NaiveDate::from_ymd_opt(2024, 3, 14).unwrap(); - let datetime = date.and_hms_opt(0, 0, 0).unwrap(); - let _valid_cancun_timestamp = datetime.and_utc().timestamp() as u64; - - // Create genesis configuration - let genesis = Genesis { - config: ChainConfig { - chain_id, - homestead_block: Some(0), - eip150_block: Some(0), - eip155_block: Some(0), - eip158_block: Some(0), - byzantium_block: Some(0), - constantinople_block: Some(0), - petersburg_block: Some(0), - istanbul_block: Some(0), - berlin_block: Some(0), - london_block: Some(0), - shanghai_time: Some(0), - cancun_time: Some(0), - prague_time: Some(0), - merge_netsplit_block: Some(0), - terminal_total_difficulty: Some(U256::ZERO), - terminal_total_difficulty_passed: true, - ..Default::default() - }, - alloc, - ..Default::default() - } - .with_gas_limit(LOAD_EXECUTION_GAS_LIMIT) - .with_timestamp(0) - .with_extra_data(Bytes::from_static(b"Load Network Dev")) - .with_difficulty(U256::ZERO) - .with_mix_hash(B256::ZERO) - .with_coinbase(Address::ZERO) - .with_base_fee(Some(7)); - - // Align optional header fields with a clean genesis - let mut genesis = genesis; - genesis.parent_hash = Some(B256::ZERO); - genesis.number 
= Some(0); - - // Create parent directories if they don't exist - if let Some(parent) = std::path::Path::new(genesis_file).parent() { - std::fs::create_dir_all(parent)?; - } - - // Write genesis to file - let genesis_json = serde_json::to_string_pretty(&genesis)?; - std::fs::write(genesis_file, genesis_json)?; - println!("\n✅ Genesis configuration written to {}", genesis_file); - - Ok(()) } diff --git a/crates/utils/src/commands/spam.rs b/crates/utils/src/commands/spam.rs index ec8992d..74ef81e 100644 --- a/crates/utils/src/commands/spam.rs +++ b/crates/utils/src/commands/spam.rs @@ -7,6 +7,7 @@ use alloy_rpc_types_txpool::TxpoolStatus; use alloy_signer_local::PrivateKeySigner; use clap::Parser; use color_eyre::eyre::{self, Result}; +use hex::FromHex; use reqwest::{Client, Url}; use serde::{Deserialize, Serialize, de::DeserializeOwned}; use serde_json::json; @@ -43,6 +44,9 @@ pub struct SpamCmd { /// Index of the signer to use #[clap(long, default_value = "0")] signer_index: usize, + /// Optional 0x-prefixed hex private key to use as the signer (overrides signer_index). 
+ #[clap(long)] + private_key: Option, } impl SpamCmd { @@ -66,6 +70,7 @@ impl SpamCmd { self.blobs, self.blobs_per_tx, self.signer_index, + self.private_key.clone(), )?; spammer.run().await } @@ -103,13 +108,21 @@ impl Spammer { blobs: bool, blobs_per_tx: usize, signer_index: usize, + private_key: Option, ) -> Result { if blobs && !(1..=1024).contains(&blobs_per_tx) { return Err(eyre::eyre!("blobs_per_tx must be between 1 and 1024")); } - let signers = crate::commands::genesis::make_signers(); - let signer = - signers.get(signer_index).ok_or_else(|| eyre::eyre!("Invalid signer index"))?.clone(); + let signer = if let Some(pk) = private_key { + let pk = pk.strip_prefix("0x").unwrap_or(&pk); + let bytes = <[u8; 32]>::from_hex(pk) + .map_err(|e| eyre::eyre!("invalid --private-key (expected 32-byte hex): {e}"))?; + PrivateKeySigner::from_slice(&bytes) + .map_err(|e| eyre::eyre!("failed to build signer from --private-key: {e}"))? + } else { + let signers = ultramarine_genesis::make_signers(); + signers.get(signer_index).ok_or_else(|| eyre::eyre!("Invalid signer index"))?.clone() + }; Ok(Self { client: RpcClient::new(url), chain_id, @@ -142,7 +155,9 @@ impl Spammer { async move { self_arc.tracker(result_receiver, report_receiver, finish_receiver).await } }); - let _ = tokio::join!(spammer_handle, tracker_handle); + let (spammer_res, tracker_res) = tokio::join!(spammer_handle, tracker_handle); + spammer_res.map_err(|e| eyre::eyre!("spammer task failed: {e}"))??; + tracker_res.map_err(|e| eyre::eyre!("tracker task failed: {e}"))??; Ok(()) } diff --git a/deny.toml b/deny.toml index 347c468..b7312bf 100644 --- a/deny.toml +++ b/deny.toml @@ -6,11 +6,20 @@ ignore = [ "RUSTSEC-2024-0437", # https://rustsec.org/advisories/RUSTSEC-2024-0436 "RUSTSEC-2024-0436", + # https://rustsec.org/advisories/RUSTSEC-2025-0137 + # ruint unsoundness via upstream alloy/reth transitives; awaiting upstream bump + "RUSTSEC-2025-0137", + # https://rustsec.org/advisories/RUSTSEC-2026-0007 + # bytes 
overflow via upstream network stack transitives; awaiting upstream bump + "RUSTSEC-2026-0007", + # https://rustsec.org/advisories/RUSTSEC-2026-0009 + # time RFC2822 parser DoS via transitive deps; awaiting upstream bump + "RUSTSEC-2026-0009", ] [bans] multiple-versions = "warn" -wildcards = "deny" +wildcards = "warn" highlight = "all" [licenses] @@ -59,3 +68,7 @@ license-files = [{ path = "LICENSE", hash = 0x001c7e6c }] [sources] unknown-registry = "deny" unknown-git = "deny" +allow-git = [ + "https://github.com/circlefin/malachite.git", + "https://github.com/sigp/lighthouse.git", +] diff --git a/docs/ARCHIVER_OPS.md b/docs/ARCHIVER_OPS.md deleted file mode 100644 index 25da971..0000000 --- a/docs/ARCHIVER_OPS.md +++ /dev/null @@ -1,270 +0,0 @@ -# Archiver Operations Guide - -This document covers the operational aspects of Ultramarine's blob archiver, which uploads decided blob sidecars to external storage providers (e.g., Load S3 Agent) and enables local pruning after archival. - -For the full Phase 6 design and its code-audited status, see `docs/PHASE6_ARCHIVE_PRUNE_FINAL.md`. - -## Overview - -The archiver is part of Phase 6 (Archive/Prune) and provides: - -- **Blob archival**: Upload decided blobs to external storage providers -- **Archive notices**: Cryptographically signed receipts that prove archival -- **Finality-gated pruning**: Remove local blobs only after archival + finality -- **Serving contract**: Return archive locators when blobs are pruned - -## Configuration - -### Config File (`config.toml`) - -```toml -[archiver] -# Enable/disable the archiver worker (uploads + notice emission on proposer duty) -enabled = true - -# Storage provider URL -# Production: "https://load-s3-agent.load.network" -provider_url = "https://load-s3-agent.load.network" - -# Upload path appended to provider_url (default: /upload for Load S3 Agent cloud) -# Usually leave unset. Only set when your deployment mounts the route under a prefix. 
-# upload_path = "/upload" - -# Provider identifier used in archive notices -provider_id = "load-s3-agent" - -# Bearer token for authenticated uploads (obtain from Load S3 Agent) -# See: https://docs.load.network/load-cloud-platform-lcp/ls3-with-load_acc -# REQUIRED when `enabled = true`. Startup fails fast if missing. -bearer_token = "your-load-acc-api-key" - -# Number of retry attempts for failed uploads (default: 3) -retry_attempts = 3 - -# Base backoff duration in milliseconds (default: 1000) -# Uses exponential backoff: 1s, 2s, 4s, ... -retry_backoff_ms = 1000 - -# Maximum jobs in queue before dropping oldest (default: 1000) -max_queue_size = 1000 -``` - -### Environment Variables - -The archiver config can also be set via environment variables: - -- `ULTRAMARINE_ARCHIVER_ENABLED` -- `ULTRAMARINE_ARCHIVER_PROVIDER_URL` -- `ULTRAMARINE_ARCHIVER_UPLOAD_PATH` -- `ULTRAMARINE_ARCHIVER_PROVIDER_ID` -- `ULTRAMARINE_ARCHIVER_BEARER_TOKEN` -- `ULTRAMARINE_ARCHIVER_RETRY_ATTEMPTS` -- `ULTRAMARINE_ARCHIVER_RETRY_BACKOFF_MS` -- `ULTRAMARINE_ARCHIVER_MAX_QUEUE_SIZE` - -### Environment Setup - -1. Copy `.env.example` to `.env` inside the `ultramarine/` directory. -2. Set `ULTRAMARINE_ARCHIVER_BEARER_TOKEN` to your Load Cloud Platform (`load_acc`) API key. See [LS3 with load_acc](https://docs.load.network/load-cloud-platform-lcp/ls3-with-load_acc) for token issuance. -3. Run `make all` / `make all-ipc` from the `ultramarine/` directory so Docker Compose automatically loads `.env`.\ - If you run `docker compose` from the repo root, prefix the command with `env $(cat ultramarine/.env | xargs)` (or export the variables manually) so the containers inherit the credentials. - -## Behavior - -### When Archiver is Enabled - -1. **Proposer duty**: After committing a block with blobs, the proposer enqueues an archive job -2. **Upload**: The archiver worker uploads each blob to the provider with metadata headers -3. 
**Notice generation**: On success, an `ArchiveNotice` is signed and broadcast to peers -4. **Verification**: Followers verify the notice (signature, commitment, blob_keccak) -5. **Pruning**: Once all blobs at a height are archived AND the height is finalized, local blobs are pruned - -### When Archiver is Disabled - -- **No uploads by this node**: When `archiver.enabled=false`, this node will not upload blobs when it is proposer, so it will also not emit archive notices for its own proposed heights. -- **Pruning still happens when archived**: If the node receives a complete, proposer-signed set of archive notices from the network for some height, it will prune local blob bytes for that height once finalized. -- **Validators fail fast (production)**: if this node is in the validator set and `archiver.enabled=false`, Ultramarine refuses to start (so proposers cannot silently skip upload duty). The full-node integration harness builds `ultramarine-node` with `feature="test-harness"` and may disable archiver by default for non-archiver tests. - -This is useful for: - -- Testing/development environments -- Nodes without external storage access (note: proposer duty will not be fulfilled if `enabled=false`) - -## Security / Verification Model (V0) - -### What an ArchiveNotice proves (today) - -An `ArchiveNotice` is an Ed25519-signed statement that includes: - -- `height, round, blob_index` -- `kzg_commitment` -- `blob_keccak` (keccak256 of locally stored bytes) -- `provider_id, locator` -- `archived_by, archived_at` - -Signing preimage is domain-tagged sha256 over protobuf: -`sha256("ArchiveNoticeV0" || protobuf(ArchiveNoticeBody))`. 
- -On receipt, validators verify: - -- the signature against the validator set (using `archived_by`) -- the `kzg_commitment` and `blob_keccak` match the decided `BlobMetadata` for that height/index -- conflicting notices are rejected (same `(height, index)` but different locator/provider/hash) - -### Proposer-only acceptance (enforced) - -Phase 6 duty is proposer-only uploads, and the verifier now enforces the same rule when processing notices. `State::handle_archive_notice` resolves the expected proposer from `BlobMetadata.proposer_index_hint` (falling back to consensus metadata) and rejects notices whose `archived_by` differs, so only the block proposer’s signed locator is accepted. - -## Operator Checklist - -- If `archiver.enabled = true`, ensure `archiver.bearer_token` is set; the node fails fast on startup when missing. -- Watch `archiver_queue_len` and `archiver_backlog_height`; sustained growth means provider errors or networking issues. -- Expect `archiver_jobs_success_total` to increase on blobbed heights; `archiver_upload_failures_total` should be near 0. -- Expect `archiver_pruned_total` to increase once notices are verified and the height is finalized. - -### Serving Contract - -When blobs are requested (via restream or value-sync): - -| Blob Status | Response | -| ----------------- | --------------------------------------------------------------- | -| Available locally | Full blob data | -| Pruned (archived) | `MetadataOnly` package with archive notices containing locators | -| Not found | Error | - -Peers receiving `MetadataOnly` should treat blob bytes as unavailable over p2p (they were pruned on the sender). -Ultramarine does not automatically re-download pruned blobs; it only propagates and persists archive notices/locators. -External consumers (indexers, rollups, provers, explorers) can fetch blob bytes from the archive provider using the locator. - -## Storage Provider Contract - -Ultramarine archives blobs via `POST /upload` (multipart). 
- -- Endpoint: `POST /upload` (multipart form) -- Fields: - - `file` (raw blob bytes; 131072 bytes for EIP-4844) - - `content_type=application/octet-stream` - - `tags` (JSON array of `{key,value}`), including: - - `load=true` - - `load.network=fibernet` - - `load.height`, `load.round`, `load.blob_index` - - `load.kzg_commitment`, `load.versioned_hash`, `load.blob_keccak` - - `load.proposer`, `load.provider` -- Header: `Authorization: Bearer ` - -Response includes `dataitem_id` (and sometimes `locator`); Ultramarine stores a locator as `load-s3://` if one isn’t provided. - -### Retrieving Archived Blobs - -Load S3 Agent exposes an HTTPS gateway that serves archived blobs by `dataitem_id`.\ -Use the locator returned in the archive notice (e.g. `load-s3://abc123…`), strip the prefix, and fetch via: - -``` -https://gateway.s3-node-1.load.network/resolve/ -``` - -Operators should rely on this gateway (or their own mirrored buckets) when a blob is marked as pruned/unavailable in Ultramarine logs or metrics. - -## Metrics - -All metrics are registered under the `archiver_` prefix. 
- -### Counters - -| Metric | Description | -| --------------------------------- | --------------------------------------------- | -| `archiver_jobs_success_total` | Total successful archive jobs | -| `archiver_jobs_failure_total` | Total failed archive jobs | -| `archiver_upload_failures_total` | Total upload failures to provider | -| `archiver_receipt_mismatch_total` | Archive notices with commitment/hash mismatch | -| `archiver_pruned_total` | Total blobs pruned after archival | -| `archiver_served_total` | Total blobs served from local storage | -| `archiver_served_archived_total` | Total requests hitting pruned/archived status | - -### Gauges - -| Metric | Description | -| ------------------------- | ------------------------------------------- | -| `archiver_queue_len` | Current job queue length | -| `archiver_backlog_height` | Oldest height with pending jobs (0 if none) | -| `archiver_archived_bytes` | Cumulative bytes archived | -| `archiver_pruned_bytes` | Cumulative bytes pruned | - -### Histograms - -| Metric | Description | -| ------------------------------------- | ------------------------------------- | -| `archiver_upload_duration_seconds` | Upload latency to provider | -| `archiver_notice_propagation_seconds` | Time from notice emission to peer ack | - -## Alerting Recommendations - -### Critical Alerts - -1. **Archiver queue backlog** - ``` - archiver_queue_len > 100 for 5m - ``` - Indicates uploads are failing or provider is slow. - -2. **Upload failure rate** - ``` - rate(archiver_upload_failures_total[5m]) > 0.1 - ``` - Provider may be down or credentials invalid. - -3. **Backlog height stale** - ``` - archiver_backlog_height > 0 AND - archiver_backlog_height unchanged for 10m - ``` - Archive jobs are stuck. - -### Warning Alerts - -1. **High upload latency** - ``` - histogram_quantile(0.95, archiver_upload_duration_seconds) > 5 - ``` - Provider or network issues. - -2. 
**Receipt mismatches** - ``` - rate(archiver_receipt_mismatch_total[5m]) > 0 - ``` - Possible data corruption or malicious notices. - -## Troubleshooting - -### Blobs Not Being Pruned - -1. Check archiver is enabled: `archiver.enabled = true` -2. Check upload success: `archiver_jobs_success_total` should increase -3. Check finality: Blobs only prune after height is finalized -4. Check notice propagation: All blobs at a height must have valid notices - -### Upload Failures - -1. Verify `provider_url` is correct -2. Verify `bearer_token` is valid (check Load S3 Agent docs) -3. Check network connectivity to provider -4. Review logs for specific error messages - -### High Queue Length - -1. Provider may be slow or rate-limiting -2. Increase `retry_backoff_ms` to reduce provider load -3. Consider `max_queue_size` if queue is dropping jobs - -### Recovery After Restart - -On startup, `recover_pending_archive_jobs()` scans decided heights and re-enqueues any blobs that: - -- Were decided by this node (proposer) -- Have not yet received valid archive notices - -This ensures no blobs are lost if the node crashes mid-archival. - -## Testing - -For local testing, point `provider_url` at a staging provider endpoint and run the standard `make all` / `make spam-blobs` flow. 
diff --git a/docs/DEV_WORKFLOW.md b/docs/DEV_WORKFLOW.md index dd1ea66..ae1a304 100644 --- a/docs/DEV_WORKFLOW.md +++ b/docs/DEV_WORKFLOW.md @@ -19,7 +19,7 @@ This guide shows how to develop, run, test, and observe an Ultramarine local net - `crates/consensus`: state + store for proposals, blocks, certificates - `crates/types`: consensus types, Context, and Engine API JSON models - `crates/utils`: CLI utilities (`genesis`, `spam`) -- `compose.yaml`: Reth x3 + Prometheus + Grafana stack +- `compose.yaml`: Reth x3 + Prometheus + Grafana stack (local/dev only; fibernet uses systemd + bundle env) - `scripts/`: helper scripts (`add_peers.sh`, `spawn.bash`, tmux spawner) - `docs/`: design docs and this workflow guide @@ -115,6 +115,13 @@ Notes: - `load_reth_blob_cache_items`, `load_reth_blob_cache_bytes`: blob cache occupancy - Throughput and error state panels provide quick health checks. +- Multi-host (infra) monitoring: + - Grafana/Prometheus bind to localhost on the remote host by default. + - Use an SSH tunnel: + - `ssh -L 3000:127.0.0.1:3000 ubuntu@` + - `ssh -L 9090:127.0.0.1:9090 ubuntu@` + - See `ultramarine/infra/README.md` for operator commands and overrides. + - Logs: - Nodes are started via tmux. Inspect sessions: - `tmux ls` → list sessions @@ -236,7 +243,7 @@ Located in `crates/consensus/tests/`, these tests provide rapid feedback during **Command:** `make itest` or individual test via `cargo test -p ultramarine-consensus --test blob_roundtrip -- --nocapture` **Location:** `crates/consensus/tests/` (module-level integration tests) -**Tier 1: Full Integration Tests (14 scenarios)** +**Tier 1: Full Integration Tests (17 scenarios)** Located in `crates/test/tests/full_node.rs`, these tests provide comprehensive end-to-end coverage with real networking, WAL, and multi-node consensus. Each test boots real Ultramarine nodes (Malachite channel actors, WAL, libp2p) and drives blobbed proposals end-to-end using the Engine API stub. 
@@ -258,7 +265,7 @@ Located in `crates/test/tests/full_node.rs`, these tests provide comprehensive e - **Run Tier 1 (full integration)** ```bash - make itest-node # 14 tests, ~5-7 minutes total + make itest-node # 17 tests, ~5-7 minutes total ``` - **Run entire suite** @@ -284,7 +291,7 @@ The suite currently contains: - `full_node_restream_multi_validator` – end-to-end restream between two real validators to ensure sidecar transmission, metrics, and commit bookkeeping match the state-level harness - `full_node_value_sync_inclusion_proof_failure` – corrupt a blob inclusion proof inside a ValueSync package and verify the full-node state rejects it, records the sync failure, and leaves no blobs behind - `full_node_blob_blobless_sequence_behaves` – commit a blobbed → blobless → blobbed sequence in the real state and assert metrics/blobs match expectations -- `full_node_store_pruning_retains_recent_heights` – override the decided-history retention window (`Store::prune()`), commit eight blobbed heights, and ensure decided values are pruned while blob bytes remain (no blob-byte retention window) +- `full_node_store_pruning_preserves_decided_history` – override the retention window (`Store::prune()`), commit eight blobbed heights, and ensure decided values + block data are retained while blob bytes remain (archive notices drive pruning) - `full_node_sync_package_roundtrip` – ingest a synthetic `SyncedValuePackage::Full` and confirm the node promotes blobs/metadata immediately before commit - `full_node_value_sync_proof_failure` – tamper with blob proofs (not commitments/inclusion proofs) to cover the remaining sync failure path @@ -420,7 +427,7 @@ Recommended cadence: - Expected on Cancun without sidecars. Either avoid blobs or proceed to V4 sidecar integration (see blueprint doc). - High TPS measurements flatten: - - Increase EL txpool/gas settings in `compose.yaml`. 
+ - Increase EL txpool/gas settings in `compose.yaml` for local dev, or in `ultramarine/infra/templates/systemd/load-reth@.service.j2` + bundle env for fibernet. - Tune spammer rate and RPC timeout (`reqwest` has 1s timeout by default in utils). ### Round‑0 Timeouts (Startup Race) diff --git a/docs/FINAL_PLAN.md b/docs/FINAL_PLAN.md index ad4125f..557c82e 100644 --- a/docs/FINAL_PLAN.md +++ b/docs/FINAL_PLAN.md @@ -3,12 +3,12 @@ **Project**: Integrate blob sidecars into Ultramarine consensus client **Timeline**: 10-15 days (2-3 weeks with focused effort) **Architecture**: Channel-based approach using existing Malachite patterns -**Status**: 🟡 **Phase 6 Implemented (V0) – hardening + coverage** -**Progress**: **Phases 1-6 delivered (V0)** (blob integration, metrics, harness, testnet validation, Engine API v4 migration, archive→prune pipeline + operator strictness) +**Status**: 🟢 **Phase 6 Complete + Sync Layer Security Review** +**Progress**: **Phases 1-6 delivered (V0)** + **Sync Layer Security Review (7 fixes)** covering blob integration, metrics, harness, testnet validation, Engine API v4, archive→prune pipeline, and sync validation hardening **Implementation**: Live consensus + state sync fully operational with blob transfer, metadata-only storage, production metrics, validated Prague execution-requests, and verified cleanup symmetry -**Current Focus**: Phase 6 hardening (ops UX + multi-node coverage + negative-paths) (see [PHASE5_PROGRESS.md](../PHASE5_PROGRESS.md) for the full log) -**Last Updated**: 2025-12-17 -**Review Status**: ✅ Comprehensive code review completed, all critical bugs resolved +**Current Focus**: Phase 7+ optional features; one test `#[ignore]` pending refactoring (see [Sync Layer Security Review](#sync-layer-security-review--complete-2026-01-16)) +**Last Updated**: 2026-01-16 +**Review Status**: ✅ Comprehensive code review completed; sync layer security review with 7 fixes applied (see [Sync Layer Security 
Review](#sync-layer-security-review--complete-2026-01-16)) **Malachite Version**: b205f4252f3064d9a74716056f63834ff33f2de9 (upgraded ✅) --- @@ -30,14 +30,14 @@ This plan integrates EIP-4844 blob sidecars into Ultramarine while maintaining c - **Phase 1 – Execution Bridge** _(2025-10-27)_: Engine API v3 wired end-to-end with blob bundle retrieval (upgraded to v4 in Dec 2025); execution payload header extraction added to `ValueMetadata`. - **Phase 2 – Three-Layer Metadata Architecture** _(2025-10-27)_: Introduced `ConsensusBlockMetadata`, `BlobMetadata`, and blob-engine persistence (Layer 1/2/3) with redb tables and protobuf encoding. - **Phase 3 – Proposal Streaming** _(2025-10-28)_: Extended `ProposalPart` with `BlobSidecar`, enabled streaming of blobs via `/proposal_parts` with commitment/proof payloads. -- **Phase 4 – Cleanup & Storage Finalisation** _(2025-10-28)_: Removed legacy header tables/APIs, made `BlobMetadata` the single source of truth, updated parent-root validation and restream rebuilds. See [PHASE4_PROGRESS.md](../PHASE4_PROGRESS.md#2025-10-28-tuesday--phase-4--cleanup-complete) for full log. -- **Phase 5 (5.1/5.2) – Live Consensus + Sync Fixes** _(2025-10-23)_: Delivered proposer blob storage, restream sealing, synced-value protobuf path corrections (tracked separately in `STATUS_UPDATE_2025-10-21.md`). +- **Phase 4 – Cleanup & Storage Finalisation** _(2025-10-28)_: Removed legacy header tables/APIs, made `BlobMetadata` the single source of truth, updated parent-root validation and restream rebuilds. +- **Phase 5 (5.1/5.2) – Live Consensus + Sync Fixes** _(2025-10-23)_: Delivered proposer blob storage, restream sealing, and synced-value protobuf path corrections. - **Phase 5A – Metrics Instrumentation** _(2025-11-04)_: Added 12 blob metrics, wired through node startup/state helpers, updated dashboards and docs. 
-- **Phase 5B – Integration Harness** _(2025-11-08)_: Fourteen deterministic full-node scenarios pass via `make itest-node`, covering proposer/follower commit, sync ingestion, restart hydration, decided-history pruning (`Store::prune()`), and execution-layer rejection (see [Testing Strategy](./PHASE5_TESTNET.md#testing-strategy) for the full matrix). +- **Phase 5B – Integration Harness** _(2025-11-08)_: Seventeen deterministic full-node scenarios pass via `make itest-node`, covering proposer/follower commit, sync ingestion, restart hydration, decided-history pruning (`Store::prune()`), execution-layer rejection, and invalid/missing EL payload data (see [itest-node-harness.md](./knowledge_base/itest-node-harness.md) for harness details). - **Phase 6 – Archiving & Pruning (V0)** _(2025-12-17)_: Archive notices + proposer-only verification + archive→finality→prune flow implemented. Tier‑1 harness defaults `archiver.enabled=false` (to avoid accidental pruning), archiver/prune scenarios opt in with `FullNodeTestBuilder::with_archiver(...)` / `with_mock_archiver()`, and harness hardening (panic-safe teardown, port allocation retries, read-only store opens) reduces CI flakiness. - **Phase 5C – Testnet Validation** _(2025-11-07)_: Docker harness exercised 1,158 real blobs; metrics and dashboards verified against live runs. - **Prague / Engine API v4 upgrade** _(2025-12-05)_: Consensus/execution bridge now uses `forkchoiceUpdated/getPayload/newPayload V4`, deterministic execution-request helpers are shared across harnesses, and the load-reth genesis enforces Prague at timestamp 0 so EL and CL compute the same `requests_hash`. -- **Quality Gates**: 23/23 consensus unit tests plus 14/14 Tier‑1 full-node scenarios (`make itest-node`); `make pr` (fmt + `cargo clippy --workspace --all-targets -- -D warnings`) runs clean. 
+- **Quality Gates**: 23/23 consensus unit tests plus 17/17 Tier‑1 full-node scenarios (`make itest-node`); `make pr` (fmt + `cargo clippy --workspace --all-targets -- -D warnings`) runs clean. **Up Next** 🟡: @@ -115,7 +115,7 @@ Proposer Validators | Field | Value | Rationale | | -------------------------- | ------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------- | -| `timestamp` | `latest_block.timestamp + 1` | Monotonically increasing block time | +| `timestamp` | `> parent`, `>= parent + 1s`, `<= now + 15s` | Protocol-enforced: validator-side checks (parent hash must match latest) | | **`prev_randao`** | **Constant `0x01`** | **Arbitrum pattern** - explicit signal that block-based randomness is unavailable. Forces dApps to use proper VRF oracles. | | `suggested_fee_recipient` | Placeholder `0x2a...2a` | TODO: Make validator-configurable | | `withdrawals` | Empty array `[]` | No withdrawals on Load Network | @@ -137,7 +137,13 @@ Proposer Validators - **Normalization**: `ultramarine/crates/execution/src/eth_rpc/alloy_impl.rs:95` - RPC client returns constant - **Testing**: `ultramarine/crates/test/tests/full_node/node_harness.rs:1803` - harness verifies payloads, signatures bind execution requests, and V4 round-trips run via ignored tests -**See also**: [load-reth-design.md](../../../../load-el-design/load-reth-design.md#2-functional-requirements) for EL perspective. +**See also**: [load-reth README](../../load-reth/README.md) for EL perspective. + +### TPS Considerations + +Block frequency is limited to 1 block/sec due to EVM timestamp granularity. +To maintain throughput, gas limit must compensate (current: 2B gas/block). +TPS = (gas_limit × blocks/sec) / gas_per_tx = (2B × 1) / gas_per_tx --- @@ -171,7 +177,7 @@ Extend execution client interface to support Engine API v3 with blob bundle retr 3. 
Blob engine (Layer 3) – prunable RocksDB storage keyed by height/round. - Added redb tables: `consensus_block_metadata`, `blob_metadata_undecided`, `blob_metadata_decided`, `blob_metadata_meta` (O(1) latest pointer). - Implemented protobuf serialization for both metadata layers; consensus now serialises via `ProtobufCodec`. -- Unit tests cover metadata creation, protobuf round-trips, validator-set hashing (see [PHASE4_PROGRESS.md](../PHASE4_PROGRESS.md#phase-1--core-storage)). +- Unit tests cover metadata creation, protobuf round-trips, and validator-set hashing. ## Phase 3: Proposal Streaming ✅ COMPLETED (2025-10-28) @@ -205,7 +211,7 @@ Extend execution client interface to support Engine API v3 with blob bundle retr - Strengthened round hygiene (`commit_cleans_failed_round_blob_metadata`) and metadata fallback logic (`load_blob_metadata_for_round_falls_back_to_decided`). - Added restream reconstruction test (`rebuild_blob_sidecars_for_restream_reconstructs_headers`); unit-suite now 23/23 passing. - `cargo fmt --all`, `cargo clippy -p ultramarine-consensus`, and `cargo test -p ultramarine-consensus --lib` run clean for the consensus crate. -- See [PHASE4_PROGRESS.md](../PHASE4_PROGRESS.md#2025-10-28-tuesday--phase-4--cleanup-complete) for detailed timeline and code references. +- Detailed timeline and code references are captured in this plan's Phase 4 section. ## Phase 5: Block Import / EL Interaction (Days 8-9) ✅ COMPLETED @@ -237,9 +243,9 @@ Modify `Decided` handler to validate blob availability before block import. 
**See**: -- `docs/PHASE_5_COMPLETION.md` - Live consensus completion details -- `docs/LIGHTHOUSE_PARITY_COMPLETE.md` - Versioned hash verification -- `docs/BLOB_SYNC_GAP_ANALYSIS.md` - **Critical sync gaps and fixes** +- `docs/knowledge_base/itest-node-harness.md` - integration harness scenarios and coverage +- `docs/knowledge_base/cl-el-head-gating.md` - FCU gating and EL readiness invariants +- `docs/knowledge_base/el-persistence.md` - persistence-threshold rationale for restart safety ### Files to Modify @@ -257,7 +263,7 @@ Modify `Decided` handler to validate blob availability before block import. - `crates/execution/src/notifier.rs` & `crates/execution/src/client.rs` – Execution notifier trait and adapter - `crates/blob_engine/src/engine.rs` – promotion + pruning used during commit - `crates/blob_engine/src/store/rocksdb.rs` – undecided/decided storage semantics -- Tests/validation: Tier‑0 `make itest` + Tier‑1 `make itest-node` (14 scenarios), targeted negative coverage `cargo test -p ultramarine-test blob_decided_el_rejection` +- Tests/validation: Tier‑0 `make itest` + Tier‑1 `make itest-node` (17 scenarios), targeted negative coverage `cargo test -p ultramarine-test blob_decided_el_rejection` ### Phase 5.1: State Sync Implementation ✅ COMPLETED (2025-10-23) @@ -788,8 +794,9 @@ Validate blob sidecar integration end-to-end with comprehensive metrics, integra ### See Also -- [PHASE5_TESTNET.md](./PHASE5_TESTNET.md) - Complete implementation plan with daily progress log -- [PHASE4_PROGRESS.md](./PHASE4_PROGRESS.md) - Phase 1-4 completion log +- [DEV_WORKFLOW.md](./DEV_WORKFLOW.md) - End-to-end workflow and integration runbook +- [knowledge_base/itest-node-harness.md](./knowledge_base/itest-node-harness.md) - Integration harness scenarios and usage +- [journal/PERF-SUMMARY-fibernet-throughput-journey.md](./journal/PERF-SUMMARY-fibernet-throughput-journey.md) - Consolidated throughput timeline and baseline - Makefile:427-550 - Testnet automation targets - 
`docs/DEV_WORKFLOW.md` – integration harness command reference (`make itest`, `make itest-node`) @@ -821,6 +828,60 @@ Validate blob sidecar integration end-to-end with comprehensive metrics, integra --- +### Sync Layer Security Review ✅ COMPLETE (2026-01-16) + +**Status**: All critical bugs fixed, one test marked `#[ignore]` pending refactoring + +**Context**: Comprehensive review of `process_synced_package` and related sync/cleanup paths in `crates/consensus/src/state.rs` identified several security and reliability issues. + +#### Fixes Implemented + +| Fix ID | Issue | Location | Resolution | +| ------- | -------------------------------------------------- | -------------------- | --------------------------------------------------------------------------------------------------------- | +| FIX-001 | Missing cleanup after parent_root mismatch | `state.rs:1727-1750` | Added `drop_round` + `delete_blob_metadata_undecided` before rejection | +| FIX-002 | `debug_assert` not enforced in release | `state.rs:1662-1668` | Replaced with hard check that returns `Err(...)` on parent_root mismatch | +| FIX-003 | Duplicate blob indices not detected | `state.rs:1587-1600` | Added `HashSet` check before processing sidecars | +| FIX-004 | Silent cleanup failures | `state.rs:1604-1605` | Added logging + `record_cleanup_failure()` metric call | +| FIX-005 | Missing metric on commitment count mismatch | `state.rs:1581-1584` | Added `record_sync_package_rejected()` call | +| FIX-006 | Misleading test name | `state/tests/mod.rs` | Renamed `test_concurrent_sync_from_multiple_peers` → `test_sequential_multi_height_sync_chain_continuity` | +| FIX-007 | `orphaned_blobs_dropped` counting rounds not blobs | `state.rs:2711-2764` | Now fetches blob count from metadata before deletion | + +#### Metrics Added + +- `sync_packages_rejected_total` – counts packages rejected due to validation failures +- `cleanup_failures_total` – counts cleanup operations that failed (storage leak indicator) +- 
`orphaned_blobs_dropped` – now correctly counts individual blobs, not rounds + +#### Tests Updated + +- `test_reorg_drops_orphaned_blobs` – added metric assertions for `orphaned_blobs_dropped` +- `test_sync_rejects_partial_sidecars` – fixed setup to create valid sidecars then truncate; added metric assertions +- `test_sync_rejects_duplicate_indices` – added metric assertions for `sync_packages_rejected` + +#### Outstanding Issue: `test_sequential_multi_height_sync_chain_continuity` + +**Status**: Marked `#[ignore]` pending refactoring + +**Problem**: The test attempts to validate multi-height sync chain continuity, but `process_synced_package` returns `Ok(None)` (rejects package) due to validation flow issues. + +**Root Cause Investigation**: + +- Test was renamed from `test_concurrent_sync_from_multiple_peers` (misleading name) +- `process_synced_package` validation rejects packages that don't match EL verification +- Likely related to: commitment matching, parent root resolution, or EL verification mocking + +**Required Fix**: Refactor test to properly mock the execution layer or use a different approach for testing multi-height sync scenarios. + +**Impact**: Core sync functionality is working (other sync tests pass). This test covers an edge case around chain continuity across multiple heights. 
+ +**Files Modified**: + +- `crates/consensus/src/state.rs` (FIX-001 through FIX-005, FIX-007) +- `crates/consensus/src/state/tests/mod.rs` (test fixes, FIX-006) +- `crates/blob_engine/src/metrics.rs` (metric additions) + +--- + ### Phase 7: Archive Integration (Optional) ⏳ PENDING **Status**: Not started (optional feature) diff --git a/docs/METRICS_PROGRESS.md b/docs/METRICS_PROGRESS.md deleted file mode 100644 index c3b806d..0000000 --- a/docs/METRICS_PROGRESS.md +++ /dev/null @@ -1,1222 +0,0 @@ -# Blob Engine Metrics - Implementation Progress - -**Date**: 2025-11-04 -**Status**: ✅ COMPLETE - Validated on Testnet -**Phase**: Phase 5A-C - Metrics Instrumentation & Validation -**Tracking**: Implementation and testnet validation of blob observability metrics - -**📋 OFFICIAL IMPLEMENTATION PLAN** - Use this document as the single source of truth - ---- - -## Table of Contents - -1. [Metric Specifications](#metric-specifications) -2. [Implementation Pattern](#implementation-pattern) -3. [Code Structure](#code-structure) -4. [Instrumentation Points](#instrumentation-points) -5. [Registration in Node](#registration-in-node) -6. [Testing Metrics](#testing-metrics) -7. [Dashboard Queries](#dashboard-queries) -8. [Future Work - Node-Level Metrics](#future-work-node-level-metrics) -9. 
[Progress Tracking](#progress-tracking) - ---- - -## Metric Specifications - -### Overview - -**Total Metrics**: 12 (8 counters, 3 gauges, 1 histogram) -**Prefix**: `blob_engine_*` -**API**: `malachitebft-metrics` (SharedRegistry) -**Pattern**: Follow `crates/consensus/src/metrics.rs` (DbMetrics) - -### Metric Table - -| Name | Type | Help Text | Units | Instrumentation Point | -| ----------------------------------------- | --------- | --------------------------------------- | ------- | --------------------------------------------------------------------- | -| `blob_engine_verifications_success_total` | Counter | Successful blob KZG proof verifications | count | `verify_and_store` (success path) | -| `blob_engine_verifications_failure_total` | Counter | Failed blob KZG proof verifications | count | `verify_and_store` (error path) | -| `blob_engine_verification_time` | Histogram | Time taken to verify blob KZG proofs | seconds | `verify_and_store` (timed) | -| `blob_engine_storage_bytes_undecided` | Gauge | Storage size of undecided blobs | bytes | `BlobStore::put_undecided_blobs` (+), `mark_decided`/`drop_round` (-) | -| `blob_engine_storage_bytes_decided` | Gauge | Storage size of decided blobs | bytes | `BlobStore::mark_decided` (+), `prune_archived_before` (-) | -| `blob_engine_undecided_blob_count` | Gauge | Current number of undecided blobs | count | `BlobStore::put_undecided_blobs` (+), `mark_decided`/`drop_round` (-) | -| `blob_engine_blobs_per_block` | Gauge | Number of blobs in last finalized block | count | `BlobEngineImpl::mark_decided` | -| `blob_engine_lifecycle_promoted_total` | Counter | Blobs promoted to decided state | count | `mark_decided` | -| `blob_engine_lifecycle_dropped_total` | Counter | Blobs dropped from undecided state | count | `drop_round` | -| `blob_engine_lifecycle_pruned_total` | Counter | Decided blobs pruned/archived | count | `prune_archived_before` | -| `blob_engine_restream_rebuilds_total` | Counter | Blob metadata rebuilds 
during restream | count | `State::rebuild_blob_sidecars_for_restream` | -| `blob_engine_sync_failures_total` | Counter | Blob sync/fetch failures | count | `AppMsg::ProcessSyncedValue` error path (app.rs) | - -### Key Design Decisions - -1. **No Label Variants**: Use separate metrics (e.g., `verifications_success_total` vs `verifications_failure_total`) instead of labels like `{result="success"}`. - - **Reason**: Malachite metrics don't use `CounterVec`/`GaugeVec` - see `DbMetrics` pattern. - - ✅ **Confirmed correct**: Follows codebase pattern exactly - -2. **Moniker Context**: The `SharedRegistry::with_moniker()` adds moniker automatically. - - **No need** to add `moniker` label in metric definitions. - -3. **Histogram Buckets**: Use exponential buckets for `verification_time`. - - **Range**: 0.001s (1ms) to ~10s with factor 2.0 - - **Reason**: Matches `db_read_time` pattern (line 67 in consensus/metrics.rs) - - **Implementation**: Wrap verification path with a timer guard so both success and error exits enter the histogram - -4. **Dependency**: `malachitebft-app-channel` ✅ **Correct Choice** - - **Already in workspace**: Used by consensus crate - - **Don't use**: `prometheus` crate directly (incompatible pattern) - - **Reference**: `crates/consensus/src/metrics.rs:3-10` - -5. **Gauge Management**: Helper methods handle inc/dec automatically - - **Pattern**: `add_undecided_storage()`, `promote_blobs()`, `drop_blobs()` - - **Benefit**: Atomic updates, no race conditions - - **Implementation detail**: Capture serialized blob sizes in `BlobStore` to calculate deltas - - ✅ **Correct approach**: Better than manual inc/dec in BlobEngine methods - -6. 
**Consensus Visibility**: Surface Tendermint lifecycle alongside blob engine - - `State::rebuild_blob_sidecars_for_restream` records restream rebuilds - - `State::record_sync_failure` exposes blob sync errors in the import path - - `BlobEngineImpl::mark_decided` sets `blob_engine_blobs_per_block` when a block finalizes - - Ensures dashboards correlate blob activity with consensus height/round - ---- - -## Implementation Pattern - -### Reference: DbMetrics Structure - -**File**: `crates/consensus/src/metrics.rs` - -```rust -use malachitebft_app_channel::app::metrics::{ - SharedRegistry, - prometheus::metrics::{ - counter::Counter, - gauge::Gauge, - histogram::{Histogram, exponential_buckets}, - }, -}; - -#[derive(Clone, Debug)] -pub struct DbMetrics(Arc); - -impl Deref for DbMetrics { - type Target = Inner; - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -#[derive(Debug)] -pub struct Inner { - db_read_count: Counter, - db_read_time: Histogram, - // ... -} - -impl DbMetrics { - pub fn register(registry: &SharedRegistry) -> Self { - let metrics = Self::new(); - - registry.with_prefix("app_channel", |registry| { - registry.register( - "db_read_count_total", - "Total number of reads from the database", - metrics.db_read_count.clone(), - ); - // ... 
register others - }); - - metrics - } - - pub fn add_read_bytes(&self, bytes: u64) { - self.db_read_count.inc(); - } -} -``` - -**Key Patterns**: - -- ✅ `Arc` wrapper with `Deref` -- ✅ `register()` method takes `&SharedRegistry` -- ✅ `with_prefix()` for namespace -- ✅ Helper methods for clean instrumentation - ---- - -## Code Structure - -### File: `crates/blob_engine/src/metrics.rs` - -```rust -use std::{ops::Deref, sync::Arc, time::Duration}; - -use malachitebft_app_channel::app::metrics::{ - SharedRegistry, - prometheus::metrics::{ - counter::Counter, - gauge::Gauge, - histogram::{Histogram, exponential_buckets}, - }, -}; - -#[derive(Clone, Debug)] -pub struct BlobEngineMetrics(Arc); - -impl Deref for BlobEngineMetrics { - type Target = Inner; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -#[derive(Debug)] -pub struct Inner { - // Verification metrics - verifications_success: Counter, - verifications_failure: Counter, - verification_time: Histogram, - - // Storage metrics (gauges) - storage_bytes_undecided: Gauge, - storage_bytes_decided: Gauge, - undecided_blob_count: Gauge, - blobs_per_block: Gauge, - - // Lifecycle metrics (counters) - lifecycle_promoted: Counter, - lifecycle_dropped: Counter, - lifecycle_pruned: Counter, - - // Restream/Sync metrics (counters) - restream_rebuilds: Counter, - sync_failures: Counter, -} - -impl Inner { - pub fn new() -> Self { - Self { - verifications_success: Counter::default(), - verifications_failure: Counter::default(), - verification_time: Histogram::new(exponential_buckets(0.001, 2.0, 10)), - - storage_bytes_undecided: Gauge::default(), - storage_bytes_decided: Gauge::default(), - undecided_blob_count: Gauge::default(), - blobs_per_block: Gauge::default(), - - lifecycle_promoted: Counter::default(), - lifecycle_dropped: Counter::default(), - lifecycle_pruned: Counter::default(), - - restream_rebuilds: Counter::default(), - sync_failures: Counter::default(), - } - } -} - -impl Default for Inner { - fn 
default() -> Self { - Self::new() - } -} - -impl BlobEngineMetrics { - pub fn new() -> Self { - Self(Arc::new(Inner::new())) - } - - pub fn register(registry: &SharedRegistry) -> Self { - let metrics = Self::new(); - - registry.with_prefix("blob_engine", |registry| { - // Verification metrics - registry.register( - "verifications_success_total", - "Successful blob KZG proof verifications", - metrics.verifications_success.clone(), - ); - - registry.register( - "verifications_failure_total", - "Failed blob KZG proof verifications", - metrics.verifications_failure.clone(), - ); - - registry.register( - "verification_time", - "Time taken to verify blob KZG proofs (seconds)", - metrics.verification_time.clone(), - ); - - // Storage metrics - registry.register( - "storage_bytes_undecided", - "Storage size of undecided blobs (bytes)", - metrics.storage_bytes_undecided.clone(), - ); - - registry.register( - "storage_bytes_decided", - "Storage size of decided blobs (bytes)", - metrics.storage_bytes_decided.clone(), - ); - - registry.register( - "undecided_blob_count", - "Current number of undecided blobs", - metrics.undecided_blob_count.clone(), - ); - - registry.register( - "blobs_per_block", - "Number of blobs in last finalized block", - metrics.blobs_per_block.clone(), - ); - - // Lifecycle metrics - registry.register( - "lifecycle_promoted_total", - "Blobs promoted to decided state", - metrics.lifecycle_promoted.clone(), - ); - - registry.register( - "lifecycle_dropped_total", - "Blobs dropped from undecided state", - metrics.lifecycle_dropped.clone(), - ); - - registry.register( - "lifecycle_pruned_total", - "Decided blobs pruned/archived", - metrics.lifecycle_pruned.clone(), - ); - - // Restream/Sync metrics - registry.register( - "restream_rebuilds_total", - "Blob metadata rebuilds during restream", - metrics.restream_rebuilds.clone(), - ); - - registry.register( - "sync_failures_total", - "Blob sync/fetch failures", - metrics.sync_failures.clone(), - ); - }); - - 
metrics - } - - // ===== Helper Methods for Instrumentation ===== - - /// Record successful verification batch - pub fn record_verifications_success(&self, count: usize) { - self.verifications_success.inc_by(count as u64); - } - - /// Record failed verification batch - pub fn record_verifications_failure(&self, count: usize) { - self.verifications_failure.inc_by(count as u64); - } - - /// Record verification duration - pub fn observe_verification_time(&self, duration: Duration) { - self.verification_time.observe(duration.as_secs_f64()); - } - - /// Add undecided blob storage (when storing new blobs) - pub fn add_undecided_storage(&self, bytes: usize, blob_count: usize) { - self.storage_bytes_undecided.add(bytes as i64); - self.undecided_blob_count.add(blob_count as i64); - } - - /// Move blobs from undecided to decided - pub fn promote_blobs(&self, bytes: usize, blob_count: usize) { - self.storage_bytes_undecided.sub(bytes as i64); - self.storage_bytes_decided.add(bytes as i64); - self.undecided_blob_count.sub(blob_count as i64); - self.lifecycle_promoted.inc_by(blob_count as u64); - } - - /// Drop undecided blobs - pub fn drop_blobs(&self, bytes: usize, blob_count: usize) { - self.storage_bytes_undecided.sub(bytes as i64); - self.undecided_blob_count.sub(blob_count as i64); - self.lifecycle_dropped.inc_by(blob_count as u64); - } - - /// Prune decided blobs - pub fn prune_blobs(&self, bytes: usize, blob_count: usize) { - self.storage_bytes_decided.sub(bytes as i64); - self.lifecycle_pruned.inc_by(blob_count as u64); - } - - /// Set blobs per finalized block - pub fn set_blobs_per_block(&self, count: usize) { - self.blobs_per_block.set(count as i64); - } - - /// Record restream rebuild - pub fn record_restream_rebuild(&self) { - self.restream_rebuilds.inc(); - } - - /// Record sync failure - pub fn record_sync_failure(&self) { - self.sync_failures.inc(); - } -} - -impl Default for BlobEngineMetrics { - fn default() -> Self { - Self::new() - } -} -``` - -### File: 
`crates/blob_engine/src/lib.rs` - -Add export: - -```rust -pub mod metrics; -``` - -### File: `crates/blob_engine/Cargo.toml` - -Add dependency: - -```toml -[dependencies] -# ... existing dependencies ... - -# Metrics -malachitebft-app-channel = { workspace = true } -``` - ---- - -## Instrumentation Points - -### 1. Add Metrics to BlobEngineImpl - -**File**: `crates/blob_engine/src/engine.rs` - -```rust -use crate::metrics::BlobEngineMetrics; -use std::sync::Arc; - -pub struct BlobEngineImpl -where - S: BlobStore, -{ - verifier: BlobVerifier, - store: S, - metrics: BlobEngineMetrics, // NEW -} - -impl BlobEngineImpl -where - S: BlobStore, -{ - pub fn new(store: S, metrics: BlobEngineMetrics) -> Result { - Ok(Self { - verifier: BlobVerifier::new()?, - store, - metrics, - }) - } -} -``` - -### 2. Instrument `verify_and_store` - -```rust -use ultramarine_types::blob::BYTES_PER_BLOB; - -async fn verify_and_store( - &self, - height: Height, - round: i64, - sidecars: &[BlobSidecar], -) -> Result<(), BlobEngineError> { - if sidecars.is_empty() { - return Ok(()); - } - - let timer_start = std::time::Instant::now(); - let refs: Vec<&BlobSidecar> = sidecars.iter().collect(); - - if let Err(err) = self.verifier.verify_blob_sidecars_batch(&refs) { - self.metrics.observe_verification_time(timer_start.elapsed()); - self.metrics.record_verifications_failure(sidecars.len()); - return Err(err.into()); - } - - self.metrics.observe_verification_time(timer_start.elapsed()); - self.metrics.record_verifications_success(sidecars.len()); - - let stored_count = self.store.put_undecided_blobs(height, round, sidecars).await?; - let total_bytes = stored_count * BYTES_PER_BLOB; - self.metrics.add_undecided_storage(total_bytes, stored_count); - - Ok(()) -} -``` - -> `BYTES_PER_BLOB` keeps the gauge math constant-time; every blob is exactly 131,072 bytes. - -### 3. 
Instrument `mark_decided` - -```rust -async fn mark_decided(&self, height: Height, round: i64) -> Result<(), BlobEngineError> { - let (blob_count, total_bytes) = self.store.mark_decided(height, round).await?; - - self.metrics.promote_blobs(total_bytes, blob_count); - self.metrics.set_blobs_per_block(blob_count); - - Ok(()) -} -``` - -> `mark_decided` now returns both blob count and serialized byte total, so the engine can update gauges without issuing a second RocksDB scan. - -### 4. Instrument `drop_round` - -```rust -async fn drop_round(&self, height: Height, round: i64) -> Result<(), BlobEngineError> { - let (blob_count, total_bytes) = self.store.drop_round(height, round).await?; - - self.metrics.drop_blobs(total_bytes, blob_count); - - Ok(()) -} -``` - -### 5. Instrument `mark_archived` - -```rust -async fn mark_archived(&self, height: Height, indices: &[u16]) -> Result<(), BlobEngineError> { - self.store.delete_archived(height, indices).await?; - - let bytes = indices.len() * BYTES_PER_BLOB; - self.metrics.prune_blobs(bytes, indices.len()); - - Ok(()) -} -``` - -### 6. Instrument `prune_archived_before` - -```rust -async fn prune_archived_before(&self, height: Height) -> Result { - let pruned_count = self.store.prune_before(height).await?; - let pruned_bytes = pruned_count * BYTES_PER_BLOB; - - self.metrics.prune_blobs(pruned_bytes, pruned_count); - - Ok(pruned_count) -} -``` - -> For Phase 5 the store only returns counts; we derive byte totals via the fixed blob size constant. Phase 6 will expand this to configurable retention strategies. - -### 7. Consensus Hooks - -```rust -pub struct State -where - E: BlobEngine, -{ - // ... - pub blob_metrics: BlobEngineMetrics, - // ... -} -``` - -```rust -impl State { - fn rebuild_blob_sidecars_for_restream(&self, metadata: &BlobMetadata, ...) 
-> eyre::Result> { - let result = /* existing rebuild */; - self.blob_metrics.record_restream_rebuild(); - result - } -} - -// In app.rs (sync path) -state.blob_metrics.record_sync_failure(); -``` - -> Pass the metrics handle (clone of `BlobEngineMetrics`) into `State::new` when constructing it in `node.rs` so consensus can emit restream/sync counters; per-block gauges are updated inside `BlobEngineImpl::mark_decided`. - ---- - -## Registration in Node - -### File: `crates/node/src/node.rs` - -**Location**: Around line 158 (after `DbMetrics::register`) - -```rust -use ultramarine_blob_engine::metrics::BlobEngineMetrics; -use std::sync::Arc; - -// ... inside node startup ... - -// Existing code -let registry = SharedRegistry::global().with_moniker(&self.config.moniker); -let db_metrics = DbMetrics::register(®istry); - -// NEW: Register blob metrics -let blob_metrics = BlobEngineMetrics::register(®istry); - -// Later when creating blob engine (find existing BlobEngineImpl::new call) -let blob_engine = BlobEngineImpl::new(blob_store, blob_metrics.clone())?; -``` - ---- - -## Testing Metrics - -### Step 1: Build and Start Testnet - -```bash -# Build with metrics -cargo build --release - -# Start testnet -make all -``` - -### Step 2: Check Metrics Endpoint - -```bash -# Check blob metrics are exposed -curl -s http://localhost:29000/metrics | grep blob_engine - -# Expected output (initial state): -# blob_engine_verifications_success_total{job="malachite0",moniker="test-0"} 0 -# blob_engine_verifications_failure_total{job="malachite0",moniker="test-0"} 0 -# blob_engine_storage_bytes_undecided{job="malachite0",moniker="test-0"} 0 -# blob_engine_storage_bytes_decided{job="malachite0",moniker="test-0"} 0 -# blob_engine_undecided_blob_count{job="malachite0",moniker="test-0"} 0 -# ... 
(12 total) -``` - -### Step 3: Verify Prometheus Scrapes - -```bash -# Check Prometheus targets -curl -s http://localhost:9090/api/v1/targets | python3 -m json.tool | grep -A 10 "malachite0" - -# Query specific metric -curl -s 'http://localhost:9090/api/v1/query?query=blob_engine_verifications_success_total' | python3 -m json.tool -``` - -### Step 4: Generate Load (After Spam Tool Fixed) - -```bash -# Run blob spam -make spam-blobs - -# Watch metrics update -watch -n 1 'curl -s http://localhost:29000/metrics | grep blob_engine_verifications' -``` - -### Step 5: Verify Metric Behavior - -**Test Cases**: - -1. ✅ `verifications_success_total` increments on valid blobs -2. ✅ `storage_bytes_undecided` increases when storing blobs -3. ✅ `undecided_blob_count` matches number of stored blobs -4. ✅ `lifecycle_promoted_total` increments on block finalization -5. ✅ `storage_bytes_decided` increases after promotion -6. ✅ `blobs_per_block` reflects blobs in last block -7. ✅ `verification_time` histogram shows P50/P99 values - ---- - -## Dashboard Queries - -### Pattern: Match GRAFANA_WORKING_STATE.md - -**Simple queries (no aggregations, no template variables)** - -### Panel 1: Verification Success Rate - -```json -{ - "title": "Blob Verification Rate", - "type": "timeseries", - "targets": [{ - "expr": "rate(blob_engine_verifications_success_total[1m])", - "legendFormat": "{{job}} - success", - "range": true, - "instant": false - }] -} -``` - -### Panel 2: Verification Failures - -```json -{ - "title": "Blob Verification Failures", - "type": "timeseries", - "targets": [{ - "expr": "rate(blob_engine_verifications_failure_total[1m])", - "legendFormat": "{{job}} - failure", - "range": true, - "instant": false - }] -} -``` - -### Panel 3: Storage Size by State - -```json -{ - "title": "Blob Storage Size - Undecided", - "type": "timeseries", - "targets": [{ - "expr": "blob_engine_storage_bytes_undecided", - "legendFormat": "{{job}}", - "range": true, - "instant": false - }] -} -``` 
- -```json -{ - "title": "Blob Storage Size - Decided", - "type": "timeseries", - "targets": [{ - "expr": "blob_engine_storage_bytes_decided", - "legendFormat": "{{job}}", - "range": true, - "instant": false - }] -} -``` - -### Panel 4: Undecided Blob Count - -```json -{ - "title": "Undecided Blobs Count", - "type": "timeseries", - "targets": [{ - "expr": "blob_engine_undecided_blob_count", - "legendFormat": "{{job}}", - "range": true, - "instant": false - }] -} -``` - -### Panel 5: Lifecycle Transitions - -```json -{ - "title": "Blob Lifecycle - Promoted", - "type": "timeseries", - "targets": [{ - "expr": "rate(blob_engine_lifecycle_promoted_total[1m])", - "legendFormat": "{{job}}", - "range": true, - "instant": false - }] -} -``` - -```json -{ - "title": "Blob Lifecycle - Dropped", - "type": "timeseries", - "targets": [{ - "expr": "rate(blob_engine_lifecycle_dropped_total[1m])", - "legendFormat": "{{job}}", - "range": true, - "instant": false - }] -} -``` - -### Panel 6: Blobs Per Block - -```json -{ - "title": "Blobs Per Block", - "type": "timeseries", - "targets": [{ - "expr": "blob_engine_blobs_per_block", - "legendFormat": "{{job}}", - "range": true, - "instant": false - }] -} -``` - -### Panel 7: Verification Latency (P99) - -```json -{ - "title": "Blob Verification Latency (P99)", - "type": "timeseries", - "targets": [{ - "expr": "histogram_quantile(0.99, rate(blob_engine_verification_time_bucket[5m]))", - "legendFormat": "{{job}} - P99", - "range": true, - "instant": false - }] -} -``` - -### Panel 8: Restream Rebuilds - -```json -{ - "title": "Restream Rebuilds Rate", - "type": "timeseries", - "targets": [{ - "expr": "rate(blob_engine_restream_rebuilds_total[1m])", - "legendFormat": "{{job}}", - "range": true, - "instant": false - }] -} -``` - -**Note**: Add panels to dashboard ONLY after metrics are confirmed working (Step 2 above passes). 
- ---- - -## Future Work - Node-Level Metrics - -**Status**: 🔵 **Phase A.2** - After BlobEngine metrics complete - -### What's Not Covered (Yet) - -This plan focuses on **BlobEngine** (storage layer) metrics. The following **consensus-layer** operations in `crates/node/src/app.rs` are not yet instrumented: - -#### **Missing Metrics** (Recommend adding in Phase A.2): - -1. **Blob Count Mismatch Detection** ⚠️ **CRITICAL SAFETY METRIC** - - **Location**: `app.rs:630` (Decided handler) - - **Code**: `if blobs.len() != expected_blob_count { ... }` - - **Metric**: `app_blob_count_mismatch_total` (counter) - - **Why Critical**: Detects blob availability failures during finalization - -2. **Proposer Blob Storage Timing** - - **Location**: `app.rs:163` (GetValue handler) - - **Code**: `state.blob_engine().verify_and_store(...)` - - **Metric**: `app_blob_proposal_storage_duration_seconds` (histogram) - - **Purpose**: Track proposer-side blob storage latency - -3. **Restream Rebuild Operations** - - **Location**: `app.rs:410` (RestreamProposal handler) - - **Code**: `state.blob_engine().get_undecided_blobs(...)` - - **Metric**: `app_blob_restream_rebuilds_total` (counter) - - **Purpose**: Track how often proposals are rebuilt for restreaming - -4. 
**Sync Path Blob Operations** - - **Location**: `app.rs:765` (ProcessSyncedValue handler) - - **Code**: `state.blob_engine().verify_and_store(...)` - - **Metric**: `app_blob_sync_received_total` (counter) - - **Purpose**: Distinguish sync path from proposal path blobs - -### Why Deferred - -**Rationale**: - -- BlobEngine metrics provide 80% of observability value -- Node-level metrics require understanding consensus flow patterns -- Can be added incrementally after BlobEngine metrics proven working -- Estimated effort: +2 hours (simple counters/histograms in app.rs) - -### Implementation Approach (Phase A.2) - -**File**: Create `crates/node/src/blob_metrics.rs` (separate from BlobEngine) - -```rust -use malachitebft_app_channel::app::metrics::{ - SharedRegistry, - prometheus::metrics::{counter::Counter, histogram::Histogram}, -}; - -#[derive(Clone, Debug)] -pub struct AppBlobMetrics(Arc); - -#[derive(Debug)] -pub struct Inner { - blob_count_mismatch: Counter, - proposal_storage_duration: Histogram, - restream_rebuilds: Counter, - sync_received: Counter, -} - -impl AppBlobMetrics { - pub fn register(registry: &SharedRegistry) -> Self { - // Register with prefix "app_blob_" - } -} -``` - -**Instrumentation**: Add metric calls directly in `app.rs` handlers (no trait changes needed). - -**Validation**: Same process as BlobEngine metrics (curl /metrics, check Prometheus). 
- ---- - -## Progress Tracking - -### Phase A: BlobEngine Metrics Instrumentation - -**Status**: ✅ Complete (2025-11-04) - -**Scope**: BlobEngine (storage layer) only - node-level metrics deferred to Phase A.2 - -#### Task A.1: Create Metrics Module - -- [x] Create `crates/blob_engine/src/metrics.rs` -- [x] Add `BlobEngineMetrics` struct with 12 metrics -- [x] Add `register()` method using `SharedRegistry` -- [x] Add helper methods for instrumentation (lines 293-351 in this doc) -- [x] Export in `lib.rs`: `pub mod metrics;` - -**Estimated Time**: 1-2 hours -**Reference**: Lines 128-358 in this document (complete code provided) - -#### Task A.2: Add Dependencies - -- [x] Add `malachitebft-app-channel` to `blob_engine/Cargo.toml` -- [x] Verify `cargo build -p ultramarine-blob-engine` succeeds - -**Estimated Time**: 5 minutes - -#### Task A.3: Add Metrics Field to BlobEngine - -- [x] Add `metrics: BlobEngineMetrics` to `BlobEngineImpl` (required parameter, not Optional) -- [x] Update constructor to require metrics parameter -- [x] Update all test call sites (4 tests) and node initialization - -**Estimated Time**: 15 minutes -**Note**: Implemented as required parameter for simplicity - matches DbMetrics pattern - -#### Task A.4: Extend BlobStore API - -- [x] Update `put_undecided_blobs` to return `usize` (blob count) -- [x] Update `mark_decided` to return `(usize, usize)` (blob count, total bytes) -- [x] Update `drop_round` to return `(usize, usize)` (blob count, total bytes) -- [x] Update trait and RocksDB implementation with count tracking - -**Estimated Time**: 45 minutes -**Note**: Focused on methods that needed metrics; prune_before already returned count - -#### Task A.5: Instrument BlobEngine Methods - -- [x] Instrument `verify_and_store` (verification counters + histogram + storage gauges) -- [x] Instrument `mark_decided` (promotion counters + gauge adjustments) -- [x] Instrument `drop_round` (drop counters + gauge decrements) -- [x] Instrument `mark_archived` 
(decided storage decrements) -- [x] Instrument `prune_archived_before` (decided storage decrements) - -**Estimated Time**: 1-2 hours -**Note**: Used bulk gauge operations (inc_by/dec_by) instead of loops for performance - -#### Task A.6: Register in Node Startup - -- [x] Find blob engine initialization in `crates/node/src/node.rs` (line 168) -- [x] Create `BlobEngineMetrics::register(®istry)` (line 159) -- [x] Pass metrics to `BlobEngineImpl::new(store, blob_metrics)` (line 169) -- [x] Verify `cargo build -p ultramarine-node` succeeds - -**Estimated Time**: 30 minutes - -#### Task A.7: Wire Consensus Hooks - -- [x] Add `blob_metrics: BlobEngineMetrics` field to `State` -- [x] Pass metrics clone into `State::new` (node startup) -- [x] Increment restream/rebuild counters in `State::rebuild_blob_sidecars_for_restream` -- [x] Track proposer-side failures in `State::prepare_blob_sidecar_parts` / import path (sync failure counter) - -**Estimated Time**: 45 minutes (completed during Phase A.2) -**Note**: Per-block gauges are updated in `BlobEngineImpl::mark_decided`; consensus-owned metrics cover restream and sync paths. 
- -#### Task A.8: Test Metrics Endpoint - -- [x] **COMPLETE** - `make all` (start testnet) -- [x] **COMPLETE** - `curl http://localhost:29000/metrics | grep blob_engine` (verified 12 metrics) -- [x] **COMPLETE** - Verify Prometheus scrapes targets successfully -- [x] **COMPLETE** - Query via Prometheus API - -**Estimated Time**: 30 minutes -**Status**: ✅ Completed 2025-11-04 (Phase C) - -#### Task A.9: Validate Metric Behavior (Optional, requires blob spam) - -- [x] **COMPLETE** - Run blob spam tool (193 txs, 1,158 blobs) -- [x] **COMPLETE** - Verify `verifications_success_total` increments (1,158 successes) -- [x] **COMPLETE** - Verify `storage_bytes_undecided` increases (dynamic during proposals) -- [x] **COMPLETE** - Verify `lifecycle_promoted_total` increments on finalization (1,158 promoted) - -**Estimated Time**: 30 minutes -**Status**: ✅ Completed 2025-11-04 (Phase C) - Spam tool works correctly - ---- - -### Implementation Summary (2025-11-04) - -**What Was Completed:** - -1. **Metrics Module** (`crates/blob_engine/src/metrics.rs` - 235 lines) - - 12 metrics implemented (8 counters, 3 gauges, 1 histogram) - - `BlobEngineMetrics` struct with `Arc` pattern matching `DbMetrics` - - Helper methods: `add_undecided_storage()`, `promote_blobs()`, `drop_blobs()`, `prune_blobs()`, etc. - - Registration via `SharedRegistry` with "blob_engine" prefix - -2. **BlobStore API Extensions** (`crates/blob_engine/src/store/mod.rs`) - - Updated `put_undecided_blobs()` return type: `Result<(), _>` → `Result` - - Updated `mark_decided()` return type: `Result<(), _>` → `Result<(usize, usize), _>` - - Updated `drop_round()` return type: `Result<(), _>` → `Result<(usize, usize), _>` - - RocksDB implementation tracks counts using blob sidecar iteration and `.size()` method - -3. 
**BlobEngine Instrumentation** (`crates/blob_engine/src/engine.rs`) - - Added `metrics: BlobEngineMetrics` field to `BlobEngineImpl` - - `verify_and_store()`: Records verification timing, success/failure counts, storage gauge updates - - `mark_decided()`: Tracks promotion with `promote_blobs(bytes, count)` - - `drop_round()`: Tracks drops with `drop_blobs(bytes, count)` - - `mark_archived()`: Tracks pruning with `prune_blobs(bytes, count)` - - `prune_archived_before()`: Uses `BYTES_PER_BLOB` constant for byte calculations - -4. **Node Registration** (`crates/node/src/node.rs`) - - Line 159: `BlobEngineMetrics::register(®istry)` - - Line 169: Pass metrics to `BlobEngineImpl::new(store, blob_metrics)` - - Wired into SharedRegistry with moniker prefix - -5. **Documentation Updates** - - Fixed lib.rs example to show `BlobEngineImpl::new(store, metrics)?` - - Updated engine.rs BlobEngine trait doc example - - Fixed verifier.rs doc test (removed broken example) - -**Critical Fixes Applied During Code Review:** - -1. **Performance Fix**: Replaced gauge loops with bulk operations - - Before: `for _ in 0..bytes { self.gauge.inc(); }` (131k+ operations per blob) - - After: `self.gauge.inc_by(bytes as i64)` (single operation) - -2. **Missing Instrumentation**: Added metrics to `mark_archived()` - - Calculates `total_bytes = blob_count * BYTES_PER_BLOB` - - Calls `metrics.prune_blobs(total_bytes, blob_count)` - -3. **Magic Number Elimination**: Replaced hard-coded `131_072` with `BYTES_PER_BLOB` constant - - Imported from `ultramarine_types::blob::BYTES_PER_BLOB` - - Applied in `prune_archived_before()` calculation - -4. 
**Type Fixes**: Corrected gauge API usage - - Gauge methods accept `i64`, not `u64` - - Changed all casts from `as u64` to `as i64` - -**Test Results:** - -- ✅ 11 unit tests passing, 2 ignored (require valid KZG proofs) -- ✅ Full codebase builds successfully -- ✅ All doc tests compile - -**Deferred to Phase A.2:** - -- State/consensus hooks (`set_blobs_per_block`, restream counters) -- Integration testing with live testnet -- Grafana dashboard validation - -**Files Modified (Phase A.1):** - -- `crates/blob_engine/src/metrics.rs` (new) -- `crates/blob_engine/src/engine.rs` -- `crates/blob_engine/src/store/mod.rs` -- `crates/blob_engine/src/store/rocksdb.rs` -- `crates/blob_engine/src/lib.rs` -- `crates/blob_engine/Cargo.toml` -- `crates/node/src/node.rs` - ---- - -### Phase A.2 Implementation Summary (2025-11-04) - -**What Was Completed:** - -1. **State Metrics Integration** - - Added `pub(crate) blob_metrics: BlobEngineMetrics` field to `State` struct - - Updated `State::new()` constructor to accept metrics parameter - - Updated `node.rs` to pass `blob_metrics.clone()` to State - - Fixed test helper in `state.rs` to create metrics instance - -2. **Fixed Missing Instrumentation** - - Added `set_blobs_per_block(blob_count)` call in `BlobEngine::mark_decided()` (engine.rs:273) - - This gauge was defined but never updated - now correctly tracks blobs per finalized block - -3. **Restream Path Instrumentation** - - Added `self.blob_metrics.record_restream_rebuild()` in `State::rebuild_blob_sidecars_for_restream()` (state.rs:503) - - Tracks when blob sidecars are reconstructed from storage metadata during restreaming - -4. **Sync Failure Instrumentation** - - Added `state.record_sync_failure()` in blob sync error path (app.rs:771) - - Tracks failed blob verification/storage during sync package processing - -5. 
**Encapsulation Improvements** (Architectural Quality) - - Changed `blob_metrics` field visibility to `pub(crate)` (state.rs:87) - - Added public helper method `State::record_sync_failure()` (state.rs:189-195) - - External crates (like `ultramarine-node`) now use clean API instead of direct field access - - State maintains sole ownership of its instrumentation surface - - Internal code within `consensus` crate can still access `self.blob_metrics` directly for flexibility - -**Architectural Rationale:** - -The `pub(crate)` + helper method pattern provides: - -- ✅ **Encapsulation**: State owns its instrumentation surface, metrics changes stay localized -- ✅ **Maintainability**: Future metrics API changes don't ripple across crate boundaries -- ✅ **Clean API**: External callers use documented, semantic methods -- ✅ **Flexibility**: Internal consensus code can access metrics directly when needed - -**Files Modified (Phase A.2):** - -- `crates/consensus/src/state.rs` (metrics field, helper method, restream instrumentation) -- `crates/node/src/node.rs` (pass metrics to State) -- `crates/node/src/app.rs` (sync failure instrumentation) -- `crates/blob_engine/src/engine.rs` (fixed set_blobs_per_block call) - -**Test Results:** - -- ✅ 25 consensus tests passing -- ✅ 11 blob_engine tests passing -- ✅ Full codebase builds successfully -- ✅ Zero regressions - ---- - -### Success Criteria - -**Phase A.1 (BlobEngine instrumentation)** is complete when: - -1. ✅ `crates/blob_engine/src/metrics.rs` exists with 12 metrics **[DONE 2025-11-04]** -2. ✅ `cargo build` succeeds **[DONE 2025-11-04]** -3. ✅ `curl http://localhost:29000/metrics | grep blob_engine` returns 12+ lines **[DONE 2025-11-04]** -4. ✅ Prometheus scrapes metrics successfully (check `/targets` page) **[DONE 2025-11-04]** -5. ✅ Query returns data: `curl 'http://localhost:9090/api/v1/query?query=blob_engine_verifications_success_total'` **[DONE 2025-11-04]** - -**Phase A.2 (State/consensus hooks)** is complete when: - -1. 
✅ State accepts BlobEngineMetrics at construction **[DONE 2025-11-04]** -2. ✅ `blobs_per_block` gauge updates on finalization **[DONE 2025-11-04]** -3. ✅ `restream_rebuilds_total` increments when rebuilding metadata **[DONE 2025-11-04]** -4. ✅ `sync_failures_total` increments on blob fetch/verify errors **[DONE 2025-11-04]** -5. ✅ Metrics encapsulation maintained via `pub(crate)` + helper methods **[DONE 2025-11-04]** -6. ✅ All tests still pass (25 consensus, 11 blob_engine) **[DONE 2025-11-04]** - -**Optional** (requires working spam tool): -7. ✅ Metrics update in real-time during blob spam **[DONE 2025-11-04]** -8. ✅ Verification, storage, and lifecycle metrics correlate correctly **[DONE 2025-11-04]** - -**Phase A.1 Core Implementation**: ✅ Complete -**Phase A.2 State Hooks**: ✅ Complete -**Integration Validation**: ✅ Complete (Phase C validated on testnet) - ---- - -## Notes - -### What to Avoid - -- ❌ Using `prometheus` crate directly (use `malachitebft-app-channel` instead) -- ❌ `CounterVec`/`GaugeVec` with labels (use separate metrics like DbMetrics) -- ❌ Complex dashboard queries (no `max by`, no filters initially) -- ❌ Adding metrics to dashboard before Step 2 passes -- ❌ Following other plan documents (BLOB_METRICS_MINIMAL_PLAN.md, PHASE5_ACTION_PLAN.md are archived) - -### What to Do - -- ✅ Follow `DbMetrics` pattern exactly (`crates/consensus/src/metrics.rs`) -- ✅ Use `SharedRegistry::with_prefix("blob_engine", ...)` -- ✅ Test metrics endpoint BEFORE adding dashboard panels -- ✅ Keep helper methods simple (e.g., `record_verifications_success`) -- ✅ Use this document (METRICS_PROGRESS.md) as single source of truth - -### Dependencies - -- `malachitebft-app-channel` workspace dependency ✅ **Already in workspace** -- Existing `SharedRegistry` in node startup ✅ **Available** -- Working Prometheus/Grafana setup ✅ **Functional** (verified 2025-11-03) - -### Architecture Validation ✅ - -**This plan has been reviewed against the codebase and confirmed:** - -- ✅ 
Follows exact pattern from `consensus/src/metrics.rs` (DbMetrics) -- ✅ Uses correct dependency (`malachitebft-app-channel`) -- ✅ Constructor pattern matches codebase (metrics passed directly to `BlobEngineImpl::new`) -- ✅ Helper methods handle gauge inc/dec correctly -- ✅ Compatible with existing monitoring infrastructure (Prometheus + Grafana) -- ⚠️ Defers node-level metrics (app.rs) to Phase A.2 (see section above) - ---- - -## References - -- **Pattern**: `crates/consensus/src/metrics.rs` (DbMetrics) -- **Registration**: `crates/node/src/node.rs:158` (DbMetrics::register) -- **Dashboard Style**: `docs/GRAFANA_WORKING_STATE.md` (simple queries) -- **Phase Plan**: `docs/PHASE5_TESTNET.md` (Phase A) - ---- - -## Next Steps: Integration & Testing - -**Phase A (Metrics Instrumentation)**: ✅ **COMPLETE** - -Both Phase A.1 (BlobEngine surface) and Phase A.2 (State/consensus hooks) are now complete. All 12 metrics are fully instrumented across the BlobEngine and State layers. - -**Upcoming Phases:** - -### **Phase B: In-Process Integration Tests** (6-8 hours) — ✅ **Complete (event-driven harness, 2025-11-18 refresh)** - -- Implement test harness with `TempDir` isolation -- Add 3 integration tests: `blob_roundtrip`, `restart_hydrate`, `sync_package_roundtrip` -- Mock Execution client, use real blob engine/KZG -- Target: ~2-5 seconds per test - -### **Phase C: Full-Stack Smoke & Observability** (4-6 hours) — ⏳ **Pending** - -- Boot Docker testnet (`make all`) -- Run blob spam tool (after Phase E fixes) -- Verify metrics endpoint exposes all 12 `blob_engine_*` metrics -- Expand Grafana dashboard with blob panels -- Document testnet workflow - -### **Phase E: Fix Spam Tool** (4-6 hours) — ⏳ **Pending** - -- Generate real 131KB blobs with KZG commitments/proofs -- Use Alloy/Reth blob transaction helpers -- Required for end-to-end validation - -**Immediate Action Items:** - -1. ⏳ Wait for Phase B integration tests from beta team -2. 
✅ Start testnet to validate metrics endpoint (`curl /metrics | grep blob_engine`) -3. ⏳ Begin Grafana dashboard panel design (see GRAFANA_WORKING_STATE.md) - ---- - -**Last Updated**: 2025-11-04 -**Phase A.1**: ✅ Complete (BlobEngine instrumentation) -**Phase A.2**: ✅ Complete (State/consensus hooks) -**Phase B**: ✅ Complete (Beta team integration tests landed; deterministic harness updated 2025-11-18) -**Next Review**: After testnet startup (metrics endpoint validation) diff --git a/docs/PHASE5_SUMMARY.md b/docs/PHASE5_SUMMARY.md deleted file mode 100644 index 0e2754a..0000000 --- a/docs/PHASE5_SUMMARY.md +++ /dev/null @@ -1,379 +0,0 @@ -# Phase 5 Complete: EIP-4844 Blob Sidecar Implementation - -**Status**: ✅ **COMPLETE** (All Sub-Phases Validated) -**Completion Date**: November 5, 2025 -**Duration**: October 31 - November 5, 2025 (6 days) - ---- - -## Executive Summary - -Phase 5 delivered end-to-end blob sidecar support for Ultramarine while keeping consensus messages lightweight and aligned with Malachite/Tendermint streaming expectations. The work focused on three concrete areas: - -1. **Execution → Consensus bridge** – proposal flow now retrieves Deneb payloads with blob bundles via Engine API v3 and converts them into metadata-only consensus values. -2. **Blob storage & availability** – the blob engine verifies real KZG proofs, persists blobs in RocksDB, and enforces availability during commit. -3. **Observability & validation** – new metrics, dashboards, load tooling, and an in-process harness prove the system under both single-node and multi-node scenarios. 
- -**Key Achievements**: - -- ✅ 12 Prometheus metrics tracking complete blob lifecycle -- ✅ 9 Grafana dashboard panels for real-time observability -- ✅ 17 integration tests (3 Tier 0 + 14 Tier 1) with real KZG cryptography, including negative-path coverage -- ✅ 1,158 blobs processed successfully on live testnet (100% verification rate) -- ✅ Full blob storage lifecycle: undecided → decided → pruned - ---- - -## Phase Structure - -Phase 5 was organized into three sequential sub-phases: - -### Phase 5A: Metrics Instrumentation - -**Goal**: Instrument blob engine with Prometheus metrics and Grafana dashboards -**Status**: ✅ COMPLETE (November 4, 2025) - -### Phase 5B: Integration Tests - -**Goal**: Build integration test harness with 3+ E2E scenarios -**Status**: ✅ COMPLETE (November 5, 2025) -**Delivered**: 13 E2E tests (430% of goal) - -### Phase 5C: Testnet Validation - -**Goal**: Validate blob lifecycle on live 3-node testnet -**Status**: ✅ COMPLETE (November 4, 2025) - -## What We Built - -- **Execution bridge** `crates/execution/src/client.rs:349`\ - `generate_block_with_blobs` fetches payloads + blob bundles and returns `ExecutionPayloadV3` plus converted `BlobsBundle`. - -- **Consensus state updates** `crates/consensus/src/state.rs` - - `propose_value_with_blobs` stores metadata + undecided `BlobMetadata` before streaming (`1034` onwards). - - Commit path refuses to finalize without blob promotion and keeps the parent blob root updated (`840` onwards). - - Restream logic rebuilds Deneb-compatible sidecars on demand (`333-420`, `452-557`). - -- **Blob engine** `crates/blob_engine/src` - - `BlobVerifier` loads the embedded Ethereum trusted setup and batch-verifies KZG proofs. - - `BlobEngineImpl::verify_and_store` and `mark_decided` manage lifecycle; the latter is now idempotent, preserving metrics on duplicate promotion (`270-305`). - - RocksDB store moves blobs between undecided/decided columns without iterator invalidation (`store/rocksdb.rs:218`). 
- -- **Metrics & dashboards** - - `BlobEngineMetrics` exports 12 metrics and hooks into the node registry (`metrics.rs:23-175`, `crates/node/src/node.rs:157-180`). - - Grafana dashboard references the new metric names (`monitoring/config-grafana/.../default.json:534-1291`). - - Developer workflow doc explains how to interpret blob panels (`docs/DEV_WORKFLOW.md:120-200`). - -- **Load tooling** `crates/utils/src/tx.rs:72-200`\ - Spam utility now generates real blobs, commitments, proofs, and versioned hashes using the shared trusted setup. - -- **Integration harness** - - **Tier 0 (component smokes)** live in `crates/consensus/tests` and cover happy-path proposal/commit, commitment mismatch rejection, and retention logic with real RocksDB + KZG. They run in `make test` and CI by default. - - **Tier 1 (full-node)** lives in `crates/test/tests/full_node` and boots Malachite channel actors, WAL, libp2p, and the application loop for 14 scenarios (quorum blob roundtrip, restream, restarts, ValueSync failures, pruning, etc.). Run via `make itest-node`; wired into CI in `itest-tier1` with failure artifacts. - -**Key Design Decisions**: - -- Consensus messages contain only blob metadata (commitments/hashes), never full blob bytes -- Blobs stored separately with two-stage lifecycle: undecided → decided → pruned -- Real KZG cryptography in tests (c-kzg library with Ethereum mainnet trusted setup) -- RocksDB backend with separate column families (CF_UNDECIDED, CF_DECIDED) -- Hardcoded 5-block retention in Phase 5 (configurable pruning deferred to Phase 6) - -## What We Validated - -- **Tier 0 (consensus crate)**: 3 smokes (`blob_roundtrip`, `blob_sync_commitment_mismatch`, `blob_pruning`) validate proposer→commit happy path, metadata/sidecar consistency checks, and retention metrics with real KZG + RocksDB. These run in the default `make test` and CI. 
-- **Tier 1 (full-node harness)**: 14 scenarios cover quorum blob roundtrip, restream across validators/rounds, multi-height restarts, ValueSync (happy and failure: commitment mismatch, inclusion proof failure), blobless sequences, pruning, sync package roundtrip, and EL rejection. Run via `make itest-node` (process-isolated) and in CI’s `itest-tier1` lane with artifacts on failure. - -**Integration Test Results** (Phase 5B): - -- ✅ Tier 0: 3/3 passing with real KZG -- ✅ Tier 1: 14/14 passing with real KZG, libp2p, WAL, channel actors -- ✅ Metrics accurately track operations (verifications, storage, lifecycle) -- ✅ Blob lifecycle works correctly (undecided → decided → pruned) -- ✅ Persistence survives restarts (RocksDB hydration) -- ✅ Sync packages include and process blobs correctly -- ✅ Late-join validators can sync and access decided blobs -- ✅ Negative-path coverage ensures execution-layer rejection stops commit - -**Testnet Validation Results** (Phase 5C): - -**Command**: `make spam-blobs` (60 seconds @ 50 TPS, 6 blobs/tx) -**Date**: November 4, 2025 - -```bash -# Total blobs processed -blob_engine_verifications_success_total 1158 - -# Storage state (all promoted, none pending) -blob_engine_storage_bytes_undecided 0 -blob_engine_storage_bytes_decided 151748608 # ~145 MB - -# Lifecycle tracking -blob_engine_lifecycle_promoted_total 1158 -blob_engine_lifecycle_dropped_total 0 -blob_engine_lifecycle_pruned_total 0 # Recent blocks - -# Performance -blob_engine_verification_time_bucket{le="0.01"} 1158 # All < 10ms -``` - -**Key Findings**: - -- 100% KZG verification success rate (1,158/1,158) -- Zero verification failures -- All blobs promoted from undecided → decided -- Grafana dashboard panels updating in real-time -- Verification latency consistently under 10ms -- 3 nodes reached consensus on all blob hashes -- Block rate maintained at ~1.0 blocks/second with blobs -- Metrics remain stable after fixing the duplicate `mark_decided` call; 
`blob_engine_blobs_per_block` no longer resets to zero when the operator commit path calls promotion twice. - -## Performance Characteristics - -- **Verification throughput** – Batch KZG verification (c-kzg) validated hundreds of blobs without failures; `blob_engine_verification_time` histogram shows <50 ms P99 during testnet spam. -- **Storage footprint** – Metrics track undecided/decided bytes. With 6 blobs per block, storage gauges confirm 6 × 131,072 B promoted and zero undecided leftovers. -- **Test harness runtime** – Tier 0 smokes now run in ~8–10 s (3 scenarios); Tier 1 full-node suite runs separately via `make itest-node` (14 scenarios). -- **Spam campaign** – 60 s at 50 TPS (6 blobs/tx) exercised 1,158 blobs; the system maintained zero verification failures and matched versioned hashes against KZG commitments (see `docs/PHASE5_TESTNET.md:426-440`). - -## Lessons Learned - -### 1. Scope Clarity Prevents Over-Engineering - -**Challenge**: Initial Phase 5B assessment identified many "gaps" (error path testing, edge cases, pruning validation) that seemed like blockers. - -**User Feedback**: "Why it's a fix? Why do we need it? Pruning is phase 6 thing" - -**Lesson**: - -- Clearly distinguish "completion criteria" from "future improvements" -- Pruning/archiving are Phase 6/7 scope, not Phase 5 requirements -- Test coverage gaps are acceptable if acknowledged and documented -- **Original goal**: 3+ E2E tests → **Delivered**: 6 tests → Goal exceeded by 100% - -**Outcome**: Phase 5 declared complete despite known limitations. Future improvements tracked separately for Phase 6+. - -### 2. Trust the Code, Not Documentation - -**Challenge**: Documentation contradicted itself about Phase 5B status. - -**Solution**: Run actual tests (`make itest`) to determine ground truth. 
- -**Lesson**: - -- Documentation can drift from reality during rapid development -- Code and passing tests are source of truth -- Simple validation (just run tests) beats extensive analysis - -**Outcome**: All 6 tests passed → Phase 5B confirmed complete - -### 3. Idempotency Matters for Metrics - -**Challenge**: Both proposer path and commit path call `mark_decided()`. Initial implementation counted blobs twice. - -**Fix**: Made `mark_decided()` idempotent (lines 270-305 in `blob_engine.rs`) - -**Lesson**: - -- State transitions in distributed systems often happen multiple times -- Metrics must handle idempotent operations correctly -- Gauges should reflect actual state, not duplicate counts - -**Impact**: `blob_engine_blobs_per_block` no longer resets to zero on duplicate promotion calls - -### 4. Real Cryptography in Tests is Worth It - -**Decision**: Use real c-kzg library with Ethereum mainnet trusted setup in integration tests (not mocks) - -**Trade-offs**: - -- ✅ **Pro**: Validates actual cryptographic correctness -- ✅ **Pro**: Catches real-world KZG proof issues -- ✅ **Pro**: Tests are higher confidence -- ❌ **Con**: Slightly slower tests (~21s vs ~5s with mocks) - -**Outcome**: Worth the trade-off. Real KZG found no issues, but if there were bugs, mocks would have hidden them. - -**Lesson**: For critical cryptographic paths, real crypto in tests > speed - -### 5. Deterministic Fixtures Reduce Friction - -**Approach**: Cache trusted setup once, generate real blobs inside harness - -**Benefits**: - -- No external tooling required for test execution -- Tests are reproducible across machines -- Developers can run `make itest` without Docker/Reth/network setup -- Faster iteration cycle (21s vs minutes for full testnet) - -**Lesson**: Invest in good test infrastructure early. The `tests/common/mod.rs` helpers unlocked reliable tests without external dependencies. - -### 6. 
Metrics are Force Multipliers - -**Observation**: 12 Prometheus metrics provided 10× visibility compared to logs alone - -**Impact**: - -- Grafana dashboard enabled real-time health monitoring -- Metrics caught issues logs would miss (e.g., undecided blob leaks) -- Performance characteristics immediately visible -- Debugging time reduced significantly - -**Lesson**: Invest in metrics early. The 2-3 days spent on Phase 5A metrics paid back immediately in Phase 5C validation. - -### 7. Simple Plans Ship Faster - -**Pattern**: User consistently pushed for simpler approaches: - -- "Just run `make itest` and see if tests pass" -- "Don't overthink it" -- "Gaps are future improvements, not blockers" - -**Lesson**: - -- Avoid over-engineering validation plans -- Start with simplest validation approach -- "Good enough and shipped" > "perfect and incomplete" - -**Outcome**: Phase 5 completed in 6 days with this approach - -### 8. Documentation Must Keep Pace - -**Issue**: Legacy references to `TESTNET_WORKFLOW.md` and `make itest-full` lingered - -**Fix**: Consolidated to `DEV_WORKFLOW.md` as single source of truth - -**Lesson**: Update documentation immediately when workflows change - -### 9. Negative-Path Coverage is Next Frontier - -**Current State**: Phase 5B tests cover happy paths comprehensively - -**Not Covered**: Invalid KZG proofs, corrupted blob data, zero-blob transactions, missing metadata, race conditions - -**Status**: Acknowledged as future work (Phase 6+), not Phase 5 blockers - -**Lesson**: Happy-path coverage validates core functionality. Error paths can be added incrementally. - -## Known Limitations (Out-of-Scope for Phase 5) - -### Test Coverage Gaps - -**Not Covered**: - -- Error path testing (invalid KZG proofs, corrupted blobs) -- Edge cases (empty blob bundles, duplicate blobs) -- Concurrency stress testing (race conditions) -- Large-scale sync (100+ validators) - -**Rationale**: Happy-path coverage validates core functionality. 
Error paths deferred to Phase 6+ for incremental improvement. - -### Pruning Configuration - -**Current**: Hardcoded 5-block retention in `blob_engine.rs` - -**Not Implemented**: - -- Configurable retention policies (block count, time-based) -- Archiving to cold storage -- Selective pruning (keep blobs for certain heights) - -**Rationale**: Hardcoded retention sufficient for testnet validation. Production needs will inform Phase 6 design. - -**Phase 6 Work**: Configurable pruning with CLI flags and config file support - -### Blob Availability Guarantees - -**Current**: Blobs stored in local RocksDB only - -**Not Implemented**: - -- Multi-node blob gossip for sync -- Blob fetching from peers if missing locally -- Erasure coding or redundancy - -**Rationale**: Single-node storage is sufficient for consensus layer. Availability layer is separate concern (potentially Phase 7+). - ---- - -## Success Metrics - -### Original Goals vs. Actual Delivery - -| Goal | Target | Actual | Status | -| ----------------------- | ----------- | ----------------------------------------------------------------- | ------- | -| Metrics instrumentation | 10+ metrics | 12 metrics | ✅ 120% | -| Grafana panels | 6+ panels | 9 panels | ✅ 150% | -| Integration tests | 3+ tests | Tier 0: 3 smokes (`crates/consensus/tests`), Tier 1: 14 full-node | ✅ | -| Test execution time | < 30s | Tier 0: ~8–10 s (current); Tier 1 runs separately | ✅ | -| Testnet validation | 500+ blobs | 1,158 blobs | ✅ 232% | -| KZG verification rate | > 95% | 100% | ✅ 105% | - -**Overall**: All goals met or exceeded - ---- - -## Next Steps (Phase 6 and Beyond) - -### Phase 6: Configurable Pruning - -**Goals**: - -- CLI flags for retention policy (`--blob-retention-blocks`, `--blob-retention-time`) -- Config file support for pruning parameters -- Metrics for pruning operations -- Integration tests for pruning scenarios - -### Future Phases - -**Phase 7+**: Archiving and availability - -- Archive old blobs to cold storage (S3, 
disk) -- Blob fetching from peers during sync -- Retention policies (keep recent, archive old) - -**CI Integration**: - -- Wire `make itest` into CI pipeline -- Run Phase 5B tests on every PR to catch regressions -- Add error-path tests incrementally - -**Production Hardening**: - -- Error path testing (invalid proofs, missing metadata) -- Large-scale sync scenarios (100+ validators) -- Concurrency stress testing - ---- - -## Conclusion - -Phase 5 successfully delivered a production-ready EIP-4844 blob sidecar implementation for Ultramarine. All three sub-phases (Metrics, Integration Tests, Testnet Validation) completed successfully and exceeded original goals. - -**Validation Summary**: - -- ✅ 6/6 integration tests passing with real KZG cryptography -- ✅ 1,158 blobs processed successfully on live testnet -- ✅ 100% KZG verification success rate -- ✅ 12 metrics instrumented, 9 Grafana panels operational -- ✅ Full blob lifecycle validated (undecided → decided → pruned) - -**Key Achievements**: - -- Exceeded integration test goal by 100% (6 tests vs. 3 target) -- Real c-kzg cryptography validates correctness, not just happy paths -- Comprehensive observability enables production monitoring -- Simple, well-documented workflow (`make itest`, `make spam-blobs`) - -**Readiness Assessment**: Phase 5 implementation is solid, tested, and operational. The system is ready for Phase 6 configurable pruning work. Recommend focusing Phase 6+ on production hardening (error paths, configurable pruning) rather than reworking Phase 5 fundamentals. 
- ---- - -**Phase 5 Status**: ✅ **COMPLETE** (November 5, 2025) - -**References**: - -- Progress tracking: `PHASE5_PROGRESS.md` -- Developer workflow: `docs/DEV_WORKFLOW.md` -- Testnet results: `docs/PHASE5_TESTNET.md` -- Metrics tracking: `docs/METRICS_PROGRESS.md` diff --git a/docs/PHASE5_TESTNET.md b/docs/PHASE5_TESTNET.md deleted file mode 100644 index 5ead921..0000000 --- a/docs/PHASE5_TESTNET.md +++ /dev/null @@ -1,602 +0,0 @@ -# Phase 5 Testnet: Integration Testing & Blob Observability - -**Status**: ✅ Phase A-D Complete – Phase 5 signed off (metrics, harness, testnet) -**Started**: 2025-10-28 -**Updated**: 2025-11-08 -**Goal**: Validate blob sidecar integration end-to-end, add comprehensive observability, and establish testnet workflow for production readiness. - ---- - -## Table of Contents - -1. [Overview](#overview) -2. [Design Philosophy](#design-philosophy) -3. [Current State Analysis](#current-state-analysis) -4. [Implementation Plan](#implementation-plan) -5. [Daily Progress Log](#daily-progress-log) -6. [Testing Strategy](#testing-strategy) -7. [Test Artifacts](#test-artifacts) -8. [Success Criteria](#success-criteria) - ---- - -## Overview - -### Motivation - -Phases 1-4 delivered a complete blob sidecar implementation with 23/23 passing unit tests. Phase 5-5.2 fixed critical bugs for live consensus. Phase 5A-C added comprehensive observability and validation: - -- ✅ Working 3-node testnet infrastructure -- ✅ Spam tool with `--blobs` flag generates valid blob transactions with real KZG proofs -- ✅ **12 blob-specific metrics** (verification time, storage size, lifecycle transitions) -- ✅ **9 blob dashboard panels** added to Grafana for blob observability -- ✅ **Integration harness delivered by Team Beta** (Tier 0: 3 fast consensus smokes; Tier 1: 14 full-node scenarios with real KZG blobs) -- ✅ **Integration results**: Tier 0 3/3 (default in `make test`); Tier 1 14/14 via `make itest-node` and CI’s `itest-tier1` - -### Goals - -1. 
**Metrics Instrumentation**: Add Prometheus metrics to BlobEngine for verification, storage, and lifecycle. -2. **Grafana Dashboard**: Create 10+ panels to visualize blob activity across consensus/execution layers. -3. **In-Process Integration Tests**: Provide deterministic `cargo test` coverage for blob lifecycle, restart survival, and sync package handling. -4. **Testnet Workflow**: Document and automate blob testing procedures for manual validation. -5. **Performance Validation**: Measure blob verification latency, storage growth, and throughput under load. - -### Non-Goals - -- Blob pruning/archival implementation (deferred to Phase 6) -- P2P bandwidth optimization (current streaming works, optimization later) -- Production deployment (focus on validation first) - ---- - -## Design Philosophy - -### Guiding Principles - -- **Mirror Malachite’s modular style**: Each functional area (consensus, blob engine, node) owns small, self-contained integration tests that run directly under `cargo test`. -- **Lean, deterministic harness**: Prefer in-process Tokio harnesses with mocked Engine API clients and temporary blob stores over heavyweight external dependencies. -- **Optional heavyweight smoke**: Keep the existing Docker-based network workflow, but invoke it only for opt-in “full stack” validation. -- **Single-command ergonomics**: Developers should be able to run fast in-process tests with `make itest`, and the full-stack smoke with `make itest-full`. -- **Metrics-first observability**: Every blob operation emits Prometheus metrics so both tests and dashboards have programmatic signals. - -### Implementation Pattern - -1. **Inline Test Helpers** - - Define small helper structs/functions directly inside integration tests (or `tests/common/mod.rs`). - - Use Tokio tasks to spin Ultramarine nodes, mock only the Execution client, reuse real blob engine/KZG logic. - - Allocate `tempfile::TempDir` per test so Drop handles cleanup automatically. -2. 
**Focused Integration Tests** - - Place under `tests/` with descriptive names (`blob_roundtrip.rs`, `restart_hydrate.rs`, etc.). - - Mark `#[tokio::test] #[ignore]` and optional `#[serial]` for resource isolation. - - Assert on store state, blob metrics, restream behavior—tests should finish in ~2–5 s. -3. **Full Stack Smoke (Optional)** - - Wrap existing `make all` workflow; run the repaired blob spammer; assert via RPC/metrics. - - Keep opt-in (env flag / `make itest-full`) so CI/devs run it only when needed. - -This approach resolves the gap between purely manual testing and the need for automated coverage without contradicting Malachite’s precedent. - ---- - -## Current State Analysis - -### Testnet Infrastructure (✅ Production-Ready) - -**Architecture** (as of 2025-10-28): - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Execution Layer (load-reth) │ -│ ├─ load-reth0 (8545/8551) metrics: 9100 ─┐ │ -│ ├─ load-reth1 (18545/18551) metrics: 9101 ─┼─> Prometheus│ -│ └─ load-reth2 (28545/28551) metrics: 9102 ─┘ (1s scrape)│ -└─────────────────────────────────────────────────────────────┘ - ▲ - │ Engine API v3 (HTTP/IPC) - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ Consensus Layer (Ultramarine + Malachite) │ -│ ├─ malachite0 metrics: 29000 ─┐ │ -│ ├─ malachite1 metrics: 29001 ─┼─> Prometheus │ -│ └─ malachite2 metrics: 29002 ─┘ │ -└─────────────────────────────────────────────────────────────┘ - │ - ▼ - ┌──────────────┐ - │ Grafana │ - │ :3000 │ - └──────────────┘ -``` - -**Automation** (`Makefile` targets): - -- `make all` - Full testnet setup (genesis → docker → spawn nodes) -- `make all-ipc` - Testnet with Engine IPC (Docker-based) -- `make spam` - Transaction load testing (supports `--blobs` flag) -- `make clean-net` - Full cleanup (preserves code, removes data) -- `make stop` - Stop Docker stack - -**Scripts**: - -- `scripts/spawn.bash` - Node lifecycle management (supports `--engine-ipc-base`, `--jwt-path`) 
-- `scripts/add_peers.sh` - P2P bootstrapping -- `crates/cli/src/cmd/testnet.rs` - Config generation for N validators - -### Existing Metrics - -**Malachite (Consensus)**: - -- `malachitebft_core_consensus_height` - Block height -- `malachitebft_core_consensus_round` - Round number -- `malachitebft_core_consensus_block_size_bytes` - Block size -- `app_channel_db_*` - Database I/O (consensus store) - -**load-reth (Execution — metrics still prefixed `reth_` upstream)**: - -- `reth_engine_rpc_get_payload_v3_count` - Engine API calls -- `reth_engine_rpc_forkchoice_updated_messages` - FCU messages -- `reth_transaction_pool_*` - Txpool metrics (pending/queued) -- `reth_blockchain_tree_canonical_chain_height` - EL height -- `reth_network_connected_peers` - Peer count - -**Notable**: Zero blob-specific metrics in either layer. - -### Spam Tool (✅ Blob-Capable After Phase E) - -**Location**: `crates/utils/src/commands/spam.rs`, supporting helpers in `crates/utils/src/tx.rs` - -**High-Level Flow**: - -``` -spammer() -├─ obtain signer + pending nonce (`eth_getTransactionCount`) -├─ per tick (rate-limited): -│ ├─ when --blobs=true → make_signed_eip4844_tx(...) -│ │ ├─ generate_blobs_with_kzg(count) -│ │ │ • build deterministic 131_072-byte blobs -│ │ │ • compute KZG commitments + proofs via trusted setup -│ │ │ • derive versioned hashes (kzg_to_versioned_hash → 0x01 prefix) -│ │ ├─ wrap into `TxEip4844WithSidecar` -│ │ └─ sign envelope with requested chain ID / gas params -│ ├─ fall back to EIP-1559 path when `--blobs=false` -│ ├─ submit via `eth_sendRawTransaction` -│ └─ record success/error + tx size on async channel -└─ tracker task reports per-second stats and txpool status -``` - -**Capabilities**: - -- ✅ Real blob data, commitments, proofs, and sidecar wiring per transaction. -- ✅ `--blobs-per-tx` validated for 1–1024 blobs; default stays 128. -- ✅ Nonce management accounts for “already known/replacement” responses. 
-- ✅ Compatible with Cancun Engine API v3 (sidecars arrive over raw tx submission). -- ⚠️ Still prints a warning when blob mode is enabled (Engine V3 peers without sidecar support can reject blobs); keep until Engine V4 rollout. - -**Outstanding Enhancements**: - -- No Prometheus metrics or structured logs yet—future Phase 5 work can expose spammer stats. -- Error handling remains best-effort (no backoff/retry policy beyond nonce bumps). - ---- - -## Implementation Plan - -### Phase A – Metrics Instrumentation (4–6 hours) — ✅ Complete (A.1 + A.2) - -**Phase A.1: BlobEngine Surface** ✅ **Complete (2025-11-04)** - -- ✅ Implemented `BlobEngineMetrics` module with 12 metrics (8 counters, 3 gauges, 1 histogram) -- ✅ Extended `BlobStore` API to return counts for metrics tracking -- ✅ Instrumented all 6 BlobEngine methods: `verify_and_store`, `mark_decided`, `drop_round`, `mark_archived`, `prune_archived_before`, `get_undecided_blobs` -- ✅ Registered metrics in `node.rs` via `SharedRegistry` with "blob_engine" prefix -- ✅ Applied performance fixes (bulk gauge operations, BYTES_PER_BLOB constant usage) -- ✅ 11 tests passing, full codebase builds successfully - -**Phase A.2: State/Consensus Hooks** ✅ **Complete (2025-11-04)** - -- ✅ Pass `BlobEngineMetrics` to `State` constructor (with `pub(crate)` visibility) -- ✅ Fixed `set_blobs_per_block(count)` in `BlobEngine::mark_decided()` (was missing) -- ✅ Instrument `State::rebuild_blob_sidecars_for_restream()` for restream counter -- ✅ Instrument blob sync error paths via `State::record_sync_failure()` helper method -- ✅ **Encapsulation improvements**: `pub(crate)` field + public helper methods maintain clean API - -**Implementation Details**: See [METRICS_PROGRESS.md](./METRICS_PROGRESS.md) for complete specifications, code patterns, and progress tracking. - -**Note**: Phase A.2 also fixed a critical issue where `blobs_per_block` gauge was defined but never updated. 
The encapsulation improvements ensure State owns its instrumentation surface, preventing metric implementation details from leaking across crate boundaries. - -### Phase B – In-Process Tests (6–8 hours) — ✅ COMPLETE (2025-11-18 determinism refresh) - -- Implement inline helper structs inside `tests/` (and optionally `tests/common`) to: - - Spin up Ultramarine nodes on Tokio runtimes with per-test `TempDir` storage. - - Mock only the Execution client while reusing real blob engine/KZG verification. -- Provide helper functions (`wait_for_height`, `restart_node`, `scrape_metrics`). -- Cover two tiers: - - **Tier 0 (component)**: 3 fast consensus tests in `crates/consensus/tests` (`blob_roundtrip`, `blob_sync_commitment_mismatch`, `blob_pruning`) with real RocksDB + KZG; run in `make test` and CI by default. - - **Tier 1 (full node)**: 14 ignored tests in `crates/test/tests/full_node` spanning quorum blob roundtrip, restream (multi-validator/multi-round), multi-height restarts, ValueSync ingestion/failure, blobless sequences, pruning, sync package roundtrip, and EL rejection; run via `make itest-node` and CI job `itest-tier1` (RUST_TEST_THREADS=1, artifacts on failure). -- Clarify restart behavior: spawn multiple `App` instances within one Tokio runtime, reusing the same on-disk store to simulate restarts. -- Shared helpers `State::process_synced_package` and `State::process_decided_certificate` keep integration tests aligned with production handlers. -- Use `serial_test` or per-test temp dirs to keep runs deterministic (2–5 s each). - -**Progress (2025-11-05)** - -- ✅ Scaffolded deterministic helpers and migrated component smokes into `crates/consensus/tests` (real RocksDB + KZG). -- ✅ Implemented proposer→commit lifecycle and sync-package ingestion with shared mocks for the execution client. 
- -**Progress (2025-11-08)** - -- ✅ Refactored the Decided path into `State::process_decided_certificate` plus the `ExecutionNotifier` trait so the app handler and integration tests share identical logic. -- ✅ Expanded full-node coverage (14 scenarios) with proposer/follower commit assertions and execution-layer rejection coverage via `MockExecutionNotifier`. -- ✅ Hardened sync coverage with commitment-mismatch and inclusion-proof regression tests. - -**Progress (2025-11-18)** - -- ✅ Tier 1 harness de-flaked: `full_node_restart_mid_height` now gates on `StartedHeight`; `wait_for_nodes_at` helper replaces ad-hoc joins/sleeps. -- ✅ Full Tier 1 suite passes via `make itest-node` (14/14, event-driven). - -- Wrap existing Docker workflow in an opt-in smoke target: - - Boot stack (`make all` steps). - - Use the repaired spammer to submit valid blob transactions. - - Run a Rust helper that polls RPC/metrics and asserts blobs were imported/promoted. - - Tear down stack (`make clean-net`). -- Expand Grafana dashboard with blob panels once metrics arrive; document panel meanings. -- Update `DEV_WORKFLOW.md` with `make itest` usage and smoke test guidance. - -### Phase D – Tooling & Documentation (2–3 hours) - -- ✅ Spam utility generates valid blob transactions (real KZG commitments and proofs) -- ✅ Updated `DEV_WORKFLOW.md` with comprehensive blob testing section -- ✅ Updated `README.md` with blob testing quick start - -## Testing Strategy - -### Tier 0 – Component (default) - -- Run via `cargo test -p ultramarine-consensus --test blob_roundtrip --test blob_sync_commitment_mismatch --test blob_pruning` or `make itest`. -- Uses inline helpers with a mocked Execution client (real blob engine/KZG) to exercise proposer→commit, commitment/sidecar validation, and retention/metrics without Docker. -- Each test completes in ~2–4 s and relies on `tempfile::TempDir` Drop for cleanup. These run in `make test` and CI by default. 
- -### Tier 1 – Full-Node (multi-validator) - -- Mirrors Malachite’s TestBuilder blueprint by spinning **three** validators (2f + 1) plus optional follower nodes under the real channel actors, WAL, and libp2p transport. -- Exercises proposer/follower blob flow, crash/restart, ValueSync happy + failure paths, pruning, blobless sequences, and sync-package roundtrip with the production application loop talking to an Engine RPC stub (HTTP ExecutionClient wiring remains a follow-up). -- `make itest-node` invokes each Tier 1 scenario via its own `cargo test ... -- --ignored` call so every harness run starts from a clean process; CI job `itest-tier1` runs all 14 with `RUST_TEST_THREADS=1`, `CARGO_NET_OFFLINE` overridable, 20m timeout, and artifacts on failure. -- Named scenarios: `full_node_blob_quorum_roundtrip`, `full_node_validator_restart_recovers`, `full_node_restart_mid_height`, `full_node_new_node_sync`, `full_node_multi_height_valuesync_restart`, `full_node_restart_multi_height_rebuilds`, `full_node_restream_multiple_rounds_cleanup`, `full_node_restream_multi_validator`, `full_node_value_sync_commitment_mismatch`, `full_node_value_sync_inclusion_proof_failure`, `full_node_blob_blobless_sequence_behaves`, `full_node_store_pruning_retains_recent_heights`, `full_node_sync_package_roundtrip`, and `full_node_value_sync_proof_failure`. Collectively these cover restart hydration, pruning, blobless sequences, restream permutations, and every ValueSync rejection path without touching the stores manually. - -- Run manually via `make all` + `make spam-blobs` (optionally gated by env vars such as `ULTRA_E2E=1`). -- Boots docker stack (`make all`), runs blob spam script, queries RPC/metrics for verification, tears down (`make clean-net`). -- Used for load/perf validation and manual dashboards; not required for every CI run. - -### Manual Exploratory Checks - -- Grafana dashboards, Prometheus queries, and `tmux` logs remain available for debugging beyond automated assertions. 
-- Documented in `DEV_WORKFLOW.md`. - -## Test Artifacts - -| Test | Result | Time | -| --------------------------------------------------------------- | ------ | ---- | -| `blob_roundtrip` | ✅ | ~3 s | -| `blob_sync_commitment_mismatch` (incl. inclusion proof failure) | ✅ | ~3 s | -| `blob_pruning` | ✅ | ~3 s | - -**Harness Summary**: 3/3 Tier 0 smoke scenarios (now under `crates/consensus/tests/`) passing via `cargo test -p ultramarine-consensus --test -- --nocapture` in ~8–10 s (real KZG proofs using `c-kzg`). Metrics snapshots confirm promotion/demotion counters remain stable across runs. - -**2025-11-18 Update**: Tier 1 harness de-flaked and aligned with docs. - -- `full_node_restart_mid_height` now waits on `Event::StartedHeight` before crashing a node, forcing a deterministic ValueSync replay (no sleeps/race). -- Multi-node waits use a shared helper (`wait_for_nodes_at`) to avoid timing drifts; peer warm-up remains the only fixed delay. -- Full Tier 1 suite passes via `cargo test -p ultramarine-test --test full_node -- --ignored --nocapture` (14/14 scenarios). 
- ---- - -## Daily Progress Log - -### 2025-10-28 (Monday) - Analysis & Planning - -**Completed**: - -- ✅ Comprehensive review of testnet infrastructure -- ✅ Identified observability gaps (0 blob metrics, 0 dashboard panels) -- ✅ Created PHASE5_TESTNET.md document -- ✅ Defined implementation roadmap (Phases A-D, ~14-19 hours) - -**Findings**: - -- Testnet infrastructure is production-ready (Docker, Prometheus, Grafana all working) -- 🔴 **CRITICAL**: Spam tool `--blobs` flag creates incomplete transactions (fake versioned hashes, no blob data) -- 18 existing dashboard panels (5 Malachite, 13 load-reth) but 0 for blobs -- BlobEngine has logging but zero Prometheus instrumentation - -**Critical Discovery** (Evening): - -- Deep-dive code review of `crates/utils/src/tx.rs` revealed spam tool issues -- `make_eip4844_tx()` uses hardcoded fake versioned hash: `0x00...01` -- Does NOT generate actual blob data (131KB), KZG commitments, or proofs -- Transactions accepted by txpool but CANNOT be included in blocks (no blob data) -- Added Phase E to roadmap (4-6 hours) to fix spam tool before integration testing - -**UPDATE (2025-11-04)**: This analysis was incorrect. 
The spam tool actually DOES work correctly: - -- Generates real 131KB blob data (deterministic, KZG-compatible) -- Computes valid KZG commitments using c-kzg library -- Generates valid KZG proofs with trusted setup -- All 1,158 test blobs verified successfully (100% success rate) -- Phase E was not needed - spam tool was functional all along - -**Next Steps** (Updated): - -- ✅ Phase A.1: Create BlobEngine metrics module - COMPLETE -- ✅ Phase A.2: Register metrics in node startup - COMPLETE -- ✅ Phase C: Testnet validation - COMPLETE - ---- - -### 2025-11-04 (Tuesday) - Phase A.1 Complete: BlobEngine Metrics - -**Completed**: - -- ✅ **Phase A.1: BlobEngine Surface Metrics** — Full implementation complete -- ✅ Created `crates/blob_engine/src/metrics.rs` (235 lines) with 12 metrics -- ✅ Extended `BlobStore` trait API to return counts for metrics tracking -- ✅ Instrumented all 6 BlobEngine methods with comprehensive metrics -- ✅ Registered metrics in node startup using `SharedRegistry` pattern -- ✅ Applied critical fixes: bulk gauge operations, BYTES_PER_BLOB constant usage -- ✅ 11 tests passing, full codebase builds successfully -- ✅ Updated documentation: METRICS_PROGRESS.md, PHASE5_PROGRESS.md - -**Metrics Implemented**: - -- Verification: `verifications_success_total`, `verifications_failure_total`, `verification_time` (histogram) -- Storage Gauges: `storage_bytes_undecided`, `storage_bytes_decided`, `undecided_blob_count` -- Lifecycle Counters: `lifecycle_promoted_total`, `lifecycle_dropped_total`, `lifecycle_pruned_total` -- Consensus: `blobs_per_block` (gauge), `restream_rebuilds_total`, `sync_failures_total` - -**Code Review Findings & Fixes**: - -1. 🔧 Performance: Replaced gauge loops with bulk operations (`inc_by`/`dec_by`) - - Before: 131k+ operations per blob - - After: Single operation per batch -2. 🔧 Completeness: Added missing metrics to `mark_archived()` method -3. 🔧 Constants: Eliminated magic number `131_072`, using `BYTES_PER_BLOB` -4. 
🔧 Types: Corrected gauge API usage (`i64` vs `u64`) - -**Architecture Decisions**: - -- ✅ Metrics module follows `DbMetrics` pattern exactly -- ✅ `BlobEngineMetrics::new()` for tests, `::register()` for production (matches codebase pattern) -- ✅ Best-effort delete semantics in `mark_archived` (gauge decrements even if blob missing) -- ✅ Phase A.1 scope: BlobEngine surface only; State hooks deferred to A.2 - -**Next Steps** (from Phase A.1): - -- ✅ **Phase A.2**: Wire metrics into `State` for consensus hooks — **COMPLETED same day** - - See Phase A.2 section below for full details -- ✅ **Integration Testing**: Metrics endpoint validated during Tier 1 runs (event-driven harness, 2025-11-18) - ---- - -### 2025-11-04 (Tuesday, continued) — Phase A.2 + Encapsulation: Complete - -**Completed**: - -- ✅ **Phase A.2: State/Consensus Hooks** — Full implementation -- ✅ Added `pub(crate) blob_metrics` field to `State` struct -- ✅ Fixed missing `set_blobs_per_block()` call in `BlobEngine::mark_decided()` -- ✅ Instrumented `State::rebuild_blob_sidecars_for_restream()` with restream counter -- ✅ Instrumented sync failure path via `State::record_sync_failure()` helper method -- ✅ **Architectural improvement**: Encapsulation via `pub(crate)` + public helper methods - -**Encapsulation Rationale**: - -Initially `blob_metrics` was `pub`, allowing direct cross-crate field access (`state.blob_metrics.record_sync_failure()`). This breaks encapsulation and creates maintenance burden. 
The improved design: - -```rust -// State struct -pub(crate) blob_metrics: BlobEngineMetrics // Restricted to consensus crate - -// Public API -pub fn record_sync_failure(&self) { - self.blob_metrics.record_sync_failure(); -} -``` - -Benefits: - -- ✅ State owns its instrumentation surface -- ✅ Metrics API changes stay localized within State -- ✅ External crates use clean, documented methods -- ✅ Internal consensus code retains direct access for flexibility - -**Critical Fix**: - -- Discovered `blobs_per_block` gauge was defined but never updated -- Added `self.metrics.set_blobs_per_block(blob_count)` in `BlobEngine::mark_decided()` -- This gauge now correctly tracks blobs in the last finalized block - -**Test Results**: - -- ✅ 25 consensus tests passing -- ✅ 11 blob_engine tests passing -- ✅ Full codebase builds successfully -- ✅ Zero regressions - -**Files Modified**: - -- `crates/consensus/src/state.rs` (metrics field, helper method, restream instrumentation) -- `crates/node/src/node.rs` (pass metrics to State) -- `crates/node/src/app.rs` (sync failure via helper) -- `crates/blob_engine/src/engine.rs` (fixed set_blobs_per_block) - -**Next Steps**: - -- ⏳ **Phase B**: Integration tests (in progress - beta team) -- ⏳ **Phase C**: Start testnet, validate metrics endpoint - -**Files Modified**: - -- `crates/blob_engine/src/metrics.rs` (new, 235 lines) -- `crates/blob_engine/src/engine.rs` (instrumentation) -- `crates/blob_engine/src/store/mod.rs` (API extensions) -- `crates/blob_engine/src/store/rocksdb.rs` (count tracking) -- `crates/blob_engine/Cargo.toml` (dependencies) -- `crates/node/src/node.rs` (registration) - ---- - -### 2025-11-05 (Wednesday) – Phase B Kickoff (Team Beta) - -**Completed**: - -- ✅ Shared harness module (`tests/common/mod.rs`) with deterministic genesis/key fixtures, blob-engine builders, and metrics snapshots. -- ✅ `tests/blob_state/blob_roundtrip.rs`: full proposer→commit lifecycle, blob import verification, and metric assertions. 
-- ✅ `tests/blob_state/restart_hydrate.rs`: commits a blobbed block, restarts, hydrates parent root, and validates metadata persistence across process restarts. -- ✅ `tests/blob_state/sync_package_roundtrip.rs`: encodes/decodes `SyncedValuePackage::Full`, stores synced proposals, marks blobs decided, and validates metrics. -- ✅ `tests/blob_state/blob_restream.rs`: exercises multi-validator restream and follower commit path with blob metrics validation. -- ✅ `tests/blob_state/blob_restream_multi_round.rs`: validates losing rounds are dropped during restream replay while metrics capture promotions/drops. -- ✅ Added `tests/common/mocks.rs` providing a minimal Engine API mock for future execution-layer scenarios. -- ✅ `serial_test` wiring to keep integration suites deterministic. - -**In Progress**: - -- Broader failure-mode coverage (pruning, sync error paths, negative tests). -- Integrating execution mock into proposer pipelines once payload generation paths require it. - -**Next Steps**: - -- Incorporate execution-client mock into tests that drive payload generation. -- Expand metric assertions once consensus hooks (`set_blobs_per_block`, restream counters) are fully wired. -- Layer in multi-validator scenarios and failure paths (drops, pruning, sync failures). - ---- - -### 2025-11-06 (Thursday) – Restream & Mock Execution Integration - -**Completed**: - -- ✅ Refactored integration tests to pull payloads/blobs from `MockEngineApi` so proposer flows mimic ExecutionClient usage (`blob_roundtrip`, `restart_hydrate`, `sync_package_roundtrip`, `blob_restream`). -- ✅ Added `tests/blob_state/blob_restream_multi_round.rs` covering multi-round restream cleanup (promotion vs. drop metrics, undecided pruning). -- ✅ Hardened test harness with reusable base58 peer IDs and metric snapshots for assertions (`tests/common/mod.rs`, `tests/common/mocks.rs`). 
- -**Pending**: - -- Exercise sync failure paths (invalid sidecars) to tick `sync_failures_total` and verify `record_sync_failure()` wiring. -- Integrate the mock Execution API with the real ExecutionClient bridge once that work lands so proposer pipelines are covered end-to-end. -- Introduce pruning-focused tests once Phase 6 work starts. - ---- - -### 2025-11-07 (Friday) – Phase 5B Harness Activation Kickoff - -**Completed**: - -- 🔎 Revalidated integration gaps: root `tests/` directory is excluded from workspace, no `make itest` target exists, and placeholder blob fixtures fail BlobEngine’s KZG verification path. -- 🗺️ Drafted execution plan to stand up a dedicated `crates/test` package (mirroring malachite’s pattern) so integration suites are visible to Cargo. -- 🧪 Materialised `crates/test` harness: migrated integration suites, hooked into workspace members, wired `make itest` targets, and replaced dummy blob fixtures with real KZG commitments/proofs (trusted setup cached once per run). -- ✅ `cargo test -p ultramarine-test -- --nocapture` now exercises all ten Phase 5B scenarios successfully; Team Beta signed off Phase 5B integration validation. - -**In Progress**: - -- Documenting harness usage in developer guides and aligning remaining TODO tests/failure modes with Phase 5B checklist. - -**Next Steps**: - -1. Capture harness invocation guidance in README/TESTNET docs (link `make itest` command). -2. Add negative-path/failure-mode coverage (sync failures, pruning once Phase 6 starts). -3. Integrate harness into CI workflow so Phase 5B becomes a gate on PRs. 
- ---- - -## Success Criteria - -Phase 5 Testnet is complete when: - -### Metrics & Observability - -- ✅ BlobEngine exposes 12+ Prometheus metrics (verification, storage, lifecycle) -- ✅ Grafana dashboard has 10+ blob-specific panels -- ✅ All blob operations visible in real-time (verification rate, latency, failures) -- ✅ Cross-layer correlation: Blob activity → consensus height → execution import - -### Spam Tool (Phase E) - -- ✅ Generates real blob data (131KB per blob) -- ✅ Computes valid KZG commitments and proofs -- ✅ Computes correct versioned hashes from commitments -- ✅ Submits blobs via proper RPC method (blobs included in blocks) -- ✅ Supports `--blobs-per-tx` flag (1-6 blobs) -- ✅ Blob transactions successfully included in consensus blocks - -### Integration Testing - -- ✅ In-process integration suite passes (`blob_roundtrip`, `restart_hydrate`, `sync_package_roundtrip`, `blob_restream_multi_validator`, `blob_restream_multi_round`, `blob_new_node_sync`, `blob_blobless_sequence`, `blob_sync_failure_rejects_invalid_proof`, `blob_sync_commitment_mismatch_rejected`, `blob_sync_across_restart_multiple_heights`, `blob_restart_hydrates_multiple_heights`, `blob_pruning_retains_recent_heights`) via `make itest` (`cargo test -p ultramarine-test -- --nocapture`). -- ✅ Optional Docker smoke (`make all` + `make spam-blobs`) still validates the full network path when needed. -- ✅ No verification failures during normal operation. -- ✅ No memory leaks or unbounded storage growth observed during harness + smoke runs. 
- -- ✅ `DEV_WORKFLOW.md` documents all testing procedures -- ✅ `README.md` updated with blob testing quick start -- ✅ Metrics reference guide available (list all blob metrics) -- ✅ Troubleshooting guide covers common issues - -### Operational Readiness - -- ✅ Testnet can run for 24+ hours without issues -- ✅ Restart survival validated (blobs persist, no corruption) -- ✅ Multi-validator sync validated (all nodes agree on blob state) -- ✅ Performance baselines established (P99 latency, throughput limits) - ---- - -## References - -- [PHASE4_PROGRESS.md](./PHASE4_PROGRESS.md) - Phase 1-4 implementation log -- [FINAL_PLAN.md](./FINAL_PLAN.md) - Overall project roadmap -- Makefile:427-550 - Testnet automation targets -- `crates/blob_engine/src/engine.rs` - BlobEngine trait and implementation -- `crates/utils/src/commands/spam.rs` - Transaction spam tool (blob path fully functional) - ---- - -## Open Questions - -1. **Spam Tool Implementation** ✅ **RESOLVED**: The `--blobs` flag works correctly - - **Status**: Generates valid EIP-4844 transactions with real KZG commitments and proofs - - **Testing**: 193 transactions sent with 1,158 blobs (6 per tx), 100% verification success - - **Implementation**: Uses c-kzg library with trusted setup for valid proofs - -2. **Blob RPC Submission Method** ✅ **RESOLVED**: Load-reth accepts blob transactions via standard RPC - - **Method**: Standard `eth_sendRawTransaction` with blob sidecars - - **Status**: All blob transactions successfully included in blocks - - **Verification**: Consensus and execution layers properly handle blob lifecycle - -3. **Blob Transaction Defaults** ✅ **IMPLEMENTED**: `--blobs-per-tx` flag available - - **Current**: Spam tool supports 1-6 blobs per transaction (default 6) - - **Usage**: `--blobs --blobs-per-tx 6` - - **Testing**: Successfully tested with 6 blobs per transaction - -4. **Storage Growth**: What's acceptable storage size for 1000 blocks with blobs? 
- - **Status**: Pruning implemented with 5-block retention window - - **Observation**: Storage remains bounded with automatic pruning - - **Note**: Configurable retention policies planned for Phase 6 - -5. **Metrics Export** ✅ **RESOLVED**: Blob metrics use `blob_engine_*` prefix - - **Implementation**: 12 metrics registered with `blob_engine_` prefix - - **Rationale**: Matches BlobEngine crate name, clear separation from consensus metrics - - **Status**: Fully implemented and validated on testnet - ---- - -## Risks & Mitigations - -| Risk | Impact | Mitigation | Status | -| ------------------------------------------ | ------------ | ------------------------------------------------------------- | -------------------------------------- | -| ~~Spam tool non-functional~~ | ~~CRITICAL~~ | ~~Implement Phase E~~ | ✅ **RESOLVED** - Tool works correctly | -| ~~Blob RPC submission method unknown~~ | ~~HIGH~~ | ~~Research load-reth/Reth APIs~~ | ✅ **RESOLVED** - Standard RPC works | -| Blob spam causes consensus slowdown | MEDIUM | Load test at increasing rates, identify bottleneck | ⏳ Ongoing monitoring | -| Storage grows unbounded without pruning | MEDIUM | Monitor `storage_size_bytes` metric, defer pruning to Phase 6 | | -| Integration tests flaky (timing-dependent) | LOW | Use retries, generous timeouts, deterministic test data | | -| Grafana dashboard too complex | LOW | Group panels logically, provide simple "Overview" section | | - ---- - -**Last Updated**: 2025-10-28 -**Next Review**: After Phase A completion (metrics instrumentation) diff --git a/docs/PHASE6_ADDITIONAL_STUFF.md b/docs/PHASE6_ADDITIONAL_STUFF.md deleted file mode 100644 index 56ce80d..0000000 --- a/docs/PHASE6_ADDITIONAL_STUFF.md +++ /dev/null @@ -1,113 +0,0 @@ -# Phase 6 – Additional Stuff (Follow-ups / Improvements) - -This document lists pragmatic follow-ups on top of the current Phase 6 archive→prune pipeline. - -Assumed contract: - -- Ultramarine nodes do **not** re-download pruned blob bytes. 
-- After pruning, blob bytes are out-of-protocol; nodes only propagate/persist **locators** and metadata. -- External consumers (indexers, rollups, provers, explorers) use locators/tags to retrieve and verify bytes. - -## Recommended additions - -### 1) Put `versioned_hash` into `ArchiveNoticeBody` (signed) - -Today `load.versioned_hash` is included as an upload tag to the provider, but it is not part of the signed notice/record. - -Add a new field to `ArchiveNoticeBody`: - -- `versioned_hash: B256` - -Why: - -- Makes `ArchiveNotice`/`ArchiveRecord` self-sufficient for external consumers. -- Avoids relying on provider indexing/tag-query to map `(height,index)` or `kzg_commitment` → `versioned_hash`. - -### 2) Add network binding to the notice signing preimage - -Add a chain/network discriminator into the signed notice body (one of): - -- `chain_id: u64` (preferred if stable), or -- `network: String` (e.g. `fibernet`) - -Why: - -- Prevents cross-network replay/confusion if locators/tags are reused across devnets/testnets. -- Aligns with existing upload tags like `load.network=fibernet`, but makes it cryptographically bound. - -### 3) Expose archive records for external tooling (CLI/RPC) - -Provide an explicit interface to export archive records for height `H` (and optionally for `versioned_hash`): - -Return shape (conceptually): - -- `height`, `blob_index` -- `kzg_commitment`, `versioned_hash`, `blob_keccak` -- `provider_id`, `locator` -- `archived_by`, `archived_at` - -Why: - -- External consumers should not scrape logs or reverse-engineer internal DBs. -- This is the clean “handoff surface” for the AO/HyperBEAM machinery to consume. 
- -### 4) Reference retriever/indexer (external, not in-node) - -Create a small reference tool/library that: - -- takes `versioned_hash` (or `(height,index)` + commitment), -- discovers `dataitem_id` via either: - - `ArchiveNotice` locator, or - - `load-s3-agent /tags/query` as a fallback, -- downloads bytes via the gateway, -- verifies bytes against: - - `ArchiveNoticeBody.blob_keccak` - - `ArchiveNoticeBody.kzg_commitment` (recompute commitment) - -Why: - -- Removes ambiguity for integrators. -- Demonstrates the “blob economics machinery” workflow without changing consensus. - -### 5) Manual “re-archive height H now” and “rebroadcast notices” tooling - -Add operator-facing commands to: - -- enqueue an archive job for a specific height (proposer-only), -- rebroadcast stored archive notices for a height. - -Why: - -- Operational recovery when provider endpoints/tokens change. -- Useful for staged rollouts and incident response. - -### 6) Multi-provider / failover (optional) - -Allow configuring multiple providers and/or mirroring: - -- upload to primary and optionally mirror to secondary, -- store multiple locators per blob index (or store a primary + alternates). - -Why: - -- Provider outages shouldn’t permanently stall pruning/operations. -- Lets AO/HyperBEAM and other mirrors coexist. - -### 7) Better throughput observability (per-provider attribution) - -Current Phase 6 metrics are global. Consider adding labels by `provider_id` (or separate registries) for: - -- upload success/failure -- upload bytes -- upload latency histograms - -Why: - -- Required if you introduce mirroring or multiple providers. -- Makes Grafana actionable when debugging throughput regressions. - -## Explicit non-goals (by design) - -- Node-side automatic rehydration of pruned blob bytes from the provider. - - If a blob is pruned, the node treats the bytes as not needed for protocol operation. - - Locators are a handoff for external consumers only. 
diff --git a/docs/PHASE6_ARCHIVE_PRUNE.md b/docs/PHASE6_ARCHIVE_PRUNE.md deleted file mode 100644 index 4b70386..0000000 --- a/docs/PHASE6_ARCHIVE_PRUNE.md +++ /dev/null @@ -1,107 +0,0 @@ -# Phase 6 – Blob Archiving & Pruning (V0 Overview, current) - -This file is the human-friendly overview of Phase 6. - -For the code-audited, source-linked specification of what is implemented today, see: - -- `docs/PHASE6_ARCHIVE_PRUNE_FINAL.md` -- `docs/ARCHIVER_OPS.md` (operator runbook) - -## What Phase 6 does (current behavior) - -- Blobs are verified and stored locally while they are needed for proposal/sync. -- After commit, the **proposer** uploads each decided blob (per `blob_index`) to an archive provider (V0: `load-s3-agent`) via `POST /upload` (multipart). -- The proposer signs and broadcasts an `ArchiveNotice` per blob containing an opaque `locator` (typically `load-s3://`). -- Validators verify the notice against locally computed blob keccak + decided metadata, persist an `ArchiveRecord`, and pruning becomes eligible. -- Once all blob indexes for a height have verified archive records and the height is finalized, blob bytes are deleted locally; metadata + archive records remain forever. - -## Provider API (Load S3 Agent) - -Ultramarine uses `POST /upload` only (there is no `/upload/blob` endpoint). - -Multipart fields: - -- `file`: raw blob bytes (EIP-4844 blobs are 131072 bytes) -- `content_type=application/octet-stream` -- `tags`: JSON array of `{key,value}` objects (ANS-104 tags) - -Tags attached by Ultramarine: - -- `load=true` -- `load.network=fibernet` -- `load.kind=blob` -- `load.height`, `load.round`, `load.blob_index` -- `load.kzg_commitment`, `load.versioned_hash`, `load.blob_keccak` -- `load.proposer`, `load.provider` - -The provider returns `dataitem_id` (and sometimes `locator`). Ultramarine stores a locator as `load-s3://` when a locator is not explicitly returned. 
- -## Retrieval (after pruning) - -After pruning, Ultramarine will not serve blob bytes locally. Peers receive metadata + archive notices (locators), but Ultramarine does not automatically re-download pruned blobs. -External consumers (indexers, rollups, provers, explorers) can fetch from the provider using the locator. - -### Receiver behavior example (Ultramarine node) - -When an Ultramarine node receives a value-sync/restream package for height `H` and it is `MetadataOnly`, it should: - -1. Accept and process `archive_notices` (verify signature/proposer/commitment/hash, persist `ArchiveRecord`). -2. Continue consensus sync without attempting to obtain blob bytes (bytes are out-of-protocol after prune). -3. Treat locators as an external hint for third-party tools: - - expose the locator via logs/telemetry/CLI (optional future work), or - - allow an external consumer to fetch blob bytes from the archive provider and verify them off-node. - -Example decision flow: - -- Received: `SyncedValuePackage::MetadataOnly { value, archive_notices }` - - Node: store `value` + `archive_notices`; mark blob bytes for `H` as unavailable locally - - External consumer: for each notice, use `notice.body.locator` (e.g. `load-s3://`) to download bytes and verify against: - - `notice.body.kzg_commitment` / `load.versioned_hash` - - `notice.body.blob_keccak` - -Rust sketch (node-side; no re-download): - -```rust -match synced_package { - SyncedValuePackage::Full { value, blobs } => { - // normal path: store value + blob bytes, verify, etc. - store_value(value).await?; - store_blobs(blobs).await?; - } - SyncedValuePackage::MetadataOnly { value, archive_notices } => { - // pruned path: store metadata + archive locators, but do NOT fetch bytes. 
- store_value(value).await?; - - for notice in archive_notices { - state.handle_archive_notice(notice.clone()).await?; - - tracing::info!( - height = %notice.body.height, - blob_index = notice.body.blob_index, - locator = %notice.body.locator, - "received archive locator for pruned blob (external consumers may fetch)" - ); - } - } -} -``` - -For Load S3 Agent, a standard resolver URL is: -`https://gateway.s3-node-1.load.network/resolve/<dataitem_id>` - -## Local testing quickstart - -- Tier-0 component tests: `make itest` (or `make itest-quick`) -- Tier-1 full-node scenarios: `make itest-node` -- Tier-1 archiver/prune suite: `make itest-node-archiver` -- Note: the Tier‑1 harness defaults `archiver.enabled=false`; archiver tests opt in via `FullNodeTestBuilder::with_archiver(...)` / `with_mock_archiver()` (`mock://` requires `ultramarine-node` built with `feature="test-harness"`). -- Harness stability: panic-safe teardown (`Drop`), port allocation retries, and read-only store opens reduce flakiness across CI and under load. -- Spam blobs across EL RPCs: `make spam-blobs` - - Uses different signers per RPC process to avoid nonce contention. - -## Status notes (implementation deltas vs early design drafts) - -- Archive notices are carried as `ProposalPart::ArchiveNotice` and are packaged for sync/restream. -- Archiver uploads use multipart `tags` (ANS-104) rather than custom header conventions. -- Retention is “archive-gated”: prune waits for verified archive records (no time-window deletion in V0). -- `archiver.enabled` means “this node uploads + emits notices when proposer duty”; prune eligibility is independent and depends only on verified notices + finality.
diff --git a/docs/PHASE6_ARCHIVE_PRUNE_FINAL.md b/docs/PHASE6_ARCHIVE_PRUNE_FINAL.md deleted file mode 100644 index 7951796..0000000 --- a/docs/PHASE6_ARCHIVE_PRUNE_FINAL.md +++ /dev/null @@ -1,227 +0,0 @@ -# Phase 6 – Archiving & Pruning (Final V0, code-audited) - -Goal: add a safe, deterministic archive→prune pipeline for blob sidecars with **no retention window for blob bytes**. Blobs are served locally until prune; after pruning they are no longer served. Consensus metadata and archive records remain forever. - -This document reflects the current implementation and links each feature to the code that implements it. - -**Implementation status legend**: ✅ implemented, 🟡 partial, ⚠️ mismatch / foot-gun, ⏳ not implemented. - -## Summary (what the system does) - -- Blobs are verified and stored locally (RocksDB) during proposal/sync. -- After commit, the proposer uploads decided blob bytes to an external provider (V0: `load-s3-agent`) using an async worker. -- The proposer signs and gossips an `ArchiveNotice` per blob (post-commit; off-FIN). -- Validators verify the notice (signature + commitment + local blob keccak), persist an `ArchiveRecord`, and once **all** blob indexes at a height have verified records **and** the height is finalized, they prune local blob bytes for that height. 
- -## Architecture map (where to look) - -- Types: - - `crates/types/src/archive.rs` (`ArchiveNotice`, signing/verification, `ArchiveJob`) - - `crates/types/src/blob_metadata.rs` (per-index `blob_keccak_hashes`, archive records, `pruned`) - - `crates/types/src/proposal_part.rs` (`ProposalPart::ArchiveNotice`) - - `crates/types/src/sync.rs` (`SyncedValuePackage` carries `archive_notices`) -- Consensus: - - `crates/consensus/src/state.rs` (`handle_archive_notice`, `prune_archived_height`, `rehydrate_pending_prunes`) - - `crates/consensus/src/store.rs` (`blob_archival` table + hydration into `BlobMetadata`) - - `crates/consensus/src/archive_metrics.rs` (Phase 6 metrics) -- Node/runtime: - - `crates/node/src/archiver.rs` (archiver worker: upload + retries + notice generation) - - `crates/node/src/node.rs` (spawn worker, config validation, restart recovery) - - `crates/node/src/app.rs` (broadcast notices, restream, GetDecidedValue packaging) -- Blob bytes store: - - `crates/blob_engine/src/engine.rs` (`mark_archived`) - - `crates/blob_engine/src/store/rocksdb.rs` (`delete_archived`) - -## Protocol component: ArchiveNotice - -### Payload - -Per blob: - -- `height`, `round`, `blob_index` -- `kzg_commitment` -- `blob_keccak` (keccak256 of the locally stored blob bytes) -- `provider_id`, `locator` -- `archived_by`, `archived_at` -- `signature` (Ed25519) - -### Signature scheme (current implementation) - -Signing preimage is: -`sha256("ArchiveNoticeV0" || protobuf(ArchiveNoticeBody))` - -This is domain-tagged sha256 over the protobuf encoding of the notice body (not SSZ tree-hash). - -Code: `crates/types/src/archive.rs` (`ARCHIVE_NOTICE_DOMAIN`, `ArchiveNoticeBody::signing_root`). 
- -### Verification rules (current implementation) - -Receiver checks: - -- `archived_by` is a known validator address (used to find the public key) -- signature verifies against `archived_by` public key -- decided `BlobMetadata` exists at `height` -- `blob_index < blob_count` -- `kzg_commitment` matches decided `BlobMetadata` -- `blob_keccak` matches decided `BlobMetadata` -- no conflicting archive record already stored for `(height, blob_index)` - -Code: `crates/consensus/src/state.rs` (`State::handle_archive_notice`). - -✅ **Update**: Receiver-side verification now enforces proposer-only notices. `State::handle_archive_notice` resolves the expected proposer from `BlobMetadata.proposer_index_hint()` (falling back to `ConsensusBlockMetadata`) and rejects notices whose `archived_by` does not match. This guarantees only the block proposer’s signed locator is accepted even if other validators have local blob bytes (`crates/consensus/src/state.rs`). - -## Storage - -### BlobMetadata additions (Layer 2) - -`BlobMetadata` stores: - -- `blob_keccak_hashes: Vec<B256>` aligned with commitments -- `archival_records: Vec<Option<ArchiveRecord>>` (in-memory; hydrated from store) -- `pruned: bool` - -Code: `crates/types/src/blob_metadata.rs`. - -### Archive records table (consensus store) - -Archive records are persisted in redb: - -- table: `blob_archival` -- key: `(height, blob_index)` -- value: protobuf `ArchiveRecord` - -On read, `BlobMetadata` is hydrated by scanning `blob_archival` for that height. - -Code: `crates/consensus/src/store.rs` (`BLOB_ARCHIVAL_TABLE`, `insert_archive_record`, `hydrate_archival_records`). - -### Where `blob_keccak` comes from - -- Proposer path: computed from the bundle in `State::propose_value_with_blobs`. -- Sync path: computed from received sidecars in `State::process_synced_value` (Full package). - -Code: `crates/consensus/src/state.rs` (`propose_value_with_blobs`, `process_synced_value`). 
- -## Archiver worker (proposer duty) - -### Duty model - -- Only the proposer enqueues `ArchiveJob`s. -- Followers never upload; they only verify and persist notices. - -Code: - -- job building: `crates/consensus/src/state.rs` (`build_archive_job`) -- restart recovery: `crates/consensus/src/state.rs` (`recover_pending_archive_jobs`) -- worker spawn + config validation: `crates/node/src/node.rs` - -### Upload contract (V0: load-s3-agent) - -Ultramarine uploads via `POST /upload` (multipart form) and attaches blob metadata as tags. - -Config: `crates/types/src/archiver_config.rs` -Worker implementation: `crates/node/src/archiver.rs` (`do_upload`) - -Response parsing (required): - -- JSON body with either `locator` or `dataitem_id` -- `success=false` fails the job -- if only `dataitem_id` is provided, locator is stored as `load-s3://{dataitem_id}` - -Code: `crates/node/src/archiver.rs` (UploadResponse parsing). - -### Retries and restart behavior - -- Worker retries jobs with exponential backoff (capped) and keeps retrying even after “max retries” is exceeded (it logs a permanent failure but stays in the retry queue). -- On startup, pending archive jobs are recovered and enqueued again (proposer-only). - -Code: `crates/node/src/archiver.rs` (retry queue), `crates/node/src/node.rs` (recovery enqueue). - -⏳ Manual “re-archive this height now” CLI command is not implemented yet (restart recovery + auto retries only). - -## Pruning (blob bytes) - -### Prune gate (V0) - -Prune a height when: - -1. height is “finalized” by the app’s finality tracking, and -2. every blob index at that height has a verified archive record. - -Code: `crates/consensus/src/state.rs` (`rehydrate_pending_prunes`, `flush_pending_prunes`, `prune_archived_height`). - -When uploads are disabled (`archiver.enabled = false`), the node will not upload blobs when it is proposer, so it will not emit archive notices for its own proposed heights. 
If it receives a complete set of verified archive notices for some height from the network, it still prunes local blob bytes for that height once finalized. If this node is in the validator set, Ultramarine refuses to start with `archiver.enabled=false` (PoA strictness). - -### Deletion mechanism - -Deletion is per height and per blob index: - -- consensus calls `blob_engine.mark_archived(height, indices)` -- blob engine deletes decided sidecar keys for those indices - -Code: `crates/consensus/src/state.rs` (`prune_archived_height`), `crates/blob_engine/src/engine.rs` (`mark_archived`), `crates/blob_engine/src/store/rocksdb.rs` (`delete_archived`). - -### Serving contract impact (CL) - -- Before prune: blobs are served normally. -- After prune: CL serving paths return `BlobEngineError::BlobsPruned { locators }`, and restream/value-sync can ship archive notices/locators out-of-band. - -Code: `crates/consensus/src/state.rs` (`get_blobs_with_status_check`, `get_undecided_blobs_with_status_check`), `crates/node/src/app.rs` (restream fallback, `SyncedValuePackage::MetadataOnly`). - -### EL note (`engine_getBlobsV1`) - -`engine_getBlobsV1` parity is enforced in `load-reth`, which returns `null` entries for missing/pruned blobs (no locators on Engine API). - -Code: `load-reth/src/engine/rpc.rs`. 
- -## Observability - -Phase 6 metrics are registered under the `archiver_` prefix and tracked in: - -- `crates/consensus/src/archive_metrics.rs` -- worker instrumentation: `crates/node/src/archiver.rs` -- notice propagation timing: `crates/node/src/app.rs` - -## Tests - -### Tier 0 (fast, in-process) - -- `crates/consensus/tests/archiver_flow.rs`: - - notice store/load + gating behavior (manual notice injection) - - `archiver_enabled` gating expectations for “sync notice generation” (legacy behavior) - -### Tier 1 (full-node harness; ignored by default) - -- `crates/test/tests/full_node/node_harness.rs`: - - mock provider smoke (uploads occur, pruned metadata observed) - - multi-node follower pruning after proposer notices - - provider failure retries - - auth token propagation - - restart recovery detection - - helper config: `FullNodeTestBuilder::with_archiver(...)` / `with_mock_archiver()` keep archiver config consistent in tests (`with_mock_archiver()` uses `mock://` provider URLs and requires `ultramarine-node` built with `feature="test-harness"`, which the full-node harness enables) - - harness hardening: panic-safe teardown (`Drop` aborts spawned node tasks), port allocation avoids TOCTOU races (retry plan), and read-only store access avoids incidental writes during polling - -Run: `make itest-node-archiver` (or individual `cargo test ... -- --ignored` invocations). - -🟡 Remaining to add/expand (for Phase 6 sign-off) - -- Harness-level negative cases: invalid signature, non-proposer notices (if enforced), conflicting notices. -- Optional provider verification / receipt signatures (stronger than local keccak binding). 
- -## Status checklist (Phase 6) - -- ✅ `ArchiveNotice` type + signing/verifying: `crates/types/src/archive.rs` -- ✅ `ProposalPart::ArchiveNotice` transport: `crates/types/src/proposal_part.rs` -- ✅ Persistence table `blob_archival` + hydration: `crates/consensus/src/store.rs` -- ✅ Notice verification + conflict handling: `crates/consensus/src/state.rs` (`handle_archive_notice`) -- ✅ Archiver worker upload + retry/backoff: `crates/node/src/archiver.rs` -- ✅ Worker spawn + config validation + restart recovery: `crates/node/src/node.rs` -- ✅ App loop broadcasts worker notices: `crates/node/src/app.rs` -- ✅ Prune gate + deletion: `crates/consensus/src/state.rs` + `crates/blob_engine/src/engine.rs` -- ✅ Proposer-only notice acceptance enforced in `handle_archive_notice` (non-proposer signatures are rejected) -- ⏳ Manual retry CLI: not implemented - -## Notes / foot-guns to keep docs honest - -- ⚠️ “No retention window” applies to _blob bytes_. The consensus store still has its own pruning (`Store::prune()` in `State::commit`) that controls how much decided history is kept. These are separate mechanisms and should be configured consciously. -- ⚠️ If `archiver.enabled=true`, `archiver.bearer_token` must be set; production nodes fail fast on startup when missing. -- ⚠️ Validators are required to run with `archiver.enabled=true` in production; startup fails fast if a validator disables archiver, to prevent proposers from silently skipping upload duty. The full-node integration harness builds `ultramarine-node` with `feature="test-harness"` and may disable archiver by default for non-archiver tests. 
diff --git a/docs/integration_tests_parity.md b/docs/integration_tests_parity.md deleted file mode 100644 index 3499aa6..0000000 --- a/docs/integration_tests_parity.md +++ /dev/null @@ -1,51 +0,0 @@ -# Integration Test Parity Plan - -_Last updated: 2025-11-19_ - -Ultramarine ships two tiers of integration coverage: - -- **Tier 0 (component smokes)** — 3 fast tests in `crates/consensus/tests` (`blob_roundtrip`, `blob_sync_commitment_mismatch`, `blob_pruning`) that exercise `State` with real RocksDB stores and KZG verification. They run in `make test` and are always in CI. -- **Tier 1 (full-node harness)** — 14 end-to-end scenarios in `crates/test/tests/full_node` that boot Malachite channel actors, libp2p, WAL, and the production application loop. They run via `make itest-node` and the CI job `itest-tier1` (after unit tests), with `CARGO_NET_OFFLINE` overridable and artifacts on failure. - -The parity target remains aligned with Malachite and Snapchain: multi-validator, networked harnesses that exercise leader election, ValueSync, crash/restart, WAL, and gossip end-to-end. - ---- - -## 1. Current Coverage Snapshot - -| Layer / Component | Exercised Today? | Notes | -| ----------------------------------- | ---------------- | --------------------------------------------------------------------------------------- | -| Consensus `State` + BlobEngine | ✅ (Tier 0) | 3 smokes with real RocksDB + KZG in `crates/consensus/tests`. | -| Execution payload + blob verifier | ✅ (Tier 0/1) | Deterministic payloads, real KZG commitments/proofs via `c-kzg`. | -| Engine API bridge (generate block) | ⚠️ Stubbed | Tier 0 mocks; Tier 1 uses Engine RPC stub (HTTP ExecutionClient wiring still pending). | -| Execution notifier (FCU / payload) | ⚠️ Stubbed | Tier 0 uses `MockExecutionNotifier`; Tier 1 uses stubbed Execution client. | -| Malachite channel actors | ✅ (Tier 1) | Full-node harness boots channel actors/WAL/libp2p. 
| -| libp2p gossip transport | ✅ (Tier 1) | `/proposal_parts` streaming exercised end-to-end. | -| WAL / timers / crash recovery paths | ✅ (Tier 1) | Restart/ValueSync paths deterministic via `StartedHeight` gating + `wait_for_nodes_at`. | -| CI signal | ✅ | Tier 0 in `make test`; Tier 1 in `itest-tier1` (20m timeout, artifacts on failure). | - ---- - -## 2. Remaining Gaps - -- **Execution bridge**: Tier 1 still uses the Engine RPC stub; wiring the HTTP `ExecutionClient` for payload generation/FCU/newPayload is the primary open item. -- **Spec extras (optional)**: Blobless payload fallback, `engine_getBlobsV1`, and sidecar gossip APIs remain future work. - ---- - -## 3. Risk & Mitigation - -| Risk | Likelihood | Impact | Mitigation | -| --------------------------------- | ---------- | ------ | ----------------------------------------------------------------------------------------------------- | -| libp2p flakiness / port conflicts | Medium | High | Deterministic ports + per-scenario process isolation; artifacts uploaded on CI failure. | -| Tier 1 runtime on PRs | Medium | Medium | Keep Tier 0 in `make test`; Tier 1 runs as a separate CI job (`itest-tier1`, 20m timeout, artifacts). | -| Execution bridge parity | Medium | Medium | Follow-up to replace the stub with HTTP `ExecutionClient`. | - ---- - -## 4. Notes - -- **Tier strategy**: Tier 0 = consensus crate smokes (3 tests, default in `make test`/CI); Tier 1 = full-node harness (14 ignored tests) run via `make itest-node` and CI job `itest-tier1`. -- **Harness builder**: `FullNodeTestBuilder` in `crates/test/tests/full_node/node_harness.rs` centralizes setup/teardown, payload plans, and deterministic ports. -- **Execution cadence**: Tier 1 scenarios run as separate `cargo test ... -- --ignored` invocations to avoid resource leaks; `cargo test -p ultramarine-test --test full_node -- --ignored` remains available for local ad-hoc runs. 
-- **CI**: Tier 1 executes after the main test job and uploads `target/debug/deps/full_node*` and `crates/test/test_output.log` on failure; `CARGO_NET_OFFLINE` is overridable for cold runners. diff --git a/docs/journal/BUG-011-timestamp-drift.md b/docs/journal/BUG-011-timestamp-drift.md new file mode 100644 index 0000000..65c61a8 --- /dev/null +++ b/docs/journal/BUG-011-timestamp-drift.md @@ -0,0 +1,52 @@ +# BUG-011: Block Timestamp Drift Into Future + +## Status: VERIFIED + +## Summary + +Block timestamps drift ~2 months into future due to `parent.timestamp + 1` at >20 blocks/sec. + +## Root Cause + +`ultramarine/crates/execution/src/client.rs:203` used `timestamp = parent.timestamp + 1` which is deterministic but detached from wall-clock time. + +## Fix + +1. [x] Add protocol constants (LOAD_MIN_BLOCK_TIME_SECS=1, LOAD_MAX_FUTURE_DRIFT_SECS=15) +2. [x] Proposer throttling with timeout respect +3. [x] Timestamp calculation: max(now, parent + min_block_time) +4. [x] Validator-side enforcement (critical!) +5. [x] Parent hash match before timestamp validation +6. [x] Wipe and redeploy fibernet +7. [x] Verify timestamps - PASSED (0 second drift, 60 blocks in 60 seconds) + +## Protocol Constants + +- `LOAD_MIN_BLOCK_TIME_SECS = 1` - minimum slot matching EVM timestamp granularity +- `LOAD_MAX_FUTURE_DRIFT_SECS = 15` - geth/Ethereum standard for future timestamp tolerance + +## Validator Enforcement (Protocol Rules) + +Validators MUST enforce on every proposal: + +1. `proposed_parent_hash == latest_block.block_hash` (parent link) +2. `proposed_ts > parent_ts` (strictly increasing) +3. `proposed_ts >= parent_ts + LOAD_MIN_BLOCK_TIME_SECS` (minimum slot) +4. `proposed_ts <= local_now + LOAD_MAX_FUTURE_DRIFT_SECS` (drift guard) + +Violation results in vote nil. 
+ +## References + +- Protocol constants: ultramarine/crates/types/src/constants.rs +- Throttling: ultramarine/crates/node/src/app.rs:handle_get_value +- Timestamp: ultramarine/crates/execution/src/client.rs +- Validator checks: ultramarine/crates/consensus/src/state.rs:received_proposal_part + +## Deployment Notes + +After code changes: + +1. Build: `docker buildx build --platform linux/amd64 -t loadnetwork/ultramarine:fibernet --push .` +2. Wipe fibernet (required - chain-time already ahead) +3. Verify timestamps match wall-clock (within 15 seconds) diff --git a/docs/journal/BUG-012-fullnode-sync-race.md b/docs/journal/BUG-012-fullnode-sync-race.md new file mode 100644 index 0000000..eba3522 --- /dev/null +++ b/docs/journal/BUG-012-fullnode-sync-race.md @@ -0,0 +1,75 @@ +# BUG-012: Fullnode Sync Race Condition After Rolling Restart + +## Status: FIXED (pending wider validation) + +## Summary + +Fullnodes lose sync after rolling restart because EL lag was not treated as sync-mode during normal operation. When EL returned SYNCING, the CL stopped sending FCU (forkchoiceUpdated), so the EL never received a sync target and remained behind. Fixed by always sending FCU when EL is degraded and aligning CL head tracking even when execution is pending. + +## Symptoms + +- Fullnode EL stuck at block N +- CL trying to push block N+50+ +- EL keeps returning SYNCING status +- Gap grows indefinitely +- Blockscout can't index new blocks + +## Root Cause + +### Rolling Restart Sequence + +1. EL restarted with block N as latest +2. IPC socket appears - Ansible considers EL "ready" (premature) +3. Ultramarine restarted - fetches latest block (N) from EL +4. ValueSync starts - tries to push block N+X +5. EL returns SYNCING - doesn't have parent blocks N+1 to N+X-1 +6. CL marks execution pending and skips FCU in non-sync mode +7. 
EL has no sync target → stays behind until P2P catches up (often slow) + +### Code Locations + +- Rolling restart order: `infra/ansible/playbooks/roll.yml` lines 36-62 +- EL SYNCING handling: `crates/consensus/src/state.rs` (newPayload retry + execution_pending) +- FCU-before-newPayload: now triggered when EL is degraded (not only sync mode) +- Sync block processing: `crates/node/src/app.rs` lines 823-854 +- Consensus head tracking: `latest_block` now advances even when execution is pending + +### Why Fullnodes Are Affected (Not Validators) + +- Validators produce blocks locally, don't depend on syncing +- Fullnodes must receive and import blocks from peers +- Without FCU, EL has no sync target and relies on P2P only + +## Fixes Implemented + +1. **Treat EL lag as sync mode**\ + When EL returns SYNCING or `el_degraded` is set, send FCU **before** newPayload.\ + This gives EL a target head and unblocks pipeline sync. + +2. **Advance consensus head even when execution is pending**\ + This keeps parent-hash validation consistent on validators while EL lags. + +## Follow-ups (Optional Hardening) + +1. Add backfill queue for `execution_pending` heights to replay once EL catches up +2. Improve EL readiness gate (wait for `eth_syncing == false` or FCU success) + +## Workaround + +Wipe fullnode state and let it resync from genesis: + +```bash +make net-wipe NET= LIMIT= WIPE_CONFIRM=YES WIPE_STATE=true WIPE_NODES= +make net-blockscout NET= +``` + +## References + +- Industry pattern: send FCU before newPayload when EL is syncing (Lighthouse/Teku/Prysm) +- Related: `ultramarine/docs/fibernet_deploy_progress.md` (FIX-008 FCU-before-newPayload) + +## Testing + +1. Integration test: `full_node_el_syncing_still_sends_fcu` (make itest-node) +2. Full restart a synced fullnode; verify EL catches up and CL/EL heads converge +3. 
Verify blockscout continues indexing after restart diff --git a/docs/journal/BUG-013-chain-split-parent-mismatch.md b/docs/journal/BUG-013-chain-split-parent-mismatch.md new file mode 100644 index 0000000..d13e65c --- /dev/null +++ b/docs/journal/BUG-013-chain-split-parent-mismatch.md @@ -0,0 +1,311 @@ +# BUG-013: Chain Split on Parent Hash Mismatch (Height 5006) + +## Status: FIXED (validated 2026-01-26) + +## Summary + +On the fibernet testnet, consensus stalled at height 5006 with repeated `Parent hash mismatch` errors. +The network split into two views of the latest parent: + +- Group A (AMS nodes) treated **5005 = 0x2177…** as the latest head. +- Group B (LON2 nodes) treated **5004 = 0x7268…** as the latest head. + +As a result, proposals were built on different parents and validators rejected each other’s proposals indefinitely. + +## Impact + +- Consensus stuck at height 5006 with repeated round timeouts. +- Validators entered a persistent split‑head condition. +- Public RPC visibility followed one side of the split (node‑rpc aligned with AMS), which could mislead external observers. + +## Environment + +- Network: fibernet +- Date: 2026‑01‑22 +- Height: 5006 +- Nodes affected: f4‑metal‑medium‑lon2‑fibernet‑1 (node‑0/1), f4‑metal‑medium‑ams‑fibernet‑2 (node‑2/3) +- Also observed: node‑rpc aligned with AMS head (5005 = 0x2177…) + +## Timeline (Key Facts) + +All timestamps are UTC from journald. 
+ +**1) EL/CL restarts around the split window** + +- LON2 EL restarted: + - `2026‑01‑22 14:28:08` Stopped `load-reth@node-0` + - `2026‑01‑22 14:28:10` Started `load-reth@node-0` + - `2026‑01‑22 14:28:12` Stopped `load-reth@node-1` + - `2026‑01‑22 14:28:14` Started `load-reth@node-1` +- AMS EL restarted: + - `2026‑01‑22 14:27:09` Stopped `load-reth@node-2` + - `2026‑01‑22 14:27:11` Started `load-reth@node-2` + - `2026‑01‑22 14:27:13` Stopped `load-reth@node-3` + - `2026‑01‑22 14:27:15` Started `load-reth@node-3` +- AMS CL restarted: + - `2026‑01‑22 14:27:21` Started `ultramarine@node-2` + - `2026‑01‑22 14:27:25` Started `ultramarine@node-3` +- LON2 CL restarted: + - `2026‑01‑22 14:28:26` `🚀 App message loop starting` (node‑1) + +**2) AMS nodes decided height 5005 as 0x2177…** + +- `2026‑01‑22 14:27:06` (AMS) + - `✅ Decided certificate processed successfully height=5005 ... block_hash=0x2177b29a...` + +**3) AMS built 5006 on parent 0x2177…** + +- `2026‑01‑22 14:27:56` (AMS) + - `🟠 generate_block_with_blobs on top of ExecutionBlock { block_hash: 0x2177..., block_number: 5005 }` + - `Received execution payload ... block_number=5006 parent_hash=0x2177...` + +**4) LON2 built on stale parent 0x7268… (block 5004)** + +- `2026‑01‑22 14:28:40` (LON2) + - `🟠 generate_block_with_blobs on top of ExecutionBlock { block_hash: 0x7268..., block_number: 5004 }` + +**5) Parent mismatch errors begin (split visible)** +LON2 rejects proposals built on 0x2177…: + +- `2026‑01‑22 14:30:55` (LON2) + - `Parent hash mismatch at height 5006 ... proposed_parent=0x2177... latest_parent=0x7268...` + +AMS rejects proposals built on 0x7268…: + +- `2026‑01‑22 14:52:01` (AMS) + - `Parent hash mismatch at height 5006 ... proposed_parent=0x7268... 
latest_parent=0x2177...` + +**6) RPC node aligned with AMS head** + +- `2026‑01‑22 14:48:xx` (node‑rpc) + - `eth_getBlockByNumber(latest)` shows `number=0x138d` `hash=0x2177...` `parent=0x7268...` + +## Critical Log Excerpts (Raw) + +LON2 (node‑0/1): + +``` +2026-01-22T14:28:40.408141Z DEBUG 🟠 generate_block_with_blobs on top of ExecutionBlock { block_hash: 0x7268a47b..., block_number: 5004 } +2026-01-22T14:30:55.068442Z ERROR Parent hash mismatch at height 5006 ... proposed_parent=0x2177b29a... latest_parent=0x7268a47b... +``` + +AMS (node‑2/3): + +``` +2026-01-22T14:27:06.023793Z DEBUG Finalizing decided certificate height=5005 block_hash=0x2177b29a... +2026-01-22T14:27:56.669896Z DEBUG 🟠 generate_block_with_blobs on top of ExecutionBlock { block_hash: 0x2177b29a..., block_number: 5005 } +2026-01-22T14:52:01.612345Z ERROR Parent hash mismatch at height 5006 ... proposed_parent=0x7268a47b... latest_parent=0x2177b29a... +``` + +## Root Cause + +**Missing safety gate between CL decided head and EL head.** +On restart, `latest_block` was initialized from EL (`eth_getBlockByNumber(Latest)`), which can be stale after EL restarts. +If CL’s decided head is ahead of EL’s canonical head, the validator still participates in consensus and validates parent hashes +against a stale `latest_block`, creating a persistent split when other validators build on the true decided parent. + +## Contributing Factors + +1. **EL lag not treated as a consensus blocker** + Nodes continued proposing/prevoting even when EL was SYNCING or behind the CL decided head. +2. **No single source of truth for parent at startup** + CL store (decided) and EL head were treated as separate truths; on restart, EL dominated. +3. **Large‑payload latency window** (general risk) + Under high load, payload build latency can allow heads to shift mid‑build, increasing mismatch risk. +4. 
**EL persistence defaults (tip-2)** (not root cause, but amplifies restart desync) + Reth defaults keep recent canonical blocks in memory; single-block finality makes this unsafe on restart. + +## Fixes Applied (2026-01-26) + +1. **Removed EL HTTP bootstrap for `latest_block`** + - `latest_block` now seeds from execution `genesis.json` (same file as load-reth `--chain`). + - CL store remains the primary source of truth on restart. +2. **Explicit execution genesis wiring** + - New CLI/env: `--execution-genesis-path` / `ULTRAMARINE_EL_GENESIS_JSON`. + - Startup fails fast if missing to avoid silent CL/EL divergence. +3. **Engine API edge-case enforcement** + - Reject `ACCEPTED` and `latestValidHash` mismatch. + - Unit coverage added in `crates/execution/src/client.rs`. +4. **EL persistence hardened for 1-slot finality** + - `--engine.persistence-threshold=0` in infra and compose. + - **Hardcoded in load-reth binary** via `DefaultEngineValues` API (reth v1.10.2+). + - Prevents loss of finalized blocks across restarts. +5. **Tier‑1 harness Engine API stub hardened** + - `height_from_block_hash` now only accepts synthetic hashes (`[height; 32]`). + - Prevents false head jumps and `BuiltPayloadMismatch` during `make itest-node`. + +## Specification Alignment (Engine API ↔ Malachite) + +### Engine API (execution-apis) + +Relevant sources: + +- `execution-apis/src/engine/common.md` +- `execution-apis/src/engine/paris.md` +- `execution-apis/src/engine/shanghai.md` +- `execution-apis/src/engine/cancun.md` + +Key requirements for the gate: + +1. **Ordering**: CL **MUST** respect forkchoice update ordering; EL **MUST** process FCU in the same order. +2. **SYNCING semantics**: FCU returns `SYNCING` when the head is unknown/can’t be validated. +3. **Payload build**: Build starts only after a **VALID** head; `SYNCING` implies no build. +4. **Timeouts/retry**: CL should retry transient failures but must not advance consensus on stale heads. 
+ +**Implication**: EL readiness is defined by **Engine API status**, not by `eth_getBlockByNumber(latest)`. +HTTP RPC is not a consensus‑critical oracle for head alignment. + +### Malachite (consensus + ValueSync) + +Relevant sources: + +- `malachite/specs/consensus/overview.md` +- `malachite/specs/consensus/design.md` +- `malachite/specs/consensus/message-handling.md` +- `malachite/specs/synchronization/valuesync/README.md` + +Key requirements for the gate: + +1. **Invalid proposal → PREVOTE nil** is valid and expected. +2. **Async getValue()** can fail; the round proceeds safely (timeouts/nil votes). +3. **Consensus is the single decision point**; ValueSync supplies data but does not decide. + +**Implication**: A safety gate that refuses to propose/vote when EL is not aligned is consistent with Malachite. +It should cause nil votes or proposal timeouts, not safety violations. + +## Spec‑Driven Flow (Correct Behavior) + +1. **Define the canonical head from CL decided state** + - Use the locally decided head as the canonical CL reference for height/parent checks. + - If no decided head exists (fresh genesis), seed from execution `genesis.json`. +2. **Pre‑proposal gate via Engine API (no attrs)** + - Call FCU with `head=CL decided head`, `payloadAttributes=null`. + - `VALID` → proceed; `SYNCING/INVALID` → do not build and allow the round to time out (nil‑vote path). +3. **Proposer build only after valid FCU** + - FCU with attrs starts build, then `engine_getPayload`. +4. **Validator proposal validation + nil vote on invalid** + - If invalid (parent mismatch, timestamp, etc.) → PREVOTE nil. +5. **Consensus proceeds independently once a value is decided** + - Decided value is final; EL lag delays execution import but must not roll back consensus. +6. **Notify EL after decision (post‑consensus path)** + - After decision, call `newPayload` and/or FCU to drive EL to the decided head. + - `SYNCING` → mark degraded and retry; do not invalidate the decided height. 
+ +## Persistence Notes (Not Root Cause, but Relevant) + +Ultramarine persists consensus state and blobs across restarts: + +- **Consensus store**: `store.db` (redb) holds decided values/certificates and metadata. +- **Blob store**: `blob_store.db` (RocksDB) holds blob sidecars and metadata CFs. + +The split was **not** caused by DB corruption. The issue was the handoff between the decided head +in the store and the EL head after restarts. + +## Optional Postmortem Questions (forensics only) + +- Did EL actually contain block 5005 but mark canonical head as 5004 because FCU was not re‑applied after restart? +- What is the exact sequence of CL startup calls and EL responses on LON2 between `14:28:26` and first mismatch? +- Were there transient network partitions (dial failures) that delayed head propagation? +- Was there a window where CL had `max_decided=5005` while EL reported `latest=5004` on LON2/FRA, and did the node proceed without gating? + +## Postmortem Validation (2026-01-26) + +### Code & Spec Alignment + +- Gate uses Engine API FCU (not HTTP RPC) before propose/vote. +- Startup alignment uses CL decided head from store and FCU to realign EL. +- Post‑Decided path sends FCU before newPayload when degraded/syncing. +- Engine API compliance: FCU `ACCEPTED` is rejected; `latestValidHash` mismatch on `VALID` is rejected. +- Execution genesis bootstrap uses `genesis.json` (no HTTP fallback). +- EL persistence threshold hardcoded to `0` in load-reth binary (reth v1.10.2+). + +### Ops Validation (Ansible) + +- CL restart waits for EL readiness (`eth_syncing == false`) after IPC socket appears. +- load-reth services pass `--engine.persistence-threshold=0`. +- Ultramarine services pass `--execution-genesis-path=/assets/genesis.json`. +- Playbooks updated: `infra/ansible/playbooks/roll.yml`, `deploy.yml`, `blockscout.yml`. 
+ +### Tests Executed (local) + +Run from `ultramarine/`: + +- `full_node_split_head_recovery` +- `full_node_fcu_gate_does_not_require_http_latest` +- `full_node_el_syncing_degrades_node` +- `full_node_el_syncing_still_sends_fcu` +- `full_node_el_syncing_blocks_payload_build` +- `full_node_fcu_accepted_rejected` +- `ultramarine-execution` unit tests for ACCEPTED/latestValidHash edge cases +- `make itest-node` (Tier‑1 full‑node harness; 20/20 scenarios) + +## Actions & Status + +### Completed (bug resolution) + +- [x] Collect logs from LON2 (node‑0/1) and AMS (node‑2/3) for 2026‑01‑22 14:26–14:56. +- [x] Confirm divergent parents in error logs (`0x2177…` vs `0x7268…`). +- [x] Implement safety gate via Engine API status (no HTTP RPC gating). +- [x] Enforce startup alignment: initialize CL head from store; attempt FCU to align EL. +- [x] Add and validate EL‑syncing gate tests (`full_node_el_syncing_*`). +- [x] Update MetadataOnly sync path to advance `latest_block` from payload header. +- [x] Add Tier‑1 test: `full_node_fcu_gate_does_not_require_http_latest`. +- [x] Add Tier‑1 test: `full_node_split_head_recovery`. +- [x] Add Tier‑1 test: `full_node_fcu_accepted_rejected`. +- [x] Add EL readiness checks (`eth_syncing`) before CL start in Ansible playbooks. +- [x] Spec alignment review (Engine API + Malachite). +- [x] Remove misleading “observer-only mode” wording from startup log. +- [x] Document Engine API design decisions in `CLAUDE.md`. +- [x] Reject FCU `ACCEPTED` status as Engine API spec violation. +- [x] Reject FCU `VALID` with `latestValidHash` mismatch. +- [x] Bootstrap `latest_block` from execution `genesis.json` (no HTTP fallback). +- [x] Add `--execution-genesis-path` / `ULTRAMARINE_EL_GENESIS_JSON`. +- [x] Set `--engine.persistence-threshold=0` for load-reth in infra/compose. +- [x] Hardcode `persistence_threshold=0` in load-reth binary via `DefaultEngineValues` API (reth v1.10.2+). +- [x] Document execution genesis + EL persistence in knowledge base. 
+- [x] Add runbook section for detecting and resolving split‑head. +- [x] Add note on node‑rpc alignment and RPC visibility impact. + +### Optional Postmortem Follow‑ups (not required for resolution) + +- [ ] Correlate CL decided head vs EL canonical head immediately after restart on each node. +- [ ] Verify whether EL had block 5005 in DB but not canonical (reth debug/inspection). + +## References + +- Related incident: `ultramarine/docs/journal/BUG-012-fullnode-sync-race.md` +- Gate rules + runbook: `ultramarine/docs/knowledge_base/cl-el-head-gating.md` +- Execution genesis bootstrap: `ultramarine/docs/knowledge_base/execution-genesis.md` +- EL persistence for 1-slot finality: `ultramarine/docs/knowledge_base/el-persistence.md` +- Network inventory: `ultramarine/infra/networks/fibernet/inventory.yml` + +## Production Verification (2026-02-03) + +### Load Testing Under Stress + +BUG-013 fix verified under sustained load testing on fibernet: + +| Test | TPS | Duration | Parent Mismatch Errors | +| -------------------------- | ------ | ----------- | ---------------------- | +| Warmup | 1k-10k | 5 min | 0 | +| Sustained | 10k | 10 min | 0 | +| Peak | 20k | 5 min | 0 | +| Rolling restart under load | 20k | during test | 0 | + +**Verification commands:** + +```bash +# Check for parent mismatch errors across all nodes +for host in 67.213.117.143 64.34.87.1 67.213.121.57; do + ssh ubuntu@$host 'journalctl -u "ultramarine@*" --since "-30min" | grep -c "Parent hash mismatch" || echo 0' +done +# Result: 0 on all hosts +``` + +### Conclusion + +BUG-013 fix confirmed stable under: + +- 20k TPS sustained load +- Rolling restarts during load +- Peak blocks with 21,622 transactions diff --git a/docs/journal/PERF-SUMMARY-fibernet-throughput-journey.md b/docs/journal/PERF-SUMMARY-fibernet-throughput-journey.md new file mode 100644 index 0000000..3650c7b --- /dev/null +++ b/docs/journal/PERF-SUMMARY-fibernet-throughput-journey.md @@ -0,0 +1,154 @@ +# PERF-SUMMARY: Fibernet 
Throughput Journey + +Date: 2026-02-11 (UTC)\ +Network: `fibernet`\ +Audience: engineering retrospective + interview/reference packet + +Sanitization: + +- hostnames, IPs, and direct RPC URLs are intentionally omitted +- shard aliases used: `shard-a`, `shard-b`, `shard-c` + +## Scope + +This document is the standalone record for the throughput campaign executed on: + +- 2026-02-03 (baseline recovery, scale-up, and high-target stress) +- 2026-02-10 (post-redeploy validation and ceiling probe) + +Covered outcomes: + +- environment and test setup by phase +- key fixes by phase +- submit-side metrics by phase +- on-chain window metrics from the final probe +- post-run network health checks + +## Environment And Test Setup + +- topology: 3 validator shards (`shard-a`, `shard-b`, `shard-c`) +- chain profile: `chain_id=1984`, execution gas limit regime `2,000,000,000` +- load-generator class: AMD EPYC 4564P (32 cores), 187 GB RAM + +Phase setup highlights: + +- 2026-02-03 baseline recovery: + - 10k run: `120s`, `1,000` accounts, `max_pending=32`, single endpoint + - 20k run: `60s`, `10,000` accounts, `max_pending=16`, 3 endpoints, `clients_per_endpoint=4`, `batch_size=25` +- 2026-02-03 high-target stress: + - 50k and 80k target runs from a co-located generator profile + - `20,000` accounts (`10k` regular + `10k` blob), `clients_per_endpoint=8`, `batch_size=50` + - elevated txpool and RPC connection ceilings for high fan-out submission +- 2026-02-10 validation/probe: + - full `wipe + redeploy + rebuild` cycle before testing + - `load-blaster` binary hash on validator hosts: `8c7b53e8b68990c18aa12ef889182faf34e5e4ed8d1cd4138f38c0ec4b1a5595` + - `45,000` accounts funded (`0` failures), `0.1 ETH` each (`4,500 ETH` total) + - post-funding balance floor check (`>=0.09 ETH`): `45,000 / 45,000` sufficient + - regular transactions only (`blob=0`) + +## Repro Commands (Sanitized) + +Phase command patterns used in the campaign: + +```bash +# 10k validation / 60s (per shard) 
+./target/release/load-blaster -c configs/fibernet/.toml run --target-tps 10000 --duration 60s + +# 20k probe / 60s (per shard) +./target/release/load-blaster -c configs/fibernet/.toml run --target-tps 20000 --duration 60s +``` + +High-target stress profile (co-located generator) used 50k/80k targets with updated load-blaster +parallelism and elevated EL txpool/RPC limits described in this summary. + +## Key Fixes By Phase + +### Phase 1 (2026-02-03, baseline recovery) + +- corrected configuration bottleneck in builder/txpool gas-limit behavior for custom-chain operation (`2B` regime alignment) +- tuned CL runtime to multithreaded production profile +- fixed load-generator rate limiter to true per-second quota semantics +- added periodic nonce drift reconciliation for sender health + +### Phase 2 (2026-02-03, high-target stress) + +- raised txpool count/size limits for large in-flight transaction sets +- raised RPC max-connections for high client fan-out +- switched test placement to co-located generator profile to reduce network-induced submit constraints + +### Phase 3 (2026-02-10, stability and reproducibility) + +- validated fresh deployment path end-to-end (`wipe -> deploy -> fund -> run`) +- hardened submission path to sustain `0` submit-side errors in 10k and 20k runs +- standardized post-run drain checks across all shard RPC endpoints + +## Key Metrics By Phase + +| Phase | Run | Target / Duration | Total Submitted | Effective Submit TPS | Errors | Notes | +| ------- | ------------- | ----------------- | --------------- | -------------------- | ------------------ | ------------------------ | +| Phase 1 | Validation | 10k / 120s | `1,206,215` | `10,042` | `3,735` (`0.31%`) | backpressure `0` | +| Phase 1 | Scale-up | 20k / 60s | `1,193,860` | `~19,898` (derived) | `25,742` | success `97.8%` | +| Phase 2 | Stress | 50k / 120s | `3,632,369` | `30,267` | `19,681` (`0.54%`) | backpressure `0` | +| Phase 2 | Stress | 80k / 60s | `1,807,299` | `30,116` | `9,312` 
(`0.52%`) | backpressure `0` | +| Phase 3 | Validation | 10k / 60s | `1,472,250` | `24,537` | `0` | cluster total (3 shards) | +| Phase 3 | Ceiling probe | 20k / 60s | `1,823,713` | `30,395` | `0` | cluster total (3 shards) | + +Phase 3 per-shard split for final 20k/60s probe: + +- `shard-a`: `924,008` sent (`15,384.19 TPS`), errors `0` +- `shard-b`: `321,788` sent (`5,352.32 TPS`), errors `0` +- `shard-c`: `577,917` sent (`9,631.21 TPS`), errors `0` + +## On-Chain Window Metrics (Final 20k Probe) + +Selected window: + +- block range: `10539..10603` (`65` blocks) +- elapsed time: `109s` + +Aggregates: + +- total on-chain transactions: `1,824,625` +- on-chain TPS: `16,739.68` +- average tx/block: `28,071.15` +- min tx/block: `0` +- max tx/block: `95,238` +- average gas utilization: `29.47%` +- average block time: `1.70s` (min `1s`, max `5s`) + +Interpretation: + +- window includes both pre-load idle blocks and a hot segment with repeated near-full gas utilization +- confirms high burst inclusion capacity with visible shard-level submission asymmetry + +## Post-Run Health Checks + +Checks executed across all shard RPC endpoints after final validation and probe: + +- `txpool_status`: `pending=0`, `queued=0` +- `eth_syncing=false` +- `net_peerCount=0x6` + +Operational conclusion: + +- no residual mempool backlog after load +- nodes remained connected and not in sync-recovery state + +## Current Baseline (As Of 2026-02-11) + +- stable no-error submission demonstrated at: + - `24,537 TPS` cluster submit throughput under 10k/60s targets + - `30,395 TPS` cluster submit throughput under 20k/60s targets +- on-chain reference window for the final probe: + - `16,739.68 TPS` (`1,824,625 tx / 109s`) +- practical submit ceiling observed so far remains in the `~30k TPS` band for tested configurations + +## Deletion Caveat (What Is No Longer Preserved Separately) + +With phase-specific reports removed, the repository no longer retains separate per-phase files containing: + +- full 
per-block raw CSV listings for the final 65-block window +- phase-local command transcripts and long-form diagnostic excerpts +- phase-local code snippet context grouped by test day + +This summary preserves the canonical fixes, outcomes, and metrics needed for ongoing engineering reference. diff --git a/docs/knowledge_base/block-timing.md b/docs/knowledge_base/block-timing.md new file mode 100644 index 0000000..aa66d40 --- /dev/null +++ b/docs/knowledge_base/block-timing.md @@ -0,0 +1,66 @@ +# Block Timing & Timestamps in Load Network + +## Protocol Constants (same everywhere) + +- `LOAD_MIN_BLOCK_TIME_SECS = 1` +- `LOAD_MAX_FUTURE_DRIFT_SECS = 15` + +Defined in: `ultramarine/crates/types/src/constants.rs` + +## Invariants (validator-enforced) + +1. `proposed_parent_hash == latest_block.block_hash` (parent link) +2. `proposed_ts > parent_ts` (strictly increasing) +3. `proposed_ts >= parent_ts + 1` (minimum block time) +4. `proposed_ts <= now + 15` (drift guard) + +## Why 1 second minimum + +EVM timestamp = integer seconds. Cannot have >1 block/sec with real wall-clock time. + +## Why 15 seconds drift + +geth/Ethereum canonical value for future timestamp tolerance. Industry standard. +See: `geth/params/protocol_params.go` (allowedFutureBlockTimeSeconds = 15) + +## Operational Requirement: NTP + +Validators MUST run NTP (ntpd/chronyd). Proposers with clock >15s ahead will have +proposals rejected network-wide, resulting in nil-rounds until their clock syncs. + +## Height 1 Behavior + +At startup, `latest_block` is seeded from the CL store's decided head, or from the execution `genesis.json` when no decided head exists (no EL HTTP bootstrap; see BUG-013). +This ensures all timestamp checks (parent hash match, ts > parent, ts >= parent + 1, drift guard) +work from height 1 without exceptions. 
+ +## Timestamp Calculation (Proposer Side) + +```rust +fn next_block_timestamp(parent_timestamp: u64) -> u64 { + std::cmp::max(current_unix_time(), parent_timestamp + LOAD_MIN_BLOCK_TIME_SECS) +} +``` + +## Throttling (Proposer Side) + +Before building a block, proposer waits until `now >= parent_ts + 1`: + +- Uses subsecond precision to avoid ~1s jitter +- Respects GetValue timeout to preserve liveness +- If throttle exceeds timeout, refuses to propose (nil-vote via timeout) + +## TPS Considerations + +Block frequency is limited to 1 block/sec due to EVM timestamp granularity. +To maintain throughput, gas limit must compensate: + +- TPS = (gas_limit x blocks/sec) / gas_per_tx +- With 2B gas limit: TPS = 2B / gas_per_tx + +## References + +- Malachite proto: consensus.proto:49-53 +- Arbitrum: docs.arbitrum.io/block-numbers-and-time +- BeaconKit: NextBlockDelay +- geth: params/protocol_params.go (allowedFutureBlockTimeSeconds = 15) diff --git a/docs/knowledge_base/cl-el-head-gating.md b/docs/knowledge_base/cl-el-head-gating.md new file mode 100644 index 0000000..c563fee --- /dev/null +++ b/docs/knowledge_base/cl-el-head-gating.md @@ -0,0 +1,187 @@ +# CL↔EL Head Alignment (FCU Gate) + +## Purpose + +This note documents the **consensus‑critical** rules for aligning CL decided head with EL head, +how Ultramarine enforces the gate, and where to look in code/tests/ops. + +## Glossary + +- **CL**: Consensus Layer (Ultramarine / Malachite host). +- **EL**: Execution Layer (load‑reth). +- **FCU**: `engine_forkchoiceUpdated` call to the Engine API. +- **SYNCING**: Engine API status meaning the EL cannot validate the head yet (missing data). +- **Gate**: The rule set that blocks proposing/voting until EL is aligned to the CL decided head. + +## Problem (Summary) + +This gate exists to prevent CL/EL head divergence after restarts and to keep +consensus participation safe when the execution layer is lagging. 
+ +## Correctness Rules (Specs) + +### Engine API + +The **only** readiness oracle is the Engine API forkchoice status: + +- `engine_forkchoiceUpdated` returns `VALID | INVALID | SYNCING` +- `SYNCING` means the head is unknown/unvalidated; CL must not build/vote. + +Refs: + +- `execution-apis/src/engine/common.md` (ordering + retry) +- `execution-apis/src/engine/paris.md` (FCU semantics) +- `execution-apis/src/engine/shanghai.md` +- `execution-apis/src/engine/cancun.md` + +### Malachite (Tendermint) + +Invalid proposal → **PREVOTE nil** is the expected path. +Async `getValue()` can fail; round proceeds safely. + +Refs: + +- `malachite/specs/consensus/overview.md` +- `malachite/specs/consensus/design.md` +- `malachite/specs/synchronization/valuesync/README.md` + +## Required Gate Behavior + +1. **Source of truth = CL decided head (store), not EL HTTP.** +2. **Before proposing or voting**, call FCU with `payloadAttributes=None`. + - `VALID` → proceed + - `SYNCING/INVALID` → refuse proposal/vote (nil) +3. **Do not use `eth_getBlockByNumber(latest)` for consensus gating.** + +## Failure Modes & Expected Behavior + +| Condition | Expected Behavior | +| ----------------------------------------------------------- | --------------------------------------------------------------------------- | +| FCU returns **SYNCING** | Do not propose/vote; allow round timeout → nil vote; retry FCU later. | +| FCU returns **INVALID** | Do not propose/vote; mark EL degraded and retry later. | +| FCU returns **ACCEPTED** | Treat as Engine API spec violation; fail the gate and refuse proposal/vote. | +| FCU returns **VALID** but `latestValidHash` mismatches head | Treat as spec violation; fail the gate and refuse proposal/vote. | +| EL not reachable (FCU call fails) | Fail the gate; do not propose/vote; retry until EL is reachable. | +| No decided head on disk (fresh genesis) | Seed `latest_block` from execution `genesis.json` (same as EL). 
| +| Missing execution genesis path | Fail fast at startup; do not enter consensus. | + +## Implementation (Ultramarine) + +### App gating + +- Startup alignment: `crates/node/src/app.rs` (ConsensusReady → FCU gate) +- Proposer gate: `crates/node/src/app.rs` (`GetValue`) +- Validator gate: `crates/node/src/app.rs` (`ReceivedProposalPart`) +- Execution genesis bootstrap: `crates/node/src/node.rs` + +### Engine API client + +- FCU status handling: `crates/execution/src/client.rs` + +### Decided path + +- FCU‑before‑newPayload during sync: `crates/consensus/src/state.rs` +- MetadataOnly sync advances `latest_block`: `crates/consensus/src/state.rs` + +### Design decisions + +- `CLAUDE.md` → **Engine API Design Decisions** + +## Execution Genesis Bootstrap (No HTTP) + +Ultramarine no longer uses `eth_getBlockByNumber(latest)` to seed `latest_block`. +Instead, it computes the execution genesis header locally from the same `genesis.json` +used by load-reth, then uses that as the initial `latest_block` when no decided head +exists in the CL store. + +**Inputs** + +- CLI: `--execution-genesis-path=/path/to/genesis.json` +- Env: `ULTRAMARINE_EL_GENESIS_JSON=/path/to/genesis.json` + +**Derived fields** + +- `state_root` computed from `alloc` (MPT root). +- Fork-gated fields set if active at genesis: + - London: `base_fee_per_gas` + - Shanghai: `withdrawals_root` + - Cancun: `parent_beacon_block_root`, `blob_gas_used`, `excess_blob_gas` + - Prague: `requests_hash` + +**Why** + +- Eliminates HTTP RPC dependency for consensus-critical initialization. +- Ensures CL/EL agree on genesis header without importing `reth-chainspec`. 
+ +## Tests (Tier‑1, make itest-node) + +- `full_node_el_syncing_degrades_node` +- `full_node_el_syncing_still_sends_fcu` +- `full_node_el_syncing_blocks_payload_build` +- `full_node_el_transient_syncing_recovers` +- `full_node_fcu_gate_does_not_require_http_latest` +- `full_node_fcu_accepted_rejected` +- `full_node_split_head_recovery` + +Location: `crates/test/tests/full_node/node_harness.rs`\ +Runner: `Makefile` → `itest-node` + +## Harness Invariant (Engine Stub Hashes) + +The Tier‑1 harness uses synthetic block hashes (`[height as u8; 32]`) to map +heights inside the Engine API stub. The stub must **not** interpret arbitrary +hashes as heights; otherwise it can jump its internal head and produce +`BuiltPayloadMismatch` errors during `getPayload`. + +See: `docs/knowledge_base/itest-node-harness.md` + +## Operations (Ansible) + +CL restart must wait for EL readiness (`eth_syncing == false`) **after IPC socket appears**: + +- `infra/ansible/playbooks/roll.yml` +- `infra/ansible/playbooks/deploy.yml` +- `infra/ansible/playbooks/blockscout.yml` + +## Runbook: Detecting & Resolving Split‑Head + +### Symptoms + +- Repeated `Parent hash mismatch` in CL logs at the same height. +- Two validator groups each reject the other’s proposals. +- RPC/Blockscout shows a head that disagrees with some validators. + +### Quick Checks + +1. **Compare CL decided head vs EL head** + - CL: check latest decided height/hash in `store.db` (or logs). + - EL: `engine_forkchoiceUpdated` (FCU) status for that head; do **not** trust `eth_getBlockByNumber(latest)` for gating. +2. **Confirm EL readiness** + - `eth_syncing == false` (per Ansible readiness checks). + +### Recovery Steps + +1. **Realign EL to CL decided head** + - Apply FCU to the CL decided head (no payload attributes). + - If `SYNCING`, wait and retry; do not propose/vote in the meantime. +2. **Restart order (if needed)** + - Restart EL first, wait for IPC socket + `eth_syncing == false`, then restart CL. +3. 
**Verify recovery** + - Parent mismatch errors stop. + - Validators converge on the same head and heights advance. + +### Note on node‑rpc / public views + +RPC nodes may align with a single validator group (as seen in BUG‑013).\ +If RPC head diverges from the validator majority, public explorers can show a forked view. +Treat RPC alignment as **observability**, not consensus truth. + +## Related Docs + +- CL runtime settings: `docs/knowledge_base/cl-runtime.md` +- Execution genesis bootstrap: `docs/knowledge_base/execution-genesis.md` +- EL persistence for 1-slot finality: `docs/knowledge_base/el-persistence.md` + +## Further Reading + +- Incident report (example scenario): `docs/journal/BUG-013-chain-split-parent-mismatch.md` diff --git a/docs/knowledge_base/cl-runtime.md b/docs/knowledge_base/cl-runtime.md new file mode 100644 index 0000000..a555148 --- /dev/null +++ b/docs/knowledge_base/cl-runtime.md @@ -0,0 +1,70 @@ +# CL Runtime Configuration + +## Config Location + +`config/config.toml` in each node's home directory. +Fibernet: `ultramarine/infra/networks/fibernet/bundle/private/ultramarine/homes/node-*/config/config.toml` + +## Runtime Settings + +### [runtime] flavor + +- `single_threaded` - development (default) +- `multi_threaded` - **production** + +Code: `malachite/code/crates/config/src/lib.rs:687` + +Single-threaded adds overhead under high load. 
+ +### [logging] log_level + +- `debug` - default, high I/O overhead +- `info` - **production** +- `warn` / `error` - minimal logging + +Code: `malachite/code/crates/config/src/lib.rs:707` + +## Production Config + +```toml +[logging] +log_level = "info" +log_format = "plaintext" + +[runtime] +flavor = "multi_threaded" +``` + +## Consensus Timeouts + +| Parameter | Default | Notes | +| ------------------- | ------- | ------------------------------------ | +| `timeout_propose` | 3s | Safety valve, not throughput limiter | +| `timeout_prevote` | 1s | Voting phase | +| `timeout_precommit` | 1s | Voting phase | +| `timeout_*_delta` | 500ms | Increase per round | + +**Note:** Timeouts do NOT limit happy-path throughput. Blocks broadcast immediately when ready. + +## Troubleshooting + +### High CPU in consensus + +1. Set `log_level = "info"` (not debug) +2. Set `runtime.flavor = "multi_threaded"` + +### Log I/O bottleneck + +Symptoms: high disk write, slow consensus. +Fix: `log_level = "info"` or `"warn"` + +### Proposals timing out + +1. Check EL (load-reth) health +2. Verify Engine API IPC responsive +3. 
Check network latency between validators + +## Related + +- [Block Timing](./block-timing.md) - timestamp invariants +- [EL Gas Limits](./el-gas-limits.md) - builder/txpool config diff --git a/docs/knowledge_base/el-gas-limits.md b/docs/knowledge_base/el-gas-limits.md new file mode 100644 index 0000000..e34cb62 --- /dev/null +++ b/docs/knowledge_base/el-gas-limits.md @@ -0,0 +1,88 @@ +# EL Gas Limits Configuration + +## Load Network Constants + +- `LOAD_EXECUTION_GAS_LIMIT = 2,000,000,000` (2B) +- Location: `load-reth/src/chainspec/mod.rs:32` + +## Critical: Reth Defaults vs Load Requirements + +| Parameter | Reth Default (custom chain) | Load Requirement | CLI Flag | +| ------------------------ | --------------------------- | ---------------- | ---------------------------- | +| Builder gas limit | 36M | 2B | `--builder.gaslimit` | +| Txpool gas limit | 30M | 2B | `--txpool.gas-limit` | +| Txpool max-account-slots | 16 | 32 | `--txpool.max-account-slots` | +| Txpool pending-max-size | 20MB | 512MB | `--txpool.pending-max-size` | + +## Builder Configuration + +### --builder.gaslimit + +Target gas limit for built blocks. Reth defaults custom chains to 36M. + +- Code: `reth/crates/node/core/src/cli/config.rs:44-55` +- Load fix: `load-reth/src/engine/builder.rs:83` defaults to `LOAD_EXECUTION_GAS_LIMIT` + +### --builder.interval + +Period between payload rebuilds. Default 1s, recommended 50ms for Load. + +### --builder.deadline + +Time budget for payload build. Default 12s, recommended 2s for Load. +Too low = underfilled blocks. Too high = CL can't propose in time. + +### --builder.max-tasks + +Concurrent payload builds. Default 3, recommended 10 for Load. + +## Txpool Configuration + +### --txpool.gas-limit + +Max gas limit for accepted transactions. **Must match builder.gaslimit**. + +- Code: `reth/crates/transaction-pool/src/config.rs:125` +- Default: 30M (`ETHEREUM_BLOCK_GAS_LIMIT_30M`) +- Txs with `gas > txpool.gas-limit` rejected at pool ingress. 
+ +### --txpool.max-account-slots + +Max pending txs per sender. Must be >= load-blaster `max_pending` (32). + +### --txpool.pending-max-size / queued-max-size + +Pool size in MB. 512MB recommended for high throughput. + +## Systemd Template + +`ultramarine/infra/templates/systemd/load-reth@.service.j2`: + +```bash +--builder.gaslimit={{ loadnet_el_builder_gaslimit | default(2000000000) }} +--builder.interval={{ loadnet_el_builder_interval | default("50ms") }} +--builder.deadline={{ loadnet_el_builder_deadline | default(2) }} +--builder.max-tasks={{ loadnet_el_builder_max_tasks | default(10) }} +--txpool.gas-limit={{ loadnet_el_txpool_gas_limit | default(2000000000) }} +--txpool.max-account-slots={{ loadnet_el_txpool_max_account_slots | default(32) }} +--txpool.pending-max-count={{ loadnet_el_txpool_pending_max_count | default(50000) }} +--txpool.queued-max-count={{ loadnet_el_txpool_queued_max_count | default(50000) }} +--txpool.pending-max-size={{ loadnet_el_txpool_pending_max_size | default(512) }} +--txpool.queued-max-size={{ loadnet_el_txpool_queued_max_size | default(512) }} +``` + +## Troubleshooting + +### Empty/underfilled blocks despite pending txs + +1. Check `builder.gaslimit` = 2B +2. Check `txpool.gas-limit` = 2B +3. Increase `builder.deadline` if builds timeout + +### Txs rejected "exceeds gas limit" + +`txpool.gas-limit` < tx gas limit. Set to 2B. + +### Pool shows 0 pending during load + +`max-account-slots` < load-blaster `max_pending`. Set to 32+. diff --git a/docs/knowledge_base/el-persistence.md b/docs/knowledge_base/el-persistence.md new file mode 100644 index 0000000..cd50340 --- /dev/null +++ b/docs/knowledge_base/el-persistence.md @@ -0,0 +1,82 @@ +# EL Persistence for 1-Slot Finality + +## Problem + +Reth defaults to keeping up to **2 canonical blocks in memory** before persisting to disk. 
+For chains with **single-block finality**, that creates a restart hazard: + +- Block N is finalized by the CL +- EL has not persisted N yet (still in memory) +- EL restarts -> block N is missing on disk +- CL sends FCU to head=N -> EL returns **SYNCING** +- CL/EL diverge and consensus can stall + +Ethereum mainnet tolerates this because finality is ~64 blocks. Load cannot. + +## Solution: Hardcoded in load-reth (v1.10.2+) + +As of reth v1.10.2, the `DefaultEngineValues` API allows setting engine defaults programmatically. +load-reth now **hardcodes `persistence_threshold=0`** in `main.rs` before CLI parsing: + +```rust +use reth_node_core::args::DefaultEngineValues; + +DefaultEngineValues::default() + .with_persistence_threshold(0) + .try_init() + .ok(); +``` + +This ensures: + +- The setting is always applied regardless of command-line flags +- No ops/infra configuration can accidentally override it +- Defense-in-depth: infra configs still pass the flag, but it's not required + +**Note:** `try_init()` is best-effort; if defaults were already set earlier in the +process (tests or other binaries), it will no-op. That’s why infra still passes +`--engine.persistence-threshold=0` as a secondary guard. + +## Why persistence_threshold=0 Is Correct for Load + +- Every canonical block is final immediately (1-slot finality) +- There is no reorg window to benefit from buffering +- Disk I/O overhead is minimal at 1 block/sec + +## Configuration Layers (Defense-in-Depth) + +### Primary: Hardcoded in load-reth binary + +`load-reth/src/main.rs` sets the default before CLI parsing. This is the authoritative layer. + +### Secondary: Infra configuration (redundant but safe) + +- Systemd: `infra/templates/systemd/load-reth@.service.j2` + - `--engine.persistence-threshold={{ loadnet_el_engine_persistence_threshold | default(0) }}` +- Docker compose: + - `compose.yaml` + - `compose.ipc.yaml` + +## Verification + +After restart: + +1. CL applies FCU to decided head. +2. 
EL returns **VALID** (not SYNCING). +3. Proposals/votes resume without split-head. + +If EL still returns SYNCING, check: + +- EL datadir is intact +- persistence flag is present +- no manual pruning wiped recent blocks + +## Notes + +- Chains with multi-block finality can use higher thresholds. +- For Load, **threshold=0** is the safe default. + +## Related + +- CL/EL head gate: `docs/knowledge_base/cl-el-head-gating.md` +- Incident report: `docs/journal/BUG-013-chain-split-parent-mismatch.md` diff --git a/docs/knowledge_base/execution-genesis.md b/docs/knowledge_base/execution-genesis.md new file mode 100644 index 0000000..58ea051 --- /dev/null +++ b/docs/knowledge_base/execution-genesis.md @@ -0,0 +1,45 @@ +# Execution Genesis Bootstrap + +## Goal + +Make CL and EL agree on the same execution genesis header **without** HTTP RPC or +`reth-chainspec` dependencies. + +## Inputs + +The CL reads the same `genesis.json` used by load-reth (`--chain`). + +- CLI: `--execution-genesis-path=/path/to/genesis.json` +- Env: `ULTRAMARINE_EL_GENESIS_JSON=/path/to/genesis.json` + +## Behavior + +At startup, Ultramarine builds an `ExecutionBlock` from the execution genesis file +and uses it as `latest_block` when there is no decided head in the CL store. + +Derived fields: + +- `state_root` computed from `alloc` (MPT root) +- Fork-gated header fields (if active at genesis): + - London: `base_fee_per_gas` + - Shanghai: `withdrawals_root` + - Cancun: `parent_beacon_block_root`, `blob_gas_used`, `excess_blob_gas` + - Prague: `requests_hash` + +## Implementation + +- `crates/node/src/node.rs` builds the header and hashes it to `block_hash`. +- `crates/cli/src/cmd/start.rs` exposes `--execution-genesis-path`. +- Infra wiring: + - Systemd: `infra/templates/systemd/ultramarine@.service.j2` mounts `/assets` and passes `--execution-genesis-path=/assets/genesis.json`. + - Docker compose: `compose.ipc.yaml` passes `--execution-genesis-path=/assets/genesis.json`. 
+ +## Failure Mode + +If the execution genesis path is missing or invalid, startup fails fast to avoid +silent CL/EL divergence. + +## Related + +- CL/EL head gate: `docs/knowledge_base/cl-el-head-gating.md` +- EL persistence: `docs/knowledge_base/el-persistence.md` diff --git a/docs/knowledge_base/itest-node-harness.md b/docs/knowledge_base/itest-node-harness.md new file mode 100644 index 0000000..b97c031 --- /dev/null +++ b/docs/knowledge_base/itest-node-harness.md @@ -0,0 +1,48 @@ +# Tier‑1 Full‑Node Harness Notes + +## Purpose + +Document invariants and failure modes for the Tier‑1 full‑node harness (`make itest-node`). +These tests are **integration‑critical** for Engine API gating, proposal flow, and blob handling. + +## Key Invariant: Engine Stub Block Hashes + +The Engine API stub encodes test heights into synthetic block hashes: + +- `block_hash_for_height(h)` → `B256([h as u8; 32])` +- `height_from_block_hash(hash)` **only** returns a height if **all bytes match** + (i.e., the hash was generated by `block_hash_for_height`). + +This prevents the stub from interpreting a real genesis hash as a “height” and +silently jumping its internal head. + +## Failure Mode (What Breaks) + +If the stub interprets arbitrary block hashes as heights (e.g., using only `hash[0]`): + +- A real genesis hash can be misread as height ~200+. +- `forkchoiceUpdated` updates `latest_block` to the wrong height. +- `getPayload` builds on a mismatched head → `BuiltPayloadMismatch` in + `crates/execution/src/client.rs`. +- Tier‑1 tests fail with missing blobs (proposal never materializes). + +## Fix (2026‑01‑26) + +Hardened `height_from_block_hash` to return a height **only** when the hash +matches the synthetic pattern. + +Code: + +- `crates/test/tests/full_node/node_harness.rs` + - `height_from_block_hash()` checks all bytes are identical before returning a height. 
+ +## Tests + +Run from `ultramarine/`: + +- `make itest-node` + +## Related + +- CL↔EL head gate: `docs/knowledge_base/cl-el-head-gating.md` +- BUG‑013 postmortem: `docs/journal/BUG-013-chain-split-parent-mismatch.md` diff --git a/docs/knowledge_base/p2p-sync-limits.md b/docs/knowledge_base/p2p-sync-limits.md new file mode 100644 index 0000000..c6782b7 --- /dev/null +++ b/docs/knowledge_base/p2p-sync-limits.md @@ -0,0 +1,160 @@ +# P2P and Sync Size Limits + +Terms: + +- `P2P` = peer-to-peer gossip and request/response transport between nodes. +- `RPC` here refers to P2P request/response limits (not JSON-RPC to EL). +- `ValueSync` = protocol path used to transfer decided values during catch-up. + +## Problem + +During high-throughput load tests, blocks can grow to **~5-12 MB** with many transactions +(2026-02-10 baseline hot segment peaked at ~11.6 MB). +Default P2P/sync limits (~1-10 MB) cause: + +- Sync requests rejected at P2P layer +- Nodes falling behind and unable to catch up +- `WARN: Beacon client online, but no consensus updates received` + +Log meaning: + +- This warning usually indicates consensus is running but block update propagation/sync is unhealthy. 
+
+## Configuration Parameters
+
+| Parameter           | Location                       | Default           | Load Test Value | Purpose                 |
+| ------------------- | ------------------------------ | ----------------- | --------------- | ----------------------- |
+| `pubsub_max_size`   | `consensus.p2p`, `mempool.p2p` | 4 MiB (~4.2 MB)   | **50 MiB**      | P2P gossip message size |
+| `rpc_max_size`      | `consensus.p2p`, `mempool.p2p` | 10 MiB (~10.5 MB) | **100 MiB**     | P2P RPC response size   |
+| `max_request_size`  | `[sync]`                       | 1 MiB             | **50 MiB**      | ValueSync request size  |
+| `max_response_size` | `[sync]`                       | 10 MiB (~10.5 MB) | **500 MiB**     | ValueSync response size |
+
+## Manifest Configuration
+
+In `manifests/<net>.yaml`:
+
+```yaml
+sync:
+  enabled: true
+  max_request_size: "50 MiB"
+  max_response_size: "500 MiB"
+  request_timeout: "60s"
+  parallel_requests: 100
+  batch_size: 5
+  fullnode:
+    parallel_requests: 100
+    request_timeout: "60s"
+    max_response_size: "500 MiB"
+    batch_size: 10
+
+p2p:
+  pubsub_max_size: "50 MiB"
+  rpc_max_size: "100 MiB"
+```
+
+## Code Locations
+
+### Netgen (config generation)
+
+- `infra/gen/netgen/src/main.rs:145-165` - Sync struct definition
+- `infra/gen/netgen/src/main.rs:188-200` - P2pConfig struct
+- `infra/gen/netgen/src/main.rs:956-998` - Size limit application
+
+### Config files
+
+- `infra/networks/<net>/bundle/private/ultramarine/homes/node-*/config/config.toml`
+
+## Critical Insight
+
+**ALL nodes** must have large limits — not just receivers.
+
+The SENDER also needs high `rpc_max_size` to send large sync responses. If only receiving
+nodes have large limits, sync still fails because sending nodes cannot serialize responses.
+ +## Capacity Planning + +With 1024 blobs per block (high DA capacity): + +- Max block size: 1024 × 131,072 bytes = ~134 MB +- Requires: `max_response_size: "500 MiB"` (includes batch overhead) + +For regular load tests (no blobs, many transactions): + +- Observed block size in hot segments: ~5-11.6 MB (PERF-SUMMARY, 2026-02-10 baseline) +- Requires: `pubsub_max_size: "50 MiB"`, `rpc_max_size: "100 MiB"` + +## Manual Fix (without netgen) + +Preferred path: + +- Update `manifests/.yaml` and regenerate configs via netgen. +- Use the manual script below only as a recovery shortcut. + +If nodes are stuck after load test, update configs on ALL hosts: + +```bash +# Create update script +cat > /tmp/update-sync-sizes.sh << 'EOF' +#!/bin/bash +for config in /var/lib/ultramarine/*/config/config.toml; do + sed -i.bak \ + -e 's/pubsub_max_size = ".*"/pubsub_max_size = "50 MiB"/g' \ + -e 's/rpc_max_size = ".*"/rpc_max_size = "100 MiB"/g' \ + -e 's/max_request_size = ".*"/max_request_size = "50 MiB"/g' \ + -e 's/max_response_size = ".*"/max_response_size = "500 MiB"/g' \ + "$config" +done +EOF + +# Run on ALL hosts, then restart +for host in LON2 AMS FRA2 RPC; do + scp /tmp/update-sync-sizes.sh ubuntu@$host:/tmp/ + ssh ubuntu@$host 'sudo bash /tmp/update-sync-sizes.sh' + ssh ubuntu@$host 'sudo systemctl restart ultramarine@*' +done +``` + +## Troubleshooting + +### Sync stalls after load test + +1. Check node logs for `Sync tip unchanged for too long` +2. Verify all nodes have matching large size limits +3. Restart nodes after config update + +### P2P messages rejected + +Symptom: Messages dropped silently, no error in logs. + +Fix: Increase `pubsub_max_size` and `rpc_max_size` on ALL nodes. + +### ValueSync timeout + +Symptom: `Sync request timed out` in logs. + +Fix: + +1. Increase `request_timeout` to 60s+ +2. Increase `max_response_size` to accommodate large blocks +3. 
Ensure sending nodes also have large `rpc_max_size` + +## Validation + +**Historical validation (2026-02-06):** after applying these fixes, AMS + FRA2 achieved **0% errors** during the 60k‑target sharded run (~40k+ documented in captured logs; LON2 output missing). + +**Latest validation (2026-02-10):** PERF-SUMMARY shows a clean 20k/60s probe on all three hosts with **0 errors** and `txpool pending=0, queued=0` everywhere. Total submission: **1,823,713 tx** (~30,395 TPS). See PERF-SUMMARY for current baseline evidence and consolidated phase metrics. + +| Shard | Submitted | Avg TPS | Errors | +| --------- | ------------- | ----------- | ------ | +| LON2 | 924,008 | 15,384.19 | **0** | +| AMS | 321,788 | 5,352.32 | **0** | +| FRA2 | 577,917 | 9,631.21 | **0** | +| **Total** | **1,823,713** | **~30,395** | **0** | + +See `../journal/PERF-SUMMARY-fibernet-throughput-journey.md` for full baseline details. + +## Related + +- [CL Runtime](./cl-runtime.md) - logging and threading config +- [EL Gas Limits](./el-gas-limits.md) - builder and txpool config +- [Block Timing](./block-timing.md) - timestamp invariants diff --git a/dprint.json b/dprint.json index 6c7854f..d2a11f3 100644 --- a/dprint.json +++ b/dprint.json @@ -8,5 +8,8 @@ "includes": [ "**/*.toml", "**/*.md" + ], + "excludes": [ + "**/Cargo.toml" ] } diff --git a/infra/.gitignore b/infra/.gitignore new file mode 100644 index 0000000..dbf3a59 --- /dev/null +++ b/infra/.gitignore @@ -0,0 +1,31 @@ +# Generated bundles (public artifacts may be committed per-network; private artifacts never). +networks/*/bundle/private/ + +# Plaintext secrets files (never commit). Prefer secrets.sops.yaml for committable encrypted secrets. +networks/*/secrets.yaml +networks/*/secrets.local.yaml +networks/*/secrets.plain.yaml + +# Common secret material +**/jwtsecret +**/*.jwt +**/*.key +**/*.pem +**/*.agekey +**/secrets.env + +# SSH private keys should never live in-repo (keep them in ~/.ssh or a secret manager). 
+**/id_rsa +**/id_ed25519 +**/*.ppk + +# Optional per-network SSH helper files (local only). +networks/*/ssh_config +networks/*/known_hosts +networks/*/net.mk + +# Default net selector (local only). +.net + +# Optional local SOPS config (if you don't want to commit it) +.sops.yaml.local diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..52a1e6d --- /dev/null +++ b/infra/README.md @@ -0,0 +1,167 @@ +# Infra (multi-host) tooling + +This folder contains the manifest-driven infra scaffolding for multi-host +Load Network deployments. + +Primary references: + +- `infra/README.md` (this document) for operator workflows and commands. +- `docs/FINAL_PLAN.md` for architecture context. +- `docs/knowledge_base/p2p-sync-limits.md` for size-limit tuning under load. + +## Netgen + +Netgen is a Rust binary in the workspace (source at `infra/gen/netgen/`). + +- Validates a manifest: + - `cargo run -p ultramarine-netgen --bin netgen -- validate --manifest infra/manifests/.yaml` +- Generates `infra/networks//network.lock.json` + public bundle outputs: + - `cargo run -p ultramarine-netgen --bin netgen -- gen --manifest infra/manifests/.yaml --out-dir infra/networks/` +- Optionally provides per-node secrets (plaintext or sops-encrypted): + - `cargo run -p ultramarine-netgen --bin netgen -- gen --manifest infra/manifests/.yaml --out-dir infra/networks/ --secrets-file infra/networks//secrets.sops.yaml` + - By default, `gen` fails if any validator is missing an archiver bearer token (required for a bootable testnet). For non-bootable dry-runs (e.g. to bootstrap storage), use `make net-plan NET=` (or pass `--allow-missing-archiver-tokens` directly). 
+ - To create the encrypted file from a plaintext `secrets.yaml`, use: + - `make net-secrets-encrypt NET= SOPS_AGE_RECIPIENT= REMOVE_PLAINTEXT=true` + +Generated outputs (current): + +- `infra/networks//network.lock.json` (deterministic lockfile: placements, ports, enodes/bootnodes, artifact checksums) +- `infra/networks//bundle/public/genesis.json` (EL genesis; Prague/Cancun at genesis) +- `infra/networks//bundle/public/network.json` (machine-readable network description) +- `infra/networks//inventory.yml` (Ansible inventory) +- `infra/networks//bundle/private/load-reth/p2p-keys/.key` (stable EL identity; never commit) +- `infra/networks//bundle/private/env/ultramarine-.env` (runtime variables for systemd/Ansible) +- `infra/networks//bundle/private/env/load-reth-.env` (runtime variables for systemd/Ansible) +- `infra/networks//bundle/private/ultramarine/secrets/.env` (archiver bearer token env file; derived from secrets; never commit) +- `infra/networks//bundle/private/monitoring/grafana_admin_password.env` (Grafana admin password; base64-encoded, derived from secrets or auto-generated on deploy; never commit) +- `infra/networks//bundle/private/ultramarine/homes//config/{config.toml,genesis.json,priv_validator_key.json}` (Ultramarine home skeleton; `priv_validator_key.json` is generated for every node; only `role=validator` nodes are in the genesis validator set; never commit) + +Notes: + +- Deploys are **Engine IPC-only**. +- Validators require archiver config; bearer tokens are expected via decrypted + secrets in `infra/networks//secrets.sops.yaml`. +- If `blockscout.enabled=true` in the manifest, `net-deploy` / `net-launch` will also deploy Blockscout + nginx on the configured host. + +## Ansible Design Choices (Intentional) + +- Services are managed as systemd units that invoke `docker run` directly for explicit host-level control. +- Secrets are managed via SOPS instead of ansible-vault to match team workflows and rotation practices. 
+- Some checks still use `command/shell` where no reliable module exists (e.g., UFW or socket inspection). + +## Deploy (M3, systemd + Docker) + +Ansible is the deploy layer. It copies artifacts to hosts and installs systemd units that run pinned container images. + +Host layout: + +- Network artifacts: `/opt/loadnet/networks//...` +- Active network symlink: `/opt/loadnet/current -> /opt/loadnet/networks/` +- Persistent EL state: `/var/lib/load-reth//` +- Persistent CL state: `/var/lib/ultramarine//` +- Engine IPC: `/run/load-reth//engine.ipc` + +Operator commands (from `ultramarine/`): + +- Set default network (so you can omit `NET=`): `make net-use NET=` (clears via `make net-unset`) +- If you prefer not to set a default, append `NET=` to any command below. +- Dry-run / bootstrap (no secrets yet): `make net-plan` (generates inventory/lockfile/bundles but the network won’t be bootable without archiver tokens for validators) +- Generate artifacts: `make net-gen` (auto-uses `infra/networks//secrets.sops.yaml` if present) +- One-command bootstrap (plan + storage + pre-doctor): `make net-bootstrap` +- One-command go-live (gen + storage + deploy + post-doctor + health): `make net-launch` +- One-command update (gen + apply + roll + health): `make net-update` +- Deploy to hosts (default: ensures services are running; no restarts if already running): `make net-deploy` +- Apply + restart immediately (disruptive): `make net-redeploy` +- Rolling restart (disruptive; may halt small nets): `make net-roll ROLL_CONFIRM=YES` +- Start/restart: `make net-up` / Stop: `make net-down` +- Inspect: `make net-status` / `make net-logs LINES=200` +- Log cleanup (vacuum journald; optional syslog rotation/truncation): `make net-clean-logs JOURNAL_VACUUM_SIZE=1G` +- Health: `make net-health` +- Preflight (pre-deploy): `make net-doctor-pre` +- Diagnostics (post-deploy): `make net-doctor` +- Firewall: `make net-firewall` (or `make net-deploy APPLY_FIREWALL=true`) +- Storage bootstrap: `make 
net-storage` (see notes below) +- Wipe network from hosts (destructive): `make net-wipe WIPE_CONFIRM=YES` (tune with `WIPE_STATE=true|false`, `WIPE_MONITORING=true|false`, `WIPE_CONTAINERS=true|false`, `WIPE_FIREWALL=true|false`, `WIPE_NODES=node-0`, and `LIMIT=`) +- Limit any Ansible run to a single host: add `LIMIT=` (e.g. `make net-storage LIMIT=lon2-0`) +- SSH key: pass `SSH_KEY=/path/to/key` (or use ssh-agent / `~/.ssh/config`). +- Local checks: `make infra-checks` + +Notes: + +- Controller requirement: use an Ansible version compatible with your controller Python. + - `ansible-core 2.15.x` is not compatible with Python `3.14+`. If you used `pipx` and see errors involving `ast.Str`, reinstall Ansible with Python 3.11/3.12 (example: `pipx reinstall --python python3.12 ansible-core`). +- Storage bootstrap is intentionally separate from deploy. +- `net-gen` auto-uses `infra/networks//secrets.sops.yaml` if present. To skip secrets, run `make net-gen SECRETS_FILE=`. +- Optional per-network defaults: create `infra/networks//net.mk` (Makefile syntax) to avoid long `VAR=...` overrides. Example: + +```make +APPLY_FIREWALL = true +MOVE_DOCKER_DATAROOT = true +BIND_VAR_LOG = true +PROMETHEUS_BIND = 127.0.0.1 +GRAFANA_BIND = 127.0.0.1 +EL_HTTP_BIND = 0.0.0.0 +SSH_KEY = ~/.ssh/your_key +``` + +- If your host image enforces a broken apt proxy, set `APT_DISABLE_PROXY=true` (writes `/etc/apt/apt.conf.d/99loadnet-no-proxy`). +- In non-destructive mode, `net-storage` expects the data volume to be mounted at `DATA_MOUNTPOINT` (default: `/var/lib/loadnet`). +- If your provider image mounts the data volume elsewhere (common: `/home`), `net-storage` auto-adopts it by bind-mounting `DATA_SOURCE_DIR` (default: `/home/loadnet`) into `DATA_MOUNTPOINT` and persists it in `/etc/fstab` with systemd mount ordering (`x-systemd.requires-mounts-for=/home`). 
+
+- Destructive provisioning requires explicit device IDs and an explicit flag, e.g.:
+  - `make net-storage NET=<net> STORAGE_WIPE=true DATA_DEVICES="['/dev/disk/by-id/nvme-...','/dev/disk/by-id/nvme-...']" DATA_RAID_LEVEL=1 MOVE_DOCKER_DATAROOT=true`
+- You don’t need to care about the mdadm “device number” (e.g. `/dev/md127`): the playbook defaults to creating `/dev/md/loadnet-data` and mounts by filesystem UUID in `/etc/fstab`. The only per-host detail is selecting the underlying NVMe devices (by-id is safest); discover them with e.g. `ssh <host> 'ls -la /dev/disk/by-id | grep nvme | grep -v part'`.
+- `net-deploy` fails fast if `infra/manifests/<net>.yaml` changed without regenerating `network.lock.json`.
+- To deploy without restarting running nodes: `make net-deploy NET=<net>` (or `make net-apply NET=<net>`)
+- To restart after a deploy: `make net-roll NET=<net> ROLL_CONFIRM=YES` (recommended over restarting all at once)
+- `net-deploy` verifies `bundle/public/genesis.json` checksum against the lockfile on each host.
+- Firewall automation is idempotent, keeps SSH allowed, and opens only P2P ports by default.
+- For non-`example` networks, `netgen validate` rejects placeholder archiver URLs (e.g. `archiver.example.com`).
+
+## Logging and disk pressure
+
+Defaults (applied by Ansible):
+
+- Journald is capped to protect the root disk (defaults: 2G total, 200M files, 7 days). If you need different values, override `loadnet_journal_max_use`, `loadnet_journal_max_file_size`, or `loadnet_journal_max_retention` in your Ansible vars.
+- Docker logs use the `local` driver with rotation (defaults: `max-size=50m`, `max-file=5`).
+- If you enable journald forwarding to syslog (`loadnet_forward_journal_to_syslog=true`), rsyslog logrotate policy is installed (defaults: 100M × 5).
+- Doctor checks fail if `/` > 85% or `/var/log` > 4GB by default. Override with `loadnet_root_fs_max_pct` and `loadnet_log_dir_max_mb`.
+ +Operational tips: + +- Keep `/var/log` off the root disk by running `make net-storage BIND_VAR_LOG=true` (optionally set `LOG_DIR=`). +- Recovery shortcut: `make net-clean-logs JOURNAL_VACUUM_SIZE=1G` (vacuum journald; rotates/truncates syslog only when forwarding is enabled). + +## Accessing Monitoring (Grafana / Prometheus) + +For security, Grafana and Prometheus bind to `127.0.0.1` (localhost only) by default. Access them via SSH tunnel: + +```bash +# Grafana (port 3000) +ssh -L 3000:127.0.0.1:3000 ubuntu@ +# Then open http://localhost:3000 in your browser + +# Prometheus (port 9090) +ssh -L 9090:127.0.0.1:9090 ubuntu@ +# Then open http://localhost:9090 in your browser + +# Both at once (different local ports if needed) +ssh -L 3000:127.0.0.1:3000 -L 9090:127.0.0.1:9090 ubuntu@ +``` + +To override binding (not recommended for production): + +- Grafana: `make net-deploy NET= GRAFANA_BIND=0.0.0.0` +- Prometheus: `make net-deploy NET= PROMETHEUS_BIND=0.0.0.0` + +## Rotating Archiver Secrets + +To rotate archiver bearer tokens without regenerating validator keys: + +```bash +# 1. Update your secrets.sops.yaml with new tokens +# 2. Regenerate bundle (keys are preserved) +make net-update-secrets NET= SECRETS_FILE=infra/networks//secrets.sops.yaml + +# 3. 
Deploy the updated secrets +make net-deploy NET= +``` diff --git a/infra/ansible/.ansible-lint b/infra/ansible/.ansible-lint new file mode 100644 index 0000000..5b72819 --- /dev/null +++ b/infra/ansible/.ansible-lint @@ -0,0 +1,4 @@ +--- +skip_list: + - var-naming[no-role-prefix] + - command-instead-of-module diff --git a/infra/ansible/.yamllint b/infra/ansible/.yamllint new file mode 100644 index 0000000..a44d63e --- /dev/null +++ b/infra/ansible/.yamllint @@ -0,0 +1,23 @@ +--- +extends: default + +rules: + line-length: + max: 160 + level: warning + truthy: + allowed-values: ['true', 'false', 'yes', 'no'] + comments: + min-spaces-from-content: 1 + comments-indentation: false + document-start: + present: true + indentation: + spaces: 2 + indent-sequences: true + braces: + min-spaces-inside: 0 + max-spaces-inside: 1 + octal-values: + forbid-implicit-octal: true + forbid-explicit-octal: true diff --git a/infra/ansible/ansible.cfg b/infra/ansible/ansible.cfg new file mode 100644 index 0000000..da0c2a5 --- /dev/null +++ b/infra/ansible/ansible.cfg @@ -0,0 +1,12 @@ +[defaults] +inventory = ../../infra/networks/example/inventory.yml +stdout_callback = default +host_key_checking = True +retry_files_enabled = False +interpreter_python = auto_silent +roles_path = ./roles +ssh_args = -o IdentitiesOnly=yes -o PreferredAuthentications=publickey + +[privilege_escalation] +become = True +become_method = sudo diff --git a/infra/ansible/playbooks/blockscout.yml b/infra/ansible/playbooks/blockscout.yml new file mode 100644 index 0000000..ef62c93 --- /dev/null +++ b/infra/ansible/playbooks/blockscout.yml @@ -0,0 +1,187 @@ +--- +# Blockscout Explorer Deployment Playbook +# +# This playbook deploys: +# 1. Full node (load-reth + ultramarine) for Blockscout indexing +# 2. Blockscout explorer stack (backend, frontend, stats, visualizer) +# 3. 
Nginx reverse proxy with Let's Encrypt SSL +# +# Usage: make net-blockscout NET= +# +# Prerequisites: +# - DNS A-records configured for domains (explorer, stats, rpc) +# - Manifest includes blockscout configuration +# - net-gen has been run to generate lockfile and inventory +# +- name: Deploy Blockscout explorer with full node + hosts: all + gather_facts: true + become: true + vars: + loadnet_root: /opt/loadnet + loadnet_current: "{{ loadnet_root }}/current" + loadnet_networks: "{{ loadnet_root }}/networks" + loadnet_net: "{{ net }}" + loadnet_net_dir: "{{ loadnet_networks }}/{{ loadnet_net }}" + # Disable monitoring for blockscout host (no Prometheus/Grafana) + loadnet_monitoring_enabled: false + blockscout_manage_service: false + pre_tasks: + - name: Assert required vars + ansible.builtin.assert: + that: + - net is defined + - net_dir is defined + fail_msg: "net and net_dir must be defined" + + - name: Load lockfile + ansible.builtin.set_fact: + loadnet_lock: "{{ lookup('file', net_dir + '/network.lock.json') | from_json }}" + + - name: Build node->ports map + ansible.builtin.set_fact: + loadnet_ports_by_node: >- + {{ + dict( + loadnet_lock.nodes + | map(attribute='id') + | zip(loadnet_lock.nodes | map(attribute='ports')) + ) + }} + + roles: + # Phase 1: Base system setup + - blockscout_host + - firewall + - common + + # Phase 2: Full node deployment + - load_reth + - ultramarine + + # Phase 3: Blockscout explorer + - blockscout + - blockscout_nginx + + post_tasks: + # --- Full Node Startup --- + - name: Check /opt/loadnet/current state + ansible.builtin.stat: + path: "{{ loadnet_current }}" + follow: false + register: loadnet_current_stat + + - name: Backup existing /opt/loadnet/current directory (non-symlink) + ansible.builtin.command: + cmd: "mv {{ loadnet_current }} {{ loadnet_current }}.bak-{{ ansible_date_time.iso8601_basic_short }}" + changed_when: true + when: + - loadnet_current_stat.stat.exists + - loadnet_current_stat.stat.isdir + - not 
loadnet_current_stat.stat.islnk + + - name: Point /opt/loadnet/current to selected network + ansible.builtin.file: + src: "{{ loadnet_net_dir }}" + dest: "{{ loadnet_current }}" + state: link + force: true + + - name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + + - name: Start load-reth instance + ansible.builtin.systemd: + name: "load-reth@{{ item }}.service" + enabled: true + state: started + loop: "{{ loadnet_nodes }}" + + - name: Check for stale load-reth containers + ansible.builtin.command: + cmd: "docker inspect -f '{{ \"{{\" }}.State.Status{{ \"}}\" }}' load-reth-{{ item }}" + register: load_reth_container_status + loop: "{{ loadnet_nodes }}" + changed_when: false + failed_when: false + + - name: Remove stale load-reth containers + ansible.builtin.command: + cmd: "docker rm -f load-reth-{{ item.item }}" + loop: "{{ load_reth_container_status.results }}" + loop_control: + label: "{{ item.item }}" + when: + - item.rc == 0 + - item.stdout in ['exited', 'dead', 'created'] + changed_when: true + + - name: Wait for load-reth engine IPC sockets + ansible.builtin.wait_for: + path: "/run/load-reth/{{ item }}/engine.ipc" + state: present + timeout: 120 + loop: "{{ loadnet_nodes }}" + + - name: Wait for EL JSON-RPC to report not syncing + ansible.builtin.uri: + url: "http://127.0.0.1:{{ loadnet_ports_by_node[item].el_http }}" + method: POST + body_format: json + body: + jsonrpc: "2.0" + id: 1 + method: "eth_syncing" + params: [] + return_content: true + timeout: 2 + register: el_syncing_status + retries: 30 + delay: 2 + until: + - el_syncing_status.status == 200 + - el_syncing_status.json is defined + # eth_syncing returns false when synced, or object with currentBlock/highestBlock when syncing + # Consider synced if: result==false OR currentBlock >= highestBlock + - >- + el_syncing_status.json.result == false or + (el_syncing_status.json.result is mapping and + el_syncing_status.json.result.currentBlock is defined and + 
el_syncing_status.json.result.highestBlock is defined and + (el_syncing_status.json.result.currentBlock | int(base=16)) >= (el_syncing_status.json.result.highestBlock | int(base=16))) + loop: "{{ loadnet_nodes }}" + changed_when: false + + - name: Start ultramarine instance + ansible.builtin.systemd: + name: "ultramarine@{{ item }}.service" + enabled: true + state: started + loop: "{{ loadnet_nodes }}" + + # --- Blockscout Startup --- + - name: Start blockscout service + ansible.builtin.systemd: + name: blockscout.service + enabled: true + state: started + + - name: Wait for backend health + ansible.builtin.uri: + url: "http://127.0.0.1:4000/api/v2/stats" + method: GET + return_content: true + timeout: 10 + register: blockscout_health + retries: 30 + delay: 10 + until: blockscout_health.status == 200 + changed_when: false + + # --- Nginx Startup --- + - name: Start nginx-blockscout service + ansible.builtin.systemd: + name: nginx-blockscout.service + enabled: true + state: started diff --git a/infra/ansible/playbooks/clean_logs.yml b/infra/ansible/playbooks/clean_logs.yml new file mode 100644 index 0000000..9720e26 --- /dev/null +++ b/infra/ansible/playbooks/clean_logs.yml @@ -0,0 +1,106 @@ +--- +- name: Clean logs (vacuum journald, rotate syslog, restart EL/CL) + hosts: all + gather_facts: false + become: true + vars: + loadnet_root: /opt/loadnet + loadnet_networks: "{{ loadnet_root }}/networks" + loadnet_net: "{{ net }}" + loadnet_net_dir: "{{ loadnet_networks }}/{{ loadnet_net }}" + loadnet_journal_vacuum_size_value: "{{ loadnet_journal_vacuum_size | default('1G') }}" + loadnet_syslog_truncate_value: "{{ loadnet_syslog_truncate | default(true) }}" + loadnet_forward_journal_to_syslog_value: "{{ loadnet_forward_journal_to_syslog | default(false) }}" + tasks: + - name: Assert required vars + ansible.builtin.assert: + that: + - net is defined + - net_dir is defined + + - name: Load lockfile + ansible.builtin.set_fact: + loadnet_lock: "{{ lookup('file', net_dir + 
'/network.lock.json') | from_json }}" + + - name: Assert lockfile matches selected network + ansible.builtin.assert: + that: + - loadnet_lock.network.name == net + fail_msg: "network.lock.json is for '{{ loadnet_lock.network.name }}' but net='{{ net }}'" + + - name: Vacuum journald to target size + ansible.builtin.command: "journalctl --vacuum-size={{ loadnet_journal_vacuum_size_value }}" + changed_when: true + + - name: Check if syslog exists + ansible.builtin.stat: + path: /var/log/syslog + register: loadnet_syslog_stat + + - name: Force logrotate for rsyslog + ansible.builtin.command: logrotate -f /etc/logrotate.d/rsyslog + changed_when: true + failed_when: false + when: + - loadnet_syslog_stat.stat.exists + - loadnet_forward_journal_to_syslog_value | bool + + - name: Truncate syslog after rotation + ansible.builtin.shell: | + set -euo pipefail + shopt -s nullglob + for file in /var/log/syslog /var/log/syslog.*; do + truncate -s 0 "$file" + done + args: + executable: /bin/bash + changed_when: true + when: + - loadnet_syslog_truncate_value | bool + - loadnet_syslog_stat.stat.exists + + - name: Restart journald to ensure rotation state is clean + ansible.builtin.systemd: + name: systemd-journald + state: restarted + when: loadnet_syslog_stat.stat.exists + + - name: Restart load-reth instances + ansible.builtin.systemd: + name: "load-reth@{{ item }}.service" + state: restarted + enabled: true + loop: "{{ loadnet_nodes | default([]) }}" + failed_when: false + + - name: Restart ultramarine instances + ansible.builtin.systemd: + name: "ultramarine@{{ item }}.service" + state: restarted + enabled: true + loop: "{{ loadnet_nodes | default([]) }}" + failed_when: false + + - name: Check if blockscout logrotate config exists + ansible.builtin.stat: + path: /etc/logrotate.d/blockscout + register: blockscout_logrotate_stat + + - name: Force logrotate for blockscout + ansible.builtin.command: logrotate -f /etc/logrotate.d/blockscout + changed_when: true + failed_when: false + 
when: blockscout_logrotate_stat.stat.exists + + - name: Check if blockscout service exists + ansible.builtin.command: systemctl status blockscout + changed_when: false + failed_when: false + register: blockscout_service_status + + - name: Restart blockscout service + ansible.builtin.systemd: + name: blockscout + state: restarted + when: blockscout_service_status.rc == 0 + failed_when: false diff --git a/infra/ansible/playbooks/common_only.yml b/infra/ansible/playbooks/common_only.yml new file mode 100644 index 0000000..aea5fb2 --- /dev/null +++ b/infra/ansible/playbooks/common_only.yml @@ -0,0 +1,7 @@ +--- +- name: Apply common baseline only + hosts: all + gather_facts: true + become: true + roles: + - common diff --git a/infra/ansible/playbooks/deploy.yml b/infra/ansible/playbooks/deploy.yml new file mode 100644 index 0000000..20ba9f4 --- /dev/null +++ b/infra/ansible/playbooks/deploy.yml @@ -0,0 +1,246 @@ +--- +- name: Deploy Load testnet services + hosts: all + gather_facts: true + become: true + vars: + loadnet_root: /opt/loadnet + loadnet_current: "{{ loadnet_root }}/current" + loadnet_networks: "{{ loadnet_root }}/networks" + loadnet_net: "{{ net }}" + loadnet_net_dir: "{{ loadnet_networks }}/{{ loadnet_net }}" + # Avoid self-referential defaults (which can cause recursive loop errors in Jinja). + # Extra-vars (from `make net-deploy ...`) override these safely. 
+ restart_on_deploy: true + apply_firewall: false + pre_tasks: + - name: Assert required vars + ansible.builtin.assert: + that: + - net is defined + - net_dir is defined + - name: Load lockfile + ansible.builtin.set_fact: + loadnet_lock: "{{ lookup('file', net_dir + '/network.lock.json') | from_json }}" + - name: Assert lockfile matches selected network + ansible.builtin.assert: + that: + - loadnet_lock.network.name == net + fail_msg: "network.lock.json is for '{{ loadnet_lock.network.name }}' but net='{{ net }}'" + + - name: Build node->ports map + ansible.builtin.set_fact: + loadnet_ports_by_node: >- + {{ + dict( + loadnet_lock.nodes + | map(attribute='id') + | zip(loadnet_lock.nodes | map(attribute='ports')) + ) + }} + + - name: Resolve repo root on controller + ansible.builtin.set_fact: + loadnet_repo_root: "{{ playbook_dir | dirname | dirname | dirname }}" + + - name: Resolve manifest path on controller + ansible.builtin.set_fact: + loadnet_manifest_path: >- + {{ + (loadnet_lock.inputs.manifest_path is match('^/')) + | ternary(loadnet_lock.inputs.manifest_path, loadnet_repo_root + '/' + loadnet_lock.inputs.manifest_path) + }} + + - name: Compute manifest sha256 on controller (drift check) + ansible.builtin.stat: + path: "{{ loadnet_manifest_path }}" + checksum_algorithm: sha256 + delegate_to: localhost + become: false + register: manifest_stat + + - name: Fail fast on manifest/lock drift + ansible.builtin.assert: + that: + - manifest_stat.stat.exists + - manifest_stat.stat.checksum == loadnet_lock.inputs.manifest_sha256 + fail_msg: "manifest changed since lockfile was generated; rerun netgen (make net-gen NET={{ net }})" + roles: + - common + - role: blockscout_host + when: + - loadnet_lock.blockscout is defined + - loadnet_lock.blockscout.enabled | default(false) + vars: + blockscout_host_strict: false + - { role: firewall, when: apply_firewall | bool } + - load_reth + - ultramarine + - monitoring + - role: blockscout + when: + - loadnet_lock.blockscout is 
defined + - loadnet_lock.blockscout.enabled | default(false) + - inventory_hostname == loadnet_lock.blockscout.host + - role: blockscout_nginx + when: + - loadnet_lock.blockscout is defined + - loadnet_lock.blockscout.enabled | default(false) + - inventory_hostname == loadnet_lock.blockscout.host + post_tasks: + - name: Point /opt/loadnet/current to selected network + ansible.builtin.file: + src: "{{ loadnet_net_dir }}" + dest: "{{ loadnet_current }}" + state: link + force: true + - name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + + - name: Ensure load-reth instances are running + ansible.builtin.systemd: + name: "load-reth@{{ item }}.service" + enabled: true + state: "{{ (restart_on_deploy | bool) | ternary('restarted', 'started') }}" + loop: "{{ loadnet_nodes }}" + + - name: Check for stale load-reth containers that block restarts + ansible.builtin.command: + cmd: "docker inspect -f '{{ \"{{\" }}.State.Status{{ \"}}\" }}' load-reth-{{ item }}" + register: load_reth_container_status + loop: "{{ loadnet_nodes }}" + changed_when: false + failed_when: false + when: restart_on_deploy | bool + + - name: Remove stale load-reth containers + ansible.builtin.command: + cmd: "docker rm -f load-reth-{{ item.item }}" + loop: "{{ load_reth_container_status.results }}" + loop_control: + label: "{{ item.item }}" + changed_when: true + when: + - restart_on_deploy | bool + - item.rc == 0 + - item.stdout in ['exited', 'dead', 'created'] + + - name: Wait for load-reth engine IPC sockets + ansible.builtin.wait_for: + path: "/run/load-reth/{{ item }}/engine.ipc" + state: present + timeout: 90 + loop: "{{ loadnet_nodes }}" + + - name: Wait for EL JSON-RPC to report not syncing + ansible.builtin.uri: + url: "http://127.0.0.1:{{ loadnet_ports_by_node[item].el_http }}" + method: POST + body_format: json + body: + jsonrpc: "2.0" + id: 1 + method: "eth_syncing" + params: [] + return_content: true + timeout: 2 + register: el_syncing_status + retries: 30 + delay: 2 + 
until: + - el_syncing_status.status == 200 + - el_syncing_status.json is defined + # eth_syncing returns false when synced, or object with currentBlock/highestBlock when syncing + # Consider synced if: result==false OR currentBlock >= highestBlock + - >- + el_syncing_status.json.result == false or + (el_syncing_status.json.result is mapping and + el_syncing_status.json.result.currentBlock is defined and + el_syncing_status.json.result.highestBlock is defined and + (el_syncing_status.json.result.currentBlock | int(base=16)) >= (el_syncing_status.json.result.highestBlock | int(base=16))) + loop: "{{ loadnet_nodes }}" + changed_when: false + + - name: Ensure ultramarine instances are running + ansible.builtin.systemd: + name: "ultramarine@{{ item }}.service" + enabled: true + state: "{{ (restart_on_deploy | bool) | ternary('restarted', 'started') }}" + loop: "{{ loadnet_nodes }}" + + - name: Ensure Prometheus is running (monitoring) + ansible.builtin.systemd: + name: prometheus.service + enabled: true + state: "{{ (restart_on_deploy | bool) | ternary('restarted', 'started') }}" + + - name: Ensure Grafana is running (monitoring) + ansible.builtin.systemd: + name: grafana.service + enabled: true + state: "{{ (restart_on_deploy | bool) | ternary('restarted', 'started') }}" + + - name: Wait for Grafana container to be running + ansible.builtin.command: + argv: + - docker + - inspect + - -f + - '{{ "{{" }}.State.Running{{ "}}" }}' + - loadnet-grafana + register: loadnet_grafana_container_state + changed_when: false + retries: 10 + delay: 3 + until: loadnet_grafana_container_state.rc == 0 and loadnet_grafana_container_state.stdout == "true" + when: + - loadnet_grafana_admin_password_value is defined + - (loadnet_grafana_admin_password_value | length) > 0 + + - name: Start blockscout service (if enabled) + ansible.builtin.systemd: + name: blockscout.service + enabled: true + state: "{{ (restart_on_deploy | bool) | ternary('restarted', 'started') }}" + when: 
blockscout_enabled | default(false) | bool + + - name: Wait for Blockscout backend health (if enabled) + ansible.builtin.uri: + url: "http://127.0.0.1:{{ blockscout_backend_port | default(4000) }}/api/v2/stats" + method: GET + return_content: true + timeout: 10 + register: blockscout_health + retries: 30 + delay: 10 + until: blockscout_health.status == 200 + changed_when: false + when: blockscout_enabled | default(false) | bool + + - name: Start nginx-blockscout service (if enabled) + ansible.builtin.systemd: + name: nginx-blockscout.service + enabled: true + state: "{{ (restart_on_deploy | bool) | ternary('restarted', 'started') }}" + when: blockscout_nginx_enabled | default(false) | bool + + - name: Enforce Grafana admin password from deploy bundle + ansible.builtin.command: + argv: + - docker + - exec + - loadnet-grafana + - grafana-cli + - admin + - reset-admin-password + - "{{ loadnet_grafana_admin_password_value }}" + register: loadnet_grafana_password_reset + retries: 5 + delay: 2 + until: loadnet_grafana_password_reset.rc == 0 + changed_when: loadnet_grafana_password_reset.rc == 0 + no_log: true + when: + - loadnet_grafana_admin_password_value is defined + - (loadnet_grafana_admin_password_value | length) > 0 diff --git a/infra/ansible/playbooks/doctor.yml b/infra/ansible/playbooks/doctor.yml new file mode 100644 index 0000000..5d02cb0 --- /dev/null +++ b/infra/ansible/playbooks/doctor.yml @@ -0,0 +1,367 @@ +--- +- name: Net doctor (post-deploy) + hosts: all + gather_facts: true + become: true + vars: + loadnet_root: /opt/loadnet + loadnet_current: "{{ loadnet_root }}/current" + loadnet_networks: "{{ loadnet_root }}/networks" + loadnet_net: "{{ net }}" + loadnet_net_dir: "{{ loadnet_networks }}/{{ loadnet_net }}" + loadnet_expected_el_http_bind: "{{ loadnet_el_http_bind | default('0.0.0.0') }}" + loadnet_root_fs_max_pct_value: "{{ loadnet_root_fs_max_pct | default(85) }}" + loadnet_log_dir_path: "{{ loadnet_log_dir | default('/var/log') }}" + 
loadnet_log_dir_max_mb_value: "{{ loadnet_log_dir_max_mb | default(4096) }}" + tasks: + - name: Assert required vars + ansible.builtin.assert: + that: + - net is defined + - net_dir is defined + + - name: Load lockfile + ansible.builtin.set_fact: + loadnet_lock: "{{ lookup('file', net_dir + '/network.lock.json') | from_json }}" + + - name: Assert lockfile matches selected network + ansible.builtin.assert: + that: + - loadnet_lock.network.name == net + fail_msg: "network.lock.json is for '{{ loadnet_lock.network.name }}' but net='{{ net }}'" + + - name: Gather service facts + ansible.builtin.service_facts: + + - name: Verify required services are installed + ansible.builtin.debug: + msg: "service {{ item }} is not installed" + loop: + - docker.service + - chrony.service + when: ansible_facts.services[item] is not defined + + - name: Verify required services are enabled + ansible.builtin.debug: + msg: "service {{ item }} is not enabled" + loop: + - docker.service + - chrony.service + when: + - ansible_facts.services[item] is defined + - ansible_facts.services[item].status is defined + - ansible_facts.services[item].status != 'enabled' + + - name: Verify required services are active + ansible.builtin.debug: + msg: "service {{ item }} is not active" + loop: + - docker.service + - chrony.service + when: + - ansible_facts.services[item] is defined + - ansible_facts.services[item].state is defined + - ansible_facts.services[item].state != 'running' + + - name: Check fs.file-max baseline + ansible.builtin.command: sysctl -n fs.file-max + changed_when: false + register: fs_file_max + + - name: Assert fs.file-max is sufficient + ansible.builtin.assert: + that: + - (fs_file_max.stdout | int) >= 1048576 + fail_msg: "fs.file-max={{ fs_file_max.stdout }} is too low; expected >= 1048576" + + - name: Check root filesystem usage + ansible.builtin.command: "df -P /" + changed_when: false + register: loadnet_root_df + + - name: Parse root filesystem usage + ansible.builtin.set_fact: 
+ loadnet_root_pct: "{{ loadnet_root_df.stdout_lines[1].split()[4] | regex_replace('%', '') | int }}" + changed_when: false + failed_when: loadnet_root_df.stdout_lines | length < 2 + + - name: Assert root filesystem usage below threshold + ansible.builtin.assert: + that: + - (loadnet_root_pct | int) < (loadnet_root_fs_max_pct_value | int) + fail_msg: "root filesystem usage {{ loadnet_root_pct }}% exceeds limit {{ loadnet_root_fs_max_pct_value }}%" + + - name: Check /var/log size + ansible.builtin.command: "du -sm {{ loadnet_log_dir_path }}" + changed_when: false + register: loadnet_log_dir_size + + - name: Parse /var/log size + ansible.builtin.set_fact: + loadnet_log_dir_mb: "{{ loadnet_log_dir_size.stdout.split()[0] | int }}" + changed_when: false + failed_when: loadnet_log_dir_size.stdout | length == 0 + + - name: Assert /var/log size below threshold + ansible.builtin.assert: + that: + - (loadnet_log_dir_mb | int) < (loadnet_log_dir_max_mb_value | int) + fail_msg: "/var/log size {{ loadnet_log_dir_mb }}MB exceeds limit {{ loadnet_log_dir_max_mb_value }}MB" + + - name: Check /opt/loadnet/current symlink exists + ansible.builtin.stat: + path: "{{ loadnet_current }}" + follow: false + register: current_link + + - name: Assert /opt/loadnet/current is a symlink + ansible.builtin.assert: + that: + - current_link.stat.exists + - current_link.stat.islnk + fail_msg: "/opt/loadnet/current must be a symlink (run net-deploy first)" + + - name: Check installed systemd unit templates + ansible.builtin.stat: + path: "/etc/systemd/system/{{ item }}" + loop: + - load-reth@.service + - ultramarine@.service + register: unit_stats + + - name: Assert systemd unit templates exist + ansible.builtin.assert: + that: + - unit_stats.results[0].stat.exists + - unit_stats.results[1].stat.exists + fail_msg: "missing systemd unit template(s); rerun net-deploy" + + - name: Check data mount exists (/var/lib/loadnet) + ansible.builtin.command: "findmnt -n /var/lib/loadnet" + changed_when: false + 
register: findmnt_loadnet + failed_when: false + + - name: Warn if /var/lib/loadnet is not mounted + ansible.builtin.debug: + msg: "/var/lib/loadnet is not mounted (recommended for validators). Run net-storage before go-live." + when: findmnt_loadnet.rc != 0 + + - name: Check docker data-root (if docker is running) + ansible.builtin.shell: | + set -euo pipefail + docker info --format '{{ "{{.DockerRootDir}}" }}' + args: + executable: /bin/bash + changed_when: false + register: docker_root + failed_when: false + + - name: Show docker data-root + ansible.builtin.debug: + msg: "docker data-root: {{ docker_root.stdout | default('unknown') }}" + + - name: Build node->ports map (from lockfile) + ansible.builtin.set_fact: + loadnet_ports_by_node: "{{ dict(loadnet_lock.nodes | map(attribute='id') | zip(loadnet_lock.nodes | map(attribute='ports'))) }}" + + - name: Build node->role map (from lockfile) + ansible.builtin.set_fact: + loadnet_role_by_node: "{{ dict(loadnet_lock.nodes | map(attribute='id') | zip(loadnet_lock.nodes | map(attribute='role'))) }}" + + - name: Build validator node list + ansible.builtin.set_fact: + loadnet_validator_nodes: "{{ loadnet_lock.nodes | selectattr('role', 'equalto', 'validator') | map(attribute='id') | list }}" + + - name: Build local validator node list + ansible.builtin.set_fact: + loadnet_validator_nodes_local: "{{ loadnet_validator_nodes | intersect(loadnet_nodes) }}" + + - name: Build expected metrics bind (from lockfile policy) + ansible.builtin.set_fact: + loadnet_expected_metrics_bind: "{{ loadnet_lock.policy.metrics_bind | default('127.0.0.1') }}" + + - name: Normalize expected bind hosts for checks + ansible.builtin.set_fact: + loadnet_expected_el_http_host: "{{ '127.0.0.1' if loadnet_expected_el_http_bind in ['0.0.0.0', '::'] else loadnet_expected_el_http_bind }}" + loadnet_expected_metrics_host: "{{ '127.0.0.1' if loadnet_expected_metrics_bind in ['0.0.0.0', '::'] else loadnet_expected_metrics_bind }}" + + - name: Check free 
disk space under /var/lib (best-effort) + ansible.builtin.command: "df -Pk /var/lib" + changed_when: false + register: df_var_lib + failed_when: false + + - name: Show /var/lib disk usage + ansible.builtin.debug: + var: df_var_lib.stdout_lines + + - name: Stat load-reth p2p key files + ansible.builtin.stat: + path: "/opt/loadnet/current/bundle/private/load-reth/p2p-keys/{{ item }}.key" + follow: false + loop: "{{ loadnet_nodes }}" + register: reth_key_stats + + - name: Assert load-reth p2p keys are 0600 + ansible.builtin.assert: + that: + - reth_key_stats.results[ansible_loop.index0].stat.exists + - reth_key_stats.results[ansible_loop.index0].stat.mode == "0600" + fail_msg: "bad permissions for /opt/loadnet/current/bundle/private/load-reth/p2p-keys/{{ item }}.key (expected 0600)" + loop: "{{ loadnet_nodes }}" + loop_control: + extended: true + + - name: Stat validator priv_validator_key.json files (local nodes only) + ansible.builtin.stat: + path: "/var/lib/ultramarine/{{ item }}/config/priv_validator_key.json" + follow: false + loop: "{{ loadnet_validator_nodes_local }}" + register: validator_key_stats + + - name: Assert validator priv_validator_key.json is 0600 (local nodes only) + ansible.builtin.assert: + that: + - validator_key_stats.results[ansible_loop.index0].stat.exists + - validator_key_stats.results[ansible_loop.index0].stat.mode == "0600" + fail_msg: "bad permissions for /var/lib/ultramarine/{{ item }}/config/priv_validator_key.json (expected 0600)" + loop: "{{ loadnet_validator_nodes_local }}" + loop_control: + extended: true + + - name: Check load-reth services are active (postflight) + ansible.builtin.debug: + msg: "load-reth@{{ item }}.service is not running" + loop: "{{ loadnet_nodes }}" + when: + - ansible_facts.services['load-reth@' + item + '.service'] is defined + - ansible_facts.services['load-reth@' + item + '.service'].state != 'running' + + - name: Check ultramarine services are active (postflight) + ansible.builtin.debug: + msg: 
"ultramarine@{{ item }}.service is not running"
+      loop: "{{ loadnet_nodes }}"
+      when:
+        - ansible_facts.services['ultramarine@' + item + '.service'] is defined
+        - ansible_facts.services['ultramarine@' + item + '.service'].state != 'running'
+
+    - name: Read ultramarine config.toml (local nodes)
+      ansible.builtin.slurp:
+        src: "/var/lib/ultramarine/{{ item }}/config/config.toml"
+      loop: "{{ loadnet_nodes }}"
+      register: ultramarine_config
+      failed_when: false
+      changed_when: false
+
+    - name: Build mempool enabled map
+      ansible.builtin.set_fact:
+        loadnet_mempool_enabled: >-
+          {{
+            loadnet_mempool_enabled | default({})
+            | combine({
+                item.item: (
+                  (
+                    item.content is defined
+                    and (
+                      (item.content | b64decode)
+                      | regex_findall('^max_tx_count\\s*=\\s*(\\d+)', multiline=True)
+                      | first
+                      | default('0', true)
+                    ) | int > 0
+                  )
+                )
+              })
+          }}
+      loop: "{{ ultramarine_config.results }}"
+      changed_when: false
+
+    # The listener checks below gate on per-node activity results
+    # (load_reth_active / ultramarine_active). These were referenced but never
+    # registered, which made every listener check fail with an undefined
+    # variable; register them explicitly here.
+    - name: Check load-reth instance activity (per node)
+      ansible.builtin.command: "systemctl is-active load-reth@{{ item }}.service"
+      loop: "{{ loadnet_nodes }}"
+      register: load_reth_active
+      changed_when: false
+      failed_when: false
+
+    - name: Check ultramarine instance activity (per node)
+      ansible.builtin.command: "systemctl is-active ultramarine@{{ item }}.service"
+      loop: "{{ loadnet_nodes }}"
+      register: ultramarine_active
+      changed_when: false
+      failed_when: false
+
+    - name: Capture listening sockets
+      ansible.builtin.command: ss -lntu
+      changed_when: false
+      register: ss_listen
+
+    - name: Assert load-reth P2P listener
+      ansible.builtin.assert:
+        that:
+          - ss_listen.stdout is search(':' ~ loadnet_ports_by_node[item].el_p2p ~ '\\b')
+        fail_msg: "missing listener for load-reth p2p port {{ loadnet_ports_by_node[item].el_p2p }} (node={{ item }})"
+      loop: "{{ loadnet_nodes }}"
+      loop_control:
+        extended: true
+      when: load_reth_active.results[ansible_loop.index0].rc == 0
+
+    - name: Wait for load-reth HTTP listener
+      ansible.builtin.wait_for:
+        host: "{{ loadnet_expected_el_http_host }}"
+        port: "{{ loadnet_ports_by_node[item.item].el_http }}"
+        timeout: 2
+      loop: "{{ load_reth_active.results }}"
+      loop_control:
+        label: "{{ item.item }}"
+      when: item.rc == 0
+      changed_when: false
+
+    - name: Wait for load-reth metrics listener
+      ansible.builtin.wait_for:
+        host: "127.0.0.1"
+        port: "{{ loadnet_ports_by_node[item.item].el_metrics }}"
+        timeout: 2
+      loop: "{{ load_reth_active.results }}"
+      loop_control:
+        label: "{{ item.item
}}" + when: item.rc == 0 + changed_when: false + + - name: Assert ultramarine consensus P2P listener + ansible.builtin.assert: + that: + - ss_listen.stdout is search(':' ~ loadnet_ports_by_node[item].cl_p2p ~ '\\b') + fail_msg: "missing listener for ultramarine cl_p2p port {{ loadnet_ports_by_node[item].cl_p2p }} (node={{ item }})" + loop: "{{ loadnet_nodes }}" + loop_control: + extended: true + when: ultramarine_active.results[ansible_loop.index0].rc == 0 + + - name: Assert ultramarine mempool listener (when enabled) + ansible.builtin.assert: + that: + - ss_listen.stdout is search(':' ~ loadnet_ports_by_node[item].cl_mempool ~ '\\b') + fail_msg: "missing listener for ultramarine cl_mempool port {{ loadnet_ports_by_node[item].cl_mempool }} (node={{ item }})" + loop: "{{ loadnet_nodes }}" + loop_control: + extended: true + when: + - ultramarine_active.results[ansible_loop.index0].rc == 0 + - loadnet_mempool_enabled[item] | default(false) + + - name: Warn if ultramarine mempool is disabled + ansible.builtin.debug: + msg: "ultramarine mempool disabled (max_tx_count=0), skipping listener check (node={{ item }})" + loop: "{{ loadnet_nodes }}" + loop_control: + extended: true + when: + - ultramarine_active.results[ansible_loop.index0].rc == 0 + - not (loadnet_mempool_enabled[item] | default(true)) + + - name: Warn if ultramarine mempool config is missing + ansible.builtin.debug: + msg: "ultramarine config.toml missing or unreadable; skipping mempool listener check (node={{ item }})" + loop: "{{ loadnet_nodes }}" + loop_control: + extended: true + when: + - ultramarine_active.results[ansible_loop.index0].rc == 0 + - item not in loadnet_mempool_enabled + + - name: Wait for ultramarine metrics listener + ansible.builtin.wait_for: + host: "{{ loadnet_expected_metrics_host }}" + port: "{{ loadnet_ports_by_node[item.item].cl_metrics }}" + timeout: 2 + loop: "{{ ultramarine_active.results }}" + loop_control: + label: "{{ item.item }}" + when: item.rc == 0 + changed_when: false 
diff --git a/infra/ansible/playbooks/doctor_pre.yml b/infra/ansible/playbooks/doctor_pre.yml new file mode 100644 index 0000000..a721503 --- /dev/null +++ b/infra/ansible/playbooks/doctor_pre.yml @@ -0,0 +1,131 @@ +--- +- name: Net doctor (pre-deploy) + hosts: all + gather_facts: true + become: true + vars: + loadnet_root: /opt/loadnet + loadnet_current: "{{ loadnet_root }}/current" + loadnet_networks: "{{ loadnet_root }}/networks" + loadnet_net: "{{ net }}" + loadnet_net_dir: "{{ loadnet_networks }}/{{ loadnet_net }}" + loadnet_root_fs_max_pct_value: "{{ loadnet_root_fs_max_pct | default(85) }}" + loadnet_log_dir_path: "{{ loadnet_log_dir | default('/var/log') }}" + loadnet_log_dir_max_mb_value: "{{ loadnet_log_dir_max_mb | default(4096) }}" + tasks: + - name: Assert required vars + ansible.builtin.assert: + that: + - net is defined + - net_dir is defined + + - name: Load lockfile + ansible.builtin.set_fact: + loadnet_lock: "{{ lookup('file', net_dir + '/network.lock.json') | from_json }}" + + - name: Assert lockfile matches selected network + ansible.builtin.assert: + that: + - loadnet_lock.network.name == net + fail_msg: "network.lock.json is for '{{ loadnet_lock.network.name }}' but net='{{ net }}'" + + - name: Gather service facts + ansible.builtin.service_facts: + + - name: Check required services are active (best-effort) + ansible.builtin.debug: + msg: "service {{ item }} is not active yet (expected after net-deploy)." 
+ loop: + - docker.service + - chrony.service + when: + - ansible_facts.services[item] is defined + - ansible_facts.services[item].state != 'running' + + - name: Warn if required services are missing + ansible.builtin.debug: + msg: "service {{ item }} is not installed" + loop: + - docker.service + - chrony.service + when: ansible_facts.services[item] is not defined + + - name: Check fs.file-max baseline + ansible.builtin.command: sysctl -n fs.file-max + changed_when: false + register: fs_file_max + + - name: Assert fs.file-max is sufficient + ansible.builtin.assert: + that: + - (fs_file_max.stdout | int) >= 1048576 + fail_msg: "fs.file-max={{ fs_file_max.stdout }} is too low; expected >= 1048576" + + - name: Check root filesystem usage + ansible.builtin.command: "df -P /" + changed_when: false + register: loadnet_root_df + + - name: Parse root filesystem usage + ansible.builtin.set_fact: + loadnet_root_pct: "{{ loadnet_root_df.stdout_lines[1].split()[4] | regex_replace('%', '') | int }}" + changed_when: false + failed_when: loadnet_root_df.stdout_lines | length < 2 + + - name: Assert root filesystem usage below threshold + ansible.builtin.assert: + that: + - (loadnet_root_pct | int) < (loadnet_root_fs_max_pct_value | int) + fail_msg: "root filesystem usage {{ loadnet_root_pct }}% exceeds limit {{ loadnet_root_fs_max_pct_value }}%" + + - name: Check /var/log size + ansible.builtin.command: "du -sm {{ loadnet_log_dir_path }}" + changed_when: false + register: loadnet_log_dir_size + + - name: Parse /var/log size + ansible.builtin.set_fact: + loadnet_log_dir_mb: "{{ loadnet_log_dir_size.stdout.split()[0] | int }}" + changed_when: false + failed_when: loadnet_log_dir_size.stdout | length == 0 + + - name: Assert /var/log size below threshold + ansible.builtin.assert: + that: + - (loadnet_log_dir_mb | int) < (loadnet_log_dir_max_mb_value | int) + fail_msg: "/var/log size {{ loadnet_log_dir_mb }}MB exceeds limit {{ loadnet_log_dir_max_mb_value }}MB" + + - name: Check data 
mount exists (/var/lib/loadnet) + ansible.builtin.command: "findmnt -n /var/lib/loadnet" + changed_when: false + register: findmnt_loadnet + failed_when: false + + - name: Warn if /var/lib/loadnet is not mounted + ansible.builtin.debug: + msg: "/var/lib/loadnet is not mounted (required for validator go-live). Run net-storage." + when: findmnt_loadnet.rc != 0 + + - name: Check docker data-root + ansible.builtin.shell: | + set -euo pipefail + docker info --format '{{ "{{.DockerRootDir}}" }}' + args: + executable: /bin/bash + changed_when: false + register: docker_root + failed_when: false + + - name: Show docker data-root + ansible.builtin.debug: + msg: "docker data-root: {{ docker_root.stdout | default('unknown') }}" + + - name: Check free disk space under /var/lib (best-effort) + ansible.builtin.command: "df -Pk /var/lib" + changed_when: false + register: df_var_lib + failed_when: false + + - name: Show /var/lib disk usage + ansible.builtin.debug: + var: df_var_lib.stdout_lines diff --git a/infra/ansible/playbooks/down.yml b/infra/ansible/playbooks/down.yml new file mode 100644 index 0000000..840f1ee --- /dev/null +++ b/infra/ansible/playbooks/down.yml @@ -0,0 +1,28 @@ +--- +- name: Stop Load testnet services + hosts: all + gather_facts: false + become: true + tasks: + - name: Stop ultramarine instances + ansible.builtin.systemd: + name: "ultramarine@{{ item }}.service" + state: stopped + loop: "{{ loadnet_nodes }}" + failed_when: false + - name: Stop load-reth instances + ansible.builtin.systemd: + name: "load-reth@{{ item }}.service" + state: stopped + loop: "{{ loadnet_nodes }}" + failed_when: false + - name: Stop Grafana (monitoring) + ansible.builtin.systemd: + name: grafana.service + state: stopped + failed_when: false + - name: Stop Prometheus (monitoring) + ansible.builtin.systemd: + name: prometheus.service + state: stopped + failed_when: false diff --git a/infra/ansible/playbooks/firewall.yml b/infra/ansible/playbooks/firewall.yml new file mode 100644 
index 0000000..0136af7 --- /dev/null +++ b/infra/ansible/playbooks/firewall.yml @@ -0,0 +1,20 @@ +--- +- name: Configure host firewall (ufw) + hosts: all + gather_facts: false + become: true + vars: + net_dir: "{{ net_dir }}" + tasks: + - name: Assert required vars + ansible.builtin.assert: + that: + - net is defined + - net_dir is defined + + - name: Load lockfile + ansible.builtin.set_fact: + loadnet_lock: "{{ lookup('file', net_dir + '/network.lock.json') | from_json }}" + + roles: + - firewall diff --git a/infra/ansible/playbooks/health.yml b/infra/ansible/playbooks/health.yml new file mode 100644 index 0000000..13dc789 --- /dev/null +++ b/infra/ansible/playbooks/health.yml @@ -0,0 +1,180 @@ +--- +- name: Health check (process + height moving) + hosts: all + gather_facts: false + become: true + tasks: + - name: Load lockfile + ansible.builtin.set_fact: + loadnet_lock: "{{ lookup('file', net_dir + '/network.lock.json') | from_json }}" + + - name: Build node->ports map + ansible.builtin.set_fact: + loadnet_ports_by_node: >- + {{ + dict( + loadnet_lock.nodes + | map(attribute='id') + | zip(loadnet_lock.nodes | map(attribute='ports')) + ) + }} + + - name: Build node->engine IPC path map + ansible.builtin.set_fact: + loadnet_engine_ipc_by_node: >- + {{ + dict( + loadnet_lock.nodes + | map(attribute='id') + | zip(loadnet_lock.nodes | map(attribute='engine') | map(attribute='ipc_path')) + ) + }} + + - name: Build metrics scrape host (from lockfile policy) + ansible.builtin.set_fact: + loadnet_metrics_bind: "{{ loadnet_lock.policy.metrics_bind | default('127.0.0.1') }}" + + - name: Define metrics regex helpers + ansible.builtin.set_fact: + loadnet_height_regex: >- + ^malachitebft_core_consensus_height(?:\\{[^}]*\\})?\\s+(\\d+) + loadnet_height_match_regex: >- + ^malachitebft_core_consensus_height(?:\\{[^}]*\\})?\\s+\\d+ + + - name: Normalize metrics scrape host + ansible.builtin.set_fact: + loadnet_metrics_scrape_host: "{{ '127.0.0.1' if loadnet_metrics_bind in 
['0.0.0.0', '::'] else loadnet_metrics_bind }}" + + - name: Gather service facts + ansible.builtin.service_facts: + + - name: Assert load-reth service is active + ansible.builtin.assert: + that: + - ansible_facts.services['load-reth@' + item + '.service'] is defined + - ansible_facts.services['load-reth@' + item + '.service'].state == 'running' + fail_msg: "load-reth@{{ item }}.service is not running" + loop: "{{ loadnet_nodes }}" + + - name: Assert ultramarine service is active + ansible.builtin.assert: + that: + - ansible_facts.services['ultramarine@' + item + '.service'] is defined + - ansible_facts.services['ultramarine@' + item + '.service'].state == 'running' + fail_msg: "ultramarine@{{ item }}.service is not running" + loop: "{{ loadnet_nodes }}" + + - name: Assert Engine IPC socket exists + ansible.builtin.stat: + path: "{{ loadnet_engine_ipc_by_node[item] }}" + loop: "{{ loadnet_nodes }}" + register: engine_ipc_stats + + - name: Ensure Engine IPC is a socket + ansible.builtin.assert: + that: + - engine_ipc_stats.results[ansible_loop.index0].stat.exists + - engine_ipc_stats.results[ansible_loop.index0].stat.issock + fail_msg: "missing Engine IPC socket for {{ item }} at {{ loadnet_engine_ipc_by_node[item] }}" + loop: "{{ loadnet_nodes }}" + loop_control: + extended: true + + - name: Check EL JSON-RPC responds + ansible.builtin.uri: + url: "http://127.0.0.1:{{ loadnet_ports_by_node[item].el_http }}" + method: POST + body_format: json + body: + jsonrpc: "2.0" + id: 1 + method: "web3_clientVersion" + params: [] + return_content: true + timeout: 2 + loop: "{{ loadnet_nodes }}" + register: el_rpc + changed_when: false + + - name: Assert EL JSON-RPC returned result + ansible.builtin.assert: + that: + - el_rpc.results[ansible_loop.index0].status == 200 + - el_rpc.results[ansible_loop.index0].json is defined + - el_rpc.results[ansible_loop.index0].json.result is defined + fail_msg: "EL JSON-RPC did not return result for {{ item }}" + loop: "{{ loadnet_nodes }}" + 
loop_control: + extended: true + + - name: Assert CL metrics ports are set + ansible.builtin.assert: + that: + - (loadnet_ports_by_node[item].cl_metrics | default(0) | int) > 0 + fail_msg: "missing cl_metrics port for {{ item }}" + loop: "{{ loadnet_nodes }}" + + - name: Fetch initial CL metrics + ansible.builtin.uri: + url: "http://{{ loadnet_metrics_scrape_host }}:{{ loadnet_ports_by_node[item].cl_metrics }}/metrics" + return_content: true + timeout: 2 + loop: "{{ loadnet_nodes }}" + register: cl_metrics_first + changed_when: false + + - name: Record initial CL heights + ansible.builtin.set_fact: + loadnet_cl_height_first: >- + {{ + loadnet_cl_height_first | default({}) + | combine({ + item.item: ( + item.content + | regex_findall(loadnet_height_regex, multiline=True) + | first + | default('0', true) + | int + ) + }) + }} + loop: "{{ cl_metrics_first.results }}" + changed_when: false + failed_when: item.content is not search(loadnet_height_match_regex, multiline=True) + + - name: Fetch CL metrics until height advances + ansible.builtin.uri: + url: "http://{{ loadnet_metrics_scrape_host }}:{{ loadnet_ports_by_node[item].cl_metrics }}/metrics" + return_content: true + timeout: 2 + loop: "{{ loadnet_nodes }}" + register: cl_metrics_second + changed_when: false + retries: 3 + delay: 15 + until: >- + (cl_metrics_second.content is defined) + and (cl_metrics_second.content is search(loadnet_height_match_regex, multiline=True)) + and + ( + (cl_metrics_second.content | regex_findall(loadnet_height_regex, multiline=True) | first | default('0', true) | int) + != (loadnet_cl_height_first[item] | int) + ) + + - name: Record updated CL heights + ansible.builtin.set_fact: + loadnet_cl_height_second: >- + {{ + loadnet_cl_height_second | default({}) + | combine({ + item.item: ( + item.content + | regex_findall(loadnet_height_regex, multiline=True) + | first + | default('0', true) + | int + ) + }) + }} + loop: "{{ cl_metrics_second.results }}" + changed_when: false diff --git 
a/infra/ansible/playbooks/logs.yml b/infra/ansible/playbooks/logs.yml new file mode 100644 index 0000000..cd1ad7e --- /dev/null +++ b/infra/ansible/playbooks/logs.yml @@ -0,0 +1,29 @@ +--- +- name: Tail logs + hosts: all + gather_facts: false + become: true + vars: + lines: 200 + tasks: + - name: Fetch Ultramarine logs + ansible.builtin.command: "journalctl --no-pager -n {{ lines }} -u ultramarine@{{ item }}.service" + loop: "{{ loadnet_nodes }}" + register: ultra_logs + changed_when: false + failed_when: false + + - name: Display Ultramarine logs + ansible.builtin.debug: + var: ultra_logs.results + + - name: Fetch Load-reth logs + ansible.builtin.command: "journalctl --no-pager -n {{ lines }} -u load-reth@{{ item }}.service" + loop: "{{ loadnet_nodes }}" + register: reth_logs + changed_when: false + failed_when: false + + - name: Display Load-reth logs + ansible.builtin.debug: + var: reth_logs.results diff --git a/infra/ansible/playbooks/roll.yml b/infra/ansible/playbooks/roll.yml new file mode 100644 index 0000000..50a96f0 --- /dev/null +++ b/infra/ansible/playbooks/roll.yml @@ -0,0 +1,112 @@ +--- +- name: Rolling restart Load services (serial=1) + hosts: all + gather_facts: true + become: true + serial: 1 + vars: + loadnet_net: "{{ net }}" + loadnet_net_dir: "{{ net_dir }}" + loadnet_roll_confirm: "{{ roll_confirm | default('') }}" + pre_tasks: + - name: Assert required vars + ansible.builtin.assert: + that: + - loadnet_net is defined + - loadnet_net_dir is defined + + - name: Load lockfile + ansible.builtin.set_fact: + loadnet_lock: "{{ lookup('file', loadnet_net_dir + '/network.lock.json') | from_json }}" + + - name: Build node->ports map + ansible.builtin.set_fact: + loadnet_ports_by_node: >- + {{ + dict( + loadnet_lock.nodes + | map(attribute='id') + | zip(loadnet_lock.nodes | map(attribute='ports')) + ) + }} + + - name: Assert lockfile matches selected network + ansible.builtin.assert: + that: + - loadnet_lock.network.name == loadnet_net + fail_msg: 
"network.lock.json is for '{{ loadnet_lock.network.name }}' but net='{{ loadnet_net }}'" + + - name: Require explicit confirmation when rolling restart may halt chain + ansible.builtin.assert: + that: + - loadnet_roll_confirm == "YES" + fail_msg: "Refusing to roll-restart with <=2 validators (halts chain) without roll_confirm=YES" + when: (loadnet_lock.nodes | selectattr('role', 'equalto', 'validator') | list | length) <= 2 + + tasks: + - name: Stop ultramarine instances on host (best-effort) + ansible.builtin.systemd: + name: "ultramarine@{{ item }}.service" + state: stopped + loop: "{{ loadnet_nodes | default([]) }}" + failed_when: false + + - name: Restart load-reth instances on host + ansible.builtin.systemd: + name: "load-reth@{{ item }}.service" + state: restarted + enabled: true + loop: "{{ loadnet_nodes | default([]) }}" + + - name: Wait for load-reth engine IPC sockets + ansible.builtin.wait_for: + path: "/run/load-reth/{{ item }}/engine.ipc" + state: present + timeout: 90 + loop: "{{ loadnet_nodes | default([]) }}" + + - name: Wait for EL JSON-RPC to report not syncing + ansible.builtin.uri: + url: "http://127.0.0.1:{{ loadnet_ports_by_node[item].el_http }}" + method: POST + body_format: json + body: + jsonrpc: "2.0" + id: 1 + method: "eth_syncing" + params: [] + return_content: true + timeout: 2 + register: el_syncing_status + retries: 30 + delay: 2 + until: + - el_syncing_status.status == 200 + - el_syncing_status.json is defined + # eth_syncing returns false when synced, or object with currentBlock/highestBlock when syncing + # Consider synced if: result==false OR currentBlock >= highestBlock + - >- + el_syncing_status.json.result == false or + (el_syncing_status.json.result is mapping and + el_syncing_status.json.result.currentBlock is defined and + el_syncing_status.json.result.highestBlock is defined and + (el_syncing_status.json.result.currentBlock | int(base=16)) >= (el_syncing_status.json.result.highestBlock | int(base=16))) + loop: "{{ 
loadnet_nodes | default([]) }}" + changed_when: false + + - name: Restart ultramarine instances on host + ansible.builtin.systemd: + name: "ultramarine@{{ item }}.service" + state: restarted + enabled: true + loop: "{{ loadnet_nodes | default([]) }}" + + - name: Restart monitoring services on host (best-effort) + ansible.builtin.systemd: + name: "{{ item }}" + state: restarted + enabled: true + loop: + - prometheus.service + - grafana.service + failed_when: false diff --git a/infra/ansible/playbooks/status.yml b/infra/ansible/playbooks/status.yml new file mode 100644 index 0000000..1f6ba74 --- /dev/null +++ b/infra/ansible/playbooks/status.yml @@ -0,0 +1,28 @@ +--- +- name: Show service status + hosts: all + gather_facts: false + become: true + tasks: + - name: Gather service facts + ansible.builtin.service_facts: + + - name: Display load-reth status + ansible.builtin.debug: + msg: >- + load-reth@{{ item }}.service: + {{ + (ansible_facts.services['load-reth@' + item + '.service'] | default({})).state + | default('unknown') + }} + loop: "{{ loadnet_nodes }}" + + - name: Display ultramarine status + ansible.builtin.debug: + msg: >- + ultramarine@{{ item }}.service: + {{ + (ansible_facts.services['ultramarine@' + item + '.service'] | default({})).state + | default('unknown') + }} + loop: "{{ loadnet_nodes }}" diff --git a/infra/ansible/playbooks/storage.yml b/infra/ansible/playbooks/storage.yml new file mode 100644 index 0000000..2718a3c --- /dev/null +++ b/infra/ansible/playbooks/storage.yml @@ -0,0 +1,23 @@ +--- +- name: Storage bootstrap (validator hosts) + hosts: all + gather_facts: true + become: true + vars: + loadnet_data_mountpoint: "{{ loadnet_data_mountpoint | default('/var/lib/loadnet') }}" + # Optional: if the data volume is already mounted somewhere else (common in provider images), + # bind-mount it into loadnet_data_mountpoint in non-destructive mode. 
+ loadnet_data_source_mountpoint: "{{ loadnet_data_source_mountpoint | default('/home') }}" + loadnet_data_source_dir: "{{ loadnet_data_source_dir | default(loadnet_data_source_mountpoint + '/loadnet') }}" + loadnet_storage_wipe: "{{ loadnet_storage_wipe | default(false) }}" + loadnet_data_devices: "{{ loadnet_data_devices | default([]) }}" + loadnet_data_raid_level: "{{ loadnet_data_raid_level | default(1) }}" + loadnet_md_device: "{{ loadnet_md_device | default('/dev/md/loadnet-data') }}" + loadnet_md_name: "{{ loadnet_md_name | default('loadnet-data') }}" + loadnet_fs_type: "{{ loadnet_fs_type | default('xfs') }}" + loadnet_move_docker_dataroot: "{{ loadnet_move_docker_dataroot | default(false) }}" + loadnet_docker_dataroot: "{{ loadnet_docker_dataroot | default(loadnet_data_mountpoint + '/docker') }}" + loadnet_bind_var_log: "{{ loadnet_bind_var_log | default(false) }}" + loadnet_log_dir: "{{ loadnet_log_dir | default(loadnet_data_mountpoint + '/log') }}" + roles: + - storage diff --git a/infra/ansible/playbooks/up.yml b/infra/ansible/playbooks/up.yml new file mode 100644 index 0000000..207baeb --- /dev/null +++ b/infra/ansible/playbooks/up.yml @@ -0,0 +1,38 @@ +--- +- name: Start Load testnet services + hosts: all + gather_facts: false + become: true + vars: + loadnet_root: /opt/loadnet + loadnet_current: "{{ loadnet_root }}/current" + tasks: + - name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + + - name: Restart load-reth instances (apply current env/artifacts) + ansible.builtin.systemd: + name: "load-reth@{{ item }}.service" + enabled: true + state: restarted + loop: "{{ loadnet_nodes }}" + + - name: Restart ultramarine instances (apply current env/artifacts) + ansible.builtin.systemd: + name: "ultramarine@{{ item }}.service" + enabled: true + state: restarted + loop: "{{ loadnet_nodes }}" + + - name: Restart Prometheus (monitoring) + ansible.builtin.systemd: + name: prometheus.service + enabled: true + state: restarted + + - name: 
Restart Grafana (monitoring) + ansible.builtin.systemd: + name: grafana.service + enabled: true + state: restarted diff --git a/infra/ansible/playbooks/wipe.yml b/infra/ansible/playbooks/wipe.yml new file mode 100644 index 0000000..48c826a --- /dev/null +++ b/infra/ansible/playbooks/wipe.yml @@ -0,0 +1,329 @@ +--- +- name: Wipe host state for a network (destructive) + hosts: all + gather_facts: false + become: true + vars: + loadnet_root: /opt/loadnet + loadnet_current: "{{ loadnet_root }}/current" + loadnet_networks: "{{ loadnet_root }}/networks" + loadnet_net: "{{ net }}" + loadnet_net_dir: "{{ loadnet_networks }}/{{ loadnet_net }}" + loadnet_wipe_state: "{{ wipe_state | default(true) }}" + loadnet_wipe_firewall: "{{ wipe_firewall | default(false) }}" + loadnet_wipe_monitoring: "{{ wipe_monitoring | default(true) }}" + loadnet_wipe_containers: "{{ wipe_containers | default(true) }}" + loadnet_wipe_blockscout: "{{ wipe_blockscout | default(true) }}" + loadnet_blockscout_base_dir: "{{ loadnet_net_dir }}/blockscout" + loadnet_blockscout_data_dir: "/var/lib/blockscout" + loadnet_wipe_confirm: "{{ wipe_confirm | default('') }}" + loadnet_wipe_nodes_raw: "{{ wipe_nodes | default('') }}" + tasks: + - name: Assert required vars + ansible.builtin.assert: + that: + - net is defined + + - name: Require explicit confirmation for destructive actions + ansible.builtin.assert: + that: + - loadnet_wipe_confirm == "YES" + fail_msg: "Refusing to wipe without wipe_confirm=YES" + when: >- + loadnet_wipe_state | bool + or loadnet_wipe_firewall | bool + or loadnet_wipe_monitoring | bool + or loadnet_wipe_containers | bool + or loadnet_wipe_blockscout | bool + + - name: Build node path lists + ansible.builtin.set_fact: + loadnet_wipe_nodes: >- + {{ + (loadnet_wipe_nodes_raw is string) + | ternary( + (loadnet_wipe_nodes_raw | length > 0) + | ternary(loadnet_wipe_nodes_raw.split(',') | map('trim') | reject('equalto', '') | list, (loadnet_nodes | default([]))), + (loadnet_wipe_nodes_raw | 
default(loadnet_nodes | default([]))) + ) + }} + loadnet_node_state_paths: >- + {{ + (loadnet_wipe_nodes | default([])) + | map('regex_replace', '^', '/var/lib/load-reth/') + | list + }} + loadnet_node_ultramarine_paths: >- + {{ + (loadnet_wipe_nodes | default([])) + | map('regex_replace', '^', '/var/lib/ultramarine/') + | list + }} + loadnet_node_runtime_paths: >- + {{ + (loadnet_wipe_nodes | default([])) + | map('regex_replace', '^', '/run/load-reth/') + | list + }} + + - name: Resolve load-reth data root (follows symlink) + ansible.builtin.command: "readlink -f /var/lib/load-reth" + register: loadnet_load_reth_root + changed_when: false + failed_when: false + + - name: Resolve ultramarine data root (follows symlink) + ansible.builtin.command: "readlink -f /var/lib/ultramarine" + register: loadnet_ultramarine_root + changed_when: false + failed_when: false + + - name: Build resolved data paths (symlink targets) + ansible.builtin.set_fact: + loadnet_node_state_paths_resolved: >- + {{ + (loadnet_wipe_nodes | default([])) + | map('regex_replace', '^', (loadnet_load_reth_root.stdout | default('/var/lib/load-reth')) + '/') + | list + }} + loadnet_node_ultramarine_paths_resolved: >- + {{ + (loadnet_wipe_nodes | default([])) + | map('regex_replace', '^', (loadnet_ultramarine_root.stdout | default('/var/lib/ultramarine')) + '/') + | list + }} + + - name: Build combined wipe path list + ansible.builtin.set_fact: + loadnet_wipe_paths: >- + {{ + loadnet_node_state_paths + + loadnet_node_ultramarine_paths + + loadnet_node_runtime_paths + + loadnet_node_state_paths_resolved + + loadnet_node_ultramarine_paths_resolved + }} + + - name: Stop ultramarine instances (best-effort) + ansible.builtin.systemd: + name: "ultramarine@{{ item }}.service" + state: stopped + enabled: false + loop: "{{ loadnet_wipe_nodes | default([]) }}" + failed_when: false + + - name: Remove systemd service.d overrides for load-reth (prevents stale config) + ansible.builtin.file: + path: 
"/etc/systemd/system/load-reth@{{ item }}.service.d" + state: absent + loop: "{{ loadnet_wipe_nodes | default([]) }}" + when: loadnet_wipe_state | bool + + - name: Remove systemd service.d overrides for ultramarine (prevents stale config) + ansible.builtin.file: + path: "/etc/systemd/system/ultramarine@{{ item }}.service.d" + state: absent + loop: "{{ loadnet_wipe_nodes | default([]) }}" + when: loadnet_wipe_state | bool + + - name: Reload systemd after removing overrides + ansible.builtin.systemd: + daemon_reload: true + when: loadnet_wipe_state | bool + + - name: Stop load-reth instances (best-effort) + ansible.builtin.systemd: + name: "load-reth@{{ item }}.service" + state: stopped + enabled: false + loop: "{{ loadnet_wipe_nodes | default([]) }}" + failed_when: false + + - name: Stop monitoring services (best-effort) + ansible.builtin.systemd: + name: "{{ item }}" + state: stopped + enabled: false + loop: + - grafana.service + - prometheus.service + failed_when: false + when: loadnet_wipe_monitoring | bool + + - name: Remove loadnet containers (best-effort) + ansible.builtin.shell: | + set -euo pipefail + if command -v docker >/dev/null 2>&1; then + nodes="{{ loadnet_wipe_nodes | default([]) | join(' ') }}" + if [ -n "$nodes" ]; then + for node_id in $nodes; do + docker rm -f "load-reth-${node_id}" >/dev/null 2>&1 || true + docker rm -f "ultramarine-${node_id}" >/dev/null 2>&1 || true + done + fi + docker rm -f loadnet-prometheus loadnet-grafana >/dev/null 2>&1 || true + fi + args: + executable: /bin/bash + changed_when: true + when: loadnet_wipe_containers | bool + + - name: Wipe node state (optional) + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: "{{ loadnet_wipe_paths }}" + when: loadnet_wipe_state | bool + + - name: Verify node state removed (optional) + ansible.builtin.stat: + path: "{{ item }}" + loop: "{{ loadnet_wipe_paths }}" + register: loadnet_wipe_state_stats + when: loadnet_wipe_state | bool + + - name: Assert node state 
removed (optional) + ansible.builtin.assert: + that: + - not item.stat.exists + fail_msg: "wipe_state=true but path still exists: {{ item.item }}" + loop: "{{ loadnet_wipe_state_stats.results | default([]) }}" + loop_control: + label: "{{ item.item | default('') }}" + when: loadnet_wipe_state | bool + + - name: Wipe monitoring state (optional) + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - /var/lib/grafana + - /var/lib/prometheus + when: loadnet_wipe_monitoring | bool + + - name: Stop blockscout service (best-effort) + ansible.builtin.systemd: + name: blockscout.service + state: stopped + enabled: false + failed_when: false + when: loadnet_wipe_blockscout | bool + + - name: Stop nginx-blockscout service (best-effort) + ansible.builtin.systemd: + name: nginx-blockscout.service + state: stopped + enabled: false + failed_when: false + when: loadnet_wipe_blockscout | bool + + - name: Detect docker binary (wipe) + ansible.builtin.command: which docker + register: blockscout_docker_path + changed_when: false + failed_when: false + when: loadnet_wipe_blockscout | bool + + - name: Detect docker compose v2 (wipe) + ansible.builtin.command: "{{ blockscout_docker_path.stdout }} compose version" + register: blockscout_compose_v2_check + changed_when: false + failed_when: false + when: + - loadnet_wipe_blockscout | bool + - blockscout_docker_path.rc == 0 + + - name: Detect docker-compose v1 (wipe) + ansible.builtin.command: which docker-compose + register: blockscout_compose_v1_path + changed_when: false + failed_when: false + when: loadnet_wipe_blockscout | bool + + - name: Set blockscout compose command (wipe) + ansible.builtin.set_fact: + blockscout_compose_cmd: >- + {{ + (blockscout_docker_path.rc | default(1) == 0 and blockscout_compose_v2_check.rc | default(1) == 0) + | ternary(blockscout_docker_path.stdout + ' compose', blockscout_compose_v1_path.stdout | default('')) + }} + when: loadnet_wipe_blockscout | bool + + - name: Remove blockscout 
containers and volumes via docker compose + ansible.builtin.shell: | + set -euo pipefail + if [ -d "{{ loadnet_blockscout_base_dir }}" ]; then + if [ -f "{{ loadnet_blockscout_base_dir }}/docker-compose.yml" ]; then + if [ -n "{{ blockscout_compose_cmd | default('') }}" ]; then + cd "{{ loadnet_blockscout_base_dir }}" + {{ blockscout_compose_cmd }} down -v --remove-orphans 2>/dev/null || true + fi + fi + fi + args: + executable: /bin/bash + changed_when: true + when: loadnet_wipe_blockscout | bool + + - name: Wipe blockscout data directory (postgres, redis, logs) + ansible.builtin.file: + path: "{{ loadnet_blockscout_data_dir }}" + state: absent + when: loadnet_wipe_blockscout | bool + + - name: Wipe blockscout base directory (compose/envs) + ansible.builtin.file: + path: "{{ loadnet_blockscout_base_dir }}" + state: absent + when: loadnet_wipe_blockscout | bool + + - name: Remove blockscout docker network (prevents stale network labels) + ansible.builtin.shell: | + set -euo pipefail + docker network rm blockscout-network 2>/dev/null || true + args: + executable: /bin/bash + changed_when: true + when: loadnet_wipe_blockscout | bool + + - name: Wipe network artifacts under /opt/loadnet (optional) + ansible.builtin.file: + path: "{{ loadnet_net_dir }}" + state: absent + when: loadnet_wipe_state | bool + + - name: Resolve /opt/loadnet/current target + ansible.builtin.command: "readlink -f {{ loadnet_current }}" + register: loadnet_current_target + changed_when: false + failed_when: false + + - name: Resolve selected network path + ansible.builtin.command: "readlink -f {{ loadnet_net_dir }}" + register: loadnet_net_target + changed_when: false + failed_when: false + + - name: Remove /opt/loadnet/current symlink if it points to this network + ansible.builtin.file: + path: "{{ loadnet_current }}" + state: absent + when: + - loadnet_wipe_state | bool + - loadnet_current_target.rc == 0 + - loadnet_net_target.rc == 0 + - loadnet_current_target.stdout | length > 0 + - 
loadnet_net_target.stdout | length > 0 + - loadnet_current_target.stdout == loadnet_net_target.stdout + + - name: Wipe firewall rules (optional; keeps SSH reachable by disabling ufw) + ansible.builtin.shell: | + set -euo pipefail + if command -v ufw >/dev/null 2>&1; then + ufw --force reset + ufw --force disable + fi + args: + executable: /bin/bash + changed_when: true + when: loadnet_wipe_firewall | bool diff --git a/infra/ansible/requirements.yml b/infra/ansible/requirements.yml new file mode 100644 index 0000000..3834074 --- /dev/null +++ b/infra/ansible/requirements.yml @@ -0,0 +1,6 @@ +--- +collections: + - name: community.general + version: ">=8.0.0" + - name: ansible.posix + version: ">=1.5.0" diff --git a/infra/ansible/roles/blockscout/defaults/main.yml b/infra/ansible/roles/blockscout/defaults/main.yml new file mode 100644 index 0000000..af7aa95 --- /dev/null +++ b/infra/ansible/roles/blockscout/defaults/main.yml @@ -0,0 +1,129 @@ +--- +# Enable/disable blockscout deployment +blockscout_enabled: false + +# Domain configuration +blockscout_domain: "explorer.example.com" +blockscout_stats_domain: "stats.example.com" +blockscout_rpc_domain: "rpc.example.com" +blockscout_ssl_enabled: true + +# Chain configuration +blockscout_chain_id: 1984 +blockscout_chain_name: "Fibernet Testnet" +blockscout_chain_symbol: "tLOAD" +blockscout_chain_decimals: 18 +blockscout_is_testnet: true + +# WVM Blockscout local source paths (relative to loadnetwork_consensus/) +# These are used for local builds on the controller machine +blockscout_consensus_root: "{{ playbook_dir }}/../../../.." 
+blockscout_backend_src_dir: "{{ blockscout_consensus_root }}/wvm-blockscout" +blockscout_frontend_src_dir: "{{ blockscout_consensus_root }}/wvm-blockscout-frontend" + +# Backend build args (from wvm-blockscout prod) +blockscout_release_version: "6.6.0" + +# Docker image names (built locally, later can switch to loadnetwork/ Docker Hub) +blockscout_backend_image: "blockscout-backend:local" +blockscout_frontend_image: "blockscout-frontend:local" + +# Docker images (microservices use upstream) +blockscout_stats_image: "ghcr.io/blockscout/stats:latest" +blockscout_visualizer_image: "ghcr.io/blockscout/visualizer:latest" +blockscout_sig_provider_image: "ghcr.io/blockscout/sig-provider:latest" +blockscout_postgres_image: "postgres:15" +blockscout_redis_image: "redis:alpine" + +# Ports +blockscout_backend_port: 4000 +blockscout_frontend_port: 3001 +blockscout_stats_port: 8050 +blockscout_visualizer_port: 8051 +blockscout_sig_provider_port: 8052 + +# RPC configuration (connects to local load-reth via Docker bridge gateway) +# Note: host.docker.internal doesn't work reliably on Linux, use 172.17.0.1 instead +blockscout_rpc_url: "http://172.17.0.1:8545" +blockscout_rpc_trace_url: "http://172.17.0.1:8545" +blockscout_rpc_variant: "erigon" + +# Paths +blockscout_base_dir: "{{ (loadnet_net_dir | default('/opt/loadnet/current')) }}/blockscout" +blockscout_data_dir: "/var/lib/blockscout" +blockscout_postgres_data_dir: "{{ blockscout_data_dir }}/postgres" +blockscout_stats_postgres_data_dir: "{{ blockscout_data_dir }}/stats-postgres" +blockscout_redis_data_dir: "{{ blockscout_data_dir }}/redis" +blockscout_logs_dir: "{{ blockscout_data_dir }}/logs" + +# Database configuration - optimized for 32GB RAM +blockscout_db_name: "blockscout" +blockscout_db_user: "blockscout" +# blockscout_db_password: auto-generated if not set + +blockscout_stats_db_name: "stats" +blockscout_stats_db_user: "stats" +# blockscout_stats_db_password: auto-generated if not set + +# PostgreSQL tuning for 32GB 
RAM +blockscout_postgres_max_connections: 100 +blockscout_postgres_shm_size: "256m" + +# Application pool sizes - optimized for 32GB RAM +# Increase pool size for faster indexing on high-throughput chains (1-sec blocks) +blockscout_pool_size: 80 +blockscout_pool_size_api: 10 + +# Docker memory limits for 32GB RAM systems +blockscout_backend_memory_limit: "8g" +blockscout_frontend_memory_limit: "2g" +blockscout_stats_memory_limit: "2g" +blockscout_visualizer_memory_limit: "1g" +blockscout_sig_provider_memory_limit: "512m" +blockscout_postgres_memory_limit: "4g" +blockscout_stats_postgres_memory_limit: "2g" +blockscout_redis_memory_limit: "1g" + +# Secret key base for backend (auto-generated if not set) +# blockscout_secret_key_base: + +# API configuration +blockscout_api_v2_enabled: true +blockscout_api_rate_limit: 50 +blockscout_api_rate_limit_by_ip: 3000 + +# Indexer configuration +blockscout_disable_indexer: false +blockscout_disable_realtime_indexer: false +# Batch size for catchup indexing (default: 100). +# Increase for faster block chains (1-sec blocks) to improve indexing throughput. 
+blockscout_catchup_blocks_batch_size: 500 + +# Microservices configuration +blockscout_visualizer_enabled: true +blockscout_sig_provider_enabled: true +blockscout_stats_enabled: true + +# Frontend branding - Load Network +blockscout_network_logo: "https://raw.githubusercontent.com/weaveVM/wvm-media-kit/refs/heads/main/Load%20Network/SVG/wordmark-bright-black.svg" +blockscout_network_logo_dark: "https://raw.githubusercontent.com/weaveVM/wvm-media-kit/refs/heads/main/Load%20Network/SVG/wordmark-bright-white.svg" +blockscout_network_icon: "https://raw.githubusercontent.com/weaveVM/wvm-media-kit/refs/heads/main/Load%20Network/SVG/logo-bright.svg" +blockscout_network_icon_dark: "https://raw.githubusercontent.com/weaveVM/wvm-media-kit/refs/heads/main/Load%20Network/SVG/logo-mono.svg" +blockscout_homepage_plate_background: >- + radial-gradient(103.03% 103.03% at 0% 0%, rgba(0, 110, 255, 0.8) 0%, + rgba(0, 88, 204, 0.8) 100%), var(--chakra-colors-blue-700) +blockscout_color_theme_default: "light" + +# Cache settings +blockscout_cache_block_count_period: 60 +blockscout_cache_txs_count_period: 60 +blockscout_cache_address_sum_period: 60 + +# Log rotation settings +# Application logs (mounted volume) +blockscout_log_max_size: "100M" +blockscout_log_max_files: 5 + +# Docker container logs (json-file driver) +blockscout_docker_log_max_size: "50m" +blockscout_docker_log_max_files: "3" diff --git a/infra/ansible/roles/blockscout/handlers/main.yml b/infra/ansible/roles/blockscout/handlers/main.yml new file mode 100644 index 0000000..21ea0f7 --- /dev/null +++ b/infra/ansible/roles/blockscout/handlers/main.yml @@ -0,0 +1,11 @@ +--- +- name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + +- name: Restart blockscout + ansible.builtin.systemd: + name: blockscout + state: restarted + daemon_reload: true + when: blockscout_manage_service | default(true) | bool diff --git a/infra/ansible/roles/blockscout/tasks/main.yml b/infra/ansible/roles/blockscout/tasks/main.yml 
new file mode 100644 index 0000000..dbfc910 --- /dev/null +++ b/infra/ansible/roles/blockscout/tasks/main.yml @@ -0,0 +1,390 @@ +--- +- name: Check if blockscout is enabled for this host + ansible.builtin.debug: + msg: "Blockscout deployment is disabled for this host" + when: not (blockscout_enabled | default(false) | bool) + +- name: End play if blockscout is not enabled + ansible.builtin.meta: end_host + when: not (blockscout_enabled | default(false) | bool) + +- name: Ensure Docker Compose is available + block: + - name: Install docker-compose-plugin (preferred) + ansible.builtin.apt: + name: docker-compose-plugin + state: present + update_cache: true + cache_valid_time: "{{ loadnet_apt_cache_valid_time | default(3600) }}" + rescue: + - name: Install docker-compose (legacy fallback) + ansible.builtin.apt: + name: docker-compose + state: present + update_cache: true + cache_valid_time: "{{ loadnet_apt_cache_valid_time | default(3600) }}" + +- name: Detect docker binary + ansible.builtin.command: which docker + register: blockscout_docker_path + changed_when: false + failed_when: false + +- name: Detect docker compose v2 + ansible.builtin.command: "{{ blockscout_docker_path.stdout }} compose version" + register: blockscout_compose_v2_check + changed_when: false + failed_when: false + when: blockscout_docker_path.rc == 0 + +- name: Detect docker-compose v1 + ansible.builtin.command: which docker-compose + register: blockscout_compose_v1_path + changed_when: false + failed_when: false + +- name: Set blockscout compose command + ansible.builtin.set_fact: + blockscout_compose_cmd: >- + {{ + (blockscout_docker_path.rc | default(1) == 0 and blockscout_compose_v2_check.rc | default(1) == 0) + | ternary(blockscout_docker_path.stdout + ' compose', blockscout_compose_v1_path.stdout | default('')) + }} + +- name: Assert docker compose is available + ansible.builtin.assert: + that: + - blockscout_compose_cmd | length > 0 + fail_msg: "docker compose not found; install 
docker-compose-plugin or docker-compose" + +- name: Ensure rsync is available (required for synchronize) + block: + - name: Install rsync (with cache update) + ansible.builtin.apt: + name: rsync + state: present + update_cache: true + cache_valid_time: "{{ loadnet_apt_cache_valid_time | default(3600) }}" + update_cache_retries: "{{ loadnet_apt_update_retries | default(10) }}" + update_cache_retry_max_delay: "{{ loadnet_apt_update_max_delay | default(30) }}" + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + rescue: + - name: Warn about apt cache update failure + ansible.builtin.debug: + msg: "apt cache update failed; retrying rsync install without update_cache." + - name: Install rsync (without cache update) + ansible.builtin.apt: + name: rsync + state: present + update_cache: false + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + +- name: Ensure blockscout secrets dir exists (controller-side) + ansible.builtin.file: + path: "{{ net_dir }}/bundle/private/blockscout" + state: directory + mode: "0700" + delegate_to: localhost + become: false + run_once: true + +- name: Check for blockscout DB password (controller-side) + ansible.builtin.stat: + path: "{{ net_dir }}/bundle/private/blockscout/db_password" + register: blockscout_db_password_stat + delegate_to: localhost + become: false + run_once: true + +- name: Generate blockscout DB password (controller-side) + ansible.builtin.copy: + dest: "{{ net_dir }}/bundle/private/blockscout/db_password" + mode: "0600" + content: "{{ lookup('ansible.builtin.password', '/dev/null length=32 chars=ascii_letters,digits') }}" + delegate_to: localhost + become: false + run_once: true + when: not blockscout_db_password_stat.stat.exists + +- name: Read blockscout DB password (controller-side) + ansible.builtin.set_fact: + blockscout_db_password: "{{ lookup('ansible.builtin.file', net_dir + '/bundle/private/blockscout/db_password') }}" + no_log: true + +- name: Check for stats DB password 
(controller-side) + ansible.builtin.stat: + path: "{{ net_dir }}/bundle/private/blockscout/stats_db_password" + register: blockscout_stats_db_password_stat + delegate_to: localhost + become: false + run_once: true + +- name: Generate stats DB password (controller-side) + ansible.builtin.copy: + dest: "{{ net_dir }}/bundle/private/blockscout/stats_db_password" + mode: "0600" + content: "{{ lookup('ansible.builtin.password', '/dev/null length=32 chars=ascii_letters,digits') }}" + delegate_to: localhost + become: false + run_once: true + when: not blockscout_stats_db_password_stat.stat.exists + +- name: Read stats DB password (controller-side) + ansible.builtin.set_fact: + blockscout_stats_db_password: "{{ lookup('ansible.builtin.file', net_dir + '/bundle/private/blockscout/stats_db_password') }}" + no_log: true + +- name: Check for secret key base (controller-side) + ansible.builtin.stat: + path: "{{ net_dir }}/bundle/private/blockscout/secret_key_base" + register: blockscout_secret_key_base_stat + delegate_to: localhost + become: false + run_once: true + +- name: Generate secret key base (controller-side) + ansible.builtin.copy: + dest: "{{ net_dir }}/bundle/private/blockscout/secret_key_base" + mode: "0600" + content: "{{ lookup('ansible.builtin.password', '/dev/null length=64 chars=ascii_letters,digits') }}" + delegate_to: localhost + become: false + run_once: true + when: not blockscout_secret_key_base_stat.stat.exists + +- name: Read secret key base (controller-side) + ansible.builtin.set_fact: + blockscout_secret_key_base: "{{ lookup('ansible.builtin.file', net_dir + '/bundle/private/blockscout/secret_key_base') }}" + no_log: true + +- name: Ensure blockscout base directory exists + ansible.builtin.file: + path: "{{ blockscout_base_dir }}" + state: directory + owner: root + group: root + mode: "0755" + +- name: Ensure blockscout envs directory exists + ansible.builtin.file: + path: "{{ blockscout_base_dir }}/envs" + state: directory + owner: root + group: root 
+ mode: "0755" + +- name: Ensure blockscout data directory exists + ansible.builtin.file: + path: "{{ blockscout_data_dir }}" + state: directory + owner: root + group: root + mode: "0755" + +- name: Ensure blockscout sensitive data directories exist (restricted) + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: root + group: root + mode: "0700" + loop: + - "{{ blockscout_postgres_data_dir }}" + - "{{ blockscout_stats_postgres_data_dir }}" + - "{{ blockscout_redis_data_dir }}" + +- name: Ensure blockscout logs directory exists + ansible.builtin.file: + path: "{{ blockscout_logs_dir }}" + state: directory + owner: root + group: root + mode: "0755" + +# --- Server-side image build (avoids QEMU emulation issues on macOS) --- +- name: Check if backend image exists on server + ansible.builtin.command: + cmd: docker image inspect {{ blockscout_backend_image }} + register: blockscout_backend_image_check + failed_when: false + changed_when: false + +- name: Check if frontend image exists on server + ansible.builtin.command: + cmd: docker image inspect {{ blockscout_frontend_image }} + register: blockscout_frontend_image_check + failed_when: false + changed_when: false + +- name: Verify wvm-blockscout source exists (controller) + ansible.builtin.stat: + path: "{{ blockscout_backend_src_dir }}/docker/Dockerfile" + delegate_to: localhost + become: false + register: blockscout_backend_src_stat + run_once: true + +- name: Verify wvm-blockscout-frontend source exists (controller) + ansible.builtin.stat: + path: "{{ blockscout_frontend_src_dir }}/Dockerfile" + delegate_to: localhost + become: false + register: blockscout_frontend_src_stat + run_once: true + +- name: Assert backend source or image is available + ansible.builtin.assert: + that: + - blockscout_backend_image_check.rc == 0 or blockscout_backend_src_stat.stat.exists + fail_msg: "Blockscout backend image not found and source missing; ensure wvm-blockscout is present or prebuild {{ 
blockscout_backend_image }}" + +- name: Assert frontend source or image is available + ansible.builtin.assert: + that: + - blockscout_frontend_image_check.rc == 0 or blockscout_frontend_src_stat.stat.exists + fail_msg: "Blockscout frontend image not found and source missing; ensure wvm-blockscout-frontend is present or prebuild {{ blockscout_frontend_image }}" + +- name: Ensure build directory exists on server + ansible.builtin.file: + path: /opt/blockscout-build + state: directory + mode: "0755" + when: blockscout_backend_image_check.rc != 0 or blockscout_frontend_image_check.rc != 0 + +# --- Backend build --- +- name: Transfer wvm-blockscout source to server + ansible.posix.synchronize: + src: "{{ blockscout_backend_src_dir }}/" + dest: /opt/blockscout-build/backend/ + rsync_opts: + - "--exclude=.git" + - "--exclude=_build" + - "--exclude=deps" + - "--exclude=node_modules" + delete: true + when: + - blockscout_backend_image_check.rc != 0 + - blockscout_backend_src_stat.stat.exists + +- name: Build backend image on server + ansible.builtin.command: + cmd: > + docker build + --build-arg CACHE_EXCHANGE_RATES_PERIOD="" + --build-arg API_V1_READ_METHODS_DISABLED=false + --build-arg DISABLE_WEBAPP=false + --build-arg API_V1_WRITE_METHODS_DISABLED=false + --build-arg CACHE_TOTAL_GAS_USAGE_COUNTER_ENABLED="" + --build-arg CACHE_ADDRESS_WITH_BALANCES_UPDATE_INTERVAL="" + --build-arg ADMIN_PANEL_ENABLED="" + --build-arg RELEASE_VERSION={{ blockscout_release_version }} + -t {{ blockscout_backend_image }} + -f docker/Dockerfile + . 
+ chdir: /opt/blockscout-build/backend + when: + - blockscout_backend_image_check.rc != 0 + - blockscout_backend_src_stat.stat.exists + changed_when: true + async: 1800 + poll: 30 + +# --- Frontend build --- +- name: Transfer wvm-blockscout-frontend source to server + ansible.posix.synchronize: + src: "{{ blockscout_frontend_src_dir }}/" + dest: /opt/blockscout-build/frontend/ + rsync_opts: + - "--exclude=.git" + - "--exclude=node_modules" + - "--exclude=.next" + delete: true + when: + - blockscout_frontend_image_check.rc != 0 + - blockscout_frontend_src_stat.stat.exists + +- name: Build frontend image on server + ansible.builtin.command: + cmd: docker build -t {{ blockscout_frontend_image }} . + chdir: /opt/blockscout-build/frontend + when: + - blockscout_frontend_image_check.rc != 0 + - blockscout_frontend_src_stat.stat.exists + changed_when: true + async: 1800 + poll: 30 + +- name: Cleanup build directory on server + ansible.builtin.file: + path: /opt/blockscout-build + state: absent + when: blockscout_backend_image_check.rc != 0 or blockscout_frontend_image_check.rc != 0 + +- name: Template common-blockscout.env + ansible.builtin.template: + src: "envs/common-blockscout.env.j2" + dest: "{{ blockscout_base_dir }}/envs/common-blockscout.env" + owner: root + group: root + mode: "0600" + notify: Restart blockscout + +- name: Template common-frontend.env + ansible.builtin.template: + src: "envs/common-frontend.env.j2" + dest: "{{ blockscout_base_dir }}/envs/common-frontend.env" + owner: root + group: root + mode: "0644" + notify: Restart blockscout + +- name: Template common-stats.env + ansible.builtin.template: + src: "envs/common-stats.env.j2" + dest: "{{ blockscout_base_dir }}/envs/common-stats.env" + owner: root + group: root + mode: "0600" + notify: Restart blockscout + +- name: Template common-visualizer.env + ansible.builtin.template: + src: "envs/common-visualizer.env.j2" + dest: "{{ blockscout_base_dir }}/envs/common-visualizer.env" + owner: root + group: 
root + mode: "0644" + notify: Restart blockscout + +- name: Template docker-compose.yml (restricted - contains secrets) + ansible.builtin.template: + src: "docker-compose.yml.j2" + dest: "{{ blockscout_base_dir }}/docker-compose.yml" + owner: root + group: root + mode: "0600" + notify: Restart blockscout + +- name: Template blockscout systemd unit + ansible.builtin.template: + src: "blockscout.service.j2" + dest: "/etc/systemd/system/blockscout.service" + owner: root + group: root + mode: "0644" + notify: Reload systemd + +- name: Install logrotate configuration for blockscout logs + ansible.builtin.template: + src: "blockscout-logrotate.j2" + dest: "/etc/logrotate.d/blockscout" + owner: root + group: root + mode: "0644" + +- name: Enable and start blockscout service + ansible.builtin.systemd: + name: blockscout + enabled: true + daemon_reload: true + state: stopped diff --git a/infra/ansible/roles/blockscout/templates/blockscout-logrotate.j2 b/infra/ansible/roles/blockscout/templates/blockscout-logrotate.j2 new file mode 100644 index 0000000..c41eb23 --- /dev/null +++ b/infra/ansible/roles/blockscout/templates/blockscout-logrotate.j2 @@ -0,0 +1,15 @@ +# Logrotate configuration for Blockscout application logs +# Deployed by: roles/blockscout + +{{ blockscout_logs_dir }}/*.log +{{ blockscout_logs_dir }}/**/*.log +{ + size {{ blockscout_log_max_size | default('100M') }} + rotate {{ blockscout_log_max_files | default(5) }} + missingok + notifempty + compress + delaycompress + copytruncate + sharedscripts +} diff --git a/infra/ansible/roles/blockscout/templates/blockscout.service.j2 b/infra/ansible/roles/blockscout/templates/blockscout.service.j2 new file mode 100644 index 0000000..7513dbb --- /dev/null +++ b/infra/ansible/roles/blockscout/templates/blockscout.service.j2 @@ -0,0 +1,24 @@ +[Unit] +Description=Blockscout Explorer +Documentation=https://docs.blockscout.com/ +After=docker.service +Requires=docker.service + +[Service] +Type=oneshot +RemainAfterExit=yes 
+WorkingDirectory={{ blockscout_base_dir }} +# Note: Backend and frontend images are built on server (avoids QEMU emulation on macOS) +# Images: {{ blockscout_backend_image }}, {{ blockscout_frontend_image }} +ExecStart={{ blockscout_compose_cmd | default('/usr/bin/docker compose') }} up -d --remove-orphans +ExecStop={{ blockscout_compose_cmd | default('/usr/bin/docker compose') }} down +TimeoutStartSec=300 +TimeoutStopSec=300 + +# Logging +StandardOutput=journal +StandardError=journal +SyslogIdentifier=blockscout + +[Install] +WantedBy=multi-user.target diff --git a/infra/ansible/roles/blockscout/templates/docker-compose.yml.j2 b/infra/ansible/roles/blockscout/templates/docker-compose.yml.j2 new file mode 100644 index 0000000..2a23145 --- /dev/null +++ b/infra/ansible/roles/blockscout/templates/docker-compose.yml.j2 @@ -0,0 +1,206 @@ +version: '3.9' + +services: + redis: + image: {{ blockscout_redis_image }} + container_name: blockscout-redis + restart: always + command: redis-server --maxmemory 512mb --maxmemory-policy allkeys-lru + volumes: + - {{ blockscout_redis_data_dir }}:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 + mem_limit: {{ blockscout_redis_memory_limit }} + logging: + driver: json-file + options: + max-size: "{{ blockscout_docker_log_max_size | default('50m') }}" + max-file: "{{ blockscout_docker_log_max_files | default('3') }}" + + db: + image: {{ blockscout_postgres_image }} + container_name: blockscout-db + shm_size: {{ blockscout_postgres_shm_size }} + restart: always + command: postgres -c 'max_connections={{ blockscout_postgres_max_connections }}' -c 'client_connection_check_interval=60000' + environment: + POSTGRES_DB: {{ blockscout_db_name }} + POSTGRES_USER: {{ blockscout_db_user }} + POSTGRES_PASSWORD: {{ blockscout_db_password }} + volumes: + - {{ blockscout_postgres_data_dir }}:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U {{ blockscout_db_user }} -d 
{{ blockscout_db_name }}"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + mem_limit: {{ blockscout_postgres_memory_limit }} + logging: + driver: json-file + options: + max-size: "{{ blockscout_docker_log_max_size | default('50m') }}" + max-file: "{{ blockscout_docker_log_max_files | default('3') }}" + + stats-db: + image: {{ blockscout_postgres_image }} + container_name: blockscout-stats-db + shm_size: {{ blockscout_postgres_shm_size }} + restart: always + command: postgres -c 'max_connections={{ blockscout_postgres_max_connections }}' + environment: + POSTGRES_DB: {{ blockscout_stats_db_name }} + POSTGRES_USER: {{ blockscout_stats_db_user }} + POSTGRES_PASSWORD: {{ blockscout_stats_db_password }} + volumes: + - {{ blockscout_stats_postgres_data_dir }}:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U {{ blockscout_stats_db_user }} -d {{ blockscout_stats_db_name }}"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + mem_limit: {{ blockscout_stats_postgres_memory_limit }} + logging: + driver: json-file + options: + max-size: "{{ blockscout_docker_log_max_size | default('50m') }}" + max-file: "{{ blockscout_docker_log_max_files | default('3') }}" + + backend: + image: {{ blockscout_backend_image }} + container_name: blockscout-backend + restart: always + stop_grace_period: 5m + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + extra_hosts: + - 'host.docker.internal:host-gateway' + command: sh -c "bin/blockscout eval \"Elixir.Explorer.ReleaseTasks.create_and_migrate()\" && bin/blockscout start" + env_file: + - ./envs/common-blockscout.env + environment: + ETHEREUM_JSONRPC_HTTP_URL: {{ blockscout_rpc_url }} + ETHEREUM_JSONRPC_TRACE_URL: {{ blockscout_rpc_trace_url }} + CHAIN_ID: '{{ blockscout_chain_id }}' + volumes: + - {{ blockscout_logs_dir }}:/app/logs/ + ports: + - "127.0.0.1:{{ blockscout_backend_port }}:4000" + healthcheck: + test: ["CMD", "curl", "-f", 
"http://localhost:4000/api/v2/stats"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 60s + mem_limit: {{ blockscout_backend_memory_limit }} + logging: + driver: json-file + options: + max-size: "{{ blockscout_docker_log_max_size | default('50m') }}" + max-file: "{{ blockscout_docker_log_max_files | default('3') }}" + + frontend: + image: {{ blockscout_frontend_image }} + container_name: blockscout-frontend + platform: linux/amd64 + restart: always + depends_on: + backend: + condition: service_healthy + env_file: + - ./envs/common-frontend.env + ports: + - "127.0.0.1:{{ blockscout_frontend_port }}:3000" + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + mem_limit: {{ blockscout_frontend_memory_limit }} + logging: + driver: json-file + options: + max-size: "{{ blockscout_docker_log_max_size | default('50m') }}" + max-file: "{{ blockscout_docker_log_max_files | default('3') }}" + +{% if blockscout_stats_enabled | default(true) %} + stats: + image: {{ blockscout_stats_image }} + container_name: blockscout-stats +{% if blockscout_stats_platform | default('linux/amd64') != 'native' %} + platform: {{ blockscout_stats_platform | default('linux/amd64') }} +{% endif %} + restart: always + depends_on: + stats-db: + condition: service_healthy + backend: + condition: service_healthy + extra_hosts: + - 'host.docker.internal:host-gateway' + env_file: + - ./envs/common-stats.env + environment: + STATS__DB_URL: postgres://{{ blockscout_stats_db_user }}:{{ blockscout_stats_db_password }}@stats-db:5432/{{ blockscout_stats_db_name }} + STATS__BLOCKSCOUT_DB_URL: postgresql://{{ blockscout_db_user }}:{{ blockscout_db_password }}@db:5432/{{ blockscout_db_name }} + STATS__CREATE_DATABASE: 'true' + STATS__RUN_MIGRATIONS: 'true' + ports: + - "127.0.0.1:{{ blockscout_stats_port }}:8050" + mem_limit: {{ blockscout_stats_memory_limit }} + logging: + driver: json-file + options: + 
max-size: "{{ blockscout_docker_log_max_size | default('50m') }}" + max-file: "{{ blockscout_docker_log_max_files | default('3') }}" +{% endif %} + +{% if blockscout_visualizer_enabled | default(true) %} + visualizer: + image: {{ blockscout_visualizer_image }} + container_name: blockscout-visualizer +{% if blockscout_visualizer_platform | default('linux/amd64') != 'native' %} + platform: {{ blockscout_visualizer_platform | default('linux/amd64') }} +{% endif %} + restart: always + env_file: + - ./envs/common-visualizer.env + ports: + - "127.0.0.1:{{ blockscout_visualizer_port }}:8050" + mem_limit: {{ blockscout_visualizer_memory_limit }} + logging: + driver: json-file + options: + max-size: "{{ blockscout_docker_log_max_size | default('50m') }}" + max-file: "{{ blockscout_docker_log_max_files | default('3') }}" +{% endif %} + +{% if blockscout_sig_provider_enabled | default(true) %} + sig-provider: + image: {{ blockscout_sig_provider_image }} + container_name: blockscout-sig-provider +{% if blockscout_sig_provider_platform | default('linux/amd64') != 'native' %} + platform: {{ blockscout_sig_provider_platform | default('linux/amd64') }} +{% endif %} + restart: always + ports: + - "127.0.0.1:{{ blockscout_sig_provider_port }}:8050" + mem_limit: {{ blockscout_sig_provider_memory_limit }} + logging: + driver: json-file + options: + max-size: "{{ blockscout_docker_log_max_size | default('50m') }}" + max-file: "{{ blockscout_docker_log_max_files | default('3') }}" +{% endif %} + +networks: + default: + name: blockscout-network diff --git a/infra/ansible/roles/blockscout/templates/envs/common-blockscout.env.j2 b/infra/ansible/roles/blockscout/templates/envs/common-blockscout.env.j2 new file mode 100644 index 0000000..331fcea --- /dev/null +++ b/infra/ansible/roles/blockscout/templates/envs/common-blockscout.env.j2 @@ -0,0 +1,181 @@ +# Blockscout Backend Environment Configuration +# Generated by Ansible - Do not edit manually + +# Database +DATABASE_URL=postgresql://{{ 
blockscout_db_user }}:{{ blockscout_db_password }}@db:5432/{{ blockscout_db_name }} +ECTO_USE_SSL=false + +# RPC Configuration +ETHEREUM_JSONRPC_VARIANT={{ blockscout_rpc_variant }} +ETHEREUM_JSONRPC_HTTP_URL={{ blockscout_rpc_url }} +ETHEREUM_JSONRPC_TRACE_URL={{ blockscout_rpc_trace_url }} +ETHEREUM_JSONRPC_TRANSPORT=http +ETHEREUM_JSONRPC_DISABLE_ARCHIVE_BALANCES=false + +# Chain Configuration +CHAIN_ID={{ blockscout_chain_id }} +NETWORK= +SUBNETWORK={{ blockscout_chain_name }} +COIN={{ blockscout_chain_symbol }} +COIN_NAME={{ blockscout_chain_symbol }} +EXCHANGE_RATES_COIN={{ blockscout_chain_symbol }} + +# Secret Key +SECRET_KEY_BASE={{ blockscout_secret_key_base }} + +# Server +PORT=4000 +BLOCKSCOUT_HOST={{ blockscout_domain }} +BLOCKSCOUT_PROTOCOL={{ (blockscout_ssl_enabled | default(blockscout_nginx_ssl_enabled | default(true))) | ternary('https', 'http') }} +NETWORK_PATH=/ + +# Pool sizes - optimized for 32GB RAM +POOL_SIZE={{ blockscout_pool_size }} +POOL_SIZE_API={{ blockscout_pool_size_api }} + +# Heartbeat +HEART_BEAT_TIMEOUT=30 + +# Block Configuration +BLOCK_TRANSFORMER=base +EMISSION_FORMAT=DEFAULT + +# Logo +LOGO=/images/blockscout_logo.svg +FOOTER_LOGO=/images/blockscout_logo.svg + +# Footer +FOOTER_LINK_TO_OTHER_EXPLORERS=false +FOOTER_OTHER_EXPLORERS={} +SUPPORTED_CHAINS={} + +# Cache settings +CACHE_BLOCK_COUNT_PERIOD={{ blockscout_cache_block_count_period }} +CACHE_TXS_COUNT_PERIOD={{ blockscout_cache_txs_count_period }} +CACHE_ADDRESS_SUM_PERIOD={{ blockscout_cache_address_sum_period }} +CACHE_TOTAL_GAS_USAGE_PERIOD=60 +CACHE_ADDRESS_TRANSACTIONS_GAS_USAGE_COUNTER_PERIOD=30 +CACHE_TOKEN_HOLDERS_COUNTER_PERIOD=60 +CACHE_TOKEN_TRANSFERS_COUNTER_PERIOD=60 +CACHE_ADDRESS_WITH_BALANCES_UPDATE_INTERVAL=30 +CACHE_AVERAGE_BLOCK_PERIOD=30 +CACHE_MARKET_HISTORY_PERIOD=60 +CACHE_ADDRESS_TRANSACTIONS_COUNTER_PERIOD=60 +CACHE_ADDRESS_TOKENS_USD_SUM_PERIOD=60 +CACHE_ADDRESS_TOKEN_TRANSFERS_COUNTER_PERIOD=60 + +# Token Metadata 
+TOKEN_METADATA_UPDATE_INTERVAL=60 +CONTRACT_MAX_STRING_LENGTH_WITHOUT_TRIMMING=2040 + +# Contract Verification +CONTRACT_VERIFICATION_ALLOWED_SOLIDITY_EVM_VERSIONS=homestead,tangerineWhistle,spuriousDragon,byzantium,constantinople,petersburg,istanbul,berlin,london,paris,shanghai,cancun,default +CONTRACT_VERIFICATION_ALLOWED_VYPER_EVM_VERSIONS=byzantium,constantinople,petersburg,istanbul,berlin,paris,shanghai,cancun,default + +# Webapp +DISABLE_WEBAPP=false + +# API Configuration +API_V2_ENABLED={{ blockscout_api_v2_enabled | lower }} +API_V1_READ_METHODS_DISABLED=false +API_V1_WRITE_METHODS_DISABLED=false +API_RATE_LIMIT_TIME_INTERVAL=1s +API_RATE_LIMIT_BY_IP_TIME_INTERVAL=5m +API_RATE_LIMIT={{ blockscout_api_rate_limit }} +API_RATE_LIMIT_BY_KEY=50 +API_RATE_LIMIT_BY_WHITELISTED_IP=50 +API_RATE_LIMIT_WHITELISTED_IPS= +API_RATE_LIMIT_STATIC_API_KEY= +API_RATE_LIMIT_UI_V2_WITH_TOKEN=5 +API_RATE_LIMIT_BY_IP={{ blockscout_api_rate_limit_by_ip }} +API_RATE_LIMIT_UI_V2_TOKEN_TTL_IN_SECONDS=18000 + +# Indexer +DISABLE_INDEXER={{ blockscout_disable_indexer | lower }} +DISABLE_REALTIME_INDEXER={{ blockscout_disable_realtime_indexer | lower }} +DISABLE_CATCHUP_INDEXER=false +INDEXER_DISABLE_ADDRESS_COIN_BALANCE_FETCHER=false +INDEXER_DISABLE_TOKEN_INSTANCE_REALTIME_FETCHER=false +INDEXER_DISABLE_TOKEN_INSTANCE_RETRY_FETCHER=false +INDEXER_DISABLE_TOKEN_INSTANCE_SANITIZE_FETCHER=false +INDEXER_DISABLE_TOKEN_INSTANCE_LEGACY_SANITIZE_FETCHER=false +INDEXER_DISABLE_PENDING_TRANSACTIONS_FETCHER=false +INDEXER_DISABLE_INTERNAL_TRANSACTIONS_FETCHER=false +INDEXER_CATCHUP_BLOCKS_BATCH_SIZE={{ blockscout_catchup_blocks_batch_size }} + +# Display Settings +SHOW_ADDRESS_MARKETCAP_PERCENTAGE=true +CHECKSUM_ADDRESS_HASHES=true +CHECKSUM_FUNCTION=eth +DISABLE_EXCHANGE_RATES=true +TXS_STATS_ENABLED=true +SHOW_PRICE_CHART=false +SHOW_PRICE_CHART_LEGEND=false +SHOW_TXS_CHART=true +TXS_HISTORIAN_INIT_LAG=0 +TXS_STATS_DAYS_TO_COMPILE_AT_INIT=10 +COIN_BALANCE_HISTORY_DAYS=90 +APPS_MENU=true 
+EXTERNAL_APPS=[] +UNCLES_IN_AVERAGE_BLOCK_TIME=false +MAX_SIZE_UNLESS_HIDE_ARRAY=50 +HIDE_BLOCK_MINER=false +DISPLAY_TOKEN_ICONS=false +SHOW_MAINTENANCE_ALERT=false +MAINTENANCE_ALERT_MESSAGE= + +# ReCaptcha (disabled) +RE_CAPTCHA_DISABLED=false +RE_CAPTCHA_SECRET_KEY= +RE_CAPTCHA_CLIENT_KEY= +RE_CAPTCHA_V3_SECRET_KEY= +RE_CAPTCHA_V3_CLIENT_KEY= + +# Microservices +MICROSERVICE_SC_VERIFIER_ENABLED=true +MICROSERVICE_SC_VERIFIER_URL=https://eth-bytecode-db.services.blockscout.com/ +MICROSERVICE_SC_VERIFIER_TYPE=eth_bytecode_db +MICROSERVICE_ETH_BYTECODE_DB_INTERVAL_BETWEEN_LOOKUPS=10m +MICROSERVICE_ETH_BYTECODE_DB_MAX_LOOKUPS_CONCURRENCY=10 +{% if blockscout_visualizer_enabled | default(true) %} +MICROSERVICE_VISUALIZE_SOL2UML_ENABLED=true +MICROSERVICE_VISUALIZE_SOL2UML_URL=http://visualizer:8050/ +{% else %} +MICROSERVICE_VISUALIZE_SOL2UML_ENABLED=false +{% endif %} +{% if blockscout_sig_provider_enabled | default(true) %} +MICROSERVICE_SIG_PROVIDER_ENABLED=true +MICROSERVICE_SIG_PROVIDER_URL=http://sig-provider:8050/ +{% else %} +MICROSERVICE_SIG_PROVIDER_ENABLED=false +{% endif %} +MICROSERVICE_ACCOUNT_ABSTRACTION_ENABLED=false + +# Sourcify (disabled) +SOURCIFY_INTEGRATION_ENABLED=false +SOURCIFY_SERVER_URL= +SOURCIFY_REPO_URL= + +# Tenderly (disabled) +SHOW_TENDERLY_LINK=false +TENDERLY_CHAIN_PATH= + +# Rewards +FETCH_REWARDS_WAY=trace_block + +# Account (disabled) +ACCOUNT_ENABLED=false +ACCOUNT_CLOAK_KEY= +ACCOUNT_REDIS_URL=redis://redis:6379 + +# EIP-1559 +EIP_1559_ELASTICITY_MULTIPLIER=2 + +# Decode +DECODE_NOT_A_CONTRACT_CALLS=true + +# IPC +IPC_PATH= + +# JSON RPC +JSON_RPC= diff --git a/infra/ansible/roles/blockscout/templates/envs/common-frontend.env.j2 b/infra/ansible/roles/blockscout/templates/envs/common-frontend.env.j2 new file mode 100644 index 0000000..4f2106c --- /dev/null +++ b/infra/ansible/roles/blockscout/templates/envs/common-frontend.env.j2 @@ -0,0 +1,60 @@ +# Blockscout Frontend Environment Configuration +# Generated by Ansible - Do not 
edit manually + +# API Configuration +NEXT_PUBLIC_API_HOST={{ blockscout_domain }} +NEXT_PUBLIC_API_PROTOCOL={{ (blockscout_ssl_enabled | default(blockscout_nginx_ssl_enabled | default(true))) | ternary('https', 'http') }} +NEXT_PUBLIC_API_BASE_PATH=/ +NEXT_PUBLIC_API_WEBSOCKET_PROTOCOL={{ (blockscout_ssl_enabled | default(blockscout_nginx_ssl_enabled | default(true))) | ternary('wss', 'ws') }} +NEXT_PUBLIC_API_SPEC_URL=https://raw.githubusercontent.com/blockscout/blockscout-api-v2-swagger/main/swagger.yaml + +# Stats API +{% if blockscout_stats_enabled | default(true) %} +NEXT_PUBLIC_STATS_API_HOST={{ (blockscout_ssl_enabled | default(blockscout_nginx_ssl_enabled | default(true))) | ternary('https', 'http') }}://{{ blockscout_stats_domain }} +{% endif %} + +# Visualizer API +{% if blockscout_visualizer_enabled | default(true) %} +NEXT_PUBLIC_VISUALIZE_API_HOST={{ (blockscout_ssl_enabled | default(blockscout_nginx_ssl_enabled | default(true))) | ternary('https', 'http') }}://{{ blockscout_domain }} +{% endif %} + +# Network Configuration +NEXT_PUBLIC_NETWORK_NAME={{ blockscout_chain_name }} +NEXT_PUBLIC_NETWORK_SHORT_NAME=Load Network +NEXT_PUBLIC_NETWORK_ID={{ blockscout_chain_id }} +NEXT_PUBLIC_NETWORK_CURRENCY_NAME={{ blockscout_chain_name }} +NEXT_PUBLIC_NETWORK_CURRENCY_SYMBOL={{ blockscout_chain_symbol }} +NEXT_PUBLIC_NETWORK_CURRENCY_DECIMALS={{ blockscout_chain_decimals }} +NEXT_PUBLIC_IS_TESTNET={{ blockscout_is_testnet | lower }} + +# App Configuration +NEXT_PUBLIC_APP_HOST={{ blockscout_domain }} +NEXT_PUBLIC_APP_PROTOCOL={{ (blockscout_ssl_enabled | default(blockscout_nginx_ssl_enabled | default(true))) | ternary('https', 'http') }} +NEXT_PUBLIC_APP_PORT=3000 +# Note: NEXT_PUBLIC_APP_ENV requires NEXT_PUBLIC_SENTRY_DSN, so we omit it + +# Homepage +NEXT_PUBLIC_HOMEPAGE_CHARTS=['daily_txs'] +NEXT_PUBLIC_HOMEPAGE_PLATE_BACKGROUND={{ blockscout_homepage_plate_background }} + +# Theme +NEXT_PUBLIC_COLOR_THEME_DEFAULT={{ blockscout_color_theme_default }} + 
+# Branding - Load Network +NEXT_PUBLIC_NETWORK_LOGO={{ blockscout_network_logo }} +NEXT_PUBLIC_NETWORK_LOGO_DARK={{ blockscout_network_logo_dark }} +NEXT_PUBLIC_NETWORK_ICON={{ blockscout_network_icon }} +NEXT_PUBLIC_NETWORK_ICON_DARK={{ blockscout_network_icon_dark }} + +# Ads (disabled) +NEXT_PUBLIC_AD_BANNER_PROVIDER=none +NEXT_PUBLIC_AD_TEXT_PROVIDER=none + +# Account (disabled) +NEXT_PUBLIC_IS_ACCOUNT_SUPPORTED=false + +# Featured Networks (optional) +# NEXT_PUBLIC_FEATURED_NETWORKS= + +# Sentry (optional) +# NEXT_PUBLIC_SENTRY_DSN= diff --git a/infra/ansible/roles/blockscout/templates/envs/common-stats.env.j2 b/infra/ansible/roles/blockscout/templates/envs/common-stats.env.j2 new file mode 100644 index 0000000..0c9f42a --- /dev/null +++ b/infra/ansible/roles/blockscout/templates/envs/common-stats.env.j2 @@ -0,0 +1,38 @@ +# Blockscout Stats Environment Configuration +# Generated by Ansible - Do not edit manually + +# Server Configuration +STATS__SERVER__HTTP__ENABLED=true +STATS__SERVER__HTTP__ADDR=0.0.0.0:8050 +STATS__SERVER__HTTP__MAX_BODY_SIZE=2097152 + +# gRPC (disabled) +STATS__SERVER__GRPC__ENABLED=false +STATS__SERVER__GRPC__ADDR=0.0.0.0:8051 + +# Database Configuration +# Note: DB URLs are set via environment in docker-compose.yml +STATS__DB_URL= +STATS__BLOCKSCOUT_DB_URL= +STATS__CREATE_DATABASE=false +STATS__RUN_MIGRATIONS=false + +# Scheduler +STATS__DEFAULT_SCHEDULE=0 0 1 * * * * +STATS__FORCE_UPDATE_ON_START=false + +# Metrics (disabled) +STATS__METRICS__ENABLED=false +STATS__METRICS__ADDR=0.0.0.0:6060 +STATS__METRICS__ROUTE=/metrics + +# Jaeger (disabled) +STATS__JAEGER__ENABLED=false +STATS__JAEGER__AGENT_ENDPOINT=localhost:6831 + +# Tracing +STATS__TRACING__ENABLED=true +STATS__TRACING__FORMAT=default + +# Ignore API absence during startup +STATS__IGNORE_BLOCKSCOUT_API_ABSENCE=true diff --git a/infra/ansible/roles/blockscout/templates/envs/common-visualizer.env.j2 b/infra/ansible/roles/blockscout/templates/envs/common-visualizer.env.j2 new 
file mode 100644 index 0000000..effca04 --- /dev/null +++ b/infra/ansible/roles/blockscout/templates/envs/common-visualizer.env.j2 @@ -0,0 +1,5 @@ +# Blockscout Visualizer Environment Configuration +# Generated by Ansible - Do not edit manually + +# gRPC (disabled - using HTTP) +VISUALIZER__SERVER__GRPC__ENABLED=false diff --git a/infra/ansible/roles/blockscout_host/tasks/main.yml b/infra/ansible/roles/blockscout_host/tasks/main.yml new file mode 100644 index 0000000..5690907 --- /dev/null +++ b/infra/ansible/roles/blockscout_host/tasks/main.yml @@ -0,0 +1,78 @@ +--- +- name: Assert Blockscout is enabled in lockfile + ansible.builtin.assert: + that: + - loadnet_lock.blockscout is defined + - loadnet_lock.blockscout.enabled | default(false) + fail_msg: "Blockscout is not enabled in the manifest (blockscout.enabled must be true)" + +- name: Resolve Blockscout host + ansible.builtin.set_fact: + blockscout_host: "{{ loadnet_lock.blockscout.host }}" + +- name: Skip non-Blockscout hosts (strict mode) + ansible.builtin.meta: end_host + when: + - inventory_hostname != blockscout_host + - blockscout_host_strict | default(true) | bool + +- name: Resolve Blockscout domains + ansible.builtin.set_fact: + blockscout_domains: "{{ loadnet_lock.blockscout.domains }}" + when: inventory_hostname == blockscout_host + +- name: Resolve Blockscout RPC node + ansible.builtin.set_fact: + blockscout_rpc_node: "{{ loadnet_lock.blockscout.rpc_node }}" + when: inventory_hostname == blockscout_host + +- name: Resolve Blockscout RPC node info + ansible.builtin.set_fact: + blockscout_rpc_node_info: >- + {{ loadnet_lock.nodes | selectattr('id', 'equalto', blockscout_rpc_node) | list }} + when: inventory_hostname == blockscout_host + +- name: Assert Blockscout RPC node exists in lockfile + ansible.builtin.assert: + that: + - blockscout_rpc_node_info | length == 1 + fail_msg: "blockscout.rpc_node {{ blockscout_rpc_node }} not found in lockfile nodes" + when: inventory_hostname == blockscout_host + 
+- name: Resolve Blockscout RPC port + ansible.builtin.set_fact: + blockscout_rpc_port: "{{ blockscout_rpc_node_info[0].ports.el_http }}" + when: inventory_hostname == blockscout_host + +- name: Set Blockscout RPC URLs (Docker host gateway) + ansible.builtin.set_fact: + blockscout_rpc_url: "http://host.docker.internal:{{ blockscout_rpc_port }}" + blockscout_rpc_trace_url: "http://host.docker.internal:{{ blockscout_rpc_port }}" + when: inventory_hostname == blockscout_host + +- name: Map Blockscout variables for roles + ansible.builtin.set_fact: + # Enable roles + blockscout_enabled: true + blockscout_nginx_enabled: true + # Blockscout role variables + blockscout_domain: "{{ blockscout_domains.explorer }}" + blockscout_stats_domain: "{{ blockscout_domains.stats }}" + blockscout_rpc_domain: "{{ blockscout_domains.rpc }}" + blockscout_chain_id: "{{ loadnet_lock.network.chain_id }}" + blockscout_chain_name: "{{ loadnet_lock.network.name | title }} Testnet" + blockscout_ssl_enabled: "{{ loadnet_lock.blockscout.ssl.enabled | default(true) }}" + # Nginx role variables + blockscout_nginx_explorer_domain: "{{ blockscout_domains.explorer }}" + blockscout_nginx_stats_domain: "{{ blockscout_domains.stats }}" + blockscout_nginx_rpc_domain: "{{ blockscout_domains.rpc }}" + blockscout_nginx_rpc_backend: "127.0.0.1:{{ blockscout_rpc_port }}" + # SSL configuration from lockfile + blockscout_nginx_ssl_enabled: "{{ loadnet_lock.blockscout.ssl.enabled | default(true) }}" + blockscout_nginx_certbot_email: "{{ loadnet_lock.blockscout.ssl.email | default('admin@load.network') }}" + when: inventory_hostname == blockscout_host + +- name: Restrict EL HTTP to Docker bridge by default on Blockscout host + ansible.builtin.set_fact: + loadnet_el_http_allow_cidrs: "{{ loadnet_el_http_allow_cidrs | default(['172.17.0.0/16']) }}" + when: inventory_hostname == blockscout_host diff --git a/infra/ansible/roles/blockscout_nginx/defaults/main.yml b/infra/ansible/roles/blockscout_nginx/defaults/main.yml 
new file mode 100644 index 0000000..b2120d1 --- /dev/null +++ b/infra/ansible/roles/blockscout_nginx/defaults/main.yml @@ -0,0 +1,31 @@ +--- +# Enable/disable nginx role for this host +blockscout_nginx_enabled: false + +# SSL configuration +blockscout_nginx_ssl_enabled: true +blockscout_nginx_certbot_email: "admin@load.network" + +# Domain names +blockscout_nginx_explorer_domain: "fibernet.load.network" +blockscout_nginx_stats_domain: "stats.fibernet.load.network" +blockscout_nginx_rpc_domain: "rpc.fibernet.load.network" + +# SSL certificate paths (Let's Encrypt) +blockscout_nginx_ssl_cert: "/etc/letsencrypt/live/{{ blockscout_nginx_explorer_domain }}/fullchain.pem" +blockscout_nginx_ssl_key: "/etc/letsencrypt/live/{{ blockscout_nginx_explorer_domain }}/privkey.pem" + +# Nginx directories +blockscout_nginx_config_dir: "{{ (loadnet_net_dir | default('/opt/loadnet/current')) }}/nginx" +blockscout_nginx_webroot: "/var/www/html" + +# Backend services (upstream addresses) +blockscout_nginx_backend: "127.0.0.1:4000" +blockscout_nginx_frontend: "127.0.0.1:3001" +blockscout_nginx_visualizer: "127.0.0.1:8051" +blockscout_nginx_stats_backend: "127.0.0.1:8050" +blockscout_nginx_rpc_backend: "127.0.0.1:8545" + +# Rate limiting for RPC +blockscout_nginx_rpc_rate_limit: "100r/s" +blockscout_nginx_rpc_burst: 200 diff --git a/infra/ansible/roles/blockscout_nginx/handlers/main.yml b/infra/ansible/roles/blockscout_nginx/handlers/main.yml new file mode 100644 index 0000000..2858a44 --- /dev/null +++ b/infra/ansible/roles/blockscout_nginx/handlers/main.yml @@ -0,0 +1,16 @@ +--- +- name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + +- name: Restart nginx + ansible.builtin.systemd: + name: nginx-blockscout + state: restarted + daemon_reload: true + +- name: Restart nginx-blockscout + ansible.builtin.systemd: + name: nginx-blockscout + state: restarted + daemon_reload: true diff --git a/infra/ansible/roles/blockscout_nginx/tasks/certbot.yml 
b/infra/ansible/roles/blockscout_nginx/tasks/certbot.yml new file mode 100644 index 0000000..90dc4b4 --- /dev/null +++ b/infra/ansible/roles/blockscout_nginx/tasks/certbot.yml @@ -0,0 +1,83 @@ +--- +- name: Check if SSL certificate already exists + ansible.builtin.stat: + path: "{{ blockscout_nginx_ssl_cert }}" + register: blockscout_nginx_cert_stat + +- name: Ensure webroot directory exists + ansible.builtin.file: + path: "{{ blockscout_nginx_webroot }}/.well-known/acme-challenge" + state: directory + owner: root + group: root + mode: "0755" + +- name: Obtain SSL certificate with certbot (webroot mode) + when: not blockscout_nginx_cert_stat.stat.exists + block: + - name: Stop nginx-blockscout for initial cert issuance (standalone fallback) + ansible.builtin.systemd: + name: nginx-blockscout + state: stopped + register: blockscout_nginx_stop_result + failed_when: + - blockscout_nginx_stop_result.failed + - "'could not find' not in (blockscout_nginx_stop_result.msg | default(''))" + - "'not loaded' not in (blockscout_nginx_stop_result.msg | default(''))" + + - name: Obtain certificate using certbot standalone (initial) + ansible.builtin.command: > + certbot certonly + --standalone + --non-interactive + --agree-tos + --email {{ blockscout_nginx_certbot_email }} + -d {{ blockscout_nginx_explorer_domain }} + -d {{ blockscout_nginx_stats_domain }} + -d {{ blockscout_nginx_rpc_domain }} + register: blockscout_nginx_certbot_result + changed_when: >- + 'Successfully received certificate' in blockscout_nginx_certbot_result.stdout or + 'Congratulations' in blockscout_nginx_certbot_result.stdout + + - name: Display certbot output + ansible.builtin.debug: + var: blockscout_nginx_certbot_result.stdout_lines + when: blockscout_nginx_certbot_result.stdout_lines is defined + +- name: Configure certbot for webroot renewal + ansible.builtin.copy: + dest: /etc/letsencrypt/renewal-hooks/deploy/reload-nginx.sh + owner: root + group: root + mode: "0755" + content: | + #!/bin/bash + # 
Reload nginx after certificate renewal + docker exec nginx-blockscout nginx -s reload 2>/dev/null || true + +- name: Update certbot renewal config for webroot + ansible.builtin.lineinfile: + path: "/etc/letsencrypt/renewal/{{ blockscout_nginx_explorer_domain }}.conf" + regexp: "^authenticator\\s*=" + line: "authenticator = webroot" + when: blockscout_nginx_cert_stat.stat.exists or blockscout_nginx_certbot_result is defined + +- name: Set webroot path in renewal config + ansible.builtin.lineinfile: + path: "/etc/letsencrypt/renewal/{{ blockscout_nginx_explorer_domain }}.conf" + regexp: "^webroot_path\\s*=" + line: "webroot_path = {{ blockscout_nginx_webroot }}" + insertafter: "^authenticator" + when: blockscout_nginx_cert_stat.stat.exists or blockscout_nginx_certbot_result is defined + +- name: Ensure certbot renewal timer is enabled (systemd only - no cron duplication) + ansible.builtin.systemd: + name: certbot.timer + enabled: true + state: started + register: blockscout_nginx_certbot_timer_result + failed_when: + - blockscout_nginx_certbot_timer_result.failed + - "'could not find' not in (blockscout_nginx_certbot_timer_result.msg | default(''))" + - "'not found' not in (blockscout_nginx_certbot_timer_result.msg | default(''))" diff --git a/infra/ansible/roles/blockscout_nginx/tasks/main.yml b/infra/ansible/roles/blockscout_nginx/tasks/main.yml new file mode 100644 index 0000000..699b0a8 --- /dev/null +++ b/infra/ansible/roles/blockscout_nginx/tasks/main.yml @@ -0,0 +1,93 @@ +--- +- name: Check if nginx is enabled for this host + ansible.builtin.meta: end_host + when: not (blockscout_nginx_enabled | default(false) | bool) + +- name: Install certbot + block: + - name: Install certbot (with cache update) + ansible.builtin.apt: + name: + - certbot + state: present + update_cache: true + cache_valid_time: "{{ loadnet_apt_cache_valid_time | default(3600) }}" + update_cache_retries: "{{ loadnet_apt_update_retries | default(10) }}" + update_cache_retry_max_delay: "{{ 
loadnet_apt_update_max_delay | default(30) }}" + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + rescue: + - name: Warn about apt cache update failure + ansible.builtin.debug: + msg: "apt cache update failed; retrying install without update_cache." + - name: Install certbot (without cache update) + ansible.builtin.apt: + name: + - certbot + state: present + update_cache: false + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + +- name: Create nginx directories + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: root + group: root + mode: "0755" + loop: + - "{{ blockscout_nginx_config_dir }}" + - "{{ blockscout_nginx_config_dir }}/conf.d" + - "{{ blockscout_nginx_webroot }}" + - "{{ blockscout_nginx_webroot }}/.well-known" + - "{{ blockscout_nginx_webroot }}/.well-known/acme-challenge" + +- name: Check if nginx.conf is a directory + ansible.builtin.stat: + path: "{{ blockscout_nginx_config_dir }}/nginx.conf" + register: nginx_conf_stat + +- name: Remove nginx.conf if it's a directory (prevents mount failure) + ansible.builtin.file: + path: "{{ blockscout_nginx_config_dir }}/nginx.conf" + state: absent + when: nginx_conf_stat.stat.exists and nginx_conf_stat.stat.isdir + +- name: Render main nginx configuration + ansible.builtin.template: + src: "nginx.conf.j2" + dest: "{{ blockscout_nginx_config_dir }}/nginx.conf" + owner: root + group: root + mode: "0644" + notify: Restart nginx + +- name: Render blockscout site configuration + ansible.builtin.template: + src: "sites/blockscout.conf.j2" + dest: "{{ blockscout_nginx_config_dir }}/conf.d/blockscout.conf" + owner: root + group: root + mode: "0644" + notify: Restart nginx + +- name: Install nginx-blockscout systemd unit + ansible.builtin.template: + src: "nginx.service.j2" + dest: "/etc/systemd/system/nginx-blockscout.service" + owner: root + group: root + mode: "0644" + notify: + - Reload systemd + - Restart nginx-blockscout + +- name: Include certbot tasks + 
ansible.builtin.include_tasks: certbot.yml + when: blockscout_nginx_ssl_enabled | default(true) | bool + +- name: Enable and start nginx-blockscout service + ansible.builtin.systemd: + name: nginx-blockscout + enabled: true + state: started + daemon_reload: true diff --git a/infra/ansible/roles/blockscout_nginx/templates/nginx.conf.j2 b/infra/ansible/roles/blockscout_nginx/templates/nginx.conf.j2 new file mode 100644 index 0000000..680dadb --- /dev/null +++ b/infra/ansible/roles/blockscout_nginx/templates/nginx.conf.j2 @@ -0,0 +1,70 @@ +# {{ ansible_managed }} +# Nginx configuration for Load Network reverse proxy + +user nginx; +worker_processes auto; +error_log /var/log/nginx/error.log warn; +pid /var/run/nginx.pid; + +events { + worker_connections 4096; + use epoll; + multi_accept on; +} + +http { + include /etc/nginx/mime.types; + default_type application/octet-stream; + + log_format main '$remote_addr - $remote_user [$time_local] "$request" ' + '$status $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for" ' + 'rt=$request_time uct="$upstream_connect_time" ' + 'uht="$upstream_header_time" urt="$upstream_response_time"'; + + access_log /var/log/nginx/access.log main; + + sendfile on; + tcp_nopush on; + tcp_nodelay on; + keepalive_timeout 65; + types_hash_max_size 2048; + + # Gzip compression + gzip on; + gzip_vary on; + gzip_proxied any; + gzip_comp_level 6; + gzip_buffers 16 8k; + gzip_http_version 1.1; + gzip_min_length 256; + gzip_types + application/atom+xml + application/geo+json + application/javascript + application/x-javascript + application/json + application/ld+json + application/manifest+json + application/rdf+xml + application/rss+xml + application/xhtml+xml + application/xml + font/eot + font/otf + font/ttf + image/svg+xml + text/css + text/javascript + text/plain + text/xml; + + # Security headers + server_tokens off; + + # Rate limiting zones + limit_req_zone $binary_remote_addr zone=rpc:10m rate={{ 
blockscout_nginx_rpc_rate_limit }}; + + # Include site configurations + include /etc/nginx/conf.d/*.conf; +} diff --git a/infra/ansible/roles/blockscout_nginx/templates/nginx.service.j2 b/infra/ansible/roles/blockscout_nginx/templates/nginx.service.j2 new file mode 100644 index 0000000..00176b0 --- /dev/null +++ b/infra/ansible/roles/blockscout_nginx/templates/nginx.service.j2 @@ -0,0 +1,31 @@ +[Unit] +Description=Nginx Reverse Proxy for Blockscout (Docker) +After=docker.service network-online.target +Requires=docker.service +Wants=network-online.target + +[Service] +Type=simple +Restart=always +RestartSec=5 +TimeoutStartSec=0 + +ExecStartPre=-/usr/bin/docker stop nginx-blockscout +ExecStartPre=-/usr/bin/docker rm nginx-blockscout +ExecStartPre=/usr/bin/docker pull nginx:alpine + +ExecStart=/usr/bin/docker run \ + --name nginx-blockscout \ + --network host \ + --rm \ + -v {{ blockscout_nginx_config_dir }}/nginx.conf:/etc/nginx/nginx.conf:ro \ + -v {{ blockscout_nginx_config_dir }}/conf.d:/etc/nginx/conf.d:ro \ + -v /etc/letsencrypt:/etc/letsencrypt:ro \ + -v {{ blockscout_nginx_webroot }}:/var/www/html:ro \ + nginx:alpine + +ExecStop=/usr/bin/docker stop nginx-blockscout +ExecReload=/usr/bin/docker exec nginx-blockscout nginx -s reload + +[Install] +WantedBy=multi-user.target diff --git a/infra/ansible/roles/blockscout_nginx/templates/sites/blockscout.conf.j2 b/infra/ansible/roles/blockscout_nginx/templates/sites/blockscout.conf.j2 new file mode 100644 index 0000000..17ff7c3 --- /dev/null +++ b/infra/ansible/roles/blockscout_nginx/templates/sites/blockscout.conf.j2 @@ -0,0 +1,326 @@ +# {{ ansible_managed }} +# Blockscout Explorer, Stats API, and RPC endpoint configuration + +{% if blockscout_nginx_ssl_enabled %} +# ============================================================================ +# HTTP -> HTTPS Redirect (Port 80) +# ============================================================================ +server { + listen 80; + listen [::]:80; + server_name {{ 
blockscout_nginx_explorer_domain }} {{ blockscout_nginx_stats_domain }} {{ blockscout_nginx_rpc_domain }}; + + # ACME challenge for Let's Encrypt certificate renewal + location /.well-known/acme-challenge/ { + root {{ blockscout_nginx_webroot }}; + allow all; + } + + # Redirect all other HTTP traffic to HTTPS + location / { + return 301 https://$host$request_uri; + } +} + +# ============================================================================ +# Explorer Frontend/Backend ({{ blockscout_nginx_explorer_domain }}) +# ============================================================================ +server { + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name {{ blockscout_nginx_explorer_domain }}; + + # SSL Configuration + ssl_certificate {{ blockscout_nginx_ssl_cert }}; + ssl_certificate_key {{ blockscout_nginx_ssl_key }}; + ssl_session_timeout 1d; + ssl_session_cache shared:SSL:50m; + ssl_session_tickets off; + + # Modern TLS configuration + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_prefer_server_ciphers off; + + # HSTS + add_header Strict-Transport-Security "max-age=63072000" always; + + # Proxy settings + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_buffering off; + + # API endpoints -> Backend + location /api/ { + proxy_pass http://{{ blockscout_nginx_backend }}; + proxy_read_timeout 60s; + proxy_send_timeout 60s; + } + + # WebSocket support for real-time updates + location /socket/ { + proxy_pass http://{{ blockscout_nginx_backend }}; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_read_timeout 86400s; + 
proxy_send_timeout 86400s; + } + + # Visualizer service + location /visualize/ { + proxy_pass http://{{ blockscout_nginx_visualizer }}/; + proxy_read_timeout 30s; + } + + # Frontend (default) + location / { + proxy_pass http://{{ blockscout_nginx_frontend }}; + proxy_read_timeout 30s; + + # WebSocket support for frontend hot reload (dev) / live updates + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } +} + +# ============================================================================ +# Stats API ({{ blockscout_nginx_stats_domain }}) +# ============================================================================ +server { + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name {{ blockscout_nginx_stats_domain }}; + + # SSL Configuration + ssl_certificate {{ blockscout_nginx_ssl_cert }}; + ssl_certificate_key {{ blockscout_nginx_ssl_key }}; + ssl_session_timeout 1d; + ssl_session_cache shared:SSL:50m; + ssl_session_tickets off; + + # Modern TLS configuration + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_prefer_server_ciphers off; + + # HSTS + add_header Strict-Transport-Security "max-age=63072000" always; + + # CORS headers for Stats API + add_header Access-Control-Allow-Origin "*" always; + add_header Access-Control-Allow-Methods "GET, POST, OPTIONS" always; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization" always; + add_header Access-Control-Expose-Headers "Content-Length,Content-Range" always; + + # Proxy settings + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header 
X-Forwarded-Proto $scheme; + + # Handle preflight requests + location / { + if ($request_method = 'OPTIONS') { + add_header Access-Control-Allow-Origin "*"; + add_header Access-Control-Allow-Methods "GET, POST, OPTIONS"; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization"; + add_header Access-Control-Max-Age 1728000; + add_header Content-Type "text/plain; charset=utf-8"; + add_header Content-Length 0; + return 204; + } + + proxy_pass http://{{ blockscout_nginx_stats_backend }}; + proxy_read_timeout 30s; + } +} + +# ============================================================================ +# RPC Endpoint ({{ blockscout_nginx_rpc_domain }}) +# ============================================================================ +server { + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name {{ blockscout_nginx_rpc_domain }}; + + # SSL Configuration + ssl_certificate {{ blockscout_nginx_ssl_cert }}; + ssl_certificate_key {{ blockscout_nginx_ssl_key }}; + ssl_session_timeout 1d; + ssl_session_cache shared:SSL:50m; + ssl_session_tickets off; + + # Modern TLS configuration + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_prefer_server_ciphers off; + + # HSTS + add_header Strict-Transport-Security "max-age=63072000" always; + + # CORS headers for dApps + add_header Access-Control-Allow-Origin "*" always; + add_header Access-Control-Allow-Methods "GET, POST, OPTIONS" always; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization" always; + add_header Access-Control-Expose-Headers "Content-Length,Content-Range" always; + + # Proxy settings + proxy_http_version 1.1; + 
proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + location / { + # Handle preflight requests + if ($request_method = 'OPTIONS') { + add_header Access-Control-Allow-Origin "*"; + add_header Access-Control-Allow-Methods "GET, POST, OPTIONS"; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization"; + add_header Access-Control-Max-Age 1728000; + add_header Content-Type "text/plain; charset=utf-8"; + add_header Content-Length 0; + return 204; + } + + # Rate limiting + limit_req zone=rpc burst={{ blockscout_nginx_rpc_burst }} nodelay; + limit_req_status 429; + + proxy_pass http://{{ blockscout_nginx_rpc_backend }}; + proxy_read_timeout 30s; + proxy_send_timeout 30s; + proxy_connect_timeout 10s; + + # JSON-RPC specific settings + proxy_set_header Content-Type "application/json"; + } +} +{% else %} +# ============================================================================ +# Explorer Frontend/Backend (HTTP) ({{ blockscout_nginx_explorer_domain }}) +# ============================================================================ +server { + listen 80; + listen [::]:80; + server_name {{ blockscout_nginx_explorer_domain }}; + + # Proxy settings + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_buffering off; + + location /api/ { + proxy_pass http://{{ blockscout_nginx_backend }}; + proxy_read_timeout 60s; + proxy_send_timeout 60s; + } + + location /socket/ { + proxy_pass http://{{ blockscout_nginx_backend }}; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_read_timeout 86400s; + proxy_send_timeout 86400s; + } + + location /visualize/ { + proxy_pass 
http://{{ blockscout_nginx_visualizer }}/; + proxy_read_timeout 30s; + } + + location / { + proxy_pass http://{{ blockscout_nginx_frontend }}; + proxy_read_timeout 30s; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } +} + +# ============================================================================ +# Stats API (HTTP) ({{ blockscout_nginx_stats_domain }}) +# ============================================================================ +server { + listen 80; + listen [::]:80; + server_name {{ blockscout_nginx_stats_domain }}; + + # CORS headers for Stats API + add_header Access-Control-Allow-Origin "*" always; + add_header Access-Control-Allow-Methods "GET, POST, OPTIONS" always; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization" always; + add_header Access-Control-Expose-Headers "Content-Length,Content-Range" always; + + # Proxy settings + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + location / { + if ($request_method = 'OPTIONS') { + add_header Access-Control-Allow-Origin "*"; + add_header Access-Control-Allow-Methods "GET, POST, OPTIONS"; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization"; + add_header Access-Control-Max-Age 1728000; + add_header Content-Type "text/plain; charset=utf-8"; + add_header Content-Length 0; + return 204; + } + + proxy_pass http://{{ blockscout_nginx_stats_backend }}; + proxy_read_timeout 30s; + } +} + +# ============================================================================ +# RPC Endpoint (HTTP) ({{ blockscout_nginx_rpc_domain }}) +# ============================================================================ +server { + listen 80; + listen [::]:80; 
+ server_name {{ blockscout_nginx_rpc_domain }}; + + # CORS headers for dApps + add_header Access-Control-Allow-Origin "*" always; + add_header Access-Control-Allow-Methods "GET, POST, OPTIONS" always; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization" always; + add_header Access-Control-Expose-Headers "Content-Length,Content-Range" always; + + # Proxy settings + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + location / { + if ($request_method = 'OPTIONS') { + add_header Access-Control-Allow-Origin "*"; + add_header Access-Control-Allow-Methods "GET, POST, OPTIONS"; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization"; + add_header Access-Control-Max-Age 1728000; + add_header Content-Type "text/plain; charset=utf-8"; + add_header Content-Length 0; + return 204; + } + + limit_req zone=rpc burst={{ blockscout_nginx_rpc_burst }} nodelay; + limit_req_status 429; + + proxy_pass http://{{ blockscout_nginx_rpc_backend }}; + proxy_read_timeout 30s; + proxy_send_timeout 30s; + proxy_connect_timeout 10s; + proxy_set_header Content-Type "application/json"; + } +} +{% endif %} diff --git a/infra/ansible/roles/common/defaults/main.yml b/infra/ansible/roles/common/defaults/main.yml new file mode 100644 index 0000000..02e170b --- /dev/null +++ b/infra/ansible/roles/common/defaults/main.yml @@ -0,0 +1,78 @@ +--- +loadnet_apt_disable_proxy: false +loadnet_apt_cache_valid_time: 3600 +loadnet_apt_update_retries: 10 +loadnet_apt_update_max_delay: 30 +loadnet_apt_lock_timeout: 120 + +loadnet_docker_log_driver: "local" +loadnet_docker_log_max_size: "50m" +loadnet_docker_log_max_file: "5" + +loadnet_el_http_bind: "0.0.0.0" +loadnet_el_http_api: 
"eth,net,web3,txpool,debug,trace" + +# Keep debug/trace APIs exposed for operational troubleshooting and Blockscout. +# RPC admission defaults are tuned to protect node latency +# under sustained high load. +loadnet_el_rpc_max_connections: 12000 +loadnet_el_rpc_max_request_size: 100 +loadnet_el_rpc_max_response_size: 200 +loadnet_el_rpc_max_tracing_requests: 256 +loadnet_el_rpc_max_blocking_io_requests: 512 +loadnet_el_rpc_send_raw_transaction_sync_timeout: "3s" + +# load-reth method-specific RPC backpressure middleware +loadnet_el_rpc_bp_send_raw_tx_limit: 1024 +loadnet_el_rpc_bp_get_transaction_count_limit: 2048 +loadnet_el_rpc_bp_send_raw_tx_sync_limit: 256 +# Legacy alias supported by template for backward compatibility: +# loadnet_el_rpc_send_raw_tx_sync_timeout + +# RPC cache sizing keeps hot data resident while bounding DB fan-out. +loadnet_el_rpc_cache_max_blocks: 100000 +loadnet_el_rpc_cache_max_receipts: 50000 +loadnet_el_rpc_cache_max_headers: 50000 +loadnet_el_rpc_cache_max_concurrent_db_requests: 2048 + +# Builder settings for high-throughput (200k+ TPS) +# Reduce interval for more responsive block building +# Increase max-tasks for more parallel payload construction +loadnet_el_builder_gaslimit: 2000000000 +loadnet_el_builder_interval: "25ms" +loadnet_el_builder_deadline: 1 +loadnet_el_builder_max_tasks: 512 + +# TxPool settings for high-throughput with bounded failure domains. +# Large pools increase survivability for short bursts, +# but can amplify nonce-gap storms. 
+loadnet_el_txpool_pending_max_count: 1000000 +loadnet_el_txpool_queued_max_count: 1000000 +loadnet_el_txpool_max_pending_txns: 1500000 +loadnet_el_txpool_max_new_txns: 100000 +loadnet_el_txpool_max_account_slots: 128 +# Size limits in MB (blobs are ~128KB each, 6 blobs/tx = ~768KB) +# 32GB pending allows ~40k blob txs or ~hundreds of millions of regular txs +loadnet_el_txpool_pending_max_size: 32768 +loadnet_el_txpool_queued_max_size: 32768 +loadnet_el_txpool_gas_limit: 2000000000 + +loadnet_prometheus_bind: "127.0.0.1" +loadnet_prometheus_port: 9090 +loadnet_prometheus_scrape_interval: "5s" + +loadnet_grafana_bind: "127.0.0.1" +loadnet_grafana_port: 3000 + +loadnet_load_reth_uid: 10001 +loadnet_load_reth_gid: 10001 +loadnet_ultramarine_uid: 10002 +loadnet_ultramarine_gid: 10002 +loadnet_fix_ownership: true + +loadnet_journal_max_use: "4G" +loadnet_journal_max_file_size: "200M" +loadnet_journal_max_retention: "7day" +loadnet_forward_journal_to_syslog: false +loadnet_syslog_max_size: "100M" +loadnet_syslog_max_files: 5 diff --git a/infra/ansible/roles/common/handlers/main.yml b/infra/ansible/roles/common/handlers/main.yml new file mode 100644 index 0000000..967d348 --- /dev/null +++ b/infra/ansible/roles/common/handlers/main.yml @@ -0,0 +1,9 @@ +--- +- name: Apply sysctl settings + ansible.builtin.command: sysctl --system + changed_when: true + +- name: Restart journald + ansible.builtin.systemd: + name: systemd-journald + state: restarted diff --git a/infra/ansible/roles/common/tasks/main.yml b/infra/ansible/roles/common/tasks/main.yml new file mode 100644 index 0000000..2bfe539 --- /dev/null +++ b/infra/ansible/roles/common/tasks/main.yml @@ -0,0 +1,170 @@ +--- +- name: Optionally disable apt proxy + ansible.builtin.copy: + dest: /etc/apt/apt.conf.d/99loadnet-no-proxy + owner: root + group: root + mode: "0644" + content: | + Acquire::http::Proxy "false"; + Acquire::https::Proxy "false"; + Acquire::ftp::Proxy "false"; + when: loadnet_apt_disable_proxy | 
default(false) | bool + +- name: Install dependencies + block: + - name: Install dependencies (with cache update) + ansible.builtin.apt: + name: + - docker.io + - curl + - jq + - chrony + - logrotate + state: present + update_cache: true + cache_valid_time: "{{ loadnet_apt_cache_valid_time | default(3600) }}" + update_cache_retries: "{{ loadnet_apt_update_retries | default(10) }}" + update_cache_retry_max_delay: "{{ loadnet_apt_update_max_delay | default(30) }}" + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + rescue: + - name: Warn about apt cache update failure + ansible.builtin.debug: + msg: "apt cache update failed; retrying install without update_cache." + - name: Install dependencies (without cache update) + ansible.builtin.apt: + name: + - docker.io + - curl + - jq + - chrony + - logrotate + state: present + update_cache: false + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + +- name: Enable and start docker + ansible.builtin.systemd: + name: docker + enabled: true + state: started + +- name: Enable and start chrony (time sync) + ansible.builtin.systemd: + name: chrony + enabled: true + state: started + +- name: Configure sysctl baseline + ansible.builtin.copy: + dest: /etc/sysctl.d/99-loadnet.conf + owner: root + group: root + mode: "0644" + content: | + # Load Network baseline (multi-host) + fs.file-max = 1048576 + + # High-performance TCP settings for 100k+ TPS + net.ipv4.ip_local_port_range = 1024 65535 + net.ipv4.tcp_tw_reuse = 1 + net.core.somaxconn = 65535 + net.ipv4.tcp_max_syn_backlog = 65535 + net.core.netdev_max_backlog = 65535 + net.ipv4.tcp_fin_timeout = 15 + net.ipv4.tcp_keepalive_time = 300 + net.ipv4.tcp_keepalive_intvl = 30 + net.ipv4.tcp_keepalive_probes = 5 + net.netfilter.nf_conntrack_max = 1048576 + net.netfilter.nf_conntrack_tcp_timeout_established = 1800 + + # Network buffers for high throughput + net.core.rmem_max = 134217728 + net.core.wmem_max = 134217728 + net.ipv4.tcp_rmem = 4096 87380 134217728 + 
net.ipv4.tcp_wmem = 4096 65536 134217728 + notify: Apply sysctl settings + +- name: Configure user file descriptor limits + ansible.builtin.copy: + dest: /etc/security/limits.d/99-loadnet.conf + owner: root + group: root + mode: "0644" + content: | + # Load Network high performance limits + * soft nofile 1048576 + * hard nofile 1048576 + ubuntu soft nofile 1048576 + ubuntu hard nofile 1048576 + root soft nofile 1048576 + root hard nofile 1048576 + +- name: Create base directories + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: root + group: root + mode: "0755" + loop: + - /opt/loadnet + - /opt/loadnet/networks + +- name: Ensure journald drop-in dir exists + ansible.builtin.file: + path: /etc/systemd/journald.conf.d + state: directory + owner: root + group: root + mode: "0755" + +- name: Configure journald limits (prevent root disk exhaustion) + ansible.builtin.copy: + dest: /etc/systemd/journald.conf.d/loadnet.conf + owner: root + group: root + mode: "0644" + content: | + [Journal] + SystemMaxUse={{ loadnet_journal_max_use | default('2G') }} + SystemMaxFileSize={{ loadnet_journal_max_file_size | default('200M') }} + MaxRetentionSec={{ loadnet_journal_max_retention | default('7day') }} + ForwardToSyslog={{ (loadnet_forward_journal_to_syslog | default(false)) | ternary('yes', 'no') }} + notify: Restart journald + +- name: Install rsyslog logrotate policy (size-based) + ansible.builtin.copy: + dest: /etc/logrotate.d/rsyslog + owner: root + group: root + mode: "0644" + content: | + /var/log/syslog + { + size {{ loadnet_syslog_max_size | default('100M') }} + rotate {{ loadnet_syslog_max_files | default(5) }} + missingok + notifempty + compress + delaycompress + copytruncate + sharedscripts + postrotate + /usr/lib/rsyslog/rsyslog-rotate || true + endscript + } + when: loadnet_forward_journal_to_syslog | default(false) + +- name: Install systemd unit templates + ansible.builtin.template: + src: "{{ (playbook_dir | dirname | dirname) 
}}/templates/systemd/{{ item.src }}" + dest: "/etc/systemd/system/{{ item.dest }}" + owner: root + group: root + mode: "0644" + loop: + - { src: "load-reth@.service.j2", dest: "load-reth@.service" } + - { src: "ultramarine@.service.j2", dest: "ultramarine@.service" } + - { src: "prometheus.service.j2", dest: "prometheus.service" } + - { src: "grafana.service.j2", dest: "grafana.service" } diff --git a/infra/ansible/roles/firewall/tasks/main.yml b/infra/ansible/roles/firewall/tasks/main.yml new file mode 100644 index 0000000..edf39ca --- /dev/null +++ b/infra/ansible/roles/firewall/tasks/main.yml @@ -0,0 +1,186 @@ +--- +- name: Optionally disable apt proxy + ansible.builtin.copy: + dest: /etc/apt/apt.conf.d/99loadnet-no-proxy + owner: root + group: root + mode: "0644" + content: | + Acquire::http::Proxy "false"; + Acquire::https::Proxy "false"; + Acquire::ftp::Proxy "false"; + when: loadnet_apt_disable_proxy | default(false) | bool + +- name: Install ufw + block: + - name: Install ufw (with cache update) + ansible.builtin.apt: + name: ufw + state: present + update_cache: true + cache_valid_time: "{{ loadnet_apt_cache_valid_time | default(3600) }}" + update_cache_retries: "{{ loadnet_apt_update_retries | default(10) }}" + update_cache_retry_max_delay: "{{ loadnet_apt_update_max_delay | default(30) }}" + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + rescue: + - name: Warn about apt cache update failure + ansible.builtin.debug: + msg: "apt cache update failed; retrying install without update_cache." 
+ - name: Install ufw (without cache update) + ansible.builtin.apt: + name: ufw + state: present + update_cache: false + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + +- name: Ensure SSH stays allowed (never lock out the operator) + community.general.ufw: + rule: allow + port: "{{ ansible_port | default(22) }}" + proto: tcp + comment: SSH access + +- name: Set UFW default incoming policy to deny + community.general.ufw: + default: deny + direction: incoming + +- name: Set UFW default outgoing policy to allow + community.general.ufw: + default: allow + direction: outgoing + +- name: Build node->ports map (from lockfile) + ansible.builtin.set_fact: + loadnet_ports_by_node: >- + {{ dict(loadnet_lock.nodes | map(attribute='id') | zip(loadnet_lock.nodes | map(attribute='ports'))) }} + +- name: Open EL P2P TCP port for nodes on this host + community.general.ufw: + rule: allow + port: "{{ loadnet_ports_by_node[node_id].el_p2p }}" + proto: tcp + comment: "EL P2P TCP {{ node_id }}" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + +- name: Open EL P2P UDP port for nodes on this host + community.general.ufw: + rule: allow + port: "{{ loadnet_ports_by_node[node_id].el_p2p }}" + proto: udp + comment: "EL P2P UDP {{ node_id }}" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + +- name: Open CL P2P port for nodes on this host + community.general.ufw: + rule: allow + port: "{{ loadnet_ports_by_node[node_id].cl_p2p }}" + proto: tcp + comment: "CL P2P {{ node_id }}" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + +- name: Open CL mempool port for nodes on this host + community.general.ufw: + rule: allow + port: "{{ loadnet_ports_by_node[node_id].cl_mempool }}" + proto: tcp + comment: "CL mempool {{ node_id }}" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + +- name: Open EL HTTP ports for allowed CIDRs + community.general.ufw: + rule: allow + port: "{{ 
loadnet_ports_by_node[node_id].el_http }}" + proto: tcp + from_ip: "{{ cidr }}" + comment: "EL HTTP {{ node_id }} (restricted)" + loop: "{{ loadnet_nodes | product(loadnet_el_http_allow_cidrs) | list }}" + loop_control: + loop_var: pair + vars: + node_id: "{{ pair.0 }}" + cidr: "{{ pair.1 }}" + when: + - loadnet_el_http_allow_cidrs is defined + - loadnet_el_http_allow_cidrs | length > 0 + - (loadnet_el_http_bind | default('0.0.0.0')) not in ['127.0.0.1', '::1'] + +- name: Open EL HTTP ports for nodes on this host + community.general.ufw: + rule: allow + port: "{{ loadnet_ports_by_node[node_id].el_http }}" + proto: tcp + comment: "EL HTTP {{ node_id }}" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + when: + - (loadnet_el_http_bind | default('0.0.0.0')) not in ['127.0.0.1', '::1'] + - loadnet_el_http_allow_cidrs is not defined or (loadnet_el_http_allow_cidrs | length == 0) + +- name: Open Grafana port (monitoring) + community.general.ufw: + rule: allow + port: "{{ loadnet_grafana_port | default(3000) }}" + proto: tcp + comment: Grafana dashboard + when: (loadnet_grafana_bind | default('0.0.0.0')) not in ['127.0.0.1', '::1'] + +- name: Open Prometheus port (monitoring) + community.general.ufw: + rule: allow + port: "{{ loadnet_prometheus_port | default(9090) }}" + proto: tcp + comment: Prometheus metrics + when: (loadnet_prometheus_bind | default('127.0.0.1')) not in ['127.0.0.1', '::1'] + +- name: Open HTTP port for Blockscout/nginx + community.general.ufw: + rule: allow + port: "80" + proto: tcp + comment: Blockscout HTTP + when: + - loadnet_lock.blockscout is defined + - loadnet_lock.blockscout.enabled | default(false) + - inventory_hostname == loadnet_lock.blockscout.host + +- name: Open HTTPS port for Blockscout/nginx + community.general.ufw: + rule: allow + port: "443" + proto: tcp + comment: Blockscout HTTPS + when: + - loadnet_lock.blockscout is defined + - loadnet_lock.blockscout.enabled | default(false) + - inventory_hostname == 
loadnet_lock.blockscout.host + +- name: Allow Docker subnet to access EL RPC port (Blockscout indexing) + community.general.ufw: + rule: allow + port: "{{ loadnet_ports_by_node[loadnet_lock.blockscout.rpc_node].el_http }}" + proto: tcp + from_ip: "172.16.0.0/12" + comment: Docker subnet access to RPC for Blockscout + when: + - loadnet_lock.blockscout is defined + - loadnet_lock.blockscout.enabled | default(false) + - inventory_hostname == loadnet_lock.blockscout.host + - loadnet_lock.blockscout.rpc_node is defined + +- name: Enable ufw + community.general.ufw: + state: enabled + +- name: Show ufw status + ansible.builtin.command: "ufw status verbose" + changed_when: false diff --git a/infra/ansible/roles/load_reth/tasks/main.yml b/infra/ansible/roles/load_reth/tasks/main.yml new file mode 100644 index 0000000..adc01e1 --- /dev/null +++ b/infra/ansible/roles/load_reth/tasks/main.yml @@ -0,0 +1,56 @@ +--- +- name: Ensure network directory exists + ansible.builtin.file: + path: "{{ loadnet_net_dir }}" + state: directory + owner: root + group: root + mode: "0755" + +- name: Copy lockfile + ansible.builtin.copy: + src: "{{ net_dir }}/network.lock.json" + dest: "{{ loadnet_net_dir }}/network.lock.json" + owner: root + group: root + mode: "0644" + +- name: Copy public bundle (genesis + network.json) + ansible.builtin.copy: + src: "{{ net_dir }}/bundle/public/" + dest: "{{ loadnet_net_dir }}/bundle/public/" + owner: root + group: root + mode: "0644" + directory_mode: "0755" + +- name: Verify public genesis.json checksum matches lockfile + ansible.builtin.stat: + path: "{{ loadnet_net_dir }}/{{ loadnet_lock.artifacts.public.genesis_json.path }}" + checksum_algorithm: sha256 + register: loadnet_genesis_stat + +- name: Assert public genesis.json checksum matches lockfile + ansible.builtin.assert: + that: + - loadnet_genesis_stat.stat.exists + - loadnet_genesis_stat.stat.checksum == loadnet_lock.artifacts.public.genesis_json.sha256 + fail_msg: "genesis.json checksum mismatch 
on host; rerun net-deploy after regenerating artifacts" + +- name: Copy per-node load-reth env + p2p key + ansible.builtin.include_tasks: per_node.yml + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + +- name: Install load-reth tmpfiles config + ansible.builtin.template: + src: "{{ (playbook_dir | dirname | dirname) }}/templates/tmpfiles/load-reth.conf.j2" + dest: /etc/tmpfiles.d/load-reth.conf + owner: root + group: root + mode: "0644" + +- name: Create load-reth runtime directories via tmpfiles + ansible.builtin.command: systemd-tmpfiles --create /etc/tmpfiles.d/load-reth.conf + changed_when: false diff --git a/infra/ansible/roles/load_reth/tasks/per_node.yml b/infra/ansible/roles/load_reth/tasks/per_node.yml new file mode 100644 index 0000000..85341b5 --- /dev/null +++ b/infra/ansible/roles/load_reth/tasks/per_node.yml @@ -0,0 +1,47 @@ +--- +- name: Ensure per-node directories exist + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: "{{ loadnet_load_reth_uid | default(10001) }}" + group: "{{ loadnet_load_reth_gid | default(10001) }}" + mode: "0755" + loop: + - "{{ loadnet_net_dir }}/bundle/private/env" + - "{{ loadnet_net_dir }}/bundle/private/load-reth/p2p-keys" + - "/var/lib/load-reth/{{ node_id }}" + - "/var/lib/load-reth/{{ node_id }}/.cache" + +- name: Fix load-reth data ownership (recursive) + ansible.builtin.file: + path: "/var/lib/load-reth/{{ node_id }}" + state: directory + owner: "{{ loadnet_load_reth_uid | default(10001) }}" + group: "{{ loadnet_load_reth_gid | default(10001) }}" + recurse: true + when: loadnet_fix_ownership | default(true) | bool + +- name: Copy env file + ansible.builtin.copy: + src: "{{ net_dir }}/bundle/private/env/load-reth-{{ node_id }}.env" + dest: "{{ loadnet_net_dir }}/bundle/private/env/load-reth-{{ node_id }}.env" + owner: root + group: root + mode: "0644" + +- name: Copy p2p key + ansible.builtin.copy: + src: "{{ net_dir }}/bundle/private/load-reth/p2p-keys/{{ node_id }}.key" + 
dest: "{{ loadnet_net_dir }}/bundle/private/load-reth/p2p-keys/{{ node_id }}.key" + owner: "{{ loadnet_load_reth_uid | default(10001) }}" + group: "{{ loadnet_load_reth_gid | default(10001) }}" + mode: "0600" + no_log: true + +- name: Fix load-reth p2p key ownership + ansible.builtin.file: + path: "{{ loadnet_net_dir }}/bundle/private/load-reth/p2p-keys/{{ node_id }}.key" + owner: "{{ loadnet_load_reth_uid | default(10001) }}" + group: "{{ loadnet_load_reth_gid | default(10001) }}" + mode: "0600" + when: loadnet_fix_ownership | default(true) | bool diff --git a/infra/ansible/roles/monitoring/tasks/main.yml b/infra/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000..9b8e559 --- /dev/null +++ b/infra/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,221 @@ +--- +- name: Resolve repo root fallback + ansible.builtin.set_fact: + loadnet_repo_root: "{{ loadnet_repo_root | default(playbook_dir | dirname | dirname | dirname) }}" + +- name: Validate monitoring asset paths on controller + ansible.builtin.stat: + path: "{{ item }}" + loop: + - "{{ loadnet_repo_root }}/monitoring/config-grafana/grafana.ini" + - "{{ loadnet_repo_root }}/monitoring/config-grafana/provisioning/dashboards/default.yml" + - "{{ loadnet_repo_root }}/monitoring/config-grafana/provisioning/dashboards-data/default.json" + register: loadnet_monitoring_assets + delegate_to: localhost + become: false + run_once: true + +- name: Assert monitoring assets exist on controller + ansible.builtin.assert: + that: + - loadnet_monitoring_assets.results | map(attribute='stat.exists') | min + fail_msg: "monitoring assets missing on controller; run deploy from repo checkout with ultramarine/monitoring present" + run_once: true + +- name: Ensure monitoring secrets dir exists (controller-side) + ansible.builtin.file: + path: "{{ net_dir }}/bundle/private/monitoring" + state: directory + mode: "0700" + delegate_to: localhost + become: false + run_once: true + +- name: Write Grafana admin password 
env from extra var (controller-side) + ansible.builtin.copy: + dest: "{{ net_dir }}/bundle/private/monitoring/grafana_admin_password.env" + mode: "0600" + content: | + GRAFANA_ADMIN_PASSWORD_B64={{ loadnet_grafana_admin_password | b64encode }} + no_log: true + delegate_to: localhost + become: false + run_once: true + when: + - loadnet_grafana_admin_password is defined + - (loadnet_grafana_admin_password | length) > 0 + +- name: Check for Grafana admin password env (controller-side) + ansible.builtin.stat: + path: "{{ net_dir }}/bundle/private/monitoring/grafana_admin_password.env" + register: loadnet_grafana_secret_stat + delegate_to: localhost + become: false + run_once: true + +- name: Initialize Grafana admin password env body + ansible.builtin.set_fact: + loadnet_grafana_secret_body: "" + no_log: true + +- name: Read Grafana admin password env (controller-side) + ansible.builtin.set_fact: + loadnet_grafana_secret_body: "{{ lookup('ansible.builtin.file', net_dir + '/bundle/private/monitoring/grafana_admin_password.env') }}" + no_log: true + when: loadnet_grafana_secret_stat.stat.exists + +- name: Parse Grafana admin password (from secrets) + ansible.builtin.set_fact: + loadnet_grafana_admin_password_b64: >- + {{ + (loadnet_grafana_secret_body + | regex_findall('^GRAFANA_ADMIN_PASSWORD_B64=\"?(.*)\"?$', multiline=True) + | first + | default('', true)) + }} + loadnet_grafana_admin_password_plain: >- + {{ + (loadnet_grafana_secret_body + | regex_findall('^GRAFANA_ADMIN_PASSWORD=\"?(.*)\"?$', multiline=True) + | first + | default('', true)) + }} + no_log: true + +- name: Generate Grafana admin password env (controller-side) + ansible.builtin.copy: + dest: "{{ net_dir }}/bundle/private/monitoring/grafana_admin_password.env" + mode: "0600" + content: | + GRAFANA_ADMIN_PASSWORD_B64={{ lookup( + 'ansible.builtin.password', + net_dir + '/bundle/private/monitoring/.grafana_admin_password length=32 chars=ascii_letters,digits' + ) | b64encode }} + no_log: true + 
delegate_to: localhost + become: false + run_once: true + when: + - (not loadnet_grafana_secret_stat.stat.exists) + or ((loadnet_grafana_admin_password_b64 | length) == 0 and (loadnet_grafana_admin_password_plain | length) == 0) + +- name: Refresh Grafana admin password env (controller-side) + ansible.builtin.set_fact: + loadnet_grafana_secret_body: "{{ lookup('ansible.builtin.file', net_dir + '/bundle/private/monitoring/grafana_admin_password.env') }}" + no_log: true + +- name: Re-parse Grafana admin password (from secrets) + ansible.builtin.set_fact: + loadnet_grafana_admin_password_b64: >- + {{ + (loadnet_grafana_secret_body + | regex_findall('^GRAFANA_ADMIN_PASSWORD_B64=\"?(.*)\"?$', multiline=True) + | first + | default('', true)) + }} + loadnet_grafana_admin_password_plain: >- + {{ + (loadnet_grafana_secret_body + | regex_findall('^GRAFANA_ADMIN_PASSWORD=\"?(.*)\"?$', multiline=True) + | first + | default('', true)) + }} + no_log: true + +- name: Set Grafana admin password value + ansible.builtin.set_fact: + loadnet_grafana_admin_password_value: >- + {{ + (loadnet_grafana_admin_password_b64 | length > 0) + | ternary(loadnet_grafana_admin_password_b64 | b64decode, loadnet_grafana_admin_password_plain) + }} + no_log: true + +- name: Assert Grafana admin password is present + ansible.builtin.assert: + that: + - (loadnet_grafana_admin_password_value | default('', true) | length) > 0 + fail_msg: "grafana admin password is missing or empty (expected {{ net_dir }}/bundle/private/monitoring/grafana_admin_password.env)" + no_log: true + +- name: Build node->details map (from lockfile) + ansible.builtin.set_fact: + loadnet_nodes_by_id: "{{ dict(loadnet_lock.nodes | map(attribute='id') | zip(loadnet_lock.nodes)) }}" + +- name: Build host node list for monitoring + ansible.builtin.set_fact: + loadnet_host_nodes: "{{ loadnet_nodes | map('extract', loadnet_nodes_by_id) | list }}" + +- name: Ensure monitoring directories exist + ansible.builtin.file: + path: "{{ item }}" + 
state: directory + owner: root + group: root + mode: "0755" + loop: + - "{{ loadnet_net_dir }}/monitoring" + - "{{ loadnet_net_dir }}/monitoring/grafana" + - "{{ loadnet_net_dir }}/monitoring/grafana/provisioning" + - "{{ loadnet_net_dir }}/monitoring/grafana/provisioning/dashboards" + - "{{ loadnet_net_dir }}/monitoring/grafana/provisioning/dashboards-data" + - "{{ loadnet_net_dir }}/monitoring/grafana/provisioning/datasources" + +- name: Ensure monitoring data directories exist + ansible.builtin.file: + path: "{{ item.path }}" + state: directory + owner: "{{ item.owner }}" + group: "{{ item.group }}" + mode: "0755" + loop: + - { path: "/var/lib/grafana", owner: "472", group: "472" } + - { path: "/var/lib/prometheus", owner: "65534", group: "65534" } + +- name: Render Prometheus config for host nodes + ansible.builtin.template: + src: "prometheus.yml.j2" + dest: "{{ loadnet_net_dir }}/monitoring/prometheus.yml" + owner: root + group: root + mode: "0644" + +- name: Render Prometheus alerting rules + ansible.builtin.template: + src: "alert_rules.yml.j2" + dest: "{{ loadnet_net_dir }}/monitoring/alert_rules.yml" + owner: root + group: root + mode: "0644" + +- name: Render Grafana base config + ansible.builtin.template: + src: "grafana.ini.j2" + dest: "{{ loadnet_net_dir }}/monitoring/grafana/grafana.ini" + owner: "472" + group: "472" + mode: "0640" + +- name: Copy Grafana dashboards provisioning config + ansible.builtin.copy: + src: "{{ loadnet_repo_root }}/monitoring/config-grafana/provisioning/dashboards/default.yml" + dest: "{{ loadnet_net_dir }}/monitoring/grafana/provisioning/dashboards/default.yml" + owner: root + group: root + mode: "0644" + +- name: Copy Grafana dashboards data + ansible.builtin.copy: + src: "{{ loadnet_repo_root }}/monitoring/config-grafana/provisioning/dashboards-data/default.json" + dest: "{{ loadnet_net_dir }}/monitoring/grafana/provisioning/dashboards-data/default.json" + owner: root + group: root + mode: "0644" + +- name: Render Grafana 
Prometheus datasource + ansible.builtin.template: + src: "grafana-prometheus.yml.j2" + dest: "{{ loadnet_net_dir }}/monitoring/grafana/provisioning/datasources/prometheus.yml" + owner: root + group: root + mode: "0644" diff --git a/infra/ansible/roles/monitoring/templates/alert_rules.yml.j2 b/infra/ansible/roles/monitoring/templates/alert_rules.yml.j2 new file mode 100644 index 0000000..973a5be --- /dev/null +++ b/infra/ansible/roles/monitoring/templates/alert_rules.yml.j2 @@ -0,0 +1,72 @@ +groups: + - name: loadnet_alerts + rules: + # Node availability + - alert: NodeDown + expr: up == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Node {{ '{{' }} $labels.instance {{ '}}' }} is down" + description: "Prometheus target {{ '{{' }} $labels.instance {{ '}}' }} has been down for more than 2 minutes." + + # High memory usage + - alert: HighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{ '{{' }} $labels.instance {{ '}}' }}" + description: "Memory usage is above 90% on {{ '{{' }} $labels.instance {{ '}}' }} (current value: {{ '{{' }} $value | printf \"%.1f\" {{ '}}' }}%)." + + # Disk space low + - alert: DiskSpaceLow + expr: (1 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"})) * 100 > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "Low disk space on {{ '{{' }} $labels.instance {{ '}}' }}" + description: "Disk usage on root filesystem is above 85% on {{ '{{' }} $labels.instance {{ '}}' }} (current value: {{ '{{' }} $value | printf \"%.1f\" {{ '}}' }}%)." 
+ + - alert: DiskSpaceCritical + expr: (1 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"})) * 100 > 95 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical disk space on {{ '{{' }} $labels.instance {{ '}}' }}" + description: "Disk usage on root filesystem is above 95% on {{ '{{' }} $labels.instance {{ '}}' }} (current value: {{ '{{' }} $value | printf \"%.1f\" {{ '}}' }}%)." + + # High CPU usage + - alert: HighCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90 + for: 10m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ '{{' }} $labels.instance {{ '}}' }}" + description: "CPU usage is above 90% on {{ '{{' }} $labels.instance {{ '}}' }} for more than 10 minutes (current value: {{ '{{' }} $value | printf \"%.1f\" {{ '}}' }}%)." + + # Transaction pool metrics (if available from load-reth) + # Alert at 200K pending (66% of 300K limit) to catch saturation early + - alert: TxPoolBacklog + expr: reth_txpool_pending > 200000 + for: 5m + labels: + severity: warning + annotations: + summary: "High txpool backlog on {{ '{{' }} $labels.node_id {{ '}}' }}" + description: "Transaction pool has more than 200,000 pending transactions on {{ '{{' }} $labels.node_id {{ '}}' }} (current: {{ '{{' }} $value {{ '}}' }}). Limit is 300K." + + # Execution layer not producing blocks + - alert: NoBlocksProduced + expr: increase(reth_sync_checkpoint_number[5m]) == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "No blocks produced on {{ '{{' }} $labels.node_id {{ '}}' }}" + description: "No new blocks have been produced on {{ '{{' }} $labels.node_id {{ '}}' }} for 5 minutes." 
diff --git a/infra/ansible/roles/monitoring/templates/grafana-prometheus.yml.j2 b/infra/ansible/roles/monitoring/templates/grafana-prometheus.yml.j2 new file mode 100644 index 0000000..1f2af9c --- /dev/null +++ b/infra/ansible/roles/monitoring/templates/grafana-prometheus.yml.j2 @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://127.0.0.1:{{ loadnet_prometheus_port | default(9090) }} + isDefault: true diff --git a/infra/ansible/roles/monitoring/templates/grafana.ini.j2 b/infra/ansible/roles/monitoring/templates/grafana.ini.j2 new file mode 100644 index 0000000..b06e872 --- /dev/null +++ b/infra/ansible/roles/monitoring/templates/grafana.ini.j2 @@ -0,0 +1,18 @@ +[dashboards] +min_refresh_interval = 1s + +[database] +# Avoid SQLite lock contention in dev/testnet stacks (Docker Desktop can be flaky). +high_availability = false +cache_mode = private +wal = true +query_retries = 5 +transaction_retries = 10 + +[security] +admin_user = admin +admin_password = {{ loadnet_grafana_admin_password_value }} + +[server] +http_addr = {{ loadnet_grafana_bind | default('0.0.0.0') }} +http_port = {{ loadnet_grafana_port | default(3000) }} diff --git a/infra/ansible/roles/monitoring/templates/prometheus.yml.j2 b/infra/ansible/roles/monitoring/templates/prometheus.yml.j2 new file mode 100644 index 0000000..98c4029 --- /dev/null +++ b/infra/ansible/roles/monitoring/templates/prometheus.yml.j2 @@ -0,0 +1,29 @@ +global: + scrape_interval: {{ loadnet_prometheus_scrape_interval | default('5s') }} + +rule_files: + - "/etc/prometheus/alert_rules.yml" + +scrape_configs: +{% for node in loadnet_host_nodes %} + - job_name: "load-reth-{{ node.id }}" + metrics_path: "/metrics" + scrape_interval: {{ loadnet_prometheus_scrape_interval | default('5s') }} + static_configs: + - targets: ['127.0.0.1:{{ node.ports.el_metrics }}'] + labels: + client_name: "load-reth" + client_type: "execution" + node_id: "{{ node.id }}" 
+{% endfor %} +{% for node in loadnet_host_nodes %} + - job_name: "ultramarine-{{ node.id }}" + metrics_path: "/metrics" + scrape_interval: {{ loadnet_prometheus_scrape_interval | default('5s') }} + static_configs: + - targets: ['127.0.0.1:{{ node.ports.cl_metrics }}'] + labels: + client_name: "ultramarine" + client_type: "consensus" + node_id: "{{ node.id }}" +{% endfor %} diff --git a/infra/ansible/roles/storage/tasks/main.yml b/infra/ansible/roles/storage/tasks/main.yml new file mode 100644 index 0000000..2c36ccb --- /dev/null +++ b/infra/ansible/roles/storage/tasks/main.yml @@ -0,0 +1,436 @@ +--- +- name: Optionally disable apt proxy + ansible.builtin.copy: + dest: /etc/apt/apt.conf.d/99loadnet-no-proxy + owner: root + group: root + mode: "0644" + content: | + Acquire::http::Proxy "false"; + Acquire::https::Proxy "false"; + Acquire::ftp::Proxy "false"; + when: loadnet_apt_disable_proxy | default(false) | bool + +- name: Install storage dependencies + block: + - name: Install storage dependencies (with cache update) + ansible.builtin.apt: + name: + - mdadm + - xfsprogs + - rsync + state: present + update_cache: true + cache_valid_time: "{{ loadnet_apt_cache_valid_time | default(3600) }}" + update_cache_retries: "{{ loadnet_apt_update_retries | default(10) }}" + update_cache_retry_max_delay: "{{ loadnet_apt_update_max_delay | default(30) }}" + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + rescue: + - name: Warn about apt cache update failure + ansible.builtin.debug: + msg: "apt cache update failed; retrying install without update_cache." 
+ - name: Install storage dependencies (without cache update) + ansible.builtin.apt: + name: + - mdadm + - xfsprogs + - rsync + state: present + update_cache: false + lock_timeout: "{{ loadnet_apt_lock_timeout | default(120) }}" + +- name: Assert required inputs for destructive mode + ansible.builtin.assert: + that: + - loadnet_storage_wipe | bool + - loadnet_data_devices is iterable + - (loadnet_data_devices | length) >= 1 + fail_msg: "Destructive storage bootstrap requires loadnet_storage_wipe=true and explicit loadnet_data_devices=[/dev/disk/by-id/...]" + when: loadnet_storage_wipe | bool + +- name: Ensure mountpoint exists + ansible.builtin.file: + path: "{{ loadnet_data_mountpoint }}" + state: directory + owner: root + group: root + mode: "0755" + +- name: Create mdadm array (RAID1 or RAID0) (destructive) + ansible.builtin.command: >- + mdadm --create {{ loadnet_md_device }} + --name {{ loadnet_md_name }} + --level {{ loadnet_data_raid_level }} + --raid-devices {{ loadnet_data_devices | length }} + {{ loadnet_data_devices | join(' ') }} + when: + - loadnet_storage_wipe | bool + - (loadnet_data_devices | length) >= 2 + changed_when: true + +- name: Persist mdadm config (destructive) + ansible.builtin.shell: | + set -euo pipefail + mdadm --detail --scan | tee /etc/mdadm/mdadm.conf >/dev/null + update-initramfs -u + args: + executable: /bin/bash + when: + - loadnet_storage_wipe | bool + - (loadnet_data_devices | length) >= 2 + changed_when: true + +- name: Create filesystem on md device (destructive) + ansible.builtin.command: "mkfs.{{ loadnet_fs_type }} -f {{ loadnet_md_device }}" + when: + - loadnet_storage_wipe | bool + - (loadnet_data_devices | length) >= 2 + changed_when: true + +- name: Create filesystem on single device (destructive) + ansible.builtin.command: "mkfs.{{ loadnet_fs_type }} -f {{ loadnet_data_devices[0] }}" + when: + - loadnet_storage_wipe | bool + - (loadnet_data_devices | length) == 1 + changed_when: true + +- name: Get filesystem UUID + 
ansible.builtin.command: "blkid -s UUID -o value {{ (loadnet_data_devices | length) >= 2 | ternary(loadnet_md_device, loadnet_data_devices[0]) }}" + register: loadnet_fs_uuid + changed_when: false + when: loadnet_storage_wipe | bool + +- name: Persist fstab entry (destructive) + ansible.builtin.lineinfile: + path: /etc/fstab + create: true + mode: "0644" + line: "UUID={{ loadnet_fs_uuid.stdout }} {{ loadnet_data_mountpoint }} {{ loadnet_fs_type }} defaults,noatime 0 2" + regexp: "^UUID={{ loadnet_fs_uuid.stdout | regex_escape() }}\\s+{{ loadnet_data_mountpoint | regex_escape() }}\\s+" + when: loadnet_storage_wipe | bool + +- name: Mount data volume (destructive) + ansible.builtin.command: "mount {{ loadnet_data_mountpoint }}" + when: loadnet_storage_wipe | bool + changed_when: true + +- name: Verify data mount is present (after destructive bootstrap) + ansible.builtin.command: "findmnt -n {{ loadnet_data_mountpoint }}" + changed_when: false + when: loadnet_storage_wipe | bool + +- name: Verify data mount is present (non-destructive mode) + ansible.builtin.command: "findmnt -n {{ loadnet_data_mountpoint }}" + changed_when: false + when: not (loadnet_storage_wipe | bool) + register: mount_check + failed_when: false + +- name: Get root filesystem source device (for safety checks) + ansible.builtin.command: "findmnt -n -o SOURCE /" + changed_when: false + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + register: root_source + +- name: Lookup configured source mount (auto-adopt) + ansible.builtin.set_fact: + loadnet_configured_source_mount: >- + {{ + (ansible_facts.mounts | selectattr('mount', 'equalto', loadnet_data_source_mountpoint) | list | first) + | default({}) + }} + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + +- name: Build auto-adopt candidate list (non-root mounts) + ansible.builtin.set_fact: + loadnet_mount_candidates: >- + {{ + ansible_facts.mounts + | rejectattr('mount', 'equalto', '/') + | rejectattr('mount', 
'match', '^/boot') + | rejectattr('mount', 'match', '^/efi') + | rejectattr('fstype', 'in', loadnet_mount_excluded_fstypes) + | rejectattr('device', 'equalto', root_source.stdout) + | list + }} + vars: + loadnet_mount_excluded_fstypes: + - tmpfs + - devtmpfs + - overlay + - squashfs + - proc + - sysfs + - cgroup + - cgroup2 + - mqueue + - pstore + - tracefs + - debugfs + - securityfs + - fusectl + - rpc_pipefs + - autofs + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + +- name: Select auto-adopt source mount when configured mount is missing + ansible.builtin.set_fact: + loadnet_auto_source_mount: "{{ loadnet_mount_candidates[0] }}" + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + - loadnet_configured_source_mount | length == 0 + - loadnet_mount_candidates | length == 1 + +- name: Warn when multiple non-root mount candidates exist + ansible.builtin.debug: + msg: >- + Multiple non-root mountpoints detected: {{ loadnet_mount_candidates | map(attribute='mount') | list }}. + Set loadnet_data_source_mountpoint explicitly to select one. 
+ when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + - loadnet_configured_source_mount | length == 0 + - loadnet_mount_candidates | length > 1 + +- name: Select effective source mount (configured or auto-detected) + ansible.builtin.set_fact: + loadnet_effective_source_mount: >- + {{ + (loadnet_configured_source_mount | length > 0) + | ternary(loadnet_configured_source_mount, loadnet_auto_source_mount | default({})) + }} + loadnet_effective_source_mountpoint: >- + {{ + (loadnet_configured_source_mount | length > 0) + | ternary(loadnet_configured_source_mount.mount, (loadnet_auto_source_mount | default({})).mount | default('')) + }} + loadnet_effective_source_device: >- + {{ + (loadnet_configured_source_mount | length > 0) + | ternary(loadnet_configured_source_mount.device, (loadnet_auto_source_mount | default({})).device | default('')) + }} + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + +- name: Select effective source dir + ansible.builtin.set_fact: + loadnet_effective_source_dir: >- + {{ + (loadnet_configured_source_mount | length > 0) + | ternary(loadnet_data_source_dir, + (loadnet_effective_source_mountpoint != '') | ternary(loadnet_effective_source_mountpoint + '/loadnet', '')) + }} + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + +- name: Fail safely if source mountpoint is on the root filesystem (refuse to use OS disk) + ansible.builtin.fail: + msg: >- + Refusing to auto-adopt {{ loadnet_effective_source_mountpoint }} because it appears to be on the root filesystem + (SOURCE {{ loadnet_effective_source_device }} == {{ root_source.stdout }}). Mount a separate data volume at {{ loadnet_data_mountpoint }} + or provide explicit loadnet_data_devices + loadnet_storage_wipe=true. 
+ when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + - loadnet_effective_source_mountpoint != '' + - loadnet_effective_source_device == root_source.stdout + +- name: Ensure source dir exists (auto-adopt) + ansible.builtin.file: + path: "{{ loadnet_effective_source_dir }}" + state: directory + owner: root + group: root + mode: "0755" + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + - loadnet_effective_source_mountpoint != '' + - loadnet_effective_source_device != root_source.stdout + +- name: Bind-mount source dir into data mountpoint (auto-adopt) + ansible.builtin.command: "mount --bind {{ loadnet_effective_source_dir }} {{ loadnet_data_mountpoint }}" + changed_when: true + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + - loadnet_effective_source_mountpoint != '' + - loadnet_effective_source_device != root_source.stdout + +- name: Persist bind-mount in fstab (auto-adopt) + ansible.builtin.lineinfile: + path: /etc/fstab + create: true + mode: "0644" + line: >- + {{ loadnet_effective_source_dir }} {{ loadnet_data_mountpoint }} none + bind,x-systemd.requires-mounts-for={{ loadnet_effective_source_mountpoint }} 0 0 + regexp: "^{{ loadnet_effective_source_dir | regex_escape() }}\\s+{{ loadnet_data_mountpoint | regex_escape() }}\\s+none\\s+" + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + - loadnet_effective_source_mountpoint != '' + - loadnet_effective_source_device != root_source.stdout + +- name: Re-check data mount is present after auto-adopt + ansible.builtin.command: "findmnt -n {{ loadnet_data_mountpoint }}" + changed_when: false + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + register: mount_check_after + failed_when: false + +- name: Fail with guidance if data mount is missing (non-destructive mode) + ansible.builtin.fail: + msg: >- + {{ loadnet_data_mountpoint }} is not mounted. 
+ Either mount your data volume there, or run net-storage with loadnet_storage_wipe=true and explicit loadnet_data_devices. + If your provider image mounts the data volume elsewhere, set loadnet_data_source_mountpoint (default: /home) + and/or loadnet_data_source_dir (default: /home/loadnet). + Detected mountpoints: {{ loadnet_mount_candidates | default([]) | map(attribute='mount') | list }}. + when: + - not (loadnet_storage_wipe | bool) + - mount_check.rc != 0 + - mount_check_after.rc != 0 + +- name: Check existing /var/log mount + ansible.builtin.command: "findmnt -n -o SOURCE /var/log" + changed_when: false + failed_when: false + register: loadnet_var_log_source + +- name: Bind-mount /var/log to data volume (optional) + when: + - loadnet_bind_var_log | bool + - loadnet_var_log_source.rc != 0 or loadnet_var_log_source.stdout != loadnet_log_dir + block: + - name: Ensure log dir on data volume exists + ansible.builtin.file: + path: "{{ loadnet_log_dir }}" + state: directory + owner: root + group: root + mode: "0755" + + - name: Sync existing logs to data volume + ansible.builtin.command: "rsync -aHAX /var/log/ {{ loadnet_log_dir }}/" + changed_when: true + + - name: Bind-mount log dir onto /var/log + ansible.builtin.command: "mount --bind {{ loadnet_log_dir }} /var/log" + changed_when: true + + - name: Persist /var/log bind-mount in fstab + ansible.builtin.lineinfile: + path: /etc/fstab + create: true + mode: "0644" + line: "{{ loadnet_log_dir }} /var/log none bind,x-systemd.requires-mounts-for={{ loadnet_data_mountpoint }} 0 0" + regexp: "^{{ loadnet_log_dir | regex_escape() }}\\s+/var/log\\s+none\\s+" + +- name: Ensure chain state dirs exist under data mount + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: root + group: root + mode: "0755" + loop: + - "{{ loadnet_data_mountpoint }}/load-reth" + - "{{ loadnet_data_mountpoint }}/ultramarine" + +- name: Symlink /var/lib/load-reth to data mount + ansible.builtin.file: + src: "{{ 
loadnet_data_mountpoint }}/load-reth" + dest: /var/lib/load-reth + state: link + force: true + +- name: Symlink /var/lib/ultramarine to data mount + ansible.builtin.file: + src: "{{ loadnet_data_mountpoint }}/ultramarine" + dest: /var/lib/ultramarine + state: link + force: true + +- name: Ensure Docker data-root dir exists (optional) + ansible.builtin.file: + path: "{{ loadnet_docker_dataroot }}" + state: directory + owner: root + group: root + mode: "0711" + when: loadnet_move_docker_dataroot | bool + +- name: Ensure /etc/docker exists (optional) + ansible.builtin.file: + path: /etc/docker + state: directory + owner: root + group: root + mode: "0755" + when: loadnet_move_docker_dataroot | bool + +- name: Check if docker service exists (optional) + ansible.builtin.command: "systemctl status docker" + changed_when: false + failed_when: false + register: loadnet_docker_status + when: loadnet_move_docker_dataroot | bool + +- name: Check for existing docker data (optional) + ansible.builtin.stat: + path: /var/lib/docker + register: loadnet_docker_data + when: loadnet_move_docker_dataroot | bool + +- name: Configure Docker data-root (optional) + ansible.builtin.copy: + dest: /etc/docker/daemon.json + owner: root + group: root + mode: "0644" + content: | + { + "data-root": "{{ loadnet_docker_dataroot }}" + } + register: docker_daemon_json + when: loadnet_move_docker_dataroot | bool + +- name: Stop docker before moving data-root (optional) + ansible.builtin.systemd: + name: docker + state: stopped + when: + - loadnet_move_docker_dataroot | bool + - docker_daemon_json.changed + - loadnet_docker_status.rc != 4 + +- name: Migrate existing docker state (optional, best-effort) + ansible.builtin.command: "rsync -aHAX --delete /var/lib/docker/ {{ loadnet_docker_dataroot }}/" + when: + - loadnet_move_docker_dataroot | bool + - docker_daemon_json.changed + - loadnet_docker_data.stat.exists + changed_when: true + failed_when: false + +- name: Start docker after moving data-root 
(optional) + ansible.builtin.systemd: + name: docker + state: started + enabled: true + when: + - loadnet_move_docker_dataroot | bool + - docker_daemon_json.changed + - loadnet_docker_status.rc != 4 diff --git a/infra/ansible/roles/ultramarine/tasks/main.yml b/infra/ansible/roles/ultramarine/tasks/main.yml new file mode 100644 index 0000000..d114fdd --- /dev/null +++ b/infra/ansible/roles/ultramarine/tasks/main.yml @@ -0,0 +1,177 @@ +--- +- name: Ensure per-node directories exist + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: "{{ loadnet_ultramarine_uid | default(10002) }}" + group: "{{ loadnet_ultramarine_gid | default(10002) }}" + mode: "0755" + loop: + - "{{ loadnet_net_dir }}/bundle/private/env" + - "{{ loadnet_net_dir }}/bundle/private/ultramarine/secrets" + - "/var/lib/ultramarine" + +- name: Derive validator nodes from lockfile + ansible.builtin.set_fact: + loadnet_validator_nodes: "{{ loadnet_lock.nodes | selectattr('role', 'equalto', 'validator') | map(attribute='id') | list }}" + +- name: Copy per-node ultramarine env + ansible.builtin.copy: + src: "{{ net_dir }}/bundle/private/env/ultramarine-{{ node_id }}.env" + dest: "{{ loadnet_net_dir }}/bundle/private/env/ultramarine-{{ node_id }}.env" + owner: root + group: root + mode: "0644" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + +- name: Check for per-node archiver secret env (controller-side) + ansible.builtin.stat: + path: "{{ net_dir }}/bundle/private/ultramarine/secrets/{{ node_id }}.env" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + index_var: node_idx + register: archiver_secret_stats + delegate_to: localhost + become: false + +- name: Assert archiver secret env exists for validator nodes + ansible.builtin.assert: + that: + - archiver_secret_stats.results[node_idx].stat.exists + fail_msg: "missing archiver secret env for validator node {{ node_id }}; expected {{ net_dir }}/bundle/private/ultramarine/secrets/{{ node_id }}.env" + 
loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + index_var: node_idx + when: node_id in loadnet_validator_nodes + +- name: Copy per-node archiver secret env (if present) + ansible.builtin.copy: + src: "{{ net_dir }}/bundle/private/ultramarine/secrets/{{ node_id }}.env" + dest: "{{ loadnet_net_dir }}/bundle/private/ultramarine/secrets/{{ node_id }}.env" + owner: root + group: root + mode: "0600" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + index_var: node_idx + when: archiver_secret_stats.results[node_idx].stat.exists + no_log: true + +- name: Read archiver secret env (controller-side) + ansible.builtin.slurp: + src: "{{ net_dir }}/bundle/private/ultramarine/secrets/{{ node_id }}.env" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + index_var: node_idx + when: + - node_id in loadnet_validator_nodes + - archiver_secret_stats.results[node_idx].stat.exists + register: archiver_secret_contents + delegate_to: localhost + become: false + no_log: true + +- name: Assert archiver bearer token is present for validator nodes + ansible.builtin.assert: + that: + - (archiver_secret_contents.results[node_idx].content | default('') | b64decode) is search('^ULTRAMARINE_ARCHIVER_BEARER_TOKEN=\"?.+\"?$', multiline=True) + fail_msg: "archiver bearer token missing/empty for validator node {{ node_id }}" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + index_var: node_idx + when: + - node_id in loadnet_validator_nodes + - archiver_secret_stats.results[node_idx].stat.exists + no_log: true + +- name: Ensure per-node ultramarine home config dir exists + ansible.builtin.file: + path: "/var/lib/ultramarine/{{ node_id }}/config" + state: directory + owner: "{{ loadnet_ultramarine_uid | default(10002) }}" + group: "{{ loadnet_ultramarine_gid | default(10002) }}" + mode: "0755" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + +- name: Fix ultramarine data ownership (recursive) + ansible.builtin.file: 
+ path: "/var/lib/ultramarine/{{ node_id }}" + state: directory + owner: "{{ loadnet_ultramarine_uid | default(10002) }}" + group: "{{ loadnet_ultramarine_gid | default(10002) }}" + recurse: true + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + when: loadnet_fix_ownership | default(true) | bool + +- name: Copy ultramarine config.toml + genesis.json (managed) + ansible.builtin.copy: + src: "{{ net_dir }}/bundle/private/ultramarine/homes/{{ node_id }}/config/{{ item }}" + dest: "/var/lib/ultramarine/{{ node_id }}/config/{{ item }}" + owner: root + group: root + mode: "0644" + loop: "{{ loadnet_nodes | product(['config.toml', 'genesis.json']) | list }}" + loop_control: + loop_var: pair + vars: + node_id: "{{ pair.0 }}" + item: "{{ pair.1 }}" + +- name: Check for priv_validator_key.json in bundle (controller-side) + ansible.builtin.stat: + path: "{{ net_dir }}/bundle/private/ultramarine/homes/{{ node_id }}/config/priv_validator_key.json" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + index_var: node_idx + register: bundle_priv_validator_key_stats + delegate_to: localhost + become: false + +- name: Check if priv_validator_key.json already exists on host + ansible.builtin.stat: + path: "/var/lib/ultramarine/{{ node_id }}/config/priv_validator_key.json" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + index_var: node_idx + register: host_priv_validator_key_stats + +- name: Seed priv_validator_key.json (never overwrite) + ansible.builtin.copy: + src: "{{ net_dir }}/bundle/private/ultramarine/homes/{{ node_id }}/config/priv_validator_key.json" + dest: "/var/lib/ultramarine/{{ node_id }}/config/priv_validator_key.json" + owner: "{{ loadnet_ultramarine_uid | default(10002) }}" + group: "{{ loadnet_ultramarine_gid | default(10002) }}" + mode: "0600" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + index_var: node_idx + when: + - bundle_priv_validator_key_stats.results[node_idx].stat.exists + - not 
host_priv_validator_key_stats.results[node_idx].stat.exists + no_log: true + +- name: Enforce priv_validator_key.json permissions (0600) + ansible.builtin.file: + path: "/var/lib/ultramarine/{{ node_id }}/config/priv_validator_key.json" + owner: "{{ loadnet_ultramarine_uid | default(10002) }}" + group: "{{ loadnet_ultramarine_gid | default(10002) }}" + mode: "0600" + loop: "{{ loadnet_nodes }}" + loop_control: + loop_var: node_id + index_var: node_idx + when: host_priv_validator_key_stats.results[node_idx].stat.exists diff --git a/infra/gen/netgen/Cargo.toml b/infra/gen/netgen/Cargo.toml new file mode 100644 index 0000000..e2d61d6 --- /dev/null +++ b/infra/gen/netgen/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "ultramarine-netgen" +version.workspace = true +edition.workspace = true +license.workspace = true + +[[bin]] +name = "netgen" +path = "src/main.rs" + +[dependencies] +clap = { workspace = true, features = ["derive"] } +color-eyre = { workspace = true } +hex = { workspace = true } +rand = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +sha2 = { workspace = true } +toml = { workspace = true } +ultramarine-cli = { path = "../../../crates/cli" } +ultramarine-genesis = { path = "../../../crates/genesis" } +ultramarine-types = { path = "../../../crates/types" } +which = "8.0.0" + +# YAML parsing for manifests/secrets. +serde_yaml = "0.9.34" + +# Load-reth p2p key generation + enode derivation. +k256 = { version = "0.13", features = ["ecdsa"] } + +# Human-readable duration parsing for sync config. +humantime = "2.1" + +# Byte size parsing for sync config (e.g. "150 MiB"). +# Must match malachitebft_config's bytesize version (1.3), not the workspace (2.x). 
+bytesize = "1.3" diff --git a/infra/gen/netgen/src/main.rs b/infra/gen/netgen/src/main.rs new file mode 100644 index 0000000..f095bcd --- /dev/null +++ b/infra/gen/netgen/src/main.rs @@ -0,0 +1,1303 @@ +use std::{ + collections::{BTreeMap, BTreeSet}, + fs, + path::{Path, PathBuf}, + process::Command, +}; + +use bytesize::ByteSize; +use clap::{Parser, Subcommand}; +use color_eyre::eyre::{Result, bail, eyre}; +use k256::{SecretKey, elliptic_curve::sec1::ToEncodedPoint}; +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; +use ultramarine_types::{ + genesis::Genesis as ConsensusGenesis, + signing::PrivateKey as ConsensusPrivateKey, + validator_set::{Validator, ValidatorSet}, +}; + +#[derive(Parser, Debug)] +#[command(author, version, about)] +struct Cli { + #[command(subcommand)] + cmd: Cmd, +} + +#[derive(Subcommand, Debug)] +enum Cmd { + /// Validate `infra/manifests/.yaml`. + Validate { + #[arg(long)] + manifest: PathBuf, + #[arg(long, default_value_t = false)] + allow_unsafe_failure_domains: bool, + }, + /// Generate lockfile + bundle outputs under `infra/networks//`. + Gen { + #[arg(long)] + manifest: PathBuf, + #[arg(long)] + out_dir: PathBuf, + /// Path to `secrets.sops.yaml` (encrypted) or plaintext YAML. If encrypted, requires + /// `sops` available on PATH. + #[arg(long)] + secrets_file: Option, + /// Allow generating bundles without providing validator archiver bearer tokens. + /// + /// By default, generation fails if any validator is missing a bearer token, because + /// Ultramarine validators fail fast without it. 
+ #[arg(long, default_value_t = false)] + allow_missing_archiver_tokens: bool, + #[arg(long, default_value_t = false)] + allow_unsafe_failure_domains: bool, + }, +} + +#[derive(Clone, Debug, Deserialize)] +struct Manifest { + schema_version: u32, + network: Network, + execution: Option, + images: Images, + hosts: Vec, + nodes: Vec, + engine: Engine, + ports: Ports, + sync: Sync, + /// P2P message size configuration for handling large blocks. + #[serde(default)] + p2p: P2pConfig, + archiver: Archiver, + exposure: Exposure, + blockscout: Option, + #[serde(default)] + validation: Validation, +} + +#[derive(Clone, Debug, Deserialize)] +struct Network { + name: String, + chain_id: u64, +} + +#[derive(Clone, Debug, Deserialize)] +struct ExecutionGenesis { + alloc: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct ExecutionAlloc { + address: String, + /// Balance as a decimal string in wei (e.g. "15000000000000000000000"). + balance_wei: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct Images { + ultramarine: String, + load_reth: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct Host { + id: String, + public_ip: String, + ssh_user: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct Node { + id: String, + host: String, + role: String, // validator|fullnode|rpc +} + +#[derive(Clone, Debug, Deserialize)] +struct Engine { + mode: String, // ipc + ipc_path_template: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct Ports { + allocation: String, // host-block|by-index + host_block_stride: Option, + el: ElPorts, + cl: ClPorts, +} + +#[derive(Clone, Debug, Deserialize)] +struct ElPorts { + http: u16, + authrpc: Option, + p2p: u16, + metrics: u16, +} + +#[derive(Clone, Debug, Deserialize)] +struct ClPorts { + p2p: u16, + mempool: u16, + metrics: u16, +} + +#[derive(Clone, Debug, Deserialize)] +struct Sync { + enabled: bool, + /// Global sync tuning applied to ALL nodes (validators and fullnodes). 
+ /// These set the baseline; the `fullnode` section can override them for non-validators. + #[serde(default)] + parallel_requests: Option, + #[serde(default)] + request_timeout: Option, + #[serde(default)] + max_request_size: Option, + #[serde(default)] + max_response_size: Option, + #[serde(default)] + batch_size: Option, + /// Fullnode-specific sync tuning (optional). + /// These settings are applied only to non-validator nodes (fullnode/rpc roles) + /// to help them catch up faster when syncing from genesis. + #[serde(default)] + fullnode: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct FullnodeSyncConfig { + /// Number of parallel sync requests (default: 5 for validators, higher for fullnodes). + /// Recommended: 20-30 for fullnodes syncing a large chain. + #[serde(default)] + parallel_requests: Option, + /// Timeout for sync requests (default: "10s"). + /// Recommended: "30s" for fullnodes with high parallel_requests. + #[serde(default)] + request_timeout: Option, + /// Maximum request size (default: "1 MiB"). + #[serde(default)] + max_request_size: Option, + /// Maximum response size (default: "10 MiB"). + #[serde(default)] + max_response_size: Option, + /// Batch size for sync requests (default: 5). + #[serde(default)] + batch_size: Option, +} + +/// P2P message size configuration. +/// These limits must be large enough to handle big blocks during load tests. +#[derive(Clone, Debug, Default, Deserialize)] +struct P2pConfig { + /// Maximum pubsub message size (default: "4 MiB"). + /// Load tests can produce blocks up to 6+ MB, so increase this. + #[serde(default)] + pubsub_max_size: Option, + /// Maximum RPC message size (default: "10 MiB"). + /// Should be larger than pubsub_max_size. 
+ #[serde(default)] + rpc_max_size: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct Archiver { + enabled: bool, + provider_url: String, + provider_id: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct Exposure { + metrics_bind: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct Blockscout { + enabled: bool, + host: String, + rpc_node: String, + domains: BlockscoutDomains, + ssl: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct BlockscoutDomains { + explorer: String, + stats: String, + rpc: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct BlockscoutSsl { + enabled: bool, + email: String, +} + +#[derive(Clone, Debug, Deserialize, Default)] +struct Validation { + allow_unsafe_failure_domains: bool, +} + +#[derive(Clone, Debug, Deserialize)] +struct Secrets { + schema_version: u32, + grafana_admin_password: Option, + nodes: BTreeMap, +} + +#[derive(Clone, Debug, Deserialize)] +struct NodeSecrets { + archiver_bearer_token: String, +} + +#[derive(Clone, Debug, Serialize)] +struct Lockfile { + schema_version: u32, + tool: ToolInfo, + network: LockNetwork, + inputs: Inputs, + policy: Policy, + hosts: Vec, + nodes: Vec, + blockscout: Option, + artifacts: Artifacts, +} + +#[derive(Clone, Debug, Serialize)] +struct ToolInfo { + name: &'static str, + version: &'static str, +} + +#[derive(Clone, Debug, Serialize)] +struct LockNetwork { + name: String, + chain_id: u64, +} + +#[derive(Clone, Debug, Serialize)] +struct Inputs { + manifest_path: String, + manifest_sha256: String, +} + +#[derive(Clone, Debug, Serialize)] +struct Policy { + engine: &'static str, + sync_enabled: bool, + metrics_bind: String, + unsafe_failure_domains_allowed: bool, +} + +#[derive(Clone, Debug, Serialize)] +struct LockHost { + id: String, + public_ip: String, + ssh_user: String, +} + +#[derive(Clone, Debug, Serialize)] +struct LockNode { + id: String, + host: String, + role: String, + images: ImagesOut, + engine: EngineOut, + ports: 
PortsOut, + load_reth: LoadRethOut, + archiver: Option, +} + +#[derive(Clone, Debug, Serialize)] +struct ImagesOut { + ultramarine: String, + load_reth: String, +} + +#[derive(Clone, Debug, Serialize)] +struct EngineOut { + mode: &'static str, + ipc_path: String, +} + +#[derive(Clone, Debug, Serialize)] +struct PortsOut { + el_http: u16, + el_authrpc: u16, + el_p2p: u16, + el_metrics: u16, + cl_p2p: u16, + cl_mempool: u16, + cl_metrics: u16, +} + +#[derive(Clone, Debug, Serialize)] +struct LoadRethOut { + p2p_key_path: String, + enode: String, + bootnodes: Vec, +} + +#[derive(Clone, Debug, Serialize)] +struct ArchiverOut { + enabled: bool, + provider_url: String, + provider_id: String, + bearer_token_present: bool, +} + +#[derive(Clone, Debug, Serialize)] +struct Artifacts { + public: PublicArtifacts, +} + +#[derive(Clone, Debug, Serialize)] +struct BlockscoutOut { + enabled: bool, + host: String, + rpc_node: String, + domains: BlockscoutDomains, + ssl: Option, +} + +#[derive(Clone, Debug, Serialize)] +struct NetworkJson { + schema_version: u32, + tool: ToolInfo, + network: LockNetwork, + nodes: Vec, + artifacts: Artifacts, +} + +#[derive(Clone, Debug, Serialize)] +struct NetworkNode { + id: String, + role: String, + host: String, + public_ip: String, + ports: PortsOut, + load_reth: NetworkLoadReth, + archiver: Option, +} + +#[derive(Clone, Debug, Serialize)] +struct NetworkLoadReth { + enode: String, + bootnodes: Vec, +} + +#[derive(Clone, Debug, Serialize)] +struct NetworkArchiver { + enabled: bool, + provider_url: String, + provider_id: String, +} + +#[derive(Clone, Debug, Serialize)] +struct PublicArtifacts { + genesis_json: ArtifactRef, +} + +#[derive(Clone, Debug, Serialize)] +struct ArtifactRef { + path: String, + sha256: String, +} + +fn sha256_file(path: &Path) -> Result { + let bytes = fs::read(path)?; + Ok(hex::encode(Sha256::digest(bytes))) +} + +fn read_yaml Deserialize<'de>>(path: &Path) -> Result { + let raw = fs::read_to_string(path)?; + 
Ok(serde_yaml::from_str(&raw)?) +} + +fn read_secrets(path: &Path) -> Result { + let raw = if path.extension().and_then(|s| s.to_str()) == Some("yaml") && + path.file_name().and_then(|s| s.to_str()).unwrap_or("").ends_with(".sops.yaml") + { + let sops = which::which("sops").map_err(|_| { + eyre!("secrets file looks like sops-encrypted, but `sops` is not available on PATH") + })?; + let out = Command::new(sops).args(["-d"]).arg(path).output()?; + if !out.status.success() { + bail!("sops -d failed: {}", String::from_utf8_lossy(&out.stderr)); + } + String::from_utf8(out.stdout)? + } else { + fs::read_to_string(path)? + }; + + Ok(serde_yaml::from_str(&raw)?) +} + +fn validate_manifest(m: &Manifest, allow_unsafe_failure_domains: bool) -> Result<()> { + if m.schema_version != 1 { + bail!("schema_version must be 1"); + } + if m.network.name.trim().is_empty() { + bail!("network.name must be non-empty"); + } + if m.images.ultramarine.trim().is_empty() || m.images.load_reth.trim().is_empty() { + bail!("images.ultramarine/images.load_reth must be non-empty"); + } + if m.engine.mode != "ipc" { + bail!("engine.mode must be 'ipc' (deploys are IPC-only)"); + } + if !m.engine.ipc_path_template.contains("{node_id}") { + bail!("engine.ipc_path_template must contain '{{node_id}}' placeholder"); + } + if !m.sync.enabled { + bail!("sync.enabled must be true for multi-host networks"); + } + if let Some(exe) = &m.execution { + if exe.alloc.is_empty() { + bail!("execution.alloc must be non-empty when execution section is provided"); + } + for a in &exe.alloc { + if !a.address.starts_with("0x") || a.address.len() != 42 { + bail!( + "execution.alloc.address must be a 0x-prefixed 20-byte hex address (got {})", + a.address + ); + } + if a.balance_wei.trim().is_empty() { + bail!("execution.alloc.balance_wei must be non-empty for {}", a.address); + } + } + } + if !m.archiver.enabled { + bail!("archiver.enabled must be true (validators require archiver)"); + } + if 
m.archiver.provider_url.trim().is_empty() || m.archiver.provider_id.trim().is_empty() { + bail!("archiver.provider_url/provider_id must be non-empty"); + } + + match m.ports.allocation.as_str() { + "host-block" | "by-index" => {} + other => bail!("ports.allocation must be host-block|by-index (got {other})"), + } + if m.ports.allocation == "host-block" { + let stride = m.ports.host_block_stride.unwrap_or(1_000); + if stride == 0 { + bail!("ports.host_block_stride must be >= 1"); + } + let mut counts: BTreeMap<&str, usize> = BTreeMap::new(); + for n in &m.nodes { + *counts.entry(n.host.as_str()).or_default() += 1; + } + if let Some((host, count)) = counts.into_iter().max_by_key(|(_, c)| *c) && + count > stride as usize + { + bail!( + "ports.host_block_stride={} is too small: host {} has {} nodes; need >= {} to avoid port collisions", + stride, + host, + count, + count + ); + } + } + + let mut host_ids = BTreeSet::new(); + for h in &m.hosts { + if !host_ids.insert(h.id.clone()) { + bail!("duplicate host id: {}", h.id); + } + if h.public_ip.trim().is_empty() || h.ssh_user.trim().is_empty() { + bail!("host {} public_ip/ssh_user must be non-empty", h.id); + } + } + if m.hosts.is_empty() { + bail!("hosts must be non-empty"); + } + + let mut node_ids = BTreeSet::new(); + for n in &m.nodes { + if !node_ids.insert(n.id.clone()) { + bail!("duplicate node id: {}", n.id); + } + if !host_ids.contains(&n.host) { + bail!("node {} references unknown host {}", n.id, n.host); + } + match n.role.as_str() { + "validator" | "fullnode" | "rpc" => {} + _ => bail!("node {} role must be validator|fullnode|rpc", n.id), + } + } + if m.nodes.is_empty() { + bail!("nodes must be non-empty"); + } + + if let Some(blockscout) = &m.blockscout && + blockscout.enabled + { + if !host_ids.contains(&blockscout.host) { + bail!("blockscout.host {} not found in hosts", blockscout.host); + } + if !m.nodes.iter().any(|n| n.id == blockscout.rpc_node) { + bail!("blockscout.rpc_node {} not found in nodes", 
blockscout.rpc_node); + } + if blockscout.domains.explorer.trim().is_empty() || + blockscout.domains.stats.trim().is_empty() || + blockscout.domains.rpc.trim().is_empty() + { + bail!("blockscout.domains must include explorer/stats/rpc"); + } + if let Some(ssl) = &blockscout.ssl && + ssl.enabled && + ssl.email.trim().is_empty() + { + bail!("blockscout.ssl.email must be set when ssl.enabled=true"); + } + } + + // Failure-domain liveness math: strict >2/3 (Tendermint/Malachite). + let validators: Vec<&Node> = m.nodes.iter().filter(|n| n.role == "validator").collect(); + if validators.is_empty() { + bail!("at least one validator node is required"); + } + let n_total = validators.len() as u64; + let max_allowed = (n_total.saturating_sub(1) / 3) as usize; + let mut counts: BTreeMap<&str, usize> = BTreeMap::new(); + for v in validators { + *counts.entry(v.host.as_str()).or_default() += 1; + } + let offenders: BTreeMap<&str, usize> = + counts.into_iter().filter(|(_, c)| *c > max_allowed).collect(); + if !offenders.is_empty() { + if n_total < 4 { + eprintln!( + "warning: validator placement cannot be resilient to losing any single host with n={} equal-weight validators (survival would require validators_per_host<=0). 
offenders={:?}", + n_total, offenders + ); + } else if !allow_unsafe_failure_domains { + bail!( + "unsafe validator placement across failure domains; validators_per_host should be <= {} for n={} to survive losing any single host; offenders={:?} (override with --allow-unsafe-failure-domains)", + max_allowed, + n_total, + offenders + ); + } else { + eprintln!( + "warning: unsafe validator placement across failure domains; offenders={:?}", + offenders + ); + } + } + + Ok(()) +} + +fn engine_ipc_path(m: &Manifest, node_id: &str) -> String { + m.engine.ipc_path_template.replace("{node_id}", node_id) +} + +fn port_offset_host_block(m: &Manifest, node: &Node) -> Result { + let stride = m.ports.host_block_stride.unwrap_or(1_000); + let host_index = m + .hosts + .iter() + .position(|h| h.id == node.host) + .ok_or_else(|| eyre!("unknown host referenced: {}", node.host))?; + let mut same_host: Vec<&Node> = m.nodes.iter().filter(|n| n.host == node.host).collect(); + same_host.sort_by(|a, b| a.id.cmp(&b.id)); + let local_index = same_host + .iter() + .position(|n| n.id == node.id) + .ok_or_else(|| eyre!("node not found in host set"))?; + let base = (host_index as u32) * (stride as u32); + let off = base + (local_index as u32); + u16::try_from(off).map_err(|_| eyre!("port offset overflow")) +} + +fn port_offset_by_index(m: &Manifest, node: &Node) -> Result { + let mut nodes: Vec<&Node> = m.nodes.iter().collect(); + nodes.sort_by(|a, b| a.id.cmp(&b.id)); + let idx = nodes.iter().position(|n| n.id == node.id).ok_or_else(|| eyre!("node not found"))?; + u16::try_from(idx).map_err(|_| eyre!("node index overflow")) +} + +fn ports_for_node(m: &Manifest, node: &Node) -> Result { + let off = match m.ports.allocation.as_str() { + "host-block" => port_offset_host_block(m, node)?, + "by-index" => port_offset_by_index(m, node)?, + other => bail!("unsupported ports.allocation: {other}"), + }; + let authrpc_base = m.ports.el.authrpc.unwrap_or(8551); + + fn add_port(base: u16, off: u16, what: 
&str) -> Result { + let v = (base as u32) + (off as u32); + if v > u16::MAX as u32 { + bail!( + "port allocation overflow for {what}: base={base} offset={off} => {v} (max {})", + u16::MAX + ); + } + Ok(v as u16) + } + + Ok(PortsOut { + el_http: add_port(m.ports.el.http, off, "el.http")?, + el_authrpc: add_port(authrpc_base, off, "el.authrpc")?, + el_p2p: add_port(m.ports.el.p2p, off, "el.p2p")?, + el_metrics: add_port(m.ports.el.metrics, off, "el.metrics")?, + cl_p2p: add_port(m.ports.cl.p2p, off, "cl.p2p")?, + cl_mempool: add_port(m.ports.cl.mempool, off, "cl.mempool")?, + cl_metrics: add_port(m.ports.cl.metrics, off, "cl.metrics")?, + }) +} + +fn write_atomic(path: &Path, bytes: &[u8]) -> Result<()> { + write_atomic_with_mode(path, bytes, None) +} + +fn write_atomic_with_mode(path: &Path, bytes: &[u8], mode: Option) -> Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + let tmp = path.with_extension("tmp"); + #[cfg(unix)] + { + use std::{ + io::Write, + os::unix::fs::{OpenOptionsExt, PermissionsExt}, + }; + + let mut opts = fs::OpenOptions::new(); + opts.write(true).create(true).truncate(true); + if let Some(m) = mode { + opts.mode(m); + } + + let mut f = opts.open(&tmp)?; + f.write_all(bytes)?; + + // Ensure mode even if the tmp file already existed. + if let Some(m) = mode { + fs::set_permissions(&tmp, fs::Permissions::from_mode(m))?; + } + } + #[cfg(not(unix))] + { + fs::write(&tmp, bytes)?; + } + fs::rename(tmp, path)?; + #[cfg(unix)] + { + if let Some(m) = mode { + use std::os::unix::fs::PermissionsExt; + fs::set_permissions(path, fs::Permissions::from_mode(m))?; + } + } + Ok(()) +} + +fn ensure_load_reth_p2p_key(private_dir: &Path, node_id: &str) -> Result<(PathBuf, String)> { + let key_path = private_dir.join("load-reth").join("p2p-keys").join(format!("{node_id}.key")); + + let secret = if key_path.try_exists()? 
{ + let s = fs::read_to_string(&key_path)?; + let hex_str = s.trim().trim_start_matches("0x"); + let bytes = hex::decode(hex_str)?; + SecretKey::from_slice(&bytes).map_err(|_| eyre!("invalid p2p key bytes"))? + } else { + let secret = SecretKey::random(&mut rand::thread_rng()); + let bytes = secret.to_bytes(); + write_atomic_with_mode(&key_path, hex::encode(bytes).as_bytes(), Some(0o600))?; + secret + }; + + // Tighten permissions if the key already existed with unsafe mode. + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + fs::set_permissions(&key_path, fs::Permissions::from_mode(0o600))?; + } + + let public = secret.public_key(); + let encoded = public.to_encoded_point(false); + let pub_bytes = encoded.as_bytes(); + if pub_bytes.first().copied() != Some(0x04) { + bail!("expected uncompressed pubkey"); + } + let enode_pub_hex = hex::encode(&pub_bytes[1..]); + Ok((key_path, enode_pub_hex)) +} + +fn write_env_file(path: &Path, entries: &[(&str, String)], mode: Option) -> Result<()> { + let mut out = String::new(); + for (k, v) in entries { + let escaped = v.replace('\\', "\\\\").replace('"', "\\\""); + out.push_str(k); + out.push_str("=\""); + out.push_str(&escaped); + out.push_str("\"\n"); + } + write_atomic_with_mode(path, out.as_bytes(), mode) +} + +fn render_inventory(lock: &Lockfile) -> Result { + // Render as a standard static YAML inventory (no dynamic `_meta`), so Ansible + // consistently applies per-host vars (ansible_host/ansible_user/loadnet_nodes). 
+ let mut hosts = serde_yaml::Mapping::new(); + + let mut by_host: BTreeMap<&str, Vec<&LockNode>> = BTreeMap::new(); + for n in &lock.nodes { + by_host.entry(n.host.as_str()).or_default().push(n); + } + + for h in &lock.hosts { + let mut hv = serde_yaml::Mapping::new(); + hv.insert( + serde_yaml::Value::String("ansible_host".into()), + serde_yaml::Value::String(h.public_ip.clone()), + ); + hv.insert( + serde_yaml::Value::String("ansible_user".into()), + serde_yaml::Value::String(h.ssh_user.clone()), + ); + + let mut nodes: Vec = + by_host.get(h.id.as_str()).into_iter().flatten().map(|n| n.id.clone()).collect(); + nodes.sort(); + hv.insert( + serde_yaml::Value::String("loadnet_nodes".into()), + serde_yaml::Value::Sequence(nodes.into_iter().map(serde_yaml::Value::String).collect()), + ); + + hosts.insert(serde_yaml::Value::String(h.id.clone()), serde_yaml::Value::Mapping(hv)); + } + + let mut all = serde_yaml::Mapping::new(); + all.insert(serde_yaml::Value::String("hosts".into()), serde_yaml::Value::Mapping(hosts)); + all.insert( + serde_yaml::Value::String("vars".into()), + serde_yaml::Value::Mapping(Default::default()), + ); + all.insert( + serde_yaml::Value::String("children".into()), + serde_yaml::Value::Mapping(Default::default()), + ); + + let mut root = serde_yaml::Mapping::new(); + root.insert(serde_yaml::Value::String("all".into()), serde_yaml::Value::Mapping(all)); + + Ok(serde_yaml::to_string(&serde_yaml::Value::Mapping(root))?) 
+} + +fn generate( + manifest_path: &Path, + out_dir: &Path, + secrets_path: Option<&Path>, + allow_missing_archiver_tokens: bool, + allow_unsafe: bool, +) -> Result<()> { + let manifest: Manifest = read_yaml(manifest_path)?; + let allow_unsafe_effective = allow_unsafe || manifest.validation.allow_unsafe_failure_domains; + validate_manifest(&manifest, allow_unsafe_effective)?; + + let secrets = if let Some(p) = secrets_path { + let s = read_secrets(p)?; + if s.schema_version != 1 { + bail!("secrets schema_version must be 1"); + } + Some(s) + } else { + None + }; + + // Critical invariant: validators require archiver bearer tokens (Ultramarine fails fast). + if !allow_missing_archiver_tokens { + let mut missing: Vec = Vec::new(); + for n in &manifest.nodes { + if n.role != "validator" { + continue; + } + let tok = secrets + .as_ref() + .and_then(|s| s.nodes.get(&n.id)) + .map(|ns| ns.archiver_bearer_token.trim()) + .unwrap_or(""); + if tok.is_empty() { + missing.push(n.id.clone()); + } + } + if !missing.is_empty() { + bail!( + "missing archiver bearer tokens for validator nodes: {:?}. Provide --secrets-file or pass --allow-missing-archiver-tokens (unsafe).", + missing + ); + } + } + + let manifest_sha = sha256_file(manifest_path)?; + + let public_dir = out_dir.join("bundle").join("public"); + let private_dir = out_dir.join("bundle").join("private"); + let env_dir = private_dir.join("env"); + let monitoring_secret_dir = private_dir.join("monitoring"); + let ultra_homes_dir = private_dir.join("ultramarine").join("homes"); + + // Public artifact: EL genesis. + let genesis = if let Some(exe) = &manifest.execution { + let pairs = exe + .alloc + .iter() + .map(|a| (a.address.clone(), a.balance_wei.clone())) + .collect::>(); + ultramarine_genesis::build_genesis_from_alloc_strings(manifest.network.chain_id, pairs) + .map_err(|e| eyre!("failed to build execution genesis from manifest: {e}"))? + } else { + ultramarine_genesis::build_dev_genesis(manifest.network.chain_id)? 
+ }; + let genesis_path = public_dir.join("genesis.json"); + ultramarine_genesis::write_genesis(&genesis_path, &genesis)?; + let genesis_sha = sha256_file(&genesis_path)?; + + // Generate (or reuse) per-node signing keys and build the CL genesis validator set. + // + // Note: all nodes (including non-validators) need a `priv_validator_key.json` today because + // Ultramarine derives its libp2p identity from this key. + let mut nodes_by_id: Vec<&Node> = manifest.nodes.iter().collect(); + nodes_by_id.sort_by(|a, b| a.id.cmp(&b.id)); + + let mut genesis_validators: Vec<Validator> = Vec::new(); + for n in &nodes_by_id { + let key_path = ultra_homes_dir.join(&n.id).join("config").join("priv_validator_key.json"); + let pk: ConsensusPrivateKey = if key_path.try_exists()? { + serde_json::from_str(&fs::read_to_string(&key_path)?)? + } else { + let pk = ConsensusPrivateKey::generate(rand::rngs::OsRng); + write_atomic_with_mode( + &key_path, + serde_json::to_string_pretty(&pk)?.as_bytes(), + Some(0o600), + )?; + pk + }; + if n.role == "validator" { + genesis_validators.push(Validator::new(pk.public_key(), 1u64)); + } + } + let consensus_genesis = + ConsensusGenesis { validator_set: ValidatorSet::new(genesis_validators) }; + + // Emit Ultramarine node homes (config/config.toml + config/genesis.json).
+ for n in &manifest.nodes { + let ports = ports_for_node(&manifest, n)?; + let node_home = ultra_homes_dir.join(&n.id); + let config_dir = node_home.join("config"); + + // genesis.json + write_atomic( + &config_dir.join("genesis.json"), + serde_json::to_string_pretty(&consensus_genesis)?.as_bytes(), + )?; + + // config.toml (Ultramarine CLI config wrapper) + let mut cfg = + ultramarine_cli::config::Config { moniker: n.id.clone(), ..Default::default() }; + cfg.metrics.enabled = true; + cfg.metrics.listen_addr = + format!("{}:{}", manifest.exposure.metrics_bind, ports.cl_metrics) + .parse() + .map_err(|e| eyre!("invalid metrics listen addr: {e}"))?; + + let transport = ultramarine_cli::config::TransportProtocol::Tcp; + cfg.consensus.p2p.listen_addr = transport.multiaddr("0.0.0.0", ports.cl_p2p as usize); + cfg.mempool.p2p.listen_addr = transport.multiaddr("0.0.0.0", ports.cl_mempool as usize); + + // Deterministic persistent peers (dialable public IPs). + let mut consensus_peers = Vec::new(); + let mut mempool_peers = Vec::new(); + for other in &manifest.nodes { + if other.id == n.id { + continue; + } + let other_ports = ports_for_node(&manifest, other)?; + let other_ip = host_ip(&manifest, &other.host)?; + consensus_peers.push(transport.multiaddr(&other_ip, other_ports.cl_p2p as usize)); + mempool_peers.push(transport.multiaddr(&other_ip, other_ports.cl_mempool as usize)); + } + consensus_peers.sort(); + mempool_peers.sort(); + cfg.consensus.p2p.persistent_peers = consensus_peers; + cfg.mempool.p2p.persistent_peers = mempool_peers; + + // Multi-host requires ValueSync. + cfg.sync.enabled = true; + + // Apply global sync tuning (applies to ALL nodes: validators + fullnodes). 
+ if let Some(parallel_requests) = manifest.sync.parallel_requests { + cfg.sync.parallel_requests = parallel_requests; + } + if let Some(ref request_timeout) = manifest.sync.request_timeout { + if let Ok(dur) = humantime::parse_duration(request_timeout) { + cfg.sync.request_timeout = dur; + } else { + eprintln!( + "warning: invalid sync.request_timeout '{request_timeout}', using default" + ); + } + } + if let Some(ref max_request_size) = manifest.sync.max_request_size { + if let Ok(size) = max_request_size.parse::<bytesize::ByteSize>() { + cfg.sync.max_request_size = bytesize::ByteSize::b(size.as_u64()); + } else { + eprintln!( + "warning: invalid sync.max_request_size '{max_request_size}', using default" + ); + } + } + if let Some(ref max_response_size) = manifest.sync.max_response_size { + if let Ok(size) = max_response_size.parse::<bytesize::ByteSize>() { + cfg.sync.max_response_size = bytesize::ByteSize::b(size.as_u64()); + } else { + eprintln!( + "warning: invalid sync.max_response_size '{max_response_size}', using default" + ); + } + } + if let Some(batch_size) = manifest.sync.batch_size { + cfg.sync.batch_size = batch_size; + } + + // Apply P2P message size limits (for handling large blocks during load tests).
+ if let Some(ref pubsub_max_size) = manifest.p2p.pubsub_max_size { + if let Ok(size) = pubsub_max_size.parse::<bytesize::ByteSize>() { + cfg.consensus.p2p.pubsub_max_size = bytesize::ByteSize::b(size.as_u64()); + cfg.mempool.p2p.pubsub_max_size = bytesize::ByteSize::b(size.as_u64()); + } else { + eprintln!( + "warning: invalid p2p.pubsub_max_size '{pubsub_max_size}', using default" + ); + } + } + if let Some(ref rpc_max_size) = manifest.p2p.rpc_max_size { + if let Ok(size) = rpc_max_size.parse::<bytesize::ByteSize>() { + cfg.consensus.p2p.rpc_max_size = bytesize::ByteSize::b(size.as_u64()); + cfg.mempool.p2p.rpc_max_size = bytesize::ByteSize::b(size.as_u64()); + } else { + eprintln!("warning: invalid p2p.rpc_max_size '{rpc_max_size}', using default"); + } + } + + // Apply fullnode-specific sync tuning for non-validator nodes (overrides global). + if n.role != "validator" && + let Some(fullnode_sync) = &manifest.sync.fullnode + { + if let Some(parallel_requests) = fullnode_sync.parallel_requests { + cfg.sync.parallel_requests = parallel_requests; + } + if let Some(ref request_timeout) = fullnode_sync.request_timeout { + if let Ok(dur) = humantime::parse_duration(request_timeout) { + cfg.sync.request_timeout = dur; + } else { + eprintln!( + "warning: invalid fullnode request_timeout '{request_timeout}', using default" + ); + } + } + if let Some(ref max_request_size) = fullnode_sync.max_request_size { + if let Ok(size) = max_request_size.parse::<bytesize::ByteSize>() { + cfg.sync.max_request_size = bytesize::ByteSize::b(size.as_u64()); + } else { + eprintln!( + "warning: invalid fullnode max_request_size '{max_request_size}', using default" + ); + } + } + if let Some(ref max_response_size) = fullnode_sync.max_response_size { + if let Ok(size) = max_response_size.parse::<bytesize::ByteSize>() { + cfg.sync.max_response_size = bytesize::ByteSize::b(size.as_u64()); + } else { + eprintln!( + "warning: invalid fullnode max_response_size '{max_response_size}', using default" + ); + } + } + if let Some(batch_size) = fullnode_sync.batch_size { +
cfg.sync.batch_size = batch_size; + } + } + + // Archiver baseline config (token is supplied via env for validators). + cfg.archiver.enabled = n.role == "validator"; + cfg.archiver.provider_url = manifest.archiver.provider_url.clone(); + cfg.archiver.provider_id = manifest.archiver.provider_id.clone(); + + write_atomic(&config_dir.join("config.toml"), toml::to_string_pretty(&cfg)?.as_bytes())?; + } + + let mut hosts: Vec<LockHost> = manifest + .hosts + .iter() + .map(|h| LockHost { + id: h.id.clone(), + public_ip: h.public_ip.clone(), + ssh_user: h.ssh_user.clone(), + }) + .collect(); + hosts.sort_by(|a, b| a.id.cmp(&b.id)); + + let mut nodes_sorted: Vec<&Node> = manifest.nodes.iter().collect(); + nodes_sorted.sort_by(|a, b| a.id.cmp(&b.id)); + + let mut lock_nodes: Vec<LockNode> = Vec::new(); + for n in nodes_sorted { + let ports = ports_for_node(&manifest, n)?; + let ipc_path = engine_ipc_path(&manifest, &n.id); + + let (key_path, enode_pub_hex) = ensure_load_reth_p2p_key(&private_dir, &n.id)?; + let enode = + format!("enode://{enode_pub_hex}@{}:{}", host_ip(&manifest, &n.host)?, ports.el_p2p); + + let p2p_key_rel = + key_path.strip_prefix(out_dir).unwrap_or(&key_path).to_string_lossy().to_string(); + + lock_nodes.push(LockNode { + id: n.id.clone(), + host: n.host.clone(), + role: n.role.clone(), + images: ImagesOut { + ultramarine: manifest.images.ultramarine.clone(), + load_reth: manifest.images.load_reth.clone(), + }, + engine: EngineOut { mode: "ipc", ipc_path }, + ports, + load_reth: LoadRethOut { p2p_key_path: p2p_key_rel, enode, bootnodes: vec![] }, + archiver: if n.role == "validator" { + Some(ArchiverOut { + enabled: true, + provider_url: manifest.archiver.provider_url.clone(), + provider_id: manifest.archiver.provider_id.clone(), + bearer_token_present: secrets.is_some(), + }) + } else { + None + }, + }); + } + + // Compute bootnodes (all enodes except self).
+ let all_enodes: Vec<String> = lock_nodes.iter().map(|n| n.load_reth.enode.clone()).collect(); + for n in &mut lock_nodes { + let mut bootnodes: Vec<String> = + all_enodes.iter().filter(|e| *e != &n.load_reth.enode).cloned().collect(); + if bootnodes.is_empty() { + // Avoid passing an empty --bootnodes flag on single-node networks. + bootnodes.push(n.load_reth.enode.clone()); + } + n.load_reth.bootnodes = bootnodes; + } + + let lock = Lockfile { + schema_version: 1, + tool: ToolInfo { name: "netgen", version: env!("CARGO_PKG_VERSION") }, + network: LockNetwork { + name: manifest.network.name.clone(), + chain_id: manifest.network.chain_id, + }, + inputs: Inputs { + manifest_path: manifest_path.display().to_string(), + manifest_sha256: manifest_sha, + }, + policy: Policy { + engine: "ipc-only", + sync_enabled: true, + metrics_bind: manifest.exposure.metrics_bind.clone(), + unsafe_failure_domains_allowed: allow_unsafe_effective, + }, + hosts, + nodes: lock_nodes, + blockscout: manifest.blockscout.as_ref().map(|b| BlockscoutOut { + enabled: b.enabled, + host: b.host.clone(), + rpc_node: b.rpc_node.clone(), + domains: b.domains.clone(), + ssl: b.ssl.clone(), + }), + artifacts: Artifacts { + public: PublicArtifacts { + genesis_json: ArtifactRef { + path: genesis_path + .strip_prefix(out_dir) + .unwrap_or(&genesis_path) + .display() + .to_string(), + sha256: genesis_sha, + }, + }, + }, + }; + + // Write lockfile and public network JSON. + let lock_path = out_dir.join("network.lock.json"); + write_atomic(&lock_path, serde_json::to_string_pretty(&lock)?.as_bytes())?; + let network_json_path = public_dir.join("network.json"); + let mut host_ip_by_id: BTreeMap<&str, &str> = BTreeMap::new(); + for h in &lock.hosts { + host_ip_by_id.insert(h.id.as_str(), h.public_ip.as_str()); + } + let mut public_nodes: Vec<NetworkNode> = Vec::new(); + for n in &lock.nodes { + let public_ip = host_ip_by_id + .get(n.host.as_str()) + .ok_or_else(|| eyre!("lockfile missing host entry: {}", n.host))?
+ .to_string(); + public_nodes.push(NetworkNode { + id: n.id.clone(), + role: n.role.clone(), + host: n.host.clone(), + public_ip, + ports: n.ports.clone(), + load_reth: NetworkLoadReth { + enode: n.load_reth.enode.clone(), + bootnodes: n.load_reth.bootnodes.clone(), + }, + archiver: n.archiver.as_ref().map(|a| NetworkArchiver { + enabled: a.enabled, + provider_url: a.provider_url.clone(), + provider_id: a.provider_id.clone(), + }), + }); + } + let public_network = NetworkJson { + schema_version: 1, + tool: lock.tool.clone(), + network: lock.network.clone(), + nodes: public_nodes, + artifacts: lock.artifacts.clone(), + }; + write_atomic(&network_json_path, serde_json::to_string_pretty(&public_network)?.as_bytes())?; + + // Inventory. + let inv = render_inventory(&lock)?; + write_atomic(&out_dir.join("inventory.yml"), inv.as_bytes())?; + + // Per-node runtime env files (consumed by systemd/Ansible). + if let Some(s) = &secrets && + let Some(pw) = + s.grafana_admin_password.as_ref().map(|v| v.trim()).filter(|v| !v.is_empty()) + { + let secret_path = monitoring_secret_dir.join("grafana_admin_password.env"); + write_env_file(&secret_path, &[("GRAFANA_ADMIN_PASSWORD", pw.to_string())], Some(0o600))?; + } + + for node in &lock.nodes { + let ultramarine_env_path = env_dir.join(format!("ultramarine-{}.env", node.id)); + let load_reth_env_path = env_dir.join(format!("load-reth-{}.env", node.id)); + + let mut ultra_entries: Vec<(&str, String)> = Vec::new(); + ultra_entries.push(("ULTRAMARINE_NODE_ID", node.id.clone())); + ultra_entries.push(( + "ULTRAMARINE_HOME_DIR", + format!("bundle/private/ultramarine/homes/{}", node.id), + )); + ultra_entries.push(("ULTRAMARINE_ENGINE_IPC_PATH", node.engine.ipc_path.clone())); + ultra_entries + .push(("ULTRAMARINE_ETH1_RPC_URL", format!("http://127.0.0.1:{}", node.ports.el_http))); + ultra_entries.push(("ULTRAMARINE_METRICS_BIND", manifest.exposure.metrics_bind.clone())); + ultra_entries.push(("ULTRAMARINE_CL_P2P_PORT", 
node.ports.cl_p2p.to_string())); + ultra_entries.push(("ULTRAMARINE_CL_MEMPOOL_PORT", node.ports.cl_mempool.to_string())); + ultra_entries.push(("ULTRAMARINE_CL_METRICS_PORT", node.ports.cl_metrics.to_string())); + ultra_entries.push(("ULTRAMARINE_IMAGE", node.images.ultramarine.clone())); + ultra_entries.push(("ULTRAMARINE_UID", "10002".to_string())); + ultra_entries.push(("ULTRAMARINE_GID", "10002".to_string())); + ultra_entries.push(("RUST_LOG", "info".to_string())); + + if node.role == "validator" { + ultra_entries.push(("ULTRAMARINE_ARCHIVER_ENABLED", "true".to_string())); + ultra_entries.push(( + "ULTRAMARINE_ARCHIVER_PROVIDER_URL", + manifest.archiver.provider_url.clone(), + )); + ultra_entries + .push(("ULTRAMARINE_ARCHIVER_PROVIDER_ID", manifest.archiver.provider_id.clone())); + + if let Some(s) = &secrets { + let tok = s + .nodes + .get(&node.id) + .ok_or_else(|| eyre!("missing secrets.nodes.{}", node.id))? + .archiver_bearer_token + .trim() + .to_string(); + if tok.is_empty() { + bail!("empty archiver token for node {}", node.id); + } + let secret_path = private_dir + .join("ultramarine") + .join("secrets") + .join(format!("{}.env", node.id)); + write_env_file( + &secret_path, + &[("ULTRAMARINE_ARCHIVER_BEARER_TOKEN", tok)], + Some(0o600), + )?; + } + } + write_env_file(&ultramarine_env_path, &ultra_entries, None)?; + + let reth_public_ip = host_ip(&manifest, &node.host)?; + let reth_entries: Vec<(&str, String)> = vec![ + ("LOAD_RETH_NODE_ID", node.id.clone()), + ("LOAD_RETH_IMAGE", node.images.load_reth.clone()), + ("LOAD_RETH_PUBLIC_IP", reth_public_ip), + ("LOAD_RETH_HTTP_PORT", node.ports.el_http.to_string()), + ("LOAD_RETH_AUTHRPC_PORT", node.ports.el_authrpc.to_string()), + ("LOAD_RETH_P2P_PORT", node.ports.el_p2p.to_string()), + ("LOAD_RETH_METRICS_PORT", node.ports.el_metrics.to_string()), + ("LOAD_RETH_ENGINE_IPC_PATH", node.engine.ipc_path.clone()), + ("LOAD_RETH_P2P_KEY_PATH", node.load_reth.p2p_key_path.clone()), + ("LOAD_RETH_BOOTNODES", 
node.load_reth.bootnodes.join(",")), + ("LOAD_RETH_GENESIS_JSON", lock.artifacts.public.genesis_json.path.clone()), + ("LOAD_RETH_UID", "10001".to_string()), + ("LOAD_RETH_GID", "10001".to_string()), + ("RUST_LOG", "info".to_string()), + ]; + write_env_file(&load_reth_env_path, &reth_entries, None)?; + } + + println!("wrote {}", lock_path.display()); + Ok(()) +} + +fn host_ip(m: &Manifest, host_id: &str) -> Result<String> { + m.hosts + .iter() + .find(|h| h.id == host_id) + .map(|h| h.public_ip.clone()) + .ok_or_else(|| eyre!("unknown host id: {host_id}")) +} + +fn main() -> Result<()> { + color_eyre::install()?; + let cli = Cli::parse(); + match cli.cmd { + Cmd::Validate { manifest, allow_unsafe_failure_domains } => { + let m: Manifest = read_yaml(&manifest)?; + let allow_unsafe_effective = + allow_unsafe_failure_domains || m.validation.allow_unsafe_failure_domains; + validate_manifest(&m, allow_unsafe_effective)?; + println!("ok"); + Ok(()) + } + Cmd::Gen { + manifest, + out_dir, + secrets_file, + allow_missing_archiver_tokens, + allow_unsafe_failure_domains, + } => generate( + &manifest, + &out_dir, + secrets_file.as_deref(), + allow_missing_archiver_tokens, + allow_unsafe_failure_domains, + ), + } +} diff --git a/infra/manifests/.gitkeep b/infra/manifests/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/infra/manifests/.gitkeep @@ -0,0 +1 @@ + diff --git a/infra/manifests/example.yaml b/infra/manifests/example.yaml new file mode 100644 index 0000000..1614616 --- /dev/null +++ b/infra/manifests/example.yaml @@ -0,0 +1,70 @@ +schema_version: 1 + +network: + name: example + chain_id: 16383 + +# Container images pinned by tag (MVP). M4 can add optional digests. +images: + ultramarine: "ultramarine:local" + load_reth: "docker.io/loadnetwork/load-reth:v0.1.2" + +# Hosts are the failure domains. For public testnets, these should be public IPs/DNS.
+hosts: + - id: host-0 + public_ip: "203.0.113.10" + ssh_user: ubuntu + - id: host-1 + public_ip: "203.0.113.11" + ssh_user: ubuntu + - id: host-2 + public_ip: "203.0.113.12" + ssh_user: ubuntu + +# Nodes are CL+EL pairs and must be co-located (one node references one host). +nodes: + - id: node-0 + host: host-0 + role: validator + - id: node-1 + host: host-1 + role: validator + - id: node-2 + host: host-2 + role: validator + +# Engine API policy for deploys: IPC-only. +engine: + mode: ipc + ipc_path_template: "/run/load-reth/{node_id}/engine.ipc" + +# Port allocation is deterministic. "host-block" matches the devnet pattern: +# host index 0 => 8545/30303/9001, host index 1 => 9545/31303/10001, etc. +ports: + allocation: host-block + # Use 1000 to stay under 65535 for typical base ports across more hosts. + host_block_stride: 1000 + el: + http: 8545 + p2p: 30303 + metrics: 9001 + cl: + p2p: 27000 + mempool: 28000 + metrics: 29000 + +# ValueSync must be enabled for multi-host networks. +sync: + enabled: true + +# Validators require archiver. The generator enforces presence of provider+token. +archiver: + enabled: true + provider_url: "https://archiver.example.com" + provider_id: "example" + # Per-node bearer tokens come from decrypted secrets input (not stored here). + +# Metrics exposure policy: "localhost" is safest for public testnets without centralized Prometheus. +# Use SSH port-forwarding for debugging. +exposure: + metrics_bind: "127.0.0.1" diff --git a/infra/manifests/fibernet.yaml b/infra/manifests/fibernet.yaml new file mode 100644 index 0000000..80f8ba6 --- /dev/null +++ b/infra/manifests/fibernet.yaml @@ -0,0 +1,130 @@ +schema_version: 1 + +network: + name: fibernet + chain_id: 1984 + +# Execution genesis (alloc) for fibernet. +execution: + alloc: + # Treasury / operator address (50% of supply: 5_000_000 / 10_000_000). 
+ - address: "0x197f818c1313dc58b32d88078ecdfb40ea822614" + balance_wei: "5000000000000000000000000" + # Spam/blobs funding account (remaining supply: 50%). + - address: "0x2411c6e43299a09d2225d58e9e7977237b3e4c49" + balance_wei: "5000000000000000000000000" + +# Container images pinned by tag (MVP). M4 can add optional digests. +images: + ultramarine: "docker.io/loadnetwork/ultramarine:fibernet" + load_reth: "docker.io/loadnetwork/load-reth:fibernet" + +# Hosts are the failure domains. For public testnets, these should be public IPs/DNS. +hosts: + - id: f4-metal-medium-lon2-fibernet-1 + public_ip: "67.213.117.143" + ssh_user: ubuntu + - id: f4-metal-medium-ams-fibernet-2 + public_ip: "64.34.87.1" + ssh_user: ubuntu + - id: f4-metal-medium-fra2-fibernet-2 + public_ip: "67.213.121.57" + ssh_user: ubuntu + # Dedicated host for Blockscout explorer + full node (Xeon E-2286G, 32GB RAM, 1TB SSD) + - id: blockscout-fibernet + public_ip: "72.46.84.15" + ssh_user: ubuntu + +# 6 validators: 2 per host (3x2 topology) +nodes: + - id: node-0 + host: f4-metal-medium-lon2-fibernet-1 + role: validator + - id: node-1 + host: f4-metal-medium-lon2-fibernet-1 + role: validator + - id: node-2 + host: f4-metal-medium-ams-fibernet-2 + role: validator + - id: node-3 + host: f4-metal-medium-ams-fibernet-2 + role: validator + - id: node-4 + host: f4-metal-medium-fra2-fibernet-2 + role: validator + - id: node-5 + host: f4-metal-medium-fra2-fibernet-2 + role: validator + # Full node for Blockscout indexing and public RPC (non-validator) + - id: node-rpc + host: blockscout-fibernet + role: fullnode + +# Engine API policy for deploys: IPC-only. +engine: + mode: ipc + ipc_path_template: "/run/load-reth/{node_id}/engine.ipc" + +# Port allocation is deterministic. "host-block" is easiest to operate. 
+ports: + allocation: host-block + host_block_stride: 1000 + el: + http: 8545 + authrpc: 8551 + p2p: 30303 + metrics: 9001 + cl: + p2p: 27000 + mempool: 28000 + metrics: 29000 + +# ValueSync must be enabled for multi-host networks. +sync: + enabled: true + # Global sync tuning (applies to ALL nodes: validators + fullnodes). + # Blocks with 1024 blobs can be ~134 MB; default 10 MiB response limit is too small. + # Load tests can produce blocks up to 6 MB - increase all limits accordingly. + max_request_size: "50 MiB" + max_response_size: "500 MiB" + request_timeout: "60s" + parallel_requests: 100 + batch_size: 5 + # Fullnode-specific sync tuning (overrides global for non-validator nodes). + fullnode: + parallel_requests: 100 + request_timeout: "60s" + max_response_size: "500 MiB" + batch_size: 10 + +# P2P message size limits - must be large enough for big blocks. +p2p: + pubsub_max_size: "50 MiB" + rpc_max_size: "100 MiB" + +# Validators require archiver. The generator enforces presence of provider+token. +archiver: + enabled: true + provider_url: "https://archiver.example.com" + provider_id: "loads3" + +# Metrics exposure policy: "localhost" is safest for public testnets without centralized Prometheus. +exposure: + metrics_bind: "127.0.0.1" + +# Testnet override: allow >1 validator per host (unsafe for single-host failure tolerance). 
+validation: + allow_unsafe_failure_domains: true + +# Blockscout explorer configuration +blockscout: + enabled: true + host: blockscout-fibernet + rpc_node: node-rpc + domains: + explorer: fibernet.load.network + stats: stats.fibernet.load.network + rpc: rpc.fibernet.load.network + ssl: + enabled: true + email: admin@load.network diff --git a/infra/networks/.gitkeep b/infra/networks/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/infra/networks/.gitkeep @@ -0,0 +1 @@ + diff --git a/infra/networks/example/bundle/public/genesis.json b/infra/networks/example/bundle/public/genesis.json new file mode 100644 index 0000000..076f476 --- /dev/null +++ b/infra/networks/example/bundle/public/genesis.json @@ -0,0 +1,43 @@ +{ + "config": { + "chainId": 16383, + "homesteadBlock": 0, + "daoForkSupport": false, + "eip150Block": 0, + "eip155Block": 0, + "eip158Block": 0, + "byzantiumBlock": 0, + "constantinopleBlock": 0, + "petersburgBlock": 0, + "istanbulBlock": 0, + "berlinBlock": 0, + "londonBlock": 0, + "mergeNetsplitBlock": 0, + "shanghaiTime": 0, + "cancunTime": 0, + "pragueTime": 0, + "terminalTotalDifficulty": 0, + "terminalTotalDifficultyPassed": true + }, + "nonce": "0x0", + "timestamp": "0x0", + "extraData": "0x4c6f6164204e6574776f726b20446576", + "gasLimit": "0x77359400", + "difficulty": "0x0", + "mixHash": "0x0000000000000000000000000000000000000000000000000000000000000000", + "coinbase": "0x0000000000000000000000000000000000000000", + "alloc": { + "0x6512b55f1debdabeb01fe96d891971cc23e24ecb": { + "balance": "0x32d26d12e980b600000" + }, + "0x9858effd232b4033e47d90003d41ec34ecaeda94": { + "balance": "0x32d26d12e980b600000" + }, + "0xf39fd6e51aad88f6f4ce6ab8827279cfffb92266": { + "balance": "0x32d26d12e980b600000" + } + }, + "baseFeePerGas": "0x7", + "number": "0x0", + "parentHash": "0x0000000000000000000000000000000000000000000000000000000000000000" +} \ No newline at end of file diff --git a/infra/networks/example/bundle/public/network.json 
b/infra/networks/example/bundle/public/network.json new file mode 100644 index 0000000..1e27774 --- /dev/null +++ b/infra/networks/example/bundle/public/network.json @@ -0,0 +1,99 @@ +{ + "schema_version": 1, + "tool": { + "name": "netgen", + "version": "0.1.0" + }, + "network": { + "name": "example", + "chain_id": 16383 + }, + "nodes": [ + { + "id": "node-0", + "role": "validator", + "host": "host-0", + "public_ip": "203.0.113.10", + "ports": { + "el_http": 8545, + "el_p2p": 30303, + "el_metrics": 9001, + "cl_p2p": 27000, + "cl_mempool": 28000, + "cl_metrics": 29000 + }, + "load_reth": { + "enode": "enode://ba44456804b2e026978293895e4fa0d0e20b191affcf809a47de026950c68b77c4eec93ea94af1e5fde27a24f1fdc9fe2debc6f3cc536611a54e854b8b0ab070@203.0.113.10:30303", + "bootnodes": [ + "enode://cf823e78e1dc1c6845957149bdbf7ec564600c264716b2d50f6c64e23f516bd6dfbf191d611139ad848444f7d0ec01d44557ba6617647153ecc6ab45a6598586@203.0.113.11:31303", + "enode://5535659ef2bb6dd62a42a82431f22ca6770505f6422227af00c85d40ef752b8d0304e2a7b6a7c1aea333c001fc77dd2de9cf1827c4cb8998ebceb32412beecc4@203.0.113.12:32303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example" + } + }, + { + "id": "node-1", + "role": "validator", + "host": "host-1", + "public_ip": "203.0.113.11", + "ports": { + "el_http": 9545, + "el_p2p": 31303, + "el_metrics": 10001, + "cl_p2p": 28000, + "cl_mempool": 29000, + "cl_metrics": 30000 + }, + "load_reth": { + "enode": "enode://cf823e78e1dc1c6845957149bdbf7ec564600c264716b2d50f6c64e23f516bd6dfbf191d611139ad848444f7d0ec01d44557ba6617647153ecc6ab45a6598586@203.0.113.11:31303", + "bootnodes": [ + "enode://ba44456804b2e026978293895e4fa0d0e20b191affcf809a47de026950c68b77c4eec93ea94af1e5fde27a24f1fdc9fe2debc6f3cc536611a54e854b8b0ab070@203.0.113.10:30303", + "enode://5535659ef2bb6dd62a42a82431f22ca6770505f6422227af00c85d40ef752b8d0304e2a7b6a7c1aea333c001fc77dd2de9cf1827c4cb8998ebceb32412beecc4@203.0.113.12:32303" + 
] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example" + } + }, + { + "id": "node-2", + "role": "validator", + "host": "host-2", + "public_ip": "203.0.113.12", + "ports": { + "el_http": 10545, + "el_p2p": 32303, + "el_metrics": 11001, + "cl_p2p": 29000, + "cl_mempool": 30000, + "cl_metrics": 31000 + }, + "load_reth": { + "enode": "enode://5535659ef2bb6dd62a42a82431f22ca6770505f6422227af00c85d40ef752b8d0304e2a7b6a7c1aea333c001fc77dd2de9cf1827c4cb8998ebceb32412beecc4@203.0.113.12:32303", + "bootnodes": [ + "enode://ba44456804b2e026978293895e4fa0d0e20b191affcf809a47de026950c68b77c4eec93ea94af1e5fde27a24f1fdc9fe2debc6f3cc536611a54e854b8b0ab070@203.0.113.10:30303", + "enode://cf823e78e1dc1c6845957149bdbf7ec564600c264716b2d50f6c64e23f516bd6dfbf191d611139ad848444f7d0ec01d44557ba6617647153ecc6ab45a6598586@203.0.113.11:31303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example" + } + } + ], + "artifacts": { + "public": { + "genesis_json": { + "path": "bundle/public/genesis.json", + "sha256": "1643cbd6641602c85602871327a2e38924de95cabd64682e0d7d2b0080348fc7" + } + } + } +} \ No newline at end of file diff --git a/infra/networks/example/inventory.yml b/infra/networks/example/inventory.yml new file mode 100644 index 0000000..e413a38 --- /dev/null +++ b/infra/networks/example/inventory.yml @@ -0,0 +1,19 @@ +all: + hosts: + host-0: + ansible_host: 203.0.113.10 + ansible_user: ubuntu + loadnet_nodes: + - node-0 + host-1: + ansible_host: 203.0.113.11 + ansible_user: ubuntu + loadnet_nodes: + - node-1 + host-2: + ansible_host: 203.0.113.12 + ansible_user: ubuntu + loadnet_nodes: + - node-2 + vars: {} + children: {} diff --git a/infra/networks/example/network.lock.json b/infra/networks/example/network.lock.json new file mode 100644 index 0000000..8309583 --- /dev/null +++ b/infra/networks/example/network.lock.json @@ -0,0 +1,152 @@ +{ + 
"schema_version": 1, + "tool": { + "name": "netgen", + "version": "0.1.0" + }, + "network": { + "name": "example", + "chain_id": 16383 + }, + "inputs": { + "manifest_path": "infra/manifests/example.yaml", + "manifest_sha256": "d00e5d4bff1048541fe099c32d4e4950d6fcdea599b0ea5dcd022a2081989839" + }, + "policy": { + "engine": "ipc-only", + "sync_enabled": true, + "metrics_bind": "127.0.0.1" + }, + "hosts": [ + { + "id": "host-0", + "public_ip": "203.0.113.10", + "ssh_user": "ubuntu" + }, + { + "id": "host-1", + "public_ip": "203.0.113.11", + "ssh_user": "ubuntu" + }, + { + "id": "host-2", + "public_ip": "203.0.113.12", + "ssh_user": "ubuntu" + } + ], + "nodes": [ + { + "id": "node-0", + "host": "host-0", + "role": "validator", + "images": { + "ultramarine": "ultramarine:local", + "load_reth": "docker.io/loadnetwork/load-reth:v0.1.2" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-0/engine.ipc" + }, + "ports": { + "el_http": 8545, + "el_p2p": 30303, + "el_metrics": 9001, + "cl_p2p": 27000, + "cl_mempool": 28000, + "cl_metrics": 29000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-0.key", + "enode": "enode://ba44456804b2e026978293895e4fa0d0e20b191affcf809a47de026950c68b77c4eec93ea94af1e5fde27a24f1fdc9fe2debc6f3cc536611a54e854b8b0ab070@203.0.113.10:30303", + "bootnodes": [ + "enode://cf823e78e1dc1c6845957149bdbf7ec564600c264716b2d50f6c64e23f516bd6dfbf191d611139ad848444f7d0ec01d44557ba6617647153ecc6ab45a6598586@203.0.113.11:31303", + "enode://5535659ef2bb6dd62a42a82431f22ca6770505f6422227af00c85d40ef752b8d0304e2a7b6a7c1aea333c001fc77dd2de9cf1827c4cb8998ebceb32412beecc4@203.0.113.12:32303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example", + "bearer_token_present": true + } + }, + { + "id": "node-1", + "host": "host-1", + "role": "validator", + "images": { + "ultramarine": "ultramarine:local", + "load_reth": "docker.io/loadnetwork/load-reth:v0.1.2" 
+ }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-1/engine.ipc" + }, + "ports": { + "el_http": 9545, + "el_p2p": 31303, + "el_metrics": 10001, + "cl_p2p": 28000, + "cl_mempool": 29000, + "cl_metrics": 30000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-1.key", + "enode": "enode://cf823e78e1dc1c6845957149bdbf7ec564600c264716b2d50f6c64e23f516bd6dfbf191d611139ad848444f7d0ec01d44557ba6617647153ecc6ab45a6598586@203.0.113.11:31303", + "bootnodes": [ + "enode://ba44456804b2e026978293895e4fa0d0e20b191affcf809a47de026950c68b77c4eec93ea94af1e5fde27a24f1fdc9fe2debc6f3cc536611a54e854b8b0ab070@203.0.113.10:30303", + "enode://5535659ef2bb6dd62a42a82431f22ca6770505f6422227af00c85d40ef752b8d0304e2a7b6a7c1aea333c001fc77dd2de9cf1827c4cb8998ebceb32412beecc4@203.0.113.12:32303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example", + "bearer_token_present": true + } + }, + { + "id": "node-2", + "host": "host-2", + "role": "validator", + "images": { + "ultramarine": "ultramarine:local", + "load_reth": "docker.io/loadnetwork/load-reth:v0.1.2" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-2/engine.ipc" + }, + "ports": { + "el_http": 10545, + "el_p2p": 32303, + "el_metrics": 11001, + "cl_p2p": 29000, + "cl_mempool": 30000, + "cl_metrics": 31000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-2.key", + "enode": "enode://5535659ef2bb6dd62a42a82431f22ca6770505f6422227af00c85d40ef752b8d0304e2a7b6a7c1aea333c001fc77dd2de9cf1827c4cb8998ebceb32412beecc4@203.0.113.12:32303", + "bootnodes": [ + "enode://ba44456804b2e026978293895e4fa0d0e20b191affcf809a47de026950c68b77c4eec93ea94af1e5fde27a24f1fdc9fe2debc6f3cc536611a54e854b8b0ab070@203.0.113.10:30303", + "enode://cf823e78e1dc1c6845957149bdbf7ec564600c264716b2d50f6c64e23f516bd6dfbf191d611139ad848444f7d0ec01d44557ba6617647153ecc6ab45a6598586@203.0.113.11:31303" + ] + }, + 
"archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example", + "bearer_token_present": true + } + } + ], + "artifacts": { + "public": { + "genesis_json": { + "path": "bundle/public/genesis.json", + "sha256": "1643cbd6641602c85602871327a2e38924de95cabd64682e0d7d2b0080348fc7" + } + } + } +} \ No newline at end of file diff --git a/infra/networks/example_tmp.err b/infra/networks/example_tmp.err new file mode 100644 index 0000000..9e12624 --- /dev/null +++ b/infra/networks/example_tmp.err @@ -0,0 +1,9 @@ +warning: validator placement cannot be resilient to losing any single host with n=3 equal-weight validators (survival would require validators_per_host<=0). offenders={"host-0": 1, "host-1": 1, "host-2": 1} +Error: + 0: missing archiver bearer tokens for validator nodes: ["node-0", "node-1", "node-2"]. Provide --secrets-file or pass --allow-missing-archiver-tokens (unsafe). + +Location: + infra/gen/netgen/src/main.rs:571 + +Backtrace omitted. Run with RUST_BACKTRACE=1 environment variable to display it. +Run with RUST_BACKTRACE=full to include source snippets. 
diff --git a/infra/networks/example_tmp/bundle/public/genesis.json b/infra/networks/example_tmp/bundle/public/genesis.json new file mode 100644 index 0000000..076f476 --- /dev/null +++ b/infra/networks/example_tmp/bundle/public/genesis.json @@ -0,0 +1,43 @@ +{ + "config": { + "chainId": 16383, + "homesteadBlock": 0, + "daoForkSupport": false, + "eip150Block": 0, + "eip155Block": 0, + "eip158Block": 0, + "byzantiumBlock": 0, + "constantinopleBlock": 0, + "petersburgBlock": 0, + "istanbulBlock": 0, + "berlinBlock": 0, + "londonBlock": 0, + "mergeNetsplitBlock": 0, + "shanghaiTime": 0, + "cancunTime": 0, + "pragueTime": 0, + "terminalTotalDifficulty": 0, + "terminalTotalDifficultyPassed": true + }, + "nonce": "0x0", + "timestamp": "0x0", + "extraData": "0x4c6f6164204e6574776f726b20446576", + "gasLimit": "0x77359400", + "difficulty": "0x0", + "mixHash": "0x0000000000000000000000000000000000000000000000000000000000000000", + "coinbase": "0x0000000000000000000000000000000000000000", + "alloc": { + "0x6512b55f1debdabeb01fe96d891971cc23e24ecb": { + "balance": "0x32d26d12e980b600000" + }, + "0x9858effd232b4033e47d90003d41ec34ecaeda94": { + "balance": "0x32d26d12e980b600000" + }, + "0xf39fd6e51aad88f6f4ce6ab8827279cfffb92266": { + "balance": "0x32d26d12e980b600000" + } + }, + "baseFeePerGas": "0x7", + "number": "0x0", + "parentHash": "0x0000000000000000000000000000000000000000000000000000000000000000" +} \ No newline at end of file diff --git a/infra/networks/example_tmp/bundle/public/network.json b/infra/networks/example_tmp/bundle/public/network.json new file mode 100644 index 0000000..c22f2f5 --- /dev/null +++ b/infra/networks/example_tmp/bundle/public/network.json @@ -0,0 +1,149 @@ +{ + "schema_version": 1, + "tool": { + "name": "netgen", + "version": "0.1.0" + }, + "network": { + "name": "example", + "chain_id": 16383 + }, + "inputs": { + "manifest_path": "infra/manifests/example.yaml", + "manifest_sha256": 
"857b1028b21194e4e75ab95bfaf6bf5aa6591c6719523826b96f1e95ff0bc681" + }, + "policy": { + "engine": "ipc-only", + "sync_enabled": true, + "metrics_bind": "127.0.0.1" + }, + "hosts": [ + { + "id": "host-0", + "public_ip": "203.0.113.10", + "ssh_user": "ubuntu" + }, + { + "id": "host-1", + "public_ip": "203.0.113.11", + "ssh_user": "ubuntu" + }, + { + "id": "host-2", + "public_ip": "203.0.113.12", + "ssh_user": "ubuntu" + } + ], + "nodes": [ + { + "id": "node-0", + "host": "host-0", + "role": "validator", + "images": { + "ultramarine": "ultramarine:local", + "load_reth": "docker.io/loadnetwork/load-reth:v0.1.2" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-0/engine.ipc" + }, + "ports": { + "el_http": 8545, + "el_p2p": 30303, + "el_metrics": 9001, + "cl_p2p": 27000, + "cl_metrics": 29000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-0.key", + "enode": "enode://6b49a6676952956d1995b8d4e3d006a9f46941a5a869da8060d907691f77649d6f48337a77ccbb03ad953344886bad3d8e5bdd4b574768ce748ba2f6db53ac63@203.0.113.10:30303", + "bootnodes": [ + "enode://7226cdabd7cbe15fe29655cde6dd5d4ca1b0c40fecaf7a4fe54e6b6169238efb6f1100e5f0e1a746bdfd4972bdfef21627ec056756a6ccfb178b38b130662226@203.0.113.11:31303", + "enode://79a053618e831d246ca210909a7fb089b06c08d6ca598810619d24aebf0d2655ef9ad8273ac79f84c10c5c424ed7890f077351938cc9b66c5937d1342b67c637@203.0.113.12:32303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example", + "bearer_token_present": false + } + }, + { + "id": "node-1", + "host": "host-1", + "role": "validator", + "images": { + "ultramarine": "ultramarine:local", + "load_reth": "docker.io/loadnetwork/load-reth:v0.1.2" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-1/engine.ipc" + }, + "ports": { + "el_http": 9545, + "el_p2p": 31303, + "el_metrics": 10001, + "cl_p2p": 28000, + "cl_metrics": 30000 + }, + "load_reth": { + "p2p_key_path": 
"bundle/private/load-reth/p2p-keys/node-1.key", + "enode": "enode://7226cdabd7cbe15fe29655cde6dd5d4ca1b0c40fecaf7a4fe54e6b6169238efb6f1100e5f0e1a746bdfd4972bdfef21627ec056756a6ccfb178b38b130662226@203.0.113.11:31303", + "bootnodes": [ + "enode://6b49a6676952956d1995b8d4e3d006a9f46941a5a869da8060d907691f77649d6f48337a77ccbb03ad953344886bad3d8e5bdd4b574768ce748ba2f6db53ac63@203.0.113.10:30303", + "enode://79a053618e831d246ca210909a7fb089b06c08d6ca598810619d24aebf0d2655ef9ad8273ac79f84c10c5c424ed7890f077351938cc9b66c5937d1342b67c637@203.0.113.12:32303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example", + "bearer_token_present": false + } + }, + { + "id": "node-2", + "host": "host-2", + "role": "validator", + "images": { + "ultramarine": "ultramarine:local", + "load_reth": "docker.io/loadnetwork/load-reth:v0.1.2" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-2/engine.ipc" + }, + "ports": { + "el_http": 10545, + "el_p2p": 32303, + "el_metrics": 11001, + "cl_p2p": 29000, + "cl_metrics": 31000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-2.key", + "enode": "enode://79a053618e831d246ca210909a7fb089b06c08d6ca598810619d24aebf0d2655ef9ad8273ac79f84c10c5c424ed7890f077351938cc9b66c5937d1342b67c637@203.0.113.12:32303", + "bootnodes": [ + "enode://6b49a6676952956d1995b8d4e3d006a9f46941a5a869da8060d907691f77649d6f48337a77ccbb03ad953344886bad3d8e5bdd4b574768ce748ba2f6db53ac63@203.0.113.10:30303", + "enode://7226cdabd7cbe15fe29655cde6dd5d4ca1b0c40fecaf7a4fe54e6b6169238efb6f1100e5f0e1a746bdfd4972bdfef21627ec056756a6ccfb178b38b130662226@203.0.113.11:31303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example", + "bearer_token_present": false + } + } + ], + "artifacts": { + "public": { + "genesis_json": { + "path": "bundle/public/genesis.json", + "sha256": 
"1643cbd6641602c85602871327a2e38924de95cabd64682e0d7d2b0080348fc7" + } + } + } +} \ No newline at end of file diff --git a/infra/networks/example_tmp/inventory.yml b/infra/networks/example_tmp/inventory.yml new file mode 100644 index 0000000..d9347e9 --- /dev/null +++ b/infra/networks/example_tmp/inventory.yml @@ -0,0 +1,24 @@ +all: + hosts: + host-0: {} + host-1: {} + host-2: {} + vars: {} + children: {} +_meta: + hostvars: + host-0: + ansible_host: 203.0.113.10 + ansible_user: ubuntu + loadnet_nodes: + - node-0 + host-1: + ansible_host: 203.0.113.11 + ansible_user: ubuntu + loadnet_nodes: + - node-1 + host-2: + ansible_host: 203.0.113.12 + ansible_user: ubuntu + loadnet_nodes: + - node-2 diff --git a/infra/networks/example_tmp/network.lock.json b/infra/networks/example_tmp/network.lock.json new file mode 100644 index 0000000..c22f2f5 --- /dev/null +++ b/infra/networks/example_tmp/network.lock.json @@ -0,0 +1,149 @@ +{ + "schema_version": 1, + "tool": { + "name": "netgen", + "version": "0.1.0" + }, + "network": { + "name": "example", + "chain_id": 16383 + }, + "inputs": { + "manifest_path": "infra/manifests/example.yaml", + "manifest_sha256": "857b1028b21194e4e75ab95bfaf6bf5aa6591c6719523826b96f1e95ff0bc681" + }, + "policy": { + "engine": "ipc-only", + "sync_enabled": true, + "metrics_bind": "127.0.0.1" + }, + "hosts": [ + { + "id": "host-0", + "public_ip": "203.0.113.10", + "ssh_user": "ubuntu" + }, + { + "id": "host-1", + "public_ip": "203.0.113.11", + "ssh_user": "ubuntu" + }, + { + "id": "host-2", + "public_ip": "203.0.113.12", + "ssh_user": "ubuntu" + } + ], + "nodes": [ + { + "id": "node-0", + "host": "host-0", + "role": "validator", + "images": { + "ultramarine": "ultramarine:local", + "load_reth": "docker.io/loadnetwork/load-reth:v0.1.2" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-0/engine.ipc" + }, + "ports": { + "el_http": 8545, + "el_p2p": 30303, + "el_metrics": 9001, + "cl_p2p": 27000, + "cl_metrics": 29000 + }, + 
"load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-0.key", + "enode": "enode://6b49a6676952956d1995b8d4e3d006a9f46941a5a869da8060d907691f77649d6f48337a77ccbb03ad953344886bad3d8e5bdd4b574768ce748ba2f6db53ac63@203.0.113.10:30303", + "bootnodes": [ + "enode://7226cdabd7cbe15fe29655cde6dd5d4ca1b0c40fecaf7a4fe54e6b6169238efb6f1100e5f0e1a746bdfd4972bdfef21627ec056756a6ccfb178b38b130662226@203.0.113.11:31303", + "enode://79a053618e831d246ca210909a7fb089b06c08d6ca598810619d24aebf0d2655ef9ad8273ac79f84c10c5c424ed7890f077351938cc9b66c5937d1342b67c637@203.0.113.12:32303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example", + "bearer_token_present": false + } + }, + { + "id": "node-1", + "host": "host-1", + "role": "validator", + "images": { + "ultramarine": "ultramarine:local", + "load_reth": "docker.io/loadnetwork/load-reth:v0.1.2" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-1/engine.ipc" + }, + "ports": { + "el_http": 9545, + "el_p2p": 31303, + "el_metrics": 10001, + "cl_p2p": 28000, + "cl_metrics": 30000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-1.key", + "enode": "enode://7226cdabd7cbe15fe29655cde6dd5d4ca1b0c40fecaf7a4fe54e6b6169238efb6f1100e5f0e1a746bdfd4972bdfef21627ec056756a6ccfb178b38b130662226@203.0.113.11:31303", + "bootnodes": [ + "enode://6b49a6676952956d1995b8d4e3d006a9f46941a5a869da8060d907691f77649d6f48337a77ccbb03ad953344886bad3d8e5bdd4b574768ce748ba2f6db53ac63@203.0.113.10:30303", + "enode://79a053618e831d246ca210909a7fb089b06c08d6ca598810619d24aebf0d2655ef9ad8273ac79f84c10c5c424ed7890f077351938cc9b66c5937d1342b67c637@203.0.113.12:32303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example", + "bearer_token_present": false + } + }, + { + "id": "node-2", + "host": "host-2", + "role": "validator", + "images": { + "ultramarine": 
"ultramarine:local", + "load_reth": "docker.io/loadnetwork/load-reth:v0.1.2" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-2/engine.ipc" + }, + "ports": { + "el_http": 10545, + "el_p2p": 32303, + "el_metrics": 11001, + "cl_p2p": 29000, + "cl_metrics": 31000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-2.key", + "enode": "enode://79a053618e831d246ca210909a7fb089b06c08d6ca598810619d24aebf0d2655ef9ad8273ac79f84c10c5c424ed7890f077351938cc9b66c5937d1342b67c637@203.0.113.12:32303", + "bootnodes": [ + "enode://6b49a6676952956d1995b8d4e3d006a9f46941a5a869da8060d907691f77649d6f48337a77ccbb03ad953344886bad3d8e5bdd4b574768ce748ba2f6db53ac63@203.0.113.10:30303", + "enode://7226cdabd7cbe15fe29655cde6dd5d4ca1b0c40fecaf7a4fe54e6b6169238efb6f1100e5f0e1a746bdfd4972bdfef21627ec056756a6ccfb178b38b130662226@203.0.113.11:31303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "example", + "bearer_token_present": false + } + } + ], + "artifacts": { + "public": { + "genesis_json": { + "path": "bundle/public/genesis.json", + "sha256": "1643cbd6641602c85602871327a2e38924de95cabd64682e0d7d2b0080348fc7" + } + } + } +} \ No newline at end of file diff --git a/infra/networks/fibernet/bundle/public/genesis.json b/infra/networks/fibernet/bundle/public/genesis.json new file mode 100644 index 0000000..a72e627 --- /dev/null +++ b/infra/networks/fibernet/bundle/public/genesis.json @@ -0,0 +1,40 @@ +{ + "alloc": { + "0x197f818c1313dc58b32d88078ecdfb40ea822614": { + "balance": "0x422ca8b0a00a425000000" + }, + "0x2411c6e43299a09d2225d58e9e7977237b3e4c49": { + "balance": "0x422ca8b0a00a425000000" + } + }, + "baseFeePerGas": "0x7", + "coinbase": "0x0000000000000000000000000000000000000000", + "config": { + "berlinBlock": 0, + "byzantiumBlock": 0, + "cancunTime": 0, + "chainId": 1984, + "constantinopleBlock": 0, + "eip150Block": 0, + "eip155Block": 0, + "eip158Block": 0, + 
"homesteadBlock": 0, + "istanbulBlock": 0, + "londonBlock": 0, + "mergeNetsplitBlock": 0, + "petersburgBlock": 0, + "pragueTime": 0, + "shanghaiTime": 0, + "terminalTotalDifficulty": 0, + "terminalTotalDifficultyPassed": true + }, + "difficulty": "0x0", + "extraData": "0x4c6f6164204e6574776f726b20446576", + "gasLimit": "0x77359400", + "gasUsed": "0x0", + "mixHash": "0x0000000000000000000000000000000000000000000000000000000000000000", + "nonce": "0x0", + "number": "0x0", + "parentHash": "0x0000000000000000000000000000000000000000000000000000000000000000", + "timestamp": "0x698ac934" +} \ No newline at end of file diff --git a/infra/networks/fibernet/bundle/public/network.json b/infra/networks/fibernet/bundle/public/network.json new file mode 100644 index 0000000..e4285ac --- /dev/null +++ b/infra/networks/fibernet/bundle/public/network.json @@ -0,0 +1,234 @@ +{ + "schema_version": 1, + "tool": { + "name": "netgen", + "version": "0.1.0" + }, + "network": { + "name": "fibernet", + "chain_id": 1984 + }, + "nodes": [ + { + "id": "node-0", + "role": "validator", + "host": "f4-metal-medium-lon2-fibernet-1", + "public_ip": "67.213.117.143", + "ports": { + "el_http": 8545, + "el_authrpc": 8551, + "el_p2p": 30303, + "el_metrics": 9001, + "cl_p2p": 27000, + "cl_mempool": 28000, + "cl_metrics": 29000 + }, + "load_reth": { + "enode": "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "bootnodes": [ + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + 
"enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + "enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "loads3" + } + }, + { + "id": "node-1", + "role": "validator", + "host": "f4-metal-medium-lon2-fibernet-1", + "public_ip": "67.213.117.143", + "ports": { + "el_http": 8546, + "el_authrpc": 8552, + "el_p2p": 30304, + "el_metrics": 9002, + "cl_p2p": 27001, + "cl_mempool": 28001, + "cl_metrics": 29001 + }, + "load_reth": { + "enode": "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + 
"enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://load-s3-agent.load.network", + "provider_id": "load-s3-agent" + } + }, + { + "id": "node-2", + "role": "validator", + "host": "f4-metal-medium-ams-fibernet-2", + "public_ip": "64.34.87.1", + "ports": { + "el_http": 9545, + "el_authrpc": 9551, + "el_p2p": 31303, + "el_metrics": 10001, + "cl_p2p": 28000, + "cl_mempool": 29000, + "cl_metrics": 30000 + }, + "load_reth": { + "enode": "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + "enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": 
"https://load-s3-agent.load.network", + "provider_id": "load-s3-agent" + } + }, + { + "id": "node-3", + "role": "validator", + "host": "f4-metal-medium-ams-fibernet-2", + "public_ip": "64.34.87.1", + "ports": { + "el_http": 9546, + "el_authrpc": 9552, + "el_p2p": 31304, + "el_metrics": 10002, + "cl_p2p": 28001, + "cl_mempool": 29001, + "cl_metrics": 30001 + }, + "load_reth": { + "enode": "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + "enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://load-s3-agent.load.network", + "provider_id": "load-s3-agent" + } + }, + { + "id": "node-4", + "role": "validator", + "host": "f4-metal-medium-fra2-fibernet-2", + "public_ip": "67.213.121.57", + "ports": { + "el_http": 10545, + "el_authrpc": 10551, + "el_p2p": 32303, + "el_metrics": 11001, + "cl_p2p": 29000, + "cl_mempool": 30000, + "cl_metrics": 31000 + }, + "load_reth": { + "enode": 
"enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://load-s3-agent.load.network", + "provider_id": "load-s3-agent" + } + }, + { + "id": "node-5", + "role": "validator", + "host": "f4-metal-medium-fra2-fibernet-2", + "public_ip": "67.213.121.57", + "ports": { + "el_http": 10546, + "el_authrpc": 10552, + "el_p2p": 32304, + "el_metrics": 11002, + "cl_p2p": 29001, + "cl_mempool": 30001, + "cl_metrics": 31001 + }, + "load_reth": { + "enode": "enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + 
"enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://archiver.example.com", + "provider_id": "loads3" + } + }, + { + "id": "node-rpc", + "role": "fullnode", + "host": "blockscout-fibernet", + "public_ip": "72.46.84.15", + "ports": { + "el_http": 11545, + "el_authrpc": 11551, + "el_p2p": 33303, + "el_metrics": 12001, + "cl_p2p": 30000, + "cl_mempool": 31000, + "cl_metrics": 32000 + }, + "load_reth": { + "enode": "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + 
"enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + "enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304" + ] + }, + "archiver": null + } + ], + "artifacts": { + "public": { + "genesis_json": { + "path": "bundle/public/genesis.json", + "sha256": "72909350ccb198782324ca5da83647c194f20b9604a2d0ac75d131da3868baf5" + } + } + } +} \ No newline at end of file diff --git a/infra/networks/fibernet/inventory.yml b/infra/networks/fibernet/inventory.yml new file mode 100644 index 0000000..d3d216d --- /dev/null +++ b/infra/networks/fibernet/inventory.yml @@ -0,0 +1,27 @@ +all: + hosts: + blockscout-fibernet: + ansible_host: 72.46.84.15 + ansible_user: ubuntu + loadnet_nodes: + - node-rpc + f4-metal-medium-ams-fibernet-2: + ansible_host: 64.34.87.1 + ansible_user: ubuntu + loadnet_nodes: + - node-2 + - node-3 + f4-metal-medium-fra2-fibernet-2: + ansible_host: 67.213.121.57 + ansible_user: ubuntu + loadnet_nodes: + - node-4 + - node-5 + f4-metal-medium-lon2-fibernet-1: + ansible_host: 67.213.117.143 + ansible_user: ubuntu + loadnet_nodes: + - node-0 + - node-1 + vars: {} + children: {} diff --git a/infra/networks/fibernet/network.lock.json b/infra/networks/fibernet/network.lock.json new file mode 100644 index 0000000..5ed4c22 --- /dev/null +++ b/infra/networks/fibernet/network.lock.json @@ -0,0 +1,342 @@ +{ + "schema_version": 1, + "tool": { + "name": "netgen", + "version": "0.1.0" + }, + "network": { + "name": "fibernet", + "chain_id": 1984 + }, + "inputs": { + "manifest_path": "infra/manifests/fibernet.yaml", + "manifest_sha256": "14239f78d432ee65694dd42e35b2a6c609fcd5837e7b46450c81dca53e0797af" + }, + 
"policy": { + "engine": "ipc-only", + "sync_enabled": true, + "metrics_bind": "127.0.0.1", + "unsafe_failure_domains_allowed": true + }, + "hosts": [ + { + "id": "blockscout-fibernet", + "public_ip": "72.46.84.15", + "ssh_user": "ubuntu" + }, + { + "id": "f4-metal-medium-ams-fibernet-2", + "public_ip": "64.34.87.1", + "ssh_user": "ubuntu" + }, + { + "id": "f4-metal-medium-fra2-fibernet-2", + "public_ip": "67.213.121.57", + "ssh_user": "ubuntu" + }, + { + "id": "f4-metal-medium-lon2-fibernet-1", + "public_ip": "67.213.117.143", + "ssh_user": "ubuntu" + } + ], + "nodes": [ + { + "id": "node-0", + "host": "f4-metal-medium-lon2-fibernet-1", + "role": "validator", + "images": { + "ultramarine": "docker.io/loadnetwork/ultramarine:fibernet", + "load_reth": "docker.io/loadnetwork/load-reth:fibernet" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-0/engine.ipc" + }, + "ports": { + "el_http": 8545, + "el_authrpc": 8551, + "el_p2p": 30303, + "el_metrics": 9001, + "cl_p2p": 27000, + "cl_mempool": 28000, + "cl_metrics": 29000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-0.key", + "enode": "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "bootnodes": [ + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + 
"enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://load-s3-agent.load.network", + "provider_id": "load-s3-agent", + "bearer_token_present": true + } + }, + { + "id": "node-1", + "host": "f4-metal-medium-lon2-fibernet-1", + "role": "validator", + "images": { + "ultramarine": "docker.io/loadnetwork/ultramarine:fibernet", + "load_reth": "docker.io/loadnetwork/load-reth:fibernet" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-1/engine.ipc" + }, + "ports": { + "el_http": 8546, + "el_authrpc": 8552, + "el_p2p": 30304, + "el_metrics": 9002, + "cl_p2p": 27001, + "cl_mempool": 28001, + "cl_metrics": 29001 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-1.key", + "enode": "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + 
"enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://load-s3-agent.load.network", + "provider_id": "load-s3-agent", + "bearer_token_present": true + } + }, + { + "id": "node-2", + "host": "f4-metal-medium-ams-fibernet-2", + "role": "validator", + "images": { + "ultramarine": "docker.io/loadnetwork/ultramarine:fibernet", + "load_reth": "docker.io/loadnetwork/load-reth:fibernet" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-2/engine.ipc" + }, + "ports": { + "el_http": 9545, + "el_authrpc": 9551, + "el_p2p": 31303, + "el_metrics": 10001, + "cl_p2p": 28000, + "cl_mempool": 29000, + "cl_metrics": 30000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-2.key", + "enode": "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + 
"enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://load-s3-agent.load.network", + "provider_id": "load-s3-agent", + "bearer_token_present": true + } + }, + { + "id": "node-3", + "host": "f4-metal-medium-ams-fibernet-2", + "role": "validator", + "images": { + "ultramarine": "docker.io/loadnetwork/ultramarine:fibernet", + "load_reth": "docker.io/loadnetwork/load-reth:fibernet" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-3/engine.ipc" + }, + "ports": { + "el_http": 9546, + "el_authrpc": 9552, + "el_p2p": 31304, + "el_metrics": 10002, + "cl_p2p": 28001, + "cl_mempool": 29001, + "cl_metrics": 30001 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-3.key", + "enode": "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + 
"enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://load-s3-agent.load.network", + "provider_id": "load-s3-agent", + "bearer_token_present": true + } + }, + { + "id": "node-4", + "host": "f4-metal-medium-fra2-fibernet-2", + "role": "validator", + "images": { + "ultramarine": "docker.io/loadnetwork/ultramarine:fibernet", + "load_reth": "docker.io/loadnetwork/load-reth:fibernet" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-4/engine.ipc" + }, + "ports": { + "el_http": 10545, + "el_authrpc": 10551, + "el_p2p": 32303, + "el_metrics": 11001, + "cl_p2p": 29000, + "cl_mempool": 30000, + "cl_metrics": 31000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-4.key", + "enode": "enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + 
"enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://load-s3-agent.load.network", + "provider_id": "load-s3-agent", + "bearer_token_present": true + } + }, + { + "id": "node-5", + "host": "f4-metal-medium-fra2-fibernet-2", + "role": "validator", + "images": { + "ultramarine": "docker.io/loadnetwork/ultramarine:fibernet", + "load_reth": "docker.io/loadnetwork/load-reth:fibernet" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-5/engine.ipc" + }, + "ports": { + "el_http": 10546, + "el_authrpc": 10552, + "el_p2p": 32304, + "el_metrics": 11002, + "cl_p2p": 29001, + "cl_mempool": 30001, + "cl_metrics": 31001 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-5.key", + "enode": "enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + 
"enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303" + ] + }, + "archiver": { + "enabled": true, + "provider_url": "https://load-s3-agent.load.network", + "provider_id": "load-s3-agent", + "bearer_token_present": true + } + }, + { + "id": "node-rpc", + "host": "blockscout-fibernet", + "role": "fullnode", + "images": { + "ultramarine": "docker.io/loadnetwork/ultramarine:fibernet", + "load_reth": "docker.io/loadnetwork/load-reth:fibernet" + }, + "engine": { + "mode": "ipc", + "ipc_path": "/run/load-reth/node-rpc/engine.ipc" + }, + "ports": { + "el_http": 11545, + "el_authrpc": 11551, + "el_p2p": 33303, + "el_metrics": 12001, + "cl_p2p": 30000, + "cl_mempool": 31000, + "cl_metrics": 32000 + }, + "load_reth": { + "p2p_key_path": "bundle/private/load-reth/p2p-keys/node-rpc.key", + "enode": "enode://75366c8abe777f76d5fd6091e82e2e499efe8c10610a1dee724ff6ebfc8bd655b3e61cfea62e5340b75c9d8e4f75612f1dd21dea13363a28f06a9c51ec949ee1@72.46.84.15:33303", + "bootnodes": [ + "enode://7b40084dbcc14a0ee9119c8d31c3bfc82d9999a685ad5c738f60f028767bd6ec50958a41aa8e928300c457536bee4666d300bbf2ba898ff7793326673239647e@67.213.117.143:30303", + "enode://d4dcdd9e94ac204136b020a4711a248c782dc6c0ca093e082abf5dde015b8c787c859b7e479c1db47df2e2861f629abf0b8f37bdd7186039478df6991a9cb21b@67.213.117.143:30304", + "enode://96ba08af99d694218743fb4838a4c27ef36283bf1b1e621ff0d9e03e272cc74d31737ddca999329213b51fb1de9b917509a036f7d139b1686d07672ae3635783@64.34.87.1:31303", + "enode://b6afab72f0b820e27f98a25f5d1aced98e655d292101ad9eaf666dab0c57c9d778b27dfda16601e5489b876de04a61a70d62ebf64a02d83d18204244dc51768a@64.34.87.1:31304", + 
"enode://bf9d91378abb271be5a22ca842222b6660c171d00b730570d039f3e4c99abf31de5b711fe46db024d775c1ec8efedd20c2cea913373c94603f062e61532bf25e@67.213.121.57:32303", + "enode://8f46c95abfe0a2d17707df7ecc5e8ae4cc65ed5532a9826b0c46f27f0f2c186ddf4b3bef4c64756643d7532d05587924f6babe2db14f45645f05349f00dc56ee@67.213.121.57:32304" + ] + }, + "archiver": null + } + ], + "blockscout": { + "enabled": true, + "host": "blockscout-fibernet", + "rpc_node": "node-rpc", + "domains": { + "explorer": "fibernet.load.network", + "stats": "stats.fibernet.load.network", + "rpc": "rpc.fibernet.load.network" + }, + "ssl": { + "enabled": true, + "email": "admin@load.network" + } + }, + "artifacts": { + "public": { + "genesis_json": { + "path": "bundle/public/genesis.json", + "sha256": "72909350ccb198782324ca5da83647c194f20b9604a2d0ac75d131da3868baf5" + } + } + } +} \ No newline at end of file diff --git a/infra/networks/fibernet/secrets.sops.yaml b/infra/networks/fibernet/secrets.sops.yaml new file mode 100644 index 0000000..192016c --- /dev/null +++ b/infra/networks/fibernet/secrets.sops.yaml @@ -0,0 +1,29 @@ +schema_version: ENC[AES256_GCM,data:hA==,iv:2K146b55sE2bP9yZo8FU4wdUdKh6f4fSr7E3mbWd5WU=,tag:OVLPSOmJTeN/DbVjyUVZZQ==,type:int] +nodes: + node-0: + archiver_bearer_token: ENC[AES256_GCM,data:PRL2isUOfMBX6hDf8rLz,iv:IXWlpQG2B/+kba4OOu7qX12c2B9u7hyBVGFDm520WJU=,tag:Cza0qTJ+uiGPV0ae5WVDqQ==,type:str] + node-1: + archiver_bearer_token: ENC[AES256_GCM,data:BsVh9tVjOSrmhAqktAbU,iv:36X8LUDj3k6bRHC0Ou3AHUcfRg98JGBFHzJpWjMJqsw=,tag:TxIVMp/IoHDF1CuD7GSG/A==,type:str] + node-2: + archiver_bearer_token: ENC[AES256_GCM,data:bi90KMOeeM+E4eaQIti/,iv:o0Foh69CFsfcWJjOzegb6Hv5C9JY1hPqYeUawWbE5+c=,tag:Hev8b/L4zlMLtQ39HLMv5Q==,type:str] + node-3: + archiver_bearer_token: ENC[AES256_GCM,data:/zpL/4MDIEjV7CqR8Qsn,iv:hc6234u9xBX8MHxsMEpKe5upoUQdUd2Xr3hXYfYXM+E=,tag:Qo7RcFY5mcumacz1qN15EA==,type:str] + node-4: + archiver_bearer_token: 
ENC[AES256_GCM,data:OcJvHrvlRgLTCKEKUTDn,iv:g2K/5BtNlCq0CpYTz4puMI+/vPvH66j5dE8tKuJiseU=,tag:Ebix5Ak9EKPMQIigF1qWjA==,type:str] + node-5: + archiver_bearer_token: ENC[AES256_GCM,data:sDhcPg0p/w5zY/VzaUYZ,iv:200urNqVSqdpu4pjaKpVByL5HEV4KtnMCho0ewy7sso=,tag:Wb8UQACi6U1AJcePKL53vg==,type:str] +sops: + age: + - recipient: age16v6pmudrd5dmrarpm953skc65tfua0n7s8uwkxzdc3075zsuaq8qjv7an9 + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSAzK09MeExwbk00a2NLSlox + MXh1M290TXRRY1AvMWNTc0c1ZEJHbHlHK0Z3CnZpc0pCZFJYaHU3SldiNjZtMjd0 + Z0h2L3ZrQStuQWs1ejl4VHIzZG5wT1kKLS0tIHNwa3RaclNMdUVwZEFlcUZRREQy + VmI1MW1YcHUyMnh5TFBaQTJ3TEo2czgKlikuvNd7XpDq5IlNzmN75vS+W2KxjtL/ + MIKOqvailp/c5fgzFtx3/cTtIUAFtlivYnh6h2Tu8j710ZG4fkxH9A== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2026-01-09T15:20:14Z" + mac: ENC[AES256_GCM,data:CApOd33Lf/yp3M8ZStwWE8Ya7l8ovhQzpflx3q/7D8gT0N3wznsc66R7TygjN6aAuw/T443A2+JDlzh+niMG0rSqOfGCvNTiP9F+C3/aWukf+7kQJHg/00GxlaCnLoyrTNq0vWyJ7JHF5NqX4b1Ehk1qOEcjgDsBe/e75OcRIBM=,iv:ilR1n2QO45iKipmIKn2c05yNTnnLUHt2xPiNtVDkQo4=,tag:hRogi0gRwzXhFttXH1nl1Q==,type:str] + unencrypted_suffix: _unencrypted + version: 3.11.0 diff --git a/infra/templates/systemd/grafana.service.j2 b/infra/templates/systemd/grafana.service.j2 new file mode 100644 index 0000000..24ea15a --- /dev/null +++ b/infra/templates/systemd/grafana.service.j2 @@ -0,0 +1,31 @@ +[Unit] +Description=loadnet-grafana +After=network-online.target docker.service +Wants=network-online.target +Requires=docker.service + +[Service] +Type=simple +Restart=always +RestartSec=2 +LimitNOFILE=1048576 + +WorkingDirectory=/opt/loadnet/current + +ExecStartPre=/bin/bash -lc 'docker rm -f loadnet-grafana >/dev/null 2>&1 || true' + +ExecStart=/bin/bash -lc '/usr/bin/docker run --rm --name loadnet-grafana --network host --platform=linux/amd64 \ + --log-driver={{ loadnet_docker_log_driver | default("local") }} \ + --log-opt max-size={{ loadnet_docker_log_max_size | default("50m") }} \ + 
--log-opt max-file={{ loadnet_docker_log_max_file | default("5") }} \ + -e GF_SERVER_HTTP_ADDR={{ loadnet_grafana_bind | default("0.0.0.0") }} \ + -e GF_SERVER_HTTP_PORT={{ loadnet_grafana_port | default(3000) }} \ + -v /opt/loadnet/current/monitoring/grafana/grafana.ini:/etc/grafana/grafana.ini:ro \ + -v /opt/loadnet/current/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro \ + -v /var/lib/grafana:/var/lib/grafana \ + docker.io/grafana/grafana:10.4.2' + +ExecStop=/usr/bin/docker stop -t 10 loadnet-grafana + +[Install] +WantedBy=multi-user.target diff --git a/infra/templates/systemd/load-reth@.service.j2 b/infra/templates/systemd/load-reth@.service.j2 new file mode 100644 index 0000000..a189b53 --- /dev/null +++ b/infra/templates/systemd/load-reth@.service.j2 @@ -0,0 +1,83 @@ +[Unit] +Description=load-reth (%i) +After=network-online.target docker.service +Wants=network-online.target +Requires=docker.service + +[Service] +Type=simple +Restart=always +RestartSec=2 +LimitNOFILE=1048576 + +WorkingDirectory=/opt/loadnet/current + +EnvironmentFile=/opt/loadnet/current/bundle/private/env/load-reth-%i.env + +ExecStartPre=/bin/bash -lc 'test -f "/opt/loadnet/current/bundle/public/genesis.json"' +ExecStartPre=/bin/bash -lc 'test -f "/opt/loadnet/current/$LOAD_RETH_P2P_KEY_PATH"' +ExecStartPre=/bin/bash -lc 'docker image inspect "$LOAD_RETH_IMAGE" >/dev/null 2>&1 || docker pull "$LOAD_RETH_IMAGE"' +ExecStartPre=/bin/bash -lc 'rm -f "/run/load-reth/%i/engine.ipc"' +ExecStartPre=/bin/bash -lc 'docker rm -f load-reth-%i >/dev/null 2>&1 || true' + +ExecStart=/bin/bash -lc '/usr/bin/docker run --rm --name load-reth-%i --network host --platform=linux/amd64 \ + --cap-drop=ALL \ + --security-opt=no-new-privileges \ + --env HOME=/data/load-reth \ + --env XDG_CACHE_HOME=/data/load-reth/.cache \ + --env LOAD_RETH_RPC_SEND_RAW_TX_LIMIT={{ loadnet_el_rpc_bp_send_raw_tx_limit | default(1024) }} \ + --env LOAD_RETH_RPC_GET_TRANSACTION_COUNT_LIMIT={{ 
loadnet_el_rpc_bp_get_transaction_count_limit | default(2048) }} \ + --env LOAD_RETH_RPC_SEND_RAW_TX_SYNC_LIMIT={{ loadnet_el_rpc_bp_send_raw_tx_sync_limit | default(256) }} \ + --env LOAD_RETH_RPC_BATCH_RESPONSE_LIMIT_MB={{ loadnet_el_rpc_max_response_size | default(200) }} \ + --log-driver={{ loadnet_docker_log_driver | default("local") }} \ + --log-opt max-size={{ loadnet_docker_log_max_size | default("50m") }} \ + --log-opt max-file={{ loadnet_docker_log_max_file | default("5") }} \ + --user $LOAD_RETH_UID:$LOAD_RETH_GID \ + -v /var/lib/load-reth/%i:/data/load-reth \ + -v /opt/loadnet/current/bundle/public:/assets:ro \ + -v /opt/loadnet/current/$LOAD_RETH_P2P_KEY_PATH:/run/load-reth/%i/p2p.key:ro \ + -v /run/load-reth/%i:/run/load-reth/%i \ + $LOAD_RETH_IMAGE node \ + --datadir=/data/load-reth \ + --chain=/assets/genesis.json \ + --http --http.addr={{ loadnet_el_http_bind | default("0.0.0.0") }} --http.port=$LOAD_RETH_HTTP_PORT --http.api={{ loadnet_el_http_api | default("eth,net,web3,txpool,debug,trace") }} \ + --rpc.max-request-size={{ loadnet_el_rpc_max_request_size | default(100) }} \ + --rpc.max-response-size={{ loadnet_el_rpc_max_response_size | default(200) }} \ + --rpc.max-connections={{ loadnet_el_rpc_max_connections | default(12000) }} \ + --rpc.max-tracing-requests={{ loadnet_el_rpc_max_tracing_requests | default(256) }} \ + --rpc.max-blocking-io-requests={{ loadnet_el_rpc_max_blocking_io_requests | default(512) }} \ + --rpc.send-raw-transaction-sync-timeout={{ loadnet_el_rpc_send_raw_transaction_sync_timeout | default(loadnet_el_rpc_send_raw_tx_sync_timeout | default("3s")) }} \ + --rpc-cache.max-blocks={{ loadnet_el_rpc_cache_max_blocks | default(5000) }} \ + --rpc-cache.max-receipts={{ loadnet_el_rpc_cache_max_receipts | default(2000) }} \ + --rpc-cache.max-headers={{ loadnet_el_rpc_cache_max_headers | default(1000) }} \ + --rpc-cache.max-concurrent-db-requests={{ loadnet_el_rpc_cache_max_concurrent_db_requests | default(2048) }} \ + 
--authrpc.addr=127.0.0.1 --authrpc.port=$LOAD_RETH_AUTHRPC_PORT \ + --metrics=127.0.0.1:$LOAD_RETH_METRICS_PORT \ + --port=$LOAD_RETH_P2P_PORT --discovery.port=$LOAD_RETH_P2P_PORT \ + --nat=extip:$LOAD_RETH_PUBLIC_IP \ + --p2p-secret-key=/run/load-reth/%i/p2p.key \ + --bootnodes=$LOAD_RETH_BOOTNODES \ + --trusted-peers=$LOAD_RETH_BOOTNODES \ +{% if loadnet_el_debug_tip is defined and loadnet_el_debug_tip %} + --debug.tip={{ loadnet_el_debug_tip }} \ +{% endif %} + --auth-ipc --auth-ipc.path=$LOAD_RETH_ENGINE_IPC_PATH \ + --engine.persistence-threshold={{ loadnet_el_engine_persistence_threshold | default(0) }} \ + --builder.gaslimit={{ loadnet_el_builder_gaslimit | default(2000000000) }} \ + --builder.interval={{ loadnet_el_builder_interval | default("50ms") }} \ + --builder.deadline={{ loadnet_el_builder_deadline | default(2) }} \ + --builder.max-tasks={{ loadnet_el_builder_max_tasks | default(10) }} \ + --txpool.gas-limit={{ loadnet_el_txpool_gas_limit | default(2000000000) }} \ + --txpool.max-account-slots={{ loadnet_el_txpool_max_account_slots | default(64) }} \ + --txpool.pending-max-count={{ loadnet_el_txpool_pending_max_count | default(200000) }} \ + --txpool.queued-max-count={{ loadnet_el_txpool_queued_max_count | default(200000) }} \ + --txpool.pending-max-size={{ loadnet_el_txpool_pending_max_size | default(2048) }} \ + --txpool.queued-max-size={{ loadnet_el_txpool_queued_max_size | default(2048) }} \ + --txpool.max-pending-txns={{ loadnet_el_txpool_max_pending_txns | default(500000) }} \ + --txpool.max-new-txns={{ loadnet_el_txpool_max_new_txns | default(50000) }}' + +ExecStop=/usr/bin/docker stop -t 10 load-reth-%i +ExecStartPost=/bin/bash -lc 'for i in {1..60}; do if [ -S "/run/load-reth/%i/engine.ipc" ]; then chgrp {{ loadnet_ultramarine_gid | default(10002) }} "/run/load-reth/%i/engine.ipc"; chmod 660 "/run/load-reth/%i/engine.ipc"; exit 0; fi; sleep 1; done; echo "engine IPC not ready for permission fix: /run/load-reth/%i/engine.ipc" >&2; exit 0' + 
+[Install] +WantedBy=multi-user.target diff --git a/infra/templates/systemd/prometheus.service.j2 b/infra/templates/systemd/prometheus.service.j2 new file mode 100644 index 0000000..9001e08 --- /dev/null +++ b/infra/templates/systemd/prometheus.service.j2 @@ -0,0 +1,32 @@ +[Unit] +Description=loadnet-prometheus +After=network-online.target docker.service +Wants=network-online.target +Requires=docker.service + +[Service] +Type=simple +Restart=always +RestartSec=2 +LimitNOFILE=1048576 + +WorkingDirectory=/opt/loadnet/current + +ExecStartPre=/bin/bash -lc 'docker rm -f loadnet-prometheus >/dev/null 2>&1 || true' + +ExecStart=/bin/bash -lc '/usr/bin/docker run --rm --name loadnet-prometheus --network host --platform=linux/amd64 \ + --log-driver={{ loadnet_docker_log_driver | default("local") }} \ + --log-opt max-size={{ loadnet_docker_log_max_size | default("50m") }} \ + --log-opt max-file={{ loadnet_docker_log_max_file | default("5") }} \ + -v /opt/loadnet/current/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro \ + -v /opt/loadnet/current/monitoring/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro \ + -v /var/lib/prometheus:/prometheus \ + docker.io/prom/prometheus:v2.54.1 \ + --config.file=/etc/prometheus/prometheus.yml \ + --storage.tsdb.path=/prometheus \ + --web.listen-address={{ loadnet_prometheus_bind | default("127.0.0.1") }}:{{ loadnet_prometheus_port | default(9090) }}' + +ExecStop=/usr/bin/docker stop -t 10 loadnet-prometheus + +[Install] +WantedBy=multi-user.target diff --git a/infra/templates/systemd/ultramarine@.service.j2 b/infra/templates/systemd/ultramarine@.service.j2 new file mode 100644 index 0000000..cb3bc7b --- /dev/null +++ b/infra/templates/systemd/ultramarine@.service.j2 @@ -0,0 +1,47 @@ +[Unit] +Description=ultramarine (%i) +After=network-online.target docker.service load-reth@%i.service +Wants=network-online.target +Requires=docker.service load-reth@%i.service + +[Service] +Type=simple +Restart=always +RestartSec=2 
+LimitNOFILE=1048576 + +WorkingDirectory=/opt/loadnet/current + +EnvironmentFile=/opt/loadnet/current/bundle/private/env/ultramarine-%i.env +EnvironmentFile=-/opt/loadnet/current/bundle/private/ultramarine/secrets/%i.env + +ExecStartPre=/bin/bash -lc 'test -d "/var/lib/ultramarine/%i/config"' +ExecStartPre=/bin/bash -lc 'docker pull $ULTRAMARINE_IMAGE' +ExecStartPre=/bin/bash -lc 'for i in {1..60}; do if [ -S "$ULTRAMARINE_ENGINE_IPC_PATH" ]; then exit 0; fi; sleep 1; done; echo "engine IPC not ready: $ULTRAMARINE_ENGINE_IPC_PATH" >&2; exit 1' +ExecStartPre=/bin/bash -lc 'docker rm -f ultramarine-%i >/dev/null 2>&1 || true' + +ExecStart=/bin/bash -lc '/usr/bin/docker run --rm --name ultramarine-%i --network host --platform=linux/amd64 \ + --cap-drop=ALL \ + --security-opt=no-new-privileges \ + --log-driver={{ loadnet_docker_log_driver | default("local") }} \ + --log-opt max-size={{ loadnet_docker_log_max_size | default("50m") }} \ + --log-opt max-file={{ loadnet_docker_log_max_file | default("5") }} \ + --user $ULTRAMARINE_UID:$ULTRAMARINE_GID \ + --env ULTRAMARINE_ARCHIVER_ENABLED \ + --env ULTRAMARINE_ARCHIVER_PROVIDER_URL \ + --env ULTRAMARINE_ARCHIVER_PROVIDER_ID \ + --env ULTRAMARINE_ARCHIVER_BEARER_TOKEN \ + -v /var/lib/ultramarine/%i:/home \ + -v /run/load-reth/%i:/run/load-reth/%i \ + -v /opt/loadnet/current/bundle/public:/assets:ro \ + $ULTRAMARINE_IMAGE start \ + --home=/home \ + --engine-ipc-path=$ULTRAMARINE_ENGINE_IPC_PATH \ + --execution-genesis-path=/assets/genesis.json \ + --eth1-rpc-url=$ULTRAMARINE_ETH1_RPC_URL' + +# Allow 30 seconds for graceful shutdown (flush blob engine, drain archiver queue) +ExecStop=/usr/bin/docker stop -t 30 ultramarine-%i + +[Install] +WantedBy=multi-user.target diff --git a/infra/templates/tmpfiles/load-reth.conf.j2 b/infra/templates/tmpfiles/load-reth.conf.j2 new file mode 100644 index 0000000..16cde64 --- /dev/null +++ b/infra/templates/tmpfiles/load-reth.conf.j2 @@ -0,0 +1,4 @@ +d /run/load-reth 0755 root root - +{% 
for node_id in loadnet_nodes %} +d /run/load-reth/{{ node_id }} 0755 {{ loadnet_load_reth_uid | default(10001) }} {{ loadnet_load_reth_gid | default(10001) }} - +{% endfor %} diff --git a/typos.toml b/typos.toml index b5221aa..74df461 100644 --- a/typos.toml +++ b/typos.toml @@ -1,3 +1,7 @@ [default.extend-words] # Common shorthand used to avoid the Rust keyword `type`. typ = "typ" +# rsync flags: -H (hard links), -A (ACLs), -X (extended attributes) +HAX = "HAX" +# Recovery Time Objective - standard devops term +RTO = "RTO"