Skip to content

Commit b50ce52

Browse files
authored
feat: add challenge integration infrastructure with checkpoint persistence (#2)
* feat: add challenges directory structure and workspace configuration * feat: add challenge-registry crate for challenge lifecycle management Create new platform-challenge-registry crate with: - Challenge discovery and registration - Version management (semver-based) - Lifecycle state machine (registered/starting/running/stopping/stopped) - Health monitoring with configurable checks - State persistence and hot-reload support - Migration planning for version upgrades Modules: - registry: Main registry with CRUD operations - lifecycle: State machine for challenge states - health: Health monitoring and status tracking - state: State snapshots for hot-reload - discovery: Challenge discovery from various sources - migration: Version migration planning - version: Semantic versioning support - error: Registry-specific error types * feat(core): add checkpoint system for state persistence * feat: add restoration system for checkpoint recovery * feat(rpc-server): add health check endpoints for rolling updates * docs: add challenge integration guide * test: add integration tests for checkpoint and restoration system * feat: add graceful shutdown with checkpoint persistence - Add ShutdownHandler struct for checkpoint management - Create periodic checkpoints every 5 minutes - Save final checkpoint on graceful shutdown (Ctrl+C) - Persist evaluation state for hot-reload recovery This enables validators to update without losing evaluation progress.
1 parent f892ea9 commit b50ce52

File tree

25 files changed

+5051
-2
lines changed

25 files changed

+5051
-2
lines changed

Cargo.lock

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ members = [
55
"crates/storage",
66
"crates/distributed-storage",
77
"crates/challenge-sdk",
8+
"crates/challenge-registry",
89
"crates/epoch",
910
"crates/bittensor-integration",
1011
"crates/subnet-manager",
@@ -20,6 +21,10 @@ members = [
2021
# Note: WASM runtime removed - updates via git, version checked at handshake
2122
# Note: P2P-only architecture - no centralized platform-server
2223

24+
# Challenge crates can be added here or as optional path/git dependencies
25+
# Example:
26+
# "challenges/example-challenge",
27+
2328
[workspace.package]
2429
version = "0.1.0"
2530
edition = "2021"
@@ -95,3 +100,11 @@ type_complexity = "allow"
95100
await_holding_lock = "warn" # TODO: Fix async lock issues properly
96101
collapsible_match = "allow"
97102
collapsible_if = "allow"
103+
104+
# Workspace-level feature flags for challenge integration
105+
# Individual crates can enable these by adding features in their Cargo.toml:
106+
# [features]
107+
# dynamic-challenges = ["libloading"]
108+
[workspace.metadata.challenge-features]
109+
# Enable dynamic challenge loading (crates opt-in via features)
110+
dynamic-loading-available = true

bins/validator-node/src/main.rs

Lines changed: 124 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,108 @@ use platform_bittensor::{
1212
sync_metagraph, BittensorClient, BlockSync, BlockSyncConfig, BlockSyncEvent, Metagraph,
1313
Subtensor, SubtensorClient,
1414
};
15-
use platform_core::{Hotkey, Keypair, SUDO_KEY_SS58};
15+
use platform_core::{
16+
checkpoint::{
17+
CheckpointData, CheckpointManager, CompletedEvaluationState, PendingEvaluationState,
18+
WeightVoteState,
19+
},
20+
Hotkey, Keypair, SUDO_KEY_SS58,
21+
};
1622
use platform_distributed_storage::{
1723
DistributedStoreExt, LocalStorage, LocalStorageBuilder, StorageKey,
1824
};
1925
use platform_p2p_consensus::{
2026
ChainState, ConsensusEngine, NetworkEvent, P2PConfig, P2PMessage, P2PNetwork, StateManager,
2127
ValidatorRecord, ValidatorSet,
2228
};
23-
use std::path::PathBuf;
29+
use std::path::{Path, PathBuf};
2430
use std::sync::Arc;
2531
use std::time::Duration;
2632
use tracing::{debug, error, info, warn};
2733

2834
/// Storage key for persisted chain state
2935
const STATE_STORAGE_KEY: &str = "chain_state";
3036

37+
// ==================== Shutdown Handler ====================
38+
39+
/// Handles graceful shutdown with state persistence
40+
struct ShutdownHandler {
41+
checkpoint_manager: CheckpointManager,
42+
state_manager: Arc<StateManager>,
43+
netuid: u16,
44+
}
45+
46+
impl ShutdownHandler {
47+
fn new(checkpoint_dir: &Path, state_manager: Arc<StateManager>, netuid: u16) -> Result<Self> {
48+
let checkpoint_manager = CheckpointManager::new(checkpoint_dir.join("checkpoints"), 10)?;
49+
Ok(Self {
50+
checkpoint_manager,
51+
state_manager,
52+
netuid,
53+
})
54+
}
55+
56+
/// Create checkpoint from current state
57+
fn create_checkpoint(&mut self) -> Result<()> {
58+
let state = self.state_manager.snapshot();
59+
60+
let mut checkpoint_data = CheckpointData::new(state.sequence, state.epoch, self.netuid);
61+
62+
// Convert pending evaluations
63+
for (id, record) in &state.pending_evaluations {
64+
let pending = PendingEvaluationState {
65+
submission_id: id.clone(),
66+
challenge_id: record.challenge_id,
67+
miner: record.miner.clone(),
68+
submission_hash: record.agent_hash.clone(),
69+
scores: record
70+
.evaluations
71+
.iter()
72+
.map(|(k, v)| (k.clone(), v.score))
73+
.collect(),
74+
created_at: record.created_at,
75+
finalizing: record.finalized,
76+
};
77+
checkpoint_data.add_pending(pending);
78+
}
79+
80+
// Convert completed evaluations (current epoch only)
81+
if let Some(completed) = state.completed_evaluations.get(&state.epoch) {
82+
for record in completed {
83+
if let Some(score) = record.aggregated_score {
84+
let completed_state = CompletedEvaluationState {
85+
submission_id: record.submission_id.clone(),
86+
challenge_id: record.challenge_id,
87+
final_score: score,
88+
epoch: state.epoch,
89+
completed_at: record.finalized_at.unwrap_or(record.created_at),
90+
};
91+
checkpoint_data.add_completed(completed_state);
92+
}
93+
}
94+
}
95+
96+
// Convert weight votes
97+
if let Some(ref votes) = state.weight_votes {
98+
checkpoint_data.weight_votes = Some(WeightVoteState {
99+
epoch: votes.epoch,
100+
netuid: votes.netuid,
101+
votes: votes.votes.clone(),
102+
finalized: votes.finalized,
103+
final_weights: votes.final_weights.clone(),
104+
});
105+
}
106+
107+
checkpoint_data.bittensor_block = state.bittensor_block;
108+
109+
self.checkpoint_manager
110+
.create_checkpoint(&checkpoint_data)?;
111+
info!("Shutdown checkpoint created at sequence {}", state.sequence);
112+
113+
Ok(())
114+
}
115+
}
116+
31117
// ==================== CLI ====================
32118

33119
#[derive(Parser, Debug)]
@@ -252,6 +338,22 @@ async fn main() -> Result<()> {
252338
bittensor_client_for_metagraph = None;
253339
}
254340

341+
// Initialize shutdown handler for graceful checkpoint persistence
342+
let mut shutdown_handler =
343+
match ShutdownHandler::new(&data_dir, state_manager.clone(), args.netuid) {
344+
Ok(handler) => {
345+
info!("Shutdown handler initialized with checkpoint directory");
346+
Some(handler)
347+
}
348+
Err(e) => {
349+
warn!(
350+
"Failed to initialize shutdown handler: {}. Checkpoints disabled.",
351+
e
352+
);
353+
None
354+
}
355+
};
356+
255357
info!("Decentralized validator running. Press Ctrl+C to stop.");
256358

257359
let netuid = args.netuid;
@@ -260,6 +362,7 @@ async fn main() -> Result<()> {
260362
let mut metagraph_interval = tokio::time::interval(Duration::from_secs(300));
261363
let mut stale_check_interval = tokio::time::interval(Duration::from_secs(60));
262364
let mut state_persist_interval = tokio::time::interval(Duration::from_secs(60));
365+
let mut checkpoint_interval = tokio::time::interval(Duration::from_secs(300)); // 5 minutes
263366

264367
loop {
265368
tokio::select! {
@@ -335,8 +438,27 @@ async fn main() -> Result<()> {
335438
debug!("Active validators: {}", validator_set.active_count());
336439
}
337440

441+
// Periodic checkpoint
442+
_ = checkpoint_interval.tick() => {
443+
if let Some(handler) = shutdown_handler.as_mut() {
444+
if let Err(e) = handler.create_checkpoint() {
445+
warn!("Failed to create periodic checkpoint: {}", e);
446+
} else {
447+
debug!("Periodic checkpoint created");
448+
}
449+
}
450+
}
451+
338452
// Ctrl+C
339453
_ = tokio::signal::ctrl_c() => {
454+
info!("Received shutdown signal, creating final checkpoint...");
455+
if let Some(handler) = shutdown_handler.as_mut() {
456+
if let Err(e) = handler.create_checkpoint() {
457+
error!("Failed to create shutdown checkpoint: {}", e);
458+
} else {
459+
info!("Shutdown checkpoint saved successfully");
460+
}
461+
}
340462
info!("Shutting down...");
341463
break;
342464
}

challenges/.gitkeep

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

challenges/README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Platform Challenge Crates
2+
3+
This directory contains challenge crates that can be integrated with the Platform validator network.
4+
5+
## Directory Structure
6+
7+
```
8+
challenges/
9+
├── README.md # This file
10+
├── example-challenge/ # Example challenge template (future)
11+
└── [your-challenge]/ # Your custom challenge crate
12+
```
13+
14+
## Adding a New Challenge Crate
15+
16+
1. Create your challenge crate in this directory or reference it as a git dependency
17+
2. Implement the `Challenge` trait from `platform-challenge-sdk`
18+
3. Register your challenge in the challenge registry
19+
4. If the crate lives in this directory, add its path to the `members` list in the workspace `Cargo.toml`
20+
21+
## External Challenge Crates
22+
23+
Challenge crates can also be external (like term-challenge). They should:
24+
- Import `platform-challenge-sdk` as a dependency
25+
- Implement the `ServerChallenge` trait
26+
- Provide Docker configuration for evaluation
27+
28+
## Challenge Crate Requirements
29+
30+
- Must implement `platform_challenge_sdk::ServerChallenge` (the `ServerChallenge` trait from the `platform-challenge-sdk` crate)
31+
- Must provide `/evaluate` HTTP endpoint
32+
- Must handle graceful shutdown signals
33+
- Must support state persistence for hot-reload
34+
35+
## Example
36+
37+
See [term-challenge](https://github.com/PlatformNetwork/term-challenge) for a complete example.
38+
39+
## Documentation
40+
41+
For detailed integration instructions, see the [Challenge Integration Guide](../docs/challenge-integration.md).
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
[package]
2+
name = "platform-challenge-registry"
3+
version.workspace = true
4+
edition.workspace = true
5+
description = "Challenge registry and lifecycle management for Platform Network"
6+
7+
[dependencies]
8+
platform-core = { path = "../core" }
9+
platform-challenge-sdk = { path = "../challenge-sdk" }
10+
platform-storage = { path = "../storage" }
11+
12+
# Async
13+
tokio = { workspace = true }
14+
async-trait = { workspace = true }
15+
futures = { workspace = true }
16+
17+
# Serialization
18+
serde = { workspace = true }
19+
serde_json = { workspace = true }
20+
bincode = { workspace = true }
21+
22+
# Utils
23+
tracing = { workspace = true }
24+
thiserror = { workspace = true }
25+
anyhow = { workspace = true }
26+
chrono = { workspace = true }
27+
parking_lot = { workspace = true }
28+
uuid = { workspace = true }
29+
30+
# Crypto for checksums
31+
sha2 = { workspace = true }
32+
hex = { workspace = true }
33+
34+
# Versioning
35+
semver = "1.0"
36+
37+
# Health checks
38+
reqwest = { workspace = true }
39+
40+
[dev-dependencies]
41+
tempfile = { workspace = true }
42+
tokio-test = { workspace = true }

0 commit comments

Comments
 (0)