Skip to content

Commit f06e67d

Browse files
committed
feat: validator metrics reporting (CPU/RAM) with 5s interval and in-memory cache
- Add sysinfo crate to validator-node for system metrics collection
- Implement collect_system_metrics() for CPU and RAM usage
- Add 5-second interval reporting to platform-server
- Add MetricsCache with 5-minute TTL in platform-server state
- New endpoints: POST /api/v1/validators/metrics, GET /api/v1/validators/stats
- Add list_validators() query for fetching all validators
1 parent 19e9d1f commit f06e67d

File tree

8 files changed

+425
-7
lines changed

8 files changed

+425
-7
lines changed

Cargo.lock

Lines changed: 101 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bins/validator-node/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ anyhow = { workspace = true }
4444
hex = { workspace = true }
4545
parking_lot = { workspace = true }
4646
sp-core = { workspace = true }
47+
sysinfo = "0.31"
4748
uuid = { version = "1.0", features = ["v5"] }
4849

4950
# WebSocket for platform-server events

bins/validator-node/src/main.rs

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use std::net::SocketAddr;
2222
use std::path::PathBuf;
2323
use std::sync::Arc;
2424
use std::time::Duration;
25+
use sysinfo::System;
2526
use tokio_tungstenite::{connect_async, tungstenite::Message};
2627
use tracing::{debug, error, info, warn};
2728

@@ -240,6 +241,66 @@ fn default_memory() -> u64 {
240241
4096
241242
}
242243

244+
/// Collect current system metrics (CPU and memory)
245+
fn collect_system_metrics() -> (f32, u64, u64) {
246+
let mut sys = System::new_all();
247+
sys.refresh_all();
248+
249+
let cpu_percent = sys.global_cpu_usage();
250+
let memory_used_mb = sys.used_memory() / 1024 / 1024;
251+
let memory_total_mb = sys.total_memory() / 1024 / 1024;
252+
253+
(cpu_percent, memory_used_mb, memory_total_mb)
254+
}
255+
256+
/// Report metrics to platform server
257+
async fn report_metrics_to_platform(
258+
client: &reqwest::Client,
259+
platform_url: &str,
260+
keypair: &Keypair,
261+
hotkey: &str,
262+
) -> anyhow::Result<()> {
263+
use std::time::{SystemTime, UNIX_EPOCH};
264+
265+
let (cpu_percent, memory_used_mb, memory_total_mb) = collect_system_metrics();
266+
267+
let timestamp = SystemTime::now()
268+
.duration_since(UNIX_EPOCH)
269+
.unwrap()
270+
.as_secs() as i64;
271+
272+
let message = format!("metrics:{}:{}", hotkey, timestamp);
273+
let signature = keypair.sign_bytes(message.as_bytes()).unwrap_or_default();
274+
let signature_hex = format!("0x{}", hex::encode(signature));
275+
276+
let payload = serde_json::json!({
277+
"hotkey": hotkey,
278+
"signature": signature_hex,
279+
"timestamp": timestamp,
280+
"cpu_percent": cpu_percent,
281+
"memory_used_mb": memory_used_mb,
282+
"memory_total_mb": memory_total_mb,
283+
});
284+
285+
let url = format!("{}/api/v1/validators/metrics", platform_url);
286+
287+
client
288+
.post(&url)
289+
.json(&payload)
290+
.timeout(std::time::Duration::from_secs(5))
291+
.send()
292+
.await?;
293+
294+
debug!(
295+
cpu = %cpu_percent,
296+
mem_used = %memory_used_mb,
297+
mem_total = %memory_total_mb,
298+
"Reported metrics to platform"
299+
);
300+
301+
Ok(())
302+
}
303+
243304
/// Custom event from a challenge
244305
#[derive(Debug, Clone, serde::Deserialize)]
245306
pub struct ChallengeCustomEvent {
@@ -576,6 +637,15 @@ async fn main() -> Result<()> {
576637
let netuid = args.netuid;
577638
let version_key = args.version_key;
578639
let mut interval = tokio::time::interval(Duration::from_secs(60));
640+
let mut metrics_interval = tokio::time::interval(Duration::from_secs(5));
641+
642+
// Create HTTP client and extract values for metrics reporting
643+
let metrics_client = reqwest::Client::builder()
644+
.timeout(Duration::from_secs(10))
645+
.build()
646+
.expect("HTTP client for metrics");
647+
let platform_url = args.platform_server.clone();
648+
let hotkey = keypair.ss58_address();
579649

580650
loop {
581651
tokio::select! {
@@ -599,6 +669,17 @@ async fn main() -> Result<()> {
599669
debug!("Heartbeat");
600670
}
601671

672+
_ = metrics_interval.tick() => {
673+
if let Err(e) = report_metrics_to_platform(
674+
&metrics_client,
675+
&platform_url,
676+
&keypair,
677+
&hotkey,
678+
).await {
679+
debug!("Failed to report metrics: {}", e);
680+
}
681+
}
682+
602683
_ = tokio::signal::ctrl_c() => {
603684
info!("Shutting down...");
604685
break;

crates/platform-server/src/api/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ pub mod evaluations;
77
pub mod events;
88
pub mod jobs;
99
pub mod leaderboard;
10-
pub mod llm;
10+
// LLM proxy moved to term-challenge-server (via bridge)
11+
// pub mod llm;
1112
pub mod submissions;
1213
pub mod validators;

0 commit comments

Comments
 (0)