Skip to content

Commit d8e28b6

Browse files
committed
feat: add infinite retry loop (30s) for platform-server connection in validator-node
All PlatformServerClient methods now retry infinitely every 30s: - health_with_retry(): waits for platform-server to be reachable - list_challenges(): retries until successful response - get_weights(): retries until successful response Validator startup now waits for platform-server connection before proceeding.
1 parent 2c6cb55 commit d8e28b6

File tree

1 file changed

+121
-39
lines changed

1 file changed

+121
-39
lines changed

bins/validator-node/src/main.rs

Lines changed: 121 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,39 @@ impl PlatformServerClient {
4141
}
4242
}
4343

44+
/// Health check with infinite retry loop (30s interval)
45+
pub async fn health_with_retry(&self) -> bool {
46+
let mut attempt = 0u64;
47+
loop {
48+
attempt += 1;
49+
match self
50+
.client
51+
.get(format!("{}/health", self.base_url))
52+
.send()
53+
.await
54+
{
55+
Ok(r) if r.status().is_success() => {
56+
info!("Platform server connected (attempt {})", attempt);
57+
return true;
58+
}
59+
Ok(r) => {
60+
warn!(
61+
"Platform server health check failed: {} (attempt {}, retrying in 30s)",
62+
r.status(),
63+
attempt
64+
);
65+
}
66+
Err(e) => {
67+
warn!(
68+
"Platform server not reachable: {} (attempt {}, retrying in 30s)",
69+
e, attempt
70+
);
71+
}
72+
}
73+
tokio::time::sleep(Duration::from_secs(30)).await;
74+
}
75+
}
76+
4477
pub async fn health(&self) -> bool {
4578
self.client
4679
.get(format!("{}/health", self.base_url))
@@ -50,42 +83,96 @@ impl PlatformServerClient {
5083
.unwrap_or(false)
5184
}
5285

86+
/// List challenges with infinite retry loop (30s interval)
5387
pub async fn list_challenges(&self) -> Result<Vec<ChallengeInfo>> {
54-
Ok(self
55-
.client
56-
.get(format!("{}/api/v1/challenges", self.base_url))
57-
.send()
58-
.await?
59-
.json()
60-
.await?)
88+
let url = format!("{}/api/v1/challenges", self.base_url);
89+
let mut attempt = 0u64;
90+
loop {
91+
attempt += 1;
92+
match self.client.get(&url).send().await {
93+
Ok(resp) if resp.status().is_success() => {
94+
match resp.json::<Vec<ChallengeInfo>>().await {
95+
Ok(challenges) => return Ok(challenges),
96+
Err(e) => {
97+
warn!(
98+
"Failed to parse challenges response: {} (attempt {}, retrying in 30s)",
99+
e, attempt
100+
);
101+
}
102+
}
103+
}
104+
Ok(resp) => {
105+
warn!(
106+
"Failed to list challenges: {} (attempt {}, retrying in 30s)",
107+
resp.status(),
108+
attempt
109+
);
110+
}
111+
Err(e) => {
112+
warn!(
113+
"Platform server not reachable: {} (attempt {}, retrying in 30s)",
114+
e, attempt
115+
);
116+
}
117+
}
118+
tokio::time::sleep(Duration::from_secs(30)).await;
119+
}
61120
}
62121

122+
/// Get weights with infinite retry loop (30s interval)
63123
pub async fn get_weights(&self, challenge_id: &str, epoch: u64) -> Result<Vec<(u16, u16)>> {
64-
let resp: serde_json::Value = self
65-
.client
66-
.get(format!(
67-
"{}/api/v1/challenges/{}/get_weights?epoch={}",
68-
self.base_url, challenge_id, epoch
69-
))
70-
.send()
71-
.await?
72-
.json()
73-
.await?;
74-
75-
Ok(resp
76-
.get("weights")
77-
.and_then(|w| w.as_array())
78-
.map(|arr| {
79-
arr.iter()
80-
.filter_map(|w| {
81-
Some((
82-
w.get("uid")?.as_u64()? as u16,
83-
w.get("weight")?.as_u64()? as u16,
84-
))
85-
})
86-
.collect()
87-
})
88-
.unwrap_or_default())
124+
let url = format!(
125+
"{}/api/v1/challenges/{}/get_weights?epoch={}",
126+
self.base_url, challenge_id, epoch
127+
);
128+
let mut attempt = 0u64;
129+
loop {
130+
attempt += 1;
131+
match self.client.get(&url).send().await {
132+
Ok(resp) if resp.status().is_success() => {
133+
match resp.json::<serde_json::Value>().await {
134+
Ok(data) => {
135+
let weights = data
136+
.get("weights")
137+
.and_then(|w| w.as_array())
138+
.map(|arr| {
139+
arr.iter()
140+
.filter_map(|w| {
141+
Some((
142+
w.get("uid")?.as_u64()? as u16,
143+
w.get("weight")?.as_u64()? as u16,
144+
))
145+
})
146+
.collect()
147+
})
148+
.unwrap_or_default();
149+
return Ok(weights);
150+
}
151+
Err(e) => {
152+
warn!(
153+
"Failed to parse weights response: {} (attempt {}, retrying in 30s)",
154+
e, attempt
155+
);
156+
}
157+
}
158+
}
159+
Ok(resp) => {
160+
warn!(
161+
"Failed to get weights for {}: {} (attempt {}, retrying in 30s)",
162+
challenge_id,
163+
resp.status(),
164+
attempt
165+
);
166+
}
167+
Err(e) => {
168+
warn!(
169+
"Platform server not reachable: {} (attempt {}, retrying in 30s)",
170+
e, attempt
171+
);
172+
}
173+
}
174+
tokio::time::sleep(Duration::from_secs(30)).await;
175+
}
89176
}
90177
}
91178

@@ -190,15 +277,10 @@ async fn main() -> Result<()> {
190277
)));
191278
let bans = Arc::new(RwLock::new(BanList::default()));
192279

193-
// Platform server
280+
// Platform server - wait until connected (infinite retry)
194281
let platform_client = Arc::new(PlatformServerClient::new(&args.platform_server));
195282
info!("Platform server: {}", args.platform_server);
196-
197-
if platform_client.health().await {
198-
info!("Platform server: connected");
199-
} else {
200-
warn!("Platform server: not reachable (will retry)");
201-
}
283+
platform_client.health_with_retry().await;
202284

203285
// List challenges
204286
match platform_client.list_challenges().await {

0 commit comments

Comments
 (0)