Skip to content

Commit 00df049

Browse files
committed
fix: correct sled corruption recovery to remove actual db files
The previous recovery code tried to remove data_dir/state/, which doesn't exist: sled stores its files directly in data_dir (db, conf, snap.*, blobs/). This caused validators with corrupted storage to restart in a loop instead of auto-recovering.

Changes:
- Storage recovery now correctly removes the sled files (db, conf, snap.*, blobs/)
- Preserves the distributed-db directory (handled separately)
- ChainState recovery simplified: corrupted data is overwritten on save
1 parent 1039741 commit 00df049

File tree

1 file changed

+22
-12
lines changed

1 file changed

+22
-12
lines changed

bins/validator-node/src/main.rs

Lines changed: 22 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -348,13 +348,26 @@ async fn main() -> Result<()> {
348348
Ok(s) => s,
349349
Err(e) if is_corruption_error(&e) => {
350350
warn!("Storage corruption detected: {}. Attempting recovery...", e);
351-
// Delete corrupted state file
352-
let state_path = data_dir.join("state");
353-
if state_path.exists() {
354-
warn!("Removing corrupted state directory: {:?}", state_path);
355-
std::fs::remove_dir_all(&state_path)?;
351+
// Delete corrupted sled database files (stored directly in data_dir)
352+
// Sled files: db, conf, snap.*, blobs/
353+
warn!("Removing corrupted sled database in: {:?}", data_dir);
354+
for entry in std::fs::read_dir(&data_dir).into_iter().flatten() {
355+
if let Ok(entry) = entry {
356+
let path = entry.path();
357+
let name = entry.file_name();
358+
let name_str = name.to_string_lossy();
359+
// Remove sled files but preserve distributed-db (handled separately)
360+
if name_str == "db" || name_str == "conf" || name_str.starts_with("snap.") || name_str == "blobs" {
361+
warn!("Removing sled file: {:?}", path);
362+
if path.is_dir() {
363+
let _ = std::fs::remove_dir_all(&path);
364+
} else {
365+
let _ = std::fs::remove_file(&path);
366+
}
367+
}
368+
}
356369
}
357-
// Retry opening
370+
// Retry opening (will create fresh database)
358371
Storage::open(&data_dir)?
359372
}
360373
Err(e) => return Err(e.into()),
@@ -441,12 +454,9 @@ async fn main() -> Result<()> {
441454
}
442455
Err(e) if is_corruption_error(&e) => {
443456
warn!("Chain state corruption detected: {}. Creating fresh state...", e);
444-
// Delete corrupted state
445-
let state_path = data_dir.join("state");
446-
if state_path.exists() {
447-
warn!("Removing corrupted state: {:?}", state_path);
448-
std::fs::remove_dir_all(&state_path)?;
449-
}
457+
// Note: Storage is already open, corrupted data will be overwritten on save
458+
// The state is stored in sled's "state" tree, which will be updated
459+
// when save_state() is called with the fresh state
450460

451461
// Create fresh state
452462
let sudo_key = if let Some(sudo_hex) = &args.sudo_key {

0 commit comments

Comments
 (0)