From 9f50fba6fb64ae75dee79eb85560427c6e2b33a3 Mon Sep 17 00:00:00 2001
From: Rishabh Patel
Date: Wed, 18 Mar 2026 13:03:00 -0700
Subject: [PATCH 1/2] HDDS-14859. Ignore transient errors while validating RocksDb on volumes.

---
 .../statemachine/DatanodeConfiguration.java   | 35 +++++++++++++++++++
 .../container/common/volume/HddsVolume.java   | 32 ++++++++++++-----
 2 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeConfiguration.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeConfiguration.java
index 41f6d36971ff..26be0cb2c42d 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeConfiguration.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeConfiguration.java
@@ -61,6 +61,7 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
   public static final String FAILED_DB_VOLUMES_TOLERATED_KEY = "hdds.datanode.failed.db.volumes.tolerated";
   public static final String DISK_CHECK_MIN_GAP_KEY = "hdds.datanode.disk.check.min.gap";
   public static final String DISK_CHECK_TIMEOUT_KEY = "hdds.datanode.disk.check.timeout";
+  public static final String DISK_CHECK_RETRY_GAP_KEY = "hdds.datanode.disk.check.retry.gap";
 
   // Minimum space should be left on volume.
   // Ex: If volume has 1000GB and minFreeSpace is configured as 10GB,
@@ -99,6 +100,8 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
 
   static final Duration DISK_CHECK_TIMEOUT_DEFAULT = Duration.ofMinutes(10);
 
+  static final Duration DISK_CHECK_RETRY_GAP_DEFAULT = Duration.ofMinutes(1);
+
   static final boolean CONTAINER_SCHEMA_V3_ENABLED_DEFAULT = true;
   static final long ROCKSDB_LOG_MAX_FILE_SIZE_BYTES_DEFAULT = 32 * 1024 * 1024;
   static final int ROCKSDB_LOG_MAX_FILE_NUM_DEFAULT = 64;
@@ -404,6 +407,17 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
   )
   private Duration diskCheckTimeout = DISK_CHECK_TIMEOUT_DEFAULT;
 
+  @Config(key = DISK_CHECK_RETRY_GAP_KEY,
+      defaultValue = "1m",
+      type = ConfigType.TIME,
+      tags = {DATANODE},
+      description = "Time to wait between retries of disk checks."
+          + " To ignore transient issues, the RocksDb instance on a disk is validated multiple times before"
+          + " declaring failure. This configuration defines the time to wait between the retry attempts."
+          + " Unit could be defined with postfix (ns,ms,s,m,h,d)."
+  )
+  private Duration diskCheckRetryGap = DISK_CHECK_RETRY_GAP_DEFAULT;
+
   @Config(key = "hdds.datanode.chunk.data.validation.check",
       defaultValue = "false",
       type = ConfigType.BOOLEAN,
@@ -688,6 +702,23 @@ public void validate() {
       diskCheckTimeout = DISK_CHECK_TIMEOUT_DEFAULT;
     }
 
+    if (diskCheckRetryGap.isNegative()) {
+      LOG.warn("{} must be greater than zero and was set to {}. Defaulting to {}",
+          DISK_CHECK_RETRY_GAP_KEY, diskCheckRetryGap, DISK_CHECK_RETRY_GAP_DEFAULT);
+      diskCheckRetryGap = DISK_CHECK_RETRY_GAP_DEFAULT;
+    }
+
+    if (diskCheckRetryGap.compareTo(diskCheckTimeout) > 0) {
+      // The default gap can itself exceed a short timeout, so never fall back to a
+      // value larger than the timeout; otherwise the gap would still be invalid.
+      final Duration fallback = DISK_CHECK_RETRY_GAP_DEFAULT.compareTo(diskCheckTimeout) > 0
+          ? diskCheckTimeout : DISK_CHECK_RETRY_GAP_DEFAULT;
+      LOG.warn("{} was set to {}. It must be less than {} which is {}. Defaulting to {}",
+          DISK_CHECK_RETRY_GAP_KEY, diskCheckRetryGap, DISK_CHECK_TIMEOUT_KEY, diskCheckTimeout,
+          fallback);
+      diskCheckRetryGap = fallback;
+    }
+
     if (blockDeleteCommandWorkerInterval.isNegative()) {
       LOG.warn(BLOCK_DELETE_COMMAND_WORKER_INTERVAL +
           " must be greater than zero and was set to {}. Defaulting to {}",
@@ -903,6 +934,10 @@ public Duration getDiskCheckTimeout() {
     return diskCheckTimeout;
   }
 
+  public Duration getDiskCheckRetryGap() {
+    return diskCheckRetryGap;
+  }
+
   public void setDiskCheckTimeout(Duration duration) {
     diskCheckTimeout = duration;
   }
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
index f331db7defc3..695616b46d89 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
@@ -25,6 +25,7 @@
 import jakarta.annotation.Nullable;
 import java.io.File;
 import java.io.IOException;
+import java.time.Duration;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
@@ -326,17 +327,30 @@ public VolumeCheckResult checkDbHealth(File dbFile) throws InterruptedException
       return VolumeCheckResult.HEALTHY;
     }
 
+    // We attempt to open RocksDb twice to ignore any transient errors
+    // and to confirm that we actually cannot open RocksDb in readonly mode.
     final boolean isVolumeTestResultHealthy = true;
-    try (ManagedOptions managedOptions = new ManagedOptions();
-         ManagedRocksDB ignored = ManagedRocksDB.openReadOnly(managedOptions, dbFile.toString())) {
-      volumeTestResultQueue.add(isVolumeTestResultHealthy);
-    } catch (Exception e) {
-      if (Thread.currentThread().isInterrupted()) {
-        throw new InterruptedException("Check of database for volume " + this + " interrupted.");
+    final int maxAttempts = 2;
+    final Duration maxRetryGap = getDatanodeConfig().getDiskCheckRetryGap();
+    for (int attempt = 0; attempt < maxAttempts; attempt++) {
+      try (ManagedOptions managedOptions = new ManagedOptions();
+           ManagedRocksDB ignored = ManagedRocksDB.openReadOnly(managedOptions, dbFile.toString())) {
+        volumeTestResultQueue.add(isVolumeTestResultHealthy);
+        break;
+      } catch (Exception e) {
+        if (Thread.currentThread().isInterrupted()) {
+          throw new InterruptedException("Check of database for volume " + this + " interrupted.");
+        }
+
+        if (attempt == maxAttempts - 1) {
+          LOG.error("Could not open Volume DB located at {}", dbFile, e);
+          volumeTestResultQueue.add(!isVolumeTestResultHealthy);
+          volumeTestFailureCount.incrementAndGet();
+        } else {
+          LOG.warn("Could not open Volume DB located at {}", dbFile, e);
+          Thread.sleep(maxRetryGap.toMillis());
+        }
       }
-      LOG.warn("Could not open Volume DB located at {}", dbFile, e);
-      volumeTestResultQueue.add(!isVolumeTestResultHealthy);
-      volumeTestFailureCount.incrementAndGet();
     }
 
     if (volumeTestResultQueue.size() > volumeTestCount

From c2ffea9233772089bd52304ddf2176f64ea56658 Mon Sep 17 00:00:00 2001
From: Rishabh Patel
Date: Tue, 24 Mar 2026 13:58:45 -0700
Subject: [PATCH 2/2] HDDS-14859. Add support for opening RocksDB as a secondary for volume validation.

---
 .../hadoop/ozone/container/common/volume/HddsVolume.java  |  5 +++--
 .../hadoop/hdds/utils/db/managed/ManagedRocksDB.java      | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
index 695616b46d89..1055590a62b1 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
@@ -328,13 +328,14 @@ public VolumeCheckResult checkDbHealth(File dbFile) throws InterruptedException
     }
 
     // We attempt to open RocksDb twice to ignore any transient errors
-    // and to confirm that we actually cannot open RocksDb in readonly mode.
+    // and to confirm that we actually cannot open RocksDb as a secondary instance.
     final boolean isVolumeTestResultHealthy = true;
     final int maxAttempts = 2;
     final Duration maxRetryGap = getDatanodeConfig().getDiskCheckRetryGap();
     for (int attempt = 0; attempt < maxAttempts; attempt++) {
       try (ManagedOptions managedOptions = new ManagedOptions();
-           ManagedRocksDB ignored = ManagedRocksDB.openReadOnly(managedOptions, dbFile.toString())) {
+           ManagedRocksDB ignored =
+               ManagedRocksDB.openAsSecondary(managedOptions, dbFile.toString(), getTmpDir().getPath())) {
         volumeTestResultQueue.add(isVolumeTestResultHealthy);
         break;
       } catch (Exception e) {
diff --git a/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java b/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
index 3401469f6824..420da42a0be0 100644
--- a/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
+++ b/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java
@@ -75,6 +75,23 @@ public static ManagedRocksDB openReadOnly(
     );
   }
 
+  /**
+   * Opens the database at {@code dbPath} as a RocksDB secondary instance and
+   * wraps it in a {@link ManagedRocksDB}.
+   *
+   * @param options options used to open the secondary instance
+   * @param dbPath path of the database to open
+   * @param secondaryDbLogFilePath path handed to RocksDB as the secondary path
+   *     (per the RocksJava documentation, the directory where the secondary
+   *     instance stores its info log)
+   * @return a managed wrapper around the opened secondary instance
+   * @throws RocksDBException if RocksDB cannot open the secondary instance
+   */
+  public static ManagedRocksDB openAsSecondary(
+      final ManagedOptions options,
+      final String dbPath,
+      final String secondaryDbLogFilePath)
+      throws RocksDBException {
+    return new ManagedRocksDB(RocksDB.openAsSecondary(options, dbPath, secondaryDbLogFilePath));
+  }
+
   public static ManagedRocksDB open(
       final DBOptions options, final String path,
       final List<ColumnFamilyDescriptor> columnFamilyDescriptors,