Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
public static final String FAILED_DB_VOLUMES_TOLERATED_KEY = "hdds.datanode.failed.db.volumes.tolerated";
public static final String DISK_CHECK_MIN_GAP_KEY = "hdds.datanode.disk.check.min.gap";
public static final String DISK_CHECK_TIMEOUT_KEY = "hdds.datanode.disk.check.timeout";

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please revert this change.

// Minimum space should be left on volume.
// Ex: If volume has 1000GB and minFreeSpace is configured as 10GB,
// In this case when availableSpace is 10GB or below, volume is assumed as full
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -708,23 +708,11 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused)
return VolumeCheckResult.HEALTHY;
}

// Move the sliding window of IO test results forward 1 by adding the
// latest entry and removing the oldest entry from the window.
// Update the failure counter for the new window.
ioTestSlidingWindow.add(diskChecksPassed);
if (!diskChecksPassed) {
currentIOFailureCount.incrementAndGet();
}
if (ioTestSlidingWindow.size() > ioTestCount &&
Objects.equals(ioTestSlidingWindow.poll(), Boolean.FALSE)) {
currentIOFailureCount.decrementAndGet();
}

// If the failure threshold has been crossed, fail the volume without
// further scans.
// Once the volume is failed, it will not be checked anymore.
// The failure counts can be left as is.
if (currentIOFailureCount.get() > ioFailureTolerance) {
// Move the sliding window of IO test results forward 1 and check threshold.
if (advanceIOWindow(diskChecksPassed)) {
// If the failure threshold has been crossed, fail the volume without
// further scans. Once the volume is failed, it will not be checked
// anymore. The failure counts can be left as is.
LOG.error("Failed IO test for volume {}: the last {} runs " +
"encountered {} out of {} tolerated failures.", this,
ioTestSlidingWindow.size(), currentIOFailureCount,
Expand All @@ -740,6 +728,65 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused)
return VolumeCheckResult.HEALTHY;
}

/**
* Called by {@link StorageVolumeChecker} when a volume check times out —
* either because the global {@code checkAllVolumes()} latch expired before
* this volume's async check completed, or because the per-check timeout
* inside {@link ThrottledAsyncChecker} fired.
*
* <p>Records a synthetic IO-test failure in the existing sliding window,
* making latch timeouts subject to the same {@code ioFailureTolerance}
* threshold as genuine read/write failures. No separate configuration key
* is required: the existing
* {@code hdds.datanode.disk.check.io.failures.tolerated} governs both.
*
* <p>Recovery is automatic: each successful {@link #check} call records a
* {@code true} entry in the window, gradually evicting the synthetic
* failure once {@code ioTestCount} healthy results have accumulated.
*
* @return {@code true} if {@code currentIOFailureCount > ioFailureTolerance},
* meaning the volume should now be marked FAILED; {@code false} if
* the failure is still within tolerance this round.
*/
public synchronized boolean recordTimeoutAsIOFailure() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

check is synchronized. So if timeout, check doesn't return, then this recordTimeoutAsIOFailure will wait for check to complete.

if (advanceIOWindow(false)) {
LOG.error("Volume {} check timed out: IO-failure count ({}) exceeds"
+ " tolerance ({}). Marking FAILED.",
this, currentIOFailureCount, ioFailureTolerance);
return true;
}
LOG.warn("Volume {} check timed out. IO-failure count: {} / tolerance: {}."
+ " Volume will not be failed until tolerance is exceeded."
+ " Common transient causes: kernel I/O scheduler saturation"
+ " or JVM GC pressure.",
this, currentIOFailureCount, ioFailureTolerance);
return false;
}

/**
 * Pushes one IO-test result into the sliding window and refreshes the
 * rolling failure counter.
 *
 * <p>Shared by {@link #check} (real IO-test outcome) and
 * {@link #recordTimeoutAsIOFailure} (synthetic failure for a check timeout)
 * so the window bookkeeping lives in exactly one place.
 *
 * @param passed {@code true} if the IO test passed; {@code false} otherwise.
 * @return {@code true} once {@code currentIOFailureCount} exceeds
 *     {@code ioFailureTolerance}; {@code false} while still within bounds.
 */
private boolean advanceIOWindow(boolean passed) {
  // The newest result enters the window; a failure bumps the counter.
  ioTestSlidingWindow.add(passed);
  if (!passed) {
    currentIOFailureCount.incrementAndGet();
  }
  // Once the window is over capacity, evict the oldest entry and, if that
  // entry was a failure, give its count back.
  if (ioTestSlidingWindow.size() > ioTestCount) {
    final Boolean evicted = ioTestSlidingWindow.poll();
    if (Boolean.FALSE.equals(evicted)) {
      currentIOFailureCount.decrementAndGet();
    }
  }
  return currentIOFailureCount.get() > ioFailureTolerance;
}

@Override
public int hashCode() {
return Objects.hash(storageDir);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
Expand Down Expand Up @@ -246,18 +247,40 @@ public Set<? extends StorageVolume> checkAllVolumes(

// Wait until our timeout elapses, after which we give up on
// the remaining volumes.
if (!latch.await(maxAllowedTimeForCheckMs, TimeUnit.MILLISECONDS)) {
LOG.warn("checkAllVolumes timed out after {} ms",
maxAllowedTimeForCheckMs);
}
boolean completedOnTime =
latch.await(maxAllowedTimeForCheckMs, TimeUnit.MILLISECONDS);

synchronized (this) {
// All volumes that have not been detected as healthy should be
// considered failed. This is a superset of 'failedVolumes'.
//
// Make a copy under the mutex as Sets.difference() returns a view
// of a potentially changing set.
return new HashSet<>(Sets.difference(allVolumes, healthyVolumes));
if (!completedOnTime) {
LOG.warn("checkAllVolumes timed out after {} ms."
+ " Evaluating per-volume latch-timeout tolerance.",
maxAllowedTimeForCheckMs);
}

// Volumes that explicitly reported FAILED via check() are always
// returned — the IO-failure sliding window in StorageVolume.check()
// already applied its own tolerance.
final Set<StorageVolume> result = new HashSet<>(failedVolumes);

// Volumes still pending (neither healthy nor explicitly failed):
// the latch expired before they reported a result. Record a synthetic
// IO failure in each volume's existing sliding window so latch timeouts
// share the same ioFailureTolerance threshold as genuine IO failures.
// Healthy volumes need no special action: their successful check() call
// already recorded TRUE in the sliding window.
final Set<StorageVolume> pendingVolumes =
new HashSet<>(Sets.difference(allVolumes,
Sets.union(healthyVolumes, failedVolumes)));

for (StorageVolume v : pendingVolumes) {
if (v.recordTimeoutAsIOFailure()) {
// Tolerance exceeded — mark as failed.
result.add(v);
}
// else: within tolerance this round — omit from failed set.
}

return result;
}
}

Expand Down Expand Up @@ -376,10 +399,22 @@ public void onFailure(@Nonnull Throwable t) {
volume, exception);
// If the scan was interrupted, do not count it as a volume failure.
// This should only happen if the volume checker is being shut down.
if (!(t instanceof InterruptedException)) {
markFailed();
cleanup();
if (t instanceof InterruptedException) {
return;
}
if (exception instanceof TimeoutException) {
Copy link
Contributor

@ChenSammi ChenSammi Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Devesh, can you add a unit test of real timeout case for both check all volume and check one volume? The exception thrown out from ListenableFuture is not direct TimeoutException in my test.

// Per-check timeout from ThrottledAsyncChecker: apply the same
// IO-failure tolerance as a failed read/write test, rather than
// failing the volume immediately on the first timeout.
if (!volume.recordTimeoutAsIOFailure()) {
// Within tolerance this round. Still call cleanup() so numVolumes
// decrements correctly and the latch/callback fires on time.
cleanup();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

one of the callback of checkVolume is

  volumeChecker.checkVolume(
        volume, (healthyVolumes, failedVolumes) -> {
          if (!failedVolumes.isEmpty()) {
            LOG.warn("checkVolumeAsync callback got {} failed volumes: {}",
                failedVolumes.size(), failedVolumes);
          } else {
            LOG.debug("checkVolumeAsync: no volume failures detected");
          }
          handleVolumeFailures(failedVolumes);
        })

We need to consider whether call the cleanup in this case.

return;
}
}
markFailed();
cleanup();
}

private void markHealthy() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.File;
import java.nio.file.Path;
Expand Down Expand Up @@ -341,6 +343,91 @@ public void testCorrectDirectoryChecked(StorageVolume.Builder<?> builder)
volume.check(false);
}

/**
 * With the default settings (ioTestCount=3, ioFailureTolerance=1), the
 * first simulated check timeout must be tolerated: it records one synthetic
 * IO failure in the sliding window (count=1, which is NOT &gt; tolerance=1),
 * so {@code recordTimeoutAsIOFailure()} returns false.
 */
@ParameterizedTest
@MethodSource("volumeBuilders")
public void testFirstTimeoutIsTolerated(StorageVolume.Builder<?> builder)
    throws Exception {
  final StorageVolume volume = builder.build();
  volume.format(CLUSTER_ID);
  volume.createTmpDirs(CLUSTER_ID);

  // One synthetic timeout: failure count reaches 1, still within tolerance.
  assertFalse(volume.recordTimeoutAsIOFailure(),
      "First timeout should be tolerated (IO failure count 1 is not > tolerance 1)");
}

/**
 * With the default settings (ioTestCount=3, ioFailureTolerance=1), the
 * second consecutive check timeout must cause
 * {@code recordTimeoutAsIOFailure()} to return true: count=2 which IS
 * &gt; tolerance=1.
 */
@ParameterizedTest
@MethodSource("volumeBuilders")
public void testSecondConsecutiveTimeoutFails(StorageVolume.Builder<?> builder)
    throws Exception {
  final StorageVolume volume = builder.build();
  volume.format(CLUSTER_ID);
  volume.createTmpDirs(CLUSTER_ID);

  // First timeout stays within tolerance; the second crosses it.
  assertFalse(volume.recordTimeoutAsIOFailure(),
      "First timeout should be tolerated");
  assertTrue(volume.recordTimeoutAsIOFailure(),
      "Second consecutive timeout should exceed tolerance and return true");
}

/**
 * After a simulated timeout, {@code ioTestCount} healthy check() calls
 * gradually evict the synthetic failure from the sliding window. Once
 * evicted, the IO failure count drops back to 0 and a new single timeout
 * is tolerated again — no separate reset API is required.
 *
 * <p>With the defaults (ioTestCount=3, ioFailureTolerance=1):
 * <ol>
 *   <li>1 timeout: window=[F], failures=1</li>
 *   <li>3 healthy checks push T, T, T — the 4th entry evicts F:
 *       window=[T,T,T], failures=0</li>
 *   <li>New timeout: window=[T,T,F] (evicts oldest T), failures=1
 *       → 1 is not &gt; 1 → tolerated again</li>
 * </ol>
 */
@ParameterizedTest
@MethodSource("volumeBuilders")
public void testHealthyChecksEvictTimeoutFromSlidingWindow(
    StorageVolume.Builder<?> builder) throws Exception {
  final StorageVolume volume = builder.build();
  volume.format(CLUSTER_ID);
  volume.createTmpDirs(CLUSTER_ID);

  // Simulate one tolerated timeout.
  assertFalse(volume.recordTimeoutAsIOFailure(),
      "First timeout should be tolerated");

  // Force every subsequent read/write probe to succeed so that healthy
  // check() calls push TRUE entries into the sliding window, eventually
  // evicting the synthetic FALSE.
  final DiskCheckUtil.DiskChecks passingDisk = new DiskCheckUtil.DiskChecks() {
    @Override
    public boolean checkReadWrite(File storageDir, File testFileDir,
        int numBytesToWrite) {
      return true;
    }
  };
  DiskCheckUtil.setTestImpl(passingDisk);
  for (int i = 0; i < 3; i++) {
    assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
  }

  // After recovery a new single timeout is tolerated again.
  assertFalse(volume.recordTimeoutAsIOFailure(),
      "Timeout after recovery should be tolerated again");
}

/**
* Asserts that the disk checks are being done on the correct directory for
* each volume type.
Expand Down