Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
public static final String FAILED_DB_VOLUMES_TOLERATED_KEY = "hdds.datanode.failed.db.volumes.tolerated";
public static final String DISK_CHECK_MIN_GAP_KEY = "hdds.datanode.disk.check.min.gap";
public static final String DISK_CHECK_TIMEOUT_KEY = "hdds.datanode.disk.check.timeout";

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please revert this change.

// Minimum space should be left on volume.
// Ex: If volume has 1000GB and minFreeSpace is configured as 10GB,
// In this case when availableSpace is 10GB or below, volume is assumed as full
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -708,23 +708,11 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused)
return VolumeCheckResult.HEALTHY;
}

// Move the sliding window of IO test results forward 1 by adding the
// latest entry and removing the oldest entry from the window.
// Update the failure counter for the new window.
ioTestSlidingWindow.add(diskChecksPassed);
if (!diskChecksPassed) {
currentIOFailureCount.incrementAndGet();
}
if (ioTestSlidingWindow.size() > ioTestCount &&
Objects.equals(ioTestSlidingWindow.poll(), Boolean.FALSE)) {
currentIOFailureCount.decrementAndGet();
}

// If the failure threshold has been crossed, fail the volume without
// further scans.
// Once the volume is failed, it will not be checked anymore.
// The failure counts can be left as is.
if (currentIOFailureCount.get() > ioFailureTolerance) {
// Move the sliding window of IO test results forward 1 and check threshold.
if (advanceIOWindow(diskChecksPassed)) {
// If the failure threshold has been crossed, fail the volume without
// further scans. Once the volume is failed, it will not be checked
// anymore. The failure counts can be left as is.
LOG.error("Failed IO test for volume {}: the last {} runs " +
"encountered {} out of {} tolerated failures.", this,
ioTestSlidingWindow.size(), currentIOFailureCount,
Expand All @@ -740,6 +728,65 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused)
return VolumeCheckResult.HEALTHY;
}

/**
* Called by {@link StorageVolumeChecker} when a volume check times out —
* either because the global {@code checkAllVolumes()} latch expired before
* this volume's async check completed, or because the per-check timeout
* inside {@link ThrottledAsyncChecker} fired.
*
* <p>Records a synthetic IO-test failure in the existing sliding window,
* making latch timeouts subject to the same {@code ioFailureTolerance}
* threshold as genuine read/write failures. No separate configuration key
* is required: the existing
* {@code hdds.datanode.disk.check.io.failures.tolerated} governs both.
*
* <p>Recovery is automatic: each successful {@link #check} call records a
* {@code true} entry in the window, gradually evicting the synthetic
* failure once {@code ioTestCount} healthy results have accumulated.
*
* @return {@code true} if {@code currentIOFailureCount > ioFailureTolerance},
* meaning the volume should now be marked FAILED; {@code false} if
* the failure is still within tolerance this round.
*/
public synchronized boolean recordTimeoutAsIOFailure() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

check is synchronized. So if timeout, check doesn't return, then this recordTimeoutAsIOFailure will wait for check to complete.

if (advanceIOWindow(false)) {
LOG.error("Volume {} check timed out: IO-failure count ({}) exceeds"
+ " tolerance ({}). Marking FAILED.",
this, currentIOFailureCount, ioFailureTolerance);
return true;
}
LOG.warn("Volume {} check timed out. IO-failure count: {} / tolerance: {}."
+ " Volume will not be failed until tolerance is exceeded."
+ " Common transient causes: kernel I/O scheduler saturation"
+ " or JVM GC pressure.",
this, currentIOFailureCount, ioFailureTolerance);
return false;
}

/**
 * Pushes one IO-test result into the sliding window and refreshes the
 * rolling failure counter.
 *
 * <p>Shared by {@link #check} (real IO-test outcome) and
 * {@link #recordTimeoutAsIOFailure} (synthetic failure for a check timeout)
 * so the window bookkeeping lives in exactly one place.
 *
 * @param passed {@code true} if the IO test passed; {@code false} otherwise.
 * @return {@code true} once {@code currentIOFailureCount} exceeds
 *     {@code ioFailureTolerance}; {@code false} while still within bounds.
 */
private boolean advanceIOWindow(boolean passed) {
  // The newest result enters the window; a failure bumps the counter.
  ioTestSlidingWindow.add(passed);
  if (!passed) {
    currentIOFailureCount.incrementAndGet();
  }
  // Once the window is over capacity, evict the oldest entry and, if that
  // entry was a failure, give its count back.
  if (ioTestSlidingWindow.size() > ioTestCount) {
    final Boolean evicted = ioTestSlidingWindow.poll();
    if (Boolean.FALSE.equals(evicted)) {
      currentIOFailureCount.decrementAndGet();
    }
  }
  return currentIOFailureCount.get() > ioFailureTolerance;
}

@Override
public int hashCode() {
return Objects.hash(storageDir);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
Expand Down Expand Up @@ -246,18 +247,40 @@ public Set<? extends StorageVolume> checkAllVolumes(

// Wait until our timeout elapses, after which we give up on
// the remaining volumes.
if (!latch.await(maxAllowedTimeForCheckMs, TimeUnit.MILLISECONDS)) {
LOG.warn("checkAllVolumes timed out after {} ms",
maxAllowedTimeForCheckMs);
}
boolean completedOnTime =
latch.await(maxAllowedTimeForCheckMs, TimeUnit.MILLISECONDS);

synchronized (this) {
// All volumes that have not been detected as healthy should be
// considered failed. This is a superset of 'failedVolumes'.
//
// Make a copy under the mutex as Sets.difference() returns a view
// of a potentially changing set.
return new HashSet<>(Sets.difference(allVolumes, healthyVolumes));
if (!completedOnTime) {
LOG.warn("checkAllVolumes timed out after {} ms."
+ " Evaluating per-volume latch-timeout tolerance.",
maxAllowedTimeForCheckMs);
}

// Volumes that explicitly reported FAILED via check() are always
// returned — the IO-failure sliding window in StorageVolume.check()
// already applied its own tolerance.
final Set<StorageVolume> result = new HashSet<>(failedVolumes);

// Volumes still pending (neither healthy nor explicitly failed):
// the latch expired before they reported a result. Record a synthetic
// IO failure in each volume's existing sliding window so latch timeouts
// share the same ioFailureTolerance threshold as genuine IO failures.
// Healthy volumes need no special action: their successful check() call
// already recorded TRUE in the sliding window.
final Set<StorageVolume> pendingVolumes =
new HashSet<>(Sets.difference(allVolumes,
Sets.union(healthyVolumes, failedVolumes)));

for (StorageVolume v : pendingVolumes) {
if (v.recordTimeoutAsIOFailure()) {
// Tolerance exceeded — mark as failed.
result.add(v);
}
// else: within tolerance this round — omit from failed set.
}

return result;
}
}

Expand Down Expand Up @@ -376,10 +399,22 @@ public void onFailure(@Nonnull Throwable t) {
volume, exception);
// If the scan was interrupted, do not count it as a volume failure.
// This should only happen if the volume checker is being shut down.
if (!(t instanceof InterruptedException)) {
markFailed();
cleanup();
if (t instanceof InterruptedException) {
return;
}
if (exception instanceof TimeoutException) {
Copy link
Contributor

@ChenSammi ChenSammi Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Devesh, can you add a unit test of real timeout case for both check all volume and check one volume? The exception thrown out from ListenableFuture is not direct TimeoutException in my test.

// Per-check timeout from ThrottledAsyncChecker: apply the same
// IO-failure tolerance as a failed read/write test, rather than
// failing the volume immediately on the first timeout.
if (!volume.recordTimeoutAsIOFailure()) {
// Within tolerance this round. Still call cleanup() so numVolumes
// decrements correctly and the latch/callback fires on time.
cleanup();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

one of the callback of checkVolume is

  volumeChecker.checkVolume(
        volume, (healthyVolumes, failedVolumes) -> {
          if (!failedVolumes.isEmpty()) {
            LOG.warn("checkVolumeAsync callback got {} failed volumes: {}",
                failedVolumes.size(), failedVolumes);
          } else {
            LOG.debug("checkVolumeAsync: no volume failures detected");
          }
          handleVolumeFailures(failedVolumes);
        })

We need to consider whether call the cleanup in this case.

return;
}
}
markFailed();
cleanup();
}

private void markHealthy() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.File;
import java.nio.file.Path;
Expand Down Expand Up @@ -341,6 +343,91 @@ public void testCorrectDirectoryChecked(StorageVolume.Builder<?> builder)
volume.check(false);
}

/**
 * With the default settings (ioTestCount=3, ioFailureTolerance=1), the
 * first simulated check timeout must be tolerated: it records one synthetic
 * IO failure in the sliding window (count=1, which is NOT &gt; tolerance=1),
 * so {@code recordTimeoutAsIOFailure()} returns false.
 */
@ParameterizedTest
@MethodSource("volumeBuilders")
public void testFirstTimeoutIsTolerated(StorageVolume.Builder<?> builder)
    throws Exception {
  final StorageVolume volume = builder.build();
  volume.format(CLUSTER_ID);
  volume.createTmpDirs(CLUSTER_ID);

  // One synthetic timeout: failure count reaches 1, still within tolerance.
  assertFalse(volume.recordTimeoutAsIOFailure(),
      "First timeout should be tolerated (IO failure count 1 is not > tolerance 1)");
}

/**
 * With the default settings (ioTestCount=3, ioFailureTolerance=1), the
 * second consecutive check timeout must cause
 * {@code recordTimeoutAsIOFailure()} to return true: count=2 which IS
 * &gt; tolerance=1.
 */
@ParameterizedTest
@MethodSource("volumeBuilders")
public void testSecondConsecutiveTimeoutFails(StorageVolume.Builder<?> builder)
    throws Exception {
  final StorageVolume volume = builder.build();
  volume.format(CLUSTER_ID);
  volume.createTmpDirs(CLUSTER_ID);

  // First timeout stays within tolerance; the second crosses it.
  assertFalse(volume.recordTimeoutAsIOFailure(),
      "First timeout should be tolerated");
  assertTrue(volume.recordTimeoutAsIOFailure(),
      "Second consecutive timeout should exceed tolerance and return true");
}

/**
 * After a simulated timeout, {@code ioTestCount} healthy check() calls
 * gradually evict the synthetic failure from the sliding window. Once
 * evicted, the IO failure count drops back to 0 and a new single timeout
 * is tolerated again — no separate reset API is required.
 *
 * <p>With the defaults (ioTestCount=3, ioFailureTolerance=1):
 * <ol>
 *   <li>1 timeout: window=[F], failures=1</li>
 *   <li>3 healthy checks push T, T, T — the 4th entry evicts F:
 *       window=[T,T,T], failures=0</li>
 *   <li>New timeout: window=[T,T,F] (evicts oldest T), failures=1
 *       → 1 is not &gt; 1 → tolerated again</li>
 * </ol>
 */
@ParameterizedTest
@MethodSource("volumeBuilders")
public void testHealthyChecksEvictTimeoutFromSlidingWindow(
    StorageVolume.Builder<?> builder) throws Exception {
  final StorageVolume volume = builder.build();
  volume.format(CLUSTER_ID);
  volume.createTmpDirs(CLUSTER_ID);

  // Simulate one tolerated timeout.
  assertFalse(volume.recordTimeoutAsIOFailure(),
      "First timeout should be tolerated");

  // Force every subsequent read/write probe to succeed so that healthy
  // check() calls push TRUE entries into the sliding window, eventually
  // evicting the synthetic FALSE.
  final DiskCheckUtil.DiskChecks passingDisk = new DiskCheckUtil.DiskChecks() {
    @Override
    public boolean checkReadWrite(File storageDir, File testFileDir,
        int numBytesToWrite) {
      return true;
    }
  };
  DiskCheckUtil.setTestImpl(passingDisk);
  for (int i = 0; i < 3; i++) {
    assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
  }

  // After recovery a new single timeout is tolerated again.
  assertFalse(volume.recordTimeoutAsIOFailure(),
      "Timeout after recovery should be tolerated again");
}

/**
* Asserts that the disk checks are being done on the correct directory for
* each volume type.
Expand Down