From ec0939f934963108d24f75e8b0f4ea47fb0d47ff Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Wed, 18 Mar 2026 19:58:41 -0700 Subject: [PATCH 1/2] HDDS-14862. Log volume failures and database errors as warnings/errors --- .../hadoop/ozone/container/common/volume/HddsVolume.java | 4 ++-- .../ozone/container/common/volume/MutableVolumeSet.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java index f331db7defc3..e2563fc24bb8 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java @@ -632,7 +632,7 @@ private void closeDbStore() { DatanodeStoreCache.getInstance().removeDB(containerDBPath); dbLoaded.set(false); dbLoadFailure.set(false); - LOG.info("SchemaV3 db is stopped at {} for volume {}", containerDBPath, + LOG.warn("SchemaV3 db is stopped at {} for volume {}", containerDBPath, getStorageID()); } @@ -648,7 +648,7 @@ public void compactDb() { volumeInfoMetrics.dbCompactTimesNanoSecondsIncr( Time.monotonicNowNanos() - start); } catch (Exception e) { - LOG.warn("compact rocksdb error in {}", dbFilePath, e); + LOG.error("compact rocksdb error in {}", dbFilePath, e); } } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java index 9ce69fa14bdf..e8dc96889469 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java @@ -337,9 +337,9 @@ public void failVolume(String volumeRoot) { failedVolumeMap.put(volumeRoot, volume); volumeHealthMetrics.decrementHealthyVolumes(); volumeHealthMetrics.incrementFailedVolumes(); - LOG.info("Moving Volume : {} to failed Volumes", volumeRoot); + LOG.error("Moving Volume : {} to failed Volumes", volumeRoot); } else if (failedVolumeMap.containsKey(volumeRoot)) { - LOG.info("Volume : {} is not active", volumeRoot); + LOG.warn("Volume : {} is not active", volumeRoot); } else { LOG.warn("Volume : {} does not exist in VolumeSet", volumeRoot); } From 991bfb9944907c0cb9e3266c9ea78dac86be02d0 Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Tue, 24 Mar 2026 13:12:56 -0700 Subject: [PATCH 2/2] HDDS-14862. Log volume failures as errors and adjust log levels --- .../org/apache/hadoop/ozone/HddsDatanodeService.java | 1 + .../common/states/endpoint/VersionEndpointTask.java | 1 + .../hadoop/ozone/container/common/volume/HddsVolume.java | 9 ++++----- .../ozone/container/common/volume/MutableVolumeSet.java | 9 ++++----- .../ozone/container/common/volume/StorageVolume.java | 2 ++ .../ozone/container/ozoneimpl/ContainerReader.java | 4 ++-- .../upgrade/ScmHAFinalizeUpgradeActionDatanode.java | 1 + 7 files changed, 15 insertions(+), 12 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java index 9b0d87479331..c49978ce6f27 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java @@ -412,6 +412,7 @@ private void startRatisForTest() throws IOException { HddsVolume hddsVolume = (HddsVolume) storageVolume; boolean result = StorageVolumeUtil.checkVolume(hddsVolume, clusterId, clusterId, conf, LOG, null); if (!result) { + LOG.error("Marking volume {} as failed", hddsVolume.getStorageDir().getPath()); volumeSet.failVolume(hddsVolume.getHddsRootDir().getPath()); } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/VersionEndpointTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/VersionEndpointTask.java index b9326c07c5b1..40844c563e9a 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/VersionEndpointTask.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/VersionEndpointTask.java @@ -120,6 +120,7 @@ private void checkVolumeSet(MutableVolumeSet volumeSet, scmId, clusterId, configuration, LOG, ozoneContainer.getDbVolumeSet()); if (!result) { + LOG.error("Marking volume {} as failed", volume.getStorageDir().getPath()); volumeSet.failVolume(volume.getStorageDir().getPath()); } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java index e2563fc24bb8..a4f1afa08ea8 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java @@ -299,7 +299,7 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused) VolumeCheckResult result = super.check(unused); if (isDbLoadFailure()) { - LOG.warn("Volume {} failed to access RocksDB: RocksDB parent directory is null, " + + LOG.error("Volume {} failed to access RocksDB: RocksDB parent directory is null, " + "the volume might not have been loaded properly.", getStorageDir()); return VolumeCheckResult.FAILED; } @@ -312,8 +312,7 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused) // Check that per-volume RocksDB is present. File dbFile = new File(dbParentDir, CONTAINER_DB_NAME); if (!dbFile.exists() || !dbFile.canRead()) { - LOG.warn("Volume {} failed health check. Could not access RocksDB at " + - "{}", getStorageDir(), dbFile); + LOG.error("Volume {} failed health check. Could not access RocksDB at {}", getStorageDir(), dbFile); return VolumeCheckResult.FAILED; } @@ -632,7 +631,7 @@ private void closeDbStore() { DatanodeStoreCache.getInstance().removeDB(containerDBPath); dbLoaded.set(false); dbLoadFailure.set(false); - LOG.warn("SchemaV3 db is stopped at {} for volume {}", containerDBPath, + LOG.info("SchemaV3 db is stopped at {} for volume {}", containerDBPath, getStorageID()); } @@ -648,7 +647,7 @@ public void compactDb() { volumeInfoMetrics.dbCompactTimesNanoSecondsIncr( Time.monotonicNowNanos() - start); } catch (Exception e) { - LOG.error("compact rocksdb error in {}", dbFilePath, e); + LOG.warn("compact rocksdb error in {}", dbFilePath, e); } } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java index e8dc96889469..a79a06b6541f 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java @@ -223,8 +223,7 @@ public void checkAllVolumes(StorageVolumeChecker checker) } if (!failedVolumes.isEmpty()) { - LOG.warn("checkAllVolumes got {} failed volumes - {}", - failedVolumes.size(), failedVolumes); + LOG.error("checkAllVolumes got {} failed volumes - {}", failedVolumes.size(), failedVolumes); handleVolumeFailures(failedVolumes); } else { LOG.debug("checkAllVolumes encountered no failures"); @@ -242,6 +241,7 @@ private void handleVolumeFailures( for (StorageVolume v : failedVolumes) { // Immediately mark the volume as failed so it is unavailable // for new containers. + LOG.error("Marking volume {} as failed", v.getStorageDir().getPath()); failVolume(v.getStorageDir().getPath()); } @@ -337,11 +337,10 @@ public void failVolume(String volumeRoot) { failedVolumeMap.put(volumeRoot, volume); volumeHealthMetrics.decrementHealthyVolumes(); volumeHealthMetrics.incrementFailedVolumes(); - LOG.error("Moving Volume : {} to failed Volumes", volumeRoot); } else if (failedVolumeMap.containsKey(volumeRoot)) { - LOG.warn("Volume : {} is not active", volumeRoot); + LOG.warn("Unable to fail the volume: {} as it is inactive", volumeRoot); } else { - LOG.warn("Volume : {} does not exist in VolumeSet", volumeRoot); + LOG.warn("Unable to fail the volume: {} as it does not exist in the VolumeSet", volumeRoot); } } finally { this.writeUnlock(); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java index 5260f8468930..68be2ae227ae 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java @@ -605,6 +605,7 @@ public DatanodeConfiguration getDatanodeConfig() { } public void failVolume() { + LOG.warn("Volume {} failed", this); setState(VolumeState.FAILED); if (volumeUsage != null) { volumeUsage.shutdown(); @@ -685,6 +686,7 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused) throw new InterruptedException("Directory check of volume " + this + " interrupted."); } + LOG.error("Directory check of volume {} failed", this); return VolumeCheckResult.FAILED; } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerReader.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerReader.java index a89a4958aa7a..43aa05c850c5 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerReader.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerReader.java @@ -96,8 +96,8 @@ public void run() { try { readVolume(hddsVolumeDir); } catch (Throwable t) { - LOG.error("Caught an exception during reading container files" + - " from Volume {} {}", hddsVolumeDir, t); + LOG.error("Could not read container files from the volume {}. " + + "Marking the volume as failed", hddsVolumeDir, t); volumeSet.failVolume(hddsVolumeDir.getPath()); } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/upgrade/ScmHAFinalizeUpgradeActionDatanode.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/upgrade/ScmHAFinalizeUpgradeActionDatanode.java index 74ccdfb87986..37d3b241f0d3 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/upgrade/ScmHAFinalizeUpgradeActionDatanode.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/upgrade/ScmHAFinalizeUpgradeActionDatanode.java @@ -54,6 +54,7 @@ public void execute(DatanodeStateMachine dsm) throws Exception { if (volume instanceof HddsVolume) { HddsVolume hddsVolume = (HddsVolume) volume; if (!upgradeVolume(hddsVolume, hddsVolume.getClusterID())) { + LOG.error("Marking volume {} as failed", volume.getStorageDir().getAbsolutePath()); volumeSet.failVolume(volume.getStorageDir().getAbsolutePath()); } }