From 5d66c942d27c86c26f23f915aa29bc771db5b17a Mon Sep 17 00:00:00 2001 From: terrytlu Date: Fri, 25 Apr 2025 18:46:39 +0800 Subject: [PATCH 1/9] HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. --- .../TableSnapshotInputFormatImpl.java | 20 +++-- .../hadoop/hbase/snapshot/SnapshotInfo.java | 39 +++++++++- .../SnapshotRegionSizeCalculator.java | 62 +++++++++++++++ .../TestSnapshotRegionSizeCalculator.java | 75 +++++++++++++++++++ 4 files changed, 189 insertions(+), 7 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java index 501209f1c902..cfb2a1c2d755 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java @@ -47,6 +47,7 @@ import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper; import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; import org.apache.hadoop.hbase.snapshot.SnapshotManifest; +import org.apache.hadoop.hbase.snapshot.SnapshotRegionSizeCalculator; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.RegionSplitter; @@ -145,13 +146,19 @@ public static class InputSplit implements Writable { private String[] locations; private String scan; private String restoreDir; + private long length; // constructor for mapreduce framework / Writable public InputSplit() { } public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, - Path restoreDir) { + Path restoreDir){ + this(htd, regionInfo, locations, scan, restoreDir, 1); + } + + public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, + Path restoreDir, long length) { this.htd = htd; this.regionInfo = regionInfo; if (locations == null || locations.isEmpty()) { @@ -166,6 +173,7 @@ public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locat } this.restoreDir = restoreDir.toString(); + this.length = length; } public TableDescriptor getHtd() { @@ -181,8 +189,7 @@ public String getRestoreDir() { } public long getLength() { - // TODO: We can obtain the file sizes of the snapshot here. - return 0; + return length; } public String[] getLocations() { @@ -440,6 +447,8 @@ public static List getSplits(Scan scan, SnapshotManifest manifest, } } + SnapshotRegionSizeCalculator snapshotRegionSizeCalculator = new SnapshotRegionSizeCalculator( + conf, manifest); List splits = new ArrayList<>(); for (RegionInfo hri : regionManifests) { // load region descriptor @@ -456,6 +465,7 @@ public static List getSplits(Scan scan, SnapshotManifest manifest, } } + long snapshotRegionSize = snapshotRegionSizeCalculator.getRegionSize(hri.getEncodedName()); if (numSplits > 1) { byte[][] sp = sa.split(hri.getStartKey(), hri.getEndKey(), numSplits, true); for (int i = 0; i < sp.length - 1; i++) { @@ -478,7 +488,7 @@ public static List getSplits(Scan scan, SnapshotManifest manifest, Bytes.compareTo(scan.getStopRow(), sp[i + 1]) < 0 ? scan.getStopRow() : sp[i + 1]); } - splits.add(new InputSplit(htd, hri, hosts, boundedScan, restoreDir)); + splits.add(new InputSplit(htd, hri, hosts, boundedScan, restoreDir,snapshotRegionSize / numSplits)); } } } else { @@ -487,7 +497,7 @@ public static List getSplits(Scan scan, SnapshotManifest manifest, hri.getEndKey()) ) { - splits.add(new InputSplit(htd, hri, hosts, scan, restoreDir)); + splits.add(new InputSplit(htd, hri, hosts, scan, restoreDir, snapshotRegionSize)); } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java index 151319e165a2..680bedd6e30c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java @@ -40,8 +40,10 @@ import org.apache.hadoop.hbase.client.SnapshotDescription; import org.apache.hadoop.hbase.io.HFileLink; import org.apache.hadoop.hbase.io.WALLink; +import org.apache.hadoop.hbase.regionserver.StoreFileInfo; import org.apache.hadoop.hbase.util.AbstractHBaseTool; import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.Strings; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -51,7 +53,6 @@ import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLineParser; import org.apache.hbase.thirdparty.org.apache.commons.cli.DefaultParser; import org.apache.hbase.thirdparty.org.apache.commons.cli.Option; -import org.apache.hbase.thirdparty.org.apache.commons.cli.Options; import org.apache.hbase.thirdparty.org.apache.commons.cli.ParseException; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; @@ -151,6 +152,7 @@ String getStateToString() { private AtomicInteger logsCount = new AtomicInteger(); private AtomicLong hfilesArchiveSize = new AtomicLong(); private AtomicLong hfilesSize = new AtomicLong(); + private Map regionSizeMap = new ConcurrentHashMap<>(); private AtomicLong hfilesMobSize = new AtomicLong(); private AtomicLong nonSharedHfilesArchiveSize = new AtomicLong(); private AtomicLong logSize = new AtomicLong(); @@ -176,6 +178,23 @@ String getStateToString() { this.fs = fs; } + SnapshotStats(final Configuration conf, final FileSystem fs, + final SnapshotManifest mainfest) throws CorruptedSnapshotException { + this.snapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, mainfest.getSnapshotDir());; + this.snapshotTable = mainfest.getTableDescriptor().getTableName(); + this.conf = conf; + this.fs = fs; + } + + /** + * Returns the map containing region sizes. + * + * @return A map where keys are region names and values are their corresponding sizes. + */ + public Map getRegionSizeMap() { + return regionSizeMap; + } + /** Returns the snapshot descriptor */ public SnapshotDescription getSnapshotDescription() { return ProtobufUtil.createSnapshotDesc(this.snapshot); @@ -347,6 +366,13 @@ FileInfo addStoreFile(final RegionInfo region, final String family, return new FileInfo(inArchive, size, isCorrupted); } + void updateRegionSizeMap(final RegionInfo region, + final SnapshotRegionManifest.StoreFile storeFile){ + long currentSize = regionSizeMap.getOrDefault(region.getEncodedName(), 0L); + regionSizeMap.put(region.getEncodedName(), currentSize + storeFile.getFileSize()); + } + + /** * Add the specified log file to the stats * @param server server name @@ -604,7 +630,15 @@ public static SnapshotStats getSnapshotStats(final Configuration conf, FileSystem fs = FileSystem.get(rootDir.toUri(), conf); Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotDesc, rootDir); SnapshotManifest manifest = SnapshotManifest.open(conf, fs, snapshotDir, snapshotDesc); - final SnapshotStats stats = new SnapshotStats(conf, fs, snapshotDesc); + return getSnapshotStats(conf, manifest, filesMap); + } + + public static SnapshotStats getSnapshotStats(final Configuration conf, + final SnapshotManifest manifest, final Map filesMap) + throws IOException { + Path rootDir = CommonFSUtils.getRootDir(conf); + FileSystem fs = FileSystem.get(rootDir.toUri(), conf); + final SnapshotStats stats = new SnapshotStats(conf, fs, manifest); SnapshotReferenceUtil.concurrentVisitReferencedFiles(conf, fs, manifest, "SnapshotsStatsAggregation", new SnapshotReferenceUtil.SnapshotVisitor() { @Override @@ -613,6 +647,7 @@ public void storeFile(final RegionInfo regionInfo, final String family, if (!storeFile.hasReference()) { stats.addStoreFile(regionInfo, family, storeFile, filesMap); } + stats.updateRegionSizeMap(regionInfo, storeFile); } }); return stats; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java new file mode 100644 index 000000000000..956149ed6598 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java @@ -0,0 +1,62 @@ +package org.apache.hadoop.hbase.snapshot; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.io.HFileLink; +import org.apache.hadoop.hbase.regionserver.StoreFileInfo; +import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription; +import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotRegionManifest; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.hadoop.hbase.snapshot.SnapshotInfo.SnapshotStats; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Utility class to calculate the size of each region in a snapshot. + */ +public class SnapshotRegionSizeCalculator { + private static final Logger LOG = LoggerFactory.getLogger(SnapshotRegionSizeCalculator.class); + private final SnapshotManifest manifest; + private final Configuration conf; + private final Map regionSizes; + + + public SnapshotRegionSizeCalculator(Configuration conf, SnapshotManifest manifest) + throws IOException { + this.conf = conf; + this.manifest = manifest; + this.regionSizes = calculateRegionSizes(); + } + + /** + * Calculate the size of each region in the snapshot. + * @return A map of region encoded names to their total size in bytes. + */ + public Map calculateRegionSizes() throws IOException { + SnapshotStats stats = SnapshotInfo.getSnapshotStats(conf, manifest, null); + return stats.getRegionSizeMap(); + } + + + + public long getRegionSize(String encodedRegionName) { + Long size = regionSizes.get(encodedRegionName); + if (size == null) { + LOG.debug("Unknown region:" + encodedRegionName); + return 0; + } else { + return size; + } + } + +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java new file mode 100644 index 000000000000..0e58c9404b14 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java @@ -0,0 +1,75 @@ +package org.apache.hadoop.hbase.snapshot; + +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos; +import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(SmallTests.class) +public class TestSnapshotRegionSizeCalculator { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestSnapshotRegionSizeCalculator.class); + + private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); + private static FileSystem fs; + private static Path rootDir; + private static Path snapshotDir; + private static SnapshotProtos.SnapshotDescription snapshotDesc; + private static SnapshotManifest manifest; + + @BeforeClass + public static void setUp() throws Exception { + Configuration conf = TEST_UTIL.getConfiguration(); + fs = TEST_UTIL.getTestFileSystem(); + rootDir = TEST_UTIL.getDataTestDir("TestSnapshotRegionSizeCalculator"); + CommonFSUtils.setRootDir(conf, rootDir); + + // Create a mock snapshot with a region and store files + SnapshotTestingUtils.SnapshotMock snapshotMock = + new SnapshotTestingUtils.SnapshotMock(conf, fs, rootDir); + SnapshotTestingUtils.SnapshotMock.SnapshotBuilder builder = + snapshotMock.createSnapshotV2("snapshot", "testTable", 4); + builder.addRegion(); + builder.addRegion(); + builder.addRegion(); + builder.addRegion(); + snapshotDir = builder.commit(); + snapshotDesc = builder.getSnapshotDescription(); + manifest = SnapshotManifest.open(conf, fs, snapshotDir, snapshotDesc); + + } + + @AfterClass + public static void tearDown() throws Exception { + fs.delete(rootDir, true); + } + + @Test + public void testCalculateRegionSizes() throws IOException { + SnapshotRegionSizeCalculator calculator = + new SnapshotRegionSizeCalculator(TEST_UTIL.getConfiguration(), manifest); + Map regionSizes = calculator.calculateRegionSizes(); + + // Verify that the region sizes are calculated correctly + assertTrue("No regions found in the snapshot", !regionSizes.isEmpty()); + for (Map.Entry entry : regionSizes.entrySet()) { + assertTrue("Region size should be non-negative", entry.getValue() == 0); + } + } +} From 5154ecfff593052fb86eb9255806ace37a230451 Mon Sep 17 00:00:00 2001 From: terrytlu Date: Wed, 7 May 2025 16:20:19 +0800 Subject: [PATCH 2/9] HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. --- .../TableSnapshotInputFormatImpl.java | 9 +++-- .../hadoop/hbase/snapshot/SnapshotInfo.java | 16 +++----- .../SnapshotRegionSizeCalculator.java | 38 +++++++++---------- .../TestSnapshotRegionSizeCalculator.java | 25 ++++++++++-- 4 files changed, 51 insertions(+), 37 deletions(-) diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java index cfb2a1c2d755..1e653222caee 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java @@ -153,7 +153,7 @@ public InputSplit() { } public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, - Path restoreDir){ + Path restoreDir) { this(htd, regionInfo, locations, scan, restoreDir, 1); } @@ -447,8 +447,8 @@ public static List getSplits(Scan scan, SnapshotManifest manifest, } } - SnapshotRegionSizeCalculator snapshotRegionSizeCalculator = new SnapshotRegionSizeCalculator( - conf, manifest); + SnapshotRegionSizeCalculator snapshotRegionSizeCalculator = + new SnapshotRegionSizeCalculator(conf, manifest); List splits = new ArrayList<>(); for (RegionInfo hri : regionManifests) { // load region descriptor @@ -488,7 +488,8 @@ public static List getSplits(Scan scan, SnapshotManifest manifest, Bytes.compareTo(scan.getStopRow(), sp[i + 1]) < 0 ? scan.getStopRow() : sp[i + 1]); } - splits.add(new InputSplit(htd, hri, hosts, boundedScan, restoreDir,snapshotRegionSize / numSplits)); + splits.add(new InputSplit(htd, hri, hosts, boundedScan, restoreDir, + snapshotRegionSize / numSplits)); } } } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java index 680bedd6e30c..c696d1ca2dd6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java @@ -40,10 +40,8 @@ import org.apache.hadoop.hbase.client.SnapshotDescription; import org.apache.hadoop.hbase.io.HFileLink; import org.apache.hadoop.hbase.io.WALLink; -import org.apache.hadoop.hbase.regionserver.StoreFileInfo; import org.apache.hadoop.hbase.util.AbstractHBaseTool; import org.apache.hadoop.hbase.util.CommonFSUtils; -import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.Strings; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -178,9 +176,10 @@ String getStateToString() { this.fs = fs; } - SnapshotStats(final Configuration conf, final FileSystem fs, - final SnapshotManifest mainfest) throws CorruptedSnapshotException { - this.snapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, mainfest.getSnapshotDir());; + SnapshotStats(final Configuration conf, final FileSystem fs, final SnapshotManifest mainfest) + throws CorruptedSnapshotException { + this.snapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, mainfest.getSnapshotDir()); + ; this.snapshotTable = mainfest.getTableDescriptor().getTableName(); this.conf = conf; this.fs = fs; @@ -188,7 +187,6 @@ String getStateToString() { /** * Returns the map containing region sizes. - * * @return A map where keys are region names and values are their corresponding sizes. */ public Map getRegionSizeMap() { @@ -367,12 +365,11 @@ FileInfo addStoreFile(final RegionInfo region, final String family, } void updateRegionSizeMap(final RegionInfo region, - final SnapshotRegionManifest.StoreFile storeFile){ + final SnapshotRegionManifest.StoreFile storeFile) { long currentSize = regionSizeMap.getOrDefault(region.getEncodedName(), 0L); regionSizeMap.put(region.getEncodedName(), currentSize + storeFile.getFileSize()); } - /** * Add the specified log file to the stats * @param server server name @@ -634,8 +631,7 @@ public static SnapshotStats getSnapshotStats(final Configuration conf, } public static SnapshotStats getSnapshotStats(final Configuration conf, - final SnapshotManifest manifest, final Map filesMap) - throws IOException { + final SnapshotManifest manifest, final Map filesMap) throws IOException { Path rootDir = CommonFSUtils.getRootDir(conf); FileSystem fs = FileSystem.get(rootDir.toUri(), conf); final SnapshotStats stats = new SnapshotStats(conf, fs, manifest); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java index 956149ed6598..fb6f93e2f247 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java @@ -1,22 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.hadoop.hbase.snapshot; import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; import java.util.Map; - import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.client.RegionInfo; -import org.apache.hadoop.hbase.io.HFileLink; -import org.apache.hadoop.hbase.regionserver.StoreFileInfo; -import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription; -import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotRegionManifest; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.snapshot.SnapshotInfo.SnapshotStats; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,7 +33,6 @@ public class SnapshotRegionSizeCalculator { private final Configuration conf; private final Map regionSizes; - public SnapshotRegionSizeCalculator(Configuration conf, SnapshotManifest manifest) throws IOException { this.conf = conf; @@ -43,13 +45,11 @@ public SnapshotRegionSizeCalculator(Configuration conf, SnapshotManifest manifes * @return A map of region encoded names to their total size in bytes. */ public Map calculateRegionSizes() throws IOException { - SnapshotStats stats = SnapshotInfo.getSnapshotStats(conf, manifest, null); + SnapshotStats stats = SnapshotInfo.getSnapshotStats(conf, manifest, null); return stats.getRegionSizeMap(); } - - - public long getRegionSize(String encodedRegionName) { + public long getRegionSize(String encodedRegionName) { Long size = regionSizes.get(encodedRegionName); if (size == null) { LOG.debug("Unknown region:" + encodedRegionName); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java index 0e58c9404b14..1454763eed4c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java @@ -1,16 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.hadoop.hbase.snapshot; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.util.Map; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; -import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos; import org.apache.hadoop.hbase.testclassification.SmallTests; import org.apache.hadoop.hbase.util.CommonFSUtils; import org.junit.AfterClass; @@ -19,6 +34,8 @@ import org.junit.Test; import org.junit.experimental.categories.Category; +import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos; + @Category(SmallTests.class) public class TestSnapshotRegionSizeCalculator { @@ -67,9 +84,9 @@ public void testCalculateRegionSizes() throws IOException { Map regionSizes = calculator.calculateRegionSizes(); // Verify that the region sizes are calculated correctly - assertTrue("No regions found in the snapshot", !regionSizes.isEmpty()); + assertTrue("No regions found in the snapshot", regionSizes.size() == 4); for (Map.Entry entry : regionSizes.entrySet()) { - assertTrue("Region size should be non-negative", entry.getValue() == 0); + assertTrue("Region size should be non-negative", entry.getValue() > 0); } } } From 2d55b760c1cf4322a0ab196dfdc0fbf7072a4b62 Mon Sep 17 00:00:00 2001 From: terrytlu Date: Fri, 23 May 2025 15:31:23 +0800 Subject: [PATCH 3/9] HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. --- .../hbase/mapred/TableSnapshotInputFormat.java | 9 --------- .../mapreduce/TableSnapshotInputFormat.java | 7 ------- .../TableSnapshotInputFormatImpl.java | 5 ----- .../hadoop/hbase/snapshot/SnapshotInfo.java | 18 +----------------- .../snapshot/SnapshotRegionSizeCalculator.java | 15 ++++++++++++++- 5 files changed, 15 insertions(+), 39 deletions(-) diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java index 34d2e200d967..f351464d31e2 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java @@ -22,10 +22,7 @@ import java.io.IOException; import java.util.List; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.Scan; -import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormatImpl; import org.apache.hadoop.hbase.util.RegionSplitter; @@ -56,12 +53,6 @@ public TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit delegate this.delegate = delegate; } - public TableSnapshotRegionSplit(TableDescriptor htd, RegionInfo regionInfo, - List locations, Scan scan, Path restoreDir) { - this.delegate = - new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, restoreDir); - } - @Override public long getLength() throws IOException { return delegate.getLength(); diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java index c71a42aea5d1..e72aac092d44 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java @@ -26,7 +26,6 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Scan; -import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.client.metrics.ScanMetrics; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.util.RegionSplitter; @@ -94,12 +93,6 @@ public TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit delegate this.delegate = delegate; } - public TableSnapshotRegionSplit(TableDescriptor htd, RegionInfo regionInfo, - List locations, Scan scan, Path restoreDir) { - this.delegate = - new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, restoreDir); - } - @Override public long getLength() throws IOException, InterruptedException { return delegate.getLength(); diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java index 1e653222caee..179adf1eecd7 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java @@ -152,11 +152,6 @@ public static class InputSplit implements Writable { public InputSplit() { } - public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, - Path restoreDir) { - this(htd, regionInfo, locations, scan, restoreDir, 1); - } - public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, Path restoreDir, long length) { this.htd = htd; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java index c696d1ca2dd6..f982b8652224 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotInfo.java @@ -176,15 +176,6 @@ String getStateToString() { this.fs = fs; } - SnapshotStats(final Configuration conf, final FileSystem fs, final SnapshotManifest mainfest) - throws CorruptedSnapshotException { - this.snapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, mainfest.getSnapshotDir()); - ; - this.snapshotTable = mainfest.getTableDescriptor().getTableName(); - this.conf = conf; - this.fs = fs; - } - /** * Returns the map containing region sizes. * @return A map where keys are region names and values are their corresponding sizes. @@ -627,14 +618,7 @@ public static SnapshotStats getSnapshotStats(final Configuration conf, FileSystem fs = FileSystem.get(rootDir.toUri(), conf); Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotDesc, rootDir); SnapshotManifest manifest = SnapshotManifest.open(conf, fs, snapshotDir, snapshotDesc); - return getSnapshotStats(conf, manifest, filesMap); - } - - public static SnapshotStats getSnapshotStats(final Configuration conf, - final SnapshotManifest manifest, final Map filesMap) throws IOException { - Path rootDir = CommonFSUtils.getRootDir(conf); - FileSystem fs = FileSystem.get(rootDir.toUri(), conf); - final SnapshotStats stats = new SnapshotStats(conf, fs, manifest); + final SnapshotStats stats = new SnapshotStats(conf, fs, snapshotDesc); SnapshotReferenceUtil.concurrentVisitReferencedFiles(conf, fs, manifest, "SnapshotsStatsAggregation", new SnapshotReferenceUtil.SnapshotVisitor() { @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java index fb6f93e2f247..137bccc9779c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java @@ -20,24 +20,35 @@ import java.io.IOException; import java.util.Map; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.snapshot.SnapshotInfo.SnapshotStats; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos; + /** * Utility class to calculate the size of each region in a snapshot. */ +@InterfaceAudience.Private public class SnapshotRegionSizeCalculator { private static final Logger LOG = LoggerFactory.getLogger(SnapshotRegionSizeCalculator.class); private final SnapshotManifest manifest; private final Configuration conf; private final Map regionSizes; + private final FileSystem fs; + public SnapshotRegionSizeCalculator(Configuration conf, SnapshotManifest manifest) throws IOException { this.conf = conf; this.manifest = manifest; this.regionSizes = calculateRegionSizes(); + Path rootDir = CommonFSUtils.getRootDir(conf); + fs = FileSystem.get(rootDir.toUri(), conf); } /** @@ -45,7 +56,9 @@ public SnapshotRegionSizeCalculator(Configuration conf, SnapshotManifest manifes * @return A map of region encoded names to their total size in bytes. */ public Map calculateRegionSizes() throws IOException { - SnapshotStats stats = SnapshotInfo.getSnapshotStats(conf, manifest, null); + SnapshotProtos.SnapshotDescription snapshot = + SnapshotDescriptionUtils.readSnapshotInfo(fs, manifest.getSnapshotDir()); + SnapshotStats stats = SnapshotInfo.getSnapshotStats(conf, snapshot, null); return stats.getRegionSizeMap(); } From 124d8798062fe35c6bf6b8bc4dd79f897bea7244 Mon Sep 17 00:00:00 2001 From: terrytlu Date: Wed, 28 May 2025 16:19:25 +0800 Subject: [PATCH 4/9] HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. --- .../hbase/snapshot/SnapshotRegionSizeCalculator.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java index 137bccc9779c..47507ced7fbf 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java @@ -46,14 +46,15 @@ public SnapshotRegionSizeCalculator(Configuration conf, SnapshotManifest manifes throws IOException { this.conf = conf; this.manifest = manifest; - this.regionSizes = calculateRegionSizes(); Path rootDir = CommonFSUtils.getRootDir(conf); - fs = FileSystem.get(rootDir.toUri(), conf); + this.fs = FileSystem.get(rootDir.toUri(), conf); + this.regionSizes = calculateRegionSizes(); } /** * Calculate the size of each region in the snapshot. * @return A map of region encoded names to their total size in bytes. + * @throws IOException If an error occurs during calculation. */ public Map calculateRegionSizes() throws IOException { SnapshotProtos.SnapshotDescription snapshot = @@ -62,6 +63,11 @@ public Map calculateRegionSizes() throws IOException { return stats.getRegionSizeMap(); } + /** + * Retrieves the size of a specific region by its encoded name. + * @param encodedRegionName The encoded name of the region. + * @return The size of the region in bytes, or 0 if the region is not found. + */ public long getRegionSize(String encodedRegionName) { Long size = regionSizes.get(encodedRegionName); if (size == null) { From 191c69dfb686160dcd5c4faaf674d86c1ef00e40 Mon Sep 17 00:00:00 2001 From: terrytlu Date: Thu, 5 Jun 2025 21:22:29 +0800 Subject: [PATCH 5/9] HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. abc --- .../SnapshotRegionSizeCalculator.java | 2 +- .../TestSnapshotRegionSizeCalculator.java | 71 ++++++++++++++++--- 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java index 47507ced7fbf..d712a763e42e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java @@ -71,7 +71,7 @@ public Map calculateRegionSizes() throws IOException { public long getRegionSize(String encodedRegionName) { Long size = regionSizes.get(encodedRegionName); if (size == null) { - LOG.debug("Unknown region:" + encodedRegionName); + LOG.debug("Unknown or empty region:" + encodedRegionName); return 0; } else { return size; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java index 1454763eed4c..e2b8d9588ce5 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java @@ -26,7 +26,10 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.CommonFSUtils; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -49,14 +52,71 @@ public class TestSnapshotRegionSizeCalculator { private static Path snapshotDir; private static SnapshotProtos.SnapshotDescription snapshotDesc; private static SnapshotManifest manifest; + private static Admin admin; + private static Configuration conf; @BeforeClass public static void setUp() throws Exception { - Configuration conf = TEST_UTIL.getConfiguration(); + TEST_UTIL.startMiniCluster(2); + conf = TEST_UTIL.getConfiguration(); fs = TEST_UTIL.getTestFileSystem(); rootDir = TEST_UTIL.getDataTestDir("TestSnapshotRegionSizeCalculator"); CommonFSUtils.setRootDir(conf, rootDir); + admin = TEST_UTIL.getConnection().getAdmin(); + } + + @AfterClass + public static void tearDown() throws Exception { + fs.delete(rootDir, true); + TEST_UTIL.shutdownMiniCluster(); + } + + @Test + public void testCalculateRegionSizeOneRegion() throws IOException { + TableName tableName = TableName.valueOf("test_table"); + String snapshotName = "test_snapshot"; + + // table has no data + TEST_UTIL.createTable(tableName, Bytes.toBytes("info")); + admin = TEST_UTIL.getConnection().getAdmin(); + admin.snapshot(snapshotName, tableName); + Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, + TEST_UTIL.getDefaultRootDirPath()); + SnapshotProtos.SnapshotDescription snapshotDesc = + SnapshotDescriptionUtils.readSnapshotInfo(TEST_UTIL.getTestFileSystem(), snapshotDir); + SnapshotManifest manifest = SnapshotManifest.open(TEST_UTIL.getConfiguration(), + TEST_UTIL.getTestFileSystem(), snapshotDir, snapshotDesc); + SnapshotRegionSizeCalculator calculator = + new SnapshotRegionSizeCalculator(TEST_UTIL.getConfiguration(), manifest); + Map regionSizes = calculator.calculateRegionSizes(); + + for (Map.Entry entry : regionSizes.entrySet()) { + assertTrue("Region size should be 0.", entry.getValue() == 0); + } + admin.deleteSnapshot(snapshotName); + + // table has some data + TEST_UTIL.loadTable(admin.getConnection().getTable(tableName), Bytes.toBytes("info")); + admin.snapshot(snapshotName, tableName); + snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, + TEST_UTIL.getDefaultRootDirPath()); + snapshotDesc = + SnapshotDescriptionUtils.readSnapshotInfo(TEST_UTIL.getTestFileSystem(), snapshotDir); + manifest = SnapshotManifest.open(TEST_UTIL.getConfiguration(), TEST_UTIL.getTestFileSystem(), + snapshotDir, snapshotDesc); + calculator = new SnapshotRegionSizeCalculator(TEST_UTIL.getConfiguration(), manifest); + regionSizes = calculator.calculateRegionSizes(); + for (Map.Entry entry : regionSizes.entrySet()) { + assertTrue("Region size should be greater than 0.", entry.getValue() > 0); + } + + TEST_UTIL.deleteTable(tableName); + admin.deleteSnapshot(snapshotName); + } + + @Test + public void testCalculateRegionSizesMultiRegion() throws IOException { // Create a mock snapshot with a region and store files SnapshotTestingUtils.SnapshotMock snapshotMock = new SnapshotTestingUtils.SnapshotMock(conf, fs, rootDir); @@ -70,15 +130,6 @@ public static void setUp() throws Exception { snapshotDesc = builder.getSnapshotDescription(); manifest = SnapshotManifest.open(conf, fs, snapshotDir, snapshotDesc); - } - - @AfterClass - public static void tearDown() throws Exception { - fs.delete(rootDir, true); - } - - @Test - public void testCalculateRegionSizes() throws IOException { SnapshotRegionSizeCalculator calculator = new SnapshotRegionSizeCalculator(TEST_UTIL.getConfiguration(), manifest); Map regionSizes = calculator.calculateRegionSizes(); From 5aed8c90d1dc9767375892e6cdd3d16a1a908b21 Mon Sep 17 00:00:00 2001 From: terrytlu Date: Tue, 24 Jun 2025 15:15:01 +0800 Subject: [PATCH 6/9] HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. --- .../hbase/snapshot/TestSnapshotRegionSizeCalculator.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java index e2b8d9588ce5..a624a79f1cd6 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java @@ -56,13 +56,13 @@ public class TestSnapshotRegionSizeCalculator { private static Configuration conf; @BeforeClass - public static void setUp() throws Exception { - TEST_UTIL.startMiniCluster(2); + public static void setupCluster() throws Exception { + TEST_UTIL.startMiniCluster(3); conf = TEST_UTIL.getConfiguration(); fs = TEST_UTIL.getTestFileSystem(); rootDir = TEST_UTIL.getDataTestDir("TestSnapshotRegionSizeCalculator"); CommonFSUtils.setRootDir(conf, rootDir); - admin = TEST_UTIL.getConnection().getAdmin(); + admin = TEST_UTIL.getAdmin(); } @AfterClass From b94965557b3642d53fc0dbe34e0f4cc24e52173f Mon Sep 17 00:00:00 2001 From: terrytlu Date: Tue, 24 Jun 2025 21:31:02 +0800 Subject: [PATCH 7/9] HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. --- .../TestSnapshotRegionSizeCalculator.java | 28 +------------------ 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java index a624a79f1cd6..6c83b8b04211 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java @@ -72,7 +72,7 @@ public static void tearDown() throws Exception { } @Test - public void testCalculateRegionSizeOneRegion() throws IOException { + public void testCalculateRegionSize() throws IOException { TableName tableName = TableName.valueOf("test_table"); String snapshotName = "test_snapshot"; @@ -114,30 +114,4 @@ public void testCalculateRegionSizeOneRegion() throws IOException { TEST_UTIL.deleteTable(tableName); admin.deleteSnapshot(snapshotName); } - - @Test - public void testCalculateRegionSizesMultiRegion() throws IOException { - // Create a mock snapshot with a region and store files - SnapshotTestingUtils.SnapshotMock snapshotMock = - new SnapshotTestingUtils.SnapshotMock(conf, fs, rootDir); - SnapshotTestingUtils.SnapshotMock.SnapshotBuilder builder = - snapshotMock.createSnapshotV2("snapshot", "testTable", 4); - builder.addRegion(); - builder.addRegion(); - builder.addRegion(); - builder.addRegion(); - snapshotDir = builder.commit(); - snapshotDesc = builder.getSnapshotDescription(); - manifest = SnapshotManifest.open(conf, fs, snapshotDir, snapshotDesc); - - SnapshotRegionSizeCalculator calculator = - new SnapshotRegionSizeCalculator(TEST_UTIL.getConfiguration(), manifest); - Map regionSizes = calculator.calculateRegionSizes(); - - // Verify that the region sizes are calculated correctly - assertTrue("No regions found in the snapshot", regionSizes.size() == 4); - for (Map.Entry entry : regionSizes.entrySet()) { - assertTrue("Region size should be non-negative", entry.getValue() > 0); - } - } } From f501f83a7381112d0209a4f15ea82ad583a740e8 Mon Sep 17 00:00:00 2001 From: terrytlu Date: Fri, 4 Jul 2025 18:10:57 +0800 Subject: [PATCH 8/9] HBASE-29272 When Spark reads an HBase snapshot, it always read empty data. --- .../mapred/TableSnapshotInputFormat.java | 10 +++++ .../mapreduce/TableSnapshotInputFormat.java | 8 ++++ .../TableSnapshotInputFormatImpl.java | 14 +++++-- .../SnapshotRegionSizeCalculator.java | 37 ++----------------- .../TestSnapshotRegionSizeCalculator.java | 10 ++--- 5 files changed, 36 insertions(+), 43 deletions(-) diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java index f351464d31e2..5a151c8c9550 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java @@ -22,7 +22,10 @@ import java.io.IOException; import java.util.List; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormatImpl; import org.apache.hadoop.hbase.util.RegionSplitter; @@ -53,6 +56,13 @@ public TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit delegate this.delegate = delegate; } + @Deprecated + public TableSnapshotRegionSplit(TableDescriptor htd, RegionInfo regionInfo, + List locations, Scan scan, Path restoreDir) { + this.delegate = + new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, restoreDir); + } + @Override public long getLength() throws IOException { return delegate.getLength(); diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java index e72aac092d44..24432f963ea9 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.client.metrics.ScanMetrics; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.util.RegionSplitter; @@ -93,6 +94,13 @@ public TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit delegate this.delegate = delegate; } + @Deprecated + public TableSnapshotRegionSplit(TableDescriptor htd, RegionInfo regionInfo, + List locations, Scan scan, Path restoreDir) { + this.delegate = + new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, restoreDir); + } + @Override public long getLength() throws IOException, InterruptedException { return delegate.getLength(); diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java index 179adf1eecd7..565558da870b 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java @@ -24,6 +24,7 @@ import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.UUID; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -152,6 +153,12 @@ public static class InputSplit implements Writable { public InputSplit() { } + @Deprecated + public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, + Path restoreDir) { + this(htd, regionInfo, locations, scan, restoreDir, 1); + } + public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, Path restoreDir, long length) { this.htd = htd; @@ -442,8 +449,8 @@ public static List getSplits(Scan scan, SnapshotManifest manifest, } } - SnapshotRegionSizeCalculator snapshotRegionSizeCalculator = - new SnapshotRegionSizeCalculator(conf, manifest); + Map regionSizes = + SnapshotRegionSizeCalculator.calculateRegionSizes(conf, manifest); List splits = new ArrayList<>(); for (RegionInfo hri : regionManifests) { // load region descriptor @@ -459,8 +466,7 @@ public static List getSplits(Scan scan, SnapshotManifest manifest, hosts = calculateLocationsForInputSplit(conf, htd, hri, tableDir); } } - - long snapshotRegionSize = snapshotRegionSizeCalculator.getRegionSize(hri.getEncodedName()); + long snapshotRegionSize = regionSizes.getOrDefault(hri.getEncodedName(), 0L); if (numSplits > 1) { byte[][] sp = sa.split(hri.getStartKey(), hri.getEndKey(), numSplits, true); for (int i = 0; i < sp.length - 1; i++) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java index d712a763e42e..6c12e8587d09 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotRegionSizeCalculator.java @@ -21,12 +21,9 @@ import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.snapshot.SnapshotInfo.SnapshotStats; import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.yetus.audience.InterfaceAudience; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos; @@ -35,47 +32,19 @@ */ @InterfaceAudience.Private public class SnapshotRegionSizeCalculator { - private static final Logger LOG = LoggerFactory.getLogger(SnapshotRegionSizeCalculator.class); - private final SnapshotManifest manifest; - private final Configuration conf; - private final Map regionSizes; - - private final FileSystem fs; - - public SnapshotRegionSizeCalculator(Configuration conf, SnapshotManifest manifest) - throws IOException { - this.conf = conf; - this.manifest = manifest; - Path rootDir = CommonFSUtils.getRootDir(conf); - this.fs = FileSystem.get(rootDir.toUri(), conf); - this.regionSizes = calculateRegionSizes(); - } /** * Calculate the size of each region in the snapshot. * @return A map of region encoded names to their total size in bytes. * @throws IOException If an error occurs during calculation. */ - public Map calculateRegionSizes() throws IOException { + public static Map calculateRegionSizes(Configuration conf, + SnapshotManifest manifest) throws IOException { + FileSystem fs = FileSystem.get(CommonFSUtils.getRootDir(conf).toUri(), conf); SnapshotProtos.SnapshotDescription snapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, manifest.getSnapshotDir()); SnapshotStats stats = SnapshotInfo.getSnapshotStats(conf, snapshot, null); return stats.getRegionSizeMap(); } - /** - * Retrieves the size of a specific region by its encoded name. - * @param encodedRegionName The encoded name of the region. - * @return The size of the region in bytes, or 0 if the region is not found. - */ - public long getRegionSize(String encodedRegionName) { - Long size = regionSizes.get(encodedRegionName); - if (size == null) { - LOG.debug("Unknown or empty region:" + encodedRegionName); - return 0; - } else { - return size; - } - } - } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java index 6c83b8b04211..62d56dd7b38f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSnapshotRegionSizeCalculator.java @@ -86,9 +86,9 @@ public void testCalculateRegionSize() throws IOException { SnapshotDescriptionUtils.readSnapshotInfo(TEST_UTIL.getTestFileSystem(), snapshotDir); SnapshotManifest manifest = SnapshotManifest.open(TEST_UTIL.getConfiguration(), TEST_UTIL.getTestFileSystem(), snapshotDir, snapshotDesc); - SnapshotRegionSizeCalculator calculator = - new SnapshotRegionSizeCalculator(TEST_UTIL.getConfiguration(), manifest); - Map regionSizes = calculator.calculateRegionSizes(); + + Map regionSizes = + SnapshotRegionSizeCalculator.calculateRegionSizes(TEST_UTIL.getConfiguration(), manifest); for (Map.Entry entry : regionSizes.entrySet()) { assertTrue("Region size should be 0.", entry.getValue() == 0); @@ -105,8 +105,8 @@ public void testCalculateRegionSize() throws IOException { SnapshotDescriptionUtils.readSnapshotInfo(TEST_UTIL.getTestFileSystem(), snapshotDir); manifest = SnapshotManifest.open(TEST_UTIL.getConfiguration(), TEST_UTIL.getTestFileSystem(), snapshotDir, snapshotDesc); - calculator = new SnapshotRegionSizeCalculator(TEST_UTIL.getConfiguration(), manifest); - regionSizes = calculator.calculateRegionSizes(); + regionSizes = + SnapshotRegionSizeCalculator.calculateRegionSizes(TEST_UTIL.getConfiguration(), manifest); for (Map.Entry entry : regionSizes.entrySet()) { assertTrue("Region size should be greater than 0.", entry.getValue() > 0); } From 123234859556313ec2153368551a607489b4ce16 Mon Sep 17 00:00:00 2001 From: Peng Lu Date: Wed, 4 Feb 2026 21:36:53 +0800 Subject: [PATCH 9/9] Update code based on the review comments --- .../hadoop/hbase/mapred/TableSnapshotInputFormat.java | 10 ++++++++-- .../hbase/mapreduce/TableSnapshotInputFormat.java | 10 ++++++++-- .../hbase/mapreduce/TableSnapshotInputFormatImpl.java | 6 ------ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java index 5a151c8c9550..2189f4e8c4df 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapred/TableSnapshotInputFormat.java @@ -56,11 +56,17 @@ public TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit delegate this.delegate = delegate; } + /** + * @deprecated since 4.0.0. Use + * {@link #TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit)}. This + * constructor will be removed in a future. + * @see HBASE-29272 + */ @Deprecated public TableSnapshotRegionSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, Path restoreDir) { - this.delegate = - new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, restoreDir); + this.delegate = new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, + restoreDir, 1); } @Override diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java index 24432f963ea9..126eb8f3881a 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormat.java @@ -94,11 +94,17 @@ public TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit delegate this.delegate = delegate; } + /** + * @deprecated since 4.0.0. Use + * {@link #TableSnapshotRegionSplit(TableSnapshotInputFormatImpl.InputSplit)}. This + * constructor will be removed in a future. + * @see HBASE-29272 + */ @Deprecated public TableSnapshotRegionSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, Path restoreDir) { - this.delegate = - new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, restoreDir); + this.delegate = new TableSnapshotInputFormatImpl.InputSplit(htd, regionInfo, locations, scan, + restoreDir, 1); } @Override diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java index 565558da870b..44c40e130a2e 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java @@ -153,12 +153,6 @@ public static class InputSplit implements Writable { public InputSplit() { } - @Deprecated - public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, - Path restoreDir) { - this(htd, regionInfo, locations, scan, restoreDir, 1); - } - public InputSplit(TableDescriptor htd, RegionInfo regionInfo, List locations, Scan scan, Path restoreDir, long length) { this.htd = htd;