Core: Track duplicate DVs for data file and merge them before committing #15006
base: main
Changes from all commits
82cced9
e41943d
76e24e4
a740ff9
11ffc2f
772e3c2
3404a86
c04d0e0
a39b073
a079d22
d7eadb0
0a053a6
6b04dd9
a50fb32
301f0fe
112b086
eecacad
9673f85
4104097
9972ce0
e75bb03
6bccc52
9bb9c56
85801f1
669f125
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,156 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
| package org.apache.iceberg; | ||
|
|
||
| import java.io.IOException; | ||
| import java.io.UncheckedIOException; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Objects; | ||
| import java.util.concurrent.ExecutorService; | ||
| import java.util.stream.Collectors; | ||
| import org.apache.iceberg.deletes.BaseDVFileWriter; | ||
| import org.apache.iceberg.deletes.DVFileWriter; | ||
| import org.apache.iceberg.deletes.Deletes; | ||
| import org.apache.iceberg.deletes.PositionDeleteIndex; | ||
| import org.apache.iceberg.io.DeleteWriteResult; | ||
| import org.apache.iceberg.io.OutputFileFactory; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.util.Tasks; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| class DVUtil { | ||
| private static final Logger LOG = LoggerFactory.getLogger(DVUtil.class); | ||
|
|
||
| private DVUtil() {} | ||
|
|
||
| /** | ||
| * Merges duplicate DVs for the same data file and writes the merged DV Puffin files. | ||
| * | ||
| * @param duplicateDVsByReferencedFile map of data file location to duplicate DVs (all entries | ||
| * must have size > 1) | ||
| * @return newly merged DVs | ||
| */ | ||
| static List<DeleteFile> mergeDVsAndWrite( | ||
| TableOperations ops, | ||
| Map<String, List<DeleteFile>> duplicateDVsByReferencedFile, | ||
| String tableName, | ||
| ExecutorService threadpool) { | ||
| Map<String, PositionDeleteIndex> mergedIndices = | ||
| duplicateDVsByReferencedFile.entrySet().stream() | ||
| .collect( | ||
| Collectors.toMap( | ||
| Map.Entry::getKey, | ||
| entry -> readDVsAndMerge(ops, entry.getValue(), threadpool))); | ||
|
|
||
| return writeMergedDVs( | ||
| mergedIndices, duplicateDVsByReferencedFile, ops, tableName, ops.current().specsById()); | ||
| } | ||
|
|
||
| // Merges the position indices for the duplicate DVs for a given referenced file | ||
| private static PositionDeleteIndex readDVsAndMerge( | ||
| TableOperations ops, List<DeleteFile> dvsForFile, ExecutorService pool) { | ||
| Preconditions.checkArgument(dvsForFile.size() > 1, "Expected more than 1 DV"); | ||
| PositionDeleteIndex[] dvIndices = readDVs(dvsForFile, pool, ops); | ||
| PositionDeleteIndex mergedPositions = dvIndices[0]; | ||
| DeleteFile firstDV = dvsForFile.get(0); | ||
|
|
||
| for (int i = 1; i < dvIndices.length; i++) { | ||
| DeleteFile dv = dvsForFile.get(i); | ||
| Preconditions.checkArgument( | ||
| Objects.equals(dv.dataSequenceNumber(), firstDV.dataSequenceNumber()), | ||
| "Cannot merge duplicate added DVs when data sequence numbers are different, " | ||
| + "expected all to be added with sequence %s, but got %s", | ||
| firstDV.dataSequenceNumber(), | ||
| dv.dataSequenceNumber()); | ||
|
|
||
| Preconditions.checkArgument( | ||
| dv.specId() == firstDV.specId(), | ||
| "Cannot merge duplicate added DVs when partition specs are different, " | ||
| + "expected all to be added with spec %s, but got %s", | ||
| firstDV.specId(), | ||
| dv.specId()); | ||
|
|
||
| Preconditions.checkArgument( | ||
| Objects.equals(dv.partition(), firstDV.partition()), | ||
| "Cannot merge duplicate added DVs when partition tuples are different"); | ||
|
|
||
| mergedPositions.merge(dvIndices[i]); | ||
| } | ||
|
|
||
| return mergedPositions; | ||
| } | ||
|
|
||
| private static PositionDeleteIndex[] readDVs( | ||
| List<DeleteFile> dvs, ExecutorService pool, TableOperations ops) { | ||
| PositionDeleteIndex[] dvIndices = new PositionDeleteIndex[dvs.size()]; | ||
| Tasks.range(dvIndices.length) | ||
| .executeWith(pool) | ||
| .stopOnFailure() | ||
| .throwFailureWhenFinished() | ||
| .run( | ||
| i -> { | ||
| dvIndices[i] = Deletes.readDV(dvs.get(i), ops.io(), ops.encryption()); | ||
| }); | ||
|
|
||
| return dvIndices; | ||
| } | ||
|
|
||
| // Produces a Puffin per partition spec containing the merged DVs for that spec | ||
| private static List<DeleteFile> writeMergedDVs( | ||
| Map<String, PositionDeleteIndex> mergedIndices, | ||
| Map<String, List<DeleteFile>> dataFilesWithDuplicateDVs, | ||
| TableOperations ops, | ||
| String tableName, | ||
| Map<Integer, PartitionSpec> specsById) { | ||
| try (DVFileWriter dvFileWriter = | ||
| new BaseDVFileWriter( | ||
| // Use an unpartitioned spec for the location provider for the puffin containing | ||
| // all the merged DVs | ||
| OutputFileFactory.builderFor( | ||
| ops, PartitionSpec.unpartitioned(), FileFormat.PUFFIN, 1, 1) | ||
| .build(), | ||
| path -> null)) { | ||
|
|
||
| for (Map.Entry<String, PositionDeleteIndex> entry : mergedIndices.entrySet()) { | ||
| String referencedLocation = entry.getKey(); | ||
| PositionDeleteIndex mergedPositions = entry.getValue(); | ||
| List<DeleteFile> duplicateDVs = dataFilesWithDuplicateDVs.get(referencedLocation); | ||
| DeleteFile firstDV = duplicateDVs.get(0); | ||
| LOG.warn( | ||
| "Merged {} DVs for data file {}. These will be orphaned DVs in table {}", | ||
| duplicateDVs.size(), | ||
| referencedLocation, | ||
| tableName); | ||
| dvFileWriter.delete( | ||
| referencedLocation, | ||
| mergedPositions, | ||
| specsById.get(firstDV.specId()), | ||
| firstDV.partition()); | ||
| } | ||
|
|
||
| dvFileWriter.close(); | ||
| DeleteWriteResult writeResult = dvFileWriter.result(); | ||
| return writeResult.deleteFiles(); | ||
| } catch (IOException e) { | ||
| throw new UncheckedIOException(e); | ||
| } | ||
| } | ||
| } | ||
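For orientation, `mergeDVsAndWrite` is expected to receive only the data files that actually got more than one DV. Below is a minimal, hypothetical sketch of that grouping step; the class and method names are illustrative and not part of this PR (the real producer builds the map incrementally, as shown in the second file of this diff).

```java
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.iceberg.DeleteFile;

// Hypothetical helper, not part of this PR: groups added DVs by the data file they
// reference and keeps only entries with more than one DV, which is the input shape
// DVUtil.mergeDVsAndWrite expects (every entry must have size > 1).
class DuplicateDVGrouping {
  static Map<String, List<DeleteFile>> duplicateDVsByReferencedFile(List<DeleteFile> addedDVs) {
    return addedDVs.stream()
        // a DV references exactly one data file by location
        .collect(Collectors.groupingBy(DeleteFile::referencedDataFile))
        .entrySet()
        .stream()
        // only data files with more than one DV need to be merged
        .filter(entry -> entry.getValue().size() > 1)
        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
  }
}
```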
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,6 +30,7 @@ | |
| import java.util.Map; | ||
| import java.util.Objects; | ||
| import java.util.Set; | ||
| import java.util.stream.Collectors; | ||
| import org.apache.iceberg.encryption.EncryptedOutputFile; | ||
| import org.apache.iceberg.events.CreateSnapshotEvent; | ||
| import org.apache.iceberg.exceptions.ValidationException; | ||
|
|
@@ -47,6 +48,7 @@ | |
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Maps; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Sets; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Streams; | ||
| import org.apache.iceberg.util.CharSequenceSet; | ||
| import org.apache.iceberg.util.ContentFileUtil; | ||
| import org.apache.iceberg.util.DataFileSet; | ||
|
|
@@ -55,6 +57,7 @@ | |
| import org.apache.iceberg.util.PartitionSet; | ||
| import org.apache.iceberg.util.SnapshotUtil; | ||
| import org.apache.iceberg.util.Tasks; | ||
| import org.apache.iceberg.util.ThreadPools; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
|
|
@@ -86,8 +89,8 @@ abstract class MergingSnapshotProducer<ThisT> extends SnapshotProducer<ThisT> { | |
| // update data | ||
| private final Map<Integer, DataFileSet> newDataFilesBySpec = Maps.newHashMap(); | ||
| private Long newDataFilesDataSequenceNumber; | ||
| private final Map<Integer, DeleteFileSet> newDeleteFilesBySpec = Maps.newHashMap(); | ||
| private final Set<String> newDVRefs = Sets.newHashSet(); | ||
| private final List<DeleteFile> positionAndEqualityDeletes = Lists.newArrayList(); | ||
| private final Map<String, List<DeleteFile>> dvsByReferencedFile = Maps.newLinkedHashMap(); | ||
|
Comment on lines +92 to +93 — Contributor (Author):
@rdblue These are two disjoint fields: one is a list of v2 deletes and the other is a multimap for DVs. I personally think our tests should get away from expecting a certain order in manifests and just assert the contents (or at least have validate methods that can be either strict or lenient about ordering). As we get into V4, maybe we'll make implementation choices that order entries in a certain way, but in the current state of things it was kind of a hindrance to making changes here. I didn't make the test change since it's fairly large and could distract from this change, and the LinkedHashMap has negligible overhead, so we can just preserve the existing behavior.
||
| private final List<ManifestFile> appendManifests = Lists.newArrayList(); | ||
| private final List<ManifestFile> rewrittenAppendManifests = Lists.newArrayList(); | ||
| private final SnapshotSummary.Builder addedFilesSummary = SnapshotSummary.builder(); | ||
|
|
@@ -222,7 +225,7 @@ protected boolean addsDataFiles() { | |
| } | ||
|
|
||
| protected boolean addsDeleteFiles() { | ||
| return !newDeleteFilesBySpec.isEmpty(); | ||
| return !positionAndEqualityDeletes.isEmpty() || !dvsByReferencedFile.isEmpty(); | ||
| } | ||
|
|
||
| /** Add a data file to the new snapshot. */ | ||
|
|
@@ -265,15 +268,14 @@ private void addInternal(DeleteFile file) { | |
| "Cannot find partition spec %s for delete file: %s", | ||
| file.specId(), | ||
| file.location()); | ||
|
|
||
| DeleteFileSet deleteFiles = | ||
| newDeleteFilesBySpec.computeIfAbsent(spec.specId(), ignored -> DeleteFileSet.create()); | ||
| if (deleteFiles.add(file)) { | ||
| addedFilesSummary.addedFile(spec, file); | ||
|
Contributor (Author):
Because we may be merging duplicates, we don't update the summary for delete files until after we dedupe, just before writing the new manifests.
||
| hasNewDeleteFiles = true; | ||
| if (ContentFileUtil.isDV(file)) { | ||
| newDVRefs.add(file.referencedDataFile()); | ||
| } | ||
| hasNewDeleteFiles = true; | ||
|
Contributor (Author):
Since we're not tracking additions in a DeleteFileSet, we treat every addition as a new delete, even potential duplicates (the alternative is a look-back through the list on every addDeleteFile, which I'm very against because it effectively becomes an O(deletes-added^2) operation for a commit). We end up merging/deduping the DVs (and the V2 position and equality deletes) anyway just before producing the new manifests. See my comment below.
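To make that trade-off concrete, here is a minimal sketch of the deferred approach (hypothetical class, not code from this PR): each addition stays O(1), and duplicates are dropped in a single pass with DeleteFileSet right before manifests are written, which matches what the diff below does for V2 deletes.

```java
import java.util.List;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.util.DeleteFileSet;

// Hypothetical sketch of deferred dedup: additions are O(1) and duplicates are
// removed only once, just before delete manifests are produced.
class DeferredDedupSketch {
  private final List<DeleteFile> pendingDeletes = Lists.newArrayList();

  void addDeleteFile(DeleteFile file) {
    // no look-back over previously added files here
    pendingDeletes.add(file);
  }

  Iterable<DeleteFile> dedupedDeletes() {
    // one pass at manifest-writing time; DeleteFileSet drops duplicate entries
    return DeleteFileSet.of(pendingDeletes);
  }
}
```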
||
| if (ContentFileUtil.isDV(file)) { | ||
| List<DeleteFile> dvsForReferencedFile = | ||
| dvsByReferencedFile.computeIfAbsent( | ||
| file.referencedDataFile(), newFile -> Lists.newArrayList()); | ||
| dvsForReferencedFile.add(file); | ||
| } else { | ||
| positionAndEqualityDeletes.add(file); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -814,7 +816,7 @@ protected void validateAddedDVs( | |
| Expression conflictDetectionFilter, | ||
| Snapshot parent) { | ||
| // skip if there is no current table state or this operation doesn't add new DVs | ||
| if (parent == null || newDVRefs.isEmpty()) { | ||
| if (parent == null || dvsByReferencedFile.isEmpty()) { | ||
| return; | ||
| } | ||
|
|
||
|
|
@@ -847,7 +849,7 @@ private void validateAddedDVs( | |
| DeleteFile file = entry.file(); | ||
| if (newSnapshotIds.contains(entry.snapshotId()) && ContentFileUtil.isDV(file)) { | ||
| ValidationException.check( | ||
| !newDVRefs.contains(file.referencedDataFile()), | ||
| !dvsByReferencedFile.containsKey(file.referencedDataFile()), | ||
|
Contributor:
I think this change is correct, but I want to note that in the future we could avoid failing by merging DVs, as long as the operation being committed allows it.
Contributor (Author):
Yeah, I had an old PR out for this: https://github.com/apache/iceberg/pull/11693/files#diff-410ff1b47d9a44a2fd5dbd103cad9463d82c8f4f51aa1be63b8b403123ab6e0e (probably a bad PR title, since if the positions are disjoint they are by definition not conflicting for the operation).
||
| "Found concurrently added DV for %s: %s", | ||
| file.referencedDataFile(), | ||
| ContentFileUtil.dvDesc(file)); | ||
|
|
@@ -1042,7 +1044,7 @@ private List<ManifestFile> newDataFilesAsManifests() { | |
| } | ||
|
|
||
| private Iterable<ManifestFile> prepareDeleteManifests() { | ||
| if (newDeleteFilesBySpec.isEmpty()) { | ||
| if (!addsDeleteFiles()) { | ||
| return ImmutableList.of(); | ||
| } | ||
|
|
||
|
|
@@ -1060,9 +1062,32 @@ private List<ManifestFile> newDeleteFilesAsManifests() { | |
| } | ||
|
|
||
| if (cachedNewDeleteManifests.isEmpty()) { | ||
| Map<String, List<DeleteFile>> duplicateDVs = Maps.newHashMap(); | ||
| List<DeleteFile> validDVs = Lists.newArrayList(); | ||
| for (Map.Entry<String, List<DeleteFile>> entry : dvsByReferencedFile.entrySet()) { | ||
| if (entry.getValue().size() > 1) { | ||
| duplicateDVs.put(entry.getKey(), entry.getValue()); | ||
| } else { | ||
| validDVs.addAll(entry.getValue()); | ||
| } | ||
| } | ||
|
|
||
| List<DeleteFile> mergedDVs = | ||
| duplicateDVs.isEmpty() | ||
| ? ImmutableList.of() | ||
| : DVUtil.mergeDVsAndWrite( | ||
| ops(), duplicateDVs, tableName, ThreadPools.getDeleteWorkerPool()); | ||
| // Prevent committing duplicate V2 deletes by deduping them | ||
| Map<Integer, List<DeleteFile>> newDeleteFilesBySpec = | ||
| Streams.stream( | ||
| Iterables.concat( | ||
| mergedDVs, validDVs, DeleteFileSet.of(positionAndEqualityDeletes))) | ||
|
Contributor (Author):
@rdblue let me know how you feel about this. The summary stats are produced from this "final" deleteFilesBySpec anyway, which should be correct, so I think we're covered in general.
||
| .map(file -> Delegates.pendingDeleteFile(file, file.dataSequenceNumber())) | ||
| .collect(Collectors.groupingBy(ContentFile::specId)); | ||
| newDeleteFilesBySpec.forEach( | ||
| (specId, deleteFiles) -> { | ||
| PartitionSpec spec = ops().current().spec(specId); | ||
| deleteFiles.forEach(file -> addedFilesSummary.addedFile(spec, file)); | ||
|
Contributor:
I don't like that the […]. The reason I don't like the double storage is that it doesn't handle some strange cases. For instance, what if a […]? I think it would be cleaner to keep a list of v2 deletes and the multimap of DVs and maintain them separately. This method should produce a new list of merged DVs, and then both lists (v2 deletes and merged DVs) should be written to delete manifests by spec. It's easy enough to produce a filtered iterator, so I don't think we are buying much by grouping by spec ID as files are added.
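A minimal sketch of the structure suggested above, with hypothetical names: keep the collections separate while files are added, and only combine and group the deletes by spec ID when the delete manifests are written (the grouping itself mirrors the `Collectors.groupingBy(ContentFile::specId)` call already used in this diff).

```java
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DeleteFile;

// Hypothetical sketch: v2 deletes, merged DVs, and single DVs are kept apart and
// only combined (and grouped by spec) at manifest-writing time.
class DeleteManifestGrouping {
  static Map<Integer, List<DeleteFile>> groupBySpec(
      List<DeleteFile> v2Deletes, List<DeleteFile> mergedDVs, List<DeleteFile> singleDVs) {
    return Stream.of(v2Deletes, mergedDVs, singleDVs)
        .flatMap(List::stream)
        // one delete manifest batch per partition spec
        .collect(Collectors.groupingBy(ContentFile::specId));
  }
}
```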
||
| List<ManifestFile> newDeleteManifests = writeDeleteManifests(deleteFiles, spec); | ||
| cachedNewDeleteManifests.addAll(newDeleteManifests); | ||
| }); | ||
|
|
||
@rdblue let me know if you feel strongly about this check. While it is StructLike and doesn't guarantee an equals implementation, the way I look at it is the following: […] Another rationale behind these checks is that if a writer produces duplicate DVs, there's also a chance of some kind of metadata record reuse issue in the writer, and this felt like an easy sanity check.
Alternatively, we could simplify this and remove these validations by assuming that duplicate DVs are OK along every other dimension.
Alternatively, we could just simplify this and remove these validations by assuming that the duplicate DVs are OK by every other dimension.