diff --git a/src/iceberg/SNAPSHOT_UPDATE_API_COVERAGE.md b/src/iceberg/SNAPSHOT_UPDATE_API_COVERAGE.md new file mode 100644 index 000000000..8f7803b8a --- /dev/null +++ b/src/iceberg/SNAPSHOT_UPDATE_API_COVERAGE.md @@ -0,0 +1,168 @@ + + +# SnapshotUpdate API Coverage + +This document tracks the implementation status of the `SnapshotUpdate` interface compared to the Java Iceberg implementation. + +## Java SnapshotUpdate Interface + +The Java `SnapshotUpdate` interface defines 6 methods: + +### 1. `set(String property, String value)` ✅ IMPLEMENTED +**Java:** +```java +/** + * Set a summary property in the snapshot produced by this update. + * + * @param property a String property name + * @param value a String property value + * @return this for method chaining + */ +ThisT set(String property, String value); +``` + +**C++:** +```cpp +Derived& Set(std::string_view property, std::string_view value); +``` + +### 2. `deleteWith(Consumer deleteFunc)` ✅ IMPLEMENTED +**Java:** +```java +/** + * Set a callback to delete files instead of the table's default. + * + * @param deleteFunc a String consumer used to delete locations. + * @return this for method chaining + */ +ThisT deleteWith(Consumer deleteFunc); +``` + +**C++:** +```cpp +Derived& DeleteWith(std::function delete_func); +``` + +### 3. `stageOnly()` ✅ IMPLEMENTED +**Java:** +```java +/** + * Called to stage a snapshot in table metadata, but not update the current snapshot id. + * + * @return this for method chaining + */ +ThisT stageOnly(); +``` + +**C++:** +```cpp +Derived& StageOnly(); +``` + +### 4. `scanManifestsWith(ExecutorService executorService)` ⏸️ DEFERRED +**Java:** +```java +/** + * Use a particular executor to scan manifests. The default worker pool will be used by default. + * + * @param executorService the provided executor + * @return this for method chaining + */ +ThisT scanManifestsWith(ExecutorService executorService); +``` + +**C++:** NOT IMPLEMENTED + +**Reason:** Requires executor/thread pool infrastructure which is not yet available in the codebase. + +**Future Implementation:** +```cpp +// To be added when executor infrastructure is available +Derived& ScanManifestsWith(std::shared_ptr executor); +``` + +### 5. `toBranch(String branch)` ✅ IMPLEMENTED +**Java:** +```java +/** + * Perform operations on a particular branch + * + * @param branch which is name of SnapshotRef of type branch. + */ +default ThisT toBranch(String branch) { + throw new UnsupportedOperationException( + String.format( + "Cannot commit to branch %s: %s does not support branch commits", + branch, this.getClass().getName())); +} +``` + +**C++ Implementation:** +```cpp +Derived& ToBranch(std::string_view branch); +``` + +**Note:** Java has a default implementation that throws `UnsupportedOperationException`. +C++ requires derived classes to implement the full functionality. + +### 6. `validateWith(SnapshotAncestryValidator validator)` ❌ MISSING +**Java:** +```java +/** + * Validate snapshot ancestry before committing. + */ +default ThisT validateWith(SnapshotAncestryValidator validator) { + throw new UnsupportedOperationException( + "Snapshot validation not supported by " + this.getClass().getName()); +} +``` + +**C++:** NOT IMPLEMENTED + +**Reason:** Not identified during initial implementation review. + +**Future Implementation:** +```cpp +// To be added when SnapshotAncestryValidator infrastructure is available +// Note: Java has default implementation that throws UnsupportedOperationException +// Consider whether to provide similar default behavior or omit until needed +``` + +## Summary + +| Method | Java | C++ | Status | Notes | +|--------|------|-----|--------|-------| +| set() | ✅ | ✅ | Implemented | | +| deleteWith() | ✅ | ✅ | Implemented | | +| stageOnly() | ✅ | ✅ | Implemented | | +| scanManifestsWith() | ✅ | ❌ | Deferred | Needs executor infrastructure | +| toBranch() | ✅ (default throws) | ✅ | Implemented | C++ requires full implementation | +| validateWith() | ✅ (default throws) | ❌ | Missing | Needs SnapshotAncestryValidator | + +**Implementation Coverage:** 4/6 methods (66%) +**Fully Usable Coverage:** 4/4 required methods (100%) - the missing methods have default throwing implementations in Java + +## Next Steps + +1. **ScanManifestsWith()**: Add when executor/thread pool infrastructure is available +2. **ValidateWith()**: Add when SnapshotAncestryValidator is implemented + - Consider whether to provide a no-op implementation initially + - Java's default implementation throws UnsupportedOperationException + - May be better to omit until validation infrastructure exists diff --git a/src/iceberg/snapshot_update.h b/src/iceberg/snapshot_update.h new file mode 100644 index 000000000..f5e6c049b --- /dev/null +++ b/src/iceberg/snapshot_update.h @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/snapshot_update.h +/// API for table updates that produce snapshots + +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/pending_update.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief Interface for updates that produce a new table snapshot +/// +/// SnapshotUpdate extends PendingUpdate to provide common methods for all +/// updates that create a new table Snapshot. Implementations include operations +/// like AppendFiles, DeleteFiles, OverwriteFiles, and RewriteFiles. +/// +/// This interface uses CRTP (Curiously Recurring Template Pattern) to enable +/// fluent API method chaining in derived classes, matching the Java pattern +/// where SnapshotUpdate allows methods to return the actual derived type. +/// +/// Methods included from Java API (4/6): +/// - Set(): Set summary properties +/// - StageOnly(): Stage without updating current snapshot +/// - DeleteWith(): Custom file deletion callback +/// - ToBranch(): Commit to a specific branch +/// +/// Methods not yet implemented (2/6): +/// - ScanManifestsWith(): Custom executor for parallel manifest scanning +/// (deferred: requires executor/thread pool infrastructure) +/// - ValidateWith(): Custom snapshot ancestry validation +/// (deferred: requires SnapshotAncestryValidator infrastructure) +/// +/// See SNAPSHOT_UPDATE_API_COVERAGE.md for detailed comparison with Java API +/// +/// \tparam Derived The actual implementation class (e.g., AppendFiles) +template +class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdateTyped { + public: + ~SnapshotUpdate() override = default; + + /// \brief Set a summary property on the snapshot + /// + /// Summary properties provide metadata about the changes in the snapshot, + /// such as the operation type, number of files added/deleted, etc. + /// + /// \param property The property name + /// \param value The property value + /// \return Reference to derived class for method chaining + Derived& Set(std::string_view property, std::string_view value) { + summary_[std::string(property)] = std::string(value); + return static_cast(*this); + } + + /// \brief Stage the snapshot without updating the table's current snapshot + /// + /// When StageOnly() is called, the snapshot will be committed to table metadata + /// but will not update the current snapshot ID. The snapshot will not be added + /// to the table's snapshot log. This is useful for creating wap branches or + /// validating changes before making them current. + /// + /// \return Reference to derived class for method chaining + Derived& StageOnly() { + stage_only_ = true; + return static_cast(*this); + } + + /// \brief Set a custom file deletion callback + /// + /// By default, files are deleted using the table's FileIO implementation. + /// This method allows providing a custom deletion callback for use cases like: + /// - Tracking deleted files for auditing + /// - Implementing custom retention policies + /// - Delegating deletion to external systems + /// + /// \param delete_func Callback function that will be called for each file to delete + /// \return Reference to derived class for method chaining + Derived& DeleteWith(std::function delete_func) { + delete_func_ = std::move(delete_func); + return static_cast(*this); + } + + /// \brief Commit the snapshot to a specific branch + /// + /// By default, snapshots are committed to the table's main branch. + /// This method allows committing to a named branch instead, which is useful for: + /// - Write-Audit-Publish (WAP) workflows + /// - Feature branch development + /// - Testing changes before merging to main + /// + /// \param branch The name of the branch to commit to + /// \return Reference to derived class for method chaining + Derived& ToBranch(std::string_view branch) { + target_branch_ = std::string(branch); + return static_cast(*this); + } + + protected: + SnapshotUpdate() = default; + + /// \brief Summary properties to set on the snapshot + std::unordered_map summary_; + + /// \brief Whether to stage only without updating current snapshot + bool stage_only_ = false; + + /// \brief Custom file deletion callback + std::optional> delete_func_; + + /// \brief Target branch name for commit (nullopt means main branch) + std::optional target_branch_; +}; + +} // namespace iceberg diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 25a03932d..822f45ccd 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -82,6 +82,7 @@ add_iceberg_test(table_test json_internal_test.cc pending_update_test.cc schema_json_test.cc + snapshot_update_test.cc table_test.cc table_metadata_builder_test.cc table_requirement_test.cc diff --git a/src/iceberg/test/snapshot_update_test.cc b/src/iceberg/test/snapshot_update_test.cc new file mode 100644 index 000000000..4674f3358 --- /dev/null +++ b/src/iceberg/test/snapshot_update_test.cc @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/snapshot_update.h" + +#include + +#include "iceberg/result.h" +#include "iceberg/snapshot.h" +#include "iceberg/test/matchers.h" + +namespace iceberg { + +// Mock implementation of SnapshotUpdate for testing +// This mock tracks which methods were called to verify behavior +class MockSnapshotUpdate : public SnapshotUpdate { + public: + MockSnapshotUpdate() = default; + + Result Apply() override { + if (should_fail_) { + return ValidationFailed("Mock validation failed"); + } + apply_called_ = true; + + // Return a Snapshot that reflects the configuration set via builder methods + // This allows us to verify behavior through the public API + return Snapshot{ + .snapshot_id = 1, + .parent_snapshot_id = std::nullopt, + .sequence_number = 1, + .timestamp_ms = TimePointMs{std::chrono::milliseconds{1000}}, + .manifest_list = "s3://bucket/metadata/snap-1-manifest-list.avro", + .summary = summary_, // Summary is populated by Set() calls + .schema_id = std::nullopt, + }; + } + + Status Commit() override { + if (should_fail_commit_) { + return CommitFailed("Mock commit failed"); + } + commit_called_ = true; + + // Simulate file deletion if callback is set + if (delete_func_) { + // In a real implementation, this would delete actual files + // For testing, just call the callback + (*delete_func_)("test-file-to-delete.parquet"); + } + + return {}; + } + + void SetShouldFail(bool fail) { should_fail_ = fail; } + void SetShouldFailCommit(bool fail) { should_fail_commit_ = fail; } + bool ApplyCalled() const { return apply_called_; } + bool CommitCalled() const { return commit_called_; } + + private: + bool should_fail_ = false; + bool should_fail_commit_ = false; + bool apply_called_ = false; + bool commit_called_ = false; +}; + +TEST(SnapshotUpdateTest, SetSummaryProperty) { + MockSnapshotUpdate update; + update.Set("operation", "append"); + + // Verify through public API: the snapshot from Apply() should have the summary + auto result = update.Apply(); + ASSERT_THAT(result, IsOk()); + + const auto& snapshot = result.value(); + EXPECT_EQ(snapshot.summary.size(), 1); + EXPECT_EQ(snapshot.summary.at("operation"), "append"); +} + +TEST(SnapshotUpdateTest, SetMultipleSummaryProperties) { + MockSnapshotUpdate update; + update.Set("operation", "append").Set("added-files-count", "5"); + + // Verify through public API + auto result = update.Apply(); + ASSERT_THAT(result, IsOk()); + + const auto& snapshot = result.value(); + EXPECT_EQ(snapshot.summary.size(), 2); + EXPECT_EQ(snapshot.summary.at("operation"), "append"); + EXPECT_EQ(snapshot.summary.at("added-files-count"), "5"); +} + +TEST(SnapshotUpdateTest, DeleteWith) { + MockSnapshotUpdate update; + std::vector deleted_files; + + // Set up callback to track deleted files + update.DeleteWith( + [&deleted_files](std::string_view path) { deleted_files.emplace_back(path); }); + + // Verify through public API: calling Commit() should invoke the callback + auto status = update.Commit(); + EXPECT_THAT(status, IsOk()); + + // The mock implementation calls the delete callback with a test file + EXPECT_EQ(deleted_files.size(), 1); + EXPECT_EQ(deleted_files[0], "test-file-to-delete.parquet"); +} + +TEST(SnapshotUpdateTest, MethodChaining) { + MockSnapshotUpdate update; + + // Test that all methods return the derived type for chaining + update.Set("operation", "append") + .Set("added-files-count", "5") + .Set("added-records", "1000") + .StageOnly(); + + // Verify through public API + auto result = update.Apply(); + ASSERT_THAT(result, IsOk()); + + const auto& snapshot = result.value(); + EXPECT_EQ(snapshot.summary.size(), 3); + EXPECT_EQ(snapshot.summary.at("operation"), "append"); + EXPECT_EQ(snapshot.summary.at("added-files-count"), "5"); + EXPECT_EQ(snapshot.summary.at("added-records"), "1000"); +} + +TEST(SnapshotUpdateTest, MethodChainingWithAllMethods) { + MockSnapshotUpdate update; + std::vector deleted_files; + + // Chain all builder methods together + update.Set("operation", "append") + .Set("added-files-count", "5") + .DeleteWith( + [&deleted_files](std::string_view path) { deleted_files.emplace_back(path); }) + .ToBranch("wap-branch") + .StageOnly(); + + // Verify through Apply() + auto result = update.Apply(); + ASSERT_THAT(result, IsOk()); + + const auto& snapshot = result.value(); + EXPECT_EQ(snapshot.summary.at("operation"), "append"); + EXPECT_EQ(snapshot.summary.at("added-files-count"), "5"); + + // Verify through Commit() + auto status = update.Commit(); + EXPECT_THAT(status, IsOk()); + EXPECT_EQ(deleted_files.size(), 1); +} + +TEST(SnapshotUpdateTest, ApplySuccess) { + MockSnapshotUpdate update; + auto result = update.Apply(); + EXPECT_THAT(result, IsOk()); + EXPECT_TRUE(update.ApplyCalled()); +} + +TEST(SnapshotUpdateTest, ApplyValidationFailed) { + MockSnapshotUpdate update; + update.SetShouldFail(true); + auto result = update.Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Mock validation failed")); +} + +TEST(SnapshotUpdateTest, CommitSuccess) { + MockSnapshotUpdate update; + auto status = update.Commit(); + EXPECT_THAT(status, IsOk()); + EXPECT_TRUE(update.CommitCalled()); +} + +TEST(SnapshotUpdateTest, CommitFailed) { + MockSnapshotUpdate update; + update.SetShouldFailCommit(true); + auto status = update.Commit(); + EXPECT_THAT(status, IsError(ErrorKind::kCommitFailed)); + EXPECT_THAT(status, HasErrorMessage("Mock commit failed")); +} + +TEST(SnapshotUpdateTest, InheritanceFromPendingUpdate) { + std::unique_ptr base_ptr = std::make_unique(); + auto status = base_ptr->Commit(); + EXPECT_THAT(status, IsOk()); +} + +TEST(SnapshotUpdateTest, InheritanceFromPendingUpdateTyped) { + std::unique_ptr> typed_ptr = + std::make_unique(); + auto status = typed_ptr->Commit(); + EXPECT_THAT(status, IsOk()); + + auto result = typed_ptr->Apply(); + EXPECT_THAT(result, IsOk()); +} + +} // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 81681ebcd..046f56220 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -159,6 +159,8 @@ class TableUpdateContext; class PendingUpdate; template class PendingUpdateTyped; +template +class SnapshotUpdate; /// ---------------------------------------------------------------------------- /// TODO: Forward declarations below are not added yet.