diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index ca0d943fa..414219f86 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -33,8 +33,28 @@ namespace iceberg { -Schema::Schema(std::vector fields, std::optional schema_id) - : StructType(std::move(fields)), schema_id_(schema_id) {} +Schema::Schema(std::vector fields, std::optional schema_id, + std::vector identifier_field_ids) + : StructType(std::move(fields)), + schema_id_(schema_id), + identifier_field_ids_(std::move(identifier_field_ids)) {} + +Result> Schema::Make( + std::vector fields, std::optional schema_id, + const std::vector& identifier_field_names) { + auto schema = std::make_unique(std::move(fields), schema_id); + + std::vector fresh_identifier_ids; + for (const auto& name : identifier_field_names) { + ICEBERG_ASSIGN_OR_RAISE(auto field, schema->FindFieldByName(name)); + if (!field) { + return InvalidSchema("Cannot find identifier field: {}", name); + } + fresh_identifier_ids.push_back(field.value().get().field_id()); + } + schema->identifier_field_ids_ = std::move(fresh_identifier_ids); + return schema; +} std::optional Schema::schema_id() const { return schema_id_; } @@ -48,15 +68,16 @@ std::string Schema::ToString() const { } bool Schema::Equals(const Schema& other) const { - return schema_id_ == other.schema_id_ && fields_ == other.fields_; + return schema_id_ == other.schema_id_ && fields_ == other.fields_ && + identifier_field_ids_ == other.identifier_field_ids_; } Result>> Schema::FindFieldByName( std::string_view name, bool case_sensitive) const { if (case_sensitive) { - ICEBERG_ASSIGN_OR_RAISE(auto name_to_id, name_to_id_.Get(*this)); - auto it = name_to_id.get().find(name); - if (it == name_to_id.get().end()) { + ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this)); + auto it = name_id_map.get().name_to_id.find(name); + if (it == name_id_map.get().name_to_id.end()) { return std::nullopt; }; return FindFieldById(it->second); @@ -77,21 +98,22 @@ 
Schema::InitIdToFieldMap(const Schema& self) { return id_to_field; } -Result>> -Schema::InitNameToIdMap(const Schema& self) { - std::unordered_map> name_to_id; - NameToIdVisitor visitor(name_to_id, /*case_sensitive=*/true); +Result Schema::InitNameIdMap(const Schema& self) { + NameIdMap name_id_map; + NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name, + /*case_sensitive=*/true); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/"")); visitor.Finish(); - return name_to_id; + return name_id_map; } Result>> Schema::InitLowerCaseNameToIdMap(const Schema& self) { std::unordered_map> lowercase_name_to_id; - NameToIdVisitor visitor(lowercase_name_to_id, /*case_sensitive=*/false); + NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr, + /*case_sensitive=*/false); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/"")); visitor.Finish(); @@ -108,6 +130,16 @@ Result>> Schema::FindFie return it->second; } +Result> Schema::FindColumnNameById( + int32_t field_id) const { + ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this)); + auto it = name_id_map.get().id_to_name.find(field_id); + if (it == name_id_map.get().id_to_name.end()) { + return std::nullopt; + } + return it->second; +} + Result>> Schema::InitIdToPositionPath( const Schema& self) { PositionPathVisitor visitor; @@ -179,4 +211,21 @@ Result> Schema::Project( std::nullopt); } +const std::vector& Schema::IdentifierFieldIds() const { + return identifier_field_ids_; +} + +Result> Schema::IdentifierFieldNames() const { + std::vector names; + names.reserve(identifier_field_ids_.size()); + for (auto id : identifier_field_ids_) { + ICEBERG_ASSIGN_OR_RAISE(auto name, FindColumnNameById(id)); + if (!name.has_value()) { + return InvalidSchema("Cannot find the field of the specified field id: {}", id); + } + names.emplace_back(name.value()); + } + return names; +} + } // namespace iceberg diff --git 
a/src/iceberg/schema.h b/src/iceberg/schema.h index f6c459d8d..bb9839625 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -49,7 +49,18 @@ class ICEBERG_EXPORT Schema : public StructType { static constexpr int32_t kInvalidColumnId = -1; explicit Schema(std::vector fields, - std::optional schema_id = std::nullopt); + std::optional schema_id = std::nullopt, + std::vector identifier_field_ids = {}); + + /// \brief Create a schema. + /// + /// \param fields The fields that make up the schema. + /// \param schema_id The unique identifier for this schema (default: std::nullopt). + /// \param identifier_field_names Canonical names of fields that uniquely identify rows + /// in the table (default: empty). \return A new Schema instance or Status if failed. + static Result> Make( + std::vector fields, std::optional schema_id = std::nullopt, + const std::vector& identifier_field_names = {}); /// \brief Get the schema ID. /// @@ -78,6 +89,13 @@ class ICEBERG_EXPORT Schema : public StructType { Result>> FindFieldById( int32_t field_id) const; + /// \brief Returns the canonical field name for the given id. + /// + /// \param field_id The id of the field to get the canonical name for. + /// \return The canonical column name of the field with the given id, or std::nullopt if + /// not found. + Result> FindColumnNameById(int32_t field_id) const; + /// \brief Get the accessor to access the field by field id. /// /// \param field_id The id of the field to get the accessor for. @@ -103,26 +121,48 @@ class ICEBERG_EXPORT Schema : public StructType { Result> Project( const std::unordered_set& field_ids) const; + /// \brief Return the field IDs of the identifier fields. + const std::vector& IdentifierFieldIds() const; + + /// \brief Return the canonical field names of the identifier fields. 
+ Result> IdentifierFieldNames() const; + friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } private: /// \brief Compare two schemas for equality. bool Equals(const Schema& other) const; + struct NameIdMap { + /// \brief Mapping from canonical field name to ID + /// + /// \note Short names for maps and lists are included for any name that does not + /// conflict with a canonical name. For example, a list, 'l', of structs with field + /// 'x' will produce short name 'l.x' in addition to canonical name 'l.element.x'. + std::unordered_map> name_to_id; + + /// \brief Mapping from field ID to canonical name + /// + /// \note Canonical names, but not short names are set, for example + /// 'list.element.field' instead of 'list.field'. + std::unordered_map id_to_name; + }; + static Result>> InitIdToFieldMap(const Schema&); - static Result>> - InitNameToIdMap(const Schema&); + static Result InitNameIdMap(const Schema&); static Result>> InitLowerCaseNameToIdMap(const Schema&); static Result>> InitIdToPositionPath( const Schema&); const std::optional schema_id_; + /// Field IDs that uniquely identify rows in the table. + std::vector identifier_field_ids_; /// Mapping from field id to field. Lazy id_to_field_; /// Mapping from field name to field id. - Lazy name_to_id_; + Lazy name_id_map_; /// Mapping from lowercased field name to field id Lazy lowercase_name_to_id_; /// Mapping from field id to (nested) position path to access the field. 
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 28178b883..fef63efee 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -54,6 +54,7 @@ endfunction() add_iceberg_test(schema_test SOURCES + assign_id_visitor_test.cc name_mapping_test.cc partition_field_test.cc partition_spec_test.cc diff --git a/src/iceberg/test/assign_id_visitor_test.cc b/src/iceberg/test/assign_id_visitor_test.cc new file mode 100644 index 000000000..f9290d7f8 --- /dev/null +++ b/src/iceberg/test/assign_id_visitor_test.cc @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include + +#include +#include + +#include "iceberg/schema.h" +#include "iceberg/schema_field.h" +#include "iceberg/test/matchers.h" +#include "iceberg/type.h" +#include "iceberg/util/type_util.h" + +namespace iceberg { + +namespace { + +Schema CreateFlatSchema() { + return Schema({ + SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()), + SchemaField::MakeOptional(/*field_id=*/20, "name", iceberg::string()), + SchemaField::MakeOptional(/*field_id=*/30, "age", iceberg::int32()), + SchemaField::MakeRequired(/*field_id=*/40, "data", iceberg::float64()), + }); +} + +std::shared_ptr CreateListOfStruct() { + return std::make_shared(SchemaField::MakeOptional( + /*field_id=*/101, "element", + std::make_shared(std::vector{ + SchemaField::MakeOptional(/*field_id=*/102, "x", iceberg::int32()), + SchemaField::MakeRequired(/*field_id=*/103, "y", iceberg::string()), + }))); +} + +std::shared_ptr CreateMapWithStructValue() { + return std::make_shared( + SchemaField::MakeRequired(/*field_id=*/201, "key", iceberg::string()), + SchemaField::MakeRequired( + /*field_id=*/202, "value", + std::make_shared(std::vector{ + SchemaField::MakeRequired(/*field_id=*/203, "id", iceberg::int64()), + SchemaField::MakeOptional(/*field_id=*/204, "name", iceberg::string()), + }))); +} + +std::shared_ptr CreateNestedStruct() { + return std::make_shared(std::vector{ + SchemaField::MakeRequired(/*field_id=*/301, "outer_id", iceberg::int64()), + SchemaField::MakeRequired( + /*field_id=*/302, "nested", + std::make_shared(std::vector{ + SchemaField::MakeOptional(/*field_id=*/303, "inner_id", iceberg::int32()), + SchemaField::MakeRequired(/*field_id=*/304, "inner_name", + iceberg::string()), + })), + }); +} + +Schema CreateNestedSchema(std::vector identifier_field_ids = {}) { + return Schema( + { + SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()), + SchemaField::MakeOptional(/*field_id=*/20, "list", CreateListOfStruct()), + SchemaField::MakeOptional(/*field_id=*/30, 
"map", CreateMapWithStructValue()), + SchemaField::MakeRequired(/*field_id=*/40, "struct", CreateNestedStruct()), + }, + Schema::kInitialSchemaId, std::move(identifier_field_ids)); +} + +} // namespace + +TEST(AssignFreshIdVisitorTest, FlatSchema) { + Schema schema = CreateFlatSchema(); + + std::atomic id = 0; + auto next_id = [&id]() { return ++id; }; + ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema, + AssignFreshIds(Schema::kInitialSchemaId, schema, next_id)); + + ASSERT_EQ(fresh_schema->fields().size(), schema.fields().size()); + EXPECT_EQ(Schema( + { + SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()), + SchemaField::MakeOptional(/*field_id=*/2, "name", iceberg::string()), + SchemaField::MakeOptional(/*field_id=*/3, "age", iceberg::int32()), + SchemaField::MakeRequired(/*field_id=*/4, "data", iceberg::float64()), + }, + Schema::kInitialSchemaId), + *fresh_schema); +} + +TEST(AssignFreshIdVisitorTest, NestedSchema) { + Schema schema = CreateNestedSchema(); + std::atomic id = 0; + auto next_id = [&id]() { return ++id; }; + ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema, + AssignFreshIds(Schema::kInitialSchemaId, schema, next_id)); + + ASSERT_EQ(4, fresh_schema->fields().size()); + for (int32_t i = 0; i < fresh_schema->fields().size(); ++i) { + EXPECT_EQ(i + 1, fresh_schema->fields()[i].field_id()); + } + + auto list_field = fresh_schema->fields()[1]; + auto list_type = std::dynamic_pointer_cast(list_field.type()); + ASSERT_TRUE(list_type); + auto list_element_field = list_type->fields()[0]; + EXPECT_EQ(5, list_element_field.field_id()); + auto list_element_type = + std::dynamic_pointer_cast(list_element_field.type()); + ASSERT_TRUE(list_element_type); + EXPECT_EQ(StructType(std::vector{ + SchemaField::MakeOptional(/*field_id=*/6, "x", iceberg::int32()), + SchemaField::MakeRequired(/*field_id=*/7, "y", iceberg::string()), + }), + *list_element_type); + + auto map_field = fresh_schema->fields()[2]; + auto map_type = std::dynamic_pointer_cast(map_field.type()); 
+ ASSERT_TRUE(map_type); + EXPECT_EQ(8, map_type->fields()[0].field_id()); + auto map_value_field = map_type->fields()[1]; + EXPECT_EQ(9, map_value_field.field_id()); + auto map_value_type = std::dynamic_pointer_cast(map_value_field.type()); + ASSERT_TRUE(map_value_type); + EXPECT_EQ(StructType(std::vector{ + SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()), + SchemaField::MakeOptional(/*field_id=*/11, "name", iceberg::string()), + }), + *map_value_type); + + auto struct_field = fresh_schema->fields()[3]; + auto struct_type = std::dynamic_pointer_cast(struct_field.type()); + ASSERT_TRUE(struct_type); + + auto expect_nested_struct_type = std::make_shared(std::vector{ + SchemaField::MakeOptional(/*field_id=*/14, "inner_id", iceberg::int32()), + SchemaField::MakeRequired(/*field_id=*/15, "inner_name", iceberg::string()), + }); + EXPECT_EQ(StructType(std::vector{ + SchemaField::MakeRequired(/*field_id=*/12, "outer_id", iceberg::int64()), + SchemaField::MakeRequired( + /*field_id=*/13, "nested", expect_nested_struct_type)}), + *struct_type); + + auto nested_struct_field = struct_type->fields()[1]; + auto nested_struct_type = + std::dynamic_pointer_cast(nested_struct_field.type()); + ASSERT_TRUE(nested_struct_type); + EXPECT_EQ(*expect_nested_struct_type, *nested_struct_type); +} + +TEST(AssignFreshIdVisitorTest, RefreshIdentifierId) { + std::atomic id = 0; + auto next_id = [&id]() { return ++id; }; + + Schema invalid_schema = CreateNestedSchema({10, 400}); + // Invalid identifier field id + auto result = AssignFreshIds(Schema::kInitialSchemaId, invalid_schema, next_id); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + EXPECT_THAT(result, HasErrorMessage("Cannot find")); + + id = 0; + Schema schema = CreateNestedSchema({10, 301}); + ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema, + AssignFreshIds(Schema::kInitialSchemaId, schema, next_id)); + EXPECT_THAT(fresh_schema->IdentifierFieldIds(), testing::ElementsAre(1, 12)); +} + +} // namespace iceberg 
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index fcd397b9e..378182819 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -30,6 +30,7 @@ configure_file( iceberg_tests = { 'schema_test': { 'sources': files( + 'assign_id_visitor_test.cc', 'name_mapping_test.cc', 'partition_field_test.cc', 'partition_spec_test.cc', diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index 89a8d54b5..ff6bf060a 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -70,6 +70,21 @@ TEST(SchemaTest, Basics) { ASSERT_THAT(result, iceberg::HasErrorMessage("Invalid index -1 to get field from struct")); ASSERT_EQ(std::nullopt, schema.GetFieldByName("element")); + ASSERT_EQ(0, schema.IdentifierFieldIds().size()); + auto identifier_field_names = schema.IdentifierFieldNames(); + ASSERT_THAT(identifier_field_names, iceberg::IsOk()); + ASSERT_THAT(identifier_field_names.value(), ::testing::IsEmpty()); + } + + { + // identifier fields not empty + iceberg::SchemaField field1(5, "foo", iceberg::int32(), true); + iceberg::SchemaField field2(7, "bar", iceberg::string(), true); + iceberg::Schema schema({field1, field2}, 100, {5, 7}); + ASSERT_THAT(schema.IdentifierFieldIds(), testing::ElementsAre(5, 7)); + auto result = schema.IdentifierFieldNames(); + ASSERT_THAT(result, iceberg::IsOk()); + ASSERT_THAT(result.value(), testing::ElementsAre("foo", "bar")); } } @@ -82,6 +97,9 @@ TEST(SchemaTest, Equality) { iceberg::Schema schema3({field1}, 101); iceberg::Schema schema4({field3, field2}, 101); iceberg::Schema schema5({field1, field2}, 100); + iceberg::Schema schema6({field1, field2}, 100, {5}); + iceberg::Schema schema7({field1, field2}, 100, {5}); + iceberg::Schema schema8({field1, field2}, 100, {7}); ASSERT_EQ(schema1, schema1); ASSERT_NE(schema1, schema2); @@ -92,6 +110,10 @@ TEST(SchemaTest, Equality) { ASSERT_NE(schema4, schema1); ASSERT_EQ(schema1, schema5); ASSERT_EQ(schema5, 
schema1); + + ASSERT_NE(schema5, schema6); + ASSERT_EQ(schema6, schema7); + ASSERT_NE(schema6, schema8); } class BasicShortNameTest : public ::testing::Test { @@ -215,8 +237,8 @@ class ComplexShortNameTest : public ::testing::Test { field9_ = std::make_unique(9, "Map", maptype, false); - schema_ = - std::make_unique(std::vector{*field9_}, 1); + schema_ = std::make_unique( + std::vector{*field9_}, 1, std::vector{1, 2}); } std::unique_ptr schema_; @@ -245,6 +267,27 @@ TEST_F(ComplexShortNameTest, TestFindById) { ASSERT_THAT(schema_->FindFieldById(0), ::testing::Optional(std::nullopt)); } +TEST_F(ComplexShortNameTest, TestFindColumnNameById) { + ASSERT_THAT(schema_->FindColumnNameById(0), ::testing::Optional(std::nullopt)); + ASSERT_THAT(schema_->FindColumnNameById(1), + ::testing::Optional(std::string("Map.value.Second_child.element.Foo"))); + ASSERT_THAT(schema_->FindColumnNameById(2), + ::testing::Optional(std::string("Map.value.Second_child.element.Bar"))); + ASSERT_THAT(schema_->FindColumnNameById(3), + ::testing::Optional(std::string("Map.value.Second_child.element.Foobar"))); + ASSERT_THAT(schema_->FindColumnNameById(4), + ::testing::Optional(std::string("Map.value.Second_child.element"))); + ASSERT_THAT(schema_->FindColumnNameById(5), + ::testing::Optional(std::string("Map.value.First_child"))); + ASSERT_THAT(schema_->FindColumnNameById(6), + ::testing::Optional(std::string("Map.value.Second_child"))); + ASSERT_THAT(schema_->FindColumnNameById(7), + ::testing::Optional(std::string("Map.key"))); + ASSERT_THAT(schema_->FindColumnNameById(8), + ::testing::Optional(std::string("Map.value"))); + ASSERT_THAT(schema_->FindColumnNameById(9), ::testing::Optional(std::string("Map"))); +} + TEST_F(ComplexShortNameTest, TestFindByName) { ASSERT_THAT(schema_->FindFieldByName("Map"), ::testing::Optional(*field9_)); ASSERT_THAT(schema_->FindFieldByName("Map.value"), ::testing::Optional(*field8_)); @@ -315,6 +358,14 @@ TEST_F(ComplexShortNameTest, 
TestFindByShortNameCaseInsensitive) { ::testing::Optional(std::nullopt)); } +TEST_F(ComplexShortNameTest, TestIdentifierFieldNames) { + auto result = schema_->IdentifierFieldNames(); + ASSERT_THAT(result, iceberg::IsOk()); + ASSERT_THAT(result.value(), + ::testing::ElementsAre("Map.value.Second_child.element.Foo", + "Map.value.Second_child.element.Bar")); +} + class ComplexMapStructShortNameTest : public ::testing::Test { protected: void SetUp() override { diff --git a/src/iceberg/util/type_util.cc b/src/iceberg/util/type_util.cc index 016397f05..a6cfd645a 100644 --- a/src/iceberg/util/type_util.cc +++ b/src/iceberg/util/type_util.cc @@ -22,6 +22,7 @@ #include #include "iceberg/result.h" +#include "iceberg/schema.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter_internal.h" #include "iceberg/util/string_util.h" @@ -50,9 +51,11 @@ Status IdToFieldVisitor::Visit(const NestedType& type) { NameToIdVisitor::NameToIdVisitor( std::unordered_map>& name_to_id, - bool case_sensitive, std::function quoting_func) + std::unordered_map* id_to_name, bool case_sensitive, + std::function quoting_func) : case_sensitive_(case_sensitive), name_to_id_(name_to_id), + id_to_name_(id_to_name), quoting_func_(std::move(quoting_func)) {} Status NameToIdVisitor::Visit(const ListType& type, const std::string& path, @@ -140,6 +143,11 @@ std::string NameToIdVisitor::BuildPath(std::string_view prefix, } void NameToIdVisitor::Finish() { + if (id_to_name_) { + for (auto& [name, id] : name_to_id_) { + id_to_name_->try_emplace(id, name); + } + } for (auto&& it : short_name_to_id_) { name_to_id_.try_emplace(it.first, it.second); } @@ -294,4 +302,72 @@ std::unordered_map IndexParents(const StructType& root_struct) return id_to_parent; } +AssignFreshIdVisitor::AssignFreshIdVisitor(std::function next_id) + : next_id_(std::move(next_id)) {} + +std::shared_ptr AssignFreshIdVisitor::Visit( + const std::shared_ptr& type) const { + switch (type->type_id()) { + case TypeId::kStruct: + 
return Visit(*internal::checked_pointer_cast(type)); + case TypeId::kMap: + return Visit(*internal::checked_pointer_cast(type)); + case TypeId::kList: + return Visit(*internal::checked_pointer_cast(type)); + default: + return type; + } +} + +std::shared_ptr AssignFreshIdVisitor::Visit(const StructType& type) const { + auto fresh_ids = + type.fields() | + std::views::transform([&](const auto& /* unused */) { return next_id_(); }) | + std::ranges::to>(); + std::vector fresh_fields; + for (size_t i = 0; i < type.fields().size(); ++i) { + const auto& field = type.fields()[i]; + fresh_fields.emplace_back(fresh_ids[i], std::string(field.name()), + Visit(field.type()), field.optional(), + std::string(field.doc())); + } + return std::make_shared(std::move(fresh_fields)); +} + +std::shared_ptr AssignFreshIdVisitor::Visit(const ListType& type) const { + const auto& elem_field = type.fields()[0]; + int32_t fresh_id = next_id_(); + SchemaField fresh_elem_field(fresh_id, std::string(elem_field.name()), + Visit(elem_field.type()), elem_field.optional(), + std::string(elem_field.doc())); + return std::make_shared(std::move(fresh_elem_field)); +} + +std::shared_ptr AssignFreshIdVisitor::Visit(const MapType& type) const { + const auto& key_field = type.fields()[0]; + const auto& value_field = type.fields()[1]; + + int32_t fresh_key_id = next_id_(); + int32_t fresh_value_id = next_id_(); + + SchemaField fresh_key_field(fresh_key_id, std::string(key_field.name()), + Visit(key_field.type()), key_field.optional(), + std::string(key_field.doc())); + SchemaField fresh_value_field(fresh_value_id, std::string(value_field.name()), + Visit(value_field.type()), value_field.optional(), + std::string(value_field.doc())); + return std::make_shared(std::move(fresh_key_field), + std::move(fresh_value_field)); +} + +Result> AssignFreshIds(int32_t schema_id, const Schema& schema, + std::function next_id) { + auto fresh_type = AssignFreshIdVisitor(std::move(next_id)) + 
.Visit(internal::checked_cast(schema)); + std::vector fields = + fresh_type->fields() | std::ranges::to>(); + ICEBERG_ASSIGN_OR_RAISE(auto identifier_field_names, schema.IdentifierFieldNames()); + return Schema::Make(std::move(fields), schema_id, identifier_field_names); +} + } // namespace iceberg diff --git a/src/iceberg/util/type_util.h b/src/iceberg/util/type_util.h index 7cc274b0e..959bdb9f9 100644 --- a/src/iceberg/util/type_util.h +++ b/src/iceberg/util/type_util.h @@ -51,12 +51,13 @@ class IdToFieldVisitor { std::unordered_map>& id_to_field_; }; -/// \brief Visitor for building a map from field name to field ID. +/// \brief Visitor for building maps from field name to field ID and field ID to field +/// name. class NameToIdVisitor { public: explicit NameToIdVisitor( std::unordered_map>& name_to_id, - bool case_sensitive = true, + std::unordered_map* id_to_name, bool case_sensitive = true, std::function quoting_func = {}); Status Visit(const ListType& type, const std::string& path, const std::string& short_path); @@ -75,6 +76,7 @@ class NameToIdVisitor { private: bool case_sensitive_; std::unordered_map>& name_to_id_; + std::unordered_map* id_to_name_; std::unordered_map> short_name_to_id_; std::function quoting_func_; }; @@ -131,4 +133,27 @@ class PruneColumnVisitor { ICEBERG_EXPORT std::unordered_map IndexParents( const StructType& root_struct); +/// \brief Assigns fresh IDs to all fields in the schema. +class AssignFreshIdVisitor { + public: + explicit AssignFreshIdVisitor(std::function next_id); + + std::shared_ptr Visit(const std::shared_ptr& type) const; + std::shared_ptr Visit(const StructType& type) const; + std::shared_ptr Visit(const ListType& type) const; + std::shared_ptr Visit(const MapType& type) const; + + private: + std::function next_id_; +}; + +/// \brief Assigns fresh IDs to all fields in a schema. +/// +/// \param schema_id An ID assigned to this schema +/// \param schema The schema to assign IDs to. 
+/// \param next_id An id assignment function, which returns the next ID to assign. +/// \return A schema with new ids assigned by the next_id function. +ICEBERG_EXPORT Result> AssignFreshIds( + int32_t schema_id, const Schema& schema, std::function next_id); + } // namespace iceberg