From 46b9d876ed3a81d27a1019a74d024a2db10af19d Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:33:34 +0800 Subject: [PATCH 01/15] Range sizes definition (#438) In order to trigger range splits in a timely manner, the range size is tracked in memory. Init range sizes interface Update range size interface Reset range splitting status interface --- tx_service/include/cc/cc_map.h | 25 +++++++++ tx_service/include/cc/template_cc_map.h | 73 +++++++++++++++++++++++++ tx_service/include/type.h | 7 +++ tx_service/src/cc/cc_map.cpp | 54 ++++++++++++++++++ 4 files changed, 159 insertions(+) diff --git a/tx_service/include/cc/cc_map.h b/tx_service/include/cc/cc_map.h index 0d1434b6..9aaa8c58 100644 --- a/tx_service/include/cc/cc_map.h +++ b/tx_service/include/cc/cc_map.h @@ -21,10 +21,12 @@ */ #pragma once +#include #include #include #include // std::pair +#include "absl/container/flat_hash_map.h" #include "cc/cc_req_base.h" #include "cc_protocol.h" #include "error_messages.h" // CcErrorCode @@ -260,6 +262,20 @@ class CcMap virtual const txservice::KeySchema *KeySchema() const = 0; virtual const txservice::RecordSchema *RecordSchema() const = 0; + /** + * Called by FetchTableRangeSizeCc::Execute when async load completes. + * Merges loaded size with accumulated delta (second), or resets to + * kNotInitialized on failure. + * When emplace is true and partition_id is absent, inserts (partition_id, + * (0,0)) before merging; used for new ranges after split. + */ + bool InitRangeSize(uint32_t partition_id, + int32_t persisted_size, + bool succeed = true, + bool emplace = false); + + void ResetRangeStatus(uint32_t partition_id); + uint64_t SchemaTs() const { return schema_ts_; @@ -294,6 +310,15 @@ class CcMap uint64_t last_dirty_commit_ts_{0}; protected: + // Range id -> (range_size, delta_range_size). Only used when + // RangePartitioned. 
+ // - first: current range size; RangeSizeState::Loading (-1) = loading from + // store; RangeSizeState::Uninitialized (-2) = not yet loaded. + // - second: delta accumulated during load (first==-1) or split (first>=0). + // - third: True if a split task been triggered due to reaching a threshold. + absl::flat_hash_map> + range_sizes_; + /** * @brief After the input request is executed at the current shard, moves * the request to another shard for execution. diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index 77fb5be1..75a973f9 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -38,6 +38,7 @@ #include #include +#include "absl/container/flat_hash_map.h" #include "cc_entry.h" #include "cc_map.h" #include "cc_page_clean_guard.h" @@ -8771,6 +8772,10 @@ class TemplateCcMap : public CcMap } normal_obj_sz_ = 0; + if constexpr (RangePartitioned) + { + range_sizes_.clear(); + } ccmp_.clear(); } @@ -11914,6 +11919,74 @@ class TemplateCcMap : public CcMap return &pos_inf_page_; } + bool UpdateRangeSize(uint32_t partition_id, + int32_t delta_size, + bool is_dirty) + { + if constexpr (RangePartitioned) + { + auto it = range_sizes_.find(partition_id); + if (it == range_sizes_.end()) + { + it = range_sizes_ + .emplace(partition_id, + std::make_tuple( + static_cast( + RangeSizeStatus::kNotInitialized), + 0, + false)) + .first; + } + if (std::get<0>(it->second) == + static_cast(RangeSizeStatus::kNotInitialized) && + !is_dirty) + { + std::get<1>(it->second) += delta_size; + // Init the range size of this range. 
+ std::get<0>(it->second) = + static_cast(RangeSizeStatus::kLoading); + + int64_t ng_term = Sharder::Instance().LeaderTerm(cc_ng_id_); + shard_->FetchTableRangeSize(table_name_, + static_cast(partition_id), + cc_ng_id_, + ng_term); + return false; + } + + if (std::get<0>(it->second) == + static_cast(RangeSizeStatus::kLoading) || + is_dirty) + { + // Loading or split: record delta in delta part (.second). + std::get<1>(it->second) += delta_size; + } + else + { + int32_t new_range_size = std::get<0>(it->second) + delta_size; + std::get<0>(it->second) = + new_range_size > 0 ? new_range_size : 0; + + bool trigger_split = + !is_dirty && !std::get<2>(it->second) && + std::get<0>(it->second) >= + static_cast(StoreRange::range_max_size); + + DLOG_IF(INFO, trigger_split) + << "Range size is too large, need to split. table: " + << table_name_.StringView() + << " partition: " << partition_id + << " range size: " << std::get<0>(it->second) + << " range max size: " << StoreRange::range_max_size; + std::get<2>(it->second) = + trigger_split == true ? true : std::get<2>(it->second); + return trigger_split; + } + } // RangePartitioned + + return false; + } + absl::btree_map< KeyT, std::unique_ptr< diff --git a/tx_service/include/type.h b/tx_service/include/type.h index 2fe288c5..566e4171 100644 --- a/tx_service/include/type.h +++ b/tx_service/include/type.h @@ -167,6 +167,13 @@ enum class TableEngine : uint8_t InternalHash = 5, // eg. Sequence table is a kind of internal hash table. }; +// Status values for range_sizes_.first (range size not yet known). +enum RangeSizeStatus : int32_t +{ + kNotInitialized = -2, // Range size not yet initialized; need to fetch. + kLoading = -1, // Range size is being loaded; delta goes to .second. 
+}; + inline std::string KvTablePrefixOf(TableEngine engine) { switch (engine) diff --git a/tx_service/src/cc/cc_map.cpp b/tx_service/src/cc/cc_map.cpp index 52443b45..ede1962c 100644 --- a/tx_service/src/cc/cc_map.cpp +++ b/tx_service/src/cc/cc_map.cpp @@ -27,6 +27,7 @@ #include "cc/local_cc_shards.h" #include "cc_entry.h" #include "tx_trace.h" +#include "type.h" namespace txservice { @@ -461,4 +462,57 @@ void CcMap::DecrReadIntent(NonBlockingLock *lock, } } +bool CcMap::InitRangeSize(uint32_t partition_id, + int32_t persisted_size, + bool succeed, + bool emplace) +{ + auto it = range_sizes_.find(partition_id); + if (it == range_sizes_.end()) + { + if (!emplace) + { + return false; + } + it = range_sizes_.emplace(partition_id, std::make_tuple(0, 0, false)) + .first; + } + + if (succeed) + { + int32_t final_size = persisted_size + std::get<1>(it->second); + std::get<0>(it->second) = final_size < 0 ? 0 : final_size; + std::get<1>(it->second) = 0; + + bool trigger_split = + !std::get<2>(it->second) && + std::get<0>(it->second) >= + static_cast(StoreRange::range_max_size); + std::get<2>(it->second) = + trigger_split == true ? true : std::get<2>(it->second); + return trigger_split; + } + else + { + // Load range size failed; reset to not-initialized for retry. 
+ std::get<0>(it->second) = + static_cast(RangeSizeStatus::kNotInitialized); + } + return false; +} + +void CcMap::ResetRangeStatus(uint32_t partition_id) +{ + auto it = range_sizes_.find(partition_id); + if (it == range_sizes_.end()) + { + return; + } + std::get<2>(it->second) = false; + + DLOG(INFO) << "ResetRangeStatus: table: " << table_name_.StringView() + << " partition: " << partition_id + << " status: " << std::boolalpha << std::get<2>(it->second); +} + } // namespace txservice From 654db51d097561186844decffbe7e43e2ec85a32 Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:34:40 +0800 Subject: [PATCH 02/15] Update store range size after datasync (#439) 1. Update storage range size After a successful flush, the store range size is persisted. 2. Load range size from storage When accessing the memory range size, if it has not yet been initialized, a fetch operation from storage is performed to retrieve the range size. --- store_handler/bigtable_handler.cpp | 7 +++ store_handler/bigtable_handler.h | 3 + store_handler/data_store_service_client.cpp | 48 ++++++++++++++-- store_handler/data_store_service_client.h | 14 ++++- .../data_store_service_client_closure.cpp | 49 +++++++++++++++-- .../data_store_service_client_closure.h | 8 +++ store_handler/dynamo_handler.cpp | 6 ++ store_handler/dynamo_handler.h | 1 + store_handler/rocksdb_handler.cpp | 7 +++ store_handler/rocksdb_handler.h | 3 + tx_service/include/cc/cc_req_misc.h | 31 +++++++++++ tx_service/include/cc/cc_shard.h | 6 ++ tx_service/include/store/data_store_handler.h | 2 + tx_service/src/cc/cc_req_misc.cpp | 55 +++++++++++++++++++ tx_service/src/cc/cc_shard.cpp | 20 +++++++ 15 files changed, 250 insertions(+), 10 deletions(-) diff --git a/store_handler/bigtable_handler.cpp b/store_handler/bigtable_handler.cpp index 52a712de..172c321f 100644 --- a/store_handler/bigtable_handler.cpp +++ b/store_handler/bigtable_handler.cpp @@ -710,6 +710,13 @@ void 
EloqDS::BigTableHandler::FetchRangeSlices( fetch_cc)); } +void EloqDS::BigTableHandler::FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) +{ + LOG(ERROR) << "BigTableHandler::FetchTableRangeSize not implemented"; + assert(false); +} + void EloqDS::BigTableHandler::OnFetchRangeSlices( google::cloud::future>> f, diff --git a/store_handler/bigtable_handler.h b/store_handler/bigtable_handler.h index 10006bbe..e3ccd39c 100644 --- a/store_handler/bigtable_handler.h +++ b/store_handler/bigtable_handler.h @@ -82,6 +82,9 @@ class BigTableHandler : public txservice::store::DataStoreHandler void FetchRangeSlices(txservice::FetchRangeSlicesReq *fetch_cc) override; + void FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) override; + /** * @brief Read a row from base table or skindex table in datastore with * specified key. Caller should pass in complete primary key or skindex key. diff --git a/store_handler/data_store_service_client.cpp b/store_handler/data_store_service_client.cpp index 1c55d901..2fd359b8 100644 --- a/store_handler/data_store_service_client.cpp +++ b/store_handler/data_store_service_client.cpp @@ -1059,6 +1059,30 @@ void DataStoreServiceClient::FetchRangeSlices( &FetchRangeSlicesCallback); } +void DataStoreServiceClient::FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) +{ + txservice::TableName range_table_name(fetch_cc->table_name_->StringView(), + txservice::TableType::RangePartition, + fetch_cc->table_name_->Engine()); + + int32_t kv_partition_id = + KvPartitionIdOfRangeSlices(range_table_name, fetch_cc->partition_id_); + uint32_t shard_id = GetShardIdByPartitionId(kv_partition_id, false); + + auto catalog_factory = GetCatalogFactory(range_table_name.Engine()); + assert(catalog_factory != nullptr); + fetch_cc->kv_start_key_ = + EncodeRangeKey(catalog_factory, range_table_name, fetch_cc->start_key_); + + Read(kv_range_table_name, + kv_partition_id, + shard_id, + fetch_cc->kv_start_key_, + fetch_cc, + 
&FetchRangeSizeCallback); +} + /** * @brief Deletes data that is out of the specified range. * @@ -1275,16 +1299,19 @@ std::string DataStoreServiceClient::EncodeRangeKey( * @param range_version The version of the range. * @param version The general version number. * @param segment_cnt The number of segments in the range. + * @param range_size The size of the range. * @return Binary string containing the encoded range value. */ std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id, uint64_t range_version, uint64_t version, - uint32_t segment_cnt) + uint32_t segment_cnt, + int32_t range_size) { std::string kv_range_record; kv_range_record.reserve(sizeof(int32_t) + sizeof(uint64_t) + - sizeof(uint64_t) + sizeof(uint32_t)); + sizeof(uint64_t) + sizeof(uint32_t) + + sizeof(int32_t)); kv_range_record.append(reinterpret_cast(&range_id), sizeof(int32_t)); kv_range_record.append(reinterpret_cast(&range_version), @@ -1294,6 +1321,8 @@ std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id, // segment_cnt of slices kv_range_record.append(reinterpret_cast(&segment_cnt), sizeof(uint32_t)); + kv_range_record.append(reinterpret_cast(&range_size), + sizeof(int32_t)); return kv_range_record; } @@ -1361,6 +1390,7 @@ RangeSliceBatchPlan DataStoreServiceClient::PrepareRangeSliceBatches( RangeSliceBatchPlan plan; plan.segment_cnt = 0; plan.version = version; + plan.range_size = 0; // Estimate capacity based on slices size plan.segment_keys.reserve(slices.size() / 10 + 1); // Rough estimate @@ -1409,6 +1439,7 @@ RangeSliceBatchPlan DataStoreServiceClient::PrepareRangeSliceBatches( sizeof(uint32_t)); segment_record.append(slice_start_key.Data(), key_size); uint32_t slice_size = static_cast(slices[i]->Size()); + plan.range_size += static_cast(slice_size); segment_record.append(reinterpret_cast(&slice_size), sizeof(uint32_t)); } @@ -1574,6 +1605,7 @@ void DataStoreServiceClient::EnqueueRangeMetadataRecord( uint64_t range_version, uint64_t version, uint32_t 
segment_cnt, + int32_t range_size, RangeMetadataAccumulator &accumulator) { // Compute kv_table_name and kv_partition_id @@ -1584,8 +1616,8 @@ void DataStoreServiceClient::EnqueueRangeMetadataRecord( // Encode key and value std::string key_str = EncodeRangeKey(catalog_factory, table_name, range_start_key); - std::string rec_str = - EncodeRangeValue(partition_id, range_version, version, segment_cnt); + std::string rec_str = EncodeRangeValue( + partition_id, range_version, version, segment_cnt, range_size); // Get or create entry in accumulator auto key = std::make_pair(kv_table_name, kv_partition_id); @@ -1753,6 +1785,7 @@ bool DataStoreServiceClient::UpdateRangeSlices( req.range_slices_, req.partition_id_); uint32_t segment_cnt = slice_plan.segment_cnt; + int32_t range_size = slice_plan.range_size; int32_t kv_partition_id = KvPartitionIdOfRangeSlices(*req.table_name_, req.partition_id_); auto iter = slice_plans.find(kv_partition_id); @@ -1777,6 +1810,7 @@ bool DataStoreServiceClient::UpdateRangeSlices( req.range_version_, req.ckpt_ts_, segment_cnt, + range_size, meta_acc); } @@ -1978,6 +2012,7 @@ bool DataStoreServiceClient::UpdateRangeSlices( range_version, version, segment_cnt, + slice_plans[0].range_size, meta_acc); SyncConcurrentRequest *meta_sync_concurrent = @@ -2069,6 +2104,7 @@ bool DataStoreServiceClient::UpsertRanges( auto slice_plan = PrepareRangeSliceBatches( table_name, version, range.slices_, range.partition_id_); uint32_t segment_cnt = slice_plan.segment_cnt; + int32_t range_size = slice_plan.range_size; int32_t kv_partition_id = KvPartitionIdOfRangeSlices(table_name, range.partition_id_); @@ -2092,6 +2128,7 @@ bool DataStoreServiceClient::UpsertRanges( version, // range_version (using version for now) version, segment_cnt, + range_size, meta_acc); } @@ -4683,7 +4720,8 @@ bool DataStoreServiceClient::InitTableRanges( std::string key_str = EncodeRangeKey(catalog_factory, table_name, *neg_inf_key); - std::string rec_str = 
EncodeRangeValue(init_range_id, version, version, 0); + std::string rec_str = + EncodeRangeValue(init_range_id, version, version, 0, 0); keys.emplace_back(std::string_view(key_str.data(), key_str.size())); records.emplace_back(std::string_view(rec_str.data(), rec_str.size())); diff --git a/store_handler/data_store_service_client.h b/store_handler/data_store_service_client.h index 4d860174..fb877d1e 100644 --- a/store_handler/data_store_service_client.h +++ b/store_handler/data_store_service_client.h @@ -66,6 +66,7 @@ struct RangeSliceBatchPlan std::vector segment_keys; // Owned string buffers std::vector segment_records; // Owned string buffers size_t version; + int32_t range_size{0}; // Clear method for reuse void Clear() @@ -74,6 +75,7 @@ struct RangeSliceBatchPlan segment_keys.clear(); segment_records.clear(); version = 0; + range_size = 0; } }; @@ -278,6 +280,9 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler void FetchRangeSlices(txservice::FetchRangeSlicesReq *fetch_cc) override; + void FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) override; + bool DeleteOutOfRangeData( const txservice::TableName &table_name, int32_t partition_id, @@ -346,7 +351,8 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler std::string EncodeRangeValue(int32_t range_id, uint64_t range_version, uint64_t version, - uint32_t segment_cnt); + uint32_t segment_cnt, + int32_t range_size); std::string EncodeRangeSliceKey(const txservice::TableName &table_name, int32_t range_id, uint32_t segment_id); @@ -654,6 +660,7 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler uint64_t range_version, uint64_t version, uint32_t segment_cnt, + int32_t range_size, RangeMetadataAccumulator &accumulator); void DispatchRangeMetadataBatches( @@ -934,6 +941,11 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const 
remote::CommonResult &result); + + friend void FetchRangeSizeCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result); }; struct UpsertTableData diff --git a/store_handler/data_store_service_client_closure.cpp b/store_handler/data_store_service_client_closure.cpp index ab11ce5b..bdddbec3 100644 --- a/store_handler/data_store_service_client_closure.cpp +++ b/store_handler/data_store_service_client_closure.cpp @@ -811,8 +811,9 @@ void FetchTableRangesCallback(void *data, for (uint32_t i = 0; i < items_size; i++) { scan_next_closure->GetItem(i, key, value, ts, ttl); - assert(value.size() == (sizeof(int32_t) + sizeof(uint64_t) + - sizeof(uint64_t) + sizeof(uint32_t))); + assert(value.size() == + (sizeof(int32_t) + sizeof(uint64_t) + sizeof(uint64_t) + + sizeof(uint32_t) + sizeof(int32_t))); const char *buf = value.data(); int32_t partition_id = *(reinterpret_cast(buf)); buf += sizeof(partition_id); @@ -925,6 +926,45 @@ void FetchTableRangesCallback(void *data, } } +void FetchRangeSizeCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result) +{ + txservice::FetchTableRangeSizeCc *fetch_range_size_cc = + static_cast(data); + + if (result.error_code() == remote::DataStoreError::KEY_NOT_FOUND) + { + fetch_range_size_cc->store_range_size_ = 0; + fetch_range_size_cc->SetFinish( + static_cast(txservice::CcErrorCode::NO_ERROR)); + } + else if (result.error_code() != remote::DataStoreError::NO_ERROR) + { + LOG(ERROR) << "Fetch range size failed with error code: " + << result.error_code(); + fetch_range_size_cc->SetFinish( + static_cast(txservice::CcErrorCode::DATA_STORE_ERR)); + } + else + { + ReadClosure *read_closure = static_cast(closure); + std::string_view read_val = read_closure->Value(); + assert(read_closure->TableName() == kv_range_table_name); + assert(read_val.size() == + (sizeof(int32_t) + sizeof(uint64_t) + 
sizeof(uint64_t) + + sizeof(uint32_t) + sizeof(int32_t))); + const char *buf = read_val.data(); + buf += read_val.size() - sizeof(int32_t); + fetch_range_size_cc->store_range_size_ = + *reinterpret_cast(buf); + + fetch_range_size_cc->SetFinish( + static_cast(txservice::CcErrorCode::NO_ERROR)); + } +} + void FetchRangeSlicesCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, @@ -965,8 +1005,9 @@ void FetchRangeSlicesCallback(void *data, else { assert(read_closure->TableName() == kv_range_table_name); - assert(read_val.size() == (sizeof(int32_t) + sizeof(uint64_t) + - sizeof(uint64_t) + sizeof(uint32_t))); + assert(read_val.size() == + (sizeof(int32_t) + sizeof(uint64_t) + sizeof(uint64_t) + + sizeof(uint32_t) + sizeof(int32_t))); const char *buf = read_val.data(); int32_t range_partition_id = *(reinterpret_cast(buf)); diff --git a/store_handler/data_store_service_client_closure.h b/store_handler/data_store_service_client_closure.h index 4bb72373..b8c3813c 100644 --- a/store_handler/data_store_service_client_closure.h +++ b/store_handler/data_store_service_client_closure.h @@ -3102,6 +3102,14 @@ void FetchTableRangesCallback(void *data, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching range size from table_ranges. + */ +void FetchRangeSizeCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result); + /** * Callback for fetching range slices. 
* diff --git a/store_handler/dynamo_handler.cpp b/store_handler/dynamo_handler.cpp index 0aa7ef78..5bfa9029 100644 --- a/store_handler/dynamo_handler.cpp +++ b/store_handler/dynamo_handler.cpp @@ -2534,6 +2534,12 @@ void EloqDS::DynamoHandler::FetchRangeSlices(FetchRangeSlicesReq *fetch_cc) assert(false); } +void EloqDS::DynamoHandler::FetchTableRangeSize(FetchTableRangeSizeCc *fetch_cc) +{ + LOG(ERROR) << "DynamoHandler::FetchTableRangeSize not implemented"; + assert(false); +} + void EloqDS::DynamoHandler::OnFetchRangeSlices( const Aws::DynamoDB::DynamoDBClient *client, const Aws::DynamoDB::Model::GetItemRequest &request, diff --git a/store_handler/dynamo_handler.h b/store_handler/dynamo_handler.h index f2fc9ba5..704200e6 100644 --- a/store_handler/dynamo_handler.h +++ b/store_handler/dynamo_handler.h @@ -158,6 +158,7 @@ class DynamoHandler : public txservice::store::DataStoreHandler //-- range partition void FetchTableRanges(FetchTableRangesCc *fetch_cc) override; void FetchRangeSlices(FetchRangeSlicesReq *fetch_cc) override; + void FetchTableRangeSize(FetchTableRangeSizeCc *fetch_cc) override; bool DeleteOutOfRangeData( const txservice::TableName &table_name, diff --git a/store_handler/rocksdb_handler.cpp b/store_handler/rocksdb_handler.cpp index e741748b..47c039aa 100644 --- a/store_handler/rocksdb_handler.cpp +++ b/store_handler/rocksdb_handler.cpp @@ -1128,6 +1128,13 @@ void RocksDBHandler::FetchRangeSlices(txservice::FetchRangeSlicesReq *fetch_cc) assert(false); } +void RocksDBHandler::FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) +{ + LOG(ERROR) << "RocksDBHandler::FetchTableRangeSize not implemented"; + assert(false); +} + bool DeleteOutOfRangeDataInternal(std::string delete_from_partition_sql, int32_t partition_id, const txservice::TxKey *start_k) diff --git a/store_handler/rocksdb_handler.h b/store_handler/rocksdb_handler.h index c8717a49..8742b064 100644 --- a/store_handler/rocksdb_handler.h +++ b/store_handler/rocksdb_handler.h @@ 
-346,6 +346,9 @@ class RocksDBHandler : public txservice::store::DataStoreHandler void FetchRangeSlices(txservice::FetchRangeSlicesReq *fetch_cc) override; + void FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) override; + bool DeleteOutOfRangeDataInternal(std::string delete_from_partition_sql, int32_t partition_id, const txservice::TxKey *start_k); diff --git a/tx_service/include/cc/cc_req_misc.h b/tx_service/include/cc/cc_req_misc.h index 2c1807dd..27b290c7 100644 --- a/tx_service/include/cc/cc_req_misc.h +++ b/tx_service/include/cc/cc_req_misc.h @@ -1157,4 +1157,35 @@ struct ShardCleanCc : public CcRequestBase private: size_t free_count_{0}; }; + +struct FetchTableRangeSizeCc : public CcRequestBase +{ +public: + FetchTableRangeSizeCc() = default; + ~FetchTableRangeSizeCc() = default; + + void Reset(const TableName &table_name, + int32_t partition_id, + const TxKey &start_key, + CcShard *ccs, + NodeGroupId ng_id, + int64_t ng_term); + + bool ValidTermCheck(); + bool Execute(CcShard &ccs) override; + void SetFinish(uint32_t error); + + const TableName *table_name_; + int32_t partition_id_{0}; + TxKey start_key_{}; + NodeGroupId node_group_id_{0}; + int64_t node_group_term_{-1}; + CcShard *ccs_{nullptr}; + + uint32_t error_code_{0}; + int32_t store_range_size_{0}; + + // Only used in DataStoreHandler + std::string kv_start_key_; +}; } // namespace txservice diff --git a/tx_service/include/cc/cc_shard.h b/tx_service/include/cc/cc_shard.h index 09e4081d..8927cfdd 100644 --- a/tx_service/include/cc/cc_shard.h +++ b/tx_service/include/cc/cc_shard.h @@ -315,6 +315,11 @@ class CcShard */ CcMap *GetCcm(const TableName &table_name, uint32_t node_group); + void FetchTableRangeSize(const TableName &table_name, + int32_t partition_id, + NodeGroupId cc_ng_id, + int64_t cc_ng_term); + void AdjustDataKeyStats(const TableName &table_name, int64_t size_delta, int64_t dirty_delta); @@ -1222,6 +1227,7 @@ class CcShard CcRequestPool fill_store_slice_cc_pool_; 
CcRequestPool init_key_cache_cc_pool_; + CcRequestPool fetch_range_size_cc_pool_; // CcRequest queue on this shard/core. moodycamel::ConcurrentQueue cc_queue_; diff --git a/tx_service/include/store/data_store_handler.h b/tx_service/include/store/data_store_handler.h index d0ca96d8..4059431a 100644 --- a/tx_service/include/store/data_store_handler.h +++ b/tx_service/include/store/data_store_handler.h @@ -135,6 +135,8 @@ class DataStoreHandler virtual void FetchRangeSlices(FetchRangeSlicesReq *fetch_cc) = 0; + virtual void FetchTableRangeSize(FetchTableRangeSizeCc *fetch_cc) = 0; + /** * @brief Read a row from base table or skindex table in datastore with * specified key. Caller should pass in complete primary key or skindex key. diff --git a/tx_service/src/cc/cc_req_misc.cpp b/tx_service/src/cc/cc_req_misc.cpp index eae335c7..014795cd 100644 --- a/tx_service/src/cc/cc_req_misc.cpp +++ b/tx_service/src/cc/cc_req_misc.cpp @@ -1535,4 +1535,59 @@ bool ShardCleanCc::Execute(CcShard &ccs) } } +void FetchTableRangeSizeCc::Reset(const TableName &table_name, + int32_t partition_id, + const TxKey &start_key, + CcShard *ccs, + NodeGroupId ng_id, + int64_t ng_term) +{ + table_name_ = &table_name; + partition_id_ = partition_id; + start_key_ = start_key.GetShallowCopy(); + node_group_id_ = ng_id; + node_group_term_ = ng_term; + ccs_ = ccs; + error_code_ = 0; + store_range_size_ = 0; +} + +bool FetchTableRangeSizeCc::ValidTermCheck() +{ + int64_t ng_leader_term = Sharder::Instance().LeaderTerm(node_group_id_); + return ng_leader_term == node_group_term_; +} + +bool FetchTableRangeSizeCc::Execute(CcShard &ccs) +{ + if (!ValidTermCheck()) + { + error_code_ = static_cast(CcErrorCode::NG_TERM_CHANGED); + } + + bool succ = (error_code_ == 0); + CcMap *ccm = ccs.GetCcm(*table_name_, node_group_id_); + assert(ccm != nullptr); + bool need_split = ccm->InitRangeSize( + static_cast(partition_id_), store_range_size_, succ); + + if (need_split) + { + uint64_t data_sync_ts = 
ccs.local_shards_.ClockTs(); + ccs.CreateSplitRangeDataSyncTask(*table_name_, + node_group_id_, + node_group_term_, + partition_id_, + data_sync_ts); + } + + return true; +} + +void FetchTableRangeSizeCc::SetFinish(uint32_t error) +{ + error_code_ = error; + ccs_->Enqueue(this); +} + } // namespace txservice diff --git a/tx_service/src/cc/cc_shard.cpp b/tx_service/src/cc/cc_shard.cpp index 2036d569..b9c8f8e4 100644 --- a/tx_service/src/cc/cc_shard.cpp +++ b/tx_service/src/cc/cc_shard.cpp @@ -398,6 +398,26 @@ CcMap *CcShard::GetCcm(const TableName &table_name, uint32_t node_group) } } +void CcShard::FetchTableRangeSize(const TableName &table_name, + int32_t partition_id, + NodeGroupId cc_ng_id, + int64_t cc_ng_term) +{ + FetchTableRangeSizeCc *fetch_cc = fetch_range_size_cc_pool_.NextRequest(); + + const TableName range_table_name(table_name.StringView(), + TableType::RangePartition, + table_name.Engine()); + const TableRangeEntry *range_entry = + GetTableRangeEntry(range_table_name, cc_ng_id, partition_id); + assert(range_entry != nullptr); + TxKey start_key = range_entry->GetRangeInfo()->StartTxKey(); + + fetch_cc->Reset( + table_name, partition_id, start_key, this, cc_ng_id, cc_ng_term); + local_shards_.store_hd_->FetchTableRangeSize(fetch_cc); +} + void CcShard::AdjustDataKeyStats(const TableName &table_name, int64_t size_delta, int64_t dirty_delta) From 1093ffceb75cff210acf9ec47c1630c2af22a07d Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:35:19 +0800 Subject: [PATCH 03/15] Maintaining range size on postwritecc (#440) 1. During a post-write operation, the range size information for the corresponding partition is maintained. 2. If a range is in the process of splitting, the range size is updated in the delta size information. 3. For a "double-write" operation, only the range size information for the newly split partition is updated. 
--- tx_service/include/cc/cc_handler.h | 4 +- tx_service/include/cc/cc_request.h | 42 ++++++++++-- tx_service/include/cc/local_cc_handler.h | 4 +- tx_service/include/cc/template_cc_map.h | 38 +++++++++++ tx_service/include/proto/cc_request.proto | 6 ++ tx_service/include/read_write_entry.h | 15 ++++- tx_service/include/remote/remote_cc_handler.h | 4 +- tx_service/src/cc/local_cc_handler.cpp | 12 +++- tx_service/src/remote/remote_cc_handler.cpp | 6 +- tx_service/src/remote/remote_cc_request.cpp | 4 +- tx_service/src/tx_execution.cpp | 25 +++++-- tx_service/src/tx_operation.cpp | 66 ++++++++++++++----- 12 files changed, 190 insertions(+), 36 deletions(-) diff --git a/tx_service/include/cc/cc_handler.h b/tx_service/include/cc/cc_handler.h index 3d4640b8..cad6db33 100644 --- a/tx_service/include/cc/cc_handler.h +++ b/tx_service/include/cc/cc_handler.h @@ -166,7 +166,9 @@ class CcHandler const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres) = 0; + CcHandlerResult &hres, + int32_t partition_id = -1, + bool on_dirty_range = false) = 0; /** * @briefPost-processes a read/scan key. 
Post-processing clears the read diff --git a/tx_service/include/cc/cc_request.h b/tx_service/include/cc/cc_request.h index 97e93fae..056b1ffb 100644 --- a/tx_service/include/cc/cc_request.h +++ b/tx_service/include/cc/cc_request.h @@ -740,7 +740,9 @@ struct PostWriteCc : public TemplatedCcRequest const TxRecord *rec, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult *res) + CcHandlerResult *res, + int32_t partition_id = -1, + bool on_dirty_range = false) { TemplatedCcRequest::Reset( nullptr, res, addr->NodeGroupId(), tx_number, tx_term); @@ -754,6 +756,8 @@ struct PostWriteCc : public TemplatedCcRequest is_remote_ = false; ccm_ = nullptr; is_initial_insert_ = false; + partition_id_ = partition_id; + on_dirty_range_ = on_dirty_range; } void Reset(const TxKey *key, @@ -767,7 +771,9 @@ struct PostWriteCc : public TemplatedCcRequest uint32_t key_shard_code, CcHandlerResult *res, bool initial_insertion = false, - int64_t ng_term = INIT_TERM) + int64_t ng_term = INIT_TERM, + int32_t partition_id = -1, + bool on_dirty_range = false) { TemplatedCcRequest::Reset( &table_name, @@ -788,6 +794,8 @@ struct PostWriteCc : public TemplatedCcRequest is_remote_ = false; ccm_ = nullptr; is_initial_insert_ = initial_insertion; + partition_id_ = partition_id; + on_dirty_range_ = on_dirty_range; } void Reset(const CcEntryAddr *addr, @@ -797,7 +805,9 @@ struct PostWriteCc : public TemplatedCcRequest const std::string *rec, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult *res) + CcHandlerResult *res, + int32_t partition_id = -1, + bool on_dirty_range = false) { TemplatedCcRequest::Reset( nullptr, res, addr->NodeGroupId(), tx_number, tx_term); @@ -811,6 +821,8 @@ struct PostWriteCc : public TemplatedCcRequest is_remote_ = true; ccm_ = nullptr; is_initial_insert_ = false; + partition_id_ = partition_id; + on_dirty_range_ = on_dirty_range; } void Reset(const TableName *table_name, @@ -824,7 +836,9 @@ struct PostWriteCc : public 
TemplatedCcRequest uint32_t key_shard_code, CcHandlerResult *res, bool initial_insertion = false, - int64_t ng_term = INIT_TERM) + int64_t ng_term = INIT_TERM, + int32_t partition_id = -1, + bool on_dirty_range = false) { TemplatedCcRequest::Reset( table_name, @@ -845,6 +859,8 @@ struct PostWriteCc : public TemplatedCcRequest is_remote_ = true; ccm_ = nullptr; is_initial_insert_ = initial_insertion; + partition_id_ = partition_id; + on_dirty_range_ = on_dirty_range; } const CcEntryAddr *CceAddr() const @@ -877,6 +893,11 @@ struct PostWriteCc : public TemplatedCcRequest return key_shard_code_; } + int32_t PartitionId() const + { + return partition_id_; + } + const void *Key() const { return is_remote_ ? nullptr : key_; @@ -892,6 +913,16 @@ struct PostWriteCc : public TemplatedCcRequest return is_initial_insert_; } + bool OnDirtyRange() const + { + return on_dirty_range_; + } + + bool NeedUpdateRangeSize() const + { + return partition_id_ >= 0; + } + private: const CcEntryAddr *cce_addr_; uint64_t commit_ts_; @@ -909,6 +940,9 @@ struct PostWriteCc : public TemplatedCcRequest const void *key_; const std::string *key_str_; }; + int32_t partition_id_{-1}; + // True if the key is located in a splitting range. 
+ bool on_dirty_range_{false}; }; struct PostWriteAllCc diff --git a/tx_service/include/cc/local_cc_handler.h b/tx_service/include/cc/local_cc_handler.h index eae6ba46..8e0fb115 100644 --- a/tx_service/include/cc/local_cc_handler.h +++ b/tx_service/include/cc/local_cc_handler.h @@ -103,7 +103,9 @@ class LocalCcHandler : public CcHandler const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres) override; + CcHandlerResult &hres, + int32_t partition_id = -1, + bool on_dirty_range = false) override; CcReqStatus PostRead( uint64_t tx_number, diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index 75a973f9..2f134cc9 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -592,6 +592,8 @@ class TemplateCcMap : public CcMap cce->ArchiveBeforeUpdate(); } + [[maybe_unused]] const size_t old_payload_size = + cce->PayloadSize(); if (is_del) { cce->payload_.SetCurrentPayload(nullptr); @@ -613,6 +615,42 @@ class TemplateCcMap : public CcMap bool was_dirty = cce->IsDirty(); cce->SetCommitTsPayloadStatus(commit_ts, new_status); + if constexpr (RangePartitioned) + { + if (req.NeedUpdateRangeSize()) + { + const int64_t key_delta_size = + (new_status == RecordStatus::Deleted) + ? (-static_cast(write_key->Size() + + old_payload_size)) + : (cce_old_status != RecordStatus::Normal + ? static_cast( + write_key->Size() + + cce->PayloadSize()) + : static_cast( + cce->PayloadSize() - + old_payload_size)); + const uint32_t range_id = req.PartitionId(); + // is_dirty: true when range is splitting. + bool need_split = UpdateRangeSize( + range_id, + static_cast(key_delta_size), + req.OnDirtyRange()); + + if (need_split) + { + assert(!req.OnDirtyRange()); + // Create a data sync task for the range. 
+ shard_->CreateSplitRangeDataSyncTask( + table_name_, + cc_ng_id_, + cce_addr->Term(), + range_id, + commit_ts); + } + } + } + if (req.IsInitialInsert()) { // Updates the ckpt ts after commit ts is set. diff --git a/tx_service/include/proto/cc_request.proto b/tx_service/include/proto/cc_request.proto index 889d8259..71909157 100644 --- a/tx_service/include/proto/cc_request.proto +++ b/tx_service/include/proto/cc_request.proto @@ -176,6 +176,10 @@ message UploadBatchRequest bytes commit_ts = 9; bytes rec_status = 10; UploadBatchKind kind = 11; + // Target range partition; + int32 partition_id = 12; + // Per-key one byte: [uint8_t, ...] + bytes range_size_flags = 13; } message UploadBatchSlicesRequest @@ -920,6 +924,8 @@ message PostCommitRequest { bytes record = 5; uint32 operation_type = 6; uint32 key_shard_code = 7; + int32 partition_id = 8; + bool on_dirty_range = 9; } message ForwardPostCommitRequest { diff --git a/tx_service/include/read_write_entry.h b/tx_service/include/read_write_entry.h index 4d86c34c..36463be1 100644 --- a/tx_service/include/read_write_entry.h +++ b/tx_service/include/read_write_entry.h @@ -49,17 +49,25 @@ struct WriteSetEntry op_(other.op_), cce_addr_(other.cce_addr_), key_shard_code_(other.key_shard_code_), - forward_addr_(std::move(other.forward_addr_)) + partition_id_(other.partition_id_), + forward_addr_(std::move(other.forward_addr_)), + on_dirty_range_(other.on_dirty_range_) { } WriteSetEntry &operator=(WriteSetEntry &&other) noexcept { + if (this == &other) + { + return *this; + } rec_ = std::move(other.rec_); op_ = other.op_; cce_addr_ = other.cce_addr_; key_shard_code_ = other.key_shard_code_; + partition_id_ = other.partition_id_; forward_addr_ = std::move(other.forward_addr_); + on_dirty_range_ = other.on_dirty_range_; return *this; } @@ -68,8 +76,11 @@ struct WriteSetEntry OperationType op_; CcEntryAddr cce_addr_; uint32_t key_shard_code_{}; + int32_t partition_id_{-1}; // Used in double write scenarios during online DDL. 
- std::unordered_map forward_addr_; + // key shard code -> (partition id, cce addr) + std::unordered_map> forward_addr_; + bool on_dirty_range_{false}; }; /** diff --git a/tx_service/include/remote/remote_cc_handler.h b/tx_service/include/remote/remote_cc_handler.h index 83695f21..b7c43cdd 100644 --- a/tx_service/include/remote/remote_cc_handler.h +++ b/tx_service/include/remote/remote_cc_handler.h @@ -84,7 +84,9 @@ class RemoteCcHandler const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres); + CcHandlerResult &hres, + int32_t partition_id = -1, + bool on_dirty_range = false); void PostWriteAll(uint32_t src_node_id, const TableName &table_name, diff --git a/tx_service/src/cc/local_cc_handler.cpp b/tx_service/src/cc/local_cc_handler.cpp index 9dd7962d..4a582dc1 100644 --- a/tx_service/src/cc/local_cc_handler.cpp +++ b/tx_service/src/cc/local_cc_handler.cpp @@ -274,7 +274,9 @@ txservice::CcReqStatus txservice::LocalCcHandler::PostWrite( const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres) + CcHandlerResult &hres, + int32_t partition_id, + bool on_dirty_range) { uint32_t ng_id = cce_addr.NodeGroupId(); uint32_t dest_node_id = Sharder::Instance().LeaderNodeId(ng_id); @@ -293,7 +295,9 @@ txservice::CcReqStatus txservice::LocalCcHandler::PostWrite( record, operation_type, key_shard_code, - &hres); + &hres, + partition_id, + on_dirty_range); TX_TRACE_ACTION(this, req); TX_TRACE_DUMP(req); cc_shards_.EnqueueCcRequest(thd_id_, cce_addr.CoreId(), req); @@ -312,7 +316,9 @@ txservice::CcReqStatus txservice::LocalCcHandler::PostWrite( record, operation_type, key_shard_code, - hres); + hres, + partition_id, + on_dirty_range); } return req_status; } diff --git a/tx_service/src/remote/remote_cc_handler.cpp b/tx_service/src/remote/remote_cc_handler.cpp index 848ae8f7..eb9952bf 100644 --- a/tx_service/src/remote/remote_cc_handler.cpp +++ b/tx_service/src/remote/remote_cc_handler.cpp 
@@ -159,7 +159,9 @@ void txservice::remote::RemoteCcHandler::PostWrite( const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres) + CcHandlerResult &hres, + int32_t partition_id, + bool on_dirty_range) { CcMessage send_msg; @@ -194,6 +196,8 @@ void txservice::remote::RemoteCcHandler::PostWrite( post_commit->set_commit_ts(commit_ts); post_commit->set_operation_type(static_cast(operation_type)); post_commit->set_key_shard_code(key_shard_code); + post_commit->set_partition_id(partition_id); + post_commit->set_on_dirty_range(on_dirty_range); stream_sender_.SendMessageToNg(cce_addr.NodeGroupId(), send_msg, &hres); } diff --git a/tx_service/src/remote/remote_cc_request.cpp b/tx_service/src/remote/remote_cc_request.cpp index 32fbb935..17ebab49 100644 --- a/tx_service/src/remote/remote_cc_request.cpp +++ b/tx_service/src/remote/remote_cc_request.cpp @@ -594,7 +594,9 @@ void txservice::remote::RemotePostWrite::Reset( rec_str, static_cast(post_commit.operation_type()), post_commit.key_shard_code(), - &cc_res_); + &cc_res_, + post_commit.partition_id(), + post_commit.on_dirty_range()); } else { diff --git a/tx_service/src/tx_execution.cpp b/tx_service/src/tx_execution.cpp index 6e80dbba..65abedc5 100644 --- a/tx_service/src/tx_execution.cpp +++ b/tx_service/src/tx_execution.cpp @@ -4611,12 +4611,17 @@ bool TransactionExecution::FillDataLogRequest(WriteToLogOp &write_log) // ngs, write log for both ngs. 
uint32_t forward_ng_id = Sharder::Instance().ShardToCcNodeGroup(forward_shard_code); - auto table_rec_it = ng_table_set.try_emplace(forward_ng_id); + auto [table_rec_it, inserted] = + ng_table_set.try_emplace(forward_ng_id); + if (!inserted) + { + continue; + } std::unordered_map< TableName, std::vector< std::pair>> - &table_rec_set = table_rec_it.first->second.second; + &table_rec_set = table_rec_it->second.second; auto rec_vec_it = table_rec_set.emplace( std::piecewise_construct, @@ -5288,6 +5293,7 @@ void TransactionExecution::Process(PostProcessOp &post_process) { for (const auto &[key, write_entry] : pair.second) { + bool on_dirty_range = write_entry.on_dirty_range_; CcReqStatus ret = cc_handler_->PostWrite(tx_number, tx_term_, @@ -5297,10 +5303,12 @@ void TransactionExecution::Process(PostProcessOp &post_process) write_entry.rec_.get(), write_entry.op_, write_entry.key_shard_code_, - post_process.hd_result_); + post_process.hd_result_, + write_entry.partition_id_, + on_dirty_range); update_post_cnt(ret); - for (auto &[forward_shard_code, cce_addr] : + for (auto &[forward_shard_code, forward_pair] : write_entry.forward_addr_) { CcReqStatus ret = @@ -5308,11 +5316,13 @@ void TransactionExecution::Process(PostProcessOp &post_process) tx_term_, command_id, commit_ts_, - cce_addr, + forward_pair.second, write_entry.rec_.get(), write_entry.op_, forward_shard_code, - post_process.hd_result_); + post_process.hd_result_, + forward_pair.first, + on_dirty_range); update_post_cnt(ret); } } @@ -5394,9 +5404,10 @@ void TransactionExecution::Process(PostProcessOp &post_process) // Keys that were not successfully locked in the cc // map do not need post-processing. 
- for (const auto &[forward_shard_code, cce_addr] : + for (const auto &[forward_shard_code, forward_pair] : write_entry.forward_addr_) { + const CcEntryAddr &cce_addr = forward_pair.second; if (cce_addr.Term() >= 0) { assert(!cce_addr.Empty()); diff --git a/tx_service/src/tx_operation.cpp b/tx_service/src/tx_operation.cpp index 926ff090..218fdd74 100644 --- a/tx_service/src/tx_operation.cpp +++ b/tx_service/src/tx_operation.cpp @@ -464,19 +464,20 @@ void AcquireWriteOperation::AggregateAcquiredKeys(TransactionExecution *txm) } } - for (auto &[forward_shard_code, cce_addr] : write_entry->forward_addr_) + for (auto &[forward_shard_code, forward_pair] : + write_entry->forward_addr_) { AcquireKeyResult &acquire_key_res = acquire_key_vec[res_idx++]; CcEntryAddr &addr = acquire_key_res.cce_addr_; term = addr.Term(); if (term < 0) { - cce_addr.SetCceLock(0, -1, 0); + forward_pair.second.SetCceLock(0, -1, 0); } else if (acquire_key_res.commit_ts_ == 0) { // acqurie write failed on forward addr. - cce_addr.SetCceLock(0, -1, 0); + forward_pair.second.SetCceLock(0, -1, 0); // Set term to -1 so that post write will not be sent to this // addr. addr.SetTerm(-1); @@ -485,7 +486,7 @@ void AcquireWriteOperation::AggregateAcquiredKeys(TransactionExecution *txm) { // Assigns to the write entry the cc entry address obtained // in the acquire phase. 
- cce_addr = addr; + forward_pair.second = addr; } // No need to dedup forwarded req since they are not visible to read @@ -720,17 +721,23 @@ void LockWriteRangeBucketsOp::Advance(TransactionExecution *txm) size_t new_range_idx = 0; auto *range_info = txm->range_rec_.GetRangeInfo(); + int32_t range_id = range_info->PartitionId(); + uint32_t residual = static_cast(range_id & 0x3FF); + bool on_dirty_range = range_info->IsDirty(); while (write_key_it_ != next_range_start) { const TxKey &write_tx_key = write_key_it_->first; WriteSetEntry &write_entry = write_key_it_->second; - size_t hash = write_tx_key.Hash(); - write_entry.key_shard_code_ = (range_ng << 10) | (hash & 0x3FF); + write_entry.key_shard_code_ = (range_ng << 10) | residual; + write_entry.partition_id_ = range_id; + write_entry.on_dirty_range_ = on_dirty_range; // If current range is migrating, forward to new range owner. if (new_bucket_ng != UINT32_MAX) { - write_entry.forward_addr_.try_emplace((new_bucket_ng << 10) | - (hash & 0x3FF)); + assert(new_bucket_ng != range_ng); + write_entry.forward_addr_.try_emplace( + ((new_bucket_ng << 10) | residual), + std::make_pair(range_id, CcEntryAddr())); } // If range is splitting and the key will fall on a new range after @@ -748,18 +755,47 @@ void LockWriteRangeBucketsOp::Advance(TransactionExecution *txm) } if (new_range_ng != UINT32_MAX) { - if (new_range_ng != range_ng) + int32_t new_range_id = + range_info->NewPartitionId()->at(new_range_idx - 1); + uint32_t new_residual = + static_cast(new_range_id & 0x3FF); + uint16_t core_cnt = + Sharder::Instance().GetLocalCcShards()->Count(); + uint16_t new_range_shard = + static_cast(new_residual % core_cnt); + uint16_t range_shard = + static_cast(residual % core_cnt); + if (new_range_ng != range_ng || new_range_shard != range_shard) + { + write_entry.forward_addr_.try_emplace( + ((new_range_ng << 10) | new_residual), + std::make_pair(new_range_id, CcEntryAddr())); + // There is no need to update the range size of the old 
+ // range. + write_entry.partition_id_ = -1; + } + else if (new_range_ng == range_ng && + new_range_shard == range_shard) { - write_entry.forward_addr_.try_emplace((new_range_ng << 10) | - (hash & 0x3FF)); + // Only update the range size on the new range id in case of + // the new range and the old range are located on the same + // shard. + write_entry.partition_id_ = new_range_id; } + // If the new range is migrating, forward to the new owner of // new range. - if (new_range_new_bucket_ng != UINT32_MAX && - new_range_new_bucket_ng != range_ng) + // TODO(ysw): double check the logic here. + if (new_range_new_bucket_ng != UINT32_MAX) { - write_entry.forward_addr_.try_emplace( - (new_range_new_bucket_ng << 10) | (hash & 0x3FF)); + assert(new_range_new_bucket_ng != new_range_ng); + if (new_range_new_bucket_ng != range_ng || + new_range_shard != range_shard) + { + write_entry.forward_addr_.try_emplace( + ((new_range_new_bucket_ng << 10) | new_residual), + std::make_pair(new_range_id, CcEntryAddr())); + } } } From bc562be63888759e0e2a1a54a1195080810205c6 Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:36:01 +0800 Subject: [PATCH 04/15] Reset range size after range split (#441) 1. In the post-commit phase of a range split transaction, the range size of all related partitions is updated: base range size + delta size. 2. Reset the range splitting flag. 3. Update the kickoutcc process to accommodate the new key sharding logic. 4. Update the processing procedure for SampleSubRangeKeys to accommodate the new key sharding logic. 
--- tx_service/include/cc/range_cc_map.h | 49 +++++++++++++++++++++++++ tx_service/include/cc/template_cc_map.h | 7 +++- tx_service/src/cc/local_cc_handler.cpp | 11 +++++- tx_service/src/cc/range_slice.cpp | 11 +++--- tx_service/src/tx_operation.cpp | 12 +++++- 5 files changed, 79 insertions(+), 11 deletions(-) diff --git a/tx_service/include/cc/range_cc_map.h b/tx_service/include/cc/range_cc_map.h index 29b679a5..b16642ea 100644 --- a/tx_service/include/cc/range_cc_map.h +++ b/tx_service/include/cc/range_cc_map.h @@ -743,7 +743,56 @@ class RangeCcMap : public TemplateCcMap // update previous cce's end key cce->SetCommitTsPayloadStatus(new_range_info->version_ts_, RecordStatus::Normal); + + // Reset new range size on the data table ccmap (emplace if + // absent). + int32_t new_range_id = new_range_info->PartitionId(); + NodeGroupId new_range_owner = + shard_->GetRangeOwner(new_range_id, this->cc_ng_id_) + ->BucketOwner(); + if (new_range_owner == this->cc_ng_id_ && + static_cast((new_range_id & 0x3FF) % + shard_->core_cnt_) == + shard_->core_id_) + { + TableType data_table_type = + TableName::Type(this->table_name_.StringView()); + TableName data_table_name(this->table_name_.StringView(), + data_table_type, + this->table_name_.Engine()); + CcMap *ccm = + shard_->GetCcm(data_table_name, this->cc_ng_id_); + assert(ccm != nullptr); + size_t range_size = new_range_entries.at(idx) + ->TypedStoreRange() + ->PostCkptSize(); + ccm->InitRangeSize(static_cast(new_range_id), + static_cast(range_size), + true, + true); + } } + // Reset old range size on the data table ccmap (no emplace). 
+ int32_t old_partition_id = + upload_range_rec->GetRangeInfo()->PartitionId(); + if (range_owner == this->cc_ng_id_ && + static_cast((old_partition_id & 0x3FF) % + shard_->core_cnt_) == shard_->core_id_) + { + TableType data_table_type = + TableName::Type(this->table_name_.StringView()); + TableName data_table_name(this->table_name_.StringView(), + data_table_type, + this->table_name_.Engine()); + CcMap *ccm = shard_->GetCcm(data_table_name, this->cc_ng_id_); + assert(ccm != nullptr); + size_t old_range_size = + old_entry->TypedStoreRange()->PostCkptSize(); + ccm->InitRangeSize(static_cast(old_partition_id), + static_cast(old_range_size)); + ccm->ResetRangeStatus(static_cast(old_partition_id)); + } + // range_owner_rec_ needs to be reset on each core since they point // to bucket records on different cores. upload_range_rec->range_owner_rec_ = diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index 2f134cc9..d0f84289 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -7438,9 +7438,12 @@ class TemplateCcMap : public CcMap } LruPage *lru_page; uint16_t pause_idx = shard_->core_id_; - if (req.GetCleanType() == CleanType::CleanBucketData) + CleanType clean_type = req.GetCleanType(); + if (clean_type == CleanType::CleanBucketData || + clean_type == CleanType::CleanRangeData) { - // For clean bucket data, cc req is only sent to 1 core. + // For clean bucket data and range data, cc req is only sent to 1 + // core. 
pause_idx = 0; } if (req.ResumeKey(pause_idx)->KeyPtr() != nullptr) diff --git a/tx_service/src/cc/local_cc_handler.cpp b/tx_service/src/cc/local_cc_handler.cpp index 4a582dc1..8f761609 100644 --- a/tx_service/src/cc/local_cc_handler.cpp +++ b/tx_service/src/cc/local_cc_handler.cpp @@ -1913,7 +1913,8 @@ void txservice::LocalCcHandler::KickoutData(const TableName &table_name, KickoutCcEntryCc *req = kickout_ccentry_pool_.NextRequest(); // For hash partition, all data in a single bucket should be hashed to // the same core. - uint16_t core_cnt = clean_type == CleanType::CleanBucketData + uint16_t core_cnt = (clean_type == CleanType::CleanBucketData || + clean_type == CleanType::CleanRangeData) ? 1 : Sharder::Instance().GetLocalCcShardsCount(); req->Reset(table_name, @@ -1940,6 +1941,14 @@ void txservice::LocalCcHandler::KickoutData(const TableName &table_name, Sharder::Instance().ShardBucketIdToCoreIdx((*bucket_id)[0]), req); } + else if (clean_type == CleanType::CleanRangeData) + { + assert(range_id != INT32_MAX); + uint16_t dest_core = static_cast( + (range_id & 0x3FF) % + Sharder::Instance().GetLocalCcShardsCount()); + cc_shards_.EnqueueToCcShard(dest_core, req); + } else { // Dispatch the request to all cores and run in parallel diff --git a/tx_service/src/cc/range_slice.cpp b/tx_service/src/cc/range_slice.cpp index 91b1973b..3fea4287 100644 --- a/tx_service/src/cc/range_slice.cpp +++ b/tx_service/src/cc/range_slice.cpp @@ -449,12 +449,11 @@ bool StoreRange::SampleSubRangeKeys(StoreSlice *slice, &end_key, key_cnt); - // Send the request to one shard randomly. 
- uint64_t core_rand = butil::fast_rand(); - local_cc_shards_.EnqueueLowPriorityCcRequestToShard( - core_rand % local_cc_shards_.Count(), &sample_keys_cc); - DLOG(INFO) << "Send the sample range keys request to shard#" - << core_rand % local_cc_shards_.Count(); + uint16_t dest_core = static_cast((partition_id_ & 0x3FF) % + local_cc_shards_.Count()); + local_cc_shards_.EnqueueLowPriorityCcRequestToShard(dest_core, + &sample_keys_cc); + DLOG(INFO) << "Send the sample range keys request to shard#" << dest_core; sample_keys_cc.Wait(); CcErrorCode res = sample_keys_cc.ErrorCode(); diff --git a/tx_service/src/tx_operation.cpp b/tx_service/src/tx_operation.cpp index 218fdd74..42bc3796 100644 --- a/tx_service/src/tx_operation.cpp +++ b/tx_service/src/tx_operation.cpp @@ -5168,8 +5168,13 @@ bool SplitFlushRangeOp::ForwardKickoutIterator(TransactionExecution *txm) NodeGroupId new_owner = new_range_bucket_info->BucketOwner(); NodeGroupId dirty_new_owner = new_range_bucket_info->DirtyBucketOwner(); - if (new_owner != txm->TxCcNodeId() && - dirty_new_owner != txm->TxCcNodeId()) + uint16_t range_shard_id = static_cast( + (range_info_->PartitionId() & 0x3FF) % local_shards->Count()); + uint16_t new_range_shard_id = static_cast( + (kickout_data_it_->second & 0x3FF) % local_shards->Count()); + if ((new_owner != txm->TxCcNodeId() && + dirty_new_owner != txm->TxCcNodeId()) || + (range_shard_id != new_range_shard_id)) { // Note that even if the new node group falls on the same node, // we still need to clean the cc entry from native ccmap since @@ -5188,11 +5193,14 @@ bool SplitFlushRangeOp::ForwardKickoutIterator(TransactionExecution *txm) } kickout_old_range_data_op_.clean_type_ = CleanType::CleanRangeData; + kickout_old_range_data_op_.range_id_ = + range_info_->PartitionId(); kickout_old_range_data_op_.node_group_ = txm->TxCcNodeId(); LOG(INFO) << "Split Flush transaction kickout old data in range " << kickout_data_it_->second << ", original range id " << range_info_->PartitionId() 
+ << ", new range id: " << kickout_data_it_->second << ", txn: " << txm->TxNumber(); kickout_data_it_++; return false; From 0ada65fa27d59bb8cdf0cf21d29062f50042d406 Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:36:40 +0800 Subject: [PATCH 05/15] Update range size during create secondary index (#442) 1. Update range size during UploadBatchCc request for new index. 2. Update the UploadBatchCc process to accommodate the new key sharding logic. 3. Update the create secondary index process to accommodate the new key sharding logic. --- tx_service/include/cc/cc_request.h | 42 +++- tx_service/include/cc/object_cc_map.h | 6 +- tx_service/include/cc/template_cc_map.h | 103 ++++++++- tx_service/include/sk_generator.h | 29 ++- tx_service/src/cc/local_cc_shards.cpp | 1 + tx_service/src/remote/cc_node_service.cpp | 20 +- tx_service/src/sk_generator.cpp | 255 +++++++++++----------- 7 files changed, 290 insertions(+), 166 deletions(-) diff --git a/tx_service/include/cc/cc_request.h b/tx_service/include/cc/cc_request.h index 056b1ffb..5b5559f4 100644 --- a/tx_service/include/cc/cc_request.h +++ b/tx_service/include/cc/cc_request.h @@ -7748,7 +7748,9 @@ struct CollectMemStatsCc : public CcRequestBase struct UploadBatchCc : public CcRequestBase { + // keys, records, commit_ts, rec_status, range_size_flags using WriteEntryTuple = std::tuple; @@ -7765,10 +7767,10 @@ struct UploadBatchCc : public CcRequestBase void Reset(const TableName &table_name, txservice::NodeGroupId ng_id, int64_t &ng_term, - size_t core_cnt, + int32_t partition_id, size_t batch_size, size_t start_key_idx, - const std::vector &entry_vec, + const std::vector> &entry_vec, bthread::Mutex &req_mux, bthread::ConditionVariable &req_cv, size_t &finished_req_cnt, @@ -7779,6 +7781,7 @@ struct UploadBatchCc : public CcRequestBase node_group_id_ = ng_id; node_group_term_ = &ng_term; is_remote_ = false; + partition_id_ = partition_id; batch_size_ = batch_size; 
start_key_idx_ = start_key_idx; entry_vector_ = &entry_vec; @@ -7786,16 +7789,17 @@ struct UploadBatchCc : public CcRequestBase req_cv_ = &req_cv; finished_req_cnt_ = &finished_req_cnt; req_result_ = &req_result; - unfinished_cnt_.store(core_cnt, std::memory_order_relaxed); + unfinished_cnt_.store(1, std::memory_order_relaxed); err_code_.store(CcErrorCode::NO_ERROR, std::memory_order_relaxed); paused_pos_.clear(); - paused_pos_.resize(core_cnt, {}); + paused_pos_.resize(1, {}); data_type_ = data_type; } void Reset(const TableName &table_name, txservice::NodeGroupId ng_id, int64_t &ng_term, + int32_t partition_id, size_t core_cnt, uint32_t batch_size, const WriteEntryTuple &entry_tuple, @@ -7808,6 +7812,7 @@ struct UploadBatchCc : public CcRequestBase node_group_id_ = ng_id; node_group_term_ = &ng_term; is_remote_ = true; + partition_id_ = partition_id; batch_size_ = batch_size; start_key_idx_ = 0; entry_tuples_ = &entry_tuple; @@ -7950,7 +7955,12 @@ struct UploadBatchCc : public CcRequestBase return batch_size_; } - const std::vector *EntryVector() const + int32_t PartitionId() const + { + return partition_id_; + } + + const std::vector> *EntryVector() const { return is_remote_ ? nullptr : entry_vector_; } @@ -7965,19 +7975,23 @@ struct UploadBatchCc : public CcRequestBase size_t key_off, size_t rec_off, size_t ts_off, - size_t status_off) + size_t status_off, + size_t flags_off) { + core_id = partition_id_ >= 0 ? 0 : core_id; auto &key_pos = paused_pos_.at(core_id); std::get<0>(key_pos) = key_index; std::get<1>(key_pos) = key_off; std::get<2>(key_pos) = rec_off; std::get<3>(key_pos) = ts_off; std::get<4>(key_pos) = status_off; + std::get<5>(key_pos) = flags_off; } - const std::tuple &GetPausedPosition( - uint16_t core_id) const + const std::tuple & + GetPausedPosition(uint16_t core_id) const { + core_id = partition_id_ >= 0 ? 
0 : core_id; return paused_pos_.at(core_id); } @@ -8001,12 +8015,14 @@ struct UploadBatchCc : public CcRequestBase uint32_t node_group_id_{0}; int64_t *node_group_term_{nullptr}; bool is_remote_{false}; + // -1 means broadcast to all shards(used by hash partition) + int32_t partition_id_{-1}; uint32_t batch_size_{0}; size_t start_key_idx_{0}; union { - // for local request - const std::vector *entry_vector_; + // for local request: (range_size_flags, WriteEntry*) + const std::vector> *entry_vector_; // for remote request const WriteEntryTuple *entry_tuples_; }; @@ -8018,8 +8034,10 @@ struct UploadBatchCc : public CcRequestBase // This two variables may be accessed by multi-cores. std::atomic unfinished_cnt_{0}; std::atomic err_code_{CcErrorCode::NO_ERROR}; - // key index, key offset, record offset, ts offset, record status offset - std::vector> paused_pos_; + // key index, key offset, record offset, ts offset, record status offset, + // range_size_flags offset + std::vector> + paused_pos_; UploadBatchType data_type_{UploadBatchType::SkIndexData}; }; diff --git a/tx_service/include/cc/object_cc_map.h b/tx_service/include/cc/object_cc_map.h index a2b31c8e..bbd4d17b 100644 --- a/tx_service/include/cc/object_cc_map.h +++ b/tx_service/include/cc/object_cc_map.h @@ -1571,7 +1571,8 @@ class ObjectCcMap : public TemplateCcMap next_ts_offset = ts_offset; next_status_offset = status_offset; - auto [key_str, rec_str, ts_str, status_str] = *entry_tuples; + auto [key_str, rec_str, ts_str, status_str, flags_str] = + *entry_tuples; // deserialize key decoded_key.Deserialize( key_str.data(), next_key_offset, KeySchema()); @@ -1739,7 +1740,8 @@ class ObjectCcMap : public TemplateCcMap key_offset, rec_offset, ts_offset, - status_offset); + status_offset, + 0); shard_->Enqueue(shard_->LocalCoreId(), &req); return false; } diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index d0f84289..2dc2fe88 100644 --- 
a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -7663,6 +7663,7 @@ class TemplateCcMap : public CcMap auto entry_tuples = req.EntryTuple(); size_t batch_size = req.BatchSize(); size_t start_key_index = req.StartKeyIndex(); + const int32_t partition_id = req.PartitionId(); const TxRecord *req_rec = nullptr; @@ -7672,6 +7673,7 @@ class TemplateCcMap : public CcMap ValueT decoded_rec; uint64_t commit_ts = 0; RecordStatus rec_status = RecordStatus::Normal; + uint8_t range_size_flags = 0; auto &resume_pos = req.GetPausedPosition(shard_->core_id_); size_t key_pos = std::get<0>(resume_pos); @@ -7679,6 +7681,7 @@ class TemplateCcMap : public CcMap size_t rec_offset = std::get<2>(resume_pos); size_t ts_offset = std::get<3>(resume_pos); size_t status_offset = std::get<4>(resume_pos); + size_t flags_offset = std::get<5>(resume_pos); size_t hash = 0; Iterator it; @@ -7691,6 +7694,7 @@ class TemplateCcMap : public CcMap size_t next_rec_offset = 0; size_t next_ts_offset = 0; size_t next_status_offset = 0; + size_t next_flags_offset = 0; for (size_t cnt = 0; key_pos < batch_size && cnt < UploadBatchCc::UploadBatchBatchSize; ++key_pos, ++cnt) @@ -7699,13 +7703,16 @@ class TemplateCcMap : public CcMap next_rec_offset = rec_offset; next_ts_offset = ts_offset; next_status_offset = status_offset; + next_flags_offset = flags_offset; + if (entry_vec != nullptr) { key_idx = start_key_index + key_pos; - // get key - key = entry_vec->at(key_idx)->key_.GetKey(); - // get record - req_rec = entry_vec->at(key_idx)->rec_.get(); + const auto &pair = entry_vec->at(key_idx); + range_size_flags = pair.first; + const WriteEntry *we = pair.second; + key = we->key_.GetKey(); + req_rec = we->rec_.get(); if (req_rec) { rec_status = RecordStatus::Normal; @@ -7717,11 +7724,12 @@ class TemplateCcMap : public CcMap commit_val = nullptr; } // get commit ts - commit_ts = entry_vec->at(key_idx)->commit_ts_; + commit_ts = we->commit_ts_; } else { - auto [key_str, 
rec_str, ts_str, status_str] = *entry_tuples; + auto [key_str, rec_str, ts_str, status_str, flags_str] = + *entry_tuples; // deserialize key decoded_key.Deserialize( key_str.data(), next_key_offset, KeySchema()); @@ -7744,21 +7752,43 @@ class TemplateCcMap : public CcMap // deserialize commit ts commit_ts = *((uint64_t *) (ts_str.data() + next_ts_offset)); next_ts_offset += sizeof(uint64_t); + if (RangePartitioned) + { + range_size_flags = + static_cast(flags_str[next_flags_offset]); + next_flags_offset += sizeof(uint8_t); + } } - hash = key->Hash(); - size_t core_idx = (hash & 0x3FF) % shard_->core_cnt_; - if (!(core_idx == shard_->core_id_) || commit_ts <= 1) + if (commit_ts <= 1) { - // Skip the key that does not belong to this core or - // commit ts does not greater than 1. Move to next key. + // skip the key that commit ts does not greater than 1. key_offset = next_key_offset; rec_offset = next_rec_offset; ts_offset = next_ts_offset; status_offset = next_status_offset; + if constexpr (RangePartitioned) + { + flags_offset = next_flags_offset; + } continue; } + if constexpr (!RangePartitioned) + { + hash = key->Hash(); + size_t core_idx = (hash & 0x3FF) % shard_->core_cnt_; + if (core_idx != shard_->core_id_) + { + // skip the key that does not belong to this core. 
+ key_offset = next_key_offset; + rec_offset = next_rec_offset; + ts_offset = next_ts_offset; + status_offset = next_status_offset; + continue; + } + } + it = FindEmplace(*key); cce = it->second; cc_page = it.GetPage(); @@ -7790,9 +7820,14 @@ class TemplateCcMap : public CcMap rec_offset = next_rec_offset; ts_offset = next_ts_offset; status_offset = next_status_offset; + if constexpr (RangePartitioned) + { + flags_offset = next_flags_offset; + } continue; } + [[maybe_unused]] const size_t old_payload_size = cce->PayloadSize(); // Now, all versions of non-unique SecondaryIndex key shared // the unpack info in current version's payload, though the // unpack info will not be used for deleted key, we must not @@ -7812,6 +7847,8 @@ class TemplateCcMap : public CcMap } bool was_dirty = cce->IsDirty(); + [[maybe_unused]] const RecordStatus cce_old_status = + cce->PayloadStatus(); cce->SetCommitTsPayloadStatus(commit_ts, rec_status); if (req.Kind() == UploadBatchType::DirtyBucketData) { @@ -7825,6 +7862,43 @@ class TemplateCcMap : public CcMap } cce->SetCkptTs(commit_ts); } + + if constexpr (RangePartitioned) + { + if ((range_size_flags >> 4) != 0) + { + int32_t delta = + (rec_status == RecordStatus::Deleted) + ? -(static_cast(write_key->Size() + + old_payload_size)) + : (cce_old_status != RecordStatus::Normal + ? static_cast(write_key->Size() + + cce->PayloadSize()) + : static_cast(cce->PayloadSize() - + old_payload_size)); + bool need_split = + UpdateRangeSize(static_cast(partition_id), + delta, + (range_size_flags & 0x0F) != 0); + if (need_split) + { + // Create a data sync task for the range. 
+ uint64_t data_sync_ts = + std::chrono::duration_cast< + std::chrono::microseconds>( + std::chrono::high_resolution_clock::now() + .time_since_epoch()) + .count(); + shard_->CreateSplitRangeDataSyncTask( + table_name_, + cc_ng_id_, + req.CcNgTerm(), + static_cast(partition_id), + data_sync_ts); + } + } + } + OnCommittedUpdate(cce, was_dirty); OnFlushed(cce, was_dirty); DLOG_IF(INFO, TRACE_OCC_ERR) @@ -7851,6 +7925,10 @@ class TemplateCcMap : public CcMap rec_offset = next_rec_offset; ts_offset = next_ts_offset; status_offset = next_status_offset; + if constexpr (RangePartitioned) + { + flags_offset = next_flags_offset; + } } if (key_pos < batch_size) { @@ -7862,7 +7940,8 @@ class TemplateCcMap : public CcMap key_offset, rec_offset, ts_offset, - status_offset); + status_offset, + flags_offset); shard_->Enqueue(shard_->LocalCoreId(), &req); return false; } diff --git a/tx_service/include/sk_generator.h b/tx_service/include/sk_generator.h index 050d6b27..b33941e8 100644 --- a/tx_service/include/sk_generator.h +++ b/tx_service/include/sk_generator.h @@ -40,8 +40,11 @@ class UploadIndexContext public: using TableIndexSet = std::unordered_map>; - using NGIndexSet = - std::unordered_map>; + // ng_id -> (range_id -> vector of (range_size_flags, WriteEntry*)) + using NGIndexSet = std::unordered_map< + NodeGroupId, + std::unordered_map>>>; private: enum struct UploadTaskStatus @@ -101,16 +104,18 @@ class UploadIndexContext CcErrorCode UploadEncodedIndex(UploadIndexTask &upload_task); CcErrorCode UploadIndexInternal( std::unordered_map &ng_index_set); - void SendIndexes(const TableName &table_name, - NodeGroupId dest_ng_id, - int64_t &ng_term, - const std::vector &write_entry_vec, - size_t batch_size, - size_t start_key_idx, - bthread::Mutex &req_mux, - bthread::ConditionVariable &req_cv, - size_t &finished_req_cnt, - CcErrorCode &res_code); + void SendIndexes( + const TableName &table_name, + NodeGroupId dest_ng_id, + int64_t &ng_term, + int32_t partition_id, + const 
std::vector> &write_entry_vec, + size_t batch_size, + size_t start_key_idx, + bthread::Mutex &req_mux, + bthread::ConditionVariable &req_cv, + size_t &finished_req_cnt, + CcErrorCode &res_code); // Acquire and release range read lock. CcErrorCode AcquireRangeReadLocks( TransactionExecution *acq_lock_txm, diff --git a/tx_service/src/cc/local_cc_shards.cpp b/tx_service/src/cc/local_cc_shards.cpp index 810b5607..956427e2 100644 --- a/tx_service/src/cc/local_cc_shards.cpp +++ b/tx_service/src/cc/local_cc_shards.cpp @@ -4939,6 +4939,7 @@ void LocalCcShards::DataSyncForHashPartition( req_ptr = upload_batch_closure->UploadBatchRequest(); req_ptr->set_node_group_id(dest_ng); req_ptr->set_node_group_term(-1); + req_ptr->set_partition_id(-1); req_ptr->set_table_name_str(table_name.String()); req_ptr->set_table_type( remote::ToRemoteType::ConvertTableType( diff --git a/tx_service/src/remote/cc_node_service.cpp b/tx_service/src/remote/cc_node_service.cpp index 123cd440..df0369aa 100644 --- a/tx_service/src/remote/cc_node_service.cpp +++ b/tx_service/src/remote/cc_node_service.cpp @@ -1172,6 +1172,7 @@ void CcNodeService::UploadBatch( NodeGroupId ng_id = request->node_group_id(); int64_t ng_term = request->node_group_term(); + int32_t partition_id = request->partition_id(); std::string_view table_name_sv{request->table_name_str()}; TableType table_type = @@ -1199,14 +1200,15 @@ void CcNodeService::UploadBatch( << " for table:" << table_name.Trace(); LocalCcShards *cc_shards = Sharder::Instance().GetLocalCcShards(); - size_t core_cnt = cc_shards->Count(); + size_t core_cnt = (partition_id >= 0) ? 
1 : cc_shards->Count(); uint32_t batch_size = request->batch_size(); auto write_entry_tuple = UploadBatchCc::WriteEntryTuple(request->keys(), request->records(), request->commit_ts(), - request->rec_status()); + request->rec_status(), + request->range_size_flags()); size_t finished_req = 0; bthread::Mutex req_mux; @@ -1217,6 +1219,7 @@ void CcNodeService::UploadBatch( req.Reset(table_name, ng_id, ng_term, + partition_id, core_cnt, batch_size, write_entry_tuple, @@ -1224,9 +1227,18 @@ void CcNodeService::UploadBatch( req_cv, finished_req, data_type); - for (size_t core = 0; core < core_cnt; ++core) + if (partition_id >= 0) { - cc_shards->EnqueueToCcShard(core, &req); + uint16_t dest_core = + static_cast((partition_id & 0x3FF) % cc_shards->Count()); + cc_shards->EnqueueToCcShard(dest_core, &req); + } + else + { + for (size_t core = 0; core < cc_shards->Count(); ++core) + { + cc_shards->EnqueueToCcShard(core, &req); + } } { diff --git a/tx_service/src/sk_generator.cpp b/tx_service/src/sk_generator.cpp index e3fc928e..01ff589e 100644 --- a/tx_service/src/sk_generator.cpp +++ b/tx_service/src/sk_generator.cpp @@ -324,7 +324,6 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, scan_ts_, node_group_id_, ng_term, - core_cnt, scan_batch_size_, tx_number, start_key, @@ -336,12 +335,7 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, CcErrorCode scan_res = CcErrorCode::NO_ERROR; bool scan_data_drained = false; bool scan_pk_finished = false; - std::vector last_finished_pos; - last_finished_pos.reserve(core_cnt); - for (size_t i = 0; i < core_cnt; ++i) - { - last_finished_pos.emplace_back(start_key->Clone()); - } + TxKey last_finished_pos = start_key->Clone(); TxKey target_key; const TxRecord *target_rec = nullptr; @@ -355,11 +349,8 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, { batch_tuples = 0; - uint32_t core_rand = butil::fast_rand(); - // The scan slice request is dispatched to the first core. 
The first - // core tries to pin the slice if necessary and if succeeds, further - // dispatches the request to remaining cores for parallel scans. - cc_shards->EnqueueToCcShard(core_rand % core_cnt, &scan_req); + uint16_t dest_core = (partition_id_ & 0x3FF) % cc_shards->Count(); + cc_shards->EnqueueToCcShard(dest_core, &scan_req); scan_req.Wait(); if (scan_req.IsError()) @@ -381,17 +372,14 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, { std::this_thread::sleep_for(std::chrono::seconds(30)); // Reset the paused key. - for (size_t i = 0; i < core_cnt; ++i) + const TxKey &paused_key = scan_req.PausePos().first; + if (!scan_req.IsDrained()) { - const TxKey &paused_key = scan_req.PausePos(i).first; - if (!scan_req.IsDrained(i)) - { - // Should use one copy of the key, instead of move the - // ownership of the key, because this round of scan may - // failed again. - assert(paused_key.IsOwner()); - paused_key.Copy(last_finished_pos[i]); - } + // Should use one copy of the key, instead of move the + // ownership of the key, because this round of scan may + // failed again. 
+ assert(paused_key.IsOwner()); + paused_key.Copy(last_finished_pos); } scan_req.Reset(); scan_pk_finished = false; @@ -431,71 +419,63 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, } sk_encoder = sk_encoder_vec_[vec_idx].get(); - for (size_t core_idx = 0; core_idx < core_cnt; ++core_idx) + for (size_t key_idx = 0; key_idx < scan_req.accumulated_scan_cnt_; + ++key_idx) { - for (size_t key_idx = 0; - key_idx < scan_req.accumulated_scan_cnt_.at(core_idx); - ++key_idx) + auto &tuple = scan_req.DataSyncVec().at(key_idx); + target_key = tuple.Key(); + target_rec = tuple.Payload(); + version_ts = tuple.commit_ts_; + if (tuple.payload_status_ == RecordStatus::Deleted) { - auto &tuple = scan_req.DataSyncVec(core_idx).at(key_idx); - target_key = tuple.Key(); - target_rec = tuple.Payload(); - version_ts = tuple.commit_ts_; - if (tuple.payload_status_ == RecordStatus::Deleted) - { - // Skip the deleted record. - continue; - } - assert(target_key.KeyPtr() != nullptr && - target_rec != nullptr); + // Skip the deleted record. 
+ continue; + } + assert(target_key.KeyPtr() != nullptr && target_rec != nullptr); - int32_t appended_sk_size = sk_encoder->AppendPackedSk( - &target_key, target_rec, version_ts, index_set); - if (appended_sk_size < 0) - { - LOG(ERROR) - << "ScanAndEncodeIndex: Failed to encode " - << "key for index: " << tbl_name_it->StringView() - << "of ng#" << node_group_id_; - // Finish the pack sk operation - task_result_ = CcErrorCode::PACK_SK_ERR; - pack_sk_err_ = std::move(sk_encoder->GetError()); - return; - } - } /* End of each key */ + int32_t appended_sk_size = sk_encoder->AppendPackedSk( + &target_key, target_rec, version_ts, index_set); + if (appended_sk_size < 0) + { + LOG(ERROR) << "ScanAndEncodeIndex: Failed to encode " + << "key for index: " << tbl_name_it->StringView() + << "of ng#" << node_group_id_; + // Finish the pack sk operation + task_result_ = CcErrorCode::PACK_SK_ERR; + pack_sk_err_ = std::move(sk_encoder->GetError()); + return; + } + } /* End of each key */ - if (tbl_name_it == new_indexes_name_->cbegin()) + if (tbl_name_it == new_indexes_name_->cbegin()) + { + batch_tuples += scan_req.accumulated_scan_cnt_; + if (batch_tuples % 10240 == 0 && + !task_status_->CheckTxTermStatus()) + { + LOG(WARNING) + << "ScanAndEncodeIndex: Terminate this task cause " + << "the tx leader transferred of ng#" << node_group_id_; + task_status_->TerminateGenerateSk(); + task_result_ = CcErrorCode::TX_NODE_NOT_LEADER; + return; + } + // Update the last finished key. 
+ auto &paused_key = scan_req.PausePos().first; + if (!scan_req.IsDrained()) { - batch_tuples += scan_req.accumulated_scan_cnt_.at(core_idx); - if (batch_tuples % 10240 == 0 && - !task_status_->CheckTxTermStatus()) + if (last_finished_pos.IsOwner()) { - LOG(WARNING) - << "ScanAndEncodeIndex: Terminate this task cause " - << "the tx leader transferred of ng#" - << node_group_id_; - task_status_->TerminateGenerateSk(); - task_result_ = CcErrorCode::TX_NODE_NOT_LEADER; - return; + last_finished_pos.Copy(paused_key); } - // Update the last finished key. - auto &paused_key = scan_req.PausePos(core_idx).first; - if (!scan_req.IsDrained(core_idx)) + else { - if (last_finished_pos[core_idx].IsOwner()) - { - last_finished_pos[core_idx].Copy(paused_key); - } - else - { - last_finished_pos[core_idx] = paused_key.Clone(); - } + last_finished_pos = paused_key.Clone(); } - // If the data is drained - scan_data_drained = - scan_req.IsDrained(core_idx) && scan_data_drained; } - } /* End of each core */ + // If the data is drained + scan_data_drained = scan_req.IsDrained(); + } } /* End of foreach new_indexes_name */ scan_pk_finished = scan_data_drained; @@ -680,37 +660,41 @@ CcErrorCode UploadIndexContext::UploadIndexInternal( size_t finished_upload_count = 0; CcErrorCode upload_res_code = CcErrorCode::NO_ERROR; size_t upload_req_count = 0; + for (auto &[table_name, ng_entries] : ng_index_set) { - for (auto &[ng_id, entry_vec] : ng_entries) + for (auto &[ng_id, range_entries] : ng_entries) { - entry_vec_size = entry_vec.size(); - batch_req_cnt = (entry_vec_size / upload_batch_size_ + - (entry_vec_size % upload_batch_size_ ? 1 : 0)); - int64_t &expected_term = leader_terms_.at(ng_id); - size_t start_idx = 0; - size_t end_idx = - (batch_req_cnt > 1 ? 
upload_batch_size_ : entry_vec_size); - for (size_t idx = 0; idx < batch_req_cnt; ++idx) + for (auto &[range_id, entry_vec] : range_entries) { - SendIndexes(table_name, - ng_id, - expected_term, - entry_vec, - (end_idx - start_idx), - start_idx, - req_mux, - req_cv, - finished_upload_count, - upload_res_code); - ++upload_req_count; - // Next batch - start_idx = end_idx; - end_idx = ((start_idx + upload_batch_size_) > entry_vec_size - ? entry_vec_size - : (start_idx + upload_batch_size_)); + entry_vec_size = entry_vec.size(); + batch_req_cnt = (entry_vec_size / upload_batch_size_ + + (entry_vec_size % upload_batch_size_ ? 1 : 0)); + + size_t start_idx = 0; + size_t end_idx = + (batch_req_cnt > 1 ? upload_batch_size_ : entry_vec_size); + for (size_t idx = 0; idx < batch_req_cnt; ++idx) + { + SendIndexes(table_name, + ng_id, + expected_term, + range_id, + entry_vec, + (end_idx - start_idx), + start_idx, + req_mux, + req_cv, + finished_upload_count, + upload_res_code); + ++upload_req_count; + start_idx = end_idx; + end_idx = ((start_idx + upload_batch_size_) > entry_vec_size + ? 
entry_vec_size + : (start_idx + upload_batch_size_)); + } } } } @@ -730,7 +714,8 @@ void UploadIndexContext::SendIndexes( const TableName &table_name, NodeGroupId dest_ng_id, int64_t &ng_term, - const std::vector &write_entry_vec, + int32_t partition_id, + const std::vector> &write_entry_vec, size_t batch_size, size_t start_key_idx, bthread::Mutex &req_mux, @@ -740,14 +725,13 @@ void UploadIndexContext::SendIndexes( { uint32_t dest_node_id = Sharder::Instance().LeaderNodeId(dest_ng_id); LocalCcShards *cc_shards = Sharder::Instance().GetLocalCcShards(); - size_t core_cnt = cc_shards->Count(); if (dest_node_id == cc_shards->NodeId()) { UploadBatchCc *req_ptr = NextRequest(); req_ptr->Reset(table_name, dest_ng_id, ng_term, - core_cnt, + partition_id, batch_size, start_key_idx, write_entry_vec, @@ -757,10 +741,9 @@ void UploadIndexContext::SendIndexes( res_code, UploadBatchType::SkIndexData); - for (size_t core = 0; core < core_cnt; ++core) - { - cc_shards->EnqueueToCcShard(core, req_ptr); - } + uint16_t dest_core = + static_cast((partition_id & 0x3FF) % cc_shards->Count()); + cc_shards->EnqueueToCcShard(dest_core, req_ptr); } else { @@ -834,6 +817,7 @@ void UploadIndexContext::SendIndexes( remote::ToRemoteType::ConvertTableType(table_name.Type())); req_ptr->set_table_engine( remote::ToRemoteType::ConvertTableEngine(table_name.Engine())); + req_ptr->set_partition_id(partition_id); size_t end_key_idx = start_key_idx + batch_size; req_ptr->set_kind(remote::UploadBatchKind::SK_DATA); req_ptr->set_batch_size(batch_size); @@ -853,15 +837,24 @@ void UploadIndexContext::SendIndexes( std::string *rec_status_str = req_ptr->mutable_rec_status(); // All generated sk should be normal status. 
const RecordStatus rec_status = RecordStatus::Normal; + // range_size_flags + req_ptr->clear_range_size_flags(); + std::string *range_size_flags_str = req_ptr->mutable_range_size_flags(); + for (size_t idx = start_key_idx; idx < end_key_idx; ++idx) { - write_entry_vec.at(idx)->key_.Serialize(*keys_str); - write_entry_vec.at(idx)->rec_->Serialize(*recs_str); - val_ptr = reinterpret_cast( - &(write_entry_vec.at(idx)->commit_ts_)); + uint8_t range_size_flags = write_entry_vec.at(idx).first; + WriteEntry *write_entry = write_entry_vec.at(idx).second; + write_entry->key_.Serialize(*keys_str); + write_entry->rec_->Serialize(*recs_str); + val_ptr = + reinterpret_cast(&(write_entry->commit_ts_)); commit_ts_str->append(val_ptr, len_sizeof); rec_status_str->append(reinterpret_cast(&rec_status), sizeof(rec_status)); + range_size_flags_str->append( + reinterpret_cast(&range_size_flags), + sizeof(range_size_flags)); } brpc::Controller *cntl_ptr = upload_batch_closure->Controller(); @@ -989,17 +982,24 @@ void UploadIndexContext::AdvanceWriteEntryForRangeInfo( size_t new_range_idx = 0; auto *range_info = range_record.GetRangeInfo(); + const int32_t range_id = range_info->PartitionId(); + const uint8_t default_flags = + 0x10 | static_cast(range_info->IsDirty()); while (cur_write_entry_it != next_range_start) { WriteEntry &write_entry = *cur_write_entry_it; - auto ng_it = ng_write_entrys.try_emplace(range_ng); - ng_it.first->second.push_back(&write_entry); + auto &range_vec = ng_write_entrys[range_ng][range_id]; + range_vec.emplace_back(default_flags, &write_entry); + uint8_t *old_range_flags_ptr = &range_vec.back().first; + + uint8_t *new_bucket_flags_ptr = nullptr; // If current range is migrating, forward to new range owner. 
if (new_bucket_ng != UINT32_MAX) { - ng_write_entrys.try_emplace(new_bucket_ng) - .first->second.push_back(&write_entry); + auto &new_bucket_vec = ng_write_entrys[new_bucket_ng][range_id]; + new_bucket_vec.emplace_back(default_flags, &write_entry); + new_bucket_flags_ptr = &new_bucket_vec.back().first; } // If range is splitting and the key will fall on a new range after @@ -1016,18 +1016,25 @@ void UploadIndexContext::AdvanceWriteEntryForRangeInfo( } if (new_range_ng != UINT32_MAX) { - if (new_range_ng != range_ng) - { - ng_write_entrys.try_emplace(new_range_ng) - .first->second.push_back(&write_entry); - } + const int32_t new_range_id = + range_info->NewPartitionId()->at(new_range_idx - 1); + + ng_write_entrys[new_range_ng][new_range_id].emplace_back( + default_flags, &write_entry); + // Only update range size on the new range + *old_range_flags_ptr &= 0x0F; + // If the new range is migrating, forward to the new owner of new // range. if (new_range_new_bucket_ng != UINT32_MAX && new_range_new_bucket_ng != range_ng) { - ng_write_entrys.try_emplace(new_range_new_bucket_ng) - .first->second.push_back(&write_entry); + ng_write_entrys[new_range_new_bucket_ng][new_range_id] + .emplace_back(default_flags, &write_entry); + if (new_bucket_flags_ptr) + { + *new_bucket_flags_ptr &= 0x0F; + } } } From edb547d7270a922a2deb1ad0c0aca72c2f7fdc5b Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:37:14 +0800 Subject: [PATCH 06/15] Update range split replay log execution (#443) 1. Update the range size during data log replay. 2. For post commit range split log, update range size for each newly splitting ranges. 3. Update the data log replay process to accommodate the new key sharding logic. 
--- tx_service/include/cc/cc_request.h | 64 +++++++-- tx_service/include/cc/range_cc_map.h | 53 +++++++ tx_service/include/cc/template_cc_map.h | 136 ++++++++++++++---- tx_service/include/fault/log_replay_service.h | 19 +++ tx_service/src/fault/log_replay_service.cpp | 61 +++++++- 5 files changed, 295 insertions(+), 38 deletions(-) diff --git a/tx_service/include/cc/cc_request.h b/tx_service/include/cc/cc_request.h index 5b5559f4..59998209 100644 --- a/tx_service/include/cc/cc_request.h +++ b/tx_service/include/cc/cc_request.h @@ -4871,7 +4871,10 @@ struct ReplayLogCc : public TemplatedCcRequest std::shared_ptr range_split_started = nullptr, std::unordered_set *range_splitting = nullptr, uint16_t first_core = 0, - ParseDataLogCc *parse_cc = nullptr) + ParseDataLogCc *parse_cc = nullptr, + const std::unordered_map> + *split_range_info = nullptr) { table_name_holder_ = TableName(table_name_view, table_type, table_engine); @@ -4899,6 +4902,15 @@ struct ReplayLogCc : public TemplatedCcRequest is_lock_recovery_ = is_lock_recovery; upsert_kv_err_code_ = {true, CcErrorCode::NO_ERROR}; parse_cc_ = parse_cc; + split_ranges_ = nullptr; + if (split_range_info != nullptr) + { + auto table_it = split_range_info->find(table_name_holder_); + if (table_it != split_range_info->end()) + { + split_ranges_ = &table_it->second; + } + } } ReplayLogCc(const ReplayLogCc &rhs) = delete; @@ -5097,6 +5109,16 @@ struct ReplayLogCc : public TemplatedCcRequest return first_core_; } + uint64_t RangeSplitCommitTs(int32_t range_id) const + { + if (split_ranges_ == nullptr) + { + return 0; + } + auto it = split_ranges_->find(range_id); + return it == split_ranges_->end() ? 0 : it->second; + } + void SetOffset(size_t offset) { offset_ = offset; @@ -5164,6 +5186,9 @@ struct ReplayLogCc : public TemplatedCcRequest CcErrorCode::NO_ERROR}; ParseDataLogCc *parse_cc_{nullptr}; + // Range split commit ts per range for the current table, if available. 
+ const std::unordered_map *split_ranges_{nullptr}; + friend std::ostream &operator<<(std::ostream &outs, txservice::ReplayLogCc *r); }; @@ -5180,7 +5205,10 @@ struct ParseDataLogCc : public CcRequestBase std::atomic &status, std::atomic &on_fly_cnt, bool &recovery_error, - const bool is_lock_recovery = false) + const bool is_lock_recovery = false, + const std::unordered_map> + *split_range_info = nullptr) { log_records_sv_ = std::string_view(log_records.data(), log_records.size()); @@ -5192,6 +5220,7 @@ struct ParseDataLogCc : public CcRequestBase on_fly_cnt_ = &on_fly_cnt; recovery_error_ = &recovery_error; is_lock_recovery_ = is_lock_recovery; + split_range_info_ = split_range_info; } void Reset(::txlog::ReplayMessage &&replay_message, @@ -5201,7 +5230,10 @@ struct ParseDataLogCc : public CcRequestBase std::atomic &status, std::atomic &on_fly_cnt, bool &recovery_error, - const bool is_lock_recovery = false) + const bool is_lock_recovery = false, + const std::unordered_map> + *split_range_info = nullptr) { replay_message_ = std::make_unique<::txlog::ReplayMessage>(std::move(replay_message)); @@ -5216,13 +5248,15 @@ struct ParseDataLogCc : public CcRequestBase on_fly_cnt_ = &on_fly_cnt; recovery_error_ = &recovery_error; is_lock_recovery_ = is_lock_recovery; + split_range_info_ = split_range_info; } bool Execute(CcShard &ccs) override { size_t offset = 0; // core of first key in log - int dest_core = 0; + uint32_t core_rand = butil::fast_rand(); + uint16_t dest_core = static_cast(core_rand % ccs.core_cnt_); std::vector replay_cc_list; replay_cc_list.reserve(160); while (offset < log_records_sv_.size()) @@ -5293,10 +5327,19 @@ struct ParseDataLogCc : public CcRequestBase uint32_t kv_len = *reinterpret_cast( blob.data() + blob_offset); blob_offset += sizeof(uint32_t); - size_t hash = ccs.GetCatalogFactory(table_engine) - ->KeyHash(blob.data(), blob_offset, nullptr); - dest_core = hash ? 
(hash & 0x3FF) % ccs.core_cnt_ - : (dest_core + 1) % ccs.core_cnt_; + if (table_engine == TableEngine::EloqSql || + table_engine == TableEngine::EloqDoc) + { + dest_core = (dest_core + 1) % ccs.core_cnt_; + } + else + { + size_t hash = + ccs.GetCatalogFactory(table_engine) + ->KeyHash(blob.data(), blob_offset, nullptr); + dest_core = hash ? (hash & 0x3FF) % ccs.core_cnt_ + : (dest_core + 1) % ccs.core_cnt_; + } ReplayLogCc *cc_req = replay_cc_pool_.NextRequest(); replay_cc_list.push_back(cc_req); assert(cc_ng_term_ >= 0); @@ -5317,7 +5360,8 @@ struct ParseDataLogCc : public CcRequestBase nullptr, nullptr, dest_core, - this); + this, + split_range_info_); blob_offset += kv_len; } @@ -5355,6 +5399,8 @@ struct ParseDataLogCc : public CcRequestBase std::atomic *on_fly_cnt_; bool *recovery_error_; bool is_lock_recovery_; + const std::unordered_map> + *split_range_info_{nullptr}; }; struct BroadcastStatisticsCc diff --git a/tx_service/include/cc/range_cc_map.h b/tx_service/include/cc/range_cc_map.h index b16642ea..d2a39d50 100644 --- a/tx_service/include/cc/range_cc_map.h +++ b/tx_service/include/cc/range_cc_map.h @@ -1208,6 +1208,14 @@ class RangeCcMap : public TemplateCcMap // add new range entry to range cc map auto bucket_map = static_cast( shard_->GetCcm(range_bucket_ccm_name, this->cc_ng_id_)); + TableType data_table_type = + TableName::Type(this->table_name_.StringView()); + TableName data_table_name(this->table_name_.StringView(), + data_table_type, + this->table_name_.Engine()); + CcMap *data_ccm = shard_->GetCcm(data_table_name, this->cc_ng_id_); + assert(data_ccm != nullptr); + for (uint idx = 0; idx < new_range_infos.size(); idx++) { const TemplateRangeInfo *new_range_info = @@ -1230,6 +1238,51 @@ class RangeCcMap : public TemplateCcMap new_range_info->PartitionId())); cce->SetCommitTsPayloadStatus(new_range_info->version_ts_, RecordStatus::Normal); + + // Reset new range size on data table ccmap if this core owns + // it. 
+ int32_t new_range_id = new_range_info->PartitionId(); + NodeGroupId new_range_owner = + shard_->GetRangeOwner(new_range_id, this->cc_ng_id_) + ->BucketOwner(); + if (new_range_owner == this->cc_ng_id_ && + static_cast((new_range_id & 0x3FF) % + shard_->core_cnt_) == + shard_->core_id_) + { + const TableRangeEntry *new_range_entry = + shard_->GetTableRangeEntry( + this->table_name_, this->cc_ng_id_, new_range_id); + assert(new_range_entry != nullptr); + size_t range_size = + static_cast *>( + new_range_entry) + ->TypedStoreRange() + ->PostCkptSize(); + data_ccm->InitRangeSize(static_cast(new_range_id), + static_cast(range_size), + true, + true); + } + } + + // Reset old range size on the data table ccmap if this core owns + // it. + int32_t old_range_id = + old_table_range_entry->GetRangeInfo()->PartitionId(); + NodeGroupId range_owner = + shard_->GetRangeOwner(old_range_id, this->cc_ng_id_) + ->BucketOwner(); + if (range_owner == this->cc_ng_id_ && + static_cast((old_range_id & 0x3FF) % + shard_->core_cnt_) == shard_->core_id_) + { + size_t old_range_size = + old_table_range_entry->RangeSlices()->PostCkptSize(); + data_ccm->InitRangeSize(static_cast(old_range_id), + static_cast(old_range_size), + true, + true); } } diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index 2dc2fe88..c1c6f451 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -6897,37 +6897,33 @@ class TemplateCcMap : public CcMap offset += sizeof(uint8_t); - uint16_t core_id = (key.Hash() & 0x3FF) % shard_->core_cnt_; - if (core_id != shard_->core_id_) - { - // Skips the key in the log record that is not sharded - // to this core. 
- if (op_type == OperationType::Insert || - op_type == OperationType::Update) - { - rec.Deserialize(log_blob.data(), offset); - } - if (shard_->core_id_ == req.FirstCore() || - (core_id != req.FirstCore() && core_id > shard_->core_id_)) - { - // Move to the smallest unvisited core id - next_core = std::min(core_id, next_core); - } - continue; - } + uint16_t core_id = 0; + bool is_dirty = false; + bool need_update_size = true; + int32_t partition_id = -1; - // Skip records that no longer belong to this ng. - if (RangePartitioned) + if constexpr (RangePartitioned) { const TableRangeEntry *range_entry = shard_->GetTableRangeEntry( table_name_, cc_ng_id_, TxKey(&key)); + if (range_entry == nullptr) + { + // range metadata missing, conservative handling: only + // consume value / skip. + if (op_type == OperationType::Insert || + op_type == OperationType::Update) + { + rec.Deserialize(log_blob.data(), offset); + } + continue; + } + partition_id = range_entry->GetRangeInfo()->PartitionId(); const BucketInfo *bucket_info = shard_->GetBucketInfo( - Sharder::MapRangeIdToBucketId( - range_entry->GetRangeInfo()->PartitionId()), - cc_ng_id_); - // Check if range bucket belongs to this ng or is migrating - // to this ng. + Sharder::MapRangeIdToBucketId(partition_id), cc_ng_id_); + + // Old range bucket does not belong to this ng, nor is it a + // "dirty bucket" migrating to this ng. if (bucket_info->BucketOwner() != cc_ng_id_ && bucket_info->DirtyBucketOwner() != cc_ng_id_) { @@ -6940,20 +6936,60 @@ class TemplateCcMap : public CcMap { const BucketInfo *new_bucket_info = shard_->GetBucketInfo( - Sharder::MapRangeIdToBucketId( - range_entry->GetRangeInfo()->PartitionId()), + Sharder::MapRangeIdToBucketId(new_range_id), cc_ng_id_); if (new_bucket_info->BucketOwner() != cc_ng_id_ && new_bucket_info->DirtyBucketOwner() != cc_ng_id_) { + // Neither old bucket nor new bucket belongs to this + // ng: only consume value and continue. 
if (op_type != OperationType::Delete) { rec.Deserialize(log_blob.data(), offset); } continue; } + + // new range belongs to this ng: determine core based on + // new_range_id and mark dirty. + core_id = static_cast((new_range_id & 0x3FF) % + shard_->core_cnt_); + is_dirty = true; + + uint64_t range_split_commit_ts = + req.RangeSplitCommitTs(partition_id); + // Only update range size for keys updated during the + // double-write phase. + need_update_size = + (range_split_commit_ts == 0) || + (req.CommitTs() > range_split_commit_ts); + } + else + { + // new_range_id < 0: key still belongs to old range, but + // old range bucket does not belong to this ng. + // Semantically, it should not be applied to this ng: + // only consume and continue. + if (op_type != OperationType::Delete) + { + rec.Deserialize(log_blob.data(), offset); + } + continue; } } + else + { + // Old range bucket belongs to this ng or is migrating to + // this ng. + core_id = static_cast((partition_id & 0x3FF) % + shard_->core_cnt_); + is_dirty = range_entry->GetRangeInfo()->IsDirty(); + + uint64_t range_split_commit_ts = + req.RangeSplitCommitTs(partition_id); + need_update_size = (range_split_commit_ts == 0) || + (req.CommitTs() > range_split_commit_ts); + } } else { @@ -6965,6 +7001,26 @@ class TemplateCcMap : public CcMap { continue; } + core_id = static_cast((key.Hash() & 0x3FF) % + shard_->core_cnt_); + } + + if (core_id != shard_->core_id_) + { + // Skips the key in the log record that is not sharded + // to this core. 
+ if (op_type == OperationType::Insert || + op_type == OperationType::Update) + { + rec.Deserialize(log_blob.data(), offset); + } + if (shard_->core_id_ == req.FirstCore() || + (core_id != req.FirstCore() && core_id > shard_->core_id_)) + { + // Move to the smallest unvisited core id + next_core = std::min(core_id, next_core); + } + continue; } Iterator it = FindEmplace(key); @@ -7039,6 +7095,12 @@ class TemplateCcMap : public CcMap { cce->ArchiveBeforeUpdate(); } + + [[maybe_unused]] const size_t old_payload_size = + cce->PayloadSize(); + [[maybe_unused]] const RecordStatus cce_old_status = + cce->PayloadStatus(); + RecordStatus rec_status; if (op_type == OperationType::Insert || op_type == OperationType::Update) @@ -7060,6 +7122,26 @@ class TemplateCcMap : public CcMap cce->SetCommitTsPayloadStatus(commit_ts, rec_status); OnCommittedUpdate(cce, was_dirty); + if constexpr (RangePartitioned) + { + if (need_update_size) + { + int32_t delta_size = + (rec_status == RecordStatus::Deleted) + ? -static_cast(key.Size() + + old_payload_size) + : static_cast( + cce_old_status != RecordStatus::Normal + ? (key.Size() + cce->PayloadSize()) + : (cce->PayloadSize() - + old_payload_size)); + + UpdateRangeSize(static_cast(partition_id), + delta_size, + is_dirty); + } + } + if (commit_ts > last_dirty_commit_ts_) { last_dirty_commit_ts_ = commit_ts; diff --git a/tx_service/include/fault/log_replay_service.h b/tx_service/include/fault/log_replay_service.h index e9fa2fc2..eb308a58 100644 --- a/tx_service/include/fault/log_replay_service.h +++ b/tx_service/include/fault/log_replay_service.h @@ -35,6 +35,7 @@ #include #include "txlog.h" +#include "type.h" namespace txservice { @@ -174,6 +175,17 @@ class RecoveryService : public brpc::StreamInputHandler, void ProcessRecoverTxTask(RecoverTxTask &task); + // Range split info management. 
+ void SetSplitRangeInfo(uint32_t ng_id, + TableName table_name, + int32_t range_id, + uint64_t commit_ts); + + const std::unordered_map> * + GetSplitRangeInfo(uint32_t ng_id) const; + + void CleanSplitRangeInfo(uint32_t ng_id); + struct ConnectionInfo { ConnectionInfo() = default; @@ -237,6 +249,13 @@ class RecoveryService : public brpc::StreamInputHandler, uint16_t port_; void ClearTx(uint64_t tx_number); + + // Range split info for each node group: + // ng_id -> split range commit ts>> + std::unordered_map< + uint32_t, + std::unordered_map>> + split_range_info_; }; } // namespace fault } // namespace txservice diff --git a/tx_service/src/fault/log_replay_service.cpp b/tx_service/src/fault/log_replay_service.cpp index 739caa70..8c65958f 100644 --- a/tx_service/src/fault/log_replay_service.cpp +++ b/tx_service/src/fault/log_replay_service.cpp @@ -584,6 +584,21 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, auto res_pair = table_range_split_cnt.try_emplace( base_table_name, std::make_shared(0)); + // Record split range commit ts for data log replay. 
+ ::txlog::SplitRangeOpMessage ds_split_range_op_msg; + if (!ds_split_range_op_msg.ParseFromArray( + split_range_op_blob.data() + blob_offset, + split_range_op_blob.length() - blob_offset)) + { + recovery_error = true; + CleanSplitRangeInfo(cc_ng_id); + return 0; + } + int32_t range_id = ds_split_range_op_msg.partition_id(); + uint64_t split_commit_ts = split_range_msg.commit_ts(); + SetSplitRangeInfo( + cc_ng_id, base_table_name, range_id, split_commit_ts); + // Replay Split ReplayLogCc *cc_req = replay_cc_pool_.NextRequest(); cc_req->Reset( @@ -611,6 +626,7 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, stream_id, mux, on_fly_cnt, status, recovery_error); if (recovery_error) { + CleanSplitRangeInfo(cc_ng_id); return 0; } } @@ -618,6 +634,7 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, // parse and process log records if (!msg.has_finish()) { + const auto *split_range_info = GetSplitRangeInfo(cc_ng_id); ParseDataLogCc *cc_req = parse_datalog_cc_pool_.NextRequest(); cc_req->Reset(std::move(msg), cc_ng_id, @@ -626,7 +643,8 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, status, on_fly_cnt, recovery_error, - is_lock_recovery); + is_lock_recovery, + split_range_info); on_fly_cnt.fetch_add(1, std::memory_order_release); local_shards_.EnqueueCcRequest(next_core, cc_req); next_core = (next_core + 1) % local_shards_.Count(); @@ -634,6 +652,7 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, else // has finish message { const std::string &log_records = msg.binary_log_records(); + const auto *split_range_info = GetSplitRangeInfo(cc_ng_id); ParseDataLogCc *cc_req = parse_datalog_cc_pool_.NextRequest(); cc_req->Reset(log_records, cc_ng_id, @@ -642,7 +661,8 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, status, on_fly_cnt, recovery_error, - is_lock_recovery); + is_lock_recovery, + split_range_info); on_fly_cnt.fetch_add(1, std::memory_order_release); 
local_shards_.EnqueueCcRequest(next_core, cc_req); next_core = (next_core + 1) % local_shards_.Count(); @@ -687,6 +707,8 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, << ", log group: " << info->log_group_id_ << ", set recovering status to finished"; } + // Clean up split range info for this node group. + CleanSplitRangeInfo(cc_ng_id); brpc::StreamClose(stream_id); // assumption: finish message must be the last message so return return 0; @@ -1060,5 +1082,40 @@ void RecoveryService::ProcessRecoverTxTask(RecoverTxTask &task) } } +void RecoveryService::SetSplitRangeInfo(uint32_t ng_id, + TableName table_name, + int32_t range_id, + uint64_t commit_ts) +{ + auto ng_it = split_range_info_.try_emplace(ng_id).first; + auto &table_map = ng_it->second; + auto table_it = + table_map + .try_emplace(table_name, std::unordered_map{}) + .first; + auto &range_map = table_it->second; + auto [it, inserted] = range_map.try_emplace(range_id, commit_ts); + if (!inserted) + { + it->second = commit_ts; + } +} + +const std::unordered_map> * +RecoveryService::GetSplitRangeInfo(uint32_t ng_id) const +{ + auto ng_it = split_range_info_.find(ng_id); + if (ng_it == split_range_info_.end()) + { + return nullptr; + } + return &ng_it->second; +} + +void RecoveryService::CleanSplitRangeInfo(uint32_t ng_id) +{ + split_range_info_.erase(ng_id); +} + } // namespace fault } // namespace txservice From a25bcda7c0accd52dee33a1489472c6d20cfc961 Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:38:02 +0800 Subject: [PATCH 07/15] Update scanslice request (#444) Update the structure definitions and related processing procedures of ScanSliceCc and RemoteScanSlice to adapt to the new key sharding logic. 
--- tx_service/include/cc/cc_request.h | 411 ++------ tx_service/include/cc/ccm_scanner.h | 389 +------- tx_service/include/cc/template_cc_map.h | 933 ++++-------------- tx_service/include/proto/cc_request.proto | 6 +- tx_service/include/remote/remote_cc_request.h | 4 +- tx_service/include/tx_operation_result.h | 118 +-- tx_service/src/cc/local_cc_handler.cpp | 22 +- tx_service/src/remote/cc_stream_receiver.cpp | 60 +- tx_service/src/remote/remote_cc_handler.cpp | 15 +- tx_service/src/remote/remote_cc_request.cpp | 150 +-- 10 files changed, 400 insertions(+), 1708 deletions(-) diff --git a/tx_service/include/cc/cc_request.h b/tx_service/include/cc/cc_request.h index 59998209..b8b57c97 100644 --- a/tx_service/include/cc/cc_request.h +++ b/tx_service/include/cc/cc_request.h @@ -2375,7 +2375,6 @@ struct ScanSliceCc end_key_type_(RangeKeyType::RawPtr), schema_version_(0) { - parallel_req_ = true; } ~ScanSliceCc() @@ -2443,12 +2442,12 @@ struct ScanSliceCc is_require_keys_ = is_require_keys; is_require_recs_ = is_require_recs; - unfinished_core_cnt_.store(1, std::memory_order_relaxed); range_slice_id_.Reset(); last_pinned_slice_ = nullptr; prefetch_size_ = prefetch_size; - err_.store(CcErrorCode::NO_ERROR, std::memory_order_relaxed); + err_ = CcErrorCode::NO_ERROR; cache_hit_miss_collected_ = false; + blocking_info_.Reset(); } void Set(const TableName &tbl_name, @@ -2506,11 +2505,11 @@ struct ScanSliceCc is_require_recs_ = is_require_recs; prefetch_size_ = prefetch_size; - unfinished_core_cnt_.store(1, std::memory_order_relaxed); range_slice_id_.Reset(); last_pinned_slice_ = nullptr; - err_.store(CcErrorCode::NO_ERROR, std::memory_order_relaxed); + err_ = CcErrorCode::NO_ERROR; cache_hit_miss_collected_ = false; + blocking_info_.Reset(); } bool Execute(CcShard &ccs) override @@ -2519,7 +2518,8 @@ struct ScanSliceCc { // Do not modify res_ directly since there could be other cores // still working on this cc req. 
- return SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + return true; } CcMap *ccm = nullptr; @@ -2552,7 +2552,8 @@ struct ScanSliceCc // is marked as errored. if (init_res.error != CcErrorCode::NO_ERROR) { - return SetError(init_res.error); + SetError(init_res.error); + return true; } // The req will be re-enqueued. return false; @@ -2579,16 +2580,13 @@ struct ScanSliceCc void AbortCcRequest(CcErrorCode err_code) override { - if (SetError(err_code)) + SetError(err_code); + // If the request has pinned any slice, unpin it. + if (range_slice_id_.Range() != nullptr) { - // Last core finished. If the request has pinned any slice, unpin - // it. - if (range_slice_id_.Range() != nullptr) - { - UnpinSlices(); - } - Free(); + UnpinSlices(); } + Free(); } bool IsLocal() const @@ -2719,18 +2717,18 @@ struct ScanSliceCc return ts_; } - ScanCache *GetLocalScanCache(size_t shard_id) + ScanCache *GetLocalScanCache() { assert(IsLocal()); - return res_->Value().ccm_scanner_->Cache(shard_id); + return res_->Value().ccm_scanner_->Cache(0); } - RemoteScanSliceCache *GetRemoteScanCache(size_t shard_id) + RemoteScanSliceCache *GetRemoteScanCache() { assert(!IsLocal()); RangeScanSliceResult &slice_result = res_->Value(); - assert(shard_id < slice_result.remote_scan_caches_->size()); - return &slice_result.remote_scan_caches_->at(shard_id); + assert(slice_result.remote_scan_caches_ != nullptr); + return slice_result.remote_scan_caches_; } CcScanner *GetLocalScanner() @@ -2738,161 +2736,70 @@ struct ScanSliceCc return IsLocal() ? 
res_->Value().ccm_scanner_ : nullptr; } - uint64_t BlockingCceLockAddr(uint16_t core_id) + uint64_t BlockingCceLockAddr() const { - assert(core_id < blocking_vec_.size()); - return blocking_vec_[core_id].cce_lock_addr_; + return blocking_info_.cce_lock_addr_; } - std::pair BlockingPair(uint16_t core_id) + std::pair BlockingPair() const { - assert(core_id < blocking_vec_.size()); - return {blocking_vec_[core_id].type_, - blocking_vec_[core_id].scan_type_}; + return {blocking_info_.type_, blocking_info_.scan_type_}; } - void SetBlockingInfo(uint16_t core_id, - uint64_t cce_lock_addr, + void SetBlockingInfo(uint64_t cce_lock_addr, ScanType scan_type, ScanBlockingType blocking_type) { - assert(core_id < blocking_vec_.size()); - blocking_vec_[core_id] = {cce_lock_addr, scan_type, blocking_type}; + blocking_info_.cce_lock_addr_ = cce_lock_addr; + blocking_info_.scan_type_ = scan_type; + blocking_info_.type_ = blocking_type; } - void SetShardCount(uint16_t shard_cnt) + void SetPriorCceLockAddr(uint64_t addr) { - blocking_vec_.resize(shard_cnt); - for (auto &it : blocking_vec_) - { - it.cce_lock_addr_ = 0; - it.scan_type_ = ScanType::ScanUnknow; - it.type_ = ScanBlockingType::NoBlocking; - } - - wait_for_snapshot_cnt_.resize(shard_cnt); - for (uint16_t i = 0; i < shard_cnt; ++i) - { - wait_for_snapshot_cnt_[i] = 0; - } - } - - uint64_t GetShardCount() const - { - return blocking_vec_.size(); - } - - void SetUnfinishedCoreCnt(uint16_t core_cnt) - { - unfinished_core_cnt_.store(core_cnt, std::memory_order_release); - } - - void SetPriorCceLockAddr(uint64_t addr, uint16_t shard_id) - { - assert(shard_id < blocking_vec_.size()); - blocking_vec_[shard_id] = { - addr, ScanType::ScanUnknow, ScanBlockingType::NoBlocking}; + blocking_info_.cce_lock_addr_ = addr; + blocking_info_.scan_type_ = ScanType::ScanUnknow; + blocking_info_.type_ = ScanBlockingType::NoBlocking; } /** * @brief Notifies the scan slice request that the scan at the calling core * has finished. 
* - * @return true, if all cores have finished the scan. - * @return false, if the scan is not completed in all cores. */ - bool SetFinish() + void SetFinish() { - uint16_t remaining_cnt = - unfinished_core_cnt_.fetch_sub(1, std::memory_order_acq_rel); - - if (remaining_cnt == 1) + if (err_ == CcErrorCode::NO_ERROR) { - // Only update result if this is local request. Remote request - // result will be updated by dedicated core. - if (res_->Value().is_local_) - { - if (err_.load(std::memory_order_relaxed) == - CcErrorCode::NO_ERROR) - { - res_->Value().ccm_scanner_->FinalizeCommit(); - - res_->SetFinished(); - } - else - { - res_->SetError(err_.load(std::memory_order_relaxed)); - } - } + res_->SetFinished(); + } + else + { + res_->SetError(err_); } - - return remaining_cnt == 1; } - bool SetError(CcErrorCode err) + void SetError(CcErrorCode err) { - CcErrorCode expected = CcErrorCode::NO_ERROR; - err_.compare_exchange_strong(expected, - err, - std::memory_order_relaxed, - std::memory_order_relaxed); - uint16_t remaining_cnt = - unfinished_core_cnt_.fetch_sub(1, std::memory_order_acq_rel); - - // remaining_cnt might be 0 if all cores have finished and the req is - // put back into the result sending core's queue. - if (remaining_cnt <= 1) + if (err_ == CcErrorCode::NO_ERROR) { - res_->SetError(err_.load(std::memory_order_relaxed)); + err_ = err; } - return remaining_cnt <= 1; + res_->SetError(err_); } void DeferSetError(CcErrorCode err) { - CcErrorCode expected = CcErrorCode::NO_ERROR; - err_.compare_exchange_strong(expected, - err, - std::memory_order_relaxed, - std::memory_order_relaxed); - } - - CcErrorCode GetError() const - { - return err_.load(std::memory_order_acquire); - } - - /** - * @brief Send response to src node if all cores have finished. - * We use this method to send scan slice response if this request is - * a remote request. 
- * We assign a dedicated core to be the response sender instead of directly - * sending the response on the last finished core. This is to avoid - * serialization of response message causing one core to become - * significantly slower than others and would end up being the sender of all - * scan slice response. - */ - bool SendResponseIfFinished() - { - if (unfinished_core_cnt_.load(std::memory_order_relaxed) == 0) + if (err_ == CcErrorCode::NO_ERROR) { - if (err_.load(std::memory_order_relaxed) == CcErrorCode::NO_ERROR) - { - res_->SetFinished(); - } - else - { - res_->SetError(err_.load(std::memory_order_relaxed)); - } - return true; + err_ = err; } - return false; } - bool IsResponseSender(uint16_t core_id) const + CcErrorCode GetError() const { - return ((tx_number_ & 0x3FF) % blocking_vec_.size()) == core_id; + return err_; } bool IsForWrite() const @@ -2965,30 +2872,30 @@ struct ScanSliceCc cache_hit_miss_collected_ = true; } - bool IsWaitForSnapshot(uint16_t core_id) const + bool IsWaitForSnapshot() const { - return blocking_vec_[core_id].type_ == - ScanBlockingType::BlockOnWaitSnapshots; + return blocking_info_.type_ == ScanBlockingType::BlockOnWaitSnapshots; } - void SetIsWaitForSnapshot(uint16_t core_id) + void SetIsWaitForSnapshot() { - blocking_vec_[core_id].type_ = ScanBlockingType::BlockOnWaitSnapshots; + blocking_info_.type_ = ScanBlockingType::BlockOnWaitSnapshots; } - size_t WaitForSnapshotCnt(uint16_t core_id) const + size_t WaitForSnapshotCnt() const { - return wait_for_snapshot_cnt_[core_id]; + return wait_for_snapshot_cnt_; } - void DecreaseWaitForSnapshotCnt(uint16_t core_id) + void DecreaseWaitForSnapshotCnt() { - wait_for_snapshot_cnt_[core_id]--; + assert(wait_for_snapshot_cnt_ > 0); + wait_for_snapshot_cnt_--; } - void IncreaseWaitForSnapshotCnt(uint16_t core_id) + void IncreaseWaitForSnapshotCnt() { - wait_for_snapshot_cnt_[core_id]++; + wait_for_snapshot_cnt_++; } bool AbortIfOom() const override @@ -3042,8 +2949,7 @@ struct ScanSliceCc 
uint32_t range_id_{0}; - std::atomic unfinished_core_cnt_{1}; - std::atomic err_{CcErrorCode::NO_ERROR}; + CcErrorCode err_{CcErrorCode::NO_ERROR}; uint64_t ts_{0}; @@ -3053,13 +2959,20 @@ struct ScanSliceCc struct ScanBlockingInfo { - uint64_t cce_lock_addr_; - ScanType scan_type_; - ScanBlockingType type_; + void Reset() + { + cce_lock_addr_ = 0; + scan_type_ = ScanType::ScanUnknow; + type_ = ScanBlockingType::NoBlocking; + } + + uint64_t cce_lock_addr_{0}; + ScanType scan_type_{ScanType::ScanUnknow}; + ScanBlockingType type_{ScanBlockingType::NoBlocking}; }; - std::vector blocking_vec_; + ScanBlockingInfo blocking_info_; - std::vector wait_for_snapshot_cnt_; + size_t wait_for_snapshot_cnt_{0}; RangeSliceId range_slice_id_; @@ -3268,36 +3181,14 @@ struct ProcessRemoteScanRespCc : public CcRequestBase void Reset(remote::CcStreamReceiver *receiver, std::unique_ptr resp_msg, - std::vector &&offset_tables, - CcHandlerResult *hd_res, - size_t worker_cnt) + CcHandlerResult *hd_res) { receiver_ = receiver; resp_msg_ = std::move(resp_msg); - offset_tables_ = std::move(offset_tables); hd_res_ = hd_res; - - unfinished_cnt_ = worker_cnt; - next_remote_core_idx_ = worker_cnt; - - assert(offset_tables_.size() == RemoteCoreCnt()); - assert(worker_cnt <= RemoteCoreCnt()); - - cur_idxs_.clear(); - key_offsets_.clear(); - rec_offsets_.clear(); - - assert(cur_idxs_.empty()); - assert(key_offsets_.empty()); - assert(rec_offsets_.empty()); - - for (size_t worker_idx = 0; worker_idx < worker_cnt; ++worker_idx) - { - // worker idx must be less or equal than remote core count - cur_idxs_.push_back({worker_idx, 0}); - key_offsets_.push_back(KeyStartOffset(worker_idx)); - rec_offsets_.push_back(RecStartOffset(worker_idx)); - } + cur_tuple_idx_ = 0; + key_offset_ = 0; + rec_offset_ = 0; } ProcessRemoteScanRespCc(const ProcessRemoteScanRespCc &) = delete; @@ -3310,74 +3201,56 @@ struct ProcessRemoteScanRespCc : public CcRequestBase do { - auto &[remote_core_idx, tuple_idx] = 
cur_idxs_.at(ccs.core_id_); - + uint32_t remote_core_idx = resp_msg_->core_id(); const uint64_t *key_ts_ptr = (const uint64_t *) resp_msg_->key_ts().data(); - key_ts_ptr += MetaOffset(remote_core_idx); const uint64_t *gap_ts_ptr = (const uint64_t *) resp_msg_->gap_ts().data(); - gap_ts_ptr += MetaOffset(remote_core_idx); const uint64_t *term_ptr = (const uint64_t *) resp_msg_->term().data(); - term_ptr += MetaOffset(remote_core_idx); const uint64_t *cce_lock_ptr_ptr = (const uint64_t *) resp_msg_->cce_lock_ptr().data(); - cce_lock_ptr_ptr += MetaOffset(remote_core_idx); const remote::RecordStatusType *rec_status_ptr = (const remote::RecordStatusType *) resp_msg_->rec_status() .data(); - rec_status_ptr += MetaOffset(remote_core_idx); RangeScanSliceResult &scan_slice_result = hd_res_->Value(); CcScanner &range_scanner = *scan_slice_result.ccm_scanner_; - ScanCache *shard_cache = range_scanner.Cache(remote_core_idx); + ScanCache *shard_cache = range_scanner.Cache(0); - size_t &key_offset = key_offsets_[ccs.core_id_]; - size_t &rec_offset = rec_offsets_[ccs.core_id_]; - size_t tuple_cnt = TupleCnt(remote_core_idx); + size_t tuple_cnt = TupleCnt(); - for (; tuple_idx < tuple_cnt && scan_cnt < SCAN_BATCH_SIZE; - ++tuple_idx, ++scan_cnt) + for (; cur_tuple_idx_ < tuple_cnt && scan_cnt < SCAN_BATCH_SIZE; + ++cur_tuple_idx_, ++scan_cnt) { RecordStatus rec_status = remote::ToLocalType::ConvertRecordStatusType( - rec_status_ptr[tuple_idx]); + rec_status_ptr[cur_tuple_idx_]); shard_cache->AddScanTuple(resp_msg_->keys(), - key_offset, - key_ts_ptr[tuple_idx], + key_offset_, + key_ts_ptr[cur_tuple_idx_], resp_msg_->records(), - rec_offset, + rec_offset_, rec_status, -1, - gap_ts_ptr[tuple_idx], - cce_lock_ptr_ptr[tuple_idx], - term_ptr[tuple_idx], + gap_ts_ptr[cur_tuple_idx_], + cce_lock_ptr_ptr[cur_tuple_idx_], + term_ptr[cur_tuple_idx_], remote_core_idx, scan_slice_result.cc_ng_id_, true); } - if (tuple_idx == tuple_cnt) + if (cur_tuple_idx_ == tuple_cnt) { - size_t 
trailing_cnt = TrailingCnt(remote_core_idx); - while (trailing_cnt-- > 0) - { - shard_cache->RemoveLast(); - } - - range_scanner.CommitAtCore(remote_core_idx); - - if (!MoveForward(ccs.core_id_)) - { - // No more data - return SetFinished(); - } + // No more data + SetFinished(); + return true; } // To avoid blocking other request for a long time, we only process @@ -3389,115 +3262,43 @@ struct ProcessRemoteScanRespCc : public CcRequestBase return false; } - bool SetFinished() + void SetFinished() { - // This core is last finished worker. We need to set handler result and - // recycle message. - if (unfinished_cnt_.fetch_sub(1, std::memory_order_release) == 1) + if (resp_msg_->error_code() != 0) { - if (resp_msg_->error_code() != 0) - { - hd_res_->SetError(remote::ToLocalType::ConvertCcErrorCode( - resp_msg_->error_code())); - } - else - { - hd_res_->Value().ccm_scanner_->FinalizeCommit(); - - hd_res_->SetFinished(); - } - - TransactionExecution *txm = - reinterpret_cast(resp_msg_->txm_addr()); - txm->ReleaseSharedForwardLatch(); - - // Recycle message - receiver_->RecycleScanSliceResp(std::move(resp_msg_)); - - // Return true to recycle this request - return true; + hd_res_->SetError(remote::ToLocalType::ConvertCcErrorCode( + resp_msg_->error_code())); } - - return false; - } - -private: - bool MoveForward(size_t worker_idx) - { - size_t new_remote_core_idx = next_remote_core_idx_.fetch_add(1); - if (new_remote_core_idx < RemoteCoreCnt()) + else { - cur_idxs_.at(worker_idx) = {new_remote_core_idx, 0}; - key_offsets_.at(worker_idx) = KeyStartOffset(new_remote_core_idx); - rec_offsets_.at(worker_idx) = RecStartOffset(new_remote_core_idx); - - return true; + hd_res_->SetFinished(); } - // No more data - return false; - } - - size_t KeyStartOffset(size_t remote_core_idx) const - { - const size_t *ptr = reinterpret_cast( - resp_msg_->key_start_offsets().data()); - ptr += remote_core_idx; - return *ptr; - } + TransactionExecution *txm = + 
reinterpret_cast(resp_msg_->txm_addr());
+        txm->ReleaseSharedForwardLatch();
 
-    size_t RecStartOffset(size_t remote_core_idx) const
-    {
-        const size_t *ptr = reinterpret_cast(
-            resp_msg_->record_start_offsets().data());
-        ptr += remote_core_idx;
-        return *ptr;
-    }
-
-    size_t MetaOffset(size_t remote_core_idx) const
-    {
-        return offset_tables_[remote_core_idx];
+        // Recycle message
+        receiver_->RecycleScanSliceResp(std::move(resp_msg_));
     }
 
-    size_t TupleCnt(size_t remote_core_idx) const
+private:
+    size_t TupleCnt() const
     {
         const char *tuple_cnt_info = resp_msg_->tuple_cnt().data();
-        // remote core count
-        tuple_cnt_info += sizeof(uint16_t);
-        // tuple count
-        tuple_cnt_info += remote_core_idx * sizeof(size_t);
         return *(reinterpret_cast(tuple_cnt_info));
     }
 
-    size_t TrailingCnt(size_t remote_core_idx) const
-    {
-        const size_t *ptr =
-            reinterpret_cast(resp_msg_->trailing_cnts().data());
-        ptr += remote_core_idx;
-        return *ptr;
-    }
-
-    uint16_t RemoteCoreCnt() const
-    {
-        const char *tuple_cnt_info = resp_msg_->tuple_cnt().data();
-        return *reinterpret_cast(tuple_cnt_info);
-    }
-
     remote::CcStreamReceiver *receiver_{nullptr};
     std::unique_ptr resp_msg_{nullptr};
 
-    // Store the start postition of meta data like `key_ts`.
-    std::vector offset_tables_;
-    // The vector of {remote_core_idx, current_tuple_idx}.
-    std::vector> cur_idxs_;
+    // Index of the current tuple being processed.
+    size_t cur_tuple_idx_;
 
     // We need to store key/rec offset so that we could restart from pause
     // point.
-    std::vector key_offsets_;
-    std::vector rec_offsets_;
+    size_t key_offset_;
+    size_t rec_offset_;
 
-    // Unfinished worker count. std::min(this_node_core_count,
-    // remote_core_count)
-    std::atomic unfinished_cnt_{0};
     // Next remote core idx we need to process. 
std::atomic next_remote_core_idx_{0}; CcHandlerResult *hd_res_{nullptr}; diff --git a/tx_service/include/cc/ccm_scanner.h b/tx_service/include/cc/ccm_scanner.h index 7de8dbb2..96c5d898 100644 --- a/tx_service/include/cc/ccm_scanner.h +++ b/tx_service/include/cc/ccm_scanner.h @@ -424,7 +424,6 @@ class CcScanner return TxKey(); } - virtual void ResetShards(size_t shard_cnt) = 0; virtual void ResetCaches() = 0; virtual void Reset(const KeySchema *key_schema) = 0; virtual void Close() = 0; @@ -466,16 +465,6 @@ class CcScanner virtual uint32_t ShardCount() const = 0; - virtual void CommitAtCore(uint16_t core_id) - { - assert(false); - } - - virtual void FinalizeCommit() - { - assert(false); - } - ScanDirection Direction() const { return direct_; @@ -841,12 +830,6 @@ class HashParitionCcScanner : public CcScanner { } - void ResetShards(size_t shard_cnt) override - { - assert(false && - "ResetShards is designed for RangePartitionedCcmScanner."); - } - void ResetCaches() override { for (auto &[shard_code, cache] : shard_caches_) @@ -1199,7 +1182,9 @@ class RangePartitionedCcmScanner : public CcScanner RangePartitionedCcmScanner(ScanDirection direct, ScanIndexType index_type, const KeySchema *schema) - : CcScanner(direct, index_type), scans_(), key_schema_(schema) + : CcScanner(direct, index_type), + scan_cache_(this, schema), + key_schema_(schema) { } @@ -1207,113 +1192,59 @@ class RangePartitionedCcmScanner : public CcScanner { } - void ResetShards(size_t shard_cnt) override - { - size_t old_size = scans_.size(); - if (shard_cnt > old_size) - { - scans_.reserve(shard_cnt); - index_chain_.reserve(shard_cnt); - for (size_t idx = old_size; idx < shard_cnt; ++idx) - { - scans_.emplace_back(this, key_schema_); - index_chain_.emplace_back(); - } - } - else if (shard_cnt < old_size) - { - for (size_t idx = shard_cnt; idx < old_size; ++idx) - { - scans_.pop_back(); - } - index_chain_.resize(shard_cnt); - } - - assert(scans_.size() == shard_cnt); - - for (size_t idx = 0; idx < 
old_size && idx < shard_cnt; ++idx) - { - scans_[idx].Reset(); - index_chain_[idx].clear(); - } - - std::unique_lock lk(mux_); - head_index_ = Inf(); - head_occupied_ = false; - } - void ResetCaches() override { - for (size_t core_id = 0; core_id < scans_.size(); ++core_id) - { - scans_[core_id].Reset(); - index_chain_[core_id].clear(); - } - - head_index_ = Inf(); - head_occupied_ = false; + scan_cache_.Reset(); } ScanCache *Cache(uint32_t shard_code) override { - // For RangePartitionedCcmScanner, shard_code is core_id. - return &scans_[shard_code]; + (void) shard_code; + return &scan_cache_; } void ShardCacheSizes(std::vector> *shard_code_and_sizes) const override { - for (size_t core_id = 0; core_id < scans_.size(); ++core_id) - { - shard_code_and_sizes->emplace_back(core_id, scans_[core_id].Size()); - } + shard_code_and_sizes->emplace_back(0u, scan_cache_.Size()); } void MemoryShardCacheLastTuples( std::vector *last_tuples) const override { - last_tuples->reserve(scans_.size()); - for (size_t core_id = 0; core_id < scans_.size(); ++core_id) - { - last_tuples->emplace_back(scans_[core_id].LastTuple()); - } + last_tuples->emplace_back(scan_cache_.LastTuple()); } void MemoryShardCacheTrailingTuples( std::vector *trailing_tuples) const override { - for (size_t core_id = 0; core_id < scans_.size(); ++core_id) - { - scans_[core_id].TrailingTuples(*trailing_tuples); - } + scan_cache_.TrailingTuples(*trailing_tuples); } const ScanTuple *Current() override { - if (head_index_ == Inf()) + if (status_ != ScannerStatus::Open) { - status_ = ScannerStatus::Blocked; return nullptr; } - else + + const TemplateScanTuple *tuple = scan_cache_.Current(); + if (tuple == nullptr) { - assert(status_ == ScannerStatus::Open); - return At(head_index_); + status_ = ScannerStatus::Blocked; } + + return tuple; } void MoveNext() override { - if (head_index_ == Inf()) + if (status_ != ScannerStatus::Open) { return; } - head_index_ = AdvanceMergeIndex(head_index_); - if (head_index_ == 
Inf()) - { - status_ = ScannerStatus::Blocked; - } + scan_cache_.MoveNext(); } CcmScannerType Type() const override @@ -1342,7 +1273,7 @@ class RangePartitionedCcmScanner : public CcScanner uint32_t ShardCount() const override { - return scans_.size(); + return 1; } void Reset(const KeySchema *key_schema) override @@ -1354,289 +1285,11 @@ class RangePartitionedCcmScanner : public CcScanner void Close() override { status_ = ScannerStatus::Closed; - scans_.clear(); - index_chain_.clear(); - head_index_ = Inf(); - head_occupied_ = false; - } - - /** - * @brief Commits the scan at the specified core. - * - * @param core_id - */ - void CommitAtCore(uint16_t core_id) override - { - size_t sz = scans_[core_id].Size(); - if (sz > 0) - { - std::vector &next_chain = index_chain_[core_id]; - assert(next_chain.empty()); - next_chain.reserve(sz); - - for (uint32_t idx = 0; idx < sz - 1; ++idx) - { - next_chain.emplace_back(core_id, idx + 1); - } - // The next index of the last tuple is infinity. - next_chain.emplace_back(Inf()); - assert(next_chain.size() == sz); - - if (is_require_sort_) - { - CompoundIndex head_index(core_id, 0); - MergeCompoundIndex(head_index); - } - else - { - // Concat. Delay concat to FinalizeCommit() to avoid lock. - } - } - } - - void FinalizeCommit() override - { - if (is_require_sort_) - { - // Already sorted by CommitAtCore(). 
- } - else - { - ConcatAll(); - } + scan_cache_.Reset(); } private: - struct CompoundIndex - { - public: - CompoundIndex() : index_(UINT32_MAX) - { - } - - CompoundIndex(uint16_t core_id, uint32_t offset) - { - index_ = (offset << 10) | core_id; - } - - friend bool operator==(const CompoundIndex &lhs, - const CompoundIndex &rhs) - { - return lhs.index_ == rhs.index_; - } - - friend bool operator!=(const CompoundIndex &lhs, - const CompoundIndex &rhs) - { - return !(lhs == rhs); - } - - uint16_t CoreId() const - { - return index_ & 0x3FF; - } - - uint32_t Offset() const - { - return index_ >> 10; - } - - private: - /** - * @brief The lower 10 bits represent the core ID. The remaining higher - * bits represent the offset in the scan result vector. - * - */ - uint32_t index_; - }; - - const CompoundIndex &Inf() const - { - static CompoundIndex inf; - return inf; - } - - void MergeCompoundIndex(CompoundIndex head) - { - std::unique_lock lk(mux_); - if (!head_occupied_) - { - // The head is empty. There is nothing to merge. Sets the head to - // the input scan list's head. - head_index_ = head; - head_occupied_ = true; - } - else if (head != Inf()) - { - // Merges the input scan list with the list pointed by the head. - if (head_index_ == Inf()) - { - head_index_ = head; - return; - } - CompoundIndex curr_head = head_index_; - head_occupied_ = false; - - lk.unlock(); - MergeCompoundIndex(head, curr_head); - } - } - - void MergeCompoundIndex(CompoundIndex left, CompoundIndex right) - { - CompoundIndex merge_head; - CompoundIndex prev_index; - - if (left == Inf()) - { - // The left is empty. - return MergeCompoundIndex(right); - } - else if (right == Inf()) - { - // The right is empty. 
- return MergeCompoundIndex(left); - } - - const TemplateScanTuple *left_tuple = At(left); - const TemplateScanTuple *right_tuple = At(right); - - if (IsForward) - { - if (left_tuple->KeyObj() < right_tuple->KeyObj()) - { - merge_head = left; - prev_index = left; - left = AdvanceMergeIndex(left); - } - else - { - merge_head = right; - prev_index = right; - right = AdvanceMergeIndex(right); - } - - while (left != Inf() && right != Inf()) - { - left_tuple = At(left); - right_tuple = At(right); - - if (left_tuple->KeyObj() < right_tuple->KeyObj()) - { - UpdateNextIndex(prev_index, left); - prev_index = left; - left = AdvanceMergeIndex(left); - } - else - { - UpdateNextIndex(prev_index, right); - prev_index = right; - right = AdvanceMergeIndex(right); - } - } - } - else - { - if (left_tuple->KeyObj() < right_tuple->KeyObj()) - { - merge_head = right; - prev_index = right; - right = AdvanceMergeIndex(right); - } - else - { - merge_head = left; - prev_index = left; - left = AdvanceMergeIndex(left); - } - - while (left != Inf() && right != Inf()) - { - left_tuple = At(left); - right_tuple = At(right); - - if (left_tuple->KeyObj() < right_tuple->KeyObj()) - { - UpdateNextIndex(prev_index, right); - prev_index = right; - right = AdvanceMergeIndex(right); - } - else - { - UpdateNextIndex(prev_index, left); - prev_index = left; - left = AdvanceMergeIndex(left); - } - } - } - - if (left != Inf()) - { - UpdateNextIndex(prev_index, left); - } - - if (right != Inf()) - { - UpdateNextIndex(prev_index, right); - } - - MergeCompoundIndex(merge_head); - } - - /** - * @brief Concat all chains at last finished core to avoid lock. 
- */ - void ConcatAll() - { - assert(head_index_ == Inf()); - for (uint16_t core_id = 0; core_id < index_chain_.size(); ++core_id) - { - std::vector &chain = index_chain_[core_id]; - if (!chain.empty()) - { - ConcatLockFree(core_id, chain); - } - } - } - - void ConcatLockFree(uint16_t core_id, std::vector &chain) - { - chain.back() = head_index_; - head_index_ = {core_id, 0}; - } - - CompoundIndex AdvanceMergeIndex(CompoundIndex index) - { - assert(index.CoreId() < index_chain_.size()); - assert(index.Offset() < index_chain_[index.CoreId()].size()); - - return index_chain_[index.CoreId()][index.Offset()]; - } - - const TemplateScanTuple *At(CompoundIndex index) const - { - assert(index.CoreId() < scans_.size()); - assert(index.Offset() < scans_[index.CoreId()].Size()); - - return scans_[index.CoreId()].At(index.Offset()); - } - - void UpdateNextIndex(CompoundIndex prev_index, CompoundIndex index) - { - assert(prev_index.CoreId() < index_chain_.size()); - assert(prev_index.Offset() < index_chain_[prev_index.CoreId()].size()); - - index_chain_[prev_index.CoreId()][prev_index.Offset()] = index; - } - - // Scan caches of the target node group. Its size is core count of the - // target node. - std::vector> scans_; - std::vector> index_chain_; - std::mutex mux_; - bool head_occupied_{false}; - CompoundIndex head_index_{Inf()}; - + TemplateScanCache scan_cache_; const KeySchema *key_schema_; /** * @brief The term of the cc node group where the range partition resides. 
diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index c1c6f451..22a21f8d 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -3473,7 +3473,8 @@ class TemplateCcMap : public CcMap if (ng_term < 0 || (req.RangeCcNgTerm() > 0 && req.RangeCcNgTerm() != ng_term)) { - return req.SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + req.SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + return true; } if (req.SchemaVersion() != 0 && req.SchemaVersion() != schema_ts_) @@ -3482,41 +3483,14 @@ class TemplateCcMap : public CcMap return true; } - if (req.SendResponseIfFinished()) + if (req.IsWaitForSnapshot()) { + assert(req.WaitForSnapshotCnt() == 0); req.UnpinSlices(); + req.SetFinish(); return true; } - if (req.IsWaitForSnapshot(shard_->core_id_)) - { - assert(req.WaitForSnapshotCnt(shard_->core_id_) == 0); - if (req.SetFinish()) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } - } - CcOperation cc_op; bool is_read_snapshot; if (table_name_.Type() == TableType::Secondary || @@ -3583,18 +3557,17 @@ class TemplateCcMap : public CcMap req.SetEndKey(TxKey(std::move(decoded_end_key))); } - uint16_t core_id = shard_->LocalCoreId(); TemplateScanCache *scan_cache = nullptr; RemoteScanSliceCache *remote_scan_cache = nullptr; if (req.IsLocal()) { scan_cache = static_cast *>( - req.GetLocalScanCache(core_id)); + req.GetLocalScanCache()); assert(scan_cache != nullptr); } else { - remote_scan_cache = req.GetRemoteScanCache(core_id); + remote_scan_cache = req.GetRemoteScanCache(); assert(remote_scan_cache != nullptr); } @@ -3636,10 +3609,6 @@ class TemplateCcMap : public CcMap if 
(req.SliceId().Slice() == nullptr) { - // The scan slice request is first dispatched to one core, which - // pins the slice in memory. After the slice is pinned, the request - // is dispatched to other cores to scan in parallel. The slice is - // unpinned by the last core finishing the scan batch. RangeSliceOpStatus pin_status = RangeSliceOpStatus::NotPinned; uint32_t max_pin_cnt = req.PrefetchSize(); const StoreSlice *last_pinned_slice; @@ -3689,7 +3658,8 @@ class TemplateCcMap : public CcMap { if (slice_id.Range()->HasLock()) { - return req.SetError(CcErrorCode::OUT_OF_MEMORY); + req.SetError(CcErrorCode::OUT_OF_MEMORY); + return true; } else { @@ -3706,27 +3676,12 @@ class TemplateCcMap : public CcMap { // If the pin operation returns an error, the data store // is inaccessible. - return req.SetError(CcErrorCode::PIN_RANGE_SLICE_FAILED); + req.SetError(CcErrorCode::PIN_RANGE_SLICE_FAILED); + return true; } assert(pin_status == RangeSliceOpStatus::Successful); req.PinSlices(slice_id, last_pinned_slice); - // Update unfinished cnt before dispatching to remaining cores. - req.SetUnfinishedCoreCnt(req.GetShardCount()); - - // Dispatches to remaining cores to scan pinned slice(s) in - // parallel. 
- for (uint16_t core_id = 0; core_id < shard_->local_shards_.Count(); - ++core_id) - { - if (core_id == shard_->core_id_) - { - continue; - } - - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, core_id, &req); - } } Iterator scan_ccm_it; @@ -3784,7 +3739,6 @@ class TemplateCcMap : public CcMap case CcErrorCode::MVCC_READ_MUST_WAIT_WRITE: { req.SetBlockingInfo( - shard_->core_id_, reinterpret_cast(cce->GetLockAddr()), scan_type, ScanBlockingType::BlockOnFuture); @@ -3793,7 +3747,6 @@ class TemplateCcMap : public CcMap case CcErrorCode::ACQUIRE_LOCK_BLOCKED: { req.SetBlockingInfo( - shard_->core_id_, reinterpret_cast(cce->GetLockAddr()), scan_type, ScanBlockingType::BlockOnLock); @@ -3853,7 +3806,7 @@ class TemplateCcMap : public CcMap assert(fetch_ret_status == store::DataStoreHandler::DataStoreOpStatus::Success); (void) fetch_ret_status; - req.IncreaseWaitForSnapshotCnt(shard_->core_id_); + req.IncreaseWaitForSnapshotCnt(); } } else @@ -3903,14 +3856,14 @@ class TemplateCcMap : public CcMap assert(fetch_ret_status == store::DataStoreHandler::DataStoreOpStatus::Success); (void) fetch_ret_status; - req.IncreaseWaitForSnapshotCnt(shard_->core_id_); + req.IncreaseWaitForSnapshotCnt(); } } return {ScanReturnType::Success, CcErrorCode::NO_ERROR}; }; - uint64_t cce_lock_addr = req.BlockingCceLockAddr(core_id); + uint64_t cce_lock_addr = req.BlockingCceLockAddr(); if (cce_lock_addr != 0) { KeyGapLockAndExtraData *lock = @@ -3920,7 +3873,7 @@ class TemplateCcMap : public CcMap CcEntry *>( lock->GetCcEntry()); - auto [blocking_type, scan_type] = req.BlockingPair(core_id); + auto [blocking_type, scan_type] = req.BlockingPair(); CcPage *ccp = static_cast< CcPage *>( @@ -3975,43 +3928,16 @@ class TemplateCcMap : public CcMap assert(lock_pair.second == CcErrorCode::MVCC_READ_FOR_WRITE_CONFLICT); - if (req.IsLocal()) + if (is_read_snapshot && req.WaitForSnapshotCnt() > 0) { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - 
req.WaitForSnapshotCnt(shard_->core_id_) > 0) - { - req.SetIsWaitForSnapshot(shard_->core_id_); + req.SetIsWaitForSnapshot(); req.DeferSetError(lock_pair.second); return false; } - if (req.SetError(lock_pair.second)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } + req.UnpinSlices(); + req.SetError(lock_pair.second); + return true; } is_locked = lock_pair.first != LockType::NoLock; @@ -4058,7 +3984,7 @@ class TemplateCcMap : public CcMap store::DataStoreHandler::DataStoreOpStatus:: Success); (void) fetch_ret_status; - req.IncreaseWaitForSnapshotCnt(shard_->core_id_); + req.IncreaseWaitForSnapshotCnt(); } } else @@ -4109,7 +4035,7 @@ class TemplateCcMap : public CcMap store::DataStoreHandler::DataStoreOpStatus:: Success); (void) fetch_ret_status; - req.IncreaseWaitForSnapshotCnt(shard_->core_id_); + req.IncreaseWaitForSnapshotCnt(); } } } @@ -4155,90 +4081,39 @@ class TemplateCcMap : public CcMap } RangeScanSliceResult &slice_result = hd_res->Value(); - auto [final_end_tx_key, end_finalized] = slice_result.PeekLastKey(); if (req.Direction() == ScanDirection::Forward) { const TemplateStoreSlice *last_slice = static_cast *>( req.LastPinnedSlice()); - // The scan at core 0 sets the scan's end key. By default, the - // scan's end is the exclusive end of the slice or the request's - // specified end key, whichever is smaller. In case keys in the - // slice are too many to fit into the scan cache, the key right - // after the last scanned tuple at core 0 becomes the exclusive end - // of scans at other cores. In such a case, it is mandatory that all - // keys smaller than the end key at other cores are returned in this - // batch. 
So, scans at other cores may slightly exceed the scan - // cache's capacity. - + // By default, the scan's end is the exclusive end of the slice or + // the request's specified end key, whichever is smaller. In case + // keys in the slice are too many to fit into the scan cache, the + // key right after the last scanned tuple becomes the exclusive end + // of scans. const KeyT *initial_end = nullptr; bool init_end_inclusive = false; - // Given the scan batch's final end key, deduces the local scan's - // end and inclusiveness. - auto deduce_scan_end = - [](const KeyT *batch_end_key, - const KeyT *req_end_key, - bool req_inclusive) -> std::pair - { - const KeyT *end = nullptr; - bool inclusive = false; - - assert(batch_end_key != nullptr); - // If the request specifies the end key and it is the scan - // batch's end key, the scan's inclusiveness is determined by - // the request. Or, the scan batch's end must be the exclusive - // end of a slice or positive infinity. - if (batch_end_key == req_end_key) - { - end = req_end_key; - inclusive = req_inclusive; - } - else - { - end = batch_end_key; - inclusive = false; - } + // Takes the smaller of the slice's last key and the request's end + // key as the local scan's initial end. + const KeyT *slice_end = last_slice->EndKey(); + assert(slice_end != nullptr); - return {end, inclusive}; - }; - - if (!end_finalized) + // If the request specifies the end key and it falls into the + // slice, initializes the local scan's end to the request's end + // key. Or, the scan end is the slice's end. + if (req_end_key != nullptr && + (*req_end_key < *slice_end || + (*req_end_key == *slice_end && !req.EndInclusive()))) { - // This scan batch's end key has not been set. Takes the smaller - // of the slice's last key and the request's end key as the - // local scan's initial end. The initial end may be modified, if - // another core finishes earlier and finalizes the batch's end - // before this core. 
The final end may be smaller or greater - // than the initial end. - const KeyT *slice_end = last_slice->EndKey(); - assert(slice_end != nullptr); - - // If the request specifies the end key and it falls into the - // slice, initializes the local scan's end to the request's end - // key. Or, the scan end is the slice's end. - if (req_end_key != nullptr && - (*req_end_key < *slice_end || - (*req_end_key == *slice_end && !req.EndInclusive()))) - { - initial_end = req_end_key; - init_end_inclusive = req.EndInclusive(); - } - else - { - initial_end = slice_end; - init_end_inclusive = false; - } + initial_end = req_end_key; + init_end_inclusive = req.EndInclusive(); } else { - // This scan batch's end key has been finalized by one of the - // cores. Deduces the local scan's end and inclusiveness. - std::tie(initial_end, init_end_inclusive) = - deduce_scan_end(final_end_tx_key->GetKey(), - req_end_key, - req.EndInclusive()); + initial_end = slice_end; + init_end_inclusive = false; } auto scan_batch_func = @@ -4265,12 +4140,11 @@ class TemplateCcMap : public CcMap return {scan_ret, err_code}; }; - auto scan_loop_func = [this, &scan_batch_func, &is_cache_full]( - Iterator &scan_ccm_it, - const KeyT &end_key, - bool inclusive, - bool end_finalized) - -> std::pair + auto scan_loop_func = + [this, &scan_batch_func, &is_cache_full]( + Iterator &scan_ccm_it, + const KeyT &end_key, + bool inclusive) -> std::pair { ScanReturnType scan_ret = ScanReturnType::Success; CcErrorCode err_code = CcErrorCode::NO_ERROR; @@ -4322,7 +4196,7 @@ class TemplateCcMap : public CcMap scan_ccm_it = End(); ccp = nullptr; } - else if (!end_finalized && is_cache_full()) + else if (is_cache_full()) { scan_ccm_it = Iterator(ccp->next_page_, 0, &neg_inf_); @@ -4344,50 +4218,23 @@ class TemplateCcMap : public CcMap return {scan_ret, err_code}; }; - auto [scan_ret, err] = scan_loop_func( - scan_ccm_it, *initial_end, init_end_inclusive, end_finalized); + auto [scan_ret, err] = + scan_loop_func(scan_ccm_it, 
*initial_end, init_end_inclusive); switch (scan_ret) { case ScanReturnType::Blocked: return false; case ScanReturnType::Error: - if (req.IsLocal()) + if (is_read_snapshot && req.WaitForSnapshotCnt() > 0) { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - req.WaitForSnapshotCnt(shard_->core_id_) > 0) - { - req.SetIsWaitForSnapshot(shard_->core_id_); + req.SetIsWaitForSnapshot(); req.DeferSetError(err); return false; } - if (req.SetError(err)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } + req.UnpinSlices(); + req.SetError(err); + return true; case ScanReturnType::Yield: shard_->Enqueue(shard_->core_id_, &req); return false; @@ -4395,234 +4242,61 @@ class TemplateCcMap : public CcMap break; } - // If the end of this scan batch is not finalized when the local - // scan at this core started, tries to set the batch's end using the - // local end. If another core has finalized the batch's end, the - // scan at this core may need to be adjusted: if the batch's final - // end is less than the end at this core, keys after the final end - // needs to be removed from the local scan cache; if the batch's - // final end is greater than the end of this core, keys smaller than - // the batch's final end but greater than the local end need to be - // included in the local scan cache. - if (!end_finalized) - { - const KeyT *local_end = nullptr; - SlicePosition slice_position; - - // scan_ccm_it points to the entry after the last scanned tuple. - // If the slice ends with positive infinity and has been fully - // scanned, scan_ccm_it would point to positive infinity. 
- auto pos_inf_it = End(); - if (scan_ccm_it != pos_inf_it && - (*scan_ccm_it->first < *initial_end || - (init_end_inclusive && - *scan_ccm_it->first == *initial_end))) - { - // The slice is too large. The scan has not fully scanned - // the slice, before reaching the cache's size limit. - // Pretends the slice's exclusive end to be the key after - // the last scanned tuple, from which the next scan batch - // resume. - local_end = scan_ccm_it->first; - slice_position = SlicePosition::Middle; - } - else - { - // The slice has been fully scanned. If the request - // specifies the end key, which falls into the slice, given - // that the slice has been fully scanned, no future scan - // batches are needed. So, we pretend that the scan has - // reached the last slice ending with positive infinity. - // The calling tx will terminate the scan. - if (initial_end == KeyT::PositiveInfinity() || - req_end_key == initial_end) - { - local_end = initial_end; - slice_position = SlicePosition::LastSlice; - } - else - { - // The local scan end must be the end of the slice. - local_end = initial_end; - const TemplateStoreRange *range = - static_cast *>( - req.SliceId().Range()); - const KeyT *range_end = range->RangeEndKey(); - if (range_end != nullptr && *initial_end == *range_end) - { - slice_position = SlicePosition::LastSliceInRange; - } - else - { - slice_position = SlicePosition::Middle; - } - } - } - - auto [batch_end, set_success] = - slice_result.UpdateLastKey(local_end, slice_position); + const KeyT *local_end = nullptr; + SlicePosition slice_position; - if (set_success) + // scan_ccm_it points to the entry after the last scanned tuple. + // If the slice ends with positive infinity and has been fully + // scanned, scan_ccm_it would point to positive infinity. + auto pos_inf_it = End(); + if (scan_ccm_it != pos_inf_it && + (*scan_ccm_it->first < *initial_end || + (init_end_inclusive && *scan_ccm_it->first == *initial_end))) + { + // The slice is too large. 
The scan has not fully scanned + // the slice, before reaching the cache's size limit. + // Pretends the slice's exclusive end to be the key after + // the last scanned tuple, from which the next scan batch + // resume. + local_end = scan_ccm_it->first; + slice_position = SlicePosition::Middle; + } + else + { + // The slice has been fully scanned. If the request + // specifies the end key, which falls into the slice, given + // that the slice has been fully scanned, no future scan + // batches are needed. So, we pretend that the scan has + // reached the last slice ending with positive infinity. + // The calling tx will terminate the scan. + if (initial_end == KeyT::PositiveInfinity() || + req_end_key == initial_end) { - req.SetRangeCcNgTerm(ng_term); + local_end = initial_end; + slice_position = SlicePosition::LastSlice; } else { - // The local scan tries to set the scan batch's end, but the - // scan at another core have set the batch's end. The scan - // results need to be adjusted, if the results include the - // keys greater than the batch's end, or the results miss - // some keys smaller than the batch's end. - auto [end_key, end_inclusive] = deduce_scan_end( - batch_end, req_end_key, req.EndInclusive()); - size_t trailing_cnt = 0; - - // Excludes keys from the scan cache greater than the - // batch's end. - if (req.IsLocal()) + // The local scan end must be the end of the slice. + local_end = initial_end; + const TemplateStoreRange *range = + static_cast *>( + req.SliceId().Range()); + const KeyT *range_end = range->RangeEndKey(); + if (range_end != nullptr && *initial_end == *range_end) { - while (scan_cache->Size() > 0) - { - // If req.is_require_keys_ is false, the KeyT object - // in scan cache is invalid, so, should use the cce, - // which is valid in any situation, to get the - // corresponding key. 
- auto last_cce = - reinterpret_cast *>( - scan_cache->Last()->cce_ptr_); - while (scan_ccm_it->second != last_cce) - { - --scan_ccm_it; - assert(scan_ccm_it != Begin()); - } - const KeyT *last_key = - static_cast(scan_ccm_it->first); - if (*end_key < *last_key || - (*end_key == *last_key && !end_inclusive)) - { - ++trailing_cnt; - // Remove cce from scan cache, but keep possible - // locks, because those locks might acquired by - // other ScanSliceCc/ReadCc from the - // transaction. - scan_cache->RemoveLast(); - } - else - { - // Reset iterator to the key after the last - // scanned tuple since we might need to continue - // scanning if trailing_cnt == 0. - ++scan_ccm_it; - break; - } - } + slice_position = SlicePosition::LastSliceInRange; } else { - while (remote_scan_cache->Size() > 0) - { - // Cc entry pointers here are always valid since - // the slices are still pinned so the cce cannot - // be kicked from memory regardless of the lock - // type. - auto last_remote_cce = - reinterpret_cast *>( - remote_scan_cache->LastCce()); - while (scan_ccm_it->second != last_remote_cce) - { - // As long as remote scan cache is not empty, - // iterator should not reach neg inf. - --scan_ccm_it; - assert(scan_ccm_it != Begin()); - } - const KeyT *last_key = - static_cast(scan_ccm_it->first); - if (*end_key < *last_key || - (*end_key == *last_key && !end_inclusive)) - { - trailing_cnt++; - // Remove cce from scan cache, but keep possible - // locks, because those locks might acquired by - // other ScanSliceCc/ReadCc from the - // transaction. - remote_scan_cache->RemoveLast(); - } - else - { - // Reset iterator to the key after the last - // scanned tuple since we might need to continue - // scanning if trailing_cnt == 0. - ++scan_ccm_it; - break; - } - } - } - - // If no key is removed from the scan cache, it's possible - // that the local scan may miss keys smaller than the - // batch's end. Re-scans the cc map using the batch's end. 
- if (trailing_cnt == 0) - { - auto [scan_ret, err] = scan_loop_func( - scan_ccm_it, *end_key, end_inclusive, true); - switch (scan_ret) - { - case ScanReturnType::Blocked: - return false; - case ScanReturnType::Error: - if (req.IsLocal()) - { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - req.WaitForSnapshotCnt(shard_->core_id_) > 0) - { - req.SetIsWaitForSnapshot(shard_->core_id_); - req.DeferSetError(err); - return false; - } - - if (req.SetError(err)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } - case ScanReturnType::Yield: - shard_->Enqueue(shard_->core_id_, &req); - return false; - default: - break; - } + slice_position = SlicePosition::Middle; } } } + slice_result.SetLastKey(local_end, slice_position); + req.SetRangeCcNgTerm(ng_term); + // Sets the iterator to the last cce, which may need to be pinned to // resume the next scan batch. 
if (CcEntry @@ -4635,7 +4309,7 @@ class TemplateCcMap : public CcMap } } } - else + else // Backward scan { const TemplateStoreSlice *last_slice = static_cast *>( @@ -4644,53 +4318,19 @@ class TemplateCcMap : public CcMap const KeyT *initial_end = nullptr; bool init_end_inclusive = false; - auto deduce_scan_end = - [](const KeyT *batch_end_key, - const KeyT *req_end_key, - bool req_inclusive) -> std::pair - { - const KeyT *end = nullptr; - bool inclusive = false; - - if (batch_end_key == req_end_key) - { - end = req_end_key; - inclusive = req_inclusive; - } - else - { - end = batch_end_key; - inclusive = true; - } - - return {end, inclusive}; - }; + const KeyT *slice_begin = last_slice->StartKey(); + assert(slice_begin != nullptr); - if (!end_finalized) + if (req_end_key != nullptr && + (*slice_begin < *req_end_key || *slice_begin == *req_end_key)) { - const KeyT *slice_begin = last_slice->StartKey(); - assert(slice_begin != nullptr); - - if (req_end_key != nullptr && (*slice_begin < *req_end_key || - *slice_begin == *req_end_key)) - { - initial_end = req_end_key; - init_end_inclusive = req.EndInclusive(); - } - else - { - initial_end = slice_begin; - init_end_inclusive = true; - } + initial_end = req_end_key; + init_end_inclusive = req.EndInclusive(); } else { - // This scan batch's end key has been finalized by one of the - // cores. Deduces the local scan's end and inclusiveness. 
- std::tie(initial_end, init_end_inclusive) = - deduce_scan_end(final_end_tx_key->GetKey(), - req_end_key, - req.EndInclusive()); + initial_end = slice_begin; + init_end_inclusive = true; } auto scan_batch_func = @@ -4717,12 +4357,11 @@ class TemplateCcMap : public CcMap return {scan_ret, err_code}; }; - auto scan_loop_func = [this, &scan_batch_func, &is_cache_full]( - Iterator &scan_ccm_it, - const KeyT &end_key, - bool inclusive, - bool end_finalized) - -> std::pair + auto scan_loop_func = + [this, &scan_batch_func, &is_cache_full]( + Iterator &scan_ccm_it, + const KeyT &end_key, + bool inclusive) -> std::pair { ScanReturnType scan_ret = ScanReturnType::Success; CcErrorCode err_code = CcErrorCode::NO_ERROR; @@ -4777,7 +4416,7 @@ class TemplateCcMap : public CcMap scan_ccm_it = Begin(); ccp = nullptr; } - else if (!end_finalized && is_cache_full()) + else if (is_cache_full()) { scan_ccm_it = Iterator(ccp->prev_page_, ccp->prev_page_->Size() - 1, @@ -4800,50 +4439,23 @@ class TemplateCcMap : public CcMap return {scan_ret, err_code}; }; - auto [scan_ret, err] = scan_loop_func( - scan_ccm_it, *initial_end, init_end_inclusive, end_finalized); + auto [scan_ret, err] = + scan_loop_func(scan_ccm_it, *initial_end, init_end_inclusive); switch (scan_ret) { case ScanReturnType::Blocked: return false; case ScanReturnType::Error: - if (req.IsLocal()) - { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - req.WaitForSnapshotCnt(shard_->core_id_) > 0) + if (is_read_snapshot && req.WaitForSnapshotCnt() > 0) { - req.SetIsWaitForSnapshot(shard_->core_id_); + req.SetIsWaitForSnapshot(); req.DeferSetError(err); return false; } - if (req.SetError(err)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - 
return false; - } - } - else - { - return false; - } + req.UnpinSlices(); + req.SetError(err); + return true; case ScanReturnType::Yield: shard_->Enqueue(shard_->core_id_, &req); return false; @@ -4851,234 +4463,61 @@ class TemplateCcMap : public CcMap break; } - // If the end of this scan batch is not finalized when the local - // scan at this core started, tries to set the batch's end using the - // local end. If another core has finalized the batch's end, the - // scan at this core may need to be adjusted: if the batch's final - // end is less than the end at this core, keys before the final end - // needs to be removed from the local scan cache; if the batch's - // final end is smaller than the end of this core, keys greater than - // the batch's final end but less than the local end need to be - // included in the local scan cache. - - if (!end_finalized) - { - const KeyT *local_end = nullptr; - SlicePosition slice_position; - - // scan_ccm_it points to the entry before the last scanned - // tuple. - auto neg_inf_it = Begin(); - if (scan_ccm_it != neg_inf_it && - (*initial_end < *scan_ccm_it->first || - (init_end_inclusive && - *scan_ccm_it->first == *initial_end))) - { - // The slice is too large. The scan has not fully scanned - // the slice, before reaching the cache's size limit. - // Pretends the slice's inclusive start to be the last - // scanned key, from which the next scan batch resumes. - ++scan_ccm_it; - local_end = scan_ccm_it->first; - slice_position = SlicePosition::Middle; - } - else - { - // The slice has been fully scanned. If the request - // specifies the end key, which falls into the slice, given - // that the slice has been fully scanned, no future scan - // batches are needed. So, we pretend that the scan has - // reached the first slice (starting with negative - // infinity). The calling tx will terminate the scan. 
- if (initial_end == KeyT::NegativeInfinity() || - req_end_key == initial_end) - { - local_end = initial_end; - slice_position = SlicePosition::FirstSlice; - } - else - { - // The local scan end must be the start of the slice. - local_end = initial_end; - - const TemplateStoreRange *range = - static_cast *>( - req.SliceId().Range()); - const KeyT *range_start = range->RangeStartKey(); - if (range_start != nullptr && - *initial_end == *range_start) - { - slice_position = SlicePosition::FirstSliceInRange; - } - else - { - slice_position = SlicePosition::Middle; - } - } - } - - auto [batch_end, set_success] = - slice_result.UpdateLastKey(local_end, slice_position); + const KeyT *local_end = nullptr; + SlicePosition slice_position; - if (set_success) + // scan_ccm_it points to the entry before the last scanned + // tuple. + auto neg_inf_it = Begin(); + if (scan_ccm_it != neg_inf_it && + (*initial_end < *scan_ccm_it->first || + (init_end_inclusive && *scan_ccm_it->first == *initial_end))) + { + // The slice is too large. The scan has not fully scanned + // the slice, before reaching the cache's size limit. + // Pretends the slice's inclusive start to be the last + // scanned key, from which the next scan batch resumes. + ++scan_ccm_it; + local_end = scan_ccm_it->first; + slice_position = SlicePosition::Middle; + } + else + { + // The slice has been fully scanned. If the request + // specifies the end key, which falls into the slice, given + // that the slice has been fully scanned, no future scan + // batches are needed. So, we pretend that the scan has + // reached the first slice (starting with negative + // infinity). The calling tx will terminate the scan. + if (initial_end == KeyT::NegativeInfinity() || + req_end_key == initial_end) { - req.SetRangeCcNgTerm(ng_term); + local_end = initial_end; + slice_position = SlicePosition::FirstSlice; } else { - // The local scan tries to set the scan batch's end, but the - // scan at another core have set the batch's end. 
The scan - // results need to be adjusted, if the results include the - // keys smaller than the batch's end, or the results miss - // some keys greater than the batch's end. - auto [end_key, end_inclusive] = deduce_scan_end( - batch_end, req_end_key, req.EndInclusive()); - size_t trailing_cnt = 0; - - // Excludes keys from the scan cache smaller than the - // batch's end. - if (req.IsLocal()) + // The local scan end must be the start of the slice. + local_end = initial_end; + + const TemplateStoreRange *range = + static_cast *>( + req.SliceId().Range()); + const KeyT *range_start = range->RangeStartKey(); + if (range_start != nullptr && *initial_end == *range_start) { - while (scan_cache->Size() > 0) - { - // If req.is_require_keys_ is false, the KeyT object - // in scan cache is invalid, so, should use the cce, - // which is valid in any situation, to get the - // corresponding key. - CcEntry *last_cce = - reinterpret_cast *>( - scan_cache->Last()->cce_ptr_); - while (scan_ccm_it->second != last_cce) - { - ++scan_ccm_it; - assert(scan_ccm_it != End()); - } - const KeyT *last_key = - static_cast(scan_ccm_it->first); - if (*last_key < *end_key || - (*last_key == *end_key && !end_inclusive)) - { - ++trailing_cnt; - scan_cache->RemoveLast(); - } - else - { - // Reset iterator to the key after the last - // scanned tuple since we might need to continue - // scanning if trailing_cnt == 0. - --scan_ccm_it; - break; - } - } + slice_position = SlicePosition::FirstSliceInRange; } else { - while (remote_scan_cache->Size() > 0) - { - // Cc entry pointers here are always valid since - // the slices are still pinned so the cce cannot - // be kicked from memory regardless of the lock - // type. - CcEntry *last_remote_cce = - reinterpret_cast *>( - remote_scan_cache->LastCce()); - while (scan_ccm_it->second != last_remote_cce) - { - // As long as remote scan cache is not empty, - // iterator should not reach pos inf. 
- ++scan_ccm_it; - assert(scan_ccm_it != End()); - } - const KeyT *last_key = - static_cast(scan_ccm_it->first); - if (*last_key < *end_key || - (*last_key == *end_key && !end_inclusive)) - { - trailing_cnt++; - remote_scan_cache->RemoveLast(); - } - else - { - // Reset iterator to the key after the last - // scanned tuple since we might need to continue - // scanning if trailing_cnt == 0. - --scan_ccm_it; - break; - } - } - } - - // If no key is removed from the scan cache, it's possible - // that the local scan may miss keys greater than the - // batch's end. Re-scans the cc map using the batch's end. - if (trailing_cnt == 0) - { - auto [scan_ret, err] = scan_loop_func( - scan_ccm_it, *end_key, end_inclusive, true); - switch (scan_ret) - { - case ScanReturnType::Blocked: - return false; - case ScanReturnType::Error: - if (req.IsLocal()) - { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - req.WaitForSnapshotCnt(shard_->core_id_) > 0) - { - req.SetIsWaitForSnapshot(shard_->core_id_); - req.DeferSetError(err); - return false; - } - - if (req.SetError(err)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } - case ScanReturnType::Yield: - shard_->Enqueue(shard_->core_id_, &req); - return false; - default: - break; - } + slice_position = SlicePosition::Middle; } } } + slice_result.SetLastKey(local_end, slice_position); + req.SetRangeCcNgTerm(ng_term); + // Sets the iterator to the last cce, which may need to be pinned to // resume the next scan batch. 
if (CcEntry @@ -5129,47 +4568,15 @@ class TemplateCcMap : public CcMap } } - if (req.IsLocal()) - { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && req.WaitForSnapshotCnt(shard_->core_id_) > 0) + if (is_read_snapshot && req.WaitForSnapshotCnt() > 0) { - req.SetIsWaitForSnapshot(shard_->core_id_); + req.SetIsWaitForSnapshot(); return false; } - if (req.SetFinish()) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - // Renqueue the cc req to the sender req list. - // We assign a dedicated core to be the response sender instead - // of directly sending the response on the last finished core. - // This is to avoid serialization of response message causing - // one core to become significantly slower than others and would - // end up being the sender of all scan slice response. - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } + req.UnpinSlices(); + req.SetFinish(); + return true; } /** @@ -12216,7 +11623,7 @@ void BackfillSnapshotForScanSlice(FetchSnapshotCc *fetch_cc, { TemplateScanCache *scan_cache = static_cast *>( - req->GetLocalScanCache(core_id)); + req->GetLocalScanCache()); assert(scan_cache != nullptr); auto *scan_tuple = const_cast *>( scan_cache->At(tuple_idx)); @@ -12235,8 +11642,7 @@ void BackfillSnapshotForScanSlice(FetchSnapshotCc *fetch_cc, } else { - RemoteScanSliceCache *remote_scan_cache = - req->GetRemoteScanCache(core_id); + RemoteScanSliceCache *remote_scan_cache = req->GetRemoteScanCache(); assert(remote_scan_cache != nullptr); assert(remote_scan_cache->archive_records_.size() >= tuple_idx); auto &tmp_pair = remote_scan_cache->archive_positions_[tuple_idx]; @@ -12252,9 +11658,8 @@ void BackfillSnapshotForScanSlice(FetchSnapshotCc *fetch_cc, } // trigger 
request - req->DecreaseWaitForSnapshotCnt(core_id); - if (req->IsWaitForSnapshot(core_id) && - req->WaitForSnapshotCnt(core_id) == 0) + req->DecreaseWaitForSnapshotCnt(); + if (req->IsWaitForSnapshot() && req->WaitForSnapshotCnt() == 0) { shard.Enqueue(core_id, req); } diff --git a/tx_service/include/proto/cc_request.proto b/tx_service/include/proto/cc_request.proto index 71909157..d9d722ec 100644 --- a/tx_service/include/proto/cc_request.proto +++ b/tx_service/include/proto/cc_request.proto @@ -1094,7 +1094,7 @@ message ScanSliceRequest { bool end_inclusive = 11; bool is_forward = 12; uint64 ts = 13; - repeated uint64 prior_cce_lock_vec = 14; + uint64 prior_cce_lock = 14; IsolationType iso_level = 15; CcProtocolType protocol = 16; bool is_for_write = 17; @@ -1111,6 +1111,7 @@ message ScanSliceResponse { int64 tx_term = 3; uint32 command_id=4; int32 error_code = 5; + uint32 core_id = 16; bytes tuple_cnt = 6; bytes last_key = 7; SlicePosition slice_position = 8; @@ -1121,9 +1122,6 @@ message ScanSliceResponse { bytes gap_ts = 13; bytes cce_lock_ptr = 14; bytes term = 15; - bytes key_start_offsets = 16; - bytes record_start_offsets = 17; - bytes trailing_cnts = 18; uint64 txm_addr = 19; } diff --git a/tx_service/include/remote/remote_cc_request.h b/tx_service/include/remote/remote_cc_request.h index b59af76b..1c0c2604 100644 --- a/tx_service/include/remote/remote_cc_request.h +++ b/tx_service/include/remote/remote_cc_request.h @@ -763,7 +763,7 @@ struct RemoteScanSlice : public ScanSliceCc { public: RemoteScanSlice(); - void Reset(std::unique_ptr input_msg, uint16_t core_cnt); + void Reset(std::unique_ptr input_msg); private: ScanSliceResponse output_msg_; @@ -773,7 +773,7 @@ struct RemoteScanSlice : public ScanSliceCc TableName remote_tbl_name_{ empty_sv, TableType::Primary, txservice::TableEngine::None}; CcHandlerResult cc_res_{nullptr}; - std::vector scan_cache_vec_; + RemoteScanSliceCache scan_cache_; }; struct RemoteReloadCacheCc : public ReloadCacheCc diff 
--git a/tx_service/include/tx_operation_result.h b/tx_service/include/tx_operation_result.h index d31492dc..b03417fa 100644 --- a/tx_service/include/tx_operation_result.h +++ b/tx_service/include/tx_operation_result.h @@ -447,11 +447,8 @@ struct RemoteScanSliceCache static constexpr size_t MetaDataSize = 8; static constexpr size_t DefaultCacheMaxBytes = 10 * 1024 * 1024; - RemoteScanSliceCache(uint16_t shard_cnt) - : cache_mem_size_(0), - mem_max_bytes_(DefaultCacheMaxBytes), - shard_cnt_(shard_cnt), - trailing_cnt_(0) + RemoteScanSliceCache() + : cache_mem_size_(0), mem_max_bytes_(DefaultCacheMaxBytes) { } @@ -465,7 +462,7 @@ struct RemoteScanSliceCache mem_max_bytes_ = max_bytes; } - void Reset(uint16_t shard_cnt) + void Reset() { key_ts_.clear(); gap_ts_.clear(); @@ -476,26 +473,19 @@ struct RemoteScanSliceCache keys_.clear(); records_.clear(); cache_mem_size_ = 0; - trailing_cnt_ = 0; mem_max_bytes_ = DefaultCacheMaxBytes; - shard_cnt_ = shard_cnt; archive_positions_.clear(); archive_records_.clear(); } - void RemoveLast() - { - trailing_cnt_++; - } - uint64_t LastCce() { - return cce_ptr_.at(cce_ptr_.size() - 1 - trailing_cnt_); + return cce_ptr_.at(cce_ptr_.size() - 1); } size_t Size() const { - return cce_ptr_.size() - trailing_cnt_; + return cce_ptr_.size(); } void SetLastCceLock(uint64_t lock_ptr) @@ -514,8 +504,6 @@ struct RemoteScanSliceCache std::string records_; uint32_t cache_mem_size_; uint32_t mem_max_bytes_; - uint16_t shard_cnt_; - size_t trailing_cnt_; // The first element of archive_positions_ is the index of key_ts_ to // backfill and the second element is the position in records_ to be @@ -531,8 +519,7 @@ struct RangeScanSliceResult slice_position_(SlicePosition::FirstSlice), cc_ng_id_(0), ccm_scanner_(nullptr), - is_local_(true), - last_key_status_(LastKeySetStatus::Unset) + is_local_(true) { } @@ -541,8 +528,7 @@ struct RangeScanSliceResult slice_position_(status), cc_ng_id_(0), ccm_scanner_(nullptr), - is_local_(true), - 
last_key_status_(LastKeySetStatus::Setup) + is_local_(true) { } @@ -550,8 +536,7 @@ struct RangeScanSliceResult : last_key_(std::move(rhs.last_key_)), slice_position_(rhs.slice_position_), cc_ng_id_(rhs.cc_ng_id_), - is_local_(rhs.is_local_), - last_key_status_(rhs.last_key_status_.load(std::memory_order_acquire)) + is_local_(rhs.is_local_) { if (rhs.is_local_) { @@ -576,9 +561,6 @@ struct RangeScanSliceResult slice_position_ = rhs.slice_position_; is_local_ = rhs.is_local_; cc_ng_id_ = rhs.cc_ng_id_; - last_key_status_.store( - rhs.last_key_status_.load(std::memory_order_acquire), - std::memory_order_release); if (rhs.is_local_) { @@ -594,85 +576,47 @@ struct RangeScanSliceResult void Reset() { - last_key_status_.store(LastKeySetStatus::Unset, - std::memory_order_release); last_key_ = TxKey(); } const TxKey *SetLastKey(TxKey key) { - assert(last_key_status_.load(std::memory_order_acquire) == - LastKeySetStatus::Unset); last_key_ = std::move(key); - last_key_status_.store(LastKeySetStatus::Setup, - std::memory_order_release); - return &last_key_; } template - std::pair UpdateLastKey(const KeyT *key, - SlicePosition slice_pos) + void SetLastKey(const KeyT *key, SlicePosition slice_pos) { - bool success = false; + slice_position_ = slice_pos; - LastKeySetStatus actual = LastKeySetStatus::Unset; - if (last_key_status_.compare_exchange_strong( - actual, LastKeySetStatus::Setting, std::memory_order_acq_rel)) + // If the slice position is the last or the first, this is the last + // scan batch, which must end with positive/negative infinity or the + // request's end key. In both cases, the input key is a valid + // reference throughout the lifetime of RangeScanSliceResult. So, + // the tx key does not own a new copy of the input key. 
+ if (slice_pos == SlicePosition::FirstSlice || + slice_pos == SlicePosition::LastSlice) { - slice_position_ = slice_pos; - - // If the slice position is the last or the first, this is the last - // scan batch, which must end with positive/negative infinity or the - // request's end key. In both cases, the input key is a valid - // reference throughout the lifetime of RangeScanSliceResult. So, - // the tx key does not own a new copy of the input key. - if (slice_pos == SlicePosition::FirstSlice || - slice_pos == SlicePosition::LastSlice) - { - last_key_ = TxKey(key); - } - else - { - last_key_ = key->CloneTxKey(); - } - - last_key_status_.store(LastKeySetStatus::Setup, - std::memory_order_release); - success = true; + last_key_ = TxKey(key); } else { - if (actual != LastKeySetStatus::Setup) - { - while (last_key_status_.load(std::memory_order_acquire) != - LastKeySetStatus::Setup) - { - // Busy poll. - } - } + last_key_ = key->CloneTxKey(); } - - return {last_key_.GetKey(), success}; } - std::pair PeekLastKey() const + const TxKey *LastKey() const { - if (last_key_status_.load(std::memory_order_acquire) == - LastKeySetStatus::Setup) - { - return {&last_key_, true}; - } - else + if (last_key_.KeyPtr() != nullptr) { - return {nullptr, false}; + return &last_key_; } + return nullptr; } TxKey MoveLastKey() { - last_key_status_.store(LastKeySetStatus::Unset, - std::memory_order_release); return std::move(last_key_); } @@ -691,23 +635,9 @@ struct RangeScanSliceResult union { CcScanner *ccm_scanner_; - std::vector *remote_scan_caches_; + RemoteScanSliceCache *remote_scan_caches_; }; bool is_local_{true}; - - /** - * For scene like: (1-write, n-read), atomic variable has obvious - * performance advantage over mutex/shared_mutex. For readers, mutex needs - * to modify a flag, and shared_mutex needs to modify a counter. However, - * atomic variable merely load a variable. 
- */ - enum struct LastKeySetStatus : uint8_t - { - Unset, - Setting, - Setup, - }; - std::atomic last_key_status_; }; struct BucketScanProgress diff --git a/tx_service/src/cc/local_cc_handler.cpp b/tx_service/src/cc/local_cc_handler.cpp index 8f761609..b69d6b9b 100644 --- a/tx_service/src/cc/local_cc_handler.cpp +++ b/tx_service/src/cc/local_cc_handler.cpp @@ -1289,34 +1289,22 @@ void txservice::LocalCcHandler::ScanNextBatch( scanner.is_require_recs_, prefetch_size); - uint32_t core_cnt = cc_shards_.Count(); - req->SetShardCount(core_cnt); - // When the cc ng term is less than 0, this is the first scan of the // specified range. - if (cc_ng_term < 0) - { - scanner.ResetShards(core_cnt); - } - - for (uint32_t core_id = 0; core_id < core_cnt; ++core_id) + if (cc_ng_term >= 0) { - ScanCache *cache = scanner.Cache(core_id); + ScanCache *cache = scanner.Cache(0); const ScanTuple *last_tuple = cache->LastTuple(); req->SetPriorCceLockAddr( - last_tuple != nullptr ? last_tuple->cce_addr_.CceLockPtr() : 0, - core_id); + last_tuple != nullptr ? last_tuple->cce_addr_.CceLockPtr() : 0); } scanner.ResetCaches(); - uint32_t core_rand = butil::fast_rand(); + uint16_t dest_core = (range_id & 0x3FF) % cc_shards_.Count(); - // The scan slice request is dispatched to the first core. The first - // core tries to pin the slice in memory and if succeeds, further - // dispatches the request to remaining cores for parallel scans. 
- cc_shards_.EnqueueCcRequest(thd_id_, core_rand % core_cnt, req); + cc_shards_.EnqueueCcRequest(thd_id_, dest_core, req); } else { diff --git a/tx_service/src/remote/cc_stream_receiver.cpp b/tx_service/src/remote/cc_stream_receiver.cpp index 3a0166e7..e015881e 100644 --- a/tx_service/src/remote/cc_stream_receiver.cpp +++ b/tx_service/src/remote/cc_stream_receiver.cpp @@ -377,44 +377,14 @@ void CcStreamReceiver::PreProcessScanResp( ToLocalType::ConvertSlicePosition(msg->slice_position()); const char *tuple_cnt_info = msg->tuple_cnt().data(); - uint16_t remote_core_cnt = *((const uint16_t *) tuple_cnt_info); - tuple_cnt_info += sizeof(uint16_t); - range_scanner.ResetShards(remote_core_cnt); + size_t tuple_cnt = *((const size_t *) tuple_cnt_info); - const uint64_t *term_ptr = (const uint64_t *) msg->term().data(); - - // The offset_table stores the start postition of meta data like `key_ts` - // for all remote cores - std::vector offset_table; - size_t meta_offset = 0; - - range_scanner.SetPartitionNgTerm(-1); - - bool all_remote_core_no_more_data = true; - - for (uint16_t core_id = 0; core_id < remote_core_cnt; ++core_id) - { - size_t tuple_cnt = *((const size_t *) tuple_cnt_info); - tuple_cnt_info += sizeof(size_t); - - all_remote_core_no_more_data = - all_remote_core_no_more_data && (tuple_cnt == 0); - - // All term value are same. We only set `partition_ng_term` once. - if (range_scanner.PartitionNgTerm() == -1 && tuple_cnt != 0) - { - range_scanner.SetPartitionNgTerm(term_ptr[0]); - } + bool remote_no_more_data = tuple_cnt == 0; - offset_table.push_back(meta_offset); - meta_offset += tuple_cnt; - term_ptr += tuple_cnt; - } - - assert(offset_table.size() == remote_core_cnt); + const uint64_t *term_ptr = (const uint64_t *) msg->term().data(); // No more data. 
- if (all_remote_core_no_more_data) + if (remote_no_more_data) { if (msg->error_code() != 0) { @@ -430,21 +400,18 @@ void CcStreamReceiver::PreProcessScanResp( RecycleScanSliceResp(std::move(msg)); return; } - - // Worker count means how many tx processer to parallel deserialize msg. - // remote core count is not always equal to local core count - size_t worker_cnt = std::min((size_t) remote_core_cnt, - Sharder::Instance().GetLocalCcShardsCount()); + else + { + range_scanner.SetPartitionNgTerm(term_ptr[0]); + } ProcessRemoteScanRespCc *request = process_remote_scan_resp_pool_.NextRequest(); - request->Reset( - this, std::move(msg), std::move(offset_table), hd_res, worker_cnt); + request->Reset(this, std::move(msg), hd_res); - for (size_t idx = 0; idx < worker_cnt; ++idx) - { - local_shards_.EnqueueCcRequest(idx, request); - } + uint32_t core_rand = butil::fast_rand(); + uint16_t dest_core = core_rand % local_shards_.Count(); + local_shards_.EnqueueCcRequest(dest_core, request); } void CcStreamReceiver::OnReceiveCcMsg(std::unique_ptr msg) @@ -1283,9 +1250,8 @@ void CcStreamReceiver::OnReceiveCcMsg(std::unique_ptr msg) case CcMessage::MessageType::CcMessage_MessageType_ScanSliceRequest: { RemoteScanSlice *scan_slice_req = scan_slice_pool.NextRequest(); - uint32_t local_core_cnt = (uint32_t) local_shards_.Count(); TX_TRACE_ASSOCIATE(msg.get(), scan_slice_req); - scan_slice_req->Reset(std::move(msg), local_core_cnt); + scan_slice_req->Reset(std::move(msg)); // The scan slice request is enqueued into the first core, where it pins // the slice and sets the scan's end key. The request is then dispatched // to remaining cores to scan the slice in parallel. 
diff --git a/tx_service/src/remote/remote_cc_handler.cpp b/tx_service/src/remote/remote_cc_handler.cpp index eb9952bf..7b863703 100644 --- a/tx_service/src/remote/remote_cc_handler.cpp +++ b/tx_service/src/remote/remote_cc_handler.cpp @@ -724,20 +724,15 @@ void txservice::remote::RemoteCcHandler::ScanNext( CcScanner &scanner = *hd_res.Value().ccm_scanner_; - scan_slice->clear_prior_cce_lock_vec(); + scan_slice->clear_prior_cce_lock(); // When the cc ng term is greater than 0, this scan resumes the last scan in // the range. Sets the cc entry addresses where last scan stops. if (cc_ng_term > 0) { - uint32_t remote_core_cnt = scanner.ShardCount(); - - for (uint32_t core_id = 0; core_id < remote_core_cnt; ++core_id) - { - ScanCache *cache = scanner.Cache(core_id); - const ScanTuple *last_tuple = cache->LastTuple(); - scan_slice->add_prior_cce_lock_vec( - last_tuple != nullptr ? last_tuple->cce_addr_.CceLockPtr() : 0); - } + ScanCache *cache = scanner.Cache(0); + const ScanTuple *last_tuple = cache->LastTuple(); + scan_slice->set_prior_cce_lock( + last_tuple != nullptr ? 
last_tuple->cce_addr_.CceLockPtr() : 0); scanner.ResetCaches(); } diff --git a/tx_service/src/remote/remote_cc_request.cpp b/tx_service/src/remote/remote_cc_request.cpp index 17ebab49..7b24630b 100644 --- a/tx_service/src/remote/remote_cc_request.cpp +++ b/tx_service/src/remote/remote_cc_request.cpp @@ -1319,7 +1319,6 @@ bool txservice::remote::RemoteScanNextBatch::EndKeyInclusive() txservice::remote::RemoteScanSlice::RemoteScanSlice() { - parallel_req_ = true; res_ = &cc_res_; cc_res_.Value().is_local_ = false; @@ -1361,8 +1360,8 @@ txservice::remote::RemoteScanSlice::RemoteScanSlice() const RangeScanSliceResult &slice_result = cc_res_.Value(); output_msg_.clear_last_key(); - auto [last_key, key_set] = slice_result.PeekLastKey(); - assert(key_set || cc_res_.IsError()); + const TxKey *last_key = slice_result.LastKey(); + assert(last_key != nullptr || cc_res_.IsError()); // Only sends back the last key if this scan batch is not the last. The // next scan batch will use this last key as the beginning of the next // batch. @@ -1378,95 +1377,69 @@ txservice::remote::RemoteScanSlice::RemoteScanSlice() output_msg_.set_slice_position( ToRemoteType::ConvertSlicePosition(slice_result.slice_position_)); - uint16_t core_cnt = GetShardCount(); - // Add core cnt first - output_msg_.mutable_tuple_cnt()->append((const char *) &core_cnt, - sizeof(uint16_t)); - // Add tuple count for each core - for (size_t idx = 0; idx < core_cnt; ++idx) - { - size_t tuple_cnt; - if (send_cache) - { - tuple_cnt = scan_cache_vec_[idx].rec_status_.size(); - } - else - { - tuple_cnt = 0; - } - output_msg_.mutable_tuple_cnt()->append((const char *) &tuple_cnt, - sizeof(size_t)); - } + // Add tuple count + size_t tuple_cnt = + send_cache ? slice_result.remote_scan_caches_->Size() : 0; + output_msg_.mutable_tuple_cnt()->append((const char *) &tuple_cnt, + sizeof(size_t)); if (send_cache) { - // Merge scan cache info into a single byte array to reduce - // deserialization time on the receiver side. 
- for (size_t idx = 0; idx < core_cnt; ++idx) - { - RemoteScanSliceCache &cache = scan_cache_vec_[idx]; - - size_t keys_start_offset = output_msg_.keys().size(); - output_msg_.mutable_key_start_offsets()->append( - (const char *) &keys_start_offset, sizeof(size_t)); - size_t record_start_offset = output_msg_.records().size(); - output_msg_.mutable_record_start_offsets()->append( - (const char *) &record_start_offset, sizeof(size_t)); + output_msg_.mutable_keys()->append(scan_cache_.keys_); - output_msg_.mutable_keys()->append(cache.keys_); - - if (cache.archive_positions_.size() > 0) + if (scan_cache_.archive_positions_.size() > 0) + { + // Merge the backfilled archive records. + size_t rec_offset = 0; + for (size_t j = 0; j < scan_cache_.archive_positions_.size(); + j++) { - // Merge the backfilled archive records. - size_t rec_offset = 0; - for (size_t j = 0; j < cache.archive_positions_.size(); j++) - { - output_msg_.mutable_records()->append( - cache.records_.data() + rec_offset, - cache.records_.data() + - cache.archive_positions_[j].second); - rec_offset = cache.archive_positions_[j].second; - assert(cache.archive_records_[j].size() > 0); - output_msg_.mutable_records()->append( - cache.archive_records_[j]); - } output_msg_.mutable_records()->append( - cache.records_.data() + rec_offset, - cache.records_.data() + cache.records_.size()); - } - else - { - output_msg_.mutable_records()->append(cache.records_); + scan_cache_.records_.data() + rec_offset, + scan_cache_.records_.data() + + scan_cache_.archive_positions_[j].second); + rec_offset = scan_cache_.archive_positions_[j].second; + assert(scan_cache_.archive_records_[j].size() > 0); + output_msg_.mutable_records()->append( + scan_cache_.archive_records_[j]); } - - output_msg_.mutable_key_ts()->append( - (const char *) cache.key_ts_.data(), - cache.key_ts_.size() * sizeof(uint64_t)); - output_msg_.mutable_gap_ts()->append( - (const char *) cache.gap_ts_.data(), - cache.gap_ts_.size() * sizeof(uint64_t)); - 
output_msg_.mutable_term()->append( - (const char *) cache.term_.data(), - cache.term_.size() * sizeof(uint64_t)); - output_msg_.mutable_cce_lock_ptr()->append( - (const char *) cache.cce_lock_ptr_.data(), - cache.cce_lock_ptr_.size() * sizeof(uint64_t)); - output_msg_.mutable_rec_status()->append( - (const char *) cache.rec_status_.data(), - cache.rec_status_.size() * sizeof(RecordStatusType)); - - output_msg_.mutable_trailing_cnts()->append( - (const char *) &cache.trailing_cnt_, sizeof(size_t)); + output_msg_.mutable_records()->append( + scan_cache_.records_.data() + rec_offset, + scan_cache_.records_.data() + scan_cache_.records_.size()); } + else + { + output_msg_.mutable_records()->append(scan_cache_.records_); + } + + output_msg_.mutable_key_ts()->append( + (const char *) scan_cache_.key_ts_.data(), + scan_cache_.key_ts_.size() * sizeof(uint64_t)); + output_msg_.mutable_gap_ts()->append( + (const char *) scan_cache_.gap_ts_.data(), + scan_cache_.gap_ts_.size() * sizeof(uint64_t)); + output_msg_.mutable_term()->append( + (const char *) scan_cache_.term_.data(), + scan_cache_.term_.size() * sizeof(uint64_t)); + output_msg_.mutable_cce_lock_ptr()->append( + (const char *) scan_cache_.cce_lock_ptr_.data(), + scan_cache_.cce_lock_ptr_.size() * sizeof(uint64_t)); + output_msg_.mutable_rec_status()->append( + (const char *) scan_cache_.rec_status_.data(), + scan_cache_.rec_status_.size() * sizeof(RecordStatusType)); } const ScanSliceRequest &req = input_msg_->scan_slice_req(); + uint32_t range_id = req.range_id(); + uint32_t core_id = + (range_id & 0x3FF) % Sharder::Instance().GetLocalCcShardsCount(); + output_msg_.set_core_id(core_id); hd_->SendScanRespToNode(req.src_node_id(), output_msg_, false); hd_->RecycleCcMsg(std::move(input_msg_)); }; } void txservice::remote::RemoteScanSlice::Reset( - std::unique_ptr input_msg, uint16_t core_cnt) + std::unique_ptr input_msg) { assert(input_msg->has_scan_slice_req()); @@ -1510,30 +1483,13 @@ void 
txservice::remote::RemoteScanSlice::Reset( output_msg_.set_tx_term(input_msg->tx_term()); output_msg_.set_command_id(input_msg->command_id()); - SetShardCount(core_cnt); - - size_t vec_size = scan_slice_req.prior_cce_lock_vec_size(); - for (size_t core_id = 0; core_id < core_cnt; ++core_id) - { - uint64_t cce_lock_addr = - core_id < vec_size ? scan_slice_req.prior_cce_lock_vec(core_id) : 0; - SetPriorCceLockAddr(cce_lock_addr, core_id); - } + uint64_t cce_lock_addr = scan_slice_req.prior_cce_lock(); + SetPriorCceLockAddr(cce_lock_addr); RangeScanSliceResult &slice_result = cc_res_.Value(); - for (uint16_t core_id = 0; core_id < core_cnt; ++core_id) - { - if (core_id == scan_cache_vec_.size()) - { - scan_cache_vec_.emplace_back(core_cnt); - } - else - { - scan_cache_vec_[core_id].Reset(core_cnt); - } - } - slice_result.remote_scan_caches_ = &scan_cache_vec_; + scan_cache_.Reset(); + slice_result.remote_scan_caches_ = &scan_cache_; input_msg_ = std::move(input_msg); From 67b57723efbc17d8e8baf268e89199acfc543c30 Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:38:59 +0800 Subject: [PATCH 08/15] Adapt DataSync with new key sharding (#445) 1. Update the structure definitions and related processing procedures of DataSyncScanCc and ScanSliceDeltaSizeCc, as well as the DataSync processing procedure, to adapt to the new key sharding logic. 2. Update key shard code for UpdateCkptTs request. 
--- tx_service/include/cc/cc_request.h | 247 +++++++---------- tx_service/include/cc/template_cc_map.h | 91 +++---- tx_service/src/cc/local_cc_shards.cpp | 344 +++++++----------------- 3 files changed, 219 insertions(+), 463 deletions(-) diff --git a/tx_service/include/cc/cc_request.h b/tx_service/include/cc/cc_request.h index b8b57c97..ed83229f 100644 --- a/tx_service/include/cc/cc_request.h +++ b/tx_service/include/cc/cc_request.h @@ -3872,7 +3872,6 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase uint64_t data_sync_ts, uint64_t node_group_id, int64_t node_group_term, - uint16_t core_cnt, size_t scan_batch_size, uint64_t txn, const TxKey *target_start_key, @@ -3887,14 +3886,13 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase table_name_(&table_name), node_group_id_(node_group_id), node_group_term_(node_group_term), - core_cnt_(core_cnt), last_data_sync_ts_(last_data_sync_ts), data_sync_ts_(data_sync_ts), start_key_(target_start_key), end_key_(target_end_key), scan_batch_size_(scan_batch_size), err_(CcErrorCode::NO_ERROR), - unfinished_cnt_(core_cnt_), + finished_(false), mux_(), cv_(), export_base_table_item_(export_base_table_item), @@ -3917,24 +3915,19 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase false); }); } - for (size_t i = 0; i < core_cnt; i++) + data_sync_vec_.resize(scan_batch_size); + if (!export_base_table_item_only_) { - data_sync_vec_.emplace_back(); - data_sync_vec_.back().resize(scan_batch_size); - if (!export_base_table_item_only_) - { - archive_vec_.emplace_back(); - archive_vec_.back().reserve(scan_batch_size); - mv_base_idx_vec_.emplace_back(); - mv_base_idx_vec_.back().reserve(scan_batch_size); - } - - pause_pos_.emplace_back(TxKey(), false); - curr_slice_index_.emplace_back(0); - accumulated_scan_cnt_.emplace_back(0); - accumulated_flush_data_size_.emplace_back(0); - scan_heap_is_full_.emplace_back(0); + archive_vec_.reserve(scan_batch_size); + mv_base_idx_vec_.reserve(scan_batch_size); } + + 
pause_pos_.first = std::move(TxKey()); + pause_pos_.second = false; + curr_slice_index_ = 0; + accumulated_scan_cnt_ = 0; + accumulated_flush_data_size_ = 0; + scan_heap_is_full_ = 0; } bool ValidTermCheck() @@ -3968,7 +3961,6 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); return false; } - scan_count_++; CcMap *ccm = ccs.GetCcm(*table_name_, node_group_id_); if (ccm == nullptr) { @@ -4004,49 +3996,44 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return false; } - bool IsDrained(size_t core_idx) const + bool IsDrained() const { - return pause_pos_[core_idx].second; + return pause_pos_.second; } - std::pair &PausePos(size_t core_idx) + std::pair &PausePos() { - return pause_pos_[core_idx]; + return pause_pos_; } void Wait() { std::unique_lock lk(mux_); - cv_.wait(lk, [this] { return unfinished_cnt_ == 0; }); + cv_.wait(lk, [this] { return finished_; }); } void Reset(OpType op_type = OpType::Normal) { std::lock_guard lk(mux_); - unfinished_cnt_ = 1; - for (size_t i = 0; i < core_cnt_; i++) + finished_ = false; + if (!export_base_table_item_only_) { - if (!export_base_table_item_only_) - { - archive_vec_.at(i).clear(); - archive_vec_.at(i).reserve(scan_batch_size_); - mv_base_idx_vec_.at(i).clear(); - mv_base_idx_vec_.at(i).reserve(scan_batch_size_); - } + archive_vec_.clear(); + mv_base_idx_vec_.clear(); + } - accumulated_scan_cnt_.at(i) = 0; - accumulated_flush_data_size_.at(i) = 0; - if (scan_heap_is_full_[i] == 1) - { - // vec has been cleared during ReleaseDataSyncScanHeapCc, - // resize to prepared size - data_sync_vec_[i].resize(scan_batch_size_); - scan_heap_is_full_[i] = 0; - } - if (export_base_table_item_) - { - curr_slice_index_[i] = 0; - } + accumulated_scan_cnt_ = 0; + accumulated_flush_data_size_ = 0; + if (scan_heap_is_full_ == 1) + { + // vec has been cleared during ReleaseDataSyncScanHeapCc, + // resize to prepared size + data_sync_vec_.resize(scan_batch_size_); + 
scan_heap_is_full_ = 0; + } + if (export_base_table_item_) + { + curr_slice_index_ = 0; } err_ = CcErrorCode::NO_ERROR; @@ -4058,12 +4045,9 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase { std::lock_guard lk(mux_); err_ = err; - --unfinished_cnt_; - if (unfinished_cnt_ == 0) - { - UnpinSlices(); - cv_.notify_one(); - } + finished_ = true; + UnpinSlices(); + cv_.notify_one(); } void AbortCcRequest(CcErrorCode err_code) override @@ -4084,26 +4068,22 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return err_; } - void SetFinish(size_t core_id) + void SetFinish() { std::unique_lock lk(mux_); - --unfinished_cnt_; - if (export_base_table_item_ && !pause_pos_[core_id].second) + finished_ = true; + if (export_base_table_item_ && !pause_pos_.second) { // Only not drained on this core, should set the paused key. - UpdateMinPausedSlice(&pause_pos_[core_id].first); + UpdateMinPausedSlice(&pause_pos_.first); } else if (!export_base_table_item_) { - UpdateMinPausedSlice(curr_slice_index_[core_id]); - } - - if (unfinished_cnt_ == 0) - { - // Unpin the slices - UnpinSlices(); - cv_.notify_one(); + UpdateMinPausedSlice(curr_slice_index_); } + // Unpin the slices + UnpinSlices(); + cv_.notify_one(); } uint32_t NodeGroupId() @@ -4111,19 +4091,19 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return node_group_id_; } - std::vector &DataSyncVec(uint16_t core_id) + std::vector &DataSyncVec() { - return data_sync_vec_[core_id]; + return data_sync_vec_; } - std::vector &ArchiveVec(uint16_t core_id) + std::vector &ArchiveVec() { - return archive_vec_[core_id]; + return archive_vec_; } - std::vector &MoveBaseIdxVec(uint16_t core_id) + std::vector &MoveBaseIdxVec() { - return mv_base_idx_vec_[core_id]; + return mv_base_idx_vec_; } int64_t NodeGroupTerm() const @@ -4147,66 +4127,47 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return store_range_; } - void FixCurrentSliceIndex(uint16_t core_id) - { - assert(export_base_table_item_); 
- if (pause_pos_[core_id].first.KeyPtr() != nullptr) - { - size_t curr_slice_idx = 0; - StoreSlice *curr_slice = - slice_coordinator_.pinned_slices_[curr_slice_idx]; - while (curr_slice->EndTxKey() < pause_pos_[core_id].first) - { - ++curr_slice_idx; - assert(curr_slice_idx < - slice_coordinator_.pinned_slices_.size()); - curr_slice = slice_coordinator_.pinned_slices_[curr_slice_idx]; - } - curr_slice_index_[core_id] = curr_slice_idx; - } - } - - StoreSlice *CurrentSlice(uint16_t core_id) const + StoreSlice *CurrentSlice() const { - size_t curr_slice_idx = curr_slice_index_[core_id]; if (export_base_table_item_) { - assert(curr_slice_idx < slice_coordinator_.pinned_slices_.size()); - return slice_coordinator_.pinned_slices_.at(curr_slice_idx); + assert(curr_slice_index_ < + slice_coordinator_.pinned_slices_.size()); + return slice_coordinator_.pinned_slices_.at(curr_slice_index_); } - assert(curr_slice_idx < slices_to_scan_.size()); - const TxKey &curr_slice_key = slices_to_scan_.at(curr_slice_idx).first; + assert(curr_slice_index_ < slices_to_scan_.size()); + const TxKey &curr_slice_key = + slices_to_scan_.at(curr_slice_index_).first; return store_range_->FindSlice(curr_slice_key); } - const TxKey &CurrentSliceKey(uint16_t core_id) const + const TxKey &CurrentSliceKey() const { assert(!export_base_table_item_); - size_t curr_slice_index = curr_slice_index_[core_id]; - assert(curr_slice_index < slices_to_scan_.size()); - return slices_to_scan_[curr_slice_index].first; + assert(curr_slice_index_ < slices_to_scan_.size()); + return slices_to_scan_[curr_slice_index_].first; } - void MoveToNextSlice(uint16_t core_id) + void MoveToNextSlice() { - curr_slice_index_[core_id]++; + curr_slice_index_++; } - bool TheBatchEnd(uint16_t core_id) const + bool TheBatchEnd() const { - return curr_slice_index_[core_id] >= + return curr_slice_index_ >= (export_base_table_item_ ? 
slice_coordinator_.pinned_slices_.size() : slice_coordinator_.batch_end_slice_index_); } - bool IsSlicePinned(uint16_t core_id) const + bool IsSlicePinned() const { assert(export_base_table_item_ || - curr_slice_index_[core_id] < slices_to_scan_.size()); + curr_slice_index_ < slices_to_scan_.size()); return export_base_table_item_ ? true - : slices_to_scan_[curr_slice_index_[core_id]].second; + : slices_to_scan_[curr_slice_index_].second; } uint64_t SchemaVersion() const override @@ -4214,11 +4175,6 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return schema_version_; } - void SetUnfinishedCoreCnt(uint16_t core_cnt) - { - unfinished_cnt_ = core_cnt; - } - void UnpinSlices() { if (slice_coordinator_.first_slice_id_.Range() != nullptr) @@ -4262,13 +4218,10 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return last_data_sync_ts_; } - std::vector accumulated_scan_cnt_; - std::vector accumulated_flush_data_size_; - - // std::vector is not safe to use in multi-threaded environment, - std::vector scan_heap_is_full_{0}; + size_t accumulated_scan_cnt_; + uint64_t accumulated_flush_data_size_; - size_t scan_count_{0}; + uint32_t scan_heap_is_full_{0}; private: struct SliceCoordinator @@ -4388,7 +4341,6 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase const TableName *table_name_{nullptr}; uint32_t node_group_id_; int64_t node_group_term_; - uint16_t core_cnt_; // It is used as a hint to decide if a page has dirty data since last round // of checkpoint. It is guaranteed that all entries committed before this ts // are synced into data store. @@ -4396,10 +4348,10 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase // Target ts. Collect all data changes committed before this ts into data // sync vec. 
uint64_t data_sync_ts_; - std::vector> data_sync_vec_; - std::vector> archive_vec_; + std::vector data_sync_vec_; + std::vector archive_vec_; // Cache the entries to move record from "base" table to "archive" table - std::vector> mv_base_idx_vec_; + std::vector mv_base_idx_vec_; // Start/end key of target range if the scan is on a range only, nullptr if // it's on entire table. @@ -4408,11 +4360,11 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase // Position that we left off during last round of ckpt scan. // pause_pos_.first is the key that we stopped at (has not been scanned // though), bool is if this core has finished scanning all keys already. - std::vector> pause_pos_; + std::pair pause_pos_; size_t scan_batch_size_; CcErrorCode err_{CcErrorCode::NO_ERROR}; - uint32_t unfinished_cnt_; + bool finished_{false}; std::mutex mux_; std::condition_variable cv_; @@ -4430,7 +4382,7 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase // The index of the current slice to be scanned. If export_base_table_item_ // is true, it is the index of the SliceCoordinator::pinned_slices_ vector, // and if false, it is the index of the slices_to_scan_ vector. 
- std::vector curr_slice_index_; + size_t curr_slice_index_; // keep schema vesion after acquire read lock on catalog, to prevent the // concurrency issue with Truncate Table, detail ref to tx issue #1130 // If schema_version_ is 0, the check will be bypassed, since this data sync @@ -8646,7 +8598,6 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase uint64_t scan_ts, uint64_t ng_id, int64_t ng_term, - uint64_t core_cnt, uint64_t txn, const TxKey &target_start_key, const TxKey &target_end_key, @@ -8663,20 +8614,14 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase store_range_(store_range), is_dirty_(is_dirty), has_dml_since_ddl_(false), - unfinished_cnt_(core_cnt), + finished_(false), schema_version_(schema_version) { tx_number_ = txn; - pause_pos_.resize(core_cnt); + pause_pos_.first = std::move(TxKey()); + pause_pos_.second = nullptr; size_t slice_cnt = store_range ? store_range->SlicesCount() : 0; - for (size_t i = 0; i < core_cnt; ++i) - { - slice_delta_size_.emplace_back(); - if (slice_cnt > 0) - { - slice_delta_size_.back().reserve(slice_cnt); - } - } + slice_delta_size_.reserve(slice_cnt); } bool ValidTermCheck() const @@ -8719,26 +8664,22 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase void Wait() { std::unique_lock lk(mux_); - cv_.wait(lk, [this] { return unfinished_cnt_ == 0; }); + cv_.wait(lk, [this] { return finished_; }); } void SetFinish() { std::unique_lock lk(mux_); - if (--unfinished_cnt_ == 0) - { - cv_.notify_one(); - } + finished_ = true; + cv_.notify_one(); } void SetError(CcErrorCode err) { std::unique_lock lk(mux_); err_ = err; - if (--unfinished_cnt_ == 0) - { - cv_.notify_one(); - } + finished_ = true; + cv_.notify_one(); } bool IsError() @@ -8800,18 +8741,18 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase assert(store_range); bool res = store_range_.compare_exchange_strong( expect, store_range, std::memory_order_acq_rel); - 
slice_delta_size_[core_id].reserve(store_range->SlicesCount()); + slice_delta_size_.reserve(store_range->SlicesCount()); return res; } - std::pair &PausedPos(size_t core_id) + std::pair &PausedPos() { - return pause_pos_[core_id]; + return pause_pos_; } - std::vector> &SliceDeltaSize(size_t core_id) + std::vector> &SliceDeltaSize() { - return slice_delta_size_[core_id]; + return slice_delta_size_; } bool IsDirty() const @@ -8855,10 +8796,10 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase // pause_pos_.first is the key that we stopped at (has not been scanned // though), .second is the slice that we stopped in (has not been scanned // completed yet). - std::vector> pause_pos_; + std::pair pause_pos_; // The delta size of the slices. First is the TxKey of the slice, second is // the delta size. The TxKey is not the owner of the key. - std::vector>> slice_delta_size_; + std::vector> slice_delta_size_; // Generally, if the size of a key in the data store is unknown (the // data_store_size_ is INT32_MAX), we need to read the storage (via @@ -8876,7 +8817,7 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase std::atomic has_dml_since_ddl_{false}; CcErrorCode err_{CcErrorCode::NO_ERROR}; - uint32_t unfinished_cnt_; + bool finished_{false}; uint64_t schema_version_; std::mutex mux_; std::condition_variable cv_; diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index 22a21f8d..070085a1 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -4970,37 +4970,17 @@ class TemplateCcMap : public CcMap req.slice_coordinator_.UpdatePreparedSliceCnt(prepared_slice_cnt); req.slice_coordinator_.UpdateBatchEnd(); - if (req.export_base_table_item_) - { - // Fix the slice index of the current core - for (uint16_t core_id = 0; core_id < shard_->core_cnt_; - ++core_id) - { - req.FixCurrentSliceIndex(core_id); - } - } req.slice_coordinator_.SetReadyForScan(); 
- req.SetUnfinishedCoreCnt(shard_->core_cnt_); - - // Dispatch the request to the cores - for (uint16_t core_id = 0; core_id < shard_->core_cnt_; ++core_id) - { - if (core_id == shard_->core_id_) - { - continue; - } - shard_->Enqueue(shard_->LocalCoreId(), core_id, &req); - } } - if (req.IsDrained(shard_->core_id_)) + if (req.IsDrained()) { // scan is already finished on this core - req.SetFinish(shard_->core_id_); + req.SetFinish(); return false; } - auto &pause_key_and_is_drained = req.PausePos(shard_->core_id_); + auto &pause_key_and_is_drained = req.PausePos(); auto find_non_empty_slice = [this, &req, &deduce_iterator](const KeyT &search_key) @@ -5014,8 +4994,7 @@ class TemplateCcMap : public CcMap } else { - const TxKey &curr_start_tx_key = - req.CurrentSliceKey(shard_->core_id_); + const TxKey &curr_start_tx_key = req.CurrentSliceKey(); const KeyT *curr_start_key = curr_start_tx_key.GetKey(); start_key = (*curr_start_key < search_key ? &search_key : curr_start_key); @@ -5040,7 +5019,7 @@ class TemplateCcMap : public CcMap const KeyT *slice_end_key = nullptr; do { - store_slice = req.CurrentSlice(shard_->core_id_); + store_slice = req.CurrentSlice(); const TemplateStoreSlice *typed_slice = static_cast *>(store_slice); start_key = @@ -5057,11 +5036,11 @@ class TemplateCcMap : public CcMap } // The current slice is empty, try to find next slice. - req.MoveToNextSlice(shard_->core_id_); + req.MoveToNextSlice(); start_key = nullptr; // Continue to handle the next slice if not the batch end - } while (!req.TheBatchEnd(shard_->core_id_)); + } while (!req.TheBatchEnd()); return {it, end_it, slice_end_key}; }; @@ -5102,16 +5081,14 @@ class TemplateCcMap : public CcMap // If reach to the batch end, it means there are no slices that need to // be scanned. - bool slice_pinned = req.TheBatchEnd(shard_->core_id_) - ? false - : req.IsSlicePinned(shard_->core_id_); + bool slice_pinned = req.TheBatchEnd() ? 
false : req.IsSlicePinned(); // The following flag is used to mark the behavior of one slice. // Only need to export the key if the key is already persisted, this // will happen when the slice need to split, and should export all the // keys in this slice to get the subslice keys. bool export_persisted_key_only = !req.export_base_table_item_ && slice_pinned; - assert(key_it != slice_end_it || req.TheBatchEnd(shard_->core_id_)); + assert(key_it != slice_end_it || req.TheBatchEnd()); // 3. Loop to scan keys // DataSyncScanCc is running on TxProcessor thread. To avoid @@ -5120,8 +5097,7 @@ class TemplateCcMap : public CcMap for (size_t scan_cnt = 0; key_it != slice_end_it && key_it != slice_end_next_page_it && scan_cnt < RangePartitionDataSyncScanCc::DataSyncScanBatchSize && - req.accumulated_scan_cnt_.at(shard_->core_id_) < - req.scan_batch_size_; + req.accumulated_scan_cnt_ < req.scan_batch_size_; ++scan_cnt) { const KeyT *key = key_it->first; @@ -5154,8 +5130,8 @@ class TemplateCcMap : public CcMap { // Reach to the end of current slice. // Move to the next slice. - req.MoveToNextSlice(shard_->core_id_); - if (!req.TheBatchEnd(shard_->core_id_)) + req.MoveToNextSlice(); + if (!req.TheBatchEnd()) { search_start_key = slice_end_key; std::tie(key_it, slice_end_it, slice_end_key) = @@ -5166,9 +5142,7 @@ class TemplateCcMap : public CcMap // If reach to the batch end, it means there are no // slices that need to be scanned. slice_pinned = - req.TheBatchEnd(shard_->core_id_) - ? false - : req.IsSlicePinned(shard_->core_id_); + req.TheBatchEnd() ? 
false : req.IsSlicePinned(); export_persisted_key_only = !req.export_base_table_item_ && slice_pinned; } @@ -5222,20 +5196,19 @@ class TemplateCcMap : public CcMap auto export_result = ExportForCkpt(cce, *key, - req.DataSyncVec(shard_->core_id_), - req.ArchiveVec(shard_->core_id_), - req.MoveBaseIdxVec(shard_->core_id_), + req.DataSyncVec(), + req.ArchiveVec(), + req.MoveBaseIdxVec(), req.data_sync_ts_, recycle_ts, shard_->EnableMvcc(), - req.accumulated_scan_cnt_[shard_->core_id_], + req.accumulated_scan_cnt_, req.export_base_table_item_, req.export_base_table_item_only_, export_persisted_key_only, flush_size); - req.accumulated_flush_data_size_[shard_->core_id_] += - flush_size; + req.accumulated_flush_data_size_ += flush_size; if (export_result.second) { @@ -5252,8 +5225,8 @@ class TemplateCcMap : public CcMap { slice_pinned = false; // Reach to the end of current slice. Move to the next slice. - req.MoveToNextSlice(shard_->core_id_); - if (!req.TheBatchEnd(shard_->core_id_)) + req.MoveToNextSlice(); + if (!req.TheBatchEnd()) { search_start_key = slice_end_key; std::tie(key_it, slice_end_it, slice_end_key) = @@ -5263,9 +5236,8 @@ class TemplateCcMap : public CcMap // If reach to the batch end, it means there are no slices // that need to be scanned. - slice_pinned = req.TheBatchEnd(shard_->core_id_) - ? false - : req.IsSlicePinned(shard_->core_id_); + slice_pinned = + req.TheBatchEnd() ? false : req.IsSlicePinned(); export_persisted_key_only = !req.export_base_table_item_ && slice_pinned; } @@ -5276,7 +5248,7 @@ class TemplateCcMap : public CcMap // scan batch size, or reach to the end slice of the current batch // slices. assert((key_it != slice_end_it && key_it != slice_end_next_page_it) || - req.TheBatchEnd(shard_->core_id_)); + req.TheBatchEnd()); // 4. Check whether the request is finished. 
TxKey next_pause_key; bool no_more_data = @@ -5298,16 +5270,15 @@ class TemplateCcMap : public CcMap if (is_scan_mem_full) { - req.scan_heap_is_full_[shard_->core_id_] = 1; + req.scan_heap_is_full_ = 1; } if (is_scan_mem_full || no_more_data || - req.accumulated_scan_cnt_[shard_->core_id_] >= - req.scan_batch_size_ || - req.TheBatchEnd(shard_->core_id_)) + req.accumulated_scan_cnt_ >= req.scan_batch_size_ || + req.TheBatchEnd()) { // Request is finished - req.SetFinish(shard_->core_id_); + req.SetFinish(); return false; } @@ -7660,7 +7631,7 @@ class TemplateCcMap : public CcMap const KeyT *const req_start_key = req.StartTxKey().GetKey(); const KeyT *const req_end_key = req.EndTxKey().GetKey(); - auto &paused_position = req.PausedPos(shard_->core_id_); + auto &paused_position = req.PausedPos(); bool is_dirty = req.IsDirty(); @@ -7731,8 +7702,7 @@ class TemplateCcMap : public CcMap slice_end_next_page_it = next_page_it(slice_end_it); - curr_slice_delta_size = - &(req.SliceDeltaSize(shard_->core_id_).back().second); + curr_slice_delta_size = &(req.SliceDeltaSize().back().second); } bool has_dml_since_ddl = false; @@ -7934,8 +7904,7 @@ class TemplateCcMap : public CcMap slice_end_next_page_it = next_page_it(slice_end_it); - auto &slice_delta_size = - req.SliceDeltaSize(shard_->core_id_); + auto &slice_delta_size = req.SliceDeltaSize(); slice_delta_size.emplace_back(slice->StartTxKey(), 0); curr_slice_delta_size = &slice_delta_size.back().second; } diff --git a/tx_service/src/cc/local_cc_shards.cpp b/tx_service/src/cc/local_cc_shards.cpp index 956427e2..57f5cf12 100644 --- a/tx_service/src/cc/local_cc_shards.cpp +++ b/tx_service/src/cc/local_cc_shards.cpp @@ -3871,7 +3871,6 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->data_sync_ts_, ng_id, ng_term, - cc_shards_.size(), tx_number, start_tx_key, end_tx_key, @@ -3879,10 +3878,10 @@ void LocalCcShards::DataSyncForRangePartition( is_dirty, schema_version); - for (size_t i = 0; i < cc_shards_.size(); i++) 
- { - EnqueueLowPriorityCcRequestToShard(i, &scan_delta_size_cc); - } + uint16_t dest_core = static_cast( + (range_entry->GetRangeInfo()->PartitionId() & 0x3FF) % Count()); + EnqueueLowPriorityCcRequestToShard(dest_core, &scan_delta_size_cc); + scan_delta_size_cc.Wait(); if (scan_delta_size_cc.IsError()) @@ -3905,14 +3904,10 @@ void LocalCcShards::DataSyncForRangePartition( return; } - for (size_t i = 0; i < cc_shards_.size(); ++i) + auto &delta_size = scan_delta_size_cc.SliceDeltaSize(); + for (auto &delta : delta_size) { - auto &delta_size = scan_delta_size_cc.SliceDeltaSize(i); - for (size_t j = 0; j < delta_size.size(); ++j) - { - slices_delta_size[std::move(delta_size[j].first)] += - delta_size[j].second; - } + slices_delta_size[std::move(delta.first)] += delta.second; } if (!export_base_table_items && slices_delta_size.size() == 0) @@ -4007,40 +4002,6 @@ void LocalCcShards::DataSyncForRangePartition( } // 3. Scan records. - // The data sync worker thread is the owner of those vectors. - - // Sort output vectors in key sorting order. - auto key_greater = [](const std::pair &r1, - const std::pair &r2) -> bool - { return r2.first < r1.first; }; - auto rec_greater = [](const FlushRecord &r1, const FlushRecord &r2) -> bool - { return r2.Key() < r1.Key(); }; - - std::vector> data_sync_vecs; - std::vector> archive_vecs; - std::vector>> mv_base_vecs; - - // Add an extra vector as a remaining vector to store the remaining keys - // of the current batch of FlushRecords. - // DataSyncScanCc request is executed in parallel on all cores. For a - // batch of scan results, the end keys among the cores are different. - // In order to ensure the accuracy of the calculated subslice keys, for - // this batch of FlushRecords, the minimum end key of all cores's scan - // result is obtained, and the FlushRecords after this key is placed in - // this remaining vector, which will be merged with the next batch of - // FlushRecords. 
For example: core1[10,15,20], core2[8,16,24,32], only - // [8,10,15,16,20] will be flushed into data store in this round,and - // the remaining vector stores [24,32] - for (size_t i = 0; i < (cc_shards_.size() + 1); ++i) - { - data_sync_vecs.emplace_back(); - data_sync_vecs.back().reserve(DATA_SYNC_SCAN_BATCH_SIZE); - archive_vecs.emplace_back(); - archive_vecs.back().reserve(DATA_SYNC_SCAN_BATCH_SIZE); - mv_base_vecs.emplace_back(); - mv_base_vecs.back().reserve(DATA_SYNC_SCAN_BATCH_SIZE); - } - // Scan the FlushRecords. // Paused position UpdateSliceStatus update_slice_status; @@ -4073,7 +4034,6 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->data_sync_ts_, ng_id, ng_term, - cc_shards_.size(), DATA_SYNC_SCAN_BATCH_SIZE, tx_number, &start_tx_key, @@ -4095,12 +4055,7 @@ void LocalCcShards::DataSyncForRangePartition( while (!scan_data_drained) { - uint32_t core_rand = butil::fast_rand(); - // The scan slice request is dispatched to the first core. The first - // core tries to pin the slice if necessary and if succeeds, further - // dispatches the request to remaining cores for parallel scans. - EnqueueLowPriorityCcRequestToShard(core_rand % cc_shards_.size(), - &scan_cc); + EnqueueLowPriorityCcRequestToShard(dest_core, &scan_cc); scan_cc.Wait(); if (scan_cc.IsError()) @@ -4119,61 +4074,51 @@ void LocalCcShards::DataSyncForRangePartition( else { scan_data_drained = true; - assert(scan_cc.accumulated_flush_data_size_.size() == - cc_shards_.size()); - uint64_t flush_data_size = 0; - for (size_t flush_data_size_per_core : - scan_cc.accumulated_flush_data_size_) - { - flush_data_size += flush_data_size_per_core; - } + uint64_t flush_data_size = scan_cc.accumulated_flush_data_size_; // The cost of FlushRecord also needs to be considered. 
- for (size_t i = 0; i < cc_shards_.size(); ++i) - { #ifdef WITH_JEMALLOC - flush_data_size += - (scan_cc.DataSyncVec(i).size() * sizeof(FlushRecord) + - scan_cc.ArchiveVec(i).size() * sizeof(FlushRecord) + - scan_cc.MoveBaseIdxVec(i).size() * - sizeof(std::pair)); + flush_data_size += + (scan_cc.DataSyncVec().size() * sizeof(FlushRecord) + + scan_cc.ArchiveVec().size() * sizeof(FlushRecord) + + scan_cc.MoveBaseIdxVec().size() * + sizeof(std::pair)); #else - // Check if vectors are empty before calling malloc_usable_size - // to avoid SEGV on nullptr or invalid pointers. - // Use malloc_usable_size when ASan is enabled (vectors may be - // allocated by ASan's allocator), otherwise use - // mi_malloc_usable_size for mimalloc-allocated memory. - auto &data_sync_vec_ref = scan_cc.DataSyncVec(i); - auto &archive_vec_ref = scan_cc.ArchiveVec(i); - auto &move_base_idx_vec_ref = scan_cc.MoveBaseIdxVec(i); + // Check if vectors are empty before calling malloc_usable_size + // to avoid SEGV on nullptr or invalid pointers. + // Use malloc_usable_size when ASan is enabled (vectors may be + // allocated by ASan's allocator), otherwise use + // mi_malloc_usable_size for mimalloc-allocated memory. + auto &data_sync_vec_ref = scan_cc.DataSyncVec(); + auto &archive_vec_ref = scan_cc.ArchiveVec(); + auto &move_base_idx_vec_ref = scan_cc.MoveBaseIdxVec(); #ifdef __SANITIZE_ADDRESS__ - // When ASan is enabled, use standard malloc_usable_size - flush_data_size += - (data_sync_vec_ref.empty() - ? 0 - : malloc_usable_size(data_sync_vec_ref.data())) + - (archive_vec_ref.empty() - ? 0 - : malloc_usable_size(archive_vec_ref.data())) + - (move_base_idx_vec_ref.empty() - ? 0 - : malloc_usable_size(move_base_idx_vec_ref.data())); + // When ASan is enabled, use standard malloc_usable_size + flush_data_size += + (data_sync_vec_ref.empty() + ? 0 + : malloc_usable_size(data_sync_vec_ref.data())) + + (archive_vec_ref.empty() + ? 
0 + : malloc_usable_size(archive_vec_ref.data())) + + (move_base_idx_vec_ref.empty() + ? 0 + : malloc_usable_size(move_base_idx_vec_ref.data())); #else - // When ASan is not enabled, use mimalloc's API - flush_data_size += - (data_sync_vec_ref.empty() - ? 0 - : mi_malloc_usable_size(data_sync_vec_ref.data())) + - (archive_vec_ref.empty() - ? 0 - : mi_malloc_usable_size(archive_vec_ref.data())) + - (move_base_idx_vec_ref.empty() - ? 0 - : mi_malloc_usable_size(move_base_idx_vec_ref.data())); + // When ASan is not enabled, use mimalloc's API + flush_data_size += + (data_sync_vec_ref.empty() + ? 0 + : mi_malloc_usable_size(data_sync_vec_ref.data())) + + (archive_vec_ref.empty() + ? 0 + : mi_malloc_usable_size(archive_vec_ref.data())) + + (move_base_idx_vec_ref.empty() + ? 0 + : mi_malloc_usable_size(move_base_idx_vec_ref.data())); #endif #endif - } // This thread will wait in AllocatePendingFlushDataMemQuota if // quota is not available @@ -4189,53 +4134,6 @@ void LocalCcShards::DataSyncForRangePartition( << " of range: " << range_id << " for table: " << table_name.StringView(); - // The minimum end key of this batch data between all the cores. - TxKey min_scanned_end_key = - GetCatalogFactory(table_name.Engine())->PositiveInfKey(); - for (size_t i = 0; i < cc_shards_.size(); ++i) - { - for (size_t j = 0; j < scan_cc.accumulated_scan_cnt_[i]; ++j) - { - auto &rec = scan_cc.DataSyncVec(i)[j]; - // Clone key - data_sync_vecs[i].emplace_back( - rec.Key().Clone(), - rec.ReleaseVersionedPayload(), - rec.payload_status_, - rec.commit_ts_, - rec.cce_, - rec.post_flush_size_, - range_id); - } - - // Get the minimum end key. 
- if (!data_sync_vecs[i].empty() && - data_sync_vecs[i].back().Key() < min_scanned_end_key) - { - min_scanned_end_key = data_sync_vecs[i].back().Key(); - } - - for (size_t j = 0; j < scan_cc.ArchiveVec(i).size(); ++j) - { - auto &rec = scan_cc.ArchiveVec(i)[j]; - rec.SetKey(data_sync_vecs[i][rec.GetKeyIndex()].Key()); - } - - for (size_t j = 0; j < scan_cc.MoveBaseIdxVec(i).size(); ++j) - { - size_t key_idx = scan_cc.MoveBaseIdxVec(i)[j]; - TxKey key_raw = data_sync_vecs[i][key_idx].Key(); - mv_base_vecs[i].emplace_back(std::move(key_raw), range_id); - } - - // Move the bucket into the tank - std::move(scan_cc.ArchiveVec(i).begin(), - scan_cc.ArchiveVec(i).end(), - std::back_inserter(archive_vecs.at(i))); - - scan_data_drained = scan_cc.IsDrained(i) && scan_data_drained; - } - std::unique_ptr> data_sync_vec = std::make_unique>(); std::unique_ptr> archive_vec = @@ -4244,90 +4142,46 @@ void LocalCcShards::DataSyncForRangePartition( mv_base_vec = std::make_unique>>(); - MergeSortedVectors( - std::move(mv_base_vecs), *mv_base_vec, key_greater, false); - - // Set the ckpt_ts_ of a cc entry repeatedly, which might cause the - // ccentry become invalid in between. But, there should be no - // duplication here. we don't need to remove duplicate record. - MergeSortedVectors( - std::move(data_sync_vecs), *data_sync_vec, rec_greater, false); - - // For archive vec we don't need to worry about duplicate causing - // issue since we're not visiting their cc entry. Also we cannot - // rely on key compare to dedup archive vec since a key could have - // multiple version of archive versions. 
- MergeSortedVectors( - std::move(archive_vecs), *archive_vec, rec_greater, false); - - data_sync_vecs.resize(cc_shards_.size() + 1); - archive_vecs.resize(cc_shards_.size() + 1); - mv_base_vecs.resize(cc_shards_.size() + 1); - for (size_t i = 0; i <= cc_shards_.size(); ++i) + data_sync_vec->reserve(DATA_SYNC_SCAN_BATCH_SIZE); + archive_vec->reserve(DATA_SYNC_SCAN_BATCH_SIZE); + mv_base_vec->reserve(DATA_SYNC_SCAN_BATCH_SIZE); + + for (size_t j = 0; j < scan_cc.accumulated_scan_cnt_; ++j) { - data_sync_vecs.at(i).clear(); - archive_vecs.at(i).clear(); - mv_base_vecs.at(i).clear(); + auto &rec = scan_cc.DataSyncVec()[j]; + // Clone key + data_sync_vec->emplace_back(rec.Key().Clone(), + rec.ReleaseVersionedPayload(), + rec.payload_status_, + rec.commit_ts_, + rec.cce_, + rec.post_flush_size_, + range_id); } - size_t data_sync_vec_size = data_sync_vec->size(); - // Fix the vector of FlushRecords. - if (!scan_data_drained) + for (size_t j = 0; j < scan_cc.ArchiveVec().size(); ++j) { - // Only flush the keys that are not greater than the - // min_scanned_end_key - auto iter = std::upper_bound( - data_sync_vec->begin(), - data_sync_vec->end(), - min_scanned_end_key, - [](const TxKey &key, const FlushRecord &rec) - { return key < rec.Key(); }); - - auto &remaining_vec = data_sync_vecs[cc_shards_.size()]; - remaining_vec.clear(); - remaining_vec.insert( - remaining_vec.begin(), - std::make_move_iterator(iter), - std::make_move_iterator(data_sync_vec->end())); - data_sync_vec->erase(iter, data_sync_vec->end()); - - // archive vector - auto archive_iter = std::upper_bound( - archive_vec->begin(), - archive_vec->end(), - min_scanned_end_key, - [](const TxKey &key, const FlushRecord &rec) - { return key < rec.Key(); }); - auto &archive_remaining_vec = archive_vecs[cc_shards_.size()]; - archive_remaining_vec.clear(); - archive_remaining_vec.insert( - archive_remaining_vec.begin(), - std::make_move_iterator(archive_iter), - std::make_move_iterator(archive_vec->end())); - 
archive_vec->erase(archive_iter, archive_vec->end()); - - // mv base vector - auto mv_base_iter = std::upper_bound( - mv_base_vec->begin(), - mv_base_vec->end(), - min_scanned_end_key, - [](const TxKey &t_key, - const std::pair &key_and_partition_id) - { return t_key < key_and_partition_id.first; }); - auto &mv_base_remaining_vec = mv_base_vecs[cc_shards_.size()]; - mv_base_remaining_vec.clear(); - mv_base_remaining_vec.insert( - mv_base_remaining_vec.begin(), - std::make_move_iterator(mv_base_iter), - std::make_move_iterator(mv_base_vec->end())); - mv_base_vec->erase(mv_base_iter, mv_base_vec->end()); + auto &rec = scan_cc.ArchiveVec()[j]; + rec.SetKey(data_sync_vec->at(rec.GetKeyIndex()).Key()); } + for (size_t j = 0; j < scan_cc.MoveBaseIdxVec().size(); ++j) + { + size_t key_idx = scan_cc.MoveBaseIdxVec()[j]; + TxKey key_raw = data_sync_vec->at(key_idx).Key(); + mv_base_vec->emplace_back(std::move(key_raw), range_id); + } + + // Move the bucket into the tank + std::move(scan_cc.ArchiveVec().begin(), + scan_cc.ArchiveVec().end(), + std::back_inserter(*archive_vec)); + + scan_data_drained = scan_cc.IsDrained(); + if (data_sync_vec->empty()) { - LOG(WARNING) << "data_sync_vec becomes empty after erase, old " - "size of data_sync_vec_size: " - << data_sync_vec_size; + LOG(WARNING) << "data_sync_vec is empty."; // Reset scan_cc.Reset(); // Return the quota to flush data memory usage pool since the @@ -4403,20 +4257,17 @@ void LocalCcShards::DataSyncForRangePartition( table_schema, flush_data_size)); - for (size_t i = 0; i < cc_shards_.size(); ++i) + if (scan_cc.scan_heap_is_full_ == 1) { - if (scan_cc.scan_heap_is_full_[i] == 1) - { - // Clear the FlushRecords' memory of scan cc since the - // DataSyncScan heap is full. 
- auto &data_sync_vec_ref = scan_cc.DataSyncVec(i); - auto &archive_vec_ref = scan_cc.ArchiveVec(i); - ReleaseDataSyncScanHeapCc release_scan_heap_cc( - &data_sync_vec_ref, &archive_vec_ref); - EnqueueLowPriorityCcRequestToShard(i, - &release_scan_heap_cc); - release_scan_heap_cc.Wait(); - } + // Clear the FlushRecords' memory of scan cc since the + // DataSyncScan heap is full. + auto &data_sync_vec_ref = scan_cc.DataSyncVec(); + auto &archive_vec_ref = scan_cc.ArchiveVec(); + ReleaseDataSyncScanHeapCc release_scan_heap_cc( + &data_sync_vec_ref, &archive_vec_ref); + EnqueueLowPriorityCcRequestToShard(dest_core, + &release_scan_heap_cc); + release_scan_heap_cc.Wait(); } // Reset scan_cc.Reset(); @@ -4431,19 +4282,12 @@ void LocalCcShards::DataSyncForRangePartition( } // Release scan heap memory after scan finish. - std::list req_vec; - for (size_t core_idx = 0; core_idx < Count(); ++core_idx) - { - auto &data_sync_vec_ref = scan_cc.DataSyncVec(core_idx); - auto &archive_vec_ref = scan_cc.ArchiveVec(core_idx); - req_vec.emplace_back(&data_sync_vec_ref, &archive_vec_ref); - EnqueueLowPriorityCcRequestToShard(core_idx, &req_vec.back()); - } - while (req_vec.size() > 0) - { - req_vec.back().Wait(); - req_vec.pop_back(); - } + auto &data_sync_vec_ref = scan_cc.DataSyncVec(); + auto &archive_vec_ref = scan_cc.ArchiveVec(); + ReleaseDataSyncScanHeapCc release_scan_heap_cc(&data_sync_vec_ref, + &archive_vec_ref); + EnqueueLowPriorityCcRequestToShard(dest_core, &release_scan_heap_cc); + release_scan_heap_cc.Wait(); PostProcessRangePartitionDataSyncTask(std::move(data_sync_task), data_sync_txm, @@ -5900,7 +5744,9 @@ void LocalCcShards::FlushData(std::unique_lock &flush_worker_lk) size_t key_core_idx = 0; if (!table_name.IsHashPartitioned()) { - key_core_idx = (rec.Key().Hash() & 0x3FF) % Count(); + int32_t range_id = entry->data_sync_task_->id_; + key_core_idx = static_cast( + (range_id & 0x3FF) % Count()); } else { From 1aded310662ef988aaa6eb82699725e822270427 Mon Sep 17 
00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:39:40 +0800 Subject: [PATCH 09/15] Adapt load slice with new key sharding (#446) Update the structure definition and related processing procedures of FillStoreSlice to adapt to the new key sharding logic. --- tx_service/include/cc/cc_req_misc.h | 37 +++--- tx_service/include/cc/template_cc_map.h | 9 +- tx_service/src/cc/cc_req_misc.cpp | 149 +++++++++--------------- tx_service/src/cc/range_slice.cpp | 7 +- 4 files changed, 79 insertions(+), 123 deletions(-) diff --git a/tx_service/include/cc/cc_req_misc.h b/tx_service/include/cc/cc_req_misc.h index 27b290c7..558ce2c6 100644 --- a/tx_service/include/cc/cc_req_misc.h +++ b/tx_service/include/cc/cc_req_misc.h @@ -426,10 +426,9 @@ struct FillStoreSliceCc : public CcRequestBase bool Execute(CcShard &ccs) override; - std::deque &SliceData(uint16_t core_id) + std::deque &SliceData() { - assert(core_id < partitioned_slice_data_.size()); - return partitioned_slice_data_[core_id]; + return slice_data_; } void AddDataItem(TxKey key, @@ -437,8 +436,8 @@ struct FillStoreSliceCc : public CcRequestBase uint64_t version_ts, bool is_deleted); - bool SetFinish(CcShard *cc_shard); - bool SetError(CcErrorCode err_code); + void SetFinish(CcShard *cc_shard); + void SetError(CcErrorCode err_code); void SetKvFinish(bool success); @@ -447,12 +446,9 @@ struct FillStoreSliceCc : public CcRequestBase assert(err_code != CcErrorCode::NO_ERROR); DLOG(ERROR) << "Abort this FillStoreSliceCc request with error: " << CcErrorMessage(err_code); - bool finish_all = SetError(err_code); + SetError(err_code); // Recycle request - if (finish_all) - { - Free(); - } + Free(); } const TableName &TblName() const @@ -485,17 +481,16 @@ struct FillStoreSliceCc : public CcRequestBase force_load_ = force_load; } - size_t NextIndex(size_t core_idx) const + size_t NextIndex() const { - size_t next_idx = next_idxs_[core_idx]; - assert(next_idx <= 
partitioned_slice_data_[core_idx].size()); - return next_idx; + assert(next_idx_ <= slice_data_.size()); + return next_idx_; } - void SetNextIndex(size_t core_idx, size_t index) + void SetNextIndex(size_t index) { - assert(index <= partitioned_slice_data_[core_idx].size()); - next_idxs_[core_idx] = index; + assert(index <= slice_data_.size()); + next_idx_ = index; } NodeGroupId NodeGroup() const @@ -533,6 +528,8 @@ struct FillStoreSliceCc : public CcRequestBase return true; } + int32_t PartitionId() const; + metrics::TimePoint start_; private: @@ -540,13 +537,11 @@ struct FillStoreSliceCc : public CcRequestBase NodeGroupId cc_ng_id_; int64_t cc_ng_term_; bool force_load_; - uint16_t finish_cnt_; - uint16_t core_cnt_; std::mutex mux_; CcErrorCode err_code_{CcErrorCode::NO_ERROR}; - std::vector next_idxs_; - std::vector> partitioned_slice_data_; + size_t next_idx_; + std::deque slice_data_; StoreSlice *range_slice_ = nullptr; StoreRange *range_ = nullptr; diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index 070085a1..41a1e11b 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -6704,9 +6704,9 @@ class TemplateCcMap : public CcMap bool Execute(FillStoreSliceCc &req) override { - std::deque &slice_vec = req.SliceData(shard_->core_id_); + std::deque &slice_vec = req.SliceData(); - size_t index = req.NextIndex(shard_->core_id_); + size_t index = req.NextIndex(); size_t last_index = std::min(index + FillStoreSliceCc::MaxScanBatchSize, slice_vec.size()); @@ -6723,11 +6723,12 @@ class TemplateCcMap : public CcMap if (index == slice_vec.size()) { slice_vec.clear(); - return req.SetFinish(shard_); + req.SetFinish(shard_); + return true; } else { - req.SetNextIndex(shard_->core_id_, index); + req.SetNextIndex(index); shard_->Enqueue(shard_->LocalCoreId(), &req); return false; } diff --git a/tx_service/src/cc/cc_req_misc.cpp b/tx_service/src/cc/cc_req_misc.cpp index 
014795cd..c4ea6e5c 100644 --- a/tx_service/src/cc/cc_req_misc.cpp +++ b/tx_service/src/cc/cc_req_misc.cpp @@ -590,14 +590,9 @@ void FillStoreSliceCc::Reset(const TableName &table_name, cc_ng_id_ = cc_ng_id; cc_ng_term_ = cc_ng_term; force_load_ = force_load; - finish_cnt_ = 0; - core_cnt_ = cc_shards.Count(); - next_idxs_.clear(); - next_idxs_.resize(cc_shards.Count(), 0); - - partitioned_slice_data_.clear(); - partitioned_slice_data_.resize(cc_shards.Count()); + next_idx_ = 0; + slice_data_.clear(); range_slice_ = slice; range_ = range; @@ -619,7 +614,7 @@ void FillStoreSliceCc::SetKvFinish(bool success) { CODE_FAULT_INJECTOR("LoadRangeSliceRequest_SetFinish_Error", { success = false; - partitioned_slice_data_.clear(); + slice_data_.clear(); slice_size_ = 0; snapshot_ts_ = 0; }); @@ -656,7 +651,8 @@ bool FillStoreSliceCc::Execute(CcShard &ccs) int64_t cc_ng_term = Sharder::Instance().LeaderTerm(cc_ng_id_); if (std::max(cc_ng_candid_term, cc_ng_term) != cc_ng_term_) { - return SetError(CcErrorCode::NG_TERM_CHANGED); + SetError(CcErrorCode::NG_TERM_CHANGED); + return true; } CcMap *ccm = ccs.GetCcm(*table_name_, cc_ng_id_); @@ -705,106 +701,65 @@ void FillStoreSliceCc::AddDataItem( rec_cnt_++; } - size_t hash = key.Hash(); - // Uses the lower 10 bits of the hash code to shard the key across - // CPU cores at this node. 
- uint16_t core_code = hash & 0x3FF; - uint16_t core_id = core_code % core_cnt_; - - partitioned_slice_data_[core_id].emplace_back( + slice_data_.emplace_back( std::move(key), std::move(record), version_ts, is_deleted); } -bool FillStoreSliceCc::SetFinish(CcShard *cc_shard) +void FillStoreSliceCc::SetFinish(CcShard *cc_shard) { - bool finish_all = false; - CcErrorCode err_code; + if (err_code_ == CcErrorCode::NO_ERROR) { - std::lock_guard lk(mux_); - ++finish_cnt_; - - if (finish_cnt_ == core_cnt_) + bool init_key_cache = + txservice_enable_key_cache && table_name_->IsBase(); + // Cache the pointer since FillStoreSliceCc will be freed after + // CommitLoading. + + const TableName *tbl_name = table_name_; + auto cc_ng_id = cc_ng_id_; + auto cc_ng_term = cc_ng_term_; + if (init_key_cache && rec_cnt_ > 0) { - finish_all = true; - err_code = err_code_; - } - } + LocalCcShards *shards = Sharder::Instance().GetLocalCcShards(); + size_t estimate_rec_size = UINT64_MAX; - if (finish_all) - { - if (err_code == CcErrorCode::NO_ERROR) - { - bool init_key_cache = - txservice_enable_key_cache && table_name_->IsBase(); - // Cache the pointer since FillStoreSliceCc will be freed after - // CommitLoading. - - const TableName *tbl_name = table_name_; - auto cc_ng_id = cc_ng_id_; - auto cc_ng_term = cc_ng_term_; - if (init_key_cache && rec_cnt_ > 0) - { - LocalCcShards *shards = Sharder::Instance().GetLocalCcShards(); - size_t estimate_rec_size = UINT64_MAX; - - // Get estiamte record size for key cache - auto schema = shards->GetSharedTableSchema( - TableName(table_name_->GetBaseTableNameSV(), - TableType::Primary, - table_name_->Engine()), - cc_ng_id_); - auto stats = schema->StatisticsObject(); - assert(slice_size_ > 0); - estimate_rec_size = slice_size_ / rec_cnt_; - if (stats) - { - // Update estimate size in table stats with the loaded - // slice. 
- stats->SetEstimateRecordSize(estimate_rec_size); - } - } - range_slice_->CommitLoading(*range_, slice_size_); - if (init_key_cache) + // Get estiamte record size for key cache + auto schema = shards->GetSharedTableSchema( + TableName(table_name_->GetBaseTableNameSV(), + TableType::Primary, + table_name_->Engine()), + cc_ng_id_); + auto stats = schema->StatisticsObject(); + assert(slice_size_ > 0); + estimate_rec_size = slice_size_ / rec_cnt_; + if (stats) { - range_slice_->InitKeyCache( - cc_shard, range_, tbl_name, cc_ng_id, cc_ng_term); + // Update estimate size in table stats with the loaded + // slice. + stats->SetEstimateRecordSize(estimate_rec_size); } } - else - { - range_slice_->SetLoadingError(*range_, err_code); - } - - next_idxs_.clear(); - partitioned_slice_data_.clear(); - } - - return finish_all; -} - -bool FillStoreSliceCc::SetError(CcErrorCode err_code) -{ - bool finish_all = false; - { - std::lock_guard lk(mux_); - ++finish_cnt_; - err_code_ = err_code; - - if (finish_cnt_ == core_cnt_) + range_slice_->CommitLoading(*range_, slice_size_); + if (init_key_cache) { - finish_all = true; + range_slice_->InitKeyCache( + cc_shard, range_, tbl_name, cc_ng_id, cc_ng_term); } } - - if (finish_all) + else { range_slice_->SetLoadingError(*range_, err_code_); - - next_idxs_.clear(); - partitioned_slice_data_.clear(); } - return finish_all; + next_idx_ = 0; + slice_data_.clear(); +} + +void FillStoreSliceCc::SetError(CcErrorCode err_code) +{ + err_code_ = err_code; + range_slice_->SetLoadingError(*range_, err_code_); + next_idx_ = 0; + slice_data_.clear(); } void FillStoreSliceCc::StartFilling() @@ -818,8 +773,14 @@ void FillStoreSliceCc::TerminateFilling() // The slice has not been filled into memory. So, the out-of-memory flag is // false. 
range_slice_->SetLoadingError(*range_, CcErrorCode::DATA_STORE_ERR); - next_idxs_.clear(); - partitioned_slice_data_.clear(); + next_idx_ = 0; + slice_data_.clear(); +} + +int32_t FillStoreSliceCc::PartitionId() const +{ + assert(range_ != nullptr); + return range_->PartitionId(); } FetchRecordCc::FetchRecordCc(const TableName *tbl_name, diff --git a/tx_service/src/cc/range_slice.cpp b/tx_service/src/cc/range_slice.cpp index 3fea4287..e76611a8 100644 --- a/tx_service/src/cc/range_slice.cpp +++ b/tx_service/src/cc/range_slice.cpp @@ -70,10 +70,9 @@ void StoreSlice::StartLoading(FillStoreSliceCc *fill_req, assert(pins_ == 0); status_ = SliceStatus::BeingLoaded; - for (uint16_t core_id = 0; core_id < cc_shards.Count(); ++core_id) - { - cc_shards.EnqueueCcRequest(core_id, fill_req); - } + uint16_t dest_core = static_cast( + (fill_req->PartitionId() & 0x3FF) % cc_shards.Count()); + cc_shards.EnqueueToCcShard(dest_core, fill_req); } void StoreSlice::CommitLoading(StoreRange &range, uint32_t slice_size) From 621623262de6220a942d9419792dd00a0a552bd5 Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:40:11 +0800 Subject: [PATCH 10/15] Update process read/batchread operation (#447) Adapt read operation with new key sharding for range partition. --- tx_service/src/tx_execution.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tx_service/src/tx_execution.cpp b/tx_service/src/tx_execution.cpp index 65abedc5..f46fb46a 100644 --- a/tx_service/src/tx_execution.cpp +++ b/tx_service/src/tx_execution.cpp @@ -1963,13 +1963,14 @@ void TransactionExecution::Process(ReadOperation &read) // error to the tx read request. assert(!lock_range_bucket_result_.IsError()); - // Uses the lower 10 bits of the key's hash code to shard - // the key across CPU cores in a cc node. - uint32_t residual = key.Hash() & 0x3FF; + // Uses the partition id to shard the key across CPU cores + // in a cc node. 
+ partition_id = range_rec_.GetRangeInfo()->PartitionId(); + uint32_t residual = + static_cast((partition_id & 0x3FF)); NodeGroupId range_ng = range_rec_.GetRangeOwnerNg()->BucketOwner(); key_shard_code = range_ng << 10 | residual; - partition_id = range_rec_.GetRangeInfo()->PartitionId(); } } else @@ -7774,17 +7775,19 @@ void TransactionExecution::Process(BatchReadOperation &batch_read_op) TxRecord &rec = *read_batch[idx].record_; uint32_t sharding_code = 0; - size_t key_hash = key.Hash(); - sharding_code = - read_batch[idx].cce_addr_.NodeGroupId() << 10 | (key_hash & 0x3FF); int32_t partition_id = -1; if (table_name.IsHashPartitioned()) { + size_t key_hash = key.Hash(); + sharding_code = read_batch[idx].cce_addr_.NodeGroupId() << 10 | + (key_hash & 0x3FF); partition_id = Sharder::MapKeyHashToHashPartitionId(key_hash); } else { partition_id = batch_read_op.range_ids_[idx]; + sharding_code = read_batch[idx].cce_addr_.NodeGroupId() << 10 | + (partition_id & 0x3FF); } cc_handler_->Read( table_name, From 8195398d0b8bc7be63b742e3fe017252c47b1fcc Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:40:48 +0800 Subject: [PATCH 11/15] Create a high-priority DataSync task to trigger range split (#448) Update datasynctask constructor to check new and old range owner shard --- tx_service/include/cc/cc_shard.h | 6 + tx_service/include/cc/local_cc_shards.h | 11 +- tx_service/include/data_sync_task.h | 7 +- tx_service/src/cc/cc_shard.cpp | 10 ++ tx_service/src/cc/local_cc_shards.cpp | 173 ++++++++++++++++-------- tx_service/src/data_sync_task.cpp | 13 +- 6 files changed, 161 insertions(+), 59 deletions(-) diff --git a/tx_service/include/cc/cc_shard.h b/tx_service/include/cc/cc_shard.h index 8927cfdd..9c409871 100644 --- a/tx_service/include/cc/cc_shard.h +++ b/tx_service/include/cc/cc_shard.h @@ -1161,6 +1161,12 @@ class CcShard void DeleteSchemaCntl(const TableName &tbl_name); + void CreateSplitRangeDataSyncTask(const 
TableName &table_name, + uint32_t ng_id, + int64_t ng_term, + int32_t range_id, + uint64_t data_sync_ts); + void ClearNativeSchemaCntl(); void CollectCacheHit(); void CollectCacheMiss(); diff --git a/tx_service/include/cc/local_cc_shards.h b/tx_service/include/cc/local_cc_shards.h index 961bee52..32b2ff0a 100644 --- a/tx_service/include/cc/local_cc_shards.h +++ b/tx_service/include/cc/local_cc_shards.h @@ -1757,6 +1757,12 @@ class LocalCcShards uint64_t txn, CcHandlerResult *hres); + void CreateSplitRangeDataSyncTask(const TableName &table_name, + uint32_t ng_id, + int64_t ng_term, + int32_t range_id, + uint64_t data_sync_ts); + std::pair PinStoreRange( const TableName &table_name, const NodeGroupId ng_id, @@ -1913,7 +1919,8 @@ class LocalCcShards bool can_be_skipped, uint64_t &last_sync_ts, std::shared_ptr status, - CcHandlerResult *hres); + CcHandlerResult *hres, + bool high_priority = false); bool EnqueueDataSyncTaskToCore( const TableName &table_name, uint32_t ng_id, @@ -2303,7 +2310,7 @@ class LocalCcShards { // `0` means no pending task uint64_t latest_pending_task_ts_{0}; - std::queue> pending_tasks_; + std::deque> pending_tasks_; uint64_t UnsetLatestPendingTs() { diff --git a/tx_service/include/data_sync_task.h b/tx_service/include/data_sync_task.h index 06aa8d01..8cb4ecc7 100644 --- a/tx_service/include/data_sync_task.h +++ b/tx_service/include/data_sync_task.h @@ -138,7 +138,8 @@ struct DataSyncTask CcHandlerResult *hres, std::function filter_lambda = nullptr, bool forward_cache = false, - bool is_standby_node_ckpt = false) + bool is_standby_node_ckpt = false, + bool high_priority = false) : table_name_(table_name), id_(id), range_version_(range_version), @@ -152,7 +153,8 @@ struct DataSyncTask is_dirty_(is_dirty), sync_ts_adjustable_(need_adjust_ts), task_res_(hres), - need_update_ckpt_ts_(true) + need_update_ckpt_ts_(true), + high_priority_(high_priority) { } @@ -252,6 +254,7 @@ struct DataSyncTask cce_entries_; bool need_update_ckpt_ts_{true}; + bool 
high_priority_{false}; }; struct FlushTaskEntry diff --git a/tx_service/src/cc/cc_shard.cpp b/tx_service/src/cc/cc_shard.cpp index b9c8f8e4..9c7b2c93 100644 --- a/tx_service/src/cc/cc_shard.cpp +++ b/tx_service/src/cc/cc_shard.cpp @@ -3580,6 +3580,16 @@ void CcShard::RecycleTxLockInfo(TxLockInfo::uptr lock_info) tx_lock_info_head_.next_ = std::move(lock_info); } +void CcShard::CreateSplitRangeDataSyncTask(const TableName &table_name, + uint32_t ng_id, + int64_t ng_term, + int32_t range_id, + uint64_t data_sync_ts) +{ + local_shards_.CreateSplitRangeDataSyncTask( + table_name, ng_id, ng_term, range_id, data_sync_ts); +} + void CcShard::CollectCacheHit() { assert(metrics::enable_cache_hit_rate); diff --git a/tx_service/src/cc/local_cc_shards.cpp b/tx_service/src/cc/local_cc_shards.cpp index 57f5cf12..872bdeac 100644 --- a/tx_service/src/cc/local_cc_shards.cpp +++ b/tx_service/src/cc/local_cc_shards.cpp @@ -2337,7 +2337,8 @@ bool LocalCcShards::EnqueueRangeDataSyncTask( bool can_be_skipped, uint64_t &last_sync_ts, std::shared_ptr status, - CcHandlerResult *hres) + CcHandlerResult *hres, + bool high_priority) { const RangeInfo *range_info = range_entry->GetRangeInfo(); NodeGroupId range_ng = @@ -2371,19 +2372,33 @@ bool LocalCcShards::EnqueueRangeDataSyncTask( // Push task to worker task queue. 
std::lock_guard task_worker_lk( data_sync_worker_ctx_.mux_); - data_sync_task_queue_[range_info->PartitionId() % - data_sync_task_queue_.size()] - .emplace_back( - std::make_shared(table_name, - range_info->PartitionId(), - range_info->VersionTs(), - ng_id, - ng_term, - data_sync_ts, - status, - is_dirty, - can_be_skipped, - hres)); + std::deque> &task_queue = + data_sync_task_queue_[range_info->PartitionId() % + data_sync_task_queue_.size()]; + + auto task = + std::make_shared(table_name, + range_info->PartitionId(), + range_info->VersionTs(), + ng_id, + ng_term, + data_sync_ts, + status, + is_dirty, + can_be_skipped, + hres, + nullptr, + false, + false, + high_priority); + if (high_priority) + { + task_queue.push_front(std::move(task)); + } + else + { + task_queue.push_back(std::move(task)); + } return true; } else @@ -2391,11 +2406,12 @@ bool LocalCcShards::EnqueueRangeDataSyncTask( if (can_be_skipped) { assert(hres == nullptr); + assert(!high_priority); // '0' means have no pending task on queue. if (iter->second->latest_pending_task_ts_ == 0) { iter->second->latest_pending_task_ts_ = data_sync_ts; - iter->second->pending_tasks_.push( + iter->second->pending_tasks_.push_back( std::make_shared( table_name, range_info->PartitionId(), @@ -2424,7 +2440,7 @@ bool LocalCcShards::EnqueueRangeDataSyncTask( // This task can't be skipped(DataMigration, CraeteIndex, // LastCheckpoint). 
So we push this task to the pending task // queue of `Limiter` - iter->second->pending_tasks_.push( + auto task = std::make_shared(table_name, range_info->PartitionId(), range_info->VersionTs(), @@ -2434,7 +2450,19 @@ bool LocalCcShards::EnqueueRangeDataSyncTask( status, is_dirty, can_be_skipped, - hres)); + hres, + nullptr, + false, + false, + high_priority); + if (high_priority) + { + iter->second->pending_tasks_.push_front(std::move(task)); + } + else + { + iter->second->pending_tasks_.push_back(std::move(task)); + } return true; } } @@ -2509,22 +2537,24 @@ void LocalCcShards::EnqueueDataSyncTaskForSplittingRange( TxKey old_start_key = range_entry->GetRangeInfo()->StartTxKey(); TxKey old_end_key = range_entry->GetRangeInfo()->EndTxKey(); // The old range - data_sync_task_queue_[range_entry->GetRangeInfo()->PartitionId() % - data_sync_task_queue_.size()] - .emplace_back(std::make_shared( - table_name, - ng_id, - ng_term, - table_schema, - range_entry, - range_entry->GetRangeInfo()->StartTxKey(), - *new_keys->begin(), - data_sync_ts, - is_dirty, - false, - txn, - status, - hres)); + auto task_queue = + data_sync_task_queue_[range_entry->GetRangeInfo()->PartitionId() % + data_sync_task_queue_.size()]; + auto old_range_task = std::make_shared( + table_name, + ng_id, + ng_term, + table_schema, + range_entry, + range_entry->GetRangeInfo()->StartTxKey(), + *new_keys->begin(), + data_sync_ts, + is_dirty, + false, + txn, + status, + hres); + task_queue.push_front(std::move(old_range_task)); bool need_copy_range = store_hd_->NeedCopyRange(); @@ -2534,20 +2564,22 @@ void LocalCcShards::EnqueueDataSyncTaskForSplittingRange( TxKey end_key = (i == new_keys->size() - 1 ? 
range_entry->GetRangeInfo()->EndTxKey() : (*new_keys)[i + 1].GetShallowCopy()); - data_sync_task_queue_[new_range_id % data_sync_task_queue_.size()] - .emplace_back(std::make_shared(table_name, - ng_id, - ng_term, - table_schema, - range_entry, - (*new_keys)[i], - end_key, - data_sync_ts, - is_dirty, - need_copy_range, - txn, - status, - hres)); + auto task_queue = + data_sync_task_queue_[new_range_id % data_sync_task_queue_.size()]; + auto new_range_task = std::make_shared(table_name, + ng_id, + ng_term, + table_schema, + range_entry, + (*new_keys)[i], + end_key, + data_sync_ts, + is_dirty, + need_copy_range, + txn, + status, + hres); + task_queue.push_front(std::move(new_range_task)); } data_sync_worker_ctx_.cv_.notify_all(); @@ -2641,7 +2673,7 @@ bool LocalCcShards::EnqueueDataSyncTaskToCore( if (iter->second->latest_pending_task_ts_ == 0) { iter->second->latest_pending_task_ts_ = data_sync_ts; - iter->second->pending_tasks_.push( + iter->second->pending_tasks_.push_back( std::make_shared(table_name, core_idx, 0, @@ -2672,7 +2704,7 @@ bool LocalCcShards::EnqueueDataSyncTaskToCore( // LastCheckpoint). Because these operations need to explicitly // flush data into storage, rather than relying on other // checkpoint tasks. 
- iter->second->pending_tasks_.push( + iter->second->pending_tasks_.push_back( std::make_shared(table_name, core_idx, 0, @@ -2913,6 +2945,33 @@ void LocalCcShards::EnqueueDataSyncTaskForBucket( data_sync_worker_ctx_.cv_.notify_all(); } +void LocalCcShards::CreateSplitRangeDataSyncTask(const TableName &table_name, + uint32_t ng_id, + int64_t ng_term, + int32_t range_id, + uint64_t data_sync_ts) +{ + std::shared_ptr status = + std::make_shared(ng_id, ng_term, false); + TableRangeEntry *range_entry = const_cast( + GetTableRangeEntry(table_name, ng_id, range_id)); + assert(range_entry != nullptr); + uint64_t last_sync_ts = 0; + EnqueueRangeDataSyncTask(table_name, + ng_id, + ng_term, + range_entry, + data_sync_ts, + false, + false, + last_sync_ts, + status, + nullptr, + true); + + data_sync_worker_ctx_.cv_.notify_all(); +} + void LocalCcShards::Terminate() { // Terminate the data sync task worker thds. @@ -5126,12 +5185,20 @@ void LocalCcShards::PopPendingTask(NodeGroupId ng_id, { std::shared_ptr task = iter->second->pending_tasks_.front(); - iter->second->pending_tasks_.pop(); + iter->second->pending_tasks_.pop_front(); task_limiter_lk.unlock(); std::lock_guard task_worker_lk(data_sync_worker_ctx_.mux_); - data_sync_task_queue_[id % data_sync_task_queue_.size()].push_back( - std::move(task)); + auto &task_queue = + data_sync_task_queue_[id % data_sync_task_queue_.size()]; + if (task->high_priority_) + { + task_queue.push_front(std::move(task)); + } + else + { + task_queue.push_back(std::move(task)); + } data_sync_worker_ctx_.cv_.notify_all(); } else @@ -5163,7 +5230,7 @@ void LocalCcShards::ClearAllPendingTasks(NodeGroupId ng_id, auto &task = iter->second->pending_tasks_.front(); task->SetError(CcErrorCode::REQUESTED_TABLE_NOT_EXISTS); task->SetScanTaskFinished(); - iter->second->pending_tasks_.pop(); + iter->second->pending_tasks_.pop_front(); } task_limiters_.erase(iter); diff --git a/tx_service/src/data_sync_task.cpp b/tx_service/src/data_sync_task.cpp index 
ec1e4815..70d56892 100644 --- a/tx_service/src/data_sync_task.cpp +++ b/tx_service/src/data_sync_task.cpp @@ -79,7 +79,8 @@ DataSyncTask::DataSyncTask(const TableName &table_name, range_entry_(range_entry), during_split_range_(true), export_base_table_items_(export_base_table_items), - tx_number_(txn) + tx_number_(txn), + high_priority_(true) { assert(!table_name_.IsHashPartitioned()); if (start_key_.KeyPtr() == @@ -98,7 +99,15 @@ DataSyncTask::DataSyncTask(const TableName &table_name, .GetLocalCcShards() ->GetRangeOwner(id_, ng_id) ->BucketOwner(); - need_update_ckpt_ts_ = range_owner == ng_id; + + size_t local_shard_count = Sharder::Instance().GetLocalCcShardsCount(); + int32_t old_range_id = range_entry_->GetRangeInfo()->PartitionId(); + uint16_t old_range_owner_shard = + static_cast((old_range_id & 0x3FF) % local_shard_count); + uint16_t new_range_owner_shard = + static_cast((id_ & 0x3FF) % local_shard_count); + need_update_ckpt_ts_ = + range_owner == ng_id && old_range_owner_shard == new_range_owner_shard; } void DataSyncTask::SetFinish() From 2d47d6f4bd8639415efa12c3264a63e46018e38a Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:41:32 +0800 Subject: [PATCH 12/15] Adapt cache sender with new key sharding (#449) To avoid reducing the cache hit rate, during range splitting, keys located on the new range that fall on other cores (on local or remote nodes) can be sent to the corresponding core. 1. Update the logic and related requests for sending range cache during range split 2. Update key shard for UploadBatchSlices rpc.
--- tx_service/include/cc/cc_request.h | 76 ++++------- tx_service/include/cc/local_cc_shards.h | 1 - tx_service/include/cc/template_cc_map.h | 16 +-- tx_service/src/cc/local_cc_shards.cpp | 148 ++++++++++++---------- tx_service/src/remote/cc_node_service.cpp | 20 +-- 5 files changed, 121 insertions(+), 140 deletions(-) diff --git a/tx_service/include/cc/cc_request.h b/tx_service/include/cc/cc_request.h index ed83229f..0b360351 100644 --- a/tx_service/include/cc/cc_request.h +++ b/tx_service/include/cc/cc_request.h @@ -8121,25 +8121,19 @@ struct UploadBatchSlicesCc : public CcRequestBase void Reset(const TableName &table_name, txservice::NodeGroupId ng_id, int64_t &ng_term, - size_t core_cnt, const WriteEntryTuple &entry_tuple, std::shared_ptr slice_info) { table_name_ = &table_name; node_group_id_ = ng_id; node_group_term_ = &ng_term; - core_cnt_ = core_cnt; - partitioned_slice_data_.resize(core_cnt); - next_idxs_.resize(core_cnt); - for (size_t i = 0; i < core_cnt; i++) - { - next_idxs_[i] = 0; - } + slice_data_.clear(); + next_idx_ = 0; entry_tuples_ = &entry_tuple; slices_info_ = slice_info; - unfinished_cnt_ = core_cnt; + finished_ = false; err_code_ = CcErrorCode::NO_ERROR; } @@ -8205,14 +8199,12 @@ struct UploadBatchSlicesCc : public CcRequestBase std::pair> SetFinish() { std::unique_lock req_lk(req_mux_); - if (--unfinished_cnt_ == 0) - { - // Make a copy of slices_info_ to avoid race condition. - std::shared_ptr slices_info = slices_info_; - req_cv_.notify_one(); - return {true, std::move(slices_info)}; - } - return {false, nullptr}; + finished_ = true; + + // Make a copy of slices_info_ to avoid race condition. 
+ std::shared_ptr slices_info = slices_info_; + req_cv_.notify_one(); + return {true, std::move(slices_info)}; } bool SetError(CcErrorCode err_code) @@ -8222,13 +8214,9 @@ struct UploadBatchSlicesCc : public CcRequestBase { err_code_ = err_code; } - if (--unfinished_cnt_ == 0) - { - req_cv_.notify_one(); - - return true; - } - return false; + finished_ = true; + req_cv_.notify_one(); + return true; } void AbortCcRequest(CcErrorCode err_code) override @@ -8245,7 +8233,7 @@ struct UploadBatchSlicesCc : public CcRequestBase void Wait() { std::unique_lock lk(req_mux_); - while (unfinished_cnt_ != 0) + while (!finished_) { req_cv_.wait(lk); } @@ -8308,7 +8296,7 @@ struct UploadBatchSlicesCc : public CcRequestBase } void SetParsed() { - parsed_.store(true, std::memory_order_release); + parsed_ = true; } void AddDataItem(TxKey key, @@ -8316,34 +8304,26 @@ struct UploadBatchSlicesCc : public CcRequestBase uint64_t version_ts, bool is_deleted) { - size_t hash = key.Hash(); - // Uses the lower 10 bits of the hash code to shard the key across - // CPU cores at this node. - uint16_t core_code = hash & 0x3FF; - uint16_t core_id = core_code % core_cnt_; - - partitioned_slice_data_[core_id].emplace_back( + slice_data_.emplace_back( std::move(key), std::move(record), version_ts, is_deleted); } - size_t NextIndex(size_t core_idx) const + size_t NextIndex() const { - size_t next_idx = next_idxs_[core_idx]; - assert(next_idx <= partitioned_slice_data_[core_idx].size()); - return next_idx; + assert(next_idx_ <= slice_data_.size()); + return next_idx_; } - void SetNextIndex(size_t core_idx, size_t index) + void SetNextIndex(size_t index) { - assert(index <= partitioned_slice_data_[core_idx].size()); - next_idxs_[core_idx] = index; + assert(index <= slice_data_.size()); + next_idx_ = index; } // Notice: these data items belong to multi slices. 
- std::deque &SliceData(uint16_t core_id) + std::deque &SliceData() { - assert(core_id < partitioned_slice_data_.size()); - return partitioned_slice_data_[core_id]; + return slice_data_; } bool AbortIfOom() const override @@ -8352,7 +8332,6 @@ struct UploadBatchSlicesCc : public CcRequestBase } private: - uint16_t core_cnt_; const TableName *table_name_{nullptr}; uint32_t node_group_id_{0}; int64_t *node_group_term_{nullptr}; @@ -8365,17 +8344,16 @@ struct UploadBatchSlicesCc : public CcRequestBase // key offset, record offset, ts offset, record status offset // when parse items std::tuple parse_offset_{0, 0, 0, 0}; - // parse items on one core, then put the req to other cores. - std::atomic_bool parsed_{false}; + bool parsed_{false}; - std::vector> partitioned_slice_data_; + std::deque slice_data_; // pause position when emplace keys into ccmap in batches - std::vector next_idxs_; + size_t next_idx_; bthread::Mutex req_mux_{}; bthread::ConditionVariable req_cv_{}; // This two variables may be accessed by multi-cores. 
- size_t unfinished_cnt_{0}; + bool finished_{false}; CcErrorCode err_code_{CcErrorCode::NO_ERROR}; }; diff --git a/tx_service/include/cc/local_cc_shards.h b/tx_service/include/cc/local_cc_shards.h index 32b2ff0a..98a11c1b 100644 --- a/tx_service/include/cc/local_cc_shards.h +++ b/tx_service/include/cc/local_cc_shards.h @@ -2127,7 +2127,6 @@ class LocalCcShards .GetLocalCcShards() ->GetRangeOwner(new_range_id_, ng_id_) ->BucketOwner(); - assert(new_range_owner_ != ng_id_); dest_node_id_ = Sharder::Instance().LeaderNodeId(new_range_owner_); channel_ = diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index 41a1e11b..ceabea9e 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -7484,22 +7484,12 @@ class TemplateCcMap : public CcMap { // Parsed all records req.SetParsed(); - - // Emplace key on all cores - for (size_t core = 0; core < shard_->core_cnt_; ++core) - { - if (core != shard_->core_id_) - { - shard_->Enqueue(shard_->core_id_, core, &req); - } - } } - } // end-parsed - std::deque &slice_vec = req.SliceData(shard_->core_id_); + std::deque &slice_vec = req.SliceData(); - size_t index = req.NextIndex(shard_->core_id_); + size_t index = req.NextIndex(); size_t last_index = std::min( index + UploadBatchSlicesCc::MaxEmplaceBatchSize, slice_vec.size()); @@ -7535,7 +7525,7 @@ class TemplateCcMap : public CcMap else { index = last_index; - req.SetNextIndex(shard_->core_id_, index); + req.SetNextIndex(index); shard_->Enqueue(shard_->LocalCoreId(), &req); } return false; diff --git a/tx_service/src/cc/local_cc_shards.cpp b/tx_service/src/cc/local_cc_shards.cpp index 872bdeac..edcdd520 100644 --- a/tx_service/src/cc/local_cc_shards.cpp +++ b/tx_service/src/cc/local_cc_shards.cpp @@ -4073,8 +4073,11 @@ void LocalCcShards::DataSyncForRangePartition( GetRangeOwner(old_range_id, ng_id)->BucketOwner(); NodeGroupId new_range_owner = GetRangeOwner(range_id, ng_id)->BucketOwner(); + 
uint16_t old_range_owner_shard = (old_range_id & 0x3FF) % Count(); + uint16_t new_range_owner_shard = (range_id & 0x3FF) % Count(); - need_send_range_cache = new_range_owner != old_range_owner; + need_send_range_cache = new_range_owner != old_range_owner || + new_range_owner_shard != old_range_owner_shard; if (need_send_range_cache) { range_cache_sender = std::make_unique( @@ -6906,79 +6909,84 @@ void LocalCcShards::RangeCacheSender::SendRangeCacheRequest( // 1- upload dirty range slices info (with PartiallyCached) int64_t ng_term = INIT_TERM; - remote::CcRpcService_Stub stub(channel_.get()); - - brpc::Controller cntl; - cntl.set_timeout_ms(10000); - cntl.set_write_to_socket_in_background(true); - // cntl.ignore_eovercrowded(true); - remote::UploadRangeSlicesRequest req; - remote::UploadRangeSlicesResponse resp; - - req.set_node_group_id(new_range_owner_); - req.set_ng_term(ng_term); - req.set_table_name_str(table_name_.String()); - req.set_table_engine( - remote::ToRemoteType::ConvertTableEngine(table_name_.Engine())); - req.set_old_partition_id(old_range_id_); - req.set_version_ts(version_ts_); - req.set_new_partition_id(new_range_id_); - req.set_new_slices_num(slices_vec_.size()); - std::string *keys_str = req.mutable_new_slices_keys(); - std::string *sizes_str = req.mutable_new_slices_sizes(); - std::string *status_str = req.mutable_new_slices_status(); - for (const StoreSlice *slice : slices_vec_) - { - // key - TxKey slice_key = slice->StartTxKey(); - slice_key.Serialize(*keys_str); - // size - // If post ckpt size of the slice is UINT64_MAX, it means that there is - // no item need to be ckpt in this slice, so should use the current size - // of the slice. - uint32_t slice_size = - (slice->PostCkptSize() == UINT64_MAX ? 
slice->Size() - : slice->PostCkptSize()); - const char *slice_size_ptr = - reinterpret_cast(&slice_size); - sizes_str->append(slice_size_ptr, sizeof(slice_size)); - // status - int8_t slice_status = static_cast(SliceStatus::PartiallyCached); - const char *slice_status_ptr = - reinterpret_cast(&slice_status); - status_str->append(slice_status_ptr, sizeof(slice_status)); - } - req.set_has_dml_since_ddl(store_range_->HasDmlSinceDdl()); - stub.UploadRangeSlices(&cntl, &req, &resp, nullptr); - - if (cntl.Failed()) - { - LOG(WARNING) << "SendRangeCacheRequest: Fail to upload dirty range " - "slices RPC ng#" - << new_range_owner_ << ". Error code: " << cntl.ErrorCode() - << ". Msg: " << cntl.ErrorText(); - return; - } + if (new_range_owner_ != ng_id_) + { + remote::CcRpcService_Stub stub(channel_.get()); + + brpc::Controller cntl; + cntl.set_timeout_ms(10000); + cntl.set_write_to_socket_in_background(true); + // cntl.ignore_eovercrowded(true); + remote::UploadRangeSlicesRequest req; + remote::UploadRangeSlicesResponse resp; + + req.set_node_group_id(new_range_owner_); + req.set_ng_term(ng_term); + req.set_table_name_str(table_name_.String()); + req.set_table_engine( + remote::ToRemoteType::ConvertTableEngine(table_name_.Engine())); + req.set_old_partition_id(old_range_id_); + req.set_version_ts(version_ts_); + req.set_new_partition_id(new_range_id_); + req.set_new_slices_num(slices_vec_.size()); + std::string *keys_str = req.mutable_new_slices_keys(); + std::string *sizes_str = req.mutable_new_slices_sizes(); + std::string *status_str = req.mutable_new_slices_status(); + for (const StoreSlice *slice : slices_vec_) + { + // key + TxKey slice_key = slice->StartTxKey(); + slice_key.Serialize(*keys_str); + // size + // If post ckpt size of the slice is UINT64_MAX, it means that there + // is no item need to be ckpt in this slice, so should use the + // current size of the slice. + uint32_t slice_size = + (slice->PostCkptSize() == UINT64_MAX ? 
slice->Size() + : slice->PostCkptSize()); + const char *slice_size_ptr = + reinterpret_cast(&slice_size); + sizes_str->append(slice_size_ptr, sizeof(slice_size)); + // status + int8_t slice_status = + static_cast(SliceStatus::PartiallyCached); + const char *slice_status_ptr = + reinterpret_cast(&slice_status); + status_str->append(slice_status_ptr, sizeof(slice_status)); + } + req.set_has_dml_since_ddl(store_range_->HasDmlSinceDdl()); + stub.UploadRangeSlices(&cntl, &req, &resp, nullptr); + + if (cntl.Failed()) + { + LOG(WARNING) << "SendRangeCacheRequest: Fail to upload dirty range " + "slices RPC ng#" + << new_range_owner_ + << ". Error code: " << cntl.ErrorCode() + << ". Msg: " << cntl.ErrorText(); + return; + } - if (remote::ToLocalType::ConvertCcErrorCode(resp.error_code()) != - CcErrorCode::NO_ERROR) - { - LOG(WARNING) << "SendRangeCacheRequest: New owner ng#" - << new_range_owner_ - << " reject to receive dirty range data"; - return; - } + if (remote::ToLocalType::ConvertCcErrorCode(resp.error_code()) != + CcErrorCode::NO_ERROR) + { + LOG(WARNING) << "SendRangeCacheRequest: New owner ng#" + << new_range_owner_ + << " reject to receive dirty range data"; + return; + } - ng_term = resp.ng_term(); - LOG(INFO) << "SendRangeCacheRequest: Uploaded new range slices info to " - "future owner, range#" - << old_range_id_ << ", new_range#" << new_range_id_; + ng_term = resp.ng_term(); + LOG(INFO) << "SendRangeCacheRequest: Uploaded new range slices info to " + "future owner, range#" + << old_range_id_ << ", new_range#" << new_range_id_; + } // 2- upload records belongs to dirty range assert(closure_vec_->size() > 0); LOG(INFO) << "SendRangeCacheRequest: Sending range data, old_range_id: " << old_range_id_ << ", to upload " << closure_vec_->size() - << " batches to ng#" << new_range_owner_; + << " batches to ng#" << new_range_owner_ << " from ng#" << ng_id_; uint32_t sender_cnt = 5; auto closures_idx = std::make_shared(sender_cnt); @@ -6998,6 +7006,8 @@ void 
LocalCcShards::RangeCacheSender::SendRangeCacheRequest( size_t vec_size = vec.size(); size_t end_idx = std::min(begin_idx + 5, vec_size); bool rejected = false; + int64_t term = + ng_term == INIT_TERM ? dest_ng_term : ng_term; while (begin_idx < end_idx) { std::unique_ptr closure( @@ -7010,6 +7020,7 @@ void LocalCcShards::RangeCacheSender::SendRangeCacheRequest( end_idx = std::min(begin_idx + 5, vec_size); } + rejected = rejected || term != dest_ng_term; if (rejected) { // Must continue to delete left closures in @@ -7024,7 +7035,7 @@ void LocalCcShards::RangeCacheSender::SendRangeCacheRequest( cntl_ptr->set_timeout_ms(closure->TimeoutValue()); // Fix the term closure->UploadBatchRequest()->set_node_group_term( - ng_term); + term); stub.UploadBatchSlices(cntl_ptr, closure->UploadBatchRequest(), closure->UploadBatchResponse(), @@ -7045,6 +7056,7 @@ void LocalCcShards::RangeCacheSender::SendRangeCacheRequest( << closure->NodeId() << " is reject for no free memory"; } + term = resp->ng_term(); } LOG(INFO) << "Old_Range#" << range_id diff --git a/tx_service/src/remote/cc_node_service.cpp b/tx_service/src/remote/cc_node_service.cpp index df0369aa..37a2e7c7 100644 --- a/tx_service/src/remote/cc_node_service.cpp +++ b/tx_service/src/remote/cc_node_service.cpp @@ -1395,30 +1395,32 @@ void CcNodeService::UploadBatchSlices( } UploadBatchSlicesCc req; - req.Reset( - table_name, ng_id, ng_term, core_cnt, write_entry_tuple, slices_info); + req.Reset(table_name, ng_id, ng_term, write_entry_tuple, slices_info); - // Select a core randomly to parse items. After parsed, this core will push - // the request to other cores to emplace keys. 
- uint16_t rand_core = std::rand() % core_cnt; - cc_shards->EnqueueToCcShard(rand_core, &req); + uint16_t dest_core = + static_cast((slices_info->new_range_ & 0x3FF) % core_cnt); + cc_shards->EnqueueToCcShard(dest_core, &req); req.Wait(); CcErrorCode err = CcErrorCode::NO_ERROR; if (req.ErrorCode() != CcErrorCode::NO_ERROR) { - LOG(INFO) << "CcNodeService UploadBatch RPC of #ng" << ng_id + LOG(INFO) << "CcNodeService UploadBatchRecordCache RPC of #ng" << ng_id + << " for range#" << slices_info->range_ << ", new_range#" + << slices_info->new_range_ << " finished with error: " << static_cast(err); err = req.ErrorCode(); } else { - DLOG(INFO) << "CcNodeService UploadBatch RPC of #ng" << ng_id + DLOG(INFO) << "CcNodeService UploadBatchRecordCache RPC of #ng" << ng_id + << " for range#" << slices_info->range_ << ", new_range#" + << slices_info->new_range_ << " finished with error: " << static_cast(err); } response->set_error_code(ToRemoteType::ConvertCcErrorCode(err)); - response->set_ng_term(ng_term); + response->set_ng_term(req.CcNgTerm()); } void CcNodeService::FetchPayload( From a6d6f74ba53bbc65b5776bfae05553db240b83b2 Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:42:08 +0800 Subject: [PATCH 13/15] Update key cache to adapt to the new key sharding strategy.
(#450) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Including InitKeyCacheCc , UpdateKeyCacheCc ,StoreSlice::cache_validity_ , StoreRange::key_cache_ --- tx_service/include/cc/cc_page_clean_guard.h | 3 +- tx_service/include/cc/cc_req_misc.h | 15 +-- tx_service/include/cc/cc_request.h | 20 +-- tx_service/include/cc/local_cc_shards.h | 3 +- tx_service/include/cc/range_slice.h | 131 +++++++++----------- tx_service/include/cc/template_cc_map.h | 51 ++++---- tx_service/src/cc/cc_req_misc.cpp | 39 +++--- tx_service/src/cc/local_cc_handler.cpp | 19 +-- tx_service/src/cc/range_slice.cpp | 27 +--- tx_service/src/tx_operation.cpp | 11 +- 10 files changed, 129 insertions(+), 190 deletions(-) diff --git a/tx_service/include/cc/cc_page_clean_guard.h b/tx_service/include/cc/cc_page_clean_guard.h index 39c1c316..c2a8d94d 100644 --- a/tx_service/include/cc/cc_page_clean_guard.h +++ b/tx_service/include/cc/cc_page_clean_guard.h @@ -263,8 +263,7 @@ struct CcPageCleanGuard cce->PayloadStatus() != RecordStatus::Unknown) || cce->PayloadStatus() == RecordStatus::Deleted)) { - store_range->DeleteKey( - key, cc_shard_->core_id_, store_slice); + store_range->DeleteKey(key, store_slice); } MarkClean(cc_ng_id_, idx, delay_free); diff --git a/tx_service/include/cc/cc_req_misc.h b/tx_service/include/cc/cc_req_misc.h index 558ce2c6..eedae7e7 100644 --- a/tx_service/include/cc/cc_req_misc.h +++ b/tx_service/include/cc/cc_req_misc.h @@ -367,7 +367,6 @@ struct InitKeyCacheCc : public CcRequestBase void Reset(StoreRange *range, StoreSlice *slice, - uint16_t core_cnt, const TableName &tbl_name, int64_t term, NodeGroupId ng_id) @@ -380,18 +379,15 @@ struct InitKeyCacheCc : public CcRequestBase ng_id_ = ng_id; range_ = range; slice_ = slice; - unfinished_cnt_ = core_cnt; - - pause_pos_.clear(); - pause_pos_.resize(core_cnt); + pause_pos_ = TxKey(); } bool Execute(CcShard &ccs) override; - bool SetFinish(uint16_t core, bool succ); + void SetFinish(bool 
succ); StoreSlice &Slice(); StoreRange &Range(); - void SetPauseKey(TxKey &key, uint16_t core_id); - TxKey &PauseKey(uint16_t core_id); + void SetPauseKey(TxKey &key); + TxKey &PauseKey(); private: TableName tbl_name_{std::string(""), TableType::Primary, TableEngine::None}; @@ -399,8 +395,7 @@ struct InitKeyCacheCc : public CcRequestBase NodeGroupId ng_id_; StoreRange *range_; StoreSlice *slice_; - std::atomic unfinished_cnt_{0}; - std::vector pause_pos_; + TxKey pause_pos_; }; struct FillStoreSliceCc : public CcRequestBase diff --git a/tx_service/include/cc/cc_request.h b/tx_service/include/cc/cc_request.h index 0b360351..8096672a 100644 --- a/tx_service/include/cc/cc_request.h +++ b/tx_service/include/cc/cc_request.h @@ -6482,7 +6482,6 @@ struct UpdateKeyCacheCc : public CcRequestBase void Reset(const TableName &tbl_name, uint32_t ng_id, int64_t ng_term, - size_t core_cnt, const TxKey &start_key, const TxKey &end_key, StoreRange *range, @@ -6496,10 +6495,8 @@ struct UpdateKeyCacheCc : public CcRequestBase start_key_ = &start_key; end_key_ = &end_key; store_range_ = range; - unfinished_core_ = core_cnt; hd_res_ = res; - paused_pos_.clear(); - paused_pos_.resize(core_cnt); + paused_pos_ = TxKey(); } bool Execute(CcShard &ccs) override @@ -6507,7 +6504,8 @@ struct UpdateKeyCacheCc : public CcRequestBase int64_t ng_term = Sharder::Instance().LeaderTerm(node_group_id_); if (ng_term < 0 || ng_term != ng_term_) { - return SetFinish(); + SetFinish(); + return true; } CcMap *ccm = ccs.GetCcm(*table_name_, node_group_id_); @@ -6516,14 +6514,9 @@ struct UpdateKeyCacheCc : public CcRequestBase return ccm->Execute(*this); } - bool SetFinish() + void SetFinish() { - if (unfinished_core_.fetch_sub(1, std::memory_order_acq_rel) == 1) - { - hd_res_->SetFinished(); - return true; - } - return false; + hd_res_->SetFinished(); } const TableName *table_name_{nullptr}; @@ -6532,8 +6525,7 @@ struct UpdateKeyCacheCc : public CcRequestBase const TxKey *start_key_{nullptr}; const TxKey 
*end_key_{nullptr}; StoreRange *store_range_{nullptr}; - std::vector paused_pos_; - std::atomic unfinished_core_; + TxKey paused_pos_; CcHandlerResult *hd_res_{nullptr}; }; diff --git a/tx_service/include/cc/local_cc_shards.h b/tx_service/include/cc/local_cc_shards.h index 98a11c1b..870eb7a0 100644 --- a/tx_service/include/cc/local_cc_shards.h +++ b/tx_service/include/cc/local_cc_shards.h @@ -1129,7 +1129,6 @@ class LocalCcShards template RangeSliceOpStatus AddKeyToKeyCache(const TableName &table_name, NodeGroupId cc_ng_id, - uint16_t core_id, const KeyT &key) { std::shared_lock lk(meta_data_mux_); @@ -1156,7 +1155,7 @@ class LocalCcShards return RangeSliceOpStatus::Error; } store_range->UpdateLastAccessedTs(ClockTs()); - return store_range->AddKey(key, core_id); + return store_range->AddKey(key); } template diff --git a/tx_service/include/cc/range_slice.h b/tx_service/include/cc/range_slice.h index 0291d224..0961534c 100644 --- a/tx_service/include/cc/range_slice.h +++ b/tx_service/include/cc/range_slice.h @@ -303,22 +303,12 @@ class StoreSlice SliceStatus status, bool init_key_cache, bool empty_slice) - : size_(size), - status_(status), - fetch_slice_cc_(nullptr), - cache_validity_((txservice_enable_key_cache && init_key_cache) - ? Sharder::Instance().GetLocalCcShardsCount() - : 0) - { - if (empty_slice && !cache_validity_.empty()) + : size_(size), status_(status), fetch_slice_cc_(nullptr) + { + if (empty_slice && (txservice_enable_key_cache && init_key_cache)) { // If slice is empty, set the key cache as valid at the start. 
- for (uint16_t i = 0; - i < Sharder::Instance().GetLocalCcShardsCount(); - i++) - { - SetKeyCacheValidity(i, true); - } + SetKeyCacheValidity(true); } } @@ -419,42 +409,38 @@ class StoreSlice last_load_ts_ = load_ts; } - bool IsValidInKeyCache(uint16_t core_id) const + bool IsValidInKeyCache() const { - assert(!cache_validity_.empty()); - return cache_validity_[core_id] & 1; + return cache_validity_ & 1; } - void SetKeyCacheValidity(uint16_t core_id, bool valid) + void SetKeyCacheValidity(bool valid) { - assert(!cache_validity_.empty()); if (valid) { - cache_validity_[core_id] |= 1; + cache_validity_ |= 1; } else { - cache_validity_[core_id] &= ~(1); + cache_validity_ &= ~(1); } } - void SetLoadingKeyCache(uint16_t core_id, bool status) + void SetLoadingKeyCache(bool status) { - assert(!cache_validity_.empty()); if (status) { - cache_validity_[core_id] |= (1 << 1); + cache_validity_ |= (1 << 1); } else { - cache_validity_[core_id] &= ~(1 << 1); + cache_validity_ &= ~(1 << 1); } } - bool IsLoadingKeyCache(uint16_t core_id) + bool IsLoadingKeyCache() { - assert(!cache_validity_.empty()); - return cache_validity_[core_id] & (1 << 1); + return cache_validity_ & (1 << 1); } void InitKeyCache(CcShard *cc_shard, @@ -508,13 +494,12 @@ class StoreSlice std::mutex slice_mux_; - // If this slice is included in the range key filter. Each core should only - // access its own bitset, so we do not need mutex protection. - // Note that byte is the smallest unit c++ sync across threads. To avoid - // data corruption we need at least 1 byte for each core mask. - // The first bit implies if the key cache is valid on this core, the second - // bit implies if the key cache is being loaded on this core. - std::vector cache_validity_; + // If this slice is included in the range key filter. The first bit implies + // if the key cache is valid, the second bit implies if the key cache is + // being loaded. 
+ // All keys in this range are sharding to the same core, so we only need to + // maintain one cache validity for this range. + uint8_t cache_validity_{0}; friend class StoreRange; template @@ -722,10 +707,9 @@ class StoreRange return last_accessed_ts_.load(std::memory_order_relaxed); } - std::string KeyCacheInfo(uint16_t core_id) const + std::string KeyCacheInfo() const { - assert(core_id < key_cache_.size()); - return key_cache_[core_id]->Info(); + return key_cache_->Info(); } void SetHasDmlSinceDdl() @@ -856,8 +840,9 @@ class StoreRange // cache. Removing keys from cache when they are evicted reduces the number // of look ups to find the slice of the key since we can evict the keys in // batch. - std::vector>> - key_cache_; + // All keys in this range are sharding to the same core, so we only need to + // maintain one key cache for this range. + std::unique_ptr> key_cache_; std::atomic last_init_key_cache_time_{0}; // This variable is used during the upsert table scheme transaction(such as, @@ -957,7 +942,7 @@ class TemplateStoreRange : public StoreRange slice_end, slice_size, slice_status, - !key_cache_.empty()); + key_cache_ != nullptr); slices_.emplace_back(std::move(slice)); @@ -970,12 +955,12 @@ class TemplateStoreRange : public StoreRange slice_size = slice_keys[idx].size_; slice_status = slice_keys[idx].status_; - slice = - std::make_unique>(slice_start, - slice_end, - slice_size, - slice_status, - !key_cache_.empty()); + slice = std::make_unique>( + slice_start, + slice_end, + slice_size, + slice_status, + key_cache_ != nullptr); slices_.emplace_back(std::move(slice)); @@ -1063,25 +1048,24 @@ class TemplateStoreRange : public StoreRange return slices_; } - void InvalidateKeyCache(uint16_t core_id) + void InvalidateKeyCache() { - if (key_cache_.empty()) + if (key_cache_ == nullptr) { return; } LOG(INFO) << "Invalidate key cache of range " << partition_id_ - << " on core " << core_id << " due to collision"; + << " due to collision"; std::shared_lock 
s_lk(mux_); // shared lock to avoid slice split for (auto &slice : slices_) { - slice->SetKeyCacheValidity(core_id, false); + slice->SetKeyCacheValidity(false); } // Create a larger key cache if the old one cannot hold enough keys. - size_t last_key_cache_size = key_cache_[core_id]->Size(); - key_cache_[core_id] = - std::make_unique>( - last_key_cache_size * 1.2); + size_t last_key_cache_size = key_cache_->Size(); + key_cache_ = std::make_unique>( + last_key_cache_size * 1.2); } /** * @brief Split the range with new_end. new_end will be the new @@ -1212,7 +1196,7 @@ class TemplateStoreRange : public StoreRange } CODE_FAULT_INJECTOR("PinSlices_Fail", { LOG(INFO) << "FaultInject PinSlices_Fail, " << check_key_cache - << ", is valid " << slice->IsValidInKeyCache(shard_id); + << ", is valid " << slice->IsValidInKeyCache(); if (slice->status_ == SliceStatus::FullyCached) { slice->status_ = SliceStatus::PartiallyCached; @@ -1305,9 +1289,9 @@ class TemplateStoreRange : public StoreRange else if (check_key_cache) { assert(to_prefetch == false); - if (slice->IsValidInKeyCache(shard_id)) + if (slice->IsValidInKeyCache()) { - bool found = ContainsKey(search_key, shard_id); + bool found = ContainsKey(search_key); if (!found) { // If the key is not found in range, directly return and @@ -1318,7 +1302,7 @@ class TemplateStoreRange : public StoreRange // If key is found in range key cache, the key must exist in kv // store. Load slice from kv to get the value. 
} - else if (!slice->IsLoadingKeyCache(shard_id)) + else if (!slice->IsLoadingKeyCache()) { // If this slice can use key cache but the key cache is not // intialized, always load slice from kv to initialize the key @@ -1628,17 +1612,16 @@ class TemplateStoreRange : public StoreRange return true; } - void DeleteKey(const KeyT &key, uint16_t core_id, StoreSlice *slice) + void DeleteKey(const KeyT &key, StoreSlice *slice) { if (slice == nullptr) { TxKey search_key(&key); slice = FindSlice(search_key); } - if (slice->IsValidInKeyCache(core_id)) + if (slice->IsValidInKeyCache()) { - cuckoofilter::Status status = - key_cache_[core_id]->Delete(key.Hash()); + cuckoofilter::Status status = key_cache_->Delete(key.Hash()); // We should not try to delete a non-existing key. if (status == cuckoofilter::Status::NotFound) { @@ -1651,9 +1634,9 @@ class TemplateStoreRange : public StoreRange } // NOTE: The slice to which the @@key belong must be valid in key cache. - void DeleteKey(const KeyT &key, uint16_t core_id) + void DeleteKey(const KeyT &key) { - cuckoofilter::Status status = key_cache_[core_id]->Delete(key.Hash()); + cuckoofilter::Status status = key_cache_->Delete(key.Hash()); // We should not try to delete a non-existing key. 
if (status == cuckoofilter::Status::NotFound) { @@ -1663,7 +1646,6 @@ class TemplateStoreRange : public StoreRange } RangeSliceOpStatus AddKey(const KeyT &key, - uint16_t core_id, StoreSlice *slice = nullptr, bool init = false) { @@ -1673,10 +1655,10 @@ class TemplateStoreRange : public StoreRange TxKey search_key(&key); slice = FindSlice(search_key); } - if (init || slice->IsValidInKeyCache(core_id)) + if (init || slice->IsValidInKeyCache()) { - assert(init || !slice->IsLoadingKeyCache(core_id)); - cuckoofilter::Status status = key_cache_[core_id]->Add(key.Hash()); + assert(init || !slice->IsLoadingKeyCache()); + cuckoofilter::Status status = key_cache_->Add(key.Hash()); if (status == cuckoofilter::Status::Ok) { return RangeSliceOpStatus::Successful; @@ -1685,11 +1667,11 @@ class TemplateStoreRange : public StoreRange { assert(status == cuckoofilter::Status::NotEnoughSpace); // Add failed, we need to invalidate the filter. - InvalidateKeyCache(core_id); + InvalidateKeyCache(); return RangeSliceOpStatus::Error; } } - else if (slice->IsLoadingKeyCache(core_id)) + else if (slice->IsLoadingKeyCache()) { // Retry later when key cache is initialized. 
return RangeSliceOpStatus::Retry; @@ -1720,10 +1702,9 @@ class TemplateStoreRange : public StoreRange } } - bool ContainsKey(const KeyT &key, uint16_t core_id) + bool ContainsKey(const KeyT &key) { - return key_cache_[core_id]->Contain(key.Hash()) == - cuckoofilter::Status::Ok; + return key_cache_->Contain(key.Hash()) == cuckoofilter::Status::Ok; } size_t PostCkptSize() override @@ -1940,7 +1921,7 @@ class TemplateStoreRange : public StoreRange sub_slice_end, split_keys[idx].cur_size_, SliceStatus::PartiallyCached, - !slice->cache_validity_.empty()); + slice->cache_validity_ != 0); sub_slice->post_ckpt_size_ = split_keys[idx].post_update_size_; sub_slice->status_ = slice->status_; diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index ceabea9e..136b9b00 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -251,7 +251,7 @@ class TemplateCcMap : public CcMap auto it = Iterator(cce_ptr, ccp, &neg_inf_); target_key = it->first; auto res = shard_->local_shards_.AddKeyToKeyCache( - table_name_, cc_ng_id_, shard_->core_id_, *target_key); + table_name_, cc_ng_id_, *target_key); if (res == RangeSliceOpStatus::Retry) { // If the insert fails due to key cache is being @@ -419,10 +419,7 @@ class TemplateCcMap : public CcMap // or auto incr pk insert, the ReadCc is skipped and we // need to update key cache here. 
auto res = shard_->local_shards_.AddKeyToKeyCache( - table_name_, - cc_ng_id_, - shard_->core_id_, - *target_key); + table_name_, cc_ng_id_, *target_key); if (res == RangeSliceOpStatus::Retry) { // If the insert fails due to key cache is being @@ -1712,7 +1709,6 @@ class TemplateCcMap : public CcMap static_cast *>( slice_id.Range()); auto res = range->AddKey(*look_key, - shard_->core_id_, slice_id.Slice()); if (res == RangeSliceOpStatus::Error) { @@ -6737,17 +6733,18 @@ class TemplateCcMap : public CcMap bool Execute(InitKeyCacheCc &req) override { Iterator map_it, map_end_it; - TxKey &resume_key = req.PauseKey(shard_->core_id_); + TxKey &resume_key = req.PauseKey(); const KeyT *start_key = nullptr; if (!resume_key.KeyPtr()) { // First time being processed. - if (req.Slice().IsValidInKeyCache(shard_->core_id_)) + if (req.Slice().IsValidInKeyCache()) { // No need to init key cache. - return req.SetFinish(shard_->core_id_, true); + req.SetFinish(true); + return true; } - req.Slice().SetLoadingKeyCache(shard_->core_id_, true); + req.Slice().SetLoadingKeyCache(true); start_key = req.Slice().StartTxKey().GetKey(); } else @@ -6807,24 +6804,25 @@ class TemplateCcMap : public CcMap continue; } const KeyT *key = map_it->first; - auto ret = - range->AddKey(*key, shard_->core_id_, &req.Slice(), true); + auto ret = range->AddKey(*key, &req.Slice(), true); if (ret == RangeSliceOpStatus::Error) { // Stop immediately if one of the add key fails. - return req.SetFinish(shard_->core_id_, false); + req.SetFinish(false); + return true; } } if (map_it == map_end_it) { - return req.SetFinish(shard_->core_id_, true); + req.SetFinish(true); + return true; } else { // record pause position and resume in next round. 
TxKey pause_key(map_it->first); - req.SetPauseKey(pause_key, shard_->core_id_); + req.SetPauseKey(pause_key); shard_->Enqueue(&req); return false; } @@ -7004,8 +7002,8 @@ class TemplateCcMap : public CcMap : KeyT::PositiveInfinity(); const KeyT *start_key = - req.paused_pos_[shard_->core_id_].KeyPtr() != nullptr - ? req.paused_pos_[shard_->core_id_].GetKey() + req.paused_pos_.KeyPtr() != nullptr + ? req.paused_pos_.GetKey() : (req.end_key_ != nullptr ? req.start_key_->GetKey() : KeyT::NegativeInfinity()); @@ -7042,8 +7040,7 @@ class TemplateCcMap : public CcMap curr_slice = range->FindSlice(*key); it = deduce_iterator(*key); end_it = deduce_iterator(*(curr_slice->EndKey())); - if ((!curr_slice->IsValidInKeyCache(shard_->core_id_) || - it == end_it) && + if ((!curr_slice->IsValidInKeyCache() || it == end_it) && end_it != req_end_it) { // The slice is empty or the slice is invalid in key cache, @@ -7052,7 +7049,7 @@ class TemplateCcMap : public CcMap key = curr_slice->EndKey(); curr_slice = nullptr; } - else if (!curr_slice->IsValidInKeyCache(shard_->core_id_) && + else if (!curr_slice->IsValidInKeyCache() && end_it == req_end_it) { // Reach to the last slice, and the slice is invalid in key @@ -7080,7 +7077,7 @@ class TemplateCcMap : public CcMap { assert(cce->PayloadStatus() == RecordStatus::Normal || cce->PayloadStatus() == RecordStatus::Deleted); - range->DeleteKey(*cce_key, shard_->core_id_); + range->DeleteKey(*cce_key); } // Forward the iterator. @@ -7096,12 +7093,13 @@ class TemplateCcMap : public CcMap if (key_it == slice_end_it) { - req.paused_pos_[shard_->core_id_] = TxKey(); - return req.SetFinish(); + req.paused_pos_ = TxKey(); + req.SetFinish(); + return true; } else { - req.paused_pos_[shard_->core_id_] = key_it->first->CloneTxKey(); + req.paused_pos_ = key_it->first->CloneTxKey(); shard_->Enqueue(&req); return false; } @@ -10057,10 +10055,7 @@ class TemplateCcMap : public CcMap // status, it should already be in the key cache. 
Only add it if // it's in DELETED. auto res = shard_->local_shards_.AddKeyToKeyCache( - table_name_, - cc_ng_id_, - shard_->core_id_, - *ccp->KeyOfEntry(cce)); + table_name_, cc_ng_id_, *ccp->KeyOfEntry(cce)); if (res == RangeSliceOpStatus::Retry) { // Retry if the slice key cache is being loaded. diff --git a/tx_service/src/cc/cc_req_misc.cpp b/tx_service/src/cc/cc_req_misc.cpp index c4ea6e5c..dad27158 100644 --- a/tx_service/src/cc/cc_req_misc.cpp +++ b/tx_service/src/cc/cc_req_misc.cpp @@ -509,27 +509,20 @@ bool ClearCcNodeGroup::Execute(CcShard &ccs) return false; } -bool InitKeyCacheCc::SetFinish(uint16_t core, bool succ) +void InitKeyCacheCc::SetFinish(bool succ) { if (succ) { - slice_->SetKeyCacheValidity(core, succ); + slice_->SetKeyCacheValidity(succ); } - slice_->SetLoadingKeyCache(core, false); + slice_->SetLoadingKeyCache(false); - if (unfinished_cnt_.fetch_sub(1, std::memory_order_relaxed) == 1) - { - pause_pos_.clear(); - - // Unpin the slice. - range_->UnpinSlice(slice_, true); - std::unique_lock slice_lk(slice_->slice_mux_); - slice_->init_key_cache_cc_ = nullptr; - - return true; - } + pause_pos_ = TxKey(); - return false; + // Unpin the slice. + range_->UnpinSlice(slice_, true); + std::unique_lock slice_lk(slice_->slice_mux_); + slice_->init_key_cache_cc_ = nullptr; } bool InitKeyCacheCc::Execute(CcShard &ccs) @@ -538,15 +531,15 @@ bool InitKeyCacheCc::Execute(CcShard &ccs) int64_t cc_ng_term = Sharder::Instance().LeaderTerm(ng_id_); if (std::max(cc_ng_candid_term, cc_ng_term) != term_) { - return SetFinish(ccs.core_id_, false); + SetFinish(false); + return true; } CcMap *ccm = ccs.GetCcm(tbl_name_, ng_id_); if (ccm == nullptr) { - // ccm is empty when slice is fully cached. That means this slice is - // empty on this core. 
- return SetFinish(ccs.core_id_, true); + SetFinish(true); + return true; } return ccm->Execute(*this); @@ -561,14 +554,14 @@ StoreSlice &InitKeyCacheCc::Slice() return *slice_; } -void InitKeyCacheCc::SetPauseKey(TxKey &key, uint16_t core_id) +void InitKeyCacheCc::SetPauseKey(TxKey &key) { - pause_pos_[core_id] = key.Clone(); + pause_pos_ = key.Clone(); } -TxKey &InitKeyCacheCc::PauseKey(uint16_t core_id) +TxKey &InitKeyCacheCc::PauseKey() { - return pause_pos_[core_id]; + return pause_pos_; } void FillStoreSliceCc::Reset(const TableName &table_name, diff --git a/tx_service/src/cc/local_cc_handler.cpp b/tx_service/src/cc/local_cc_handler.cpp index b69d6b9b..60c5a33e 100644 --- a/tx_service/src/cc/local_cc_handler.cpp +++ b/tx_service/src/cc/local_cc_handler.cpp @@ -2016,20 +2016,13 @@ void txservice::LocalCcHandler::UpdateKeyCache(const TableName &table_name, hres.SetToBlock(); #endif - size_t core_cnt = cc_shards_.Count(); UpdateKeyCacheCc *req = update_key_cache_pool_.NextRequest(); - req->Reset(table_name, - ng_id, - tx_term, - core_cnt, - start_key, - end_key, - store_range, - &hres); - for (size_t idx = 0; idx < core_cnt; ++idx) - { - cc_shards_.EnqueueCcRequest(idx, req); - } + req->Reset( + table_name, ng_id, tx_term, start_key, end_key, store_range, &hres); + + uint16_t dest_core = static_cast( + (store_range->PartitionId() & 0x3FF) % cc_shards_.Count()); + cc_shards_.EnqueueCcRequest(dest_core, req); } /* diff --git a/tx_service/src/cc/range_slice.cpp b/tx_service/src/cc/range_slice.cpp index e76611a8..baa051ee 100644 --- a/tx_service/src/cc/range_slice.cpp +++ b/tx_service/src/cc/range_slice.cpp @@ -172,19 +172,9 @@ void StoreSlice::InitKeyCache(CcShard *cc_shard, pins_++; init_key_cache_cc_ = cc_shard->NewInitKeyCacheCc(); - init_key_cache_cc_->Reset(range, - this, - range->local_cc_shards_.Count(), - *tbl_name, - term, - ng_id); - - uint16_t core_cnt = range->local_cc_shards_.Count(); - for (uint16_t core_id = 0; core_id < core_cnt; core_id++) - { - 
Sharder::Instance().GetLocalCcShards()->EnqueueToCcShard( - core_id, init_key_cache_cc_); - } + init_key_cache_cc_->Reset(range, this, *tbl_name, term, ng_id); + + cc_shard->Enqueue(init_key_cache_cc_); } } @@ -253,17 +243,12 @@ StoreRange::StoreRange(uint32_t partition_id, estimate_rec_size)); } - uint16_t core_cnt = Sharder::Instance().GetLocalCcShardsCount(); - for (uint16_t id = 0; id < core_cnt; id++) - { - key_cache_.push_back( - std::make_unique>( - key_cache_size / core_cnt)); - } + key_cache_ = std::make_unique>( + key_cache_size); } else { - key_cache_.resize(0); + key_cache_ = nullptr; } } diff --git a/tx_service/src/tx_operation.cpp b/tx_service/src/tx_operation.cpp index 42bc3796..275309ae 100644 --- a/tx_service/src/tx_operation.cpp +++ b/tx_service/src/tx_operation.cpp @@ -4641,14 +4641,21 @@ void SplitFlushRangeOp::Forward(TransactionExecution *txm) int64_t tx_term = txm->TxTerm(); LocalCcShards *local_shards = Sharder::Instance().GetLocalCcShards(); - // The new ranges that still lands to the same ng after split. + // The new ranges that still lands to the same core of same ng + // after split. 
std::vector> ranges; ranges.reserve(new_ranges.size()); + uint16_t range_shard_id = + static_cast((range_info_->PartitionId() & 0x3FF) % + local_shards->Count()); for (auto iter = new_ranges.begin(); iter != new_ranges.end(); ++iter) { + uint16_t new_range_shard_id = static_cast( + (iter->second & 0x3FF) % local_shards->Count()); if (local_shards->GetRangeOwner(iter->second, node_group) - ->BucketOwner() == node_group) + ->BucketOwner() == node_group && + (new_range_shard_id == range_shard_id)) { const TxKey *start_key = &(iter->first); const TxKey *end_key = From 5eb0b7ac217628a4f1af3ca1ebd94d518bcfc5ff Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:44:12 +0800 Subject: [PATCH 14/15] Reset range splitting status after datasync (#451) If a datasync task that is supposed to trigger a range split ends without actually triggering the split, the range splitting state needs to be reset. --- tx_service/include/cc/cc_shard.h | 4 ++++ tx_service/include/data_sync_task.h | 6 ++++++ tx_service/src/cc/cc_shard.cpp | 13 +++++++++++++ tx_service/src/cc/local_cc_shards.cpp | 19 +++++++++++++++++++ tx_service/src/data_sync_task.cpp | 20 ++++++++++++++++++++ 5 files changed, 62 insertions(+) diff --git a/tx_service/include/cc/cc_shard.h b/tx_service/include/cc/cc_shard.h index 9c409871..c1554381 100644 --- a/tx_service/include/cc/cc_shard.h +++ b/tx_service/include/cc/cc_shard.h @@ -1143,6 +1143,10 @@ class CcShard } } + void ResetRangeSplittingStatus(const TableName &table_name, + uint32_t ng_id, + uint32_t range_id); + FillStoreSliceCc *NewFillStoreSliceCc() { return fill_store_slice_cc_pool_.NextRequest(); diff --git a/tx_service/include/data_sync_task.h b/tx_service/include/data_sync_task.h index 8cb4ecc7..1c640f7b 100644 --- a/tx_service/include/data_sync_task.h +++ b/tx_service/include/data_sync_task.h @@ -182,6 +182,12 @@ struct DataSyncTask // flush data buffer. 
void SetScanTaskFinished(); + // Once the range size reaches the threshold, a DataSyncTask is created to + // trigger the split range operation, and a flag is set indicating that the + // range has been split. This flag needs to be reset after the DataSyncTask + // completes. + void ResetRangeSplittingStatus(); + void SetErrorCode(CcErrorCode err_code) { std::unique_lock lk(status_->mux_); diff --git a/tx_service/src/cc/cc_shard.cpp b/tx_service/src/cc/cc_shard.cpp index 9c7b2c93..d3c009ef 100644 --- a/tx_service/src/cc/cc_shard.cpp +++ b/tx_service/src/cc/cc_shard.cpp @@ -3580,6 +3580,19 @@ void CcShard::RecycleTxLockInfo(TxLockInfo::uptr lock_info) tx_lock_info_head_.next_ = std::move(lock_info); } +void CcShard::ResetRangeSplittingStatus(const TableName &table_name, + uint32_t ng_id, + uint32_t range_id) +{ + CcMap *ccm = GetCcm(table_name, ng_id); + if (ccm == nullptr) + { + return; + } + + ccm->ResetRangeStatus(range_id); +} + void CcShard::CreateSplitRangeDataSyncTask(const TableName &table_name, uint32_t ng_id, int64_t ng_term, diff --git a/tx_service/src/cc/local_cc_shards.cpp b/tx_service/src/cc/local_cc_shards.cpp index edcdd520..776472b6 100644 --- a/tx_service/src/cc/local_cc_shards.cpp +++ b/tx_service/src/cc/local_cc_shards.cpp @@ -3217,6 +3217,7 @@ void LocalCcShards::PostProcessFlushTaskEntries( task->id_); } task->SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + task->ResetRangeSplittingStatus(); continue; } @@ -3276,6 +3277,7 @@ void LocalCcShards::PostProcessFlushTaskEntries( } task->SetError(err_code); + task->ResetRangeSplittingStatus(); } else { @@ -3287,6 +3289,7 @@ void LocalCcShards::PostProcessFlushTaskEntries( txservice::AbortTx(entry->data_sync_txm_); } task->SetError(CcErrorCode::DATA_STORE_ERR); + task->ResetRangeSplittingStatus(); } } } @@ -3331,6 +3334,7 @@ void LocalCcShards::PostProcessFlushTaskEntries( } task->SetFinish(); + task->ResetRangeSplittingStatus(); } else { @@ -3362,6 +3366,7 @@ void 
LocalCcShards::PostProcessFlushTaskEntries( } task->SetError(err_code); + task->ResetRangeSplittingStatus(); } } } @@ -3418,6 +3423,7 @@ void LocalCcShards::PostProcessRangePartitionDataSyncTask( task->id_); } task->SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + task->ResetRangeSplittingStatus(); if (ng_term >= 0) { Sharder::Instance().UnpinNodeGroupData(task->node_group_id_); @@ -3486,6 +3492,7 @@ void LocalCcShards::PostProcessRangePartitionDataSyncTask( } task->SetFinish(); + task->ResetRangeSplittingStatus(); } else if (task_ckpt_err == DataSyncTask::CkptErrorCode::SCAN_ERROR) { @@ -3537,6 +3544,7 @@ void LocalCcShards::PostProcessRangePartitionDataSyncTask( } task->SetError(err_code); + task->ResetRangeSplittingStatus(); } else { @@ -3548,6 +3556,7 @@ void LocalCcShards::PostProcessRangePartitionDataSyncTask( txservice::AbortTx(data_sync_txm); } task->SetError(CcErrorCode::DATA_STORE_ERR); + task->ResetRangeSplittingStatus(); } } @@ -3600,6 +3609,7 @@ void LocalCcShards::DataSyncForRangePartition( // table dropped data_sync_task->SetError(CcErrorCode::REQUESTED_TABLE_NOT_EXISTS); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); ClearAllPendingTasks(ng_id, expected_ng_term, table_name, range_id); } else @@ -3637,6 +3647,7 @@ void LocalCcShards::DataSyncForRangePartition( { data_sync_task->SetFinish(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask( ng_id, expected_ng_term, table_name, range_id); assert(need_process == false); @@ -3652,6 +3663,7 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->SetError( CcErrorCode::REQUESTED_NODE_NOT_LEADER); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); } } @@ -3677,6 +3689,7 @@ void LocalCcShards::DataSyncForRangePartition( // Finish this task and notify the caller. 
data_sync_task->SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); if (ng_term >= 0) @@ -3761,6 +3774,7 @@ void LocalCcShards::DataSyncForRangePartition( // directly. data_sync_task->SetError(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); ClearAllPendingTasks( ng_id, expected_ng_term, table_name, range_id); @@ -3820,6 +3834,7 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->SetFinish(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); return; @@ -3874,6 +3889,7 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->SetError(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); return; @@ -3889,6 +3905,7 @@ void LocalCcShards::DataSyncForRangePartition( txservice::AbortTx(data_sync_txm); data_sync_task->SetError(CcErrorCode::GET_RANGE_ID_ERR); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); return; @@ -4001,6 +4018,7 @@ void LocalCcShards::DataSyncForRangePartition( } data_sync_task->SetFinish(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); return; } assert(slices_delta_size.size() > 0 || export_base_table_items); @@ -4033,6 +4051,7 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->SetError(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); // Handle the pending tasks for the same range PopPendingTask(ng_id, expected_ng_term, table_name, range_id); diff --git a/tx_service/src/data_sync_task.cpp b/tx_service/src/data_sync_task.cpp index 70d56892..d12d8c30 100644 --- 
a/tx_service/src/data_sync_task.cpp +++ b/tx_service/src/data_sync_task.cpp @@ -236,4 +236,24 @@ void DataSyncTask::SetScanTaskFinished() } } +void DataSyncTask::ResetRangeSplittingStatus() +{ + if (!high_priority_ || during_split_range_) + { + return; + } + + WaitableCc reset_cc( + [&](CcShard &ccs) + { + ccs.ResetRangeSplittingStatus(table_name_, node_group_id_, id_); + return true; + }); + + LocalCcShards *local_cc_shards = Sharder::Instance().GetLocalCcShards(); + uint16_t dest_core = (id_ & 0x3FF) % local_cc_shards->Count(); + local_cc_shards->EnqueueToCcShard(dest_core, &reset_cc); + reset_cc.Wait(); +} + } // namespace txservice From 39b6d077988e99b8a2965149ce4f08d79a14371c Mon Sep 17 00:00:00 2001 From: yi-xmu <69192773+yi-xmu@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:34:18 +0800 Subject: [PATCH 15/15] Update comment (#455) --- tx_service/src/cc/cc_req_misc.cpp | 6 +++++- tx_service/src/cc/local_cc_shards.cpp | 10 +++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tx_service/src/cc/cc_req_misc.cpp b/tx_service/src/cc/cc_req_misc.cpp index dad27158..2d6dbf31 100644 --- a/tx_service/src/cc/cc_req_misc.cpp +++ b/tx_service/src/cc/cc_req_misc.cpp @@ -1521,7 +1521,11 @@ bool FetchTableRangeSizeCc::Execute(CcShard &ccs) bool succ = (error_code_ == 0); CcMap *ccm = ccs.GetCcm(*table_name_, node_group_id_); - assert(ccm != nullptr); + if (ccm == nullptr) + { + assert(error_code_ != 0); + return true; + } bool need_split = ccm->InitRangeSize( static_cast(partition_id_), store_range_size_, succ); diff --git a/tx_service/src/cc/local_cc_shards.cpp b/tx_service/src/cc/local_cc_shards.cpp index 776472b6..fec2f065 100644 --- a/tx_service/src/cc/local_cc_shards.cpp +++ b/tx_service/src/cc/local_cc_shards.cpp @@ -2537,7 +2537,7 @@ void LocalCcShards::EnqueueDataSyncTaskForSplittingRange( TxKey old_start_key = range_entry->GetRangeInfo()->StartTxKey(); TxKey old_end_key = range_entry->GetRangeInfo()->EndTxKey(); // The old range - auto 
task_queue = + auto &task_queue = data_sync_task_queue_[range_entry->GetRangeInfo()->PartitionId() % data_sync_task_queue_.size()]; auto old_range_task = std::make_shared( @@ -2564,7 +2564,7 @@ void LocalCcShards::EnqueueDataSyncTaskForSplittingRange( TxKey end_key = (i == new_keys->size() - 1 ? range_entry->GetRangeInfo()->EndTxKey() : (*new_keys)[i + 1].GetShallowCopy()); - auto task_queue = + auto &task_queue = data_sync_task_queue_[new_range_id % data_sync_task_queue_.size()]; auto new_range_task = std::make_shared(table_name, ng_id, @@ -2951,10 +2951,14 @@ void LocalCcShards::CreateSplitRangeDataSyncTask(const TableName &table_name, int32_t range_id, uint64_t data_sync_ts) { + std::shared_lock meta_lk(meta_data_mux_); std::shared_ptr status = std::make_shared(ng_id, ng_term, false); + TableName range_table_name(table_name.StringView(), + TableType::RangePartition, + table_name.Engine()); TableRangeEntry *range_entry = const_cast( - GetTableRangeEntry(table_name, ng_id, range_id)); + GetTableRangeEntryInternal(range_table_name, ng_id, range_id)); assert(range_entry != nullptr); uint64_t last_sync_ts = 0; EnqueueRangeDataSyncTask(table_name,