diff --git a/store_handler/bigtable_handler.cpp b/store_handler/bigtable_handler.cpp index 52a712de..172c321f 100644 --- a/store_handler/bigtable_handler.cpp +++ b/store_handler/bigtable_handler.cpp @@ -710,6 +710,13 @@ void EloqDS::BigTableHandler::FetchRangeSlices( fetch_cc)); } +void EloqDS::BigTableHandler::FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) +{ + LOG(ERROR) << "BigTableHandler::FetchTableRangeSize not implemented"; + assert(false); +} + void EloqDS::BigTableHandler::OnFetchRangeSlices( google::cloud::future>> f, diff --git a/store_handler/bigtable_handler.h b/store_handler/bigtable_handler.h index 10006bbe..e3ccd39c 100644 --- a/store_handler/bigtable_handler.h +++ b/store_handler/bigtable_handler.h @@ -82,6 +82,9 @@ class BigTableHandler : public txservice::store::DataStoreHandler void FetchRangeSlices(txservice::FetchRangeSlicesReq *fetch_cc) override; + void FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) override; + /** * @brief Read a row from base table or skindex table in datastore with * specified key. Caller should pass in complete primary key or skindex key. 
diff --git a/store_handler/data_store_service_client.cpp b/store_handler/data_store_service_client.cpp index 1c55d901..2fd359b8 100644 --- a/store_handler/data_store_service_client.cpp +++ b/store_handler/data_store_service_client.cpp @@ -1059,6 +1059,30 @@ void DataStoreServiceClient::FetchRangeSlices( &FetchRangeSlicesCallback); } +void DataStoreServiceClient::FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) +{ + txservice::TableName range_table_name(fetch_cc->table_name_->StringView(), + txservice::TableType::RangePartition, + fetch_cc->table_name_->Engine()); + + int32_t kv_partition_id = + KvPartitionIdOfRangeSlices(range_table_name, fetch_cc->partition_id_); + uint32_t shard_id = GetShardIdByPartitionId(kv_partition_id, false); + + auto catalog_factory = GetCatalogFactory(range_table_name.Engine()); + assert(catalog_factory != nullptr); + fetch_cc->kv_start_key_ = + EncodeRangeKey(catalog_factory, range_table_name, fetch_cc->start_key_); + + Read(kv_range_table_name, + kv_partition_id, + shard_id, + fetch_cc->kv_start_key_, + fetch_cc, + &FetchRangeSizeCallback); +} + /** * @brief Deletes data that is out of the specified range. * @@ -1275,16 +1299,19 @@ std::string DataStoreServiceClient::EncodeRangeKey( * @param range_version The version of the range. * @param version The general version number. * @param segment_cnt The number of segments in the range. + * @param range_size The size of the range. * @return Binary string containing the encoded range value. 
*/ std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id, uint64_t range_version, uint64_t version, - uint32_t segment_cnt) + uint32_t segment_cnt, + int32_t range_size) { std::string kv_range_record; kv_range_record.reserve(sizeof(int32_t) + sizeof(uint64_t) + - sizeof(uint64_t) + sizeof(uint32_t)); + sizeof(uint64_t) + sizeof(uint32_t) + + sizeof(int32_t)); kv_range_record.append(reinterpret_cast<const char *>(&range_id), sizeof(int32_t)); kv_range_record.append(reinterpret_cast<const char *>(&range_version), @@ -1294,6 +1321,8 @@ std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id, // segment_cnt of slices kv_range_record.append(reinterpret_cast<const char *>(&segment_cnt), sizeof(uint32_t)); + kv_range_record.append(reinterpret_cast<const char *>(&range_size), + sizeof(int32_t)); return kv_range_record; } @@ -1361,6 +1390,7 @@ RangeSliceBatchPlan DataStoreServiceClient::PrepareRangeSliceBatches( RangeSliceBatchPlan plan; plan.segment_cnt = 0; plan.version = version; + plan.range_size = 0; // Estimate capacity based on slices size plan.segment_keys.reserve(slices.size() / 10 + 1); // Rough estimate @@ -1409,6 +1439,7 @@ RangeSliceBatchPlan DataStoreServiceClient::PrepareRangeSliceBatches( sizeof(uint32_t)); segment_record.append(slice_start_key.Data(), key_size); uint32_t slice_size = static_cast<uint32_t>(slices[i]->Size()); + plan.range_size += static_cast<int32_t>(slice_size); segment_record.append(reinterpret_cast<const char *>(&slice_size), sizeof(uint32_t)); } @@ -1574,6 +1605,7 @@ void DataStoreServiceClient::EnqueueRangeMetadataRecord( uint64_t range_version, uint64_t version, uint32_t segment_cnt, + int32_t range_size, RangeMetadataAccumulator &accumulator) { // Compute kv_table_name and kv_partition_id @@ -1584,8 +1616,8 @@ void DataStoreServiceClient::EnqueueRangeMetadataRecord( // Encode key and value std::string key_str = EncodeRangeKey(catalog_factory, table_name, range_start_key); - std::string rec_str = - EncodeRangeValue(partition_id, range_version, version, segment_cnt); + std::string rec_str = 
EncodeRangeValue( + partition_id, range_version, version, segment_cnt, range_size); // Get or create entry in accumulator auto key = std::make_pair(kv_table_name, kv_partition_id); @@ -1753,6 +1785,7 @@ bool DataStoreServiceClient::UpdateRangeSlices( req.range_slices_, req.partition_id_); uint32_t segment_cnt = slice_plan.segment_cnt; + int32_t range_size = slice_plan.range_size; int32_t kv_partition_id = KvPartitionIdOfRangeSlices(*req.table_name_, req.partition_id_); auto iter = slice_plans.find(kv_partition_id); @@ -1777,6 +1810,7 @@ bool DataStoreServiceClient::UpdateRangeSlices( req.range_version_, req.ckpt_ts_, segment_cnt, + range_size, meta_acc); } @@ -1978,6 +2012,7 @@ bool DataStoreServiceClient::UpdateRangeSlices( range_version, version, segment_cnt, + slice_plans[0].range_size, meta_acc); SyncConcurrentRequest *meta_sync_concurrent = @@ -2069,6 +2104,7 @@ bool DataStoreServiceClient::UpsertRanges( auto slice_plan = PrepareRangeSliceBatches( table_name, version, range.slices_, range.partition_id_); uint32_t segment_cnt = slice_plan.segment_cnt; + int32_t range_size = slice_plan.range_size; int32_t kv_partition_id = KvPartitionIdOfRangeSlices(table_name, range.partition_id_); @@ -2092,6 +2128,7 @@ bool DataStoreServiceClient::UpsertRanges( version, // range_version (using version for now) version, segment_cnt, + range_size, meta_acc); } @@ -4683,7 +4720,8 @@ bool DataStoreServiceClient::InitTableRanges( std::string key_str = EncodeRangeKey(catalog_factory, table_name, *neg_inf_key); - std::string rec_str = EncodeRangeValue(init_range_id, version, version, 0); + std::string rec_str = + EncodeRangeValue(init_range_id, version, version, 0, 0); keys.emplace_back(std::string_view(key_str.data(), key_str.size())); records.emplace_back(std::string_view(rec_str.data(), rec_str.size())); diff --git a/store_handler/data_store_service_client.h b/store_handler/data_store_service_client.h index 4d860174..fb877d1e 100644 --- a/store_handler/data_store_service_client.h 
+++ b/store_handler/data_store_service_client.h @@ -66,6 +66,7 @@ struct RangeSliceBatchPlan std::vector segment_keys; // Owned string buffers std::vector segment_records; // Owned string buffers size_t version; + int32_t range_size{0}; // Clear method for reuse void Clear() @@ -74,6 +75,7 @@ struct RangeSliceBatchPlan segment_keys.clear(); segment_records.clear(); version = 0; + range_size = 0; } }; @@ -278,6 +280,9 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler void FetchRangeSlices(txservice::FetchRangeSlicesReq *fetch_cc) override; + void FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) override; + bool DeleteOutOfRangeData( const txservice::TableName &table_name, int32_t partition_id, @@ -346,7 +351,8 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler std::string EncodeRangeValue(int32_t range_id, uint64_t range_version, uint64_t version, - uint32_t segment_cnt); + uint32_t segment_cnt, + int32_t range_size); std::string EncodeRangeSliceKey(const txservice::TableName &table_name, int32_t range_id, uint32_t segment_id); @@ -654,6 +660,7 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler uint64_t range_version, uint64_t version, uint32_t segment_cnt, + int32_t range_size, RangeMetadataAccumulator &accumulator); void DispatchRangeMetadataBatches( @@ -934,6 +941,11 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); + + friend void FetchRangeSizeCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result); }; struct UpsertTableData diff --git a/store_handler/data_store_service_client_closure.cpp b/store_handler/data_store_service_client_closure.cpp index ab11ce5b..bdddbec3 100644 --- a/store_handler/data_store_service_client_closure.cpp +++ 
b/store_handler/data_store_service_client_closure.cpp @@ -811,8 +811,9 @@ void FetchTableRangesCallback(void *data, for (uint32_t i = 0; i < items_size; i++) { scan_next_closure->GetItem(i, key, value, ts, ttl); - assert(value.size() == (sizeof(int32_t) + sizeof(uint64_t) + - sizeof(uint64_t) + sizeof(uint32_t))); + assert(value.size() == + (sizeof(int32_t) + sizeof(uint64_t) + sizeof(uint64_t) + + sizeof(uint32_t) + sizeof(int32_t))); const char *buf = value.data(); int32_t partition_id = *(reinterpret_cast<const int32_t *>(buf)); buf += sizeof(partition_id); @@ -925,6 +926,45 @@ void FetchTableRangesCallback(void *data, } } +void FetchRangeSizeCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result) +{ + txservice::FetchTableRangeSizeCc *fetch_range_size_cc = + static_cast<txservice::FetchTableRangeSizeCc *>(data); + + if (result.error_code() == remote::DataStoreError::KEY_NOT_FOUND) + { + fetch_range_size_cc->store_range_size_ = 0; + fetch_range_size_cc->SetFinish( + static_cast<uint32_t>(txservice::CcErrorCode::NO_ERROR)); + } + else if (result.error_code() != remote::DataStoreError::NO_ERROR) + { + LOG(ERROR) << "Fetch range size failed with error code: " + << result.error_code(); + fetch_range_size_cc->SetFinish( + static_cast<uint32_t>(txservice::CcErrorCode::DATA_STORE_ERR)); + } + else + { + ReadClosure *read_closure = static_cast<ReadClosure *>(closure); + std::string_view read_val = read_closure->Value(); + assert(read_closure->TableName() == kv_range_table_name); + assert(read_val.size() == + (sizeof(int32_t) + sizeof(uint64_t) + sizeof(uint64_t) + + sizeof(uint32_t) + sizeof(int32_t))); + const char *buf = read_val.data(); + buf += read_val.size() - sizeof(int32_t); + fetch_range_size_cc->store_range_size_ = + *reinterpret_cast<const int32_t *>(buf); + + fetch_range_size_cc->SetFinish( + static_cast<uint32_t>(txservice::CcErrorCode::NO_ERROR)); + } +} + void FetchRangeSlicesCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, @@ -965,8 +1005,9 @@ void 
FetchRangeSlicesCallback(void *data, else { assert(read_closure->TableName() == kv_range_table_name); - assert(read_val.size() == (sizeof(int32_t) + sizeof(uint64_t) + - sizeof(uint64_t) + sizeof(uint32_t))); + assert(read_val.size() == + (sizeof(int32_t) + sizeof(uint64_t) + sizeof(uint64_t) + + sizeof(uint32_t) + sizeof(int32_t))); const char *buf = read_val.data(); int32_t range_partition_id = *(reinterpret_cast<const int32_t *>(buf)); diff --git a/store_handler/data_store_service_client_closure.h index 4bb72373..b8c3813c 100644 --- a/store_handler/data_store_service_client_closure.h +++ b/store_handler/data_store_service_client_closure.h @@ -3102,6 +3102,14 @@ void FetchTableRangesCallback(void *data, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching range size from table_ranges. + */ +void FetchRangeSizeCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result); + /** * Callback for fetching range slices. 
* diff --git a/store_handler/dynamo_handler.cpp b/store_handler/dynamo_handler.cpp index 0aa7ef78..5bfa9029 100644 --- a/store_handler/dynamo_handler.cpp +++ b/store_handler/dynamo_handler.cpp @@ -2534,6 +2534,12 @@ void EloqDS::DynamoHandler::FetchRangeSlices(FetchRangeSlicesReq *fetch_cc) assert(false); } +void EloqDS::DynamoHandler::FetchTableRangeSize(FetchTableRangeSizeCc *fetch_cc) +{ + LOG(ERROR) << "DynamoHandler::FetchTableRangeSize not implemented"; + assert(false); +} + void EloqDS::DynamoHandler::OnFetchRangeSlices( const Aws::DynamoDB::DynamoDBClient *client, const Aws::DynamoDB::Model::GetItemRequest &request, diff --git a/store_handler/dynamo_handler.h b/store_handler/dynamo_handler.h index f2fc9ba5..704200e6 100644 --- a/store_handler/dynamo_handler.h +++ b/store_handler/dynamo_handler.h @@ -158,6 +158,7 @@ class DynamoHandler : public txservice::store::DataStoreHandler //-- range partition void FetchTableRanges(FetchTableRangesCc *fetch_cc) override; void FetchRangeSlices(FetchRangeSlicesReq *fetch_cc) override; + void FetchTableRangeSize(FetchTableRangeSizeCc *fetch_cc) override; bool DeleteOutOfRangeData( const txservice::TableName &table_name, diff --git a/store_handler/rocksdb_handler.cpp b/store_handler/rocksdb_handler.cpp index e741748b..47c039aa 100644 --- a/store_handler/rocksdb_handler.cpp +++ b/store_handler/rocksdb_handler.cpp @@ -1128,6 +1128,13 @@ void RocksDBHandler::FetchRangeSlices(txservice::FetchRangeSlicesReq *fetch_cc) assert(false); } +void RocksDBHandler::FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) +{ + LOG(ERROR) << "RocksDBHandler::FetchTableRangeSize not implemented"; + assert(false); +} + bool DeleteOutOfRangeDataInternal(std::string delete_from_partition_sql, int32_t partition_id, const txservice::TxKey *start_k) diff --git a/store_handler/rocksdb_handler.h b/store_handler/rocksdb_handler.h index c8717a49..8742b064 100644 --- a/store_handler/rocksdb_handler.h +++ b/store_handler/rocksdb_handler.h @@ 
-346,6 +346,9 @@ class RocksDBHandler : public txservice::store::DataStoreHandler void FetchRangeSlices(txservice::FetchRangeSlicesReq *fetch_cc) override; + void FetchTableRangeSize( + txservice::FetchTableRangeSizeCc *fetch_cc) override; + bool DeleteOutOfRangeDataInternal(std::string delete_from_partition_sql, int32_t partition_id, const txservice::TxKey *start_k); diff --git a/tx_service/include/cc/cc_handler.h b/tx_service/include/cc/cc_handler.h index 3d4640b8..cad6db33 100644 --- a/tx_service/include/cc/cc_handler.h +++ b/tx_service/include/cc/cc_handler.h @@ -166,7 +166,9 @@ class CcHandler const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres) = 0; + CcHandlerResult &hres, + int32_t partition_id = -1, + bool on_dirty_range = false) = 0; /** * @briefPost-processes a read/scan key. Post-processing clears the read diff --git a/tx_service/include/cc/cc_map.h b/tx_service/include/cc/cc_map.h index 0d1434b6..9aaa8c58 100644 --- a/tx_service/include/cc/cc_map.h +++ b/tx_service/include/cc/cc_map.h @@ -21,10 +21,12 @@ */ #pragma once +#include #include #include #include // std::pair +#include "absl/container/flat_hash_map.h" #include "cc/cc_req_base.h" #include "cc_protocol.h" #include "error_messages.h" // CcErrorCode @@ -260,6 +262,20 @@ class CcMap virtual const txservice::KeySchema *KeySchema() const = 0; virtual const txservice::RecordSchema *RecordSchema() const = 0; + /** + * Called by FetchTableRangeSizeCc::Execute when async load completes. + * Merges loaded size with accumulated delta (second), or resets to + * kNotInitialized on failure. + * When emplace is true and partition_id is absent, inserts (partition_id, + * (0,0)) before merging; used for new ranges after split. 
+ */ + bool InitRangeSize(uint32_t partition_id, + int32_t persisted_size, + bool succeed = true, + bool emplace = false); + + void ResetRangeStatus(uint32_t partition_id); + uint64_t SchemaTs() const { return schema_ts_; @@ -294,6 +310,15 @@ class CcMap uint64_t last_dirty_commit_ts_{0}; protected: + // Range id -> (range_size, delta_range_size). Only used when + // RangePartitioned. + // - first: current range size; RangeSizeState::Loading (-1) = loading from + // store; RangeSizeState::Uninitialized (-2) = not yet loaded. + // - second: delta accumulated during load (first==-1) or split (first>=0). + // - third: True if a split task been triggered due to reaching a threshold. + absl::flat_hash_map> + range_sizes_; + /** * @brief After the input request is executed at the current shard, moves * the request to another shard for execution. diff --git a/tx_service/include/cc/cc_page_clean_guard.h b/tx_service/include/cc/cc_page_clean_guard.h index 39c1c316..c2a8d94d 100644 --- a/tx_service/include/cc/cc_page_clean_guard.h +++ b/tx_service/include/cc/cc_page_clean_guard.h @@ -263,8 +263,7 @@ struct CcPageCleanGuard cce->PayloadStatus() != RecordStatus::Unknown) || cce->PayloadStatus() == RecordStatus::Deleted)) { - store_range->DeleteKey( - key, cc_shard_->core_id_, store_slice); + store_range->DeleteKey(key, store_slice); } MarkClean(cc_ng_id_, idx, delay_free); diff --git a/tx_service/include/cc/cc_req_misc.h b/tx_service/include/cc/cc_req_misc.h index 2c1807dd..eedae7e7 100644 --- a/tx_service/include/cc/cc_req_misc.h +++ b/tx_service/include/cc/cc_req_misc.h @@ -367,7 +367,6 @@ struct InitKeyCacheCc : public CcRequestBase void Reset(StoreRange *range, StoreSlice *slice, - uint16_t core_cnt, const TableName &tbl_name, int64_t term, NodeGroupId ng_id) @@ -380,18 +379,15 @@ struct InitKeyCacheCc : public CcRequestBase ng_id_ = ng_id; range_ = range; slice_ = slice; - unfinished_cnt_ = core_cnt; - - pause_pos_.clear(); - pause_pos_.resize(core_cnt); + pause_pos_ = 
TxKey(); } bool Execute(CcShard &ccs) override; - bool SetFinish(uint16_t core, bool succ); + void SetFinish(bool succ); StoreSlice &Slice(); StoreRange &Range(); - void SetPauseKey(TxKey &key, uint16_t core_id); - TxKey &PauseKey(uint16_t core_id); + void SetPauseKey(TxKey &key); + TxKey &PauseKey(); private: TableName tbl_name_{std::string(""), TableType::Primary, TableEngine::None}; @@ -399,8 +395,7 @@ struct InitKeyCacheCc : public CcRequestBase NodeGroupId ng_id_; StoreRange *range_; StoreSlice *slice_; - std::atomic unfinished_cnt_{0}; - std::vector pause_pos_; + TxKey pause_pos_; }; struct FillStoreSliceCc : public CcRequestBase @@ -426,10 +421,9 @@ struct FillStoreSliceCc : public CcRequestBase bool Execute(CcShard &ccs) override; - std::deque &SliceData(uint16_t core_id) + std::deque &SliceData() { - assert(core_id < partitioned_slice_data_.size()); - return partitioned_slice_data_[core_id]; + return slice_data_; } void AddDataItem(TxKey key, @@ -437,8 +431,8 @@ struct FillStoreSliceCc : public CcRequestBase uint64_t version_ts, bool is_deleted); - bool SetFinish(CcShard *cc_shard); - bool SetError(CcErrorCode err_code); + void SetFinish(CcShard *cc_shard); + void SetError(CcErrorCode err_code); void SetKvFinish(bool success); @@ -447,12 +441,9 @@ struct FillStoreSliceCc : public CcRequestBase assert(err_code != CcErrorCode::NO_ERROR); DLOG(ERROR) << "Abort this FillStoreSliceCc request with error: " << CcErrorMessage(err_code); - bool finish_all = SetError(err_code); + SetError(err_code); // Recycle request - if (finish_all) - { - Free(); - } + Free(); } const TableName &TblName() const @@ -485,17 +476,16 @@ struct FillStoreSliceCc : public CcRequestBase force_load_ = force_load; } - size_t NextIndex(size_t core_idx) const + size_t NextIndex() const { - size_t next_idx = next_idxs_[core_idx]; - assert(next_idx <= partitioned_slice_data_[core_idx].size()); - return next_idx; + assert(next_idx_ <= slice_data_.size()); + return next_idx_; } - void 
SetNextIndex(size_t core_idx, size_t index) + void SetNextIndex(size_t index) { - assert(index <= partitioned_slice_data_[core_idx].size()); - next_idxs_[core_idx] = index; + assert(index <= slice_data_.size()); + next_idx_ = index; } NodeGroupId NodeGroup() const @@ -533,6 +523,8 @@ struct FillStoreSliceCc : public CcRequestBase return true; } + int32_t PartitionId() const; + metrics::TimePoint start_; private: @@ -540,13 +532,11 @@ struct FillStoreSliceCc : public CcRequestBase NodeGroupId cc_ng_id_; int64_t cc_ng_term_; bool force_load_; - uint16_t finish_cnt_; - uint16_t core_cnt_; std::mutex mux_; CcErrorCode err_code_{CcErrorCode::NO_ERROR}; - std::vector next_idxs_; - std::vector> partitioned_slice_data_; + size_t next_idx_; + std::deque slice_data_; StoreSlice *range_slice_ = nullptr; StoreRange *range_ = nullptr; @@ -1157,4 +1147,35 @@ struct ShardCleanCc : public CcRequestBase private: size_t free_count_{0}; }; + +struct FetchTableRangeSizeCc : public CcRequestBase +{ +public: + FetchTableRangeSizeCc() = default; + ~FetchTableRangeSizeCc() = default; + + void Reset(const TableName &table_name, + int32_t partition_id, + const TxKey &start_key, + CcShard *ccs, + NodeGroupId ng_id, + int64_t ng_term); + + bool ValidTermCheck(); + bool Execute(CcShard &ccs) override; + void SetFinish(uint32_t error); + + const TableName *table_name_; + int32_t partition_id_{0}; + TxKey start_key_{}; + NodeGroupId node_group_id_{0}; + int64_t node_group_term_{-1}; + CcShard *ccs_{nullptr}; + + uint32_t error_code_{0}; + int32_t store_range_size_{0}; + + // Only used in DataStoreHandler + std::string kv_start_key_; +}; } // namespace txservice diff --git a/tx_service/include/cc/cc_request.h b/tx_service/include/cc/cc_request.h index 97e93fae..8096672a 100644 --- a/tx_service/include/cc/cc_request.h +++ b/tx_service/include/cc/cc_request.h @@ -740,7 +740,9 @@ struct PostWriteCc : public TemplatedCcRequest const TxRecord *rec, OperationType operation_type, uint32_t 
key_shard_code, - CcHandlerResult *res) + CcHandlerResult *res, + int32_t partition_id = -1, + bool on_dirty_range = false) { TemplatedCcRequest::Reset( nullptr, res, addr->NodeGroupId(), tx_number, tx_term); @@ -754,6 +756,8 @@ struct PostWriteCc : public TemplatedCcRequest is_remote_ = false; ccm_ = nullptr; is_initial_insert_ = false; + partition_id_ = partition_id; + on_dirty_range_ = on_dirty_range; } void Reset(const TxKey *key, @@ -767,7 +771,9 @@ struct PostWriteCc : public TemplatedCcRequest uint32_t key_shard_code, CcHandlerResult *res, bool initial_insertion = false, - int64_t ng_term = INIT_TERM) + int64_t ng_term = INIT_TERM, + int32_t partition_id = -1, + bool on_dirty_range = false) { TemplatedCcRequest::Reset( &table_name, @@ -788,6 +794,8 @@ struct PostWriteCc : public TemplatedCcRequest is_remote_ = false; ccm_ = nullptr; is_initial_insert_ = initial_insertion; + partition_id_ = partition_id; + on_dirty_range_ = on_dirty_range; } void Reset(const CcEntryAddr *addr, @@ -797,7 +805,9 @@ struct PostWriteCc : public TemplatedCcRequest const std::string *rec, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult *res) + CcHandlerResult *res, + int32_t partition_id = -1, + bool on_dirty_range = false) { TemplatedCcRequest::Reset( nullptr, res, addr->NodeGroupId(), tx_number, tx_term); @@ -811,6 +821,8 @@ struct PostWriteCc : public TemplatedCcRequest is_remote_ = true; ccm_ = nullptr; is_initial_insert_ = false; + partition_id_ = partition_id; + on_dirty_range_ = on_dirty_range; } void Reset(const TableName *table_name, @@ -824,7 +836,9 @@ struct PostWriteCc : public TemplatedCcRequest uint32_t key_shard_code, CcHandlerResult *res, bool initial_insertion = false, - int64_t ng_term = INIT_TERM) + int64_t ng_term = INIT_TERM, + int32_t partition_id = -1, + bool on_dirty_range = false) { TemplatedCcRequest::Reset( table_name, @@ -845,6 +859,8 @@ struct PostWriteCc : public TemplatedCcRequest is_remote_ = true; ccm_ = nullptr; 
is_initial_insert_ = initial_insertion; + partition_id_ = partition_id; + on_dirty_range_ = on_dirty_range; } const CcEntryAddr *CceAddr() const @@ -877,6 +893,11 @@ struct PostWriteCc : public TemplatedCcRequest return key_shard_code_; } + int32_t PartitionId() const + { + return partition_id_; + } + const void *Key() const { return is_remote_ ? nullptr : key_; @@ -892,6 +913,16 @@ struct PostWriteCc : public TemplatedCcRequest return is_initial_insert_; } + bool OnDirtyRange() const + { + return on_dirty_range_; + } + + bool NeedUpdateRangeSize() const + { + return partition_id_ >= 0; + } + private: const CcEntryAddr *cce_addr_; uint64_t commit_ts_; @@ -909,6 +940,9 @@ struct PostWriteCc : public TemplatedCcRequest const void *key_; const std::string *key_str_; }; + int32_t partition_id_{-1}; + // True if the key is located in a splitting range. + bool on_dirty_range_{false}; }; struct PostWriteAllCc @@ -2341,7 +2375,6 @@ struct ScanSliceCc end_key_type_(RangeKeyType::RawPtr), schema_version_(0) { - parallel_req_ = true; } ~ScanSliceCc() @@ -2409,12 +2442,12 @@ struct ScanSliceCc is_require_keys_ = is_require_keys; is_require_recs_ = is_require_recs; - unfinished_core_cnt_.store(1, std::memory_order_relaxed); range_slice_id_.Reset(); last_pinned_slice_ = nullptr; prefetch_size_ = prefetch_size; - err_.store(CcErrorCode::NO_ERROR, std::memory_order_relaxed); + err_ = CcErrorCode::NO_ERROR; cache_hit_miss_collected_ = false; + blocking_info_.Reset(); } void Set(const TableName &tbl_name, @@ -2472,11 +2505,11 @@ struct ScanSliceCc is_require_recs_ = is_require_recs; prefetch_size_ = prefetch_size; - unfinished_core_cnt_.store(1, std::memory_order_relaxed); range_slice_id_.Reset(); last_pinned_slice_ = nullptr; - err_.store(CcErrorCode::NO_ERROR, std::memory_order_relaxed); + err_ = CcErrorCode::NO_ERROR; cache_hit_miss_collected_ = false; + blocking_info_.Reset(); } bool Execute(CcShard &ccs) override @@ -2485,7 +2518,8 @@ struct ScanSliceCc { // Do not modify res_ 
directly since there could be other cores // still working on this cc req. - return SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + return true; } CcMap *ccm = nullptr; @@ -2518,7 +2552,8 @@ struct ScanSliceCc // is marked as errored. if (init_res.error != CcErrorCode::NO_ERROR) { - return SetError(init_res.error); + SetError(init_res.error); + return true; } // The req will be re-enqueued. return false; @@ -2545,16 +2580,13 @@ struct ScanSliceCc void AbortCcRequest(CcErrorCode err_code) override { - if (SetError(err_code)) + SetError(err_code); + // If the request has pinned any slice, unpin it. + if (range_slice_id_.Range() != nullptr) { - // Last core finished. If the request has pinned any slice, unpin - // it. - if (range_slice_id_.Range() != nullptr) - { - UnpinSlices(); - } - Free(); + UnpinSlices(); } + Free(); } bool IsLocal() const @@ -2685,18 +2717,18 @@ struct ScanSliceCc return ts_; } - ScanCache *GetLocalScanCache(size_t shard_id) + ScanCache *GetLocalScanCache() { assert(IsLocal()); - return res_->Value().ccm_scanner_->Cache(shard_id); + return res_->Value().ccm_scanner_->Cache(0); } - RemoteScanSliceCache *GetRemoteScanCache(size_t shard_id) + RemoteScanSliceCache *GetRemoteScanCache() { assert(!IsLocal()); RangeScanSliceResult &slice_result = res_->Value(); - assert(shard_id < slice_result.remote_scan_caches_->size()); - return &slice_result.remote_scan_caches_->at(shard_id); + assert(slice_result.remote_scan_caches_ != nullptr); + return slice_result.remote_scan_caches_; } CcScanner *GetLocalScanner() @@ -2704,161 +2736,70 @@ struct ScanSliceCc return IsLocal() ? 
res_->Value().ccm_scanner_ : nullptr; } - uint64_t BlockingCceLockAddr(uint16_t core_id) + uint64_t BlockingCceLockAddr() const { - assert(core_id < blocking_vec_.size()); - return blocking_vec_[core_id].cce_lock_addr_; + return blocking_info_.cce_lock_addr_; } - std::pair BlockingPair(uint16_t core_id) + std::pair BlockingPair() const { - assert(core_id < blocking_vec_.size()); - return {blocking_vec_[core_id].type_, - blocking_vec_[core_id].scan_type_}; + return {blocking_info_.type_, blocking_info_.scan_type_}; } - void SetBlockingInfo(uint16_t core_id, - uint64_t cce_lock_addr, + void SetBlockingInfo(uint64_t cce_lock_addr, ScanType scan_type, ScanBlockingType blocking_type) { - assert(core_id < blocking_vec_.size()); - blocking_vec_[core_id] = {cce_lock_addr, scan_type, blocking_type}; + blocking_info_.cce_lock_addr_ = cce_lock_addr; + blocking_info_.scan_type_ = scan_type; + blocking_info_.type_ = blocking_type; } - void SetShardCount(uint16_t shard_cnt) + void SetPriorCceLockAddr(uint64_t addr) { - blocking_vec_.resize(shard_cnt); - for (auto &it : blocking_vec_) - { - it.cce_lock_addr_ = 0; - it.scan_type_ = ScanType::ScanUnknow; - it.type_ = ScanBlockingType::NoBlocking; - } - - wait_for_snapshot_cnt_.resize(shard_cnt); - for (uint16_t i = 0; i < shard_cnt; ++i) - { - wait_for_snapshot_cnt_[i] = 0; - } - } - - uint64_t GetShardCount() const - { - return blocking_vec_.size(); - } - - void SetUnfinishedCoreCnt(uint16_t core_cnt) - { - unfinished_core_cnt_.store(core_cnt, std::memory_order_release); - } - - void SetPriorCceLockAddr(uint64_t addr, uint16_t shard_id) - { - assert(shard_id < blocking_vec_.size()); - blocking_vec_[shard_id] = { - addr, ScanType::ScanUnknow, ScanBlockingType::NoBlocking}; + blocking_info_.cce_lock_addr_ = addr; + blocking_info_.scan_type_ = ScanType::ScanUnknow; + blocking_info_.type_ = ScanBlockingType::NoBlocking; } /** * @brief Notifies the scan slice request that the scan at the calling core * has finished. 
* - * @return true, if all cores have finished the scan. - * @return false, if the scan is not completed in all cores. */ - bool SetFinish() + void SetFinish() { - uint16_t remaining_cnt = - unfinished_core_cnt_.fetch_sub(1, std::memory_order_acq_rel); - - if (remaining_cnt == 1) + if (err_ == CcErrorCode::NO_ERROR) { - // Only update result if this is local request. Remote request - // result will be updated by dedicated core. - if (res_->Value().is_local_) - { - if (err_.load(std::memory_order_relaxed) == - CcErrorCode::NO_ERROR) - { - res_->Value().ccm_scanner_->FinalizeCommit(); - - res_->SetFinished(); - } - else - { - res_->SetError(err_.load(std::memory_order_relaxed)); - } - } + res_->SetFinished(); + } + else + { + res_->SetError(err_); } - - return remaining_cnt == 1; } - bool SetError(CcErrorCode err) + void SetError(CcErrorCode err) { - CcErrorCode expected = CcErrorCode::NO_ERROR; - err_.compare_exchange_strong(expected, - err, - std::memory_order_relaxed, - std::memory_order_relaxed); - uint16_t remaining_cnt = - unfinished_core_cnt_.fetch_sub(1, std::memory_order_acq_rel); - - // remaining_cnt might be 0 if all cores have finished and the req is - // put back into the result sending core's queue. - if (remaining_cnt <= 1) + if (err_ == CcErrorCode::NO_ERROR) { - res_->SetError(err_.load(std::memory_order_relaxed)); + err_ = err; } - return remaining_cnt <= 1; + res_->SetError(err_); } void DeferSetError(CcErrorCode err) { - CcErrorCode expected = CcErrorCode::NO_ERROR; - err_.compare_exchange_strong(expected, - err, - std::memory_order_relaxed, - std::memory_order_relaxed); - } - - CcErrorCode GetError() const - { - return err_.load(std::memory_order_acquire); - } - - /** - * @brief Send response to src node if all cores have finished. - * We use this method to send scan slice response if this request is - * a remote request. 
- * We assign a dedicated core to be the response sender instead of directly - * sending the response on the last finished core. This is to avoid - * serialization of response message causing one core to become - * significantly slower than others and would end up being the sender of all - * scan slice response. - */ - bool SendResponseIfFinished() - { - if (unfinished_core_cnt_.load(std::memory_order_relaxed) == 0) + if (err_ == CcErrorCode::NO_ERROR) { - if (err_.load(std::memory_order_relaxed) == CcErrorCode::NO_ERROR) - { - res_->SetFinished(); - } - else - { - res_->SetError(err_.load(std::memory_order_relaxed)); - } - return true; + err_ = err; } - return false; } - bool IsResponseSender(uint16_t core_id) const + CcErrorCode GetError() const { - return ((tx_number_ & 0x3FF) % blocking_vec_.size()) == core_id; + return err_; } bool IsForWrite() const @@ -2931,30 +2872,30 @@ struct ScanSliceCc cache_hit_miss_collected_ = true; } - bool IsWaitForSnapshot(uint16_t core_id) const + bool IsWaitForSnapshot() const { - return blocking_vec_[core_id].type_ == - ScanBlockingType::BlockOnWaitSnapshots; + return blocking_info_.type_ == ScanBlockingType::BlockOnWaitSnapshots; } - void SetIsWaitForSnapshot(uint16_t core_id) + void SetIsWaitForSnapshot() { - blocking_vec_[core_id].type_ = ScanBlockingType::BlockOnWaitSnapshots; + blocking_info_.type_ = ScanBlockingType::BlockOnWaitSnapshots; } - size_t WaitForSnapshotCnt(uint16_t core_id) const + size_t WaitForSnapshotCnt() const { - return wait_for_snapshot_cnt_[core_id]; + return wait_for_snapshot_cnt_; } - void DecreaseWaitForSnapshotCnt(uint16_t core_id) + void DecreaseWaitForSnapshotCnt() { - wait_for_snapshot_cnt_[core_id]--; + assert(wait_for_snapshot_cnt_ > 0); + wait_for_snapshot_cnt_--; } - void IncreaseWaitForSnapshotCnt(uint16_t core_id) + void IncreaseWaitForSnapshotCnt() { - wait_for_snapshot_cnt_[core_id]++; + wait_for_snapshot_cnt_++; } bool AbortIfOom() const override @@ -3008,8 +2949,7 @@ struct ScanSliceCc 
uint32_t range_id_{0}; - std::atomic unfinished_core_cnt_{1}; - std::atomic err_{CcErrorCode::NO_ERROR}; + CcErrorCode err_{CcErrorCode::NO_ERROR}; uint64_t ts_{0}; @@ -3019,13 +2959,20 @@ struct ScanSliceCc struct ScanBlockingInfo { - uint64_t cce_lock_addr_; - ScanType scan_type_; - ScanBlockingType type_; + void Reset() + { + cce_lock_addr_ = 0; + scan_type_ = ScanType::ScanUnknow; + type_ = ScanBlockingType::NoBlocking; + } + + uint64_t cce_lock_addr_{0}; + ScanType scan_type_{ScanType::ScanUnknow}; + ScanBlockingType type_{ScanBlockingType::NoBlocking}; }; - std::vector blocking_vec_; + ScanBlockingInfo blocking_info_; - std::vector wait_for_snapshot_cnt_; + size_t wait_for_snapshot_cnt_{0}; RangeSliceId range_slice_id_; @@ -3234,36 +3181,14 @@ struct ProcessRemoteScanRespCc : public CcRequestBase void Reset(remote::CcStreamReceiver *receiver, std::unique_ptr resp_msg, - std::vector &&offset_tables, - CcHandlerResult *hd_res, - size_t worker_cnt) + CcHandlerResult *hd_res) { receiver_ = receiver; resp_msg_ = std::move(resp_msg); - offset_tables_ = std::move(offset_tables); hd_res_ = hd_res; - - unfinished_cnt_ = worker_cnt; - next_remote_core_idx_ = worker_cnt; - - assert(offset_tables_.size() == RemoteCoreCnt()); - assert(worker_cnt <= RemoteCoreCnt()); - - cur_idxs_.clear(); - key_offsets_.clear(); - rec_offsets_.clear(); - - assert(cur_idxs_.empty()); - assert(key_offsets_.empty()); - assert(rec_offsets_.empty()); - - for (size_t worker_idx = 0; worker_idx < worker_cnt; ++worker_idx) - { - // worker idx must be less or equal than remote core count - cur_idxs_.push_back({worker_idx, 0}); - key_offsets_.push_back(KeyStartOffset(worker_idx)); - rec_offsets_.push_back(RecStartOffset(worker_idx)); - } + cur_tuple_idx_ = 0; + key_offset_ = 0; + rec_offset_ = 0; } ProcessRemoteScanRespCc(const ProcessRemoteScanRespCc &) = delete; @@ -3276,74 +3201,56 @@ struct ProcessRemoteScanRespCc : public CcRequestBase do { - auto &[remote_core_idx, tuple_idx] = 
cur_idxs_.at(ccs.core_id_); - + uint32_t remote_core_idx = resp_msg_->core_id(); const uint64_t *key_ts_ptr = (const uint64_t *) resp_msg_->key_ts().data(); - key_ts_ptr += MetaOffset(remote_core_idx); const uint64_t *gap_ts_ptr = (const uint64_t *) resp_msg_->gap_ts().data(); - gap_ts_ptr += MetaOffset(remote_core_idx); const uint64_t *term_ptr = (const uint64_t *) resp_msg_->term().data(); - term_ptr += MetaOffset(remote_core_idx); const uint64_t *cce_lock_ptr_ptr = (const uint64_t *) resp_msg_->cce_lock_ptr().data(); - cce_lock_ptr_ptr += MetaOffset(remote_core_idx); const remote::RecordStatusType *rec_status_ptr = (const remote::RecordStatusType *) resp_msg_->rec_status() .data(); - rec_status_ptr += MetaOffset(remote_core_idx); RangeScanSliceResult &scan_slice_result = hd_res_->Value(); CcScanner &range_scanner = *scan_slice_result.ccm_scanner_; - ScanCache *shard_cache = range_scanner.Cache(remote_core_idx); + ScanCache *shard_cache = range_scanner.Cache(0); - size_t &key_offset = key_offsets_[ccs.core_id_]; - size_t &rec_offset = rec_offsets_[ccs.core_id_]; - size_t tuple_cnt = TupleCnt(remote_core_idx); + size_t tuple_cnt = TupleCnt(); - for (; tuple_idx < tuple_cnt && scan_cnt < SCAN_BATCH_SIZE; - ++tuple_idx, ++scan_cnt) + for (; cur_tuple_idx_ < tuple_cnt && scan_cnt < SCAN_BATCH_SIZE; + ++cur_tuple_idx_, ++scan_cnt) { RecordStatus rec_status = remote::ToLocalType::ConvertRecordStatusType( - rec_status_ptr[tuple_idx]); + rec_status_ptr[cur_tuple_idx_]); shard_cache->AddScanTuple(resp_msg_->keys(), - key_offset, - key_ts_ptr[tuple_idx], + key_offset_, + key_ts_ptr[cur_tuple_idx_], resp_msg_->records(), - rec_offset, + rec_offset_, rec_status, -1, - gap_ts_ptr[tuple_idx], - cce_lock_ptr_ptr[tuple_idx], - term_ptr[tuple_idx], + gap_ts_ptr[cur_tuple_idx_], + cce_lock_ptr_ptr[cur_tuple_idx_], + term_ptr[cur_tuple_idx_], remote_core_idx, scan_slice_result.cc_ng_id_, true); } - if (tuple_idx == tuple_cnt) + if (cur_tuple_idx_ == tuple_cnt) { - size_t 
trailing_cnt = TrailingCnt(remote_core_idx); - while (trailing_cnt-- > 0) - { - shard_cache->RemoveLast(); - } - - range_scanner.CommitAtCore(remote_core_idx); - - if (!MoveForward(ccs.core_id_)) - { - // No more data - return SetFinished(); - } + // No more data + SetFinished(); + return true; } // To avoid blocking other request for a long time, we only process @@ -3355,115 +3262,43 @@ struct ProcessRemoteScanRespCc : public CcRequestBase return false; } - bool SetFinished() + void SetFinished() { - // This core is last finished worker. We need to set handler result and - // recycle message. - if (unfinished_cnt_.fetch_sub(1, std::memory_order_release) == 1) + if (resp_msg_->error_code() != 0) { - if (resp_msg_->error_code() != 0) - { - hd_res_->SetError(remote::ToLocalType::ConvertCcErrorCode( - resp_msg_->error_code())); - } - else - { - hd_res_->Value().ccm_scanner_->FinalizeCommit(); - - hd_res_->SetFinished(); - } - - TransactionExecution *txm = - reinterpret_cast(resp_msg_->txm_addr()); - txm->ReleaseSharedForwardLatch(); - - // Recycle message - receiver_->RecycleScanSliceResp(std::move(resp_msg_)); - - // Return true to recycle this request - return true; + hd_res_->SetError(remote::ToLocalType::ConvertCcErrorCode( + resp_msg_->error_code())); } - - return false; - } - -private: - bool MoveForward(size_t worker_idx) - { - size_t new_remote_core_idx = next_remote_core_idx_.fetch_add(1); - if (new_remote_core_idx < RemoteCoreCnt()) + else { - cur_idxs_.at(worker_idx) = {new_remote_core_idx, 0}; - key_offsets_.at(worker_idx) = KeyStartOffset(new_remote_core_idx); - rec_offsets_.at(worker_idx) = RecStartOffset(new_remote_core_idx); - - return true; + hd_res_->SetFinished(); } - // No more data - return false; - } - - size_t KeyStartOffset(size_t remote_core_idx) const - { - const size_t *ptr = reinterpret_cast( - resp_msg_->key_start_offsets().data()); - ptr += remote_core_idx; - return *ptr; - } - - size_t RecStartOffset(size_t remote_core_idx) const - { - 
const size_t *ptr = reinterpret_cast( - resp_msg_->record_start_offsets().data()); - ptr += remote_core_idx; - return *ptr; - } + TransactionExecution *txm = + reinterpret_cast(resp_msg_->txm_addr()); + txm->ReleaseSharedForwardLatch(); - size_t MetaOffset(size_t remote_core_idx) const - { - return offset_tables_[remote_core_idx]; + // Recycle message + receiver_->RecycleScanSliceResp(std::move(resp_msg_)); } - size_t TupleCnt(size_t remote_core_idx) const +private: + size_t TupleCnt() const { const char *tuple_cnt_info = resp_msg_->tuple_cnt().data(); - // remote core count - tuple_cnt_info += sizeof(uint16_t); - // tuple count - tuple_cnt_info += remote_core_idx * sizeof(size_t); return *(reinterpret_cast(tuple_cnt_info)); } - size_t TrailingCnt(size_t remote_core_idx) const - { - const size_t *ptr = - reinterpret_cast(resp_msg_->trailing_cnts().data()); - ptr += remote_core_idx; - return *ptr; - } - - uint16_t RemoteCoreCnt() const - { - const char *tuple_cnt_info = resp_msg_->tuple_cnt().data(); - return *reinterpret_cast(tuple_cnt_info); - } - remote::CcStreamReceiver *receiver_{nullptr}; std::unique_ptr resp_msg_{nullptr}; - // Store the start postition of meta data like `key_ts`. - std::vector offset_tables_; - // The vector of {remote_core_idx, current_tuple_idx}. - std::vector> cur_idxs_; + // Index of the current tuple being processed in resp_msg_. + size_t cur_tuple_idx_; // We need to store key/rec offset so that we could restart from pause // point. - std::vector key_offsets_; - std::vector rec_offsets_; + size_t key_offset_; + size_t rec_offset_; - // Unfinished worker count. std::min(this_node_core_count, - // remote_core_count) - std::atomic unfinished_cnt_{0}; // Next remote core idx we need to process. 
std::atomic next_remote_core_idx_{0}; CcHandlerResult *hd_res_{nullptr}; @@ -4037,7 +3872,6 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase uint64_t data_sync_ts, uint64_t node_group_id, int64_t node_group_term, - uint16_t core_cnt, size_t scan_batch_size, uint64_t txn, const TxKey *target_start_key, @@ -4052,14 +3886,13 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase table_name_(&table_name), node_group_id_(node_group_id), node_group_term_(node_group_term), - core_cnt_(core_cnt), last_data_sync_ts_(last_data_sync_ts), data_sync_ts_(data_sync_ts), start_key_(target_start_key), end_key_(target_end_key), scan_batch_size_(scan_batch_size), err_(CcErrorCode::NO_ERROR), - unfinished_cnt_(core_cnt_), + finished_(false), mux_(), cv_(), export_base_table_item_(export_base_table_item), @@ -4082,24 +3915,19 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase false); }); } - for (size_t i = 0; i < core_cnt; i++) + data_sync_vec_.resize(scan_batch_size); + if (!export_base_table_item_only_) { - data_sync_vec_.emplace_back(); - data_sync_vec_.back().resize(scan_batch_size); - if (!export_base_table_item_only_) - { - archive_vec_.emplace_back(); - archive_vec_.back().reserve(scan_batch_size); - mv_base_idx_vec_.emplace_back(); - mv_base_idx_vec_.back().reserve(scan_batch_size); - } - - pause_pos_.emplace_back(TxKey(), false); - curr_slice_index_.emplace_back(0); - accumulated_scan_cnt_.emplace_back(0); - accumulated_flush_data_size_.emplace_back(0); - scan_heap_is_full_.emplace_back(0); + archive_vec_.reserve(scan_batch_size); + mv_base_idx_vec_.reserve(scan_batch_size); } + + pause_pos_.first = std::move(TxKey()); + pause_pos_.second = false; + curr_slice_index_ = 0; + accumulated_scan_cnt_ = 0; + accumulated_flush_data_size_ = 0; + scan_heap_is_full_ = 0; } bool ValidTermCheck() @@ -4133,7 +3961,6 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); return false; } - scan_count_++; 
CcMap *ccm = ccs.GetCcm(*table_name_, node_group_id_); if (ccm == nullptr) { @@ -4169,49 +3996,44 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return false; } - bool IsDrained(size_t core_idx) const + bool IsDrained() const { - return pause_pos_[core_idx].second; + return pause_pos_.second; } - std::pair &PausePos(size_t core_idx) + std::pair &PausePos() { - return pause_pos_[core_idx]; + return pause_pos_; } void Wait() { std::unique_lock lk(mux_); - cv_.wait(lk, [this] { return unfinished_cnt_ == 0; }); + cv_.wait(lk, [this] { return finished_; }); } void Reset(OpType op_type = OpType::Normal) { std::lock_guard lk(mux_); - unfinished_cnt_ = 1; - for (size_t i = 0; i < core_cnt_; i++) + finished_ = false; + if (!export_base_table_item_only_) { - if (!export_base_table_item_only_) - { - archive_vec_.at(i).clear(); - archive_vec_.at(i).reserve(scan_batch_size_); - mv_base_idx_vec_.at(i).clear(); - mv_base_idx_vec_.at(i).reserve(scan_batch_size_); - } + archive_vec_.clear(); + mv_base_idx_vec_.clear(); + } - accumulated_scan_cnt_.at(i) = 0; - accumulated_flush_data_size_.at(i) = 0; - if (scan_heap_is_full_[i] == 1) - { - // vec has been cleared during ReleaseDataSyncScanHeapCc, - // resize to prepared size - data_sync_vec_[i].resize(scan_batch_size_); - scan_heap_is_full_[i] = 0; - } - if (export_base_table_item_) - { - curr_slice_index_[i] = 0; - } + accumulated_scan_cnt_ = 0; + accumulated_flush_data_size_ = 0; + if (scan_heap_is_full_ == 1) + { + // vec has been cleared during ReleaseDataSyncScanHeapCc, + // resize to prepared size + data_sync_vec_.resize(scan_batch_size_); + scan_heap_is_full_ = 0; + } + if (export_base_table_item_) + { + curr_slice_index_ = 0; } err_ = CcErrorCode::NO_ERROR; @@ -4223,12 +4045,9 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase { std::lock_guard lk(mux_); err_ = err; - --unfinished_cnt_; - if (unfinished_cnt_ == 0) - { - UnpinSlices(); - cv_.notify_one(); - } + finished_ = true; + UnpinSlices(); + 
cv_.notify_one(); } void AbortCcRequest(CcErrorCode err_code) override @@ -4249,26 +4068,22 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return err_; } - void SetFinish(size_t core_id) + void SetFinish() { std::unique_lock lk(mux_); - --unfinished_cnt_; - if (export_base_table_item_ && !pause_pos_[core_id].second) + finished_ = true; + if (export_base_table_item_ && !pause_pos_.second) { // Only not drained on this core, should set the paused key. - UpdateMinPausedSlice(&pause_pos_[core_id].first); + UpdateMinPausedSlice(&pause_pos_.first); } else if (!export_base_table_item_) { - UpdateMinPausedSlice(curr_slice_index_[core_id]); - } - - if (unfinished_cnt_ == 0) - { - // Unpin the slices - UnpinSlices(); - cv_.notify_one(); + UpdateMinPausedSlice(curr_slice_index_); } + // Unpin the slices + UnpinSlices(); + cv_.notify_one(); } uint32_t NodeGroupId() @@ -4276,19 +4091,19 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return node_group_id_; } - std::vector &DataSyncVec(uint16_t core_id) + std::vector &DataSyncVec() { - return data_sync_vec_[core_id]; + return data_sync_vec_; } - std::vector &ArchiveVec(uint16_t core_id) + std::vector &ArchiveVec() { - return archive_vec_[core_id]; + return archive_vec_; } - std::vector &MoveBaseIdxVec(uint16_t core_id) + std::vector &MoveBaseIdxVec() { - return mv_base_idx_vec_[core_id]; + return mv_base_idx_vec_; } int64_t NodeGroupTerm() const @@ -4312,66 +4127,47 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return store_range_; } - void FixCurrentSliceIndex(uint16_t core_id) + StoreSlice *CurrentSlice() const { - assert(export_base_table_item_); - if (pause_pos_[core_id].first.KeyPtr() != nullptr) - { - size_t curr_slice_idx = 0; - StoreSlice *curr_slice = - slice_coordinator_.pinned_slices_[curr_slice_idx]; - while (curr_slice->EndTxKey() < pause_pos_[core_id].first) - { - ++curr_slice_idx; - assert(curr_slice_idx < - slice_coordinator_.pinned_slices_.size()); - curr_slice = 
slice_coordinator_.pinned_slices_[curr_slice_idx]; - } - curr_slice_index_[core_id] = curr_slice_idx; - } - } - - StoreSlice *CurrentSlice(uint16_t core_id) const - { - size_t curr_slice_idx = curr_slice_index_[core_id]; if (export_base_table_item_) { - assert(curr_slice_idx < slice_coordinator_.pinned_slices_.size()); - return slice_coordinator_.pinned_slices_.at(curr_slice_idx); + assert(curr_slice_index_ < + slice_coordinator_.pinned_slices_.size()); + return slice_coordinator_.pinned_slices_.at(curr_slice_index_); } - assert(curr_slice_idx < slices_to_scan_.size()); - const TxKey &curr_slice_key = slices_to_scan_.at(curr_slice_idx).first; + assert(curr_slice_index_ < slices_to_scan_.size()); + const TxKey &curr_slice_key = + slices_to_scan_.at(curr_slice_index_).first; return store_range_->FindSlice(curr_slice_key); } - const TxKey &CurrentSliceKey(uint16_t core_id) const + const TxKey &CurrentSliceKey() const { assert(!export_base_table_item_); - size_t curr_slice_index = curr_slice_index_[core_id]; - assert(curr_slice_index < slices_to_scan_.size()); - return slices_to_scan_[curr_slice_index].first; + assert(curr_slice_index_ < slices_to_scan_.size()); + return slices_to_scan_[curr_slice_index_].first; } - void MoveToNextSlice(uint16_t core_id) + void MoveToNextSlice() { - curr_slice_index_[core_id]++; + curr_slice_index_++; } - bool TheBatchEnd(uint16_t core_id) const + bool TheBatchEnd() const { - return curr_slice_index_[core_id] >= + return curr_slice_index_ >= (export_base_table_item_ ? slice_coordinator_.pinned_slices_.size() : slice_coordinator_.batch_end_slice_index_); } - bool IsSlicePinned(uint16_t core_id) const + bool IsSlicePinned() const { assert(export_base_table_item_ || - curr_slice_index_[core_id] < slices_to_scan_.size()); + curr_slice_index_ < slices_to_scan_.size()); return export_base_table_item_ ? 
true - : slices_to_scan_[curr_slice_index_[core_id]].second; + : slices_to_scan_[curr_slice_index_].second; } uint64_t SchemaVersion() const override @@ -4379,11 +4175,6 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return schema_version_; } - void SetUnfinishedCoreCnt(uint16_t core_cnt) - { - unfinished_cnt_ = core_cnt; - } - void UnpinSlices() { if (slice_coordinator_.first_slice_id_.Range() != nullptr) @@ -4427,13 +4218,10 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase return last_data_sync_ts_; } - std::vector accumulated_scan_cnt_; - std::vector accumulated_flush_data_size_; + size_t accumulated_scan_cnt_; + uint64_t accumulated_flush_data_size_; - // std::vector is not safe to use in multi-threaded environment, - std::vector scan_heap_is_full_{0}; - - size_t scan_count_{0}; + uint32_t scan_heap_is_full_{0}; private: struct SliceCoordinator @@ -4553,7 +4341,6 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase const TableName *table_name_{nullptr}; uint32_t node_group_id_; int64_t node_group_term_; - uint16_t core_cnt_; // It is used as a hint to decide if a page has dirty data since last round // of checkpoint. It is guaranteed that all entries committed before this ts // are synced into data store. @@ -4561,10 +4348,10 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase // Target ts. Collect all data changes committed before this ts into data // sync vec. uint64_t data_sync_ts_; - std::vector> data_sync_vec_; - std::vector> archive_vec_; + std::vector data_sync_vec_; + std::vector archive_vec_; // Cache the entries to move record from "base" table to "archive" table - std::vector> mv_base_idx_vec_; + std::vector mv_base_idx_vec_; // Start/end key of target range if the scan is on a range only, nullptr if // it's on entire table. @@ -4573,11 +4360,11 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase // Position that we left off during last round of ckpt scan. 
// pause_pos_.first is the key that we stopped at (has not been scanned // though), bool is if this core has finished scanning all keys already. - std::vector> pause_pos_; + std::pair pause_pos_; size_t scan_batch_size_; CcErrorCode err_{CcErrorCode::NO_ERROR}; - uint32_t unfinished_cnt_; + bool finished_{false}; std::mutex mux_; std::condition_variable cv_; @@ -4595,7 +4382,7 @@ struct RangePartitionDataSyncScanCc : public CcRequestBase // The index of the current slice to be scanned. If export_base_table_item_ // is true, it is the index of the SliceCoordinator::pinned_slices_ vector, // and if false, it is the index of the slices_to_scan_ vector. - std::vector curr_slice_index_; + size_t curr_slice_index_; // keep schema vesion after acquire read lock on catalog, to prevent the // concurrency issue with Truncate Table, detail ref to tx issue #1130 // If schema_version_ is 0, the check will be bypassed, since this data sync @@ -4837,7 +4624,10 @@ struct ReplayLogCc : public TemplatedCcRequest std::shared_ptr range_split_started = nullptr, std::unordered_set *range_splitting = nullptr, uint16_t first_core = 0, - ParseDataLogCc *parse_cc = nullptr) + ParseDataLogCc *parse_cc = nullptr, + const std::unordered_map> + *split_range_info = nullptr) { table_name_holder_ = TableName(table_name_view, table_type, table_engine); @@ -4865,6 +4655,15 @@ struct ReplayLogCc : public TemplatedCcRequest is_lock_recovery_ = is_lock_recovery; upsert_kv_err_code_ = {true, CcErrorCode::NO_ERROR}; parse_cc_ = parse_cc; + split_ranges_ = nullptr; + if (split_range_info != nullptr) + { + auto table_it = split_range_info->find(table_name_holder_); + if (table_it != split_range_info->end()) + { + split_ranges_ = &table_it->second; + } + } } ReplayLogCc(const ReplayLogCc &rhs) = delete; @@ -5063,6 +4862,16 @@ struct ReplayLogCc : public TemplatedCcRequest return first_core_; } + uint64_t RangeSplitCommitTs(int32_t range_id) const + { + if (split_ranges_ == nullptr) + { + return 0; + } + 
auto it = split_ranges_->find(range_id); + return it == split_ranges_->end() ? 0 : it->second; + } + void SetOffset(size_t offset) { offset_ = offset; @@ -5130,6 +4939,9 @@ struct ReplayLogCc : public TemplatedCcRequest CcErrorCode::NO_ERROR}; ParseDataLogCc *parse_cc_{nullptr}; + // Range split commit ts per range for the current table, if available. + const std::unordered_map *split_ranges_{nullptr}; + friend std::ostream &operator<<(std::ostream &outs, txservice::ReplayLogCc *r); }; @@ -5146,7 +4958,10 @@ struct ParseDataLogCc : public CcRequestBase std::atomic &status, std::atomic &on_fly_cnt, bool &recovery_error, - const bool is_lock_recovery = false) + const bool is_lock_recovery = false, + const std::unordered_map> + *split_range_info = nullptr) { log_records_sv_ = std::string_view(log_records.data(), log_records.size()); @@ -5158,6 +4973,7 @@ struct ParseDataLogCc : public CcRequestBase on_fly_cnt_ = &on_fly_cnt; recovery_error_ = &recovery_error; is_lock_recovery_ = is_lock_recovery; + split_range_info_ = split_range_info; } void Reset(::txlog::ReplayMessage &&replay_message, @@ -5167,7 +4983,10 @@ struct ParseDataLogCc : public CcRequestBase std::atomic &status, std::atomic &on_fly_cnt, bool &recovery_error, - const bool is_lock_recovery = false) + const bool is_lock_recovery = false, + const std::unordered_map> + *split_range_info = nullptr) { replay_message_ = std::make_unique<::txlog::ReplayMessage>(std::move(replay_message)); @@ -5182,13 +5001,15 @@ struct ParseDataLogCc : public CcRequestBase on_fly_cnt_ = &on_fly_cnt; recovery_error_ = &recovery_error; is_lock_recovery_ = is_lock_recovery; + split_range_info_ = split_range_info; } bool Execute(CcShard &ccs) override { size_t offset = 0; // core of first key in log - int dest_core = 0; + uint32_t core_rand = butil::fast_rand(); + uint16_t dest_core = static_cast(core_rand % ccs.core_cnt_); std::vector replay_cc_list; replay_cc_list.reserve(160); while (offset < log_records_sv_.size()) @@ -5259,10 
+5080,19 @@ struct ParseDataLogCc : public CcRequestBase uint32_t kv_len = *reinterpret_cast( blob.data() + blob_offset); blob_offset += sizeof(uint32_t); - size_t hash = ccs.GetCatalogFactory(table_engine) - ->KeyHash(blob.data(), blob_offset, nullptr); - dest_core = hash ? (hash & 0x3FF) % ccs.core_cnt_ - : (dest_core + 1) % ccs.core_cnt_; + if (table_engine == TableEngine::EloqSql || + table_engine == TableEngine::EloqDoc) + { + dest_core = (dest_core + 1) % ccs.core_cnt_; + } + else + { + size_t hash = + ccs.GetCatalogFactory(table_engine) + ->KeyHash(blob.data(), blob_offset, nullptr); + dest_core = hash ? (hash & 0x3FF) % ccs.core_cnt_ + : (dest_core + 1) % ccs.core_cnt_; + } ReplayLogCc *cc_req = replay_cc_pool_.NextRequest(); replay_cc_list.push_back(cc_req); assert(cc_ng_term_ >= 0); @@ -5283,7 +5113,8 @@ struct ParseDataLogCc : public CcRequestBase nullptr, nullptr, dest_core, - this); + this, + split_range_info_); blob_offset += kv_len; } @@ -5321,6 +5152,8 @@ struct ParseDataLogCc : public CcRequestBase std::atomic *on_fly_cnt_; bool *recovery_error_; bool is_lock_recovery_; + const std::unordered_map> + *split_range_info_{nullptr}; }; struct BroadcastStatisticsCc @@ -6649,7 +6482,6 @@ struct UpdateKeyCacheCc : public CcRequestBase void Reset(const TableName &tbl_name, uint32_t ng_id, int64_t ng_term, - size_t core_cnt, const TxKey &start_key, const TxKey &end_key, StoreRange *range, @@ -6663,10 +6495,8 @@ struct UpdateKeyCacheCc : public CcRequestBase start_key_ = &start_key; end_key_ = &end_key; store_range_ = range; - unfinished_core_ = core_cnt; hd_res_ = res; - paused_pos_.clear(); - paused_pos_.resize(core_cnt); + paused_pos_ = TxKey(); } bool Execute(CcShard &ccs) override @@ -6674,7 +6504,8 @@ struct UpdateKeyCacheCc : public CcRequestBase int64_t ng_term = Sharder::Instance().LeaderTerm(node_group_id_); if (ng_term < 0 || ng_term != ng_term_) { - return SetFinish(); + SetFinish(); + return true; } CcMap *ccm = ccs.GetCcm(*table_name_, 
node_group_id_); @@ -6683,14 +6514,9 @@ struct UpdateKeyCacheCc : public CcRequestBase return ccm->Execute(*this); } - bool SetFinish() + void SetFinish() { - if (unfinished_core_.fetch_sub(1, std::memory_order_acq_rel) == 1) - { - hd_res_->SetFinished(); - return true; - } - return false; + hd_res_->SetFinished(); } const TableName *table_name_{nullptr}; @@ -6699,8 +6525,7 @@ struct UpdateKeyCacheCc : public CcRequestBase const TxKey *start_key_{nullptr}; const TxKey *end_key_{nullptr}; StoreRange *store_range_{nullptr}; - std::vector paused_pos_; - std::atomic unfinished_core_; + TxKey paused_pos_; CcHandlerResult *hd_res_{nullptr}; }; @@ -7714,7 +7539,9 @@ struct CollectMemStatsCc : public CcRequestBase struct UploadBatchCc : public CcRequestBase { + // keys, records, commit_ts, rec_status, range_size_flags using WriteEntryTuple = std::tuple; @@ -7731,10 +7558,10 @@ struct UploadBatchCc : public CcRequestBase void Reset(const TableName &table_name, txservice::NodeGroupId ng_id, int64_t &ng_term, - size_t core_cnt, + int32_t partition_id, size_t batch_size, size_t start_key_idx, - const std::vector &entry_vec, + const std::vector> &entry_vec, bthread::Mutex &req_mux, bthread::ConditionVariable &req_cv, size_t &finished_req_cnt, @@ -7745,6 +7572,7 @@ struct UploadBatchCc : public CcRequestBase node_group_id_ = ng_id; node_group_term_ = &ng_term; is_remote_ = false; + partition_id_ = partition_id; batch_size_ = batch_size; start_key_idx_ = start_key_idx; entry_vector_ = &entry_vec; @@ -7752,16 +7580,17 @@ struct UploadBatchCc : public CcRequestBase req_cv_ = &req_cv; finished_req_cnt_ = &finished_req_cnt; req_result_ = &req_result; - unfinished_cnt_.store(core_cnt, std::memory_order_relaxed); + unfinished_cnt_.store(1, std::memory_order_relaxed); err_code_.store(CcErrorCode::NO_ERROR, std::memory_order_relaxed); paused_pos_.clear(); - paused_pos_.resize(core_cnt, {}); + paused_pos_.resize(1, {}); data_type_ = data_type; } void Reset(const TableName &table_name, 
txservice::NodeGroupId ng_id, int64_t &ng_term, + int32_t partition_id, size_t core_cnt, uint32_t batch_size, const WriteEntryTuple &entry_tuple, @@ -7774,6 +7603,7 @@ struct UploadBatchCc : public CcRequestBase node_group_id_ = ng_id; node_group_term_ = &ng_term; is_remote_ = true; + partition_id_ = partition_id; batch_size_ = batch_size; start_key_idx_ = 0; entry_tuples_ = &entry_tuple; @@ -7916,7 +7746,12 @@ struct UploadBatchCc : public CcRequestBase return batch_size_; } - const std::vector *EntryVector() const + int32_t PartitionId() const + { + return partition_id_; + } + + const std::vector> *EntryVector() const { return is_remote_ ? nullptr : entry_vector_; } @@ -7931,19 +7766,23 @@ struct UploadBatchCc : public CcRequestBase size_t key_off, size_t rec_off, size_t ts_off, - size_t status_off) + size_t status_off, + size_t flags_off) { + core_id = partition_id_ >= 0 ? 0 : core_id; auto &key_pos = paused_pos_.at(core_id); std::get<0>(key_pos) = key_index; std::get<1>(key_pos) = key_off; std::get<2>(key_pos) = rec_off; std::get<3>(key_pos) = ts_off; std::get<4>(key_pos) = status_off; + std::get<5>(key_pos) = flags_off; } - const std::tuple &GetPausedPosition( - uint16_t core_id) const + const std::tuple & + GetPausedPosition(uint16_t core_id) const { + core_id = partition_id_ >= 0 ? 0 : core_id; return paused_pos_.at(core_id); } @@ -7967,12 +7806,14 @@ struct UploadBatchCc : public CcRequestBase uint32_t node_group_id_{0}; int64_t *node_group_term_{nullptr}; bool is_remote_{false}; + // -1 means broadcast to all shards(used by hash partition) + int32_t partition_id_{-1}; uint32_t batch_size_{0}; size_t start_key_idx_{0}; union { - // for local request - const std::vector *entry_vector_; + // for local request: (range_size_flags, WriteEntry*) + const std::vector> *entry_vector_; // for remote request const WriteEntryTuple *entry_tuples_; }; @@ -7984,8 +7825,10 @@ struct UploadBatchCc : public CcRequestBase // This two variables may be accessed by multi-cores. 
std::atomic unfinished_cnt_{0}; std::atomic err_code_{CcErrorCode::NO_ERROR}; - // key index, key offset, record offset, ts offset, record status offset - std::vector> paused_pos_; + // key index, key offset, record offset, ts offset, record status offset, + // range_size_flags offset + std::vector> + paused_pos_; UploadBatchType data_type_{UploadBatchType::SkIndexData}; }; @@ -8270,25 +8113,19 @@ struct UploadBatchSlicesCc : public CcRequestBase void Reset(const TableName &table_name, txservice::NodeGroupId ng_id, int64_t &ng_term, - size_t core_cnt, const WriteEntryTuple &entry_tuple, std::shared_ptr slice_info) { table_name_ = &table_name; node_group_id_ = ng_id; node_group_term_ = &ng_term; - core_cnt_ = core_cnt; - partitioned_slice_data_.resize(core_cnt); - next_idxs_.resize(core_cnt); - for (size_t i = 0; i < core_cnt; i++) - { - next_idxs_[i] = 0; - } + slice_data_.clear(); + next_idx_ = 0; entry_tuples_ = &entry_tuple; slices_info_ = slice_info; - unfinished_cnt_ = core_cnt; + finished_ = false; err_code_ = CcErrorCode::NO_ERROR; } @@ -8354,14 +8191,12 @@ struct UploadBatchSlicesCc : public CcRequestBase std::pair> SetFinish() { std::unique_lock req_lk(req_mux_); - if (--unfinished_cnt_ == 0) - { - // Make a copy of slices_info_ to avoid race condition. - std::shared_ptr slices_info = slices_info_; - req_cv_.notify_one(); - return {true, std::move(slices_info)}; - } - return {false, nullptr}; + finished_ = true; + + // Make a copy of slices_info_ to avoid race condition. 
+ std::shared_ptr slices_info = slices_info_; + req_cv_.notify_one(); + return {true, std::move(slices_info)}; } bool SetError(CcErrorCode err_code) @@ -8371,13 +8206,9 @@ struct UploadBatchSlicesCc : public CcRequestBase { err_code_ = err_code; } - if (--unfinished_cnt_ == 0) - { - req_cv_.notify_one(); - - return true; - } - return false; + finished_ = true; + req_cv_.notify_one(); + return true; } void AbortCcRequest(CcErrorCode err_code) override @@ -8394,7 +8225,7 @@ struct UploadBatchSlicesCc : public CcRequestBase void Wait() { std::unique_lock lk(req_mux_); - while (unfinished_cnt_ != 0) + while (!finished_) { req_cv_.wait(lk); } @@ -8457,7 +8288,7 @@ struct UploadBatchSlicesCc : public CcRequestBase } void SetParsed() { - parsed_.store(true, std::memory_order_release); + parsed_ = true; } void AddDataItem(TxKey key, @@ -8465,34 +8296,26 @@ struct UploadBatchSlicesCc : public CcRequestBase uint64_t version_ts, bool is_deleted) { - size_t hash = key.Hash(); - // Uses the lower 10 bits of the hash code to shard the key across - // CPU cores at this node. - uint16_t core_code = hash & 0x3FF; - uint16_t core_id = core_code % core_cnt_; - - partitioned_slice_data_[core_id].emplace_back( + slice_data_.emplace_back( std::move(key), std::move(record), version_ts, is_deleted); } - size_t NextIndex(size_t core_idx) const + size_t NextIndex() const { - size_t next_idx = next_idxs_[core_idx]; - assert(next_idx <= partitioned_slice_data_[core_idx].size()); - return next_idx; + assert(next_idx_ <= slice_data_.size()); + return next_idx_; } - void SetNextIndex(size_t core_idx, size_t index) + void SetNextIndex(size_t index) { - assert(index <= partitioned_slice_data_[core_idx].size()); - next_idxs_[core_idx] = index; + assert(index <= slice_data_.size()); + next_idx_ = index; } // Notice: these data items belong to multi slices. 
- std::deque &SliceData(uint16_t core_id) + std::deque &SliceData() { - assert(core_id < partitioned_slice_data_.size()); - return partitioned_slice_data_[core_id]; + return slice_data_; } bool AbortIfOom() const override @@ -8501,7 +8324,6 @@ struct UploadBatchSlicesCc : public CcRequestBase } private: - uint16_t core_cnt_; const TableName *table_name_{nullptr}; uint32_t node_group_id_{0}; int64_t *node_group_term_{nullptr}; @@ -8514,17 +8336,16 @@ struct UploadBatchSlicesCc : public CcRequestBase // key offset, record offset, ts offset, record status offset // when parse items std::tuple parse_offset_{0, 0, 0, 0}; - // parse items on one core, then put the req to other cores. - std::atomic_bool parsed_{false}; + bool parsed_{false}; - std::vector> partitioned_slice_data_; + std::deque slice_data_; // pause position when emplace keys into ccmap in batches - std::vector next_idxs_; + size_t next_idx_; bthread::Mutex req_mux_{}; bthread::ConditionVariable req_cv_{}; // This two variables may be accessed by multi-cores. - size_t unfinished_cnt_{0}; + bool finished_{false}; CcErrorCode err_code_{CcErrorCode::NO_ERROR}; }; @@ -8747,7 +8568,6 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase uint64_t scan_ts, uint64_t ng_id, int64_t ng_term, - uint64_t core_cnt, uint64_t txn, const TxKey &target_start_key, const TxKey &target_end_key, @@ -8764,20 +8584,14 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase store_range_(store_range), is_dirty_(is_dirty), has_dml_since_ddl_(false), - unfinished_cnt_(core_cnt), + finished_(false), schema_version_(schema_version) { tx_number_ = txn; - pause_pos_.resize(core_cnt); + pause_pos_.first = std::move(TxKey()); + pause_pos_.second = nullptr; size_t slice_cnt = store_range ? 
store_range->SlicesCount() : 0; - for (size_t i = 0; i < core_cnt; ++i) - { - slice_delta_size_.emplace_back(); - if (slice_cnt > 0) - { - slice_delta_size_.back().reserve(slice_cnt); - } - } + slice_delta_size_.reserve(slice_cnt); } bool ValidTermCheck() const @@ -8820,26 +8634,22 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase void Wait() { std::unique_lock lk(mux_); - cv_.wait(lk, [this] { return unfinished_cnt_ == 0; }); + cv_.wait(lk, [this] { return finished_; }); } void SetFinish() { std::unique_lock lk(mux_); - if (--unfinished_cnt_ == 0) - { - cv_.notify_one(); - } + finished_ = true; + cv_.notify_one(); } void SetError(CcErrorCode err) { std::unique_lock lk(mux_); err_ = err; - if (--unfinished_cnt_ == 0) - { - cv_.notify_one(); - } + finished_ = true; + cv_.notify_one(); } bool IsError() @@ -8901,18 +8711,18 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase assert(store_range); bool res = store_range_.compare_exchange_strong( expect, store_range, std::memory_order_acq_rel); - slice_delta_size_[core_id].reserve(store_range->SlicesCount()); + slice_delta_size_.reserve(store_range->SlicesCount()); return res; } - std::pair &PausedPos(size_t core_id) + std::pair &PausedPos() { - return pause_pos_[core_id]; + return pause_pos_; } - std::vector> &SliceDeltaSize(size_t core_id) + std::vector> &SliceDeltaSize() { - return slice_delta_size_[core_id]; + return slice_delta_size_; } bool IsDirty() const @@ -8956,10 +8766,10 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase // pause_pos_.first is the key that we stopped at (has not been scanned // though), .second is the slice that we stopped in (has not been scanned // completed yet). - std::vector> pause_pos_; + std::pair pause_pos_; // The delta size of the slices. First is the TxKey of the slice, second is // the delta size. The TxKey is not the owner of the key. 
- std::vector>> slice_delta_size_; + std::vector> slice_delta_size_; // Generally, if the size of a key in the data store is unknown (the // data_store_size_ is INT32_MAX), we need to read the storage (via @@ -8977,7 +8787,7 @@ struct ScanSliceDeltaSizeCcForRangePartition : public CcRequestBase std::atomic has_dml_since_ddl_{false}; CcErrorCode err_{CcErrorCode::NO_ERROR}; - uint32_t unfinished_cnt_; + bool finished_{false}; uint64_t schema_version_; std::mutex mux_; std::condition_variable cv_; diff --git a/tx_service/include/cc/cc_shard.h b/tx_service/include/cc/cc_shard.h index 09e4081d..c1554381 100644 --- a/tx_service/include/cc/cc_shard.h +++ b/tx_service/include/cc/cc_shard.h @@ -315,6 +315,11 @@ class CcShard */ CcMap *GetCcm(const TableName &table_name, uint32_t node_group); + void FetchTableRangeSize(const TableName &table_name, + int32_t partition_id, + NodeGroupId cc_ng_id, + int64_t cc_ng_term); + void AdjustDataKeyStats(const TableName &table_name, int64_t size_delta, int64_t dirty_delta); @@ -1138,6 +1143,10 @@ class CcShard } } + void ResetRangeSplittingStatus(const TableName &table_name, + uint32_t ng_id, + uint32_t range_id); + FillStoreSliceCc *NewFillStoreSliceCc() { return fill_store_slice_cc_pool_.NextRequest(); @@ -1156,6 +1165,12 @@ class CcShard void DeleteSchemaCntl(const TableName &tbl_name); + void CreateSplitRangeDataSyncTask(const TableName &table_name, + uint32_t ng_id, + int64_t ng_term, + int32_t range_id, + uint64_t data_sync_ts); + void ClearNativeSchemaCntl(); void CollectCacheHit(); void CollectCacheMiss(); @@ -1222,6 +1237,7 @@ class CcShard CcRequestPool fill_store_slice_cc_pool_; CcRequestPool init_key_cache_cc_pool_; + CcRequestPool fetch_range_size_cc_pool_; // CcRequest queue on this shard/core. 
moodycamel::ConcurrentQueue cc_queue_; diff --git a/tx_service/include/cc/ccm_scanner.h b/tx_service/include/cc/ccm_scanner.h index 7de8dbb2..96c5d898 100644 --- a/tx_service/include/cc/ccm_scanner.h +++ b/tx_service/include/cc/ccm_scanner.h @@ -424,7 +424,6 @@ class CcScanner return TxKey(); } - virtual void ResetShards(size_t shard_cnt) = 0; virtual void ResetCaches() = 0; virtual void Reset(const KeySchema *key_schema) = 0; virtual void Close() = 0; @@ -466,16 +465,6 @@ class CcScanner virtual uint32_t ShardCount() const = 0; - virtual void CommitAtCore(uint16_t core_id) - { - assert(false); - } - - virtual void FinalizeCommit() - { - assert(false); - } - ScanDirection Direction() const { return direct_; @@ -841,12 +830,6 @@ class HashParitionCcScanner : public CcScanner { } - void ResetShards(size_t shard_cnt) override - { - assert(false && - "ResetShards is designed for RangePartitionedCcmScanner."); - } - void ResetCaches() override { for (auto &[shard_code, cache] : shard_caches_) @@ -1199,7 +1182,9 @@ class RangePartitionedCcmScanner : public CcScanner RangePartitionedCcmScanner(ScanDirection direct, ScanIndexType index_type, const KeySchema *schema) - : CcScanner(direct, index_type), scans_(), key_schema_(schema) + : CcScanner(direct, index_type), + scan_cache_(this, schema), + key_schema_(schema) { } @@ -1207,113 +1192,59 @@ class RangePartitionedCcmScanner : public CcScanner { } - void ResetShards(size_t shard_cnt) override - { - size_t old_size = scans_.size(); - if (shard_cnt > old_size) - { - scans_.reserve(shard_cnt); - index_chain_.reserve(shard_cnt); - for (size_t idx = old_size; idx < shard_cnt; ++idx) - { - scans_.emplace_back(this, key_schema_); - index_chain_.emplace_back(); - } - } - else if (shard_cnt < old_size) - { - for (size_t idx = shard_cnt; idx < old_size; ++idx) - { - scans_.pop_back(); - } - index_chain_.resize(shard_cnt); - } - - assert(scans_.size() == shard_cnt); - - for (size_t idx = 0; idx < old_size && idx < shard_cnt; ++idx) - 
{ - scans_[idx].Reset(); - index_chain_[idx].clear(); - } - - std::unique_lock lk(mux_); - head_index_ = Inf(); - head_occupied_ = false; - } - void ResetCaches() override { - for (size_t core_id = 0; core_id < scans_.size(); ++core_id) - { - scans_[core_id].Reset(); - index_chain_[core_id].clear(); - } - - head_index_ = Inf(); - head_occupied_ = false; + scan_cache_.Reset(); } ScanCache *Cache(uint32_t shard_code) override { - // For RangePartitionedCcmScanner, shard_code is core_id. - return &scans_[shard_code]; + (void) shard_code; + return &scan_cache_; } void ShardCacheSizes(std::vector> *shard_code_and_sizes) const override { - for (size_t core_id = 0; core_id < scans_.size(); ++core_id) - { - shard_code_and_sizes->emplace_back(core_id, scans_[core_id].Size()); - } + shard_code_and_sizes->emplace_back(0u, scan_cache_.Size()); } void MemoryShardCacheLastTuples( std::vector *last_tuples) const override { - last_tuples->reserve(scans_.size()); - for (size_t core_id = 0; core_id < scans_.size(); ++core_id) - { - last_tuples->emplace_back(scans_[core_id].LastTuple()); - } + last_tuples->emplace_back(scan_cache_.LastTuple()); } void MemoryShardCacheTrailingTuples( std::vector *trailing_tuples) const override { - for (size_t core_id = 0; core_id < scans_.size(); ++core_id) - { - scans_[core_id].TrailingTuples(*trailing_tuples); - } + scan_cache_.TrailingTuples(*trailing_tuples); } const ScanTuple *Current() override { - if (head_index_ == Inf()) + if (status_ != ScannerStatus::Open) { - status_ = ScannerStatus::Blocked; return nullptr; } - else + + const TemplateScanTuple *tuple = scan_cache_.Current(); + if (tuple == nullptr) { - assert(status_ == ScannerStatus::Open); - return At(head_index_); + status_ = ScannerStatus::Blocked; } + + return tuple; } void MoveNext() override { - if (head_index_ == Inf()) + if (status_ != ScannerStatus::Open) { return; } - head_index_ = AdvanceMergeIndex(head_index_); - if (head_index_ == Inf()) - { - status_ = 
ScannerStatus::Blocked; - } + scan_cache_.MoveNext(); } CcmScannerType Type() const override @@ -1342,7 +1273,7 @@ class RangePartitionedCcmScanner : public CcScanner uint32_t ShardCount() const override { - return scans_.size(); + return 1; } void Reset(const KeySchema *key_schema) override @@ -1354,289 +1285,11 @@ class RangePartitionedCcmScanner : public CcScanner void Close() override { status_ = ScannerStatus::Closed; - scans_.clear(); - index_chain_.clear(); - head_index_ = Inf(); - head_occupied_ = false; - } - - /** - * @brief Commits the scan at the specified core. - * - * @param core_id - */ - void CommitAtCore(uint16_t core_id) override - { - size_t sz = scans_[core_id].Size(); - if (sz > 0) - { - std::vector &next_chain = index_chain_[core_id]; - assert(next_chain.empty()); - next_chain.reserve(sz); - - for (uint32_t idx = 0; idx < sz - 1; ++idx) - { - next_chain.emplace_back(core_id, idx + 1); - } - // The next index of the last tuple is infinity. - next_chain.emplace_back(Inf()); - assert(next_chain.size() == sz); - - if (is_require_sort_) - { - CompoundIndex head_index(core_id, 0); - MergeCompoundIndex(head_index); - } - else - { - // Concat. Delay concat to FinalizeCommit() to avoid lock. - } - } - } - - void FinalizeCommit() override - { - if (is_require_sort_) - { - // Already sorted by CommitAtCore(). 
- } - else - { - ConcatAll(); - } + scan_cache_.Reset(); } private: - struct CompoundIndex - { - public: - CompoundIndex() : index_(UINT32_MAX) - { - } - - CompoundIndex(uint16_t core_id, uint32_t offset) - { - index_ = (offset << 10) | core_id; - } - - friend bool operator==(const CompoundIndex &lhs, - const CompoundIndex &rhs) - { - return lhs.index_ == rhs.index_; - } - - friend bool operator!=(const CompoundIndex &lhs, - const CompoundIndex &rhs) - { - return !(lhs == rhs); - } - - uint16_t CoreId() const - { - return index_ & 0x3FF; - } - - uint32_t Offset() const - { - return index_ >> 10; - } - - private: - /** - * @brief The lower 10 bits represent the core ID. The remaining higher - * bits represent the offset in the scan result vector. - * - */ - uint32_t index_; - }; - - const CompoundIndex &Inf() const - { - static CompoundIndex inf; - return inf; - } - - void MergeCompoundIndex(CompoundIndex head) - { - std::unique_lock lk(mux_); - if (!head_occupied_) - { - // The head is empty. There is nothing to merge. Sets the head to - // the input scan list's head. - head_index_ = head; - head_occupied_ = true; - } - else if (head != Inf()) - { - // Merges the input scan list with the list pointed by the head. - if (head_index_ == Inf()) - { - head_index_ = head; - return; - } - CompoundIndex curr_head = head_index_; - head_occupied_ = false; - - lk.unlock(); - MergeCompoundIndex(head, curr_head); - } - } - - void MergeCompoundIndex(CompoundIndex left, CompoundIndex right) - { - CompoundIndex merge_head; - CompoundIndex prev_index; - - if (left == Inf()) - { - // The left is empty. - return MergeCompoundIndex(right); - } - else if (right == Inf()) - { - // The right is empty. 
- return MergeCompoundIndex(left); - } - - const TemplateScanTuple *left_tuple = At(left); - const TemplateScanTuple *right_tuple = At(right); - - if (IsForward) - { - if (left_tuple->KeyObj() < right_tuple->KeyObj()) - { - merge_head = left; - prev_index = left; - left = AdvanceMergeIndex(left); - } - else - { - merge_head = right; - prev_index = right; - right = AdvanceMergeIndex(right); - } - - while (left != Inf() && right != Inf()) - { - left_tuple = At(left); - right_tuple = At(right); - - if (left_tuple->KeyObj() < right_tuple->KeyObj()) - { - UpdateNextIndex(prev_index, left); - prev_index = left; - left = AdvanceMergeIndex(left); - } - else - { - UpdateNextIndex(prev_index, right); - prev_index = right; - right = AdvanceMergeIndex(right); - } - } - } - else - { - if (left_tuple->KeyObj() < right_tuple->KeyObj()) - { - merge_head = right; - prev_index = right; - right = AdvanceMergeIndex(right); - } - else - { - merge_head = left; - prev_index = left; - left = AdvanceMergeIndex(left); - } - - while (left != Inf() && right != Inf()) - { - left_tuple = At(left); - right_tuple = At(right); - - if (left_tuple->KeyObj() < right_tuple->KeyObj()) - { - UpdateNextIndex(prev_index, right); - prev_index = right; - right = AdvanceMergeIndex(right); - } - else - { - UpdateNextIndex(prev_index, left); - prev_index = left; - left = AdvanceMergeIndex(left); - } - } - } - - if (left != Inf()) - { - UpdateNextIndex(prev_index, left); - } - - if (right != Inf()) - { - UpdateNextIndex(prev_index, right); - } - - MergeCompoundIndex(merge_head); - } - - /** - * @brief Concat all chains at last finished core to avoid lock. 
- */ - void ConcatAll() - { - assert(head_index_ == Inf()); - for (uint16_t core_id = 0; core_id < index_chain_.size(); ++core_id) - { - std::vector &chain = index_chain_[core_id]; - if (!chain.empty()) - { - ConcatLockFree(core_id, chain); - } - } - } - - void ConcatLockFree(uint16_t core_id, std::vector &chain) - { - chain.back() = head_index_; - head_index_ = {core_id, 0}; - } - - CompoundIndex AdvanceMergeIndex(CompoundIndex index) - { - assert(index.CoreId() < index_chain_.size()); - assert(index.Offset() < index_chain_[index.CoreId()].size()); - - return index_chain_[index.CoreId()][index.Offset()]; - } - - const TemplateScanTuple *At(CompoundIndex index) const - { - assert(index.CoreId() < scans_.size()); - assert(index.Offset() < scans_[index.CoreId()].Size()); - - return scans_[index.CoreId()].At(index.Offset()); - } - - void UpdateNextIndex(CompoundIndex prev_index, CompoundIndex index) - { - assert(prev_index.CoreId() < index_chain_.size()); - assert(prev_index.Offset() < index_chain_[prev_index.CoreId()].size()); - - index_chain_[prev_index.CoreId()][prev_index.Offset()] = index; - } - - // Scan caches of the target node group. Its size is core count of the - // target node. - std::vector> scans_; - std::vector> index_chain_; - std::mutex mux_; - bool head_occupied_{false}; - CompoundIndex head_index_{Inf()}; - + TemplateScanCache scan_cache_; const KeySchema *key_schema_; /** * @brief The term of the cc node group where the range partition resides. 
diff --git a/tx_service/include/cc/local_cc_handler.h b/tx_service/include/cc/local_cc_handler.h index eae6ba46..8e0fb115 100644 --- a/tx_service/include/cc/local_cc_handler.h +++ b/tx_service/include/cc/local_cc_handler.h @@ -103,7 +103,9 @@ class LocalCcHandler : public CcHandler const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres) override; + CcHandlerResult &hres, + int32_t partition_id = -1, + bool on_dirty_range = false) override; CcReqStatus PostRead( uint64_t tx_number, diff --git a/tx_service/include/cc/local_cc_shards.h b/tx_service/include/cc/local_cc_shards.h index 961bee52..870eb7a0 100644 --- a/tx_service/include/cc/local_cc_shards.h +++ b/tx_service/include/cc/local_cc_shards.h @@ -1129,7 +1129,6 @@ class LocalCcShards template RangeSliceOpStatus AddKeyToKeyCache(const TableName &table_name, NodeGroupId cc_ng_id, - uint16_t core_id, const KeyT &key) { std::shared_lock lk(meta_data_mux_); @@ -1156,7 +1155,7 @@ class LocalCcShards return RangeSliceOpStatus::Error; } store_range->UpdateLastAccessedTs(ClockTs()); - return store_range->AddKey(key, core_id); + return store_range->AddKey(key); } template @@ -1757,6 +1756,12 @@ class LocalCcShards uint64_t txn, CcHandlerResult *hres); + void CreateSplitRangeDataSyncTask(const TableName &table_name, + uint32_t ng_id, + int64_t ng_term, + int32_t range_id, + uint64_t data_sync_ts); + std::pair PinStoreRange( const TableName &table_name, const NodeGroupId ng_id, @@ -1913,7 +1918,8 @@ class LocalCcShards bool can_be_skipped, uint64_t &last_sync_ts, std::shared_ptr status, - CcHandlerResult *hres); + CcHandlerResult *hres, + bool high_priority = false); bool EnqueueDataSyncTaskToCore( const TableName &table_name, uint32_t ng_id, @@ -2120,7 +2126,6 @@ class LocalCcShards .GetLocalCcShards() ->GetRangeOwner(new_range_id_, ng_id_) ->BucketOwner(); - assert(new_range_owner_ != ng_id_); dest_node_id_ = Sharder::Instance().LeaderNodeId(new_range_owner_); channel_ = @@ 
-2303,7 +2308,7 @@ class LocalCcShards { // `0` means no pending task uint64_t latest_pending_task_ts_{0}; - std::queue> pending_tasks_; + std::deque> pending_tasks_; uint64_t UnsetLatestPendingTs() { diff --git a/tx_service/include/cc/object_cc_map.h b/tx_service/include/cc/object_cc_map.h index a2b31c8e..bbd4d17b 100644 --- a/tx_service/include/cc/object_cc_map.h +++ b/tx_service/include/cc/object_cc_map.h @@ -1571,7 +1571,8 @@ class ObjectCcMap : public TemplateCcMap next_ts_offset = ts_offset; next_status_offset = status_offset; - auto [key_str, rec_str, ts_str, status_str] = *entry_tuples; + auto [key_str, rec_str, ts_str, status_str, flags_str] = + *entry_tuples; // deserialize key decoded_key.Deserialize( key_str.data(), next_key_offset, KeySchema()); @@ -1739,7 +1740,8 @@ class ObjectCcMap : public TemplateCcMap key_offset, rec_offset, ts_offset, - status_offset); + status_offset, + 0); shard_->Enqueue(shard_->LocalCoreId(), &req); return false; } diff --git a/tx_service/include/cc/range_cc_map.h b/tx_service/include/cc/range_cc_map.h index 29b679a5..d2a39d50 100644 --- a/tx_service/include/cc/range_cc_map.h +++ b/tx_service/include/cc/range_cc_map.h @@ -743,7 +743,56 @@ class RangeCcMap : public TemplateCcMap // update previous cce's end key cce->SetCommitTsPayloadStatus(new_range_info->version_ts_, RecordStatus::Normal); + + // Reset new range size on the data table ccmap (emplace if + // absent). 
+ int32_t new_range_id = new_range_info->PartitionId(); + NodeGroupId new_range_owner = + shard_->GetRangeOwner(new_range_id, this->cc_ng_id_) + ->BucketOwner(); + if (new_range_owner == this->cc_ng_id_ && + static_cast((new_range_id & 0x3FF) % + shard_->core_cnt_) == + shard_->core_id_) + { + TableType data_table_type = + TableName::Type(this->table_name_.StringView()); + TableName data_table_name(this->table_name_.StringView(), + data_table_type, + this->table_name_.Engine()); + CcMap *ccm = + shard_->GetCcm(data_table_name, this->cc_ng_id_); + assert(ccm != nullptr); + size_t range_size = new_range_entries.at(idx) + ->TypedStoreRange() + ->PostCkptSize(); + ccm->InitRangeSize(static_cast(new_range_id), + static_cast(range_size), + true, + true); + } + } + // Reset old range size on the data table ccmap (no emplace). + int32_t old_partition_id = + upload_range_rec->GetRangeInfo()->PartitionId(); + if (range_owner == this->cc_ng_id_ && + static_cast((old_partition_id & 0x3FF) % + shard_->core_cnt_) == shard_->core_id_) + { + TableType data_table_type = + TableName::Type(this->table_name_.StringView()); + TableName data_table_name(this->table_name_.StringView(), + data_table_type, + this->table_name_.Engine()); + CcMap *ccm = shard_->GetCcm(data_table_name, this->cc_ng_id_); + assert(ccm != nullptr); + size_t old_range_size = + old_entry->TypedStoreRange()->PostCkptSize(); + ccm->InitRangeSize(static_cast(old_partition_id), + static_cast(old_range_size)); + ccm->ResetRangeStatus(static_cast(old_partition_id)); } + // range_owner_rec_ needs to be reset on each core since they point // to bucket records on different cores. 
upload_range_rec->range_owner_rec_ = @@ -1159,6 +1208,14 @@ class RangeCcMap : public TemplateCcMap // add new range entry to range cc map auto bucket_map = static_cast( shard_->GetCcm(range_bucket_ccm_name, this->cc_ng_id_)); + TableType data_table_type = + TableName::Type(this->table_name_.StringView()); + TableName data_table_name(this->table_name_.StringView(), + data_table_type, + this->table_name_.Engine()); + CcMap *data_ccm = shard_->GetCcm(data_table_name, this->cc_ng_id_); + assert(data_ccm != nullptr); + for (uint idx = 0; idx < new_range_infos.size(); idx++) { const TemplateRangeInfo *new_range_info = @@ -1181,6 +1238,51 @@ class RangeCcMap : public TemplateCcMap new_range_info->PartitionId())); cce->SetCommitTsPayloadStatus(new_range_info->version_ts_, RecordStatus::Normal); + + // Reset new range size on data table ccmap if this core owns + // it. + int32_t new_range_id = new_range_info->PartitionId(); + NodeGroupId new_range_owner = + shard_->GetRangeOwner(new_range_id, this->cc_ng_id_) + ->BucketOwner(); + if (new_range_owner == this->cc_ng_id_ && + static_cast((new_range_id & 0x3FF) % + shard_->core_cnt_) == + shard_->core_id_) + { + const TableRangeEntry *new_range_entry = + shard_->GetTableRangeEntry( + this->table_name_, this->cc_ng_id_, new_range_id); + assert(new_range_entry != nullptr); + size_t range_size = + static_cast *>( + new_range_entry) + ->TypedStoreRange() + ->PostCkptSize(); + data_ccm->InitRangeSize(static_cast(new_range_id), + static_cast(range_size), + true, + true); + } + } + + // Reset old range size on the data table ccmap if this core owns + // it. 
+ int32_t old_range_id = + old_table_range_entry->GetRangeInfo()->PartitionId(); + NodeGroupId range_owner = + shard_->GetRangeOwner(old_range_id, this->cc_ng_id_) + ->BucketOwner(); + if (range_owner == this->cc_ng_id_ && + static_cast((old_range_id & 0x3FF) % + shard_->core_cnt_) == shard_->core_id_) + { + size_t old_range_size = + old_table_range_entry->RangeSlices()->PostCkptSize(); + data_ccm->InitRangeSize(static_cast(old_range_id), + static_cast(old_range_size), + true, + true); } } diff --git a/tx_service/include/cc/range_slice.h b/tx_service/include/cc/range_slice.h index 0291d224..0961534c 100644 --- a/tx_service/include/cc/range_slice.h +++ b/tx_service/include/cc/range_slice.h @@ -303,22 +303,12 @@ class StoreSlice SliceStatus status, bool init_key_cache, bool empty_slice) - : size_(size), - status_(status), - fetch_slice_cc_(nullptr), - cache_validity_((txservice_enable_key_cache && init_key_cache) - ? Sharder::Instance().GetLocalCcShardsCount() - : 0) - { - if (empty_slice && !cache_validity_.empty()) + : size_(size), status_(status), fetch_slice_cc_(nullptr) + { + if (empty_slice && (txservice_enable_key_cache && init_key_cache)) { // If slice is empty, set the key cache as valid at the start. 
- for (uint16_t i = 0; - i < Sharder::Instance().GetLocalCcShardsCount(); - i++) - { - SetKeyCacheValidity(i, true); - } + SetKeyCacheValidity(true); } } @@ -419,42 +409,38 @@ class StoreSlice last_load_ts_ = load_ts; } - bool IsValidInKeyCache(uint16_t core_id) const + bool IsValidInKeyCache() const { - assert(!cache_validity_.empty()); - return cache_validity_[core_id] & 1; + return cache_validity_ & 1; } - void SetKeyCacheValidity(uint16_t core_id, bool valid) + void SetKeyCacheValidity(bool valid) { - assert(!cache_validity_.empty()); if (valid) { - cache_validity_[core_id] |= 1; + cache_validity_ |= 1; } else { - cache_validity_[core_id] &= ~(1); + cache_validity_ &= ~(1); } } - void SetLoadingKeyCache(uint16_t core_id, bool status) + void SetLoadingKeyCache(bool status) { - assert(!cache_validity_.empty()); if (status) { - cache_validity_[core_id] |= (1 << 1); + cache_validity_ |= (1 << 1); } else { - cache_validity_[core_id] &= ~(1 << 1); + cache_validity_ &= ~(1 << 1); } } - bool IsLoadingKeyCache(uint16_t core_id) + bool IsLoadingKeyCache() { - assert(!cache_validity_.empty()); - return cache_validity_[core_id] & (1 << 1); + return cache_validity_ & (1 << 1); } void InitKeyCache(CcShard *cc_shard, @@ -508,13 +494,12 @@ class StoreSlice std::mutex slice_mux_; - // If this slice is included in the range key filter. Each core should only - // access its own bitset, so we do not need mutex protection. - // Note that byte is the smallest unit c++ sync across threads. To avoid - // data corruption we need at least 1 byte for each core mask. - // The first bit implies if the key cache is valid on this core, the second - // bit implies if the key cache is being loaded on this core. - std::vector cache_validity_; + // If this slice is included in the range key filter. The first bit implies + // if the key cache is valid, the second bit implies if the key cache is + // being loaded. 
+ // All keys in this range are sharding to the same core, so we only need to + // maintain one cache validity for this range. + uint8_t cache_validity_{0}; friend class StoreRange; template @@ -722,10 +707,9 @@ class StoreRange return last_accessed_ts_.load(std::memory_order_relaxed); } - std::string KeyCacheInfo(uint16_t core_id) const + std::string KeyCacheInfo() const { - assert(core_id < key_cache_.size()); - return key_cache_[core_id]->Info(); + return key_cache_->Info(); } void SetHasDmlSinceDdl() @@ -856,8 +840,9 @@ class StoreRange // cache. Removing keys from cache when they are evicted reduces the number // of look ups to find the slice of the key since we can evict the keys in // batch. - std::vector>> - key_cache_; + // All keys in this range are sharding to the same core, so we only need to + // maintain one key cache for this range. + std::unique_ptr> key_cache_; std::atomic last_init_key_cache_time_{0}; // This variable is used during the upsert table scheme transaction(such as, @@ -957,7 +942,7 @@ class TemplateStoreRange : public StoreRange slice_end, slice_size, slice_status, - !key_cache_.empty()); + key_cache_ != nullptr); slices_.emplace_back(std::move(slice)); @@ -970,12 +955,12 @@ class TemplateStoreRange : public StoreRange slice_size = slice_keys[idx].size_; slice_status = slice_keys[idx].status_; - slice = - std::make_unique>(slice_start, - slice_end, - slice_size, - slice_status, - !key_cache_.empty()); + slice = std::make_unique>( + slice_start, + slice_end, + slice_size, + slice_status, + key_cache_ != nullptr); slices_.emplace_back(std::move(slice)); @@ -1063,25 +1048,24 @@ class TemplateStoreRange : public StoreRange return slices_; } - void InvalidateKeyCache(uint16_t core_id) + void InvalidateKeyCache() { - if (key_cache_.empty()) + if (key_cache_ == nullptr) { return; } LOG(INFO) << "Invalidate key cache of range " << partition_id_ - << " on core " << core_id << " due to collision"; + << " due to collision"; std::shared_lock 
s_lk(mux_); // shared lock to avoid slice split for (auto &slice : slices_) { - slice->SetKeyCacheValidity(core_id, false); + slice->SetKeyCacheValidity(false); } // Create a larger key cache if the old one cannot hold enough keys. - size_t last_key_cache_size = key_cache_[core_id]->Size(); - key_cache_[core_id] = - std::make_unique>( - last_key_cache_size * 1.2); + size_t last_key_cache_size = key_cache_->Size(); + key_cache_ = std::make_unique>( + last_key_cache_size * 1.2); } /** * @brief Split the range with new_end. new_end will be the new @@ -1212,7 +1196,7 @@ class TemplateStoreRange : public StoreRange } CODE_FAULT_INJECTOR("PinSlices_Fail", { LOG(INFO) << "FaultInject PinSlices_Fail, " << check_key_cache - << ", is valid " << slice->IsValidInKeyCache(shard_id); + << ", is valid " << slice->IsValidInKeyCache(); if (slice->status_ == SliceStatus::FullyCached) { slice->status_ = SliceStatus::PartiallyCached; @@ -1305,9 +1289,9 @@ class TemplateStoreRange : public StoreRange else if (check_key_cache) { assert(to_prefetch == false); - if (slice->IsValidInKeyCache(shard_id)) + if (slice->IsValidInKeyCache()) { - bool found = ContainsKey(search_key, shard_id); + bool found = ContainsKey(search_key); if (!found) { // If the key is not found in range, directly return and @@ -1318,7 +1302,7 @@ class TemplateStoreRange : public StoreRange // If key is found in range key cache, the key must exist in kv // store. Load slice from kv to get the value. 
} - else if (!slice->IsLoadingKeyCache(shard_id)) + else if (!slice->IsLoadingKeyCache()) { // If this slice can use key cache but the key cache is not // intialized, always load slice from kv to initialize the key @@ -1628,17 +1612,16 @@ class TemplateStoreRange : public StoreRange return true; } - void DeleteKey(const KeyT &key, uint16_t core_id, StoreSlice *slice) + void DeleteKey(const KeyT &key, StoreSlice *slice) { if (slice == nullptr) { TxKey search_key(&key); slice = FindSlice(search_key); } - if (slice->IsValidInKeyCache(core_id)) + if (slice->IsValidInKeyCache()) { - cuckoofilter::Status status = - key_cache_[core_id]->Delete(key.Hash()); + cuckoofilter::Status status = key_cache_->Delete(key.Hash()); // We should not try to delete a non-existing key. if (status == cuckoofilter::Status::NotFound) { @@ -1651,9 +1634,9 @@ class TemplateStoreRange : public StoreRange } // NOTE: The slice to which the @@key belong must be valid in key cache. - void DeleteKey(const KeyT &key, uint16_t core_id) + void DeleteKey(const KeyT &key) { - cuckoofilter::Status status = key_cache_[core_id]->Delete(key.Hash()); + cuckoofilter::Status status = key_cache_->Delete(key.Hash()); // We should not try to delete a non-existing key. 
if (status == cuckoofilter::Status::NotFound) { @@ -1663,7 +1646,6 @@ class TemplateStoreRange : public StoreRange } RangeSliceOpStatus AddKey(const KeyT &key, - uint16_t core_id, StoreSlice *slice = nullptr, bool init = false) { @@ -1673,10 +1655,10 @@ class TemplateStoreRange : public StoreRange TxKey search_key(&key); slice = FindSlice(search_key); } - if (init || slice->IsValidInKeyCache(core_id)) + if (init || slice->IsValidInKeyCache()) { - assert(init || !slice->IsLoadingKeyCache(core_id)); - cuckoofilter::Status status = key_cache_[core_id]->Add(key.Hash()); + assert(init || !slice->IsLoadingKeyCache()); + cuckoofilter::Status status = key_cache_->Add(key.Hash()); if (status == cuckoofilter::Status::Ok) { return RangeSliceOpStatus::Successful; @@ -1685,11 +1667,11 @@ class TemplateStoreRange : public StoreRange { assert(status == cuckoofilter::Status::NotEnoughSpace); // Add failed, we need to invalidate the filter. - InvalidateKeyCache(core_id); + InvalidateKeyCache(); return RangeSliceOpStatus::Error; } } - else if (slice->IsLoadingKeyCache(core_id)) + else if (slice->IsLoadingKeyCache()) { // Retry later when key cache is initialized. 
return RangeSliceOpStatus::Retry; @@ -1720,10 +1702,9 @@ class TemplateStoreRange : public StoreRange } } - bool ContainsKey(const KeyT &key, uint16_t core_id) + bool ContainsKey(const KeyT &key) { - return key_cache_[core_id]->Contain(key.Hash()) == - cuckoofilter::Status::Ok; + return key_cache_->Contain(key.Hash()) == cuckoofilter::Status::Ok; } size_t PostCkptSize() override @@ -1940,7 +1921,7 @@ class TemplateStoreRange : public StoreRange sub_slice_end, split_keys[idx].cur_size_, SliceStatus::PartiallyCached, - !slice->cache_validity_.empty()); + slice->cache_validity_ != 0); sub_slice->post_ckpt_size_ = split_keys[idx].post_update_size_; sub_slice->status_ = slice->status_; diff --git a/tx_service/include/cc/template_cc_map.h b/tx_service/include/cc/template_cc_map.h index 77fb5be1..136b9b00 100644 --- a/tx_service/include/cc/template_cc_map.h +++ b/tx_service/include/cc/template_cc_map.h @@ -38,6 +38,7 @@ #include #include +#include "absl/container/flat_hash_map.h" #include "cc_entry.h" #include "cc_map.h" #include "cc_page_clean_guard.h" @@ -250,7 +251,7 @@ class TemplateCcMap : public CcMap auto it = Iterator(cce_ptr, ccp, &neg_inf_); target_key = it->first; auto res = shard_->local_shards_.AddKeyToKeyCache( - table_name_, cc_ng_id_, shard_->core_id_, *target_key); + table_name_, cc_ng_id_, *target_key); if (res == RangeSliceOpStatus::Retry) { // If the insert fails due to key cache is being @@ -418,10 +419,7 @@ class TemplateCcMap : public CcMap // or auto incr pk insert, the ReadCc is skipped and we // need to update key cache here. 
auto res = shard_->local_shards_.AddKeyToKeyCache( - table_name_, - cc_ng_id_, - shard_->core_id_, - *target_key); + table_name_, cc_ng_id_, *target_key); if (res == RangeSliceOpStatus::Retry) { // If the insert fails due to key cache is being @@ -591,6 +589,8 @@ class TemplateCcMap : public CcMap cce->ArchiveBeforeUpdate(); } + [[maybe_unused]] const size_t old_payload_size = + cce->PayloadSize(); if (is_del) { cce->payload_.SetCurrentPayload(nullptr); @@ -612,6 +612,42 @@ class TemplateCcMap : public CcMap bool was_dirty = cce->IsDirty(); cce->SetCommitTsPayloadStatus(commit_ts, new_status); + if constexpr (RangePartitioned) + { + if (req.NeedUpdateRangeSize()) + { + const int64_t key_delta_size = + (new_status == RecordStatus::Deleted) + ? (-static_cast(write_key->Size() + + old_payload_size)) + : (cce_old_status != RecordStatus::Normal + ? static_cast( + write_key->Size() + + cce->PayloadSize()) + : static_cast( + cce->PayloadSize() - + old_payload_size)); + const uint32_t range_id = req.PartitionId(); + // is_dirty: true when range is splitting. + bool need_split = UpdateRangeSize( + range_id, + static_cast(key_delta_size), + req.OnDirtyRange()); + + if (need_split) + { + assert(!req.OnDirtyRange()); + // Create a data sync task for the range. + shard_->CreateSplitRangeDataSyncTask( + table_name_, + cc_ng_id_, + cce_addr->Term(), + range_id, + commit_ts); + } + } + } + if (req.IsInitialInsert()) { // Updates the ckpt ts after commit ts is set. 
@@ -1673,7 +1709,6 @@ class TemplateCcMap : public CcMap static_cast *>( slice_id.Range()); auto res = range->AddKey(*look_key, - shard_->core_id_, slice_id.Slice()); if (res == RangeSliceOpStatus::Error) { @@ -3434,7 +3469,8 @@ class TemplateCcMap : public CcMap if (ng_term < 0 || (req.RangeCcNgTerm() > 0 && req.RangeCcNgTerm() != ng_term)) { - return req.SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + req.SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + return true; } if (req.SchemaVersion() != 0 && req.SchemaVersion() != schema_ts_) @@ -3443,41 +3479,14 @@ class TemplateCcMap : public CcMap return true; } - if (req.SendResponseIfFinished()) + if (req.IsWaitForSnapshot()) { + assert(req.WaitForSnapshotCnt() == 0); req.UnpinSlices(); + req.SetFinish(); return true; } - if (req.IsWaitForSnapshot(shard_->core_id_)) - { - assert(req.WaitForSnapshotCnt(shard_->core_id_) == 0); - if (req.SetFinish()) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } - } - CcOperation cc_op; bool is_read_snapshot; if (table_name_.Type() == TableType::Secondary || @@ -3544,18 +3553,17 @@ class TemplateCcMap : public CcMap req.SetEndKey(TxKey(std::move(decoded_end_key))); } - uint16_t core_id = shard_->LocalCoreId(); TemplateScanCache *scan_cache = nullptr; RemoteScanSliceCache *remote_scan_cache = nullptr; if (req.IsLocal()) { scan_cache = static_cast *>( - req.GetLocalScanCache(core_id)); + req.GetLocalScanCache()); assert(scan_cache != nullptr); } else { - remote_scan_cache = req.GetRemoteScanCache(core_id); + remote_scan_cache = req.GetRemoteScanCache(); assert(remote_scan_cache != nullptr); } @@ -3597,10 +3605,6 @@ class TemplateCcMap : public CcMap if 
(req.SliceId().Slice() == nullptr) { - // The scan slice request is first dispatched to one core, which - // pins the slice in memory. After the slice is pinned, the request - // is dispatched to other cores to scan in parallel. The slice is - // unpinned by the last core finishing the scan batch. RangeSliceOpStatus pin_status = RangeSliceOpStatus::NotPinned; uint32_t max_pin_cnt = req.PrefetchSize(); const StoreSlice *last_pinned_slice; @@ -3650,7 +3654,8 @@ class TemplateCcMap : public CcMap { if (slice_id.Range()->HasLock()) { - return req.SetError(CcErrorCode::OUT_OF_MEMORY); + req.SetError(CcErrorCode::OUT_OF_MEMORY); + return true; } else { @@ -3667,27 +3672,12 @@ class TemplateCcMap : public CcMap { // If the pin operation returns an error, the data store // is inaccessible. - return req.SetError(CcErrorCode::PIN_RANGE_SLICE_FAILED); + req.SetError(CcErrorCode::PIN_RANGE_SLICE_FAILED); + return true; } assert(pin_status == RangeSliceOpStatus::Successful); req.PinSlices(slice_id, last_pinned_slice); - // Update unfinished cnt before dispatching to remaining cores. - req.SetUnfinishedCoreCnt(req.GetShardCount()); - - // Dispatches to remaining cores to scan pinned slice(s) in - // parallel. 
- for (uint16_t core_id = 0; core_id < shard_->local_shards_.Count(); - ++core_id) - { - if (core_id == shard_->core_id_) - { - continue; - } - - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, core_id, &req); - } } Iterator scan_ccm_it; @@ -3745,7 +3735,6 @@ class TemplateCcMap : public CcMap case CcErrorCode::MVCC_READ_MUST_WAIT_WRITE: { req.SetBlockingInfo( - shard_->core_id_, reinterpret_cast(cce->GetLockAddr()), scan_type, ScanBlockingType::BlockOnFuture); @@ -3754,7 +3743,6 @@ class TemplateCcMap : public CcMap case CcErrorCode::ACQUIRE_LOCK_BLOCKED: { req.SetBlockingInfo( - shard_->core_id_, reinterpret_cast(cce->GetLockAddr()), scan_type, ScanBlockingType::BlockOnLock); @@ -3814,7 +3802,7 @@ class TemplateCcMap : public CcMap assert(fetch_ret_status == store::DataStoreHandler::DataStoreOpStatus::Success); (void) fetch_ret_status; - req.IncreaseWaitForSnapshotCnt(shard_->core_id_); + req.IncreaseWaitForSnapshotCnt(); } } else @@ -3864,14 +3852,14 @@ class TemplateCcMap : public CcMap assert(fetch_ret_status == store::DataStoreHandler::DataStoreOpStatus::Success); (void) fetch_ret_status; - req.IncreaseWaitForSnapshotCnt(shard_->core_id_); + req.IncreaseWaitForSnapshotCnt(); } } return {ScanReturnType::Success, CcErrorCode::NO_ERROR}; }; - uint64_t cce_lock_addr = req.BlockingCceLockAddr(core_id); + uint64_t cce_lock_addr = req.BlockingCceLockAddr(); if (cce_lock_addr != 0) { KeyGapLockAndExtraData *lock = @@ -3881,7 +3869,7 @@ class TemplateCcMap : public CcMap CcEntry *>( lock->GetCcEntry()); - auto [blocking_type, scan_type] = req.BlockingPair(core_id); + auto [blocking_type, scan_type] = req.BlockingPair(); CcPage *ccp = static_cast< CcPage *>( @@ -3936,43 +3924,16 @@ class TemplateCcMap : public CcMap assert(lock_pair.second == CcErrorCode::MVCC_READ_FOR_WRITE_CONFLICT); - if (req.IsLocal()) - { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - req.WaitForSnapshotCnt(shard_->core_id_) > 0) + if 
(is_read_snapshot && req.WaitForSnapshotCnt() > 0) { - req.SetIsWaitForSnapshot(shard_->core_id_); + req.SetIsWaitForSnapshot(); req.DeferSetError(lock_pair.second); return false; } - if (req.SetError(lock_pair.second)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } + req.UnpinSlices(); + req.SetError(lock_pair.second); + return true; } is_locked = lock_pair.first != LockType::NoLock; @@ -4019,7 +3980,7 @@ class TemplateCcMap : public CcMap store::DataStoreHandler::DataStoreOpStatus:: Success); (void) fetch_ret_status; - req.IncreaseWaitForSnapshotCnt(shard_->core_id_); + req.IncreaseWaitForSnapshotCnt(); } } else @@ -4070,7 +4031,7 @@ class TemplateCcMap : public CcMap store::DataStoreHandler::DataStoreOpStatus:: Success); (void) fetch_ret_status; - req.IncreaseWaitForSnapshotCnt(shard_->core_id_); + req.IncreaseWaitForSnapshotCnt(); } } } @@ -4116,90 +4077,39 @@ class TemplateCcMap : public CcMap } RangeScanSliceResult &slice_result = hd_res->Value(); - auto [final_end_tx_key, end_finalized] = slice_result.PeekLastKey(); if (req.Direction() == ScanDirection::Forward) { const TemplateStoreSlice *last_slice = static_cast *>( req.LastPinnedSlice()); - // The scan at core 0 sets the scan's end key. By default, the - // scan's end is the exclusive end of the slice or the request's - // specified end key, whichever is smaller. In case keys in the - // slice are too many to fit into the scan cache, the key right - // after the last scanned tuple at core 0 becomes the exclusive end - // of scans at other cores. In such a case, it is mandatory that all - // keys smaller than the end key at other cores are returned in this - // batch. 
So, scans at other cores may slightly exceed the scan - // cache's capacity. - + // By default, the scan's end is the exclusive end of the slice or + // the request's specified end key, whichever is smaller. In case + // keys in the slice are too many to fit into the scan cache, the + // key right after the last scanned tuple becomes the exclusive end + // of scans. const KeyT *initial_end = nullptr; bool init_end_inclusive = false; - // Given the scan batch's final end key, deduces the local scan's - // end and inclusiveness. - auto deduce_scan_end = - [](const KeyT *batch_end_key, - const KeyT *req_end_key, - bool req_inclusive) -> std::pair - { - const KeyT *end = nullptr; - bool inclusive = false; - - assert(batch_end_key != nullptr); - // If the request specifies the end key and it is the scan - // batch's end key, the scan's inclusiveness is determined by - // the request. Or, the scan batch's end must be the exclusive - // end of a slice or positive infinity. - if (batch_end_key == req_end_key) - { - end = req_end_key; - inclusive = req_inclusive; - } - else - { - end = batch_end_key; - inclusive = false; - } - - return {end, inclusive}; - }; + // Takes the smaller of the slice's last key and the request's end + // key as the local scan's initial end. + const KeyT *slice_end = last_slice->EndKey(); + assert(slice_end != nullptr); - if (!end_finalized) + // If the request specifies the end key and it falls into the + // slice, initializes the local scan's end to the request's end + // key. Or, the scan end is the slice's end. + if (req_end_key != nullptr && + (*req_end_key < *slice_end || + (*req_end_key == *slice_end && !req.EndInclusive()))) { - // This scan batch's end key has not been set. Takes the smaller - // of the slice's last key and the request's end key as the - // local scan's initial end. The initial end may be modified, if - // another core finishes earlier and finalizes the batch's end - // before this core. 
The final end may be smaller or greater - // than the initial end. - const KeyT *slice_end = last_slice->EndKey(); - assert(slice_end != nullptr); - - // If the request specifies the end key and it falls into the - // slice, initializes the local scan's end to the request's end - // key. Or, the scan end is the slice's end. - if (req_end_key != nullptr && - (*req_end_key < *slice_end || - (*req_end_key == *slice_end && !req.EndInclusive()))) - { - initial_end = req_end_key; - init_end_inclusive = req.EndInclusive(); - } - else - { - initial_end = slice_end; - init_end_inclusive = false; - } + initial_end = req_end_key; + init_end_inclusive = req.EndInclusive(); } else { - // This scan batch's end key has been finalized by one of the - // cores. Deduces the local scan's end and inclusiveness. - std::tie(initial_end, init_end_inclusive) = - deduce_scan_end(final_end_tx_key->GetKey(), - req_end_key, - req.EndInclusive()); + initial_end = slice_end; + init_end_inclusive = false; } auto scan_batch_func = @@ -4226,12 +4136,11 @@ class TemplateCcMap : public CcMap return {scan_ret, err_code}; }; - auto scan_loop_func = [this, &scan_batch_func, &is_cache_full]( - Iterator &scan_ccm_it, - const KeyT &end_key, - bool inclusive, - bool end_finalized) - -> std::pair + auto scan_loop_func = + [this, &scan_batch_func, &is_cache_full]( + Iterator &scan_ccm_it, + const KeyT &end_key, + bool inclusive) -> std::pair { ScanReturnType scan_ret = ScanReturnType::Success; CcErrorCode err_code = CcErrorCode::NO_ERROR; @@ -4283,7 +4192,7 @@ class TemplateCcMap : public CcMap scan_ccm_it = End(); ccp = nullptr; } - else if (!end_finalized && is_cache_full()) + else if (is_cache_full()) { scan_ccm_it = Iterator(ccp->next_page_, 0, &neg_inf_); @@ -4305,50 +4214,23 @@ class TemplateCcMap : public CcMap return {scan_ret, err_code}; }; - auto [scan_ret, err] = scan_loop_func( - scan_ccm_it, *initial_end, init_end_inclusive, end_finalized); + auto [scan_ret, err] = + scan_loop_func(scan_ccm_it, 
*initial_end, init_end_inclusive); switch (scan_ret) { case ScanReturnType::Blocked: return false; case ScanReturnType::Error: - if (req.IsLocal()) + if (is_read_snapshot && req.WaitForSnapshotCnt() > 0) { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - req.WaitForSnapshotCnt(shard_->core_id_) > 0) - { - req.SetIsWaitForSnapshot(shard_->core_id_); + req.SetIsWaitForSnapshot(); req.DeferSetError(err); return false; } - if (req.SetError(err)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } + req.UnpinSlices(); + req.SetError(err); + return true; case ScanReturnType::Yield: shard_->Enqueue(shard_->core_id_, &req); return false; @@ -4356,234 +4238,61 @@ class TemplateCcMap : public CcMap break; } - // If the end of this scan batch is not finalized when the local - // scan at this core started, tries to set the batch's end using the - // local end. If another core has finalized the batch's end, the - // scan at this core may need to be adjusted: if the batch's final - // end is less than the end at this core, keys after the final end - // needs to be removed from the local scan cache; if the batch's - // final end is greater than the end of this core, keys smaller than - // the batch's final end but greater than the local end need to be - // included in the local scan cache. - if (!end_finalized) - { - const KeyT *local_end = nullptr; - SlicePosition slice_position; - - // scan_ccm_it points to the entry after the last scanned tuple. - // If the slice ends with positive infinity and has been fully - // scanned, scan_ccm_it would point to positive infinity. 
- auto pos_inf_it = End(); - if (scan_ccm_it != pos_inf_it && - (*scan_ccm_it->first < *initial_end || - (init_end_inclusive && - *scan_ccm_it->first == *initial_end))) - { - // The slice is too large. The scan has not fully scanned - // the slice, before reaching the cache's size limit. - // Pretends the slice's exclusive end to be the key after - // the last scanned tuple, from which the next scan batch - // resume. - local_end = scan_ccm_it->first; - slice_position = SlicePosition::Middle; - } - else - { - // The slice has been fully scanned. If the request - // specifies the end key, which falls into the slice, given - // that the slice has been fully scanned, no future scan - // batches are needed. So, we pretend that the scan has - // reached the last slice ending with positive infinity. - // The calling tx will terminate the scan. - if (initial_end == KeyT::PositiveInfinity() || - req_end_key == initial_end) - { - local_end = initial_end; - slice_position = SlicePosition::LastSlice; - } - else - { - // The local scan end must be the end of the slice. - local_end = initial_end; - const TemplateStoreRange *range = - static_cast *>( - req.SliceId().Range()); - const KeyT *range_end = range->RangeEndKey(); - if (range_end != nullptr && *initial_end == *range_end) - { - slice_position = SlicePosition::LastSliceInRange; - } - else - { - slice_position = SlicePosition::Middle; - } - } - } - - auto [batch_end, set_success] = - slice_result.UpdateLastKey(local_end, slice_position); + const KeyT *local_end = nullptr; + SlicePosition slice_position; - if (set_success) + // scan_ccm_it points to the entry after the last scanned tuple. + // If the slice ends with positive infinity and has been fully + // scanned, scan_ccm_it would point to positive infinity. + auto pos_inf_it = End(); + if (scan_ccm_it != pos_inf_it && + (*scan_ccm_it->first < *initial_end || + (init_end_inclusive && *scan_ccm_it->first == *initial_end))) + { + // The slice is too large. 
The scan has not fully scanned + // the slice, before reaching the cache's size limit. + // Pretends the slice's exclusive end to be the key after + // the last scanned tuple, from which the next scan batch + // resume. + local_end = scan_ccm_it->first; + slice_position = SlicePosition::Middle; + } + else + { + // The slice has been fully scanned. If the request + // specifies the end key, which falls into the slice, given + // that the slice has been fully scanned, no future scan + // batches are needed. So, we pretend that the scan has + // reached the last slice ending with positive infinity. + // The calling tx will terminate the scan. + if (initial_end == KeyT::PositiveInfinity() || + req_end_key == initial_end) { - req.SetRangeCcNgTerm(ng_term); + local_end = initial_end; + slice_position = SlicePosition::LastSlice; } else { - // The local scan tries to set the scan batch's end, but the - // scan at another core have set the batch's end. The scan - // results need to be adjusted, if the results include the - // keys greater than the batch's end, or the results miss - // some keys smaller than the batch's end. - auto [end_key, end_inclusive] = deduce_scan_end( - batch_end, req_end_key, req.EndInclusive()); - size_t trailing_cnt = 0; - - // Excludes keys from the scan cache greater than the - // batch's end. - if (req.IsLocal()) + // The local scan end must be the end of the slice. + local_end = initial_end; + const TemplateStoreRange *range = + static_cast *>( + req.SliceId().Range()); + const KeyT *range_end = range->RangeEndKey(); + if (range_end != nullptr && *initial_end == *range_end) { - while (scan_cache->Size() > 0) - { - // If req.is_require_keys_ is false, the KeyT object - // in scan cache is invalid, so, should use the cce, - // which is valid in any situation, to get the - // corresponding key. 
- auto last_cce = - reinterpret_cast *>( - scan_cache->Last()->cce_ptr_); - while (scan_ccm_it->second != last_cce) - { - --scan_ccm_it; - assert(scan_ccm_it != Begin()); - } - const KeyT *last_key = - static_cast(scan_ccm_it->first); - if (*end_key < *last_key || - (*end_key == *last_key && !end_inclusive)) - { - ++trailing_cnt; - // Remove cce from scan cache, but keep possible - // locks, because those locks might acquired by - // other ScanSliceCc/ReadCc from the - // transaction. - scan_cache->RemoveLast(); - } - else - { - // Reset iterator to the key after the last - // scanned tuple since we might need to continue - // scanning if trailing_cnt == 0. - ++scan_ccm_it; - break; - } - } + slice_position = SlicePosition::LastSliceInRange; } else { - while (remote_scan_cache->Size() > 0) - { - // Cc entry pointers here are always valid since - // the slices are still pinned so the cce cannot - // be kicked from memory regardless of the lock - // type. - auto last_remote_cce = - reinterpret_cast *>( - remote_scan_cache->LastCce()); - while (scan_ccm_it->second != last_remote_cce) - { - // As long as remote scan cache is not empty, - // iterator should not reach neg inf. - --scan_ccm_it; - assert(scan_ccm_it != Begin()); - } - const KeyT *last_key = - static_cast(scan_ccm_it->first); - if (*end_key < *last_key || - (*end_key == *last_key && !end_inclusive)) - { - trailing_cnt++; - // Remove cce from scan cache, but keep possible - // locks, because those locks might acquired by - // other ScanSliceCc/ReadCc from the - // transaction. - remote_scan_cache->RemoveLast(); - } - else - { - // Reset iterator to the key after the last - // scanned tuple since we might need to continue - // scanning if trailing_cnt == 0. - ++scan_ccm_it; - break; - } - } - } - - // If no key is removed from the scan cache, it's possible - // that the local scan may miss keys smaller than the - // batch's end. Re-scans the cc map using the batch's end. 
- if (trailing_cnt == 0) - { - auto [scan_ret, err] = scan_loop_func( - scan_ccm_it, *end_key, end_inclusive, true); - switch (scan_ret) - { - case ScanReturnType::Blocked: - return false; - case ScanReturnType::Error: - if (req.IsLocal()) - { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - req.WaitForSnapshotCnt(shard_->core_id_) > 0) - { - req.SetIsWaitForSnapshot(shard_->core_id_); - req.DeferSetError(err); - return false; - } - - if (req.SetError(err)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } - case ScanReturnType::Yield: - shard_->Enqueue(shard_->core_id_, &req); - return false; - default: - break; - } + slice_position = SlicePosition::Middle; } } } + slice_result.SetLastKey(local_end, slice_position); + req.SetRangeCcNgTerm(ng_term); + // Sets the iterator to the last cce, which may need to be pinned to // resume the next scan batch. 
if (CcEntry @@ -4596,7 +4305,7 @@ class TemplateCcMap : public CcMap } } } - else + else // Backward scan { const TemplateStoreSlice *last_slice = static_cast *>( @@ -4605,53 +4314,19 @@ class TemplateCcMap : public CcMap const KeyT *initial_end = nullptr; bool init_end_inclusive = false; - auto deduce_scan_end = - [](const KeyT *batch_end_key, - const KeyT *req_end_key, - bool req_inclusive) -> std::pair - { - const KeyT *end = nullptr; - bool inclusive = false; - - if (batch_end_key == req_end_key) - { - end = req_end_key; - inclusive = req_inclusive; - } - else - { - end = batch_end_key; - inclusive = true; - } - - return {end, inclusive}; - }; + const KeyT *slice_begin = last_slice->StartKey(); + assert(slice_begin != nullptr); - if (!end_finalized) + if (req_end_key != nullptr && + (*slice_begin < *req_end_key || *slice_begin == *req_end_key)) { - const KeyT *slice_begin = last_slice->StartKey(); - assert(slice_begin != nullptr); - - if (req_end_key != nullptr && (*slice_begin < *req_end_key || - *slice_begin == *req_end_key)) - { - initial_end = req_end_key; - init_end_inclusive = req.EndInclusive(); - } - else - { - initial_end = slice_begin; - init_end_inclusive = true; - } + initial_end = req_end_key; + init_end_inclusive = req.EndInclusive(); } else { - // This scan batch's end key has been finalized by one of the - // cores. Deduces the local scan's end and inclusiveness. 
- std::tie(initial_end, init_end_inclusive) = - deduce_scan_end(final_end_tx_key->GetKey(), - req_end_key, - req.EndInclusive()); + initial_end = slice_begin; + init_end_inclusive = true; } auto scan_batch_func = @@ -4678,12 +4353,11 @@ class TemplateCcMap : public CcMap return {scan_ret, err_code}; }; - auto scan_loop_func = [this, &scan_batch_func, &is_cache_full]( - Iterator &scan_ccm_it, - const KeyT &end_key, - bool inclusive, - bool end_finalized) - -> std::pair + auto scan_loop_func = + [this, &scan_batch_func, &is_cache_full]( + Iterator &scan_ccm_it, + const KeyT &end_key, + bool inclusive) -> std::pair { ScanReturnType scan_ret = ScanReturnType::Success; CcErrorCode err_code = CcErrorCode::NO_ERROR; @@ -4738,7 +4412,7 @@ class TemplateCcMap : public CcMap scan_ccm_it = Begin(); ccp = nullptr; } - else if (!end_finalized && is_cache_full()) + else if (is_cache_full()) { scan_ccm_it = Iterator(ccp->prev_page_, ccp->prev_page_->Size() - 1, @@ -4761,50 +4435,23 @@ class TemplateCcMap : public CcMap return {scan_ret, err_code}; }; - auto [scan_ret, err] = scan_loop_func( - scan_ccm_it, *initial_end, init_end_inclusive, end_finalized); + auto [scan_ret, err] = + scan_loop_func(scan_ccm_it, *initial_end, init_end_inclusive); switch (scan_ret) { case ScanReturnType::Blocked: return false; case ScanReturnType::Error: - if (req.IsLocal()) - { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - req.WaitForSnapshotCnt(shard_->core_id_) > 0) + if (is_read_snapshot && req.WaitForSnapshotCnt() > 0) { - req.SetIsWaitForSnapshot(shard_->core_id_); + req.SetIsWaitForSnapshot(); req.DeferSetError(err); return false; } - if (req.SetError(err)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - 
return false; - } - } - else - { - return false; - } + req.UnpinSlices(); + req.SetError(err); + return true; case ScanReturnType::Yield: shard_->Enqueue(shard_->core_id_, &req); return false; @@ -4812,234 +4459,61 @@ class TemplateCcMap : public CcMap break; } - // If the end of this scan batch is not finalized when the local - // scan at this core started, tries to set the batch's end using the - // local end. If another core has finalized the batch's end, the - // scan at this core may need to be adjusted: if the batch's final - // end is less than the end at this core, keys before the final end - // needs to be removed from the local scan cache; if the batch's - // final end is smaller than the end of this core, keys greater than - // the batch's final end but less than the local end need to be - // included in the local scan cache. - - if (!end_finalized) - { - const KeyT *local_end = nullptr; - SlicePosition slice_position; - - // scan_ccm_it points to the entry before the last scanned - // tuple. - auto neg_inf_it = Begin(); - if (scan_ccm_it != neg_inf_it && - (*initial_end < *scan_ccm_it->first || - (init_end_inclusive && - *scan_ccm_it->first == *initial_end))) - { - // The slice is too large. The scan has not fully scanned - // the slice, before reaching the cache's size limit. - // Pretends the slice's inclusive start to be the last - // scanned key, from which the next scan batch resumes. - ++scan_ccm_it; - local_end = scan_ccm_it->first; - slice_position = SlicePosition::Middle; - } - else - { - // The slice has been fully scanned. If the request - // specifies the end key, which falls into the slice, given - // that the slice has been fully scanned, no future scan - // batches are needed. So, we pretend that the scan has - // reached the first slice (starting with negative - // infinity). The calling tx will terminate the scan. 
- if (initial_end == KeyT::NegativeInfinity() || - req_end_key == initial_end) - { - local_end = initial_end; - slice_position = SlicePosition::FirstSlice; - } - else - { - // The local scan end must be the start of the slice. - local_end = initial_end; - - const TemplateStoreRange *range = - static_cast *>( - req.SliceId().Range()); - const KeyT *range_start = range->RangeStartKey(); - if (range_start != nullptr && - *initial_end == *range_start) - { - slice_position = SlicePosition::FirstSliceInRange; - } - else - { - slice_position = SlicePosition::Middle; - } - } - } - - auto [batch_end, set_success] = - slice_result.UpdateLastKey(local_end, slice_position); + const KeyT *local_end = nullptr; + SlicePosition slice_position; - if (set_success) + // scan_ccm_it points to the entry before the last scanned + // tuple. + auto neg_inf_it = Begin(); + if (scan_ccm_it != neg_inf_it && + (*initial_end < *scan_ccm_it->first || + (init_end_inclusive && *scan_ccm_it->first == *initial_end))) + { + // The slice is too large. The scan has not fully scanned + // the slice, before reaching the cache's size limit. + // Pretends the slice's inclusive start to be the last + // scanned key, from which the next scan batch resumes. + ++scan_ccm_it; + local_end = scan_ccm_it->first; + slice_position = SlicePosition::Middle; + } + else + { + // The slice has been fully scanned. If the request + // specifies the end key, which falls into the slice, given + // that the slice has been fully scanned, no future scan + // batches are needed. So, we pretend that the scan has + // reached the first slice (starting with negative + // infinity). The calling tx will terminate the scan. + if (initial_end == KeyT::NegativeInfinity() || + req_end_key == initial_end) { - req.SetRangeCcNgTerm(ng_term); + local_end = initial_end; + slice_position = SlicePosition::FirstSlice; } else { - // The local scan tries to set the scan batch's end, but the - // scan at another core have set the batch's end. 
The scan - // results need to be adjusted, if the results include the - // keys smaller than the batch's end, or the results miss - // some keys greater than the batch's end. - auto [end_key, end_inclusive] = deduce_scan_end( - batch_end, req_end_key, req.EndInclusive()); - size_t trailing_cnt = 0; - - // Excludes keys from the scan cache smaller than the - // batch's end. - if (req.IsLocal()) + // The local scan end must be the start of the slice. + local_end = initial_end; + + const TemplateStoreRange *range = + static_cast *>( + req.SliceId().Range()); + const KeyT *range_start = range->RangeStartKey(); + if (range_start != nullptr && *initial_end == *range_start) { - while (scan_cache->Size() > 0) - { - // If req.is_require_keys_ is false, the KeyT object - // in scan cache is invalid, so, should use the cce, - // which is valid in any situation, to get the - // corresponding key. - CcEntry *last_cce = - reinterpret_cast *>( - scan_cache->Last()->cce_ptr_); - while (scan_ccm_it->second != last_cce) - { - ++scan_ccm_it; - assert(scan_ccm_it != End()); - } - const KeyT *last_key = - static_cast(scan_ccm_it->first); - if (*last_key < *end_key || - (*last_key == *end_key && !end_inclusive)) - { - ++trailing_cnt; - scan_cache->RemoveLast(); - } - else - { - // Reset iterator to the key after the last - // scanned tuple since we might need to continue - // scanning if trailing_cnt == 0. - --scan_ccm_it; - break; - } - } + slice_position = SlicePosition::FirstSliceInRange; } else { - while (remote_scan_cache->Size() > 0) - { - // Cc entry pointers here are always valid since - // the slices are still pinned so the cce cannot - // be kicked from memory regardless of the lock - // type. - CcEntry *last_remote_cce = - reinterpret_cast *>( - remote_scan_cache->LastCce()); - while (scan_ccm_it->second != last_remote_cce) - { - // As long as remote scan cache is not empty, - // iterator should not reach pos inf. 
- ++scan_ccm_it; - assert(scan_ccm_it != End()); - } - const KeyT *last_key = - static_cast(scan_ccm_it->first); - if (*last_key < *end_key || - (*last_key == *end_key && !end_inclusive)) - { - trailing_cnt++; - remote_scan_cache->RemoveLast(); - } - else - { - // Reset iterator to the key after the last - // scanned tuple since we might need to continue - // scanning if trailing_cnt == 0. - --scan_ccm_it; - break; - } - } - } - - // If no key is removed from the scan cache, it's possible - // that the local scan may miss keys greater than the - // batch's end. Re-scans the cc map using the batch's end. - if (trailing_cnt == 0) - { - auto [scan_ret, err] = scan_loop_func( - scan_ccm_it, *end_key, end_inclusive, true); - switch (scan_ret) - { - case ScanReturnType::Blocked: - return false; - case ScanReturnType::Error: - if (req.IsLocal()) - { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && - req.WaitForSnapshotCnt(shard_->core_id_) > 0) - { - req.SetIsWaitForSnapshot(shard_->core_id_); - req.DeferSetError(err); - return false; - } - - if (req.SetError(err)) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } - case ScanReturnType::Yield: - shard_->Enqueue(shard_->core_id_, &req); - return false; - default: - break; - } + slice_position = SlicePosition::Middle; } } } + slice_result.SetLastKey(local_end, slice_position); + req.SetRangeCcNgTerm(ng_term); + // Sets the iterator to the last cce, which may need to be pinned to // resume the next scan batch. 
if (CcEntry @@ -5090,47 +4564,15 @@ class TemplateCcMap : public CcMap } } - if (req.IsLocal()) - { - req.GetLocalScanner()->CommitAtCore(core_id); - } - - if (is_read_snapshot && req.WaitForSnapshotCnt(shard_->core_id_) > 0) + if (is_read_snapshot && req.WaitForSnapshotCnt() > 0) { - req.SetIsWaitForSnapshot(shard_->core_id_); + req.SetIsWaitForSnapshot(); return false; } - if (req.SetFinish()) - { - if (req.Result()->Value().is_local_) - { - req.UnpinSlices(); - return true; - } - else if (req.IsResponseSender(shard_->core_id_)) - { - req.SendResponseIfFinished(); - req.UnpinSlices(); - return true; - } - else - { - // Renqueue the cc req to the sender req list. - // We assign a dedicated core to be the response sender instead - // of directly sending the response on the last finished core. - // This is to avoid serialization of response message causing - // one core to become significantly slower than others and would - // end up being the sender of all scan slice response. - shard_->local_shards_.EnqueueCcRequest( - shard_->core_id_, req.Txn(), &req); - return false; - } - } - else - { - return false; - } + req.UnpinSlices(); + req.SetFinish(); + return true; } /** @@ -5524,37 +4966,17 @@ class TemplateCcMap : public CcMap req.slice_coordinator_.UpdatePreparedSliceCnt(prepared_slice_cnt); req.slice_coordinator_.UpdateBatchEnd(); - if (req.export_base_table_item_) - { - // Fix the slice index of the current core - for (uint16_t core_id = 0; core_id < shard_->core_cnt_; - ++core_id) - { - req.FixCurrentSliceIndex(core_id); - } - } req.slice_coordinator_.SetReadyForScan(); - req.SetUnfinishedCoreCnt(shard_->core_cnt_); - - // Dispatch the request to the cores - for (uint16_t core_id = 0; core_id < shard_->core_cnt_; ++core_id) - { - if (core_id == shard_->core_id_) - { - continue; - } - shard_->Enqueue(shard_->LocalCoreId(), core_id, &req); - } } - if (req.IsDrained(shard_->core_id_)) + if (req.IsDrained()) { // scan is already finished on this core - 
req.SetFinish(shard_->core_id_); + req.SetFinish(); return false; } - auto &pause_key_and_is_drained = req.PausePos(shard_->core_id_); + auto &pause_key_and_is_drained = req.PausePos(); auto find_non_empty_slice = [this, &req, &deduce_iterator](const KeyT &search_key) @@ -5568,8 +4990,7 @@ class TemplateCcMap : public CcMap } else { - const TxKey &curr_start_tx_key = - req.CurrentSliceKey(shard_->core_id_); + const TxKey &curr_start_tx_key = req.CurrentSliceKey(); const KeyT *curr_start_key = curr_start_tx_key.GetKey(); start_key = (*curr_start_key < search_key ? &search_key : curr_start_key); @@ -5594,7 +5015,7 @@ class TemplateCcMap : public CcMap const KeyT *slice_end_key = nullptr; do { - store_slice = req.CurrentSlice(shard_->core_id_); + store_slice = req.CurrentSlice(); const TemplateStoreSlice *typed_slice = static_cast *>(store_slice); start_key = @@ -5611,11 +5032,11 @@ class TemplateCcMap : public CcMap } // The current slice is empty, try to find next slice. - req.MoveToNextSlice(shard_->core_id_); + req.MoveToNextSlice(); start_key = nullptr; // Continue to handle the next slice if not the batch end - } while (!req.TheBatchEnd(shard_->core_id_)); + } while (!req.TheBatchEnd()); return {it, end_it, slice_end_key}; }; @@ -5656,16 +5077,14 @@ class TemplateCcMap : public CcMap // If reach to the batch end, it means there are no slices that need to // be scanned. - bool slice_pinned = req.TheBatchEnd(shard_->core_id_) - ? false - : req.IsSlicePinned(shard_->core_id_); + bool slice_pinned = req.TheBatchEnd() ? false : req.IsSlicePinned(); // The following flag is used to mark the behavior of one slice. // Only need to export the key if the key is already persisted, this // will happen when the slice need to split, and should export all the // keys in this slice to get the subslice keys. 
bool export_persisted_key_only = !req.export_base_table_item_ && slice_pinned; - assert(key_it != slice_end_it || req.TheBatchEnd(shard_->core_id_)); + assert(key_it != slice_end_it || req.TheBatchEnd()); // 3. Loop to scan keys // DataSyncScanCc is running on TxProcessor thread. To avoid @@ -5674,8 +5093,7 @@ class TemplateCcMap : public CcMap for (size_t scan_cnt = 0; key_it != slice_end_it && key_it != slice_end_next_page_it && scan_cnt < RangePartitionDataSyncScanCc::DataSyncScanBatchSize && - req.accumulated_scan_cnt_.at(shard_->core_id_) < - req.scan_batch_size_; + req.accumulated_scan_cnt_ < req.scan_batch_size_; ++scan_cnt) { const KeyT *key = key_it->first; @@ -5708,8 +5126,8 @@ class TemplateCcMap : public CcMap { // Reach to the end of current slice. // Move to the next slice. - req.MoveToNextSlice(shard_->core_id_); - if (!req.TheBatchEnd(shard_->core_id_)) + req.MoveToNextSlice(); + if (!req.TheBatchEnd()) { search_start_key = slice_end_key; std::tie(key_it, slice_end_it, slice_end_key) = @@ -5720,9 +5138,7 @@ class TemplateCcMap : public CcMap // If reach to the batch end, it means there are no // slices that need to be scanned. slice_pinned = - req.TheBatchEnd(shard_->core_id_) - ? false - : req.IsSlicePinned(shard_->core_id_); + req.TheBatchEnd() ? 
false : req.IsSlicePinned(); export_persisted_key_only = !req.export_base_table_item_ && slice_pinned; } @@ -5776,20 +5192,19 @@ class TemplateCcMap : public CcMap auto export_result = ExportForCkpt(cce, *key, - req.DataSyncVec(shard_->core_id_), - req.ArchiveVec(shard_->core_id_), - req.MoveBaseIdxVec(shard_->core_id_), + req.DataSyncVec(), + req.ArchiveVec(), + req.MoveBaseIdxVec(), req.data_sync_ts_, recycle_ts, shard_->EnableMvcc(), - req.accumulated_scan_cnt_[shard_->core_id_], + req.accumulated_scan_cnt_, req.export_base_table_item_, req.export_base_table_item_only_, export_persisted_key_only, flush_size); - req.accumulated_flush_data_size_[shard_->core_id_] += - flush_size; + req.accumulated_flush_data_size_ += flush_size; if (export_result.second) { @@ -5806,8 +5221,8 @@ class TemplateCcMap : public CcMap { slice_pinned = false; // Reach to the end of current slice. Move to the next slice. - req.MoveToNextSlice(shard_->core_id_); - if (!req.TheBatchEnd(shard_->core_id_)) + req.MoveToNextSlice(); + if (!req.TheBatchEnd()) { search_start_key = slice_end_key; std::tie(key_it, slice_end_it, slice_end_key) = @@ -5817,9 +5232,8 @@ class TemplateCcMap : public CcMap // If reach to the batch end, it means there are no slices // that need to be scanned. - slice_pinned = req.TheBatchEnd(shard_->core_id_) - ? false - : req.IsSlicePinned(shard_->core_id_); + slice_pinned = + req.TheBatchEnd() ? false : req.IsSlicePinned(); export_persisted_key_only = !req.export_base_table_item_ && slice_pinned; } @@ -5830,7 +5244,7 @@ class TemplateCcMap : public CcMap // scan batch size, or reach to the end slice of the current batch // slices. assert((key_it != slice_end_it && key_it != slice_end_next_page_it) || - req.TheBatchEnd(shard_->core_id_)); + req.TheBatchEnd()); // 4. Check whether the request is finished. 
TxKey next_pause_key; bool no_more_data = @@ -5852,16 +5266,15 @@ class TemplateCcMap : public CcMap if (is_scan_mem_full) { - req.scan_heap_is_full_[shard_->core_id_] = 1; + req.scan_heap_is_full_ = 1; } if (is_scan_mem_full || no_more_data || - req.accumulated_scan_cnt_[shard_->core_id_] >= - req.scan_batch_size_ || - req.TheBatchEnd(shard_->core_id_)) + req.accumulated_scan_cnt_ >= req.scan_batch_size_ || + req.TheBatchEnd()) { // Request is finished - req.SetFinish(shard_->core_id_); + req.SetFinish(); return false; } @@ -6858,37 +6271,33 @@ class TemplateCcMap : public CcMap offset += sizeof(uint8_t); - uint16_t core_id = (key.Hash() & 0x3FF) % shard_->core_cnt_; - if (core_id != shard_->core_id_) - { - // Skips the key in the log record that is not sharded - // to this core. - if (op_type == OperationType::Insert || - op_type == OperationType::Update) - { - rec.Deserialize(log_blob.data(), offset); - } - if (shard_->core_id_ == req.FirstCore() || - (core_id != req.FirstCore() && core_id > shard_->core_id_)) - { - // Move to the smallest unvisited core id - next_core = std::min(core_id, next_core); - } - continue; - } + uint16_t core_id = 0; + bool is_dirty = false; + bool need_update_size = true; + int32_t partition_id = -1; - // Skip records that no longer belong to this ng. - if (RangePartitioned) + if constexpr (RangePartitioned) { const TableRangeEntry *range_entry = shard_->GetTableRangeEntry( table_name_, cc_ng_id_, TxKey(&key)); + if (range_entry == nullptr) + { + // range metadata missing, conservative handling: only + // consume value / skip. 
+ if (op_type == OperationType::Insert || + op_type == OperationType::Update) + { + rec.Deserialize(log_blob.data(), offset); + } + continue; + } + partition_id = range_entry->GetRangeInfo()->PartitionId(); const BucketInfo *bucket_info = shard_->GetBucketInfo( - Sharder::MapRangeIdToBucketId( - range_entry->GetRangeInfo()->PartitionId()), - cc_ng_id_); - // Check if range bucket belongs to this ng or is migrating - // to this ng. + Sharder::MapRangeIdToBucketId(partition_id), cc_ng_id_); + + // Old range bucket does not belong to this ng, nor is it a + // "dirty bucket" migrating to this ng. if (bucket_info->BucketOwner() != cc_ng_id_ && bucket_info->DirtyBucketOwner() != cc_ng_id_) { @@ -6901,20 +6310,60 @@ class TemplateCcMap : public CcMap { const BucketInfo *new_bucket_info = shard_->GetBucketInfo( - Sharder::MapRangeIdToBucketId( - range_entry->GetRangeInfo()->PartitionId()), + Sharder::MapRangeIdToBucketId(new_range_id), cc_ng_id_); if (new_bucket_info->BucketOwner() != cc_ng_id_ && new_bucket_info->DirtyBucketOwner() != cc_ng_id_) { + // Neither old bucket nor new bucket belongs to this + // ng: only consume value and continue. if (op_type != OperationType::Delete) { rec.Deserialize(log_blob.data(), offset); } continue; } + + // new range belongs to this ng: determine core based on + // new_range_id and mark dirty. + core_id = static_cast((new_range_id & 0x3FF) % + shard_->core_cnt_); + is_dirty = true; + + uint64_t range_split_commit_ts = + req.RangeSplitCommitTs(partition_id); + // Only update range size for keys updated during the + // double-write phase. + need_update_size = + (range_split_commit_ts == 0) || + (req.CommitTs() > range_split_commit_ts); + } + else + { + // new_range_id < 0: key still belongs to old range, but + // old range bucket does not belong to this ng. + // Semantically, it should not be applied to this ng: + // only consume and continue. 
+ if (op_type != OperationType::Delete) + { + rec.Deserialize(log_blob.data(), offset); + } + continue; } } + else + { + // Old range bucket belongs to this ng or is migrating to + // this ng. + core_id = static_cast((partition_id & 0x3FF) % + shard_->core_cnt_); + is_dirty = range_entry->GetRangeInfo()->IsDirty(); + + uint64_t range_split_commit_ts = + req.RangeSplitCommitTs(partition_id); + need_update_size = (range_split_commit_ts == 0) || + (req.CommitTs() > range_split_commit_ts); + } } else { @@ -6926,6 +6375,26 @@ class TemplateCcMap : public CcMap { continue; } + core_id = static_cast((key.Hash() & 0x3FF) % + shard_->core_cnt_); + } + + if (core_id != shard_->core_id_) + { + // Skips the key in the log record that is not sharded + // to this core. + if (op_type == OperationType::Insert || + op_type == OperationType::Update) + { + rec.Deserialize(log_blob.data(), offset); + } + if (shard_->core_id_ == req.FirstCore() || + (core_id != req.FirstCore() && core_id > shard_->core_id_)) + { + // Move to the smallest unvisited core id + next_core = std::min(core_id, next_core); + } + continue; } Iterator it = FindEmplace(key); @@ -7000,6 +6469,12 @@ class TemplateCcMap : public CcMap { cce->ArchiveBeforeUpdate(); } + + [[maybe_unused]] const size_t old_payload_size = + cce->PayloadSize(); + [[maybe_unused]] const RecordStatus cce_old_status = + cce->PayloadStatus(); + RecordStatus rec_status; if (op_type == OperationType::Insert || op_type == OperationType::Update) @@ -7021,6 +6496,26 @@ class TemplateCcMap : public CcMap cce->SetCommitTsPayloadStatus(commit_ts, rec_status); OnCommittedUpdate(cce, was_dirty); + if constexpr (RangePartitioned) + { + if (need_update_size) + { + int32_t delta_size = + (rec_status == RecordStatus::Deleted) + ? -static_cast(key.Size() + + old_payload_size) + : static_cast( + cce_old_status != RecordStatus::Normal + ? 
(key.Size() + cce->PayloadSize()) + : (cce->PayloadSize() - + old_payload_size)); + + UpdateRangeSize(static_cast(partition_id), + delta_size, + is_dirty); + } + } + if (commit_ts > last_dirty_commit_ts_) { last_dirty_commit_ts_ = commit_ts; @@ -7205,9 +6700,9 @@ class TemplateCcMap : public CcMap bool Execute(FillStoreSliceCc &req) override { - std::deque &slice_vec = req.SliceData(shard_->core_id_); + std::deque &slice_vec = req.SliceData(); - size_t index = req.NextIndex(shard_->core_id_); + size_t index = req.NextIndex(); size_t last_index = std::min(index + FillStoreSliceCc::MaxScanBatchSize, slice_vec.size()); @@ -7224,11 +6719,12 @@ class TemplateCcMap : public CcMap if (index == slice_vec.size()) { slice_vec.clear(); - return req.SetFinish(shard_); + req.SetFinish(shard_); + return true; } else { - req.SetNextIndex(shard_->core_id_, index); + req.SetNextIndex(index); shard_->Enqueue(shard_->LocalCoreId(), &req); return false; } @@ -7237,17 +6733,18 @@ class TemplateCcMap : public CcMap bool Execute(InitKeyCacheCc &req) override { Iterator map_it, map_end_it; - TxKey &resume_key = req.PauseKey(shard_->core_id_); + TxKey &resume_key = req.PauseKey(); const KeyT *start_key = nullptr; if (!resume_key.KeyPtr()) { // First time being processed. - if (req.Slice().IsValidInKeyCache(shard_->core_id_)) + if (req.Slice().IsValidInKeyCache()) { // No need to init key cache. - return req.SetFinish(shard_->core_id_, true); + req.SetFinish(true); + return true; } - req.Slice().SetLoadingKeyCache(shard_->core_id_, true); + req.Slice().SetLoadingKeyCache(true); start_key = req.Slice().StartTxKey().GetKey(); } else @@ -7307,24 +6804,25 @@ class TemplateCcMap : public CcMap continue; } const KeyT *key = map_it->first; - auto ret = - range->AddKey(*key, shard_->core_id_, &req.Slice(), true); + auto ret = range->AddKey(*key, &req.Slice(), true); if (ret == RangeSliceOpStatus::Error) { // Stop immediately if one of the add key fails. 
- return req.SetFinish(shard_->core_id_, false); + req.SetFinish(false); + return true; } } if (map_it == map_end_it) { - return req.SetFinish(shard_->core_id_, true); + req.SetFinish(true); + return true; } else { // record pause position and resume in next round. TxKey pause_key(map_it->first); - req.SetPauseKey(pause_key, shard_->core_id_); + req.SetPauseKey(pause_key); shard_->Enqueue(&req); return false; } @@ -7399,9 +6897,12 @@ class TemplateCcMap : public CcMap } LruPage *lru_page; uint16_t pause_idx = shard_->core_id_; - if (req.GetCleanType() == CleanType::CleanBucketData) + CleanType clean_type = req.GetCleanType(); + if (clean_type == CleanType::CleanBucketData || + clean_type == CleanType::CleanRangeData) { - // For clean bucket data, cc req is only sent to 1 core. + // For clean bucket data and range data, cc req is only sent to 1 + // core. pause_idx = 0; } if (req.ResumeKey(pause_idx)->KeyPtr() != nullptr) @@ -7501,8 +7002,8 @@ class TemplateCcMap : public CcMap : KeyT::PositiveInfinity(); const KeyT *start_key = - req.paused_pos_[shard_->core_id_].KeyPtr() != nullptr - ? req.paused_pos_[shard_->core_id_].GetKey() + req.paused_pos_.KeyPtr() != nullptr + ? req.paused_pos_.GetKey() : (req.end_key_ != nullptr ? 
req.start_key_->GetKey() : KeyT::NegativeInfinity()); @@ -7539,8 +7040,7 @@ class TemplateCcMap : public CcMap curr_slice = range->FindSlice(*key); it = deduce_iterator(*key); end_it = deduce_iterator(*(curr_slice->EndKey())); - if ((!curr_slice->IsValidInKeyCache(shard_->core_id_) || - it == end_it) && + if ((!curr_slice->IsValidInKeyCache() || it == end_it) && end_it != req_end_it) { // The slice is empty or the slice is invalid in key cache, @@ -7549,7 +7049,7 @@ class TemplateCcMap : public CcMap key = curr_slice->EndKey(); curr_slice = nullptr; } - else if (!curr_slice->IsValidInKeyCache(shard_->core_id_) && + else if (!curr_slice->IsValidInKeyCache() && end_it == req_end_it) { // Reach to the last slice, and the slice is invalid in key @@ -7577,7 +7077,7 @@ class TemplateCcMap : public CcMap { assert(cce->PayloadStatus() == RecordStatus::Normal || cce->PayloadStatus() == RecordStatus::Deleted); - range->DeleteKey(*cce_key, shard_->core_id_); + range->DeleteKey(*cce_key); } // Forward the iterator. 
@@ -7593,12 +7093,13 @@ class TemplateCcMap : public CcMap if (key_it == slice_end_it) { - req.paused_pos_[shard_->core_id_] = TxKey(); - return req.SetFinish(); + req.paused_pos_ = TxKey(); + req.SetFinish(); + return true; } else { - req.paused_pos_[shard_->core_id_] = key_it->first->CloneTxKey(); + req.paused_pos_ = key_it->first->CloneTxKey(); shard_->Enqueue(&req); return false; } @@ -7621,6 +7122,7 @@ class TemplateCcMap : public CcMap auto entry_tuples = req.EntryTuple(); size_t batch_size = req.BatchSize(); size_t start_key_index = req.StartKeyIndex(); + const int32_t partition_id = req.PartitionId(); const TxRecord *req_rec = nullptr; @@ -7630,6 +7132,7 @@ class TemplateCcMap : public CcMap ValueT decoded_rec; uint64_t commit_ts = 0; RecordStatus rec_status = RecordStatus::Normal; + uint8_t range_size_flags = 0; auto &resume_pos = req.GetPausedPosition(shard_->core_id_); size_t key_pos = std::get<0>(resume_pos); @@ -7637,6 +7140,7 @@ class TemplateCcMap : public CcMap size_t rec_offset = std::get<2>(resume_pos); size_t ts_offset = std::get<3>(resume_pos); size_t status_offset = std::get<4>(resume_pos); + size_t flags_offset = std::get<5>(resume_pos); size_t hash = 0; Iterator it; @@ -7649,6 +7153,7 @@ class TemplateCcMap : public CcMap size_t next_rec_offset = 0; size_t next_ts_offset = 0; size_t next_status_offset = 0; + size_t next_flags_offset = 0; for (size_t cnt = 0; key_pos < batch_size && cnt < UploadBatchCc::UploadBatchBatchSize; ++key_pos, ++cnt) @@ -7657,13 +7162,16 @@ class TemplateCcMap : public CcMap next_rec_offset = rec_offset; next_ts_offset = ts_offset; next_status_offset = status_offset; + next_flags_offset = flags_offset; + if (entry_vec != nullptr) { key_idx = start_key_index + key_pos; - // get key - key = entry_vec->at(key_idx)->key_.GetKey(); - // get record - req_rec = entry_vec->at(key_idx)->rec_.get(); + const auto &pair = entry_vec->at(key_idx); + range_size_flags = pair.first; + const WriteEntry *we = pair.second; + key = 
we->key_.GetKey(); + req_rec = we->rec_.get(); if (req_rec) { rec_status = RecordStatus::Normal; @@ -7675,11 +7183,12 @@ class TemplateCcMap : public CcMap commit_val = nullptr; } // get commit ts - commit_ts = entry_vec->at(key_idx)->commit_ts_; + commit_ts = we->commit_ts_; } else { - auto [key_str, rec_str, ts_str, status_str] = *entry_tuples; + auto [key_str, rec_str, ts_str, status_str, flags_str] = + *entry_tuples; // deserialize key decoded_key.Deserialize( key_str.data(), next_key_offset, KeySchema()); @@ -7702,21 +7211,43 @@ class TemplateCcMap : public CcMap // deserialize commit ts commit_ts = *((uint64_t *) (ts_str.data() + next_ts_offset)); next_ts_offset += sizeof(uint64_t); + if (RangePartitioned) + { + range_size_flags = + static_cast(flags_str[next_flags_offset]); + next_flags_offset += sizeof(uint8_t); + } } - hash = key->Hash(); - size_t core_idx = (hash & 0x3FF) % shard_->core_cnt_; - if (!(core_idx == shard_->core_id_) || commit_ts <= 1) + if (commit_ts <= 1) { - // Skip the key that does not belong to this core or - // commit ts does not greater than 1. Move to next key. + // skip the key that commit ts does not greater than 1. key_offset = next_key_offset; rec_offset = next_rec_offset; ts_offset = next_ts_offset; status_offset = next_status_offset; + if constexpr (RangePartitioned) + { + flags_offset = next_flags_offset; + } continue; } + if constexpr (!RangePartitioned) + { + hash = key->Hash(); + size_t core_idx = (hash & 0x3FF) % shard_->core_cnt_; + if (core_idx != shard_->core_id_) + { + // skip the key that does not belong to this core. 
+ key_offset = next_key_offset; + rec_offset = next_rec_offset; + ts_offset = next_ts_offset; + status_offset = next_status_offset; + continue; + } + } + it = FindEmplace(*key); cce = it->second; cc_page = it.GetPage(); @@ -7748,9 +7279,14 @@ class TemplateCcMap : public CcMap rec_offset = next_rec_offset; ts_offset = next_ts_offset; status_offset = next_status_offset; + if constexpr (RangePartitioned) + { + flags_offset = next_flags_offset; + } continue; } + [[maybe_unused]] const size_t old_payload_size = cce->PayloadSize(); // Now, all versions of non-unique SecondaryIndex key shared // the unpack info in current version's payload, though the // unpack info will not be used for deleted key, we must not @@ -7770,6 +7306,8 @@ class TemplateCcMap : public CcMap } bool was_dirty = cce->IsDirty(); + [[maybe_unused]] const RecordStatus cce_old_status = + cce->PayloadStatus(); cce->SetCommitTsPayloadStatus(commit_ts, rec_status); if (req.Kind() == UploadBatchType::DirtyBucketData) { @@ -7783,6 +7321,43 @@ class TemplateCcMap : public CcMap } cce->SetCkptTs(commit_ts); } + + if constexpr (RangePartitioned) + { + if ((range_size_flags >> 4) != 0) + { + int32_t delta = + (rec_status == RecordStatus::Deleted) + ? -(static_cast(write_key->Size() + + old_payload_size)) + : (cce_old_status != RecordStatus::Normal + ? static_cast(write_key->Size() + + cce->PayloadSize()) + : static_cast(cce->PayloadSize() - + old_payload_size)); + bool need_split = + UpdateRangeSize(static_cast(partition_id), + delta, + (range_size_flags & 0x0F) != 0); + if (need_split) + { + // Create a data sync task for the range. 
+ uint64_t data_sync_ts = + std::chrono::duration_cast< + std::chrono::microseconds>( + std::chrono::high_resolution_clock::now() + .time_since_epoch()) + .count(); + shard_->CreateSplitRangeDataSyncTask( + table_name_, + cc_ng_id_, + req.CcNgTerm(), + static_cast(partition_id), + data_sync_ts); + } + } + } + OnCommittedUpdate(cce, was_dirty); OnFlushed(cce, was_dirty); DLOG_IF(INFO, TRACE_OCC_ERR) @@ -7809,6 +7384,10 @@ class TemplateCcMap : public CcMap rec_offset = next_rec_offset; ts_offset = next_ts_offset; status_offset = next_status_offset; + if constexpr (RangePartitioned) + { + flags_offset = next_flags_offset; + } } if (key_pos < batch_size) { @@ -7820,7 +7399,8 @@ class TemplateCcMap : public CcMap key_offset, rec_offset, ts_offset, - status_offset); + status_offset, + flags_offset); shard_->Enqueue(shard_->LocalCoreId(), &req); return false; } @@ -7902,22 +7482,12 @@ class TemplateCcMap : public CcMap { // Parsed all records req.SetParsed(); - - // Emplace key on all cores - for (size_t core = 0; core < shard_->core_cnt_; ++core) - { - if (core != shard_->core_id_) - { - shard_->Enqueue(shard_->core_id_, core, &req); - } - } } - } // end-parsed - std::deque &slice_vec = req.SliceData(shard_->core_id_); + std::deque &slice_vec = req.SliceData(); - size_t index = req.NextIndex(shard_->core_id_); + size_t index = req.NextIndex(); size_t last_index = std::min( index + UploadBatchSlicesCc::MaxEmplaceBatchSize, slice_vec.size()); @@ -7953,7 +7523,7 @@ class TemplateCcMap : public CcMap else { index = last_index; - req.SetNextIndex(shard_->core_id_, index); + req.SetNextIndex(index); shard_->Enqueue(shard_->LocalCoreId(), &req); } return false; @@ -8050,7 +7620,7 @@ class TemplateCcMap : public CcMap const KeyT *const req_start_key = req.StartTxKey().GetKey(); const KeyT *const req_end_key = req.EndTxKey().GetKey(); - auto &paused_position = req.PausedPos(shard_->core_id_); + auto &paused_position = req.PausedPos(); bool is_dirty = req.IsDirty(); @@ -8121,8 
+7691,7 @@ class TemplateCcMap : public CcMap slice_end_next_page_it = next_page_it(slice_end_it); - curr_slice_delta_size = - &(req.SliceDeltaSize(shard_->core_id_).back().second); + curr_slice_delta_size = &(req.SliceDeltaSize().back().second); } bool has_dml_since_ddl = false; @@ -8324,8 +7893,7 @@ class TemplateCcMap : public CcMap slice_end_next_page_it = next_page_it(slice_end_it); - auto &slice_delta_size = - req.SliceDeltaSize(shard_->core_id_); + auto &slice_delta_size = req.SliceDeltaSize(); slice_delta_size.emplace_back(slice->StartTxKey(), 0); curr_slice_delta_size = &slice_delta_size.back().second; } @@ -8771,6 +8339,10 @@ class TemplateCcMap : public CcMap } normal_obj_sz_ = 0; + if constexpr (RangePartitioned) + { + range_sizes_.clear(); + } ccmp_.clear(); } @@ -10483,10 +10055,7 @@ class TemplateCcMap : public CcMap // status, it should already be in the key cache. Only add it if // it's in DELETED. auto res = shard_->local_shards_.AddKeyToKeyCache( - table_name_, - cc_ng_id_, - shard_->core_id_, - *ccp->KeyOfEntry(cce)); + table_name_, cc_ng_id_, *ccp->KeyOfEntry(cce)); if (res == RangeSliceOpStatus::Retry) { // Retry if the slice key cache is being loaded. @@ -11914,6 +11483,74 @@ class TemplateCcMap : public CcMap return &pos_inf_page_; } + bool UpdateRangeSize(uint32_t partition_id, + int32_t delta_size, + bool is_dirty) + { + if constexpr (RangePartitioned) + { + auto it = range_sizes_.find(partition_id); + if (it == range_sizes_.end()) + { + it = range_sizes_ + .emplace(partition_id, + std::make_tuple( + static_cast( + RangeSizeStatus::kNotInitialized), + 0, + false)) + .first; + } + if (std::get<0>(it->second) == + static_cast(RangeSizeStatus::kNotInitialized) && + !is_dirty) + { + std::get<1>(it->second) += delta_size; + // Init the range size of this range. 
+ std::get<0>(it->second) = + static_cast(RangeSizeStatus::kLoading); + + int64_t ng_term = Sharder::Instance().LeaderTerm(cc_ng_id_); + shard_->FetchTableRangeSize(table_name_, + static_cast(partition_id), + cc_ng_id_, + ng_term); + return false; + } + + if (std::get<0>(it->second) == + static_cast(RangeSizeStatus::kLoading) || + is_dirty) + { + // Loading or split: record delta in delta part (.second). + std::get<1>(it->second) += delta_size; + } + else + { + int32_t new_range_size = std::get<0>(it->second) + delta_size; + std::get<0>(it->second) = + new_range_size > 0 ? new_range_size : 0; + + bool trigger_split = + !is_dirty && !std::get<2>(it->second) && + std::get<0>(it->second) >= + static_cast(StoreRange::range_max_size); + + DLOG_IF(INFO, trigger_split) + << "Range size is too large, need to split. table: " + << table_name_.StringView() + << " partition: " << partition_id + << " range size: " << std::get<0>(it->second) + << " range max size: " << StoreRange::range_max_size; + std::get<2>(it->second) = + trigger_split == true ? 
true : std::get<2>(it->second); + return trigger_split; + } + } // RangePartitioned + + return false; + } + absl::btree_map< KeyT, std::unique_ptr< @@ -11941,7 +11578,7 @@ void BackfillSnapshotForScanSlice(FetchSnapshotCc *fetch_cc, { TemplateScanCache *scan_cache = static_cast *>( - req->GetLocalScanCache(core_id)); + req->GetLocalScanCache()); assert(scan_cache != nullptr); auto *scan_tuple = const_cast *>( scan_cache->At(tuple_idx)); @@ -11960,8 +11597,7 @@ void BackfillSnapshotForScanSlice(FetchSnapshotCc *fetch_cc, } else { - RemoteScanSliceCache *remote_scan_cache = - req->GetRemoteScanCache(core_id); + RemoteScanSliceCache *remote_scan_cache = req->GetRemoteScanCache(); assert(remote_scan_cache != nullptr); assert(remote_scan_cache->archive_records_.size() >= tuple_idx); auto &tmp_pair = remote_scan_cache->archive_positions_[tuple_idx]; @@ -11977,9 +11613,8 @@ void BackfillSnapshotForScanSlice(FetchSnapshotCc *fetch_cc, } // trigger request - req->DecreaseWaitForSnapshotCnt(core_id); - if (req->IsWaitForSnapshot(core_id) && - req->WaitForSnapshotCnt(core_id) == 0) + req->DecreaseWaitForSnapshotCnt(); + if (req->IsWaitForSnapshot() && req->WaitForSnapshotCnt() == 0) { shard.Enqueue(core_id, req); } diff --git a/tx_service/include/data_sync_task.h b/tx_service/include/data_sync_task.h index 06aa8d01..1c640f7b 100644 --- a/tx_service/include/data_sync_task.h +++ b/tx_service/include/data_sync_task.h @@ -138,7 +138,8 @@ struct DataSyncTask CcHandlerResult *hres, std::function filter_lambda = nullptr, bool forward_cache = false, - bool is_standby_node_ckpt = false) + bool is_standby_node_ckpt = false, + bool high_priority = false) : table_name_(table_name), id_(id), range_version_(range_version), @@ -152,7 +153,8 @@ struct DataSyncTask is_dirty_(is_dirty), sync_ts_adjustable_(need_adjust_ts), task_res_(hres), - need_update_ckpt_ts_(true) + need_update_ckpt_ts_(true), + high_priority_(high_priority) { } @@ -180,6 +182,12 @@ struct DataSyncTask // flush data 
buffer. void SetScanTaskFinished(); + // Once the range size reaches the threshold, a DataSyncTask is created to + // trigger the split range operation, and a flag is set indicating that the + // range has been split. This flag needs to be reset after the DataSyncTask + // completes. + void ResetRangeSplittingStatus(); + void SetErrorCode(CcErrorCode err_code) { std::unique_lock lk(status_->mux_); @@ -252,6 +260,7 @@ struct DataSyncTask cce_entries_; bool need_update_ckpt_ts_{true}; + bool high_priority_{false}; }; struct FlushTaskEntry diff --git a/tx_service/include/fault/log_replay_service.h b/tx_service/include/fault/log_replay_service.h index e9fa2fc2..eb308a58 100644 --- a/tx_service/include/fault/log_replay_service.h +++ b/tx_service/include/fault/log_replay_service.h @@ -35,6 +35,7 @@ #include #include "txlog.h" +#include "type.h" namespace txservice { @@ -174,6 +175,17 @@ class RecoveryService : public brpc::StreamInputHandler, void ProcessRecoverTxTask(RecoverTxTask &task); + // Range split info management. 
+ void SetSplitRangeInfo(uint32_t ng_id, + TableName table_name, + int32_t range_id, + uint64_t commit_ts); + + const std::unordered_map> * + GetSplitRangeInfo(uint32_t ng_id) const; + + void CleanSplitRangeInfo(uint32_t ng_id); + struct ConnectionInfo { ConnectionInfo() = default; @@ -237,6 +249,13 @@ class RecoveryService : public brpc::StreamInputHandler, uint16_t port_; void ClearTx(uint64_t tx_number); + + // Range split info for each node group: + // ng_id -> split range commit ts>> + std::unordered_map< + uint32_t, + std::unordered_map>> + split_range_info_; }; } // namespace fault } // namespace txservice diff --git a/tx_service/include/proto/cc_request.proto b/tx_service/include/proto/cc_request.proto index 889d8259..d9d722ec 100644 --- a/tx_service/include/proto/cc_request.proto +++ b/tx_service/include/proto/cc_request.proto @@ -176,6 +176,10 @@ message UploadBatchRequest bytes commit_ts = 9; bytes rec_status = 10; UploadBatchKind kind = 11; + // Target range partition; + int32 partition_id = 12; + // Per-key one byte: [uint8_t, ...] 
+ bytes range_size_flags = 13; } message UploadBatchSlicesRequest @@ -920,6 +924,8 @@ message PostCommitRequest { bytes record = 5; uint32 operation_type = 6; uint32 key_shard_code = 7; + int32 partition_id = 8; + bool on_dirty_range = 9; } message ForwardPostCommitRequest { @@ -1088,7 +1094,7 @@ message ScanSliceRequest { bool end_inclusive = 11; bool is_forward = 12; uint64 ts = 13; - repeated uint64 prior_cce_lock_vec = 14; + uint64 prior_cce_lock = 14; IsolationType iso_level = 15; CcProtocolType protocol = 16; bool is_for_write = 17; @@ -1105,6 +1111,7 @@ message ScanSliceResponse { int64 tx_term = 3; uint32 command_id=4; int32 error_code = 5; + uint32 core_id = 16; bytes tuple_cnt = 6; bytes last_key = 7; SlicePosition slice_position = 8; @@ -1115,9 +1122,6 @@ message ScanSliceResponse { bytes gap_ts = 13; bytes cce_lock_ptr = 14; bytes term = 15; - bytes key_start_offsets = 16; - bytes record_start_offsets = 17; - bytes trailing_cnts = 18; uint64 txm_addr = 19; } diff --git a/tx_service/include/read_write_entry.h b/tx_service/include/read_write_entry.h index 4d86c34c..36463be1 100644 --- a/tx_service/include/read_write_entry.h +++ b/tx_service/include/read_write_entry.h @@ -49,17 +49,25 @@ struct WriteSetEntry op_(other.op_), cce_addr_(other.cce_addr_), key_shard_code_(other.key_shard_code_), - forward_addr_(std::move(other.forward_addr_)) + partition_id_(other.partition_id_), + forward_addr_(std::move(other.forward_addr_)), + on_dirty_range_(other.on_dirty_range_) { } WriteSetEntry &operator=(WriteSetEntry &&other) noexcept { + if (this == &other) + { + return *this; + } rec_ = std::move(other.rec_); op_ = other.op_; cce_addr_ = other.cce_addr_; key_shard_code_ = other.key_shard_code_; + partition_id_ = other.partition_id_; forward_addr_ = std::move(other.forward_addr_); + on_dirty_range_ = other.on_dirty_range_; return *this; } @@ -68,8 +76,11 @@ struct WriteSetEntry OperationType op_; CcEntryAddr cce_addr_; uint32_t key_shard_code_{}; + int32_t 
partition_id_{-1}; // Used in double write scenarios during online DDL. - std::unordered_map forward_addr_; + // key shard code -> (partition id, cce addr) + std::unordered_map> forward_addr_; + bool on_dirty_range_{false}; }; /** diff --git a/tx_service/include/remote/remote_cc_handler.h b/tx_service/include/remote/remote_cc_handler.h index 83695f21..b7c43cdd 100644 --- a/tx_service/include/remote/remote_cc_handler.h +++ b/tx_service/include/remote/remote_cc_handler.h @@ -84,7 +84,9 @@ class RemoteCcHandler const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres); + CcHandlerResult &hres, + int32_t partition_id = -1, + bool on_dirty_range = false); void PostWriteAll(uint32_t src_node_id, const TableName &table_name, diff --git a/tx_service/include/remote/remote_cc_request.h b/tx_service/include/remote/remote_cc_request.h index b59af76b..1c0c2604 100644 --- a/tx_service/include/remote/remote_cc_request.h +++ b/tx_service/include/remote/remote_cc_request.h @@ -763,7 +763,7 @@ struct RemoteScanSlice : public ScanSliceCc { public: RemoteScanSlice(); - void Reset(std::unique_ptr input_msg, uint16_t core_cnt); + void Reset(std::unique_ptr input_msg); private: ScanSliceResponse output_msg_; @@ -773,7 +773,7 @@ struct RemoteScanSlice : public ScanSliceCc TableName remote_tbl_name_{ empty_sv, TableType::Primary, txservice::TableEngine::None}; CcHandlerResult cc_res_{nullptr}; - std::vector scan_cache_vec_; + RemoteScanSliceCache scan_cache_; }; struct RemoteReloadCacheCc : public ReloadCacheCc diff --git a/tx_service/include/sk_generator.h b/tx_service/include/sk_generator.h index 050d6b27..b33941e8 100644 --- a/tx_service/include/sk_generator.h +++ b/tx_service/include/sk_generator.h @@ -40,8 +40,11 @@ class UploadIndexContext public: using TableIndexSet = std::unordered_map>; - using NGIndexSet = - std::unordered_map>; + // ng_id -> (range_id -> vector of (range_size_flags, WriteEntry*)) + using NGIndexSet = 
std::unordered_map< + NodeGroupId, + std::unordered_map>>>; private: enum struct UploadTaskStatus @@ -101,16 +104,18 @@ class UploadIndexContext CcErrorCode UploadEncodedIndex(UploadIndexTask &upload_task); CcErrorCode UploadIndexInternal( std::unordered_map &ng_index_set); - void SendIndexes(const TableName &table_name, - NodeGroupId dest_ng_id, - int64_t &ng_term, - const std::vector &write_entry_vec, - size_t batch_size, - size_t start_key_idx, - bthread::Mutex &req_mux, - bthread::ConditionVariable &req_cv, - size_t &finished_req_cnt, - CcErrorCode &res_code); + void SendIndexes( + const TableName &table_name, + NodeGroupId dest_ng_id, + int64_t &ng_term, + int32_t partition_id, + const std::vector> &write_entry_vec, + size_t batch_size, + size_t start_key_idx, + bthread::Mutex &req_mux, + bthread::ConditionVariable &req_cv, + size_t &finished_req_cnt, + CcErrorCode &res_code); // Acquire and release range read lock. CcErrorCode AcquireRangeReadLocks( TransactionExecution *acq_lock_txm, diff --git a/tx_service/include/store/data_store_handler.h b/tx_service/include/store/data_store_handler.h index d0ca96d8..4059431a 100644 --- a/tx_service/include/store/data_store_handler.h +++ b/tx_service/include/store/data_store_handler.h @@ -135,6 +135,8 @@ class DataStoreHandler virtual void FetchRangeSlices(FetchRangeSlicesReq *fetch_cc) = 0; + virtual void FetchTableRangeSize(FetchTableRangeSizeCc *fetch_cc) = 0; + /** * @brief Read a row from base table or skindex table in datastore with * specified key. Caller should pass in complete primary key or skindex key. 
diff --git a/tx_service/include/tx_operation_result.h b/tx_service/include/tx_operation_result.h index d31492dc..b03417fa 100644 --- a/tx_service/include/tx_operation_result.h +++ b/tx_service/include/tx_operation_result.h @@ -447,11 +447,8 @@ struct RemoteScanSliceCache static constexpr size_t MetaDataSize = 8; static constexpr size_t DefaultCacheMaxBytes = 10 * 1024 * 1024; - RemoteScanSliceCache(uint16_t shard_cnt) - : cache_mem_size_(0), - mem_max_bytes_(DefaultCacheMaxBytes), - shard_cnt_(shard_cnt), - trailing_cnt_(0) + RemoteScanSliceCache() + : cache_mem_size_(0), mem_max_bytes_(DefaultCacheMaxBytes) { } @@ -465,7 +462,7 @@ struct RemoteScanSliceCache mem_max_bytes_ = max_bytes; } - void Reset(uint16_t shard_cnt) + void Reset() { key_ts_.clear(); gap_ts_.clear(); @@ -476,26 +473,19 @@ struct RemoteScanSliceCache keys_.clear(); records_.clear(); cache_mem_size_ = 0; - trailing_cnt_ = 0; mem_max_bytes_ = DefaultCacheMaxBytes; - shard_cnt_ = shard_cnt; archive_positions_.clear(); archive_records_.clear(); } - void RemoveLast() - { - trailing_cnt_++; - } - uint64_t LastCce() { - return cce_ptr_.at(cce_ptr_.size() - 1 - trailing_cnt_); + return cce_ptr_.at(cce_ptr_.size() - 1); } size_t Size() const { - return cce_ptr_.size() - trailing_cnt_; + return cce_ptr_.size(); } void SetLastCceLock(uint64_t lock_ptr) @@ -514,8 +504,6 @@ struct RemoteScanSliceCache std::string records_; uint32_t cache_mem_size_; uint32_t mem_max_bytes_; - uint16_t shard_cnt_; - size_t trailing_cnt_; // The first element of archive_positions_ is the index of key_ts_ to // backfill and the second element is the position in records_ to be @@ -531,8 +519,7 @@ struct RangeScanSliceResult slice_position_(SlicePosition::FirstSlice), cc_ng_id_(0), ccm_scanner_(nullptr), - is_local_(true), - last_key_status_(LastKeySetStatus::Unset) + is_local_(true) { } @@ -541,8 +528,7 @@ struct RangeScanSliceResult slice_position_(status), cc_ng_id_(0), ccm_scanner_(nullptr), - is_local_(true), - 
last_key_status_(LastKeySetStatus::Setup) + is_local_(true) { } @@ -550,8 +536,7 @@ struct RangeScanSliceResult : last_key_(std::move(rhs.last_key_)), slice_position_(rhs.slice_position_), cc_ng_id_(rhs.cc_ng_id_), - is_local_(rhs.is_local_), - last_key_status_(rhs.last_key_status_.load(std::memory_order_acquire)) + is_local_(rhs.is_local_) { if (rhs.is_local_) { @@ -576,9 +561,6 @@ struct RangeScanSliceResult slice_position_ = rhs.slice_position_; is_local_ = rhs.is_local_; cc_ng_id_ = rhs.cc_ng_id_; - last_key_status_.store( - rhs.last_key_status_.load(std::memory_order_acquire), - std::memory_order_release); if (rhs.is_local_) { @@ -594,85 +576,47 @@ struct RangeScanSliceResult void Reset() { - last_key_status_.store(LastKeySetStatus::Unset, - std::memory_order_release); last_key_ = TxKey(); } const TxKey *SetLastKey(TxKey key) { - assert(last_key_status_.load(std::memory_order_acquire) == - LastKeySetStatus::Unset); last_key_ = std::move(key); - last_key_status_.store(LastKeySetStatus::Setup, - std::memory_order_release); - return &last_key_; } template - std::pair UpdateLastKey(const KeyT *key, - SlicePosition slice_pos) + void SetLastKey(const KeyT *key, SlicePosition slice_pos) { - bool success = false; + slice_position_ = slice_pos; - LastKeySetStatus actual = LastKeySetStatus::Unset; - if (last_key_status_.compare_exchange_strong( - actual, LastKeySetStatus::Setting, std::memory_order_acq_rel)) + // If the slice position is the last or the first, this is the last + // scan batch, which must end with positive/negative infinity or the + // request's end key. In both cases, the input key is a valid + // reference throughout the lifetime of RangeScanSliceResult. So, + // the tx key does not own a new copy of the input key. 
+ if (slice_pos == SlicePosition::FirstSlice || + slice_pos == SlicePosition::LastSlice) { - slice_position_ = slice_pos; - - // If the slice position is the last or the first, this is the last - // scan batch, which must end with positive/negative infinity or the - // request's end key. In both cases, the input key is a valid - // reference throughout the lifetime of RangeScanSliceResult. So, - // the tx key does not own a new copy of the input key. - if (slice_pos == SlicePosition::FirstSlice || - slice_pos == SlicePosition::LastSlice) - { - last_key_ = TxKey(key); - } - else - { - last_key_ = key->CloneTxKey(); - } - - last_key_status_.store(LastKeySetStatus::Setup, - std::memory_order_release); - success = true; + last_key_ = TxKey(key); } else { - if (actual != LastKeySetStatus::Setup) - { - while (last_key_status_.load(std::memory_order_acquire) != - LastKeySetStatus::Setup) - { - // Busy poll. - } - } + last_key_ = key->CloneTxKey(); } - - return {last_key_.GetKey(), success}; } - std::pair PeekLastKey() const + const TxKey *LastKey() const { - if (last_key_status_.load(std::memory_order_acquire) == - LastKeySetStatus::Setup) - { - return {&last_key_, true}; - } - else + if (last_key_.KeyPtr() != nullptr) { - return {nullptr, false}; + return &last_key_; } + return nullptr; } TxKey MoveLastKey() { - last_key_status_.store(LastKeySetStatus::Unset, - std::memory_order_release); return std::move(last_key_); } @@ -691,23 +635,9 @@ struct RangeScanSliceResult union { CcScanner *ccm_scanner_; - std::vector *remote_scan_caches_; + RemoteScanSliceCache *remote_scan_caches_; }; bool is_local_{true}; - - /** - * For scene like: (1-write, n-read), atomic variable has obvious - * performance advantage over mutex/shared_mutex. For readers, mutex needs - * to modify a flag, and shared_mutex needs to modify a counter. However, - * atomic variable merely load a variable. 
- */ - enum struct LastKeySetStatus : uint8_t - { - Unset, - Setting, - Setup, - }; - std::atomic last_key_status_; }; struct BucketScanProgress diff --git a/tx_service/include/type.h b/tx_service/include/type.h index 2fe288c5..566e4171 100644 --- a/tx_service/include/type.h +++ b/tx_service/include/type.h @@ -167,6 +167,13 @@ enum class TableEngine : uint8_t InternalHash = 5, // eg. Sequence table is a kind of internal hash table. }; +// Status values for range_sizes_.first (range size not yet known). +enum RangeSizeStatus : int32_t +{ + kNotInitialized = -2, // Range size not yet initialized; need to fetch. + kLoading = -1, // Range size is being loaded; delta goes to .second. +}; + inline std::string KvTablePrefixOf(TableEngine engine) { switch (engine) diff --git a/tx_service/src/cc/cc_map.cpp b/tx_service/src/cc/cc_map.cpp index 52443b45..ede1962c 100644 --- a/tx_service/src/cc/cc_map.cpp +++ b/tx_service/src/cc/cc_map.cpp @@ -27,6 +27,7 @@ #include "cc/local_cc_shards.h" #include "cc_entry.h" #include "tx_trace.h" +#include "type.h" namespace txservice { @@ -461,4 +462,57 @@ void CcMap::DecrReadIntent(NonBlockingLock *lock, } } +bool CcMap::InitRangeSize(uint32_t partition_id, + int32_t persisted_size, + bool succeed, + bool emplace) +{ + auto it = range_sizes_.find(partition_id); + if (it == range_sizes_.end()) + { + if (!emplace) + { + return false; + } + it = range_sizes_.emplace(partition_id, std::make_tuple(0, 0, false)) + .first; + } + + if (succeed) + { + int32_t final_size = persisted_size + std::get<1>(it->second); + std::get<0>(it->second) = final_size < 0 ? 0 : final_size; + std::get<1>(it->second) = 0; + + bool trigger_split = + !std::get<2>(it->second) && + std::get<0>(it->second) >= + static_cast(StoreRange::range_max_size); + std::get<2>(it->second) = + trigger_split == true ? true : std::get<2>(it->second); + return trigger_split; + } + else + { + // Load range size failed; reset to not-initialized for retry. 
+ std::get<0>(it->second) = + static_cast(RangeSizeStatus::kNotInitialized); + } + return false; +} + +void CcMap::ResetRangeStatus(uint32_t partition_id) +{ + auto it = range_sizes_.find(partition_id); + if (it == range_sizes_.end()) + { + return; + } + std::get<2>(it->second) = false; + + DLOG(INFO) << "ResetRangeStatus: table: " << table_name_.StringView() + << " partition: " << partition_id + << " status: " << std::boolalpha << std::get<2>(it->second); +} + } // namespace txservice diff --git a/tx_service/src/cc/cc_req_misc.cpp b/tx_service/src/cc/cc_req_misc.cpp index eae335c7..2d6dbf31 100644 --- a/tx_service/src/cc/cc_req_misc.cpp +++ b/tx_service/src/cc/cc_req_misc.cpp @@ -509,27 +509,20 @@ bool ClearCcNodeGroup::Execute(CcShard &ccs) return false; } -bool InitKeyCacheCc::SetFinish(uint16_t core, bool succ) +void InitKeyCacheCc::SetFinish(bool succ) { if (succ) { - slice_->SetKeyCacheValidity(core, succ); + slice_->SetKeyCacheValidity(succ); } - slice_->SetLoadingKeyCache(core, false); + slice_->SetLoadingKeyCache(false); - if (unfinished_cnt_.fetch_sub(1, std::memory_order_relaxed) == 1) - { - pause_pos_.clear(); - - // Unpin the slice. - range_->UnpinSlice(slice_, true); - std::unique_lock slice_lk(slice_->slice_mux_); - slice_->init_key_cache_cc_ = nullptr; - - return true; - } + pause_pos_ = TxKey(); - return false; + // Unpin the slice. + range_->UnpinSlice(slice_, true); + std::unique_lock slice_lk(slice_->slice_mux_); + slice_->init_key_cache_cc_ = nullptr; } bool InitKeyCacheCc::Execute(CcShard &ccs) @@ -538,15 +531,15 @@ bool InitKeyCacheCc::Execute(CcShard &ccs) int64_t cc_ng_term = Sharder::Instance().LeaderTerm(ng_id_); if (std::max(cc_ng_candid_term, cc_ng_term) != term_) { - return SetFinish(ccs.core_id_, false); + SetFinish(false); + return true; } CcMap *ccm = ccs.GetCcm(tbl_name_, ng_id_); if (ccm == nullptr) { - // ccm is empty when slice is fully cached. That means this slice is - // empty on this core. 
- return SetFinish(ccs.core_id_, true); + SetFinish(true); + return true; } return ccm->Execute(*this); @@ -561,14 +554,14 @@ StoreSlice &InitKeyCacheCc::Slice() return *slice_; } -void InitKeyCacheCc::SetPauseKey(TxKey &key, uint16_t core_id) +void InitKeyCacheCc::SetPauseKey(TxKey &key) { - pause_pos_[core_id] = key.Clone(); + pause_pos_ = key.Clone(); } -TxKey &InitKeyCacheCc::PauseKey(uint16_t core_id) +TxKey &InitKeyCacheCc::PauseKey() { - return pause_pos_[core_id]; + return pause_pos_; } void FillStoreSliceCc::Reset(const TableName &table_name, @@ -590,14 +583,9 @@ void FillStoreSliceCc::Reset(const TableName &table_name, cc_ng_id_ = cc_ng_id; cc_ng_term_ = cc_ng_term; force_load_ = force_load; - finish_cnt_ = 0; - core_cnt_ = cc_shards.Count(); - next_idxs_.clear(); - next_idxs_.resize(cc_shards.Count(), 0); - - partitioned_slice_data_.clear(); - partitioned_slice_data_.resize(cc_shards.Count()); + next_idx_ = 0; + slice_data_.clear(); range_slice_ = slice; range_ = range; @@ -619,7 +607,7 @@ void FillStoreSliceCc::SetKvFinish(bool success) { CODE_FAULT_INJECTOR("LoadRangeSliceRequest_SetFinish_Error", { success = false; - partitioned_slice_data_.clear(); + slice_data_.clear(); slice_size_ = 0; snapshot_ts_ = 0; }); @@ -656,7 +644,8 @@ bool FillStoreSliceCc::Execute(CcShard &ccs) int64_t cc_ng_term = Sharder::Instance().LeaderTerm(cc_ng_id_); if (std::max(cc_ng_candid_term, cc_ng_term) != cc_ng_term_) { - return SetError(CcErrorCode::NG_TERM_CHANGED); + SetError(CcErrorCode::NG_TERM_CHANGED); + return true; } CcMap *ccm = ccs.GetCcm(*table_name_, cc_ng_id_); @@ -705,106 +694,65 @@ void FillStoreSliceCc::AddDataItem( rec_cnt_++; } - size_t hash = key.Hash(); - // Uses the lower 10 bits of the hash code to shard the key across - // CPU cores at this node. 
- uint16_t core_code = hash & 0x3FF; - uint16_t core_id = core_code % core_cnt_; - - partitioned_slice_data_[core_id].emplace_back( + slice_data_.emplace_back( std::move(key), std::move(record), version_ts, is_deleted); } -bool FillStoreSliceCc::SetFinish(CcShard *cc_shard) +void FillStoreSliceCc::SetFinish(CcShard *cc_shard) { - bool finish_all = false; - CcErrorCode err_code; + if (err_code_ == CcErrorCode::NO_ERROR) { - std::lock_guard lk(mux_); - ++finish_cnt_; - - if (finish_cnt_ == core_cnt_) + bool init_key_cache = + txservice_enable_key_cache && table_name_->IsBase(); + // Cache the pointer since FillStoreSliceCc will be freed after + // CommitLoading. + + const TableName *tbl_name = table_name_; + auto cc_ng_id = cc_ng_id_; + auto cc_ng_term = cc_ng_term_; + if (init_key_cache && rec_cnt_ > 0) { - finish_all = true; - err_code = err_code_; - } - } + LocalCcShards *shards = Sharder::Instance().GetLocalCcShards(); + size_t estimate_rec_size = UINT64_MAX; - if (finish_all) - { - if (err_code == CcErrorCode::NO_ERROR) - { - bool init_key_cache = - txservice_enable_key_cache && table_name_->IsBase(); - // Cache the pointer since FillStoreSliceCc will be freed after - // CommitLoading. - - const TableName *tbl_name = table_name_; - auto cc_ng_id = cc_ng_id_; - auto cc_ng_term = cc_ng_term_; - if (init_key_cache && rec_cnt_ > 0) - { - LocalCcShards *shards = Sharder::Instance().GetLocalCcShards(); - size_t estimate_rec_size = UINT64_MAX; - - // Get estiamte record size for key cache - auto schema = shards->GetSharedTableSchema( - TableName(table_name_->GetBaseTableNameSV(), - TableType::Primary, - table_name_->Engine()), - cc_ng_id_); - auto stats = schema->StatisticsObject(); - assert(slice_size_ > 0); - estimate_rec_size = slice_size_ / rec_cnt_; - if (stats) - { - // Update estimate size in table stats with the loaded - // slice. 
- stats->SetEstimateRecordSize(estimate_rec_size); - } - } - range_slice_->CommitLoading(*range_, slice_size_); - if (init_key_cache) + // Get estiamte record size for key cache + auto schema = shards->GetSharedTableSchema( + TableName(table_name_->GetBaseTableNameSV(), + TableType::Primary, + table_name_->Engine()), + cc_ng_id_); + auto stats = schema->StatisticsObject(); + assert(slice_size_ > 0); + estimate_rec_size = slice_size_ / rec_cnt_; + if (stats) { - range_slice_->InitKeyCache( - cc_shard, range_, tbl_name, cc_ng_id, cc_ng_term); + // Update estimate size in table stats with the loaded + // slice. + stats->SetEstimateRecordSize(estimate_rec_size); } } - else - { - range_slice_->SetLoadingError(*range_, err_code); - } - - next_idxs_.clear(); - partitioned_slice_data_.clear(); - } - - return finish_all; -} - -bool FillStoreSliceCc::SetError(CcErrorCode err_code) -{ - bool finish_all = false; - { - std::lock_guard lk(mux_); - ++finish_cnt_; - err_code_ = err_code; - - if (finish_cnt_ == core_cnt_) + range_slice_->CommitLoading(*range_, slice_size_); + if (init_key_cache) { - finish_all = true; + range_slice_->InitKeyCache( + cc_shard, range_, tbl_name, cc_ng_id, cc_ng_term); } } - - if (finish_all) + else { range_slice_->SetLoadingError(*range_, err_code_); - - next_idxs_.clear(); - partitioned_slice_data_.clear(); } - return finish_all; + next_idx_ = 0; + slice_data_.clear(); +} + +void FillStoreSliceCc::SetError(CcErrorCode err_code) +{ + err_code_ = err_code; + range_slice_->SetLoadingError(*range_, err_code_); + next_idx_ = 0; + slice_data_.clear(); } void FillStoreSliceCc::StartFilling() @@ -818,8 +766,14 @@ void FillStoreSliceCc::TerminateFilling() // The slice has not been filled into memory. So, the out-of-memory flag is // false. 
range_slice_->SetLoadingError(*range_, CcErrorCode::DATA_STORE_ERR); - next_idxs_.clear(); - partitioned_slice_data_.clear(); + next_idx_ = 0; + slice_data_.clear(); +} + +int32_t FillStoreSliceCc::PartitionId() const +{ + assert(range_ != nullptr); + return range_->PartitionId(); } FetchRecordCc::FetchRecordCc(const TableName *tbl_name, @@ -1535,4 +1489,63 @@ bool ShardCleanCc::Execute(CcShard &ccs) } } +void FetchTableRangeSizeCc::Reset(const TableName &table_name, + int32_t partition_id, + const TxKey &start_key, + CcShard *ccs, + NodeGroupId ng_id, + int64_t ng_term) +{ + table_name_ = &table_name; + partition_id_ = partition_id; + start_key_ = start_key.GetShallowCopy(); + node_group_id_ = ng_id; + node_group_term_ = ng_term; + ccs_ = ccs; + error_code_ = 0; + store_range_size_ = 0; +} + +bool FetchTableRangeSizeCc::ValidTermCheck() +{ + int64_t ng_leader_term = Sharder::Instance().LeaderTerm(node_group_id_); + return ng_leader_term == node_group_term_; +} + +bool FetchTableRangeSizeCc::Execute(CcShard &ccs) +{ + if (!ValidTermCheck()) + { + error_code_ = static_cast(CcErrorCode::NG_TERM_CHANGED); + } + + bool succ = (error_code_ == 0); + CcMap *ccm = ccs.GetCcm(*table_name_, node_group_id_); + if (ccm == nullptr) + { + assert(error_code_ != 0); + return true; + } + bool need_split = ccm->InitRangeSize( + static_cast(partition_id_), store_range_size_, succ); + + if (need_split) + { + uint64_t data_sync_ts = ccs.local_shards_.ClockTs(); + ccs.CreateSplitRangeDataSyncTask(*table_name_, + node_group_id_, + node_group_term_, + partition_id_, + data_sync_ts); + } + + return true; +} + +void FetchTableRangeSizeCc::SetFinish(uint32_t error) +{ + error_code_ = error; + ccs_->Enqueue(this); +} + } // namespace txservice diff --git a/tx_service/src/cc/cc_shard.cpp b/tx_service/src/cc/cc_shard.cpp index 2036d569..d3c009ef 100644 --- a/tx_service/src/cc/cc_shard.cpp +++ b/tx_service/src/cc/cc_shard.cpp @@ -398,6 +398,26 @@ CcMap *CcShard::GetCcm(const TableName 
&table_name, uint32_t node_group) } } +void CcShard::FetchTableRangeSize(const TableName &table_name, + int32_t partition_id, + NodeGroupId cc_ng_id, + int64_t cc_ng_term) +{ + FetchTableRangeSizeCc *fetch_cc = fetch_range_size_cc_pool_.NextRequest(); + + const TableName range_table_name(table_name.StringView(), + TableType::RangePartition, + table_name.Engine()); + const TableRangeEntry *range_entry = + GetTableRangeEntry(range_table_name, cc_ng_id, partition_id); + assert(range_entry != nullptr); + TxKey start_key = range_entry->GetRangeInfo()->StartTxKey(); + + fetch_cc->Reset( + table_name, partition_id, start_key, this, cc_ng_id, cc_ng_term); + local_shards_.store_hd_->FetchTableRangeSize(fetch_cc); +} + void CcShard::AdjustDataKeyStats(const TableName &table_name, int64_t size_delta, int64_t dirty_delta) @@ -3560,6 +3580,29 @@ void CcShard::RecycleTxLockInfo(TxLockInfo::uptr lock_info) tx_lock_info_head_.next_ = std::move(lock_info); } +void CcShard::ResetRangeSplittingStatus(const TableName &table_name, + uint32_t ng_id, + uint32_t range_id) +{ + CcMap *ccm = GetCcm(table_name, ng_id); + if (ccm == nullptr) + { + return; + } + + ccm->ResetRangeStatus(range_id); +} + +void CcShard::CreateSplitRangeDataSyncTask(const TableName &table_name, + uint32_t ng_id, + int64_t ng_term, + int32_t range_id, + uint64_t data_sync_ts) +{ + local_shards_.CreateSplitRangeDataSyncTask( + table_name, ng_id, ng_term, range_id, data_sync_ts); +} + void CcShard::CollectCacheHit() { assert(metrics::enable_cache_hit_rate); diff --git a/tx_service/src/cc/local_cc_handler.cpp b/tx_service/src/cc/local_cc_handler.cpp index 9dd7962d..60c5a33e 100644 --- a/tx_service/src/cc/local_cc_handler.cpp +++ b/tx_service/src/cc/local_cc_handler.cpp @@ -274,7 +274,9 @@ txservice::CcReqStatus txservice::LocalCcHandler::PostWrite( const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres) + CcHandlerResult &hres, + int32_t partition_id, + bool 
on_dirty_range) { uint32_t ng_id = cce_addr.NodeGroupId(); uint32_t dest_node_id = Sharder::Instance().LeaderNodeId(ng_id); @@ -293,7 +295,9 @@ txservice::CcReqStatus txservice::LocalCcHandler::PostWrite( record, operation_type, key_shard_code, - &hres); + &hres, + partition_id, + on_dirty_range); TX_TRACE_ACTION(this, req); TX_TRACE_DUMP(req); cc_shards_.EnqueueCcRequest(thd_id_, cce_addr.CoreId(), req); @@ -312,7 +316,9 @@ txservice::CcReqStatus txservice::LocalCcHandler::PostWrite( record, operation_type, key_shard_code, - hres); + hres, + partition_id, + on_dirty_range); } return req_status; } @@ -1283,34 +1289,22 @@ void txservice::LocalCcHandler::ScanNextBatch( scanner.is_require_recs_, prefetch_size); - uint32_t core_cnt = cc_shards_.Count(); - req->SetShardCount(core_cnt); - // When the cc ng term is less than 0, this is the first scan of the // specified range. - if (cc_ng_term < 0) + if (cc_ng_term >= 0) { - scanner.ResetShards(core_cnt); - } - - for (uint32_t core_id = 0; core_id < core_cnt; ++core_id) - { - ScanCache *cache = scanner.Cache(core_id); + ScanCache *cache = scanner.Cache(0); const ScanTuple *last_tuple = cache->LastTuple(); req->SetPriorCceLockAddr( - last_tuple != nullptr ? last_tuple->cce_addr_.CceLockPtr() : 0, - core_id); + last_tuple != nullptr ? last_tuple->cce_addr_.CceLockPtr() : 0); } scanner.ResetCaches(); - uint32_t core_rand = butil::fast_rand(); + uint16_t dest_core = (range_id & 0x3FF) % cc_shards_.Count(); - // The scan slice request is dispatched to the first core. The first - // core tries to pin the slice in memory and if succeeds, further - // dispatches the request to remaining cores for parallel scans. 
- cc_shards_.EnqueueCcRequest(thd_id_, core_rand % core_cnt, req); + cc_shards_.EnqueueCcRequest(thd_id_, dest_core, req); } else { @@ -1907,7 +1901,8 @@ void txservice::LocalCcHandler::KickoutData(const TableName &table_name, KickoutCcEntryCc *req = kickout_ccentry_pool_.NextRequest(); // For hash partition, all data in a single bucket should be hashed to // the same core. - uint16_t core_cnt = clean_type == CleanType::CleanBucketData + uint16_t core_cnt = (clean_type == CleanType::CleanBucketData || + clean_type == CleanType::CleanRangeData) ? 1 : Sharder::Instance().GetLocalCcShardsCount(); req->Reset(table_name, @@ -1934,6 +1929,14 @@ void txservice::LocalCcHandler::KickoutData(const TableName &table_name, Sharder::Instance().ShardBucketIdToCoreIdx((*bucket_id)[0]), req); } + else if (clean_type == CleanType::CleanRangeData) + { + assert(range_id != INT32_MAX); + uint16_t dest_core = static_cast( + (range_id & 0x3FF) % + Sharder::Instance().GetLocalCcShardsCount()); + cc_shards_.EnqueueToCcShard(dest_core, req); + } else { // Dispatch the request to all cores and run in parallel @@ -2013,20 +2016,13 @@ void txservice::LocalCcHandler::UpdateKeyCache(const TableName &table_name, hres.SetToBlock(); #endif - size_t core_cnt = cc_shards_.Count(); UpdateKeyCacheCc *req = update_key_cache_pool_.NextRequest(); - req->Reset(table_name, - ng_id, - tx_term, - core_cnt, - start_key, - end_key, - store_range, - &hres); - for (size_t idx = 0; idx < core_cnt; ++idx) - { - cc_shards_.EnqueueCcRequest(idx, req); - } + req->Reset( + table_name, ng_id, tx_term, start_key, end_key, store_range, &hres); + + uint16_t dest_core = static_cast( + (store_range->PartitionId() & 0x3FF) % cc_shards_.Count()); + cc_shards_.EnqueueCcRequest(dest_core, req); } /* diff --git a/tx_service/src/cc/local_cc_shards.cpp b/tx_service/src/cc/local_cc_shards.cpp index 810b5607..fec2f065 100644 --- a/tx_service/src/cc/local_cc_shards.cpp +++ b/tx_service/src/cc/local_cc_shards.cpp @@ -2337,7 +2337,8 @@ 
bool LocalCcShards::EnqueueRangeDataSyncTask( bool can_be_skipped, uint64_t &last_sync_ts, std::shared_ptr status, - CcHandlerResult *hres) + CcHandlerResult *hres, + bool high_priority) { const RangeInfo *range_info = range_entry->GetRangeInfo(); NodeGroupId range_ng = @@ -2371,19 +2372,33 @@ bool LocalCcShards::EnqueueRangeDataSyncTask( // Push task to worker task queue. std::lock_guard task_worker_lk( data_sync_worker_ctx_.mux_); - data_sync_task_queue_[range_info->PartitionId() % - data_sync_task_queue_.size()] - .emplace_back( - std::make_shared(table_name, - range_info->PartitionId(), - range_info->VersionTs(), - ng_id, - ng_term, - data_sync_ts, - status, - is_dirty, - can_be_skipped, - hres)); + std::deque> &task_queue = + data_sync_task_queue_[range_info->PartitionId() % + data_sync_task_queue_.size()]; + + auto task = + std::make_shared(table_name, + range_info->PartitionId(), + range_info->VersionTs(), + ng_id, + ng_term, + data_sync_ts, + status, + is_dirty, + can_be_skipped, + hres, + nullptr, + false, + false, + high_priority); + if (high_priority) + { + task_queue.push_front(std::move(task)); + } + else + { + task_queue.push_back(std::move(task)); + } return true; } else @@ -2391,11 +2406,12 @@ bool LocalCcShards::EnqueueRangeDataSyncTask( if (can_be_skipped) { assert(hres == nullptr); + assert(!high_priority); // '0' means have no pending task on queue. if (iter->second->latest_pending_task_ts_ == 0) { iter->second->latest_pending_task_ts_ = data_sync_ts; - iter->second->pending_tasks_.push( + iter->second->pending_tasks_.push_back( std::make_shared( table_name, range_info->PartitionId(), @@ -2424,7 +2440,7 @@ bool LocalCcShards::EnqueueRangeDataSyncTask( // This task can't be skipped(DataMigration, CraeteIndex, // LastCheckpoint). 
So we push this task to the pending task // queue of `Limiter` - iter->second->pending_tasks_.push( + auto task = std::make_shared(table_name, range_info->PartitionId(), range_info->VersionTs(), @@ -2434,7 +2450,19 @@ bool LocalCcShards::EnqueueRangeDataSyncTask( status, is_dirty, can_be_skipped, - hres)); + hres, + nullptr, + false, + false, + high_priority); + if (high_priority) + { + iter->second->pending_tasks_.push_front(std::move(task)); + } + else + { + iter->second->pending_tasks_.push_back(std::move(task)); + } return true; } } @@ -2509,22 +2537,24 @@ void LocalCcShards::EnqueueDataSyncTaskForSplittingRange( TxKey old_start_key = range_entry->GetRangeInfo()->StartTxKey(); TxKey old_end_key = range_entry->GetRangeInfo()->EndTxKey(); // The old range - data_sync_task_queue_[range_entry->GetRangeInfo()->PartitionId() % - data_sync_task_queue_.size()] - .emplace_back(std::make_shared( - table_name, - ng_id, - ng_term, - table_schema, - range_entry, - range_entry->GetRangeInfo()->StartTxKey(), - *new_keys->begin(), - data_sync_ts, - is_dirty, - false, - txn, - status, - hres)); + auto &task_queue = + data_sync_task_queue_[range_entry->GetRangeInfo()->PartitionId() % + data_sync_task_queue_.size()]; + auto old_range_task = std::make_shared( + table_name, + ng_id, + ng_term, + table_schema, + range_entry, + range_entry->GetRangeInfo()->StartTxKey(), + *new_keys->begin(), + data_sync_ts, + is_dirty, + false, + txn, + status, + hres); + task_queue.push_front(std::move(old_range_task)); bool need_copy_range = store_hd_->NeedCopyRange(); @@ -2534,20 +2564,22 @@ void LocalCcShards::EnqueueDataSyncTaskForSplittingRange( TxKey end_key = (i == new_keys->size() - 1 ? 
range_entry->GetRangeInfo()->EndTxKey() : (*new_keys)[i + 1].GetShallowCopy()); - data_sync_task_queue_[new_range_id % data_sync_task_queue_.size()] - .emplace_back(std::make_shared(table_name, - ng_id, - ng_term, - table_schema, - range_entry, - (*new_keys)[i], - end_key, - data_sync_ts, - is_dirty, - need_copy_range, - txn, - status, - hres)); + auto &task_queue = + data_sync_task_queue_[new_range_id % data_sync_task_queue_.size()]; + auto new_range_task = std::make_shared(table_name, + ng_id, + ng_term, + table_schema, + range_entry, + (*new_keys)[i], + end_key, + data_sync_ts, + is_dirty, + need_copy_range, + txn, + status, + hres); + task_queue.push_front(std::move(new_range_task)); } data_sync_worker_ctx_.cv_.notify_all(); @@ -2641,7 +2673,7 @@ bool LocalCcShards::EnqueueDataSyncTaskToCore( if (iter->second->latest_pending_task_ts_ == 0) { iter->second->latest_pending_task_ts_ = data_sync_ts; - iter->second->pending_tasks_.push( + iter->second->pending_tasks_.push_back( std::make_shared(table_name, core_idx, 0, @@ -2672,7 +2704,7 @@ bool LocalCcShards::EnqueueDataSyncTaskToCore( // LastCheckpoint). Because these operations need to explicitly // flush data into storage, rather than relying on other // checkpoint tasks. 
- iter->second->pending_tasks_.push( + iter->second->pending_tasks_.push_back( std::make_shared(table_name, core_idx, 0, @@ -2913,6 +2945,37 @@ void LocalCcShards::EnqueueDataSyncTaskForBucket( data_sync_worker_ctx_.cv_.notify_all(); } +void LocalCcShards::CreateSplitRangeDataSyncTask(const TableName &table_name, + uint32_t ng_id, + int64_t ng_term, + int32_t range_id, + uint64_t data_sync_ts) +{ + std::shared_lock meta_lk(meta_data_mux_); + std::shared_ptr status = + std::make_shared(ng_id, ng_term, false); + TableName range_table_name(table_name.StringView(), + TableType::RangePartition, + table_name.Engine()); + TableRangeEntry *range_entry = const_cast( + GetTableRangeEntryInternal(range_table_name, ng_id, range_id)); + assert(range_entry != nullptr); + uint64_t last_sync_ts = 0; + EnqueueRangeDataSyncTask(table_name, + ng_id, + ng_term, + range_entry, + data_sync_ts, + false, + false, + last_sync_ts, + status, + nullptr, + true); + + data_sync_worker_ctx_.cv_.notify_all(); +} + void LocalCcShards::Terminate() { // Terminate the data sync task worker thds. 
@@ -3158,6 +3221,7 @@ void LocalCcShards::PostProcessFlushTaskEntries( task->id_); } task->SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + task->ResetRangeSplittingStatus(); continue; } @@ -3217,6 +3281,7 @@ void LocalCcShards::PostProcessFlushTaskEntries( } task->SetError(err_code); + task->ResetRangeSplittingStatus(); } else { @@ -3228,6 +3293,7 @@ void LocalCcShards::PostProcessFlushTaskEntries( txservice::AbortTx(entry->data_sync_txm_); } task->SetError(CcErrorCode::DATA_STORE_ERR); + task->ResetRangeSplittingStatus(); } } } @@ -3272,6 +3338,7 @@ void LocalCcShards::PostProcessFlushTaskEntries( } task->SetFinish(); + task->ResetRangeSplittingStatus(); } else { @@ -3303,6 +3370,7 @@ void LocalCcShards::PostProcessFlushTaskEntries( } task->SetError(err_code); + task->ResetRangeSplittingStatus(); } } } @@ -3359,6 +3427,7 @@ void LocalCcShards::PostProcessRangePartitionDataSyncTask( task->id_); } task->SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); + task->ResetRangeSplittingStatus(); if (ng_term >= 0) { Sharder::Instance().UnpinNodeGroupData(task->node_group_id_); @@ -3427,6 +3496,7 @@ void LocalCcShards::PostProcessRangePartitionDataSyncTask( } task->SetFinish(); + task->ResetRangeSplittingStatus(); } else if (task_ckpt_err == DataSyncTask::CkptErrorCode::SCAN_ERROR) { @@ -3478,6 +3548,7 @@ void LocalCcShards::PostProcessRangePartitionDataSyncTask( } task->SetError(err_code); + task->ResetRangeSplittingStatus(); } else { @@ -3489,6 +3560,7 @@ void LocalCcShards::PostProcessRangePartitionDataSyncTask( txservice::AbortTx(data_sync_txm); } task->SetError(CcErrorCode::DATA_STORE_ERR); + task->ResetRangeSplittingStatus(); } } @@ -3541,6 +3613,7 @@ void LocalCcShards::DataSyncForRangePartition( // table dropped data_sync_task->SetError(CcErrorCode::REQUESTED_TABLE_NOT_EXISTS); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); ClearAllPendingTasks(ng_id, expected_ng_term, table_name, range_id); } else @@ -3578,6 +3651,7 @@ void 
LocalCcShards::DataSyncForRangePartition( { data_sync_task->SetFinish(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask( ng_id, expected_ng_term, table_name, range_id); assert(need_process == false); @@ -3593,6 +3667,7 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->SetError( CcErrorCode::REQUESTED_NODE_NOT_LEADER); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); } } @@ -3618,6 +3693,7 @@ void LocalCcShards::DataSyncForRangePartition( // Finish this task and notify the caller. data_sync_task->SetError(CcErrorCode::REQUESTED_NODE_NOT_LEADER); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); if (ng_term >= 0) @@ -3702,6 +3778,7 @@ void LocalCcShards::DataSyncForRangePartition( // directly. data_sync_task->SetError(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); ClearAllPendingTasks( ng_id, expected_ng_term, table_name, range_id); @@ -3761,6 +3838,7 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->SetFinish(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); return; @@ -3815,6 +3893,7 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->SetError(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); return; @@ -3830,6 +3909,7 @@ void LocalCcShards::DataSyncForRangePartition( txservice::AbortTx(data_sync_txm); data_sync_task->SetError(CcErrorCode::GET_RANGE_ID_ERR); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); PopPendingTask(ng_id, expected_ng_term, table_name, range_id); return; @@ -3871,7 +3951,6 
@@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->data_sync_ts_, ng_id, ng_term, - cc_shards_.size(), tx_number, start_tx_key, end_tx_key, @@ -3879,10 +3958,10 @@ void LocalCcShards::DataSyncForRangePartition( is_dirty, schema_version); - for (size_t i = 0; i < cc_shards_.size(); i++) - { - EnqueueLowPriorityCcRequestToShard(i, &scan_delta_size_cc); - } + uint16_t dest_core = static_cast( + (range_entry->GetRangeInfo()->PartitionId() & 0x3FF) % Count()); + EnqueueLowPriorityCcRequestToShard(dest_core, &scan_delta_size_cc); + scan_delta_size_cc.Wait(); if (scan_delta_size_cc.IsError()) @@ -3905,14 +3984,10 @@ void LocalCcShards::DataSyncForRangePartition( return; } - for (size_t i = 0; i < cc_shards_.size(); ++i) + auto &delta_size = scan_delta_size_cc.SliceDeltaSize(); + for (auto &delta : delta_size) { - auto &delta_size = scan_delta_size_cc.SliceDeltaSize(i); - for (size_t j = 0; j < delta_size.size(); ++j) - { - slices_delta_size[std::move(delta_size[j].first)] += - delta_size[j].second; - } + slices_delta_size[std::move(delta.first)] += delta.second; } if (!export_base_table_items && slices_delta_size.size() == 0) @@ -3947,6 +4022,7 @@ void LocalCcShards::DataSyncForRangePartition( } data_sync_task->SetFinish(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); return; } assert(slices_delta_size.size() > 0 || export_base_table_items); @@ -3979,6 +4055,7 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->SetError(); data_sync_task->SetScanTaskFinished(); + data_sync_task->ResetRangeSplittingStatus(); // Handle the pending tasks for the same range PopPendingTask(ng_id, expected_ng_term, table_name, range_id); @@ -4007,40 +4084,6 @@ void LocalCcShards::DataSyncForRangePartition( } // 3. Scan records. - // The data sync worker thread is the owner of those vectors. - - // Sort output vectors in key sorting order. 
- auto key_greater = [](const std::pair &r1, - const std::pair &r2) -> bool - { return r2.first < r1.first; }; - auto rec_greater = [](const FlushRecord &r1, const FlushRecord &r2) -> bool - { return r2.Key() < r1.Key(); }; - - std::vector> data_sync_vecs; - std::vector> archive_vecs; - std::vector>> mv_base_vecs; - - // Add an extra vector as a remaining vector to store the remaining keys - // of the current batch of FlushRecords. - // DataSyncScanCc request is executed in parallel on all cores. For a - // batch of scan results, the end keys among the cores are different. - // In order to ensure the accuracy of the calculated subslice keys, for - // this batch of FlushRecords, the minimum end key of all cores's scan - // result is obtained, and the FlushRecords after this key is placed in - // this remaining vector, which will be merged with the next batch of - // FlushRecords. For example: core1[10,15,20], core2[8,16,24,32], only - // [8,10,15,16,20] will be flushed into data store in this round,and - // the remaining vector stores [24,32] - for (size_t i = 0; i < (cc_shards_.size() + 1); ++i) - { - data_sync_vecs.emplace_back(); - data_sync_vecs.back().reserve(DATA_SYNC_SCAN_BATCH_SIZE); - archive_vecs.emplace_back(); - archive_vecs.back().reserve(DATA_SYNC_SCAN_BATCH_SIZE); - mv_base_vecs.emplace_back(); - mv_base_vecs.back().reserve(DATA_SYNC_SCAN_BATCH_SIZE); - } - // Scan the FlushRecords. 
// Paused position UpdateSliceStatus update_slice_status; @@ -4053,8 +4096,11 @@ void LocalCcShards::DataSyncForRangePartition( GetRangeOwner(old_range_id, ng_id)->BucketOwner(); NodeGroupId new_range_owner = GetRangeOwner(range_id, ng_id)->BucketOwner(); + uint16_t old_range_owner_shard = (old_range_id & 0x3FF) % Count(); + uint16_t new_range_owner_shard = (range_id & 0x3FF) % Count(); - need_send_range_cache = new_range_owner != old_range_owner; + need_send_range_cache = new_range_owner != old_range_owner || + new_range_owner_shard != old_range_owner_shard; if (need_send_range_cache) { range_cache_sender = std::make_unique( @@ -4073,7 +4119,6 @@ void LocalCcShards::DataSyncForRangePartition( data_sync_task->data_sync_ts_, ng_id, ng_term, - cc_shards_.size(), DATA_SYNC_SCAN_BATCH_SIZE, tx_number, &start_tx_key, @@ -4095,12 +4140,7 @@ void LocalCcShards::DataSyncForRangePartition( while (!scan_data_drained) { - uint32_t core_rand = butil::fast_rand(); - // The scan slice request is dispatched to the first core. The first - // core tries to pin the slice if necessary and if succeeds, further - // dispatches the request to remaining cores for parallel scans. - EnqueueLowPriorityCcRequestToShard(core_rand % cc_shards_.size(), - &scan_cc); + EnqueueLowPriorityCcRequestToShard(dest_core, &scan_cc); scan_cc.Wait(); if (scan_cc.IsError()) @@ -4119,61 +4159,51 @@ void LocalCcShards::DataSyncForRangePartition( else { scan_data_drained = true; - assert(scan_cc.accumulated_flush_data_size_.size() == - cc_shards_.size()); - uint64_t flush_data_size = 0; - for (size_t flush_data_size_per_core : - scan_cc.accumulated_flush_data_size_) - { - flush_data_size += flush_data_size_per_core; - } + uint64_t flush_data_size = scan_cc.accumulated_flush_data_size_; // The cost of FlushRecord also needs to be considered. 
- for (size_t i = 0; i < cc_shards_.size(); ++i) - { #ifdef WITH_JEMALLOC - flush_data_size += - (scan_cc.DataSyncVec(i).size() * sizeof(FlushRecord) + - scan_cc.ArchiveVec(i).size() * sizeof(FlushRecord) + - scan_cc.MoveBaseIdxVec(i).size() * - sizeof(std::pair)); + flush_data_size += + (scan_cc.DataSyncVec().size() * sizeof(FlushRecord) + + scan_cc.ArchiveVec().size() * sizeof(FlushRecord) + + scan_cc.MoveBaseIdxVec().size() * + sizeof(std::pair)); #else - // Check if vectors are empty before calling malloc_usable_size - // to avoid SEGV on nullptr or invalid pointers. - // Use malloc_usable_size when ASan is enabled (vectors may be - // allocated by ASan's allocator), otherwise use - // mi_malloc_usable_size for mimalloc-allocated memory. - auto &data_sync_vec_ref = scan_cc.DataSyncVec(i); - auto &archive_vec_ref = scan_cc.ArchiveVec(i); - auto &move_base_idx_vec_ref = scan_cc.MoveBaseIdxVec(i); + // Check if vectors are empty before calling malloc_usable_size + // to avoid SEGV on nullptr or invalid pointers. + // Use malloc_usable_size when ASan is enabled (vectors may be + // allocated by ASan's allocator), otherwise use + // mi_malloc_usable_size for mimalloc-allocated memory. + auto &data_sync_vec_ref = scan_cc.DataSyncVec(); + auto &archive_vec_ref = scan_cc.ArchiveVec(); + auto &move_base_idx_vec_ref = scan_cc.MoveBaseIdxVec(); #ifdef __SANITIZE_ADDRESS__ - // When ASan is enabled, use standard malloc_usable_size - flush_data_size += - (data_sync_vec_ref.empty() - ? 0 - : malloc_usable_size(data_sync_vec_ref.data())) + - (archive_vec_ref.empty() - ? 0 - : malloc_usable_size(archive_vec_ref.data())) + - (move_base_idx_vec_ref.empty() - ? 0 - : malloc_usable_size(move_base_idx_vec_ref.data())); + // When ASan is enabled, use standard malloc_usable_size + flush_data_size += + (data_sync_vec_ref.empty() + ? 0 + : malloc_usable_size(data_sync_vec_ref.data())) + + (archive_vec_ref.empty() + ? 
0 + : malloc_usable_size(archive_vec_ref.data())) + + (move_base_idx_vec_ref.empty() + ? 0 + : malloc_usable_size(move_base_idx_vec_ref.data())); #else - // When ASan is not enabled, use mimalloc's API - flush_data_size += - (data_sync_vec_ref.empty() - ? 0 - : mi_malloc_usable_size(data_sync_vec_ref.data())) + - (archive_vec_ref.empty() - ? 0 - : mi_malloc_usable_size(archive_vec_ref.data())) + - (move_base_idx_vec_ref.empty() - ? 0 - : mi_malloc_usable_size(move_base_idx_vec_ref.data())); + // When ASan is not enabled, use mimalloc's API + flush_data_size += + (data_sync_vec_ref.empty() + ? 0 + : mi_malloc_usable_size(data_sync_vec_ref.data())) + + (archive_vec_ref.empty() + ? 0 + : mi_malloc_usable_size(archive_vec_ref.data())) + + (move_base_idx_vec_ref.empty() + ? 0 + : mi_malloc_usable_size(move_base_idx_vec_ref.data())); #endif #endif - } // This thread will wait in AllocatePendingFlushDataMemQuota if // quota is not available @@ -4189,53 +4219,6 @@ void LocalCcShards::DataSyncForRangePartition( << " of range: " << range_id << " for table: " << table_name.StringView(); - // The minimum end key of this batch data between all the cores. - TxKey min_scanned_end_key = - GetCatalogFactory(table_name.Engine())->PositiveInfKey(); - for (size_t i = 0; i < cc_shards_.size(); ++i) - { - for (size_t j = 0; j < scan_cc.accumulated_scan_cnt_[i]; ++j) - { - auto &rec = scan_cc.DataSyncVec(i)[j]; - // Clone key - data_sync_vecs[i].emplace_back( - rec.Key().Clone(), - rec.ReleaseVersionedPayload(), - rec.payload_status_, - rec.commit_ts_, - rec.cce_, - rec.post_flush_size_, - range_id); - } - - // Get the minimum end key. 
- if (!data_sync_vecs[i].empty() && - data_sync_vecs[i].back().Key() < min_scanned_end_key) - { - min_scanned_end_key = data_sync_vecs[i].back().Key(); - } - - for (size_t j = 0; j < scan_cc.ArchiveVec(i).size(); ++j) - { - auto &rec = scan_cc.ArchiveVec(i)[j]; - rec.SetKey(data_sync_vecs[i][rec.GetKeyIndex()].Key()); - } - - for (size_t j = 0; j < scan_cc.MoveBaseIdxVec(i).size(); ++j) - { - size_t key_idx = scan_cc.MoveBaseIdxVec(i)[j]; - TxKey key_raw = data_sync_vecs[i][key_idx].Key(); - mv_base_vecs[i].emplace_back(std::move(key_raw), range_id); - } - - // Move the bucket into the tank - std::move(scan_cc.ArchiveVec(i).begin(), - scan_cc.ArchiveVec(i).end(), - std::back_inserter(archive_vecs.at(i))); - - scan_data_drained = scan_cc.IsDrained(i) && scan_data_drained; - } - std::unique_ptr> data_sync_vec = std::make_unique>(); std::unique_ptr> archive_vec = @@ -4244,90 +4227,46 @@ void LocalCcShards::DataSyncForRangePartition( mv_base_vec = std::make_unique>>(); - MergeSortedVectors( - std::move(mv_base_vecs), *mv_base_vec, key_greater, false); - - // Set the ckpt_ts_ of a cc entry repeatedly, which might cause the - // ccentry become invalid in between. But, there should be no - // duplication here. we don't need to remove duplicate record. - MergeSortedVectors( - std::move(data_sync_vecs), *data_sync_vec, rec_greater, false); - - // For archive vec we don't need to worry about duplicate causing - // issue since we're not visiting their cc entry. Also we cannot - // rely on key compare to dedup archive vec since a key could have - // multiple version of archive versions. 
- MergeSortedVectors( - std::move(archive_vecs), *archive_vec, rec_greater, false); - - data_sync_vecs.resize(cc_shards_.size() + 1); - archive_vecs.resize(cc_shards_.size() + 1); - mv_base_vecs.resize(cc_shards_.size() + 1); - for (size_t i = 0; i <= cc_shards_.size(); ++i) + data_sync_vec->reserve(DATA_SYNC_SCAN_BATCH_SIZE); + archive_vec->reserve(DATA_SYNC_SCAN_BATCH_SIZE); + mv_base_vec->reserve(DATA_SYNC_SCAN_BATCH_SIZE); + + for (size_t j = 0; j < scan_cc.accumulated_scan_cnt_; ++j) + { + auto &rec = scan_cc.DataSyncVec()[j]; + // Clone key + data_sync_vec->emplace_back(rec.Key().Clone(), + rec.ReleaseVersionedPayload(), + rec.payload_status_, + rec.commit_ts_, + rec.cce_, + rec.post_flush_size_, + range_id); + } + + for (size_t j = 0; j < scan_cc.ArchiveVec().size(); ++j) { - data_sync_vecs.at(i).clear(); - archive_vecs.at(i).clear(); - mv_base_vecs.at(i).clear(); + auto &rec = scan_cc.ArchiveVec()[j]; + rec.SetKey(data_sync_vec->at(rec.GetKeyIndex()).Key()); } - size_t data_sync_vec_size = data_sync_vec->size(); - // Fix the vector of FlushRecords. 
- if (!scan_data_drained) + for (size_t j = 0; j < scan_cc.MoveBaseIdxVec().size(); ++j) { - // Only flush the keys that are not greater than the - // min_scanned_end_key - auto iter = std::upper_bound( - data_sync_vec->begin(), - data_sync_vec->end(), - min_scanned_end_key, - [](const TxKey &key, const FlushRecord &rec) - { return key < rec.Key(); }); - - auto &remaining_vec = data_sync_vecs[cc_shards_.size()]; - remaining_vec.clear(); - remaining_vec.insert( - remaining_vec.begin(), - std::make_move_iterator(iter), - std::make_move_iterator(data_sync_vec->end())); - data_sync_vec->erase(iter, data_sync_vec->end()); - - // archive vector - auto archive_iter = std::upper_bound( - archive_vec->begin(), - archive_vec->end(), - min_scanned_end_key, - [](const TxKey &key, const FlushRecord &rec) - { return key < rec.Key(); }); - auto &archive_remaining_vec = archive_vecs[cc_shards_.size()]; - archive_remaining_vec.clear(); - archive_remaining_vec.insert( - archive_remaining_vec.begin(), - std::make_move_iterator(archive_iter), - std::make_move_iterator(archive_vec->end())); - archive_vec->erase(archive_iter, archive_vec->end()); - - // mv base vector - auto mv_base_iter = std::upper_bound( - mv_base_vec->begin(), - mv_base_vec->end(), - min_scanned_end_key, - [](const TxKey &t_key, - const std::pair &key_and_partition_id) - { return t_key < key_and_partition_id.first; }); - auto &mv_base_remaining_vec = mv_base_vecs[cc_shards_.size()]; - mv_base_remaining_vec.clear(); - mv_base_remaining_vec.insert( - mv_base_remaining_vec.begin(), - std::make_move_iterator(mv_base_iter), - std::make_move_iterator(mv_base_vec->end())); - mv_base_vec->erase(mv_base_iter, mv_base_vec->end()); + size_t key_idx = scan_cc.MoveBaseIdxVec()[j]; + TxKey key_raw = data_sync_vec->at(key_idx).Key(); + mv_base_vec->emplace_back(std::move(key_raw), range_id); } + // Move the bucket into the tank + std::move(scan_cc.ArchiveVec().begin(), + scan_cc.ArchiveVec().end(), + 
std::back_inserter(*archive_vec)); + + scan_data_drained = scan_cc.IsDrained(); + if (data_sync_vec->empty()) { - LOG(WARNING) << "data_sync_vec becomes empty after erase, old " - "size of data_sync_vec_size: " - << data_sync_vec_size; + LOG(WARNING) << "data_sync_vec is empty."; // Reset scan_cc.Reset(); // Return the quota to flush data memory usage pool since the @@ -4403,20 +4342,17 @@ void LocalCcShards::DataSyncForRangePartition( table_schema, flush_data_size)); - for (size_t i = 0; i < cc_shards_.size(); ++i) + if (scan_cc.scan_heap_is_full_ == 1) { - if (scan_cc.scan_heap_is_full_[i] == 1) - { - // Clear the FlushRecords' memory of scan cc since the - // DataSyncScan heap is full. - auto &data_sync_vec_ref = scan_cc.DataSyncVec(i); - auto &archive_vec_ref = scan_cc.ArchiveVec(i); - ReleaseDataSyncScanHeapCc release_scan_heap_cc( - &data_sync_vec_ref, &archive_vec_ref); - EnqueueLowPriorityCcRequestToShard(i, - &release_scan_heap_cc); - release_scan_heap_cc.Wait(); - } + // Clear the FlushRecords' memory of scan cc since the + // DataSyncScan heap is full. + auto &data_sync_vec_ref = scan_cc.DataSyncVec(); + auto &archive_vec_ref = scan_cc.ArchiveVec(); + ReleaseDataSyncScanHeapCc release_scan_heap_cc( + &data_sync_vec_ref, &archive_vec_ref); + EnqueueLowPriorityCcRequestToShard(dest_core, + &release_scan_heap_cc); + release_scan_heap_cc.Wait(); } // Reset scan_cc.Reset(); @@ -4431,19 +4367,12 @@ void LocalCcShards::DataSyncForRangePartition( } // Release scan heap memory after scan finish. 
- std::list req_vec; - for (size_t core_idx = 0; core_idx < Count(); ++core_idx) - { - auto &data_sync_vec_ref = scan_cc.DataSyncVec(core_idx); - auto &archive_vec_ref = scan_cc.ArchiveVec(core_idx); - req_vec.emplace_back(&data_sync_vec_ref, &archive_vec_ref); - EnqueueLowPriorityCcRequestToShard(core_idx, &req_vec.back()); - } - while (req_vec.size() > 0) - { - req_vec.back().Wait(); - req_vec.pop_back(); - } + auto &data_sync_vec_ref = scan_cc.DataSyncVec(); + auto &archive_vec_ref = scan_cc.ArchiveVec(); + ReleaseDataSyncScanHeapCc release_scan_heap_cc(&data_sync_vec_ref, + &archive_vec_ref); + EnqueueLowPriorityCcRequestToShard(dest_core, &release_scan_heap_cc); + release_scan_heap_cc.Wait(); PostProcessRangePartitionDataSyncTask(std::move(data_sync_task), data_sync_txm, @@ -4939,6 +4868,7 @@ void LocalCcShards::DataSyncForHashPartition( req_ptr = upload_batch_closure->UploadBatchRequest(); req_ptr->set_node_group_id(dest_ng); req_ptr->set_node_group_term(-1); + req_ptr->set_partition_id(-1); req_ptr->set_table_name_str(table_name.String()); req_ptr->set_table_type( remote::ToRemoteType::ConvertTableType( @@ -5281,12 +5211,20 @@ void LocalCcShards::PopPendingTask(NodeGroupId ng_id, { std::shared_ptr task = iter->second->pending_tasks_.front(); - iter->second->pending_tasks_.pop(); + iter->second->pending_tasks_.pop_front(); task_limiter_lk.unlock(); std::lock_guard task_worker_lk(data_sync_worker_ctx_.mux_); - data_sync_task_queue_[id % data_sync_task_queue_.size()].push_back( - std::move(task)); + auto &task_queue = + data_sync_task_queue_[id % data_sync_task_queue_.size()]; + if (task->high_priority_) + { + task_queue.push_front(std::move(task)); + } + else + { + task_queue.push_back(std::move(task)); + } data_sync_worker_ctx_.cv_.notify_all(); } else @@ -5318,7 +5256,7 @@ void LocalCcShards::ClearAllPendingTasks(NodeGroupId ng_id, auto &task = iter->second->pending_tasks_.front(); task->SetError(CcErrorCode::REQUESTED_TABLE_NOT_EXISTS); 
task->SetScanTaskFinished(); - iter->second->pending_tasks_.pop(); + iter->second->pending_tasks_.pop_front(); } task_limiters_.erase(iter); @@ -5899,7 +5837,9 @@ void LocalCcShards::FlushData(std::unique_lock &flush_worker_lk) size_t key_core_idx = 0; if (!table_name.IsHashPartitioned()) { - key_core_idx = (rec.Key().Hash() & 0x3FF) % Count(); + int32_t range_id = entry->data_sync_task_->id_; + key_core_idx = static_cast( + (range_id & 0x3FF) % Count()); } else { @@ -6992,79 +6932,84 @@ void LocalCcShards::RangeCacheSender::SendRangeCacheRequest( // 1- upload dirty range slices info (with PartiallyCached) int64_t ng_term = INIT_TERM; - remote::CcRpcService_Stub stub(channel_.get()); - - brpc::Controller cntl; - cntl.set_timeout_ms(10000); - cntl.set_write_to_socket_in_background(true); - // cntl.ignore_eovercrowded(true); - remote::UploadRangeSlicesRequest req; - remote::UploadRangeSlicesResponse resp; - - req.set_node_group_id(new_range_owner_); - req.set_ng_term(ng_term); - req.set_table_name_str(table_name_.String()); - req.set_table_engine( - remote::ToRemoteType::ConvertTableEngine(table_name_.Engine())); - req.set_old_partition_id(old_range_id_); - req.set_version_ts(version_ts_); - req.set_new_partition_id(new_range_id_); - req.set_new_slices_num(slices_vec_.size()); - std::string *keys_str = req.mutable_new_slices_keys(); - std::string *sizes_str = req.mutable_new_slices_sizes(); - std::string *status_str = req.mutable_new_slices_status(); - for (const StoreSlice *slice : slices_vec_) - { - // key - TxKey slice_key = slice->StartTxKey(); - slice_key.Serialize(*keys_str); - // size - // If post ckpt size of the slice is UINT64_MAX, it means that there is - // no item need to be ckpt in this slice, so should use the current size - // of the slice. - uint32_t slice_size = - (slice->PostCkptSize() == UINT64_MAX ? 
slice->Size() - : slice->PostCkptSize()); - const char *slice_size_ptr = - reinterpret_cast(&slice_size); - sizes_str->append(slice_size_ptr, sizeof(slice_size)); - // status - int8_t slice_status = static_cast(SliceStatus::PartiallyCached); - const char *slice_status_ptr = - reinterpret_cast(&slice_status); - status_str->append(slice_status_ptr, sizeof(slice_status)); - } - req.set_has_dml_since_ddl(store_range_->HasDmlSinceDdl()); - stub.UploadRangeSlices(&cntl, &req, &resp, nullptr); - - if (cntl.Failed()) - { - LOG(WARNING) << "SendRangeCacheRequest: Fail to upload dirty range " - "slices RPC ng#" - << new_range_owner_ << ". Error code: " << cntl.ErrorCode() - << ". Msg: " << cntl.ErrorText(); - return; - } + if (new_range_owner_ != ng_id_) + { + remote::CcRpcService_Stub stub(channel_.get()); + + brpc::Controller cntl; + cntl.set_timeout_ms(10000); + cntl.set_write_to_socket_in_background(true); + // cntl.ignore_eovercrowded(true); + remote::UploadRangeSlicesRequest req; + remote::UploadRangeSlicesResponse resp; + + req.set_node_group_id(new_range_owner_); + req.set_ng_term(ng_term); + req.set_table_name_str(table_name_.String()); + req.set_table_engine( + remote::ToRemoteType::ConvertTableEngine(table_name_.Engine())); + req.set_old_partition_id(old_range_id_); + req.set_version_ts(version_ts_); + req.set_new_partition_id(new_range_id_); + req.set_new_slices_num(slices_vec_.size()); + std::string *keys_str = req.mutable_new_slices_keys(); + std::string *sizes_str = req.mutable_new_slices_sizes(); + std::string *status_str = req.mutable_new_slices_status(); + for (const StoreSlice *slice : slices_vec_) + { + // key + TxKey slice_key = slice->StartTxKey(); + slice_key.Serialize(*keys_str); + // size + // If post ckpt size of the slice is UINT64_MAX, it means that there + // is no item need to be ckpt in this slice, so should use the + // current size of the slice. + uint32_t slice_size = + (slice->PostCkptSize() == UINT64_MAX ? 
slice->Size() + : slice->PostCkptSize()); + const char *slice_size_ptr = + reinterpret_cast(&slice_size); + sizes_str->append(slice_size_ptr, sizeof(slice_size)); + // status + int8_t slice_status = + static_cast(SliceStatus::PartiallyCached); + const char *slice_status_ptr = + reinterpret_cast(&slice_status); + status_str->append(slice_status_ptr, sizeof(slice_status)); + } + req.set_has_dml_since_ddl(store_range_->HasDmlSinceDdl()); + stub.UploadRangeSlices(&cntl, &req, &resp, nullptr); + + if (cntl.Failed()) + { + LOG(WARNING) << "SendRangeCacheRequest: Fail to upload dirty range " + "slices RPC ng#" + << new_range_owner_ + << ". Error code: " << cntl.ErrorCode() + << ". Msg: " << cntl.ErrorText(); + return; + } - if (remote::ToLocalType::ConvertCcErrorCode(resp.error_code()) != - CcErrorCode::NO_ERROR) - { - LOG(WARNING) << "SendRangeCacheRequest: New owner ng#" - << new_range_owner_ - << " reject to receive dirty range data"; - return; - } + if (remote::ToLocalType::ConvertCcErrorCode(resp.error_code()) != + CcErrorCode::NO_ERROR) + { + LOG(WARNING) << "SendRangeCacheRequest: New owner ng#" + << new_range_owner_ + << " reject to receive dirty range data"; + return; + } - ng_term = resp.ng_term(); - LOG(INFO) << "SendRangeCacheRequest: Uploaded new range slices info to " - "future owner, range#" - << old_range_id_ << ", new_range#" << new_range_id_; + ng_term = resp.ng_term(); + LOG(INFO) << "SendRangeCacheRequest: Uploaded new range slices info to " + "future owner, range#" + << old_range_id_ << ", new_range#" << new_range_id_; + } // 2- upload records belongs to dirty range assert(closure_vec_->size() > 0); LOG(INFO) << "SendRangeCacheRequest: Sending range data, old_range_id: " << old_range_id_ << ", to upload " << closure_vec_->size() - << " batches to ng#" << new_range_owner_; + << " batches to ng#" << new_range_owner_ << " from ng#" << ng_id_; uint32_t sender_cnt = 5; auto closures_idx = std::make_shared(sender_cnt); @@ -7084,6 +7029,8 @@ void 
LocalCcShards::RangeCacheSender::SendRangeCacheRequest( size_t vec_size = vec.size(); size_t end_idx = std::min(begin_idx + 5, vec_size); bool rejected = false; + int64_t term = + ng_term == INIT_TERM ? dest_ng_term : ng_term; while (begin_idx < end_idx) { std::unique_ptr closure( @@ -7096,6 +7043,7 @@ void LocalCcShards::RangeCacheSender::SendRangeCacheRequest( end_idx = std::min(begin_idx + 5, vec_size); } + rejected = rejected || term != dest_ng_term; if (rejected) { // Must continue to delete left closures in @@ -7110,7 +7058,7 @@ void LocalCcShards::RangeCacheSender::SendRangeCacheRequest( cntl_ptr->set_timeout_ms(closure->TimeoutValue()); // Fix the term closure->UploadBatchRequest()->set_node_group_term( - ng_term); + term); stub.UploadBatchSlices(cntl_ptr, closure->UploadBatchRequest(), closure->UploadBatchResponse(), @@ -7131,6 +7079,7 @@ void LocalCcShards::RangeCacheSender::SendRangeCacheRequest( << closure->NodeId() << " is reject for no free memory"; } + term = resp->ng_term(); } LOG(INFO) << "Old_Range#" << range_id diff --git a/tx_service/src/cc/range_slice.cpp b/tx_service/src/cc/range_slice.cpp index 91b1973b..baa051ee 100644 --- a/tx_service/src/cc/range_slice.cpp +++ b/tx_service/src/cc/range_slice.cpp @@ -70,10 +70,9 @@ void StoreSlice::StartLoading(FillStoreSliceCc *fill_req, assert(pins_ == 0); status_ = SliceStatus::BeingLoaded; - for (uint16_t core_id = 0; core_id < cc_shards.Count(); ++core_id) - { - cc_shards.EnqueueCcRequest(core_id, fill_req); - } + uint16_t dest_core = static_cast( + (fill_req->PartitionId() & 0x3FF) % cc_shards.Count()); + cc_shards.EnqueueToCcShard(dest_core, fill_req); } void StoreSlice::CommitLoading(StoreRange &range, uint32_t slice_size) @@ -173,19 +172,9 @@ void StoreSlice::InitKeyCache(CcShard *cc_shard, pins_++; init_key_cache_cc_ = cc_shard->NewInitKeyCacheCc(); - init_key_cache_cc_->Reset(range, - this, - range->local_cc_shards_.Count(), - *tbl_name, - term, - ng_id); - - uint16_t core_cnt = 
range->local_cc_shards_.Count(); - for (uint16_t core_id = 0; core_id < core_cnt; core_id++) - { - Sharder::Instance().GetLocalCcShards()->EnqueueToCcShard( - core_id, init_key_cache_cc_); - } + init_key_cache_cc_->Reset(range, this, *tbl_name, term, ng_id); + + cc_shard->Enqueue(init_key_cache_cc_); } } @@ -254,17 +243,12 @@ StoreRange::StoreRange(uint32_t partition_id, estimate_rec_size)); } - uint16_t core_cnt = Sharder::Instance().GetLocalCcShardsCount(); - for (uint16_t id = 0; id < core_cnt; id++) - { - key_cache_.push_back( - std::make_unique>( - key_cache_size / core_cnt)); - } + key_cache_ = std::make_unique>( + key_cache_size); } else { - key_cache_.resize(0); + key_cache_ = nullptr; } } @@ -449,12 +433,11 @@ bool StoreRange::SampleSubRangeKeys(StoreSlice *slice, &end_key, key_cnt); - // Send the request to one shard randomly. - uint64_t core_rand = butil::fast_rand(); - local_cc_shards_.EnqueueLowPriorityCcRequestToShard( - core_rand % local_cc_shards_.Count(), &sample_keys_cc); - DLOG(INFO) << "Send the sample range keys request to shard#" - << core_rand % local_cc_shards_.Count(); + uint16_t dest_core = static_cast((partition_id_ & 0x3FF) % + local_cc_shards_.Count()); + local_cc_shards_.EnqueueLowPriorityCcRequestToShard(dest_core, + &sample_keys_cc); + DLOG(INFO) << "Send the sample range keys request to shard#" << dest_core; sample_keys_cc.Wait(); CcErrorCode res = sample_keys_cc.ErrorCode(); diff --git a/tx_service/src/data_sync_task.cpp b/tx_service/src/data_sync_task.cpp index ec1e4815..d12d8c30 100644 --- a/tx_service/src/data_sync_task.cpp +++ b/tx_service/src/data_sync_task.cpp @@ -79,7 +79,8 @@ DataSyncTask::DataSyncTask(const TableName &table_name, range_entry_(range_entry), during_split_range_(true), export_base_table_items_(export_base_table_items), - tx_number_(txn) + tx_number_(txn), + high_priority_(true) { assert(!table_name_.IsHashPartitioned()); if (start_key_.KeyPtr() == @@ -98,7 +99,15 @@ DataSyncTask::DataSyncTask(const TableName 
&table_name, .GetLocalCcShards() ->GetRangeOwner(id_, ng_id) ->BucketOwner(); - need_update_ckpt_ts_ = range_owner == ng_id; + + size_t local_shard_count = Sharder::Instance().GetLocalCcShardsCount(); + int32_t old_range_id = range_entry_->GetRangeInfo()->PartitionId(); + uint16_t old_range_owner_shard = + static_cast((old_range_id & 0x3FF) % local_shard_count); + uint16_t new_range_owner_shard = + static_cast((id_ & 0x3FF) % local_shard_count); + need_update_ckpt_ts_ = + range_owner == ng_id && old_range_owner_shard == new_range_owner_shard; } void DataSyncTask::SetFinish() @@ -227,4 +236,24 @@ void DataSyncTask::SetScanTaskFinished() } } +void DataSyncTask::ResetRangeSplittingStatus() +{ + if (!high_priority_ || during_split_range_) + { + return; + } + + WaitableCc reset_cc( + [&](CcShard &ccs) + { + ccs.ResetRangeSplittingStatus(table_name_, node_group_id_, id_); + return true; + }); + + LocalCcShards *local_cc_shards = Sharder::Instance().GetLocalCcShards(); + uint16_t dest_core = (id_ & 0x3FF) % local_cc_shards->Count(); + local_cc_shards->EnqueueToCcShard(dest_core, &reset_cc); + reset_cc.Wait(); +} + } // namespace txservice diff --git a/tx_service/src/fault/log_replay_service.cpp b/tx_service/src/fault/log_replay_service.cpp index 739caa70..8c65958f 100644 --- a/tx_service/src/fault/log_replay_service.cpp +++ b/tx_service/src/fault/log_replay_service.cpp @@ -584,6 +584,21 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, auto res_pair = table_range_split_cnt.try_emplace( base_table_name, std::make_shared(0)); + // Record split range commit ts for data log replay. 
+ ::txlog::SplitRangeOpMessage ds_split_range_op_msg; + if (!ds_split_range_op_msg.ParseFromArray( + split_range_op_blob.data() + blob_offset, + split_range_op_blob.length() - blob_offset)) + { + recovery_error = true; + CleanSplitRangeInfo(cc_ng_id); + return 0; + } + int32_t range_id = ds_split_range_op_msg.partition_id(); + uint64_t split_commit_ts = split_range_msg.commit_ts(); + SetSplitRangeInfo( + cc_ng_id, base_table_name, range_id, split_commit_ts); + // Replay Split ReplayLogCc *cc_req = replay_cc_pool_.NextRequest(); cc_req->Reset( @@ -611,6 +626,7 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, stream_id, mux, on_fly_cnt, status, recovery_error); if (recovery_error) { + CleanSplitRangeInfo(cc_ng_id); return 0; } } @@ -618,6 +634,7 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, // parse and process log records if (!msg.has_finish()) { + const auto *split_range_info = GetSplitRangeInfo(cc_ng_id); ParseDataLogCc *cc_req = parse_datalog_cc_pool_.NextRequest(); cc_req->Reset(std::move(msg), cc_ng_id, @@ -626,7 +643,8 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, status, on_fly_cnt, recovery_error, - is_lock_recovery); + is_lock_recovery, + split_range_info); on_fly_cnt.fetch_add(1, std::memory_order_release); local_shards_.EnqueueCcRequest(next_core, cc_req); next_core = (next_core + 1) % local_shards_.Count(); @@ -634,6 +652,7 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, else // has finish message { const std::string &log_records = msg.binary_log_records(); + const auto *split_range_info = GetSplitRangeInfo(cc_ng_id); ParseDataLogCc *cc_req = parse_datalog_cc_pool_.NextRequest(); cc_req->Reset(log_records, cc_ng_id, @@ -642,7 +661,8 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, status, on_fly_cnt, recovery_error, - is_lock_recovery); + is_lock_recovery, + split_range_info); on_fly_cnt.fetch_add(1, std::memory_order_release); 
local_shards_.EnqueueCcRequest(next_core, cc_req); next_core = (next_core + 1) % local_shards_.Count(); @@ -687,6 +707,8 @@ int RecoveryService::on_received_messages(brpc::StreamId stream_id, << ", log group: " << info->log_group_id_ << ", set recovering status to finished"; } + // Clean up split range info for this node group. + CleanSplitRangeInfo(cc_ng_id); brpc::StreamClose(stream_id); // assumption: finish message must be the last message so return return 0; @@ -1060,5 +1082,40 @@ void RecoveryService::ProcessRecoverTxTask(RecoverTxTask &task) } } +void RecoveryService::SetSplitRangeInfo(uint32_t ng_id, + TableName table_name, + int32_t range_id, + uint64_t commit_ts) +{ + auto ng_it = split_range_info_.try_emplace(ng_id).first; + auto &table_map = ng_it->second; + auto table_it = + table_map + .try_emplace(table_name, std::unordered_map{}) + .first; + auto &range_map = table_it->second; + auto [it, inserted] = range_map.try_emplace(range_id, commit_ts); + if (!inserted) + { + it->second = commit_ts; + } +} + +const std::unordered_map> * +RecoveryService::GetSplitRangeInfo(uint32_t ng_id) const +{ + auto ng_it = split_range_info_.find(ng_id); + if (ng_it == split_range_info_.end()) + { + return nullptr; + } + return &ng_it->second; +} + +void RecoveryService::CleanSplitRangeInfo(uint32_t ng_id) +{ + split_range_info_.erase(ng_id); +} + } // namespace fault } // namespace txservice diff --git a/tx_service/src/remote/cc_node_service.cpp b/tx_service/src/remote/cc_node_service.cpp index 123cd440..37a2e7c7 100644 --- a/tx_service/src/remote/cc_node_service.cpp +++ b/tx_service/src/remote/cc_node_service.cpp @@ -1172,6 +1172,7 @@ void CcNodeService::UploadBatch( NodeGroupId ng_id = request->node_group_id(); int64_t ng_term = request->node_group_term(); + int32_t partition_id = request->partition_id(); std::string_view table_name_sv{request->table_name_str()}; TableType table_type = @@ -1199,14 +1200,15 @@ void CcNodeService::UploadBatch( << " for table:" << 
table_name.Trace(); LocalCcShards *cc_shards = Sharder::Instance().GetLocalCcShards(); - size_t core_cnt = cc_shards->Count(); + size_t core_cnt = (partition_id >= 0) ? 1 : cc_shards->Count(); uint32_t batch_size = request->batch_size(); auto write_entry_tuple = UploadBatchCc::WriteEntryTuple(request->keys(), request->records(), request->commit_ts(), - request->rec_status()); + request->rec_status(), + request->range_size_flags()); size_t finished_req = 0; bthread::Mutex req_mux; @@ -1217,6 +1219,7 @@ void CcNodeService::UploadBatch( req.Reset(table_name, ng_id, ng_term, + partition_id, core_cnt, batch_size, write_entry_tuple, @@ -1224,9 +1227,18 @@ void CcNodeService::UploadBatch( req_cv, finished_req, data_type); - for (size_t core = 0; core < core_cnt; ++core) + if (partition_id >= 0) + { + uint16_t dest_core = + static_cast((partition_id & 0x3FF) % cc_shards->Count()); + cc_shards->EnqueueToCcShard(dest_core, &req); + } + else { - cc_shards->EnqueueToCcShard(core, &req); + for (size_t core = 0; core < cc_shards->Count(); ++core) + { + cc_shards->EnqueueToCcShard(core, &req); + } } { @@ -1383,30 +1395,32 @@ void CcNodeService::UploadBatchSlices( } UploadBatchSlicesCc req; - req.Reset( - table_name, ng_id, ng_term, core_cnt, write_entry_tuple, slices_info); + req.Reset(table_name, ng_id, ng_term, write_entry_tuple, slices_info); - // Select a core randomly to parse items. After parsed, this core will push - // the request to other cores to emplace keys. 
- uint16_t rand_core = std::rand() % core_cnt; - cc_shards->EnqueueToCcShard(rand_core, &req); + uint16_t dest_core = + static_cast((slices_info->new_range_ & 0x3FF) % core_cnt); + cc_shards->EnqueueToCcShard(dest_core, &req); req.Wait(); CcErrorCode err = CcErrorCode::NO_ERROR; if (req.ErrorCode() != CcErrorCode::NO_ERROR) { - LOG(INFO) << "CcNodeService UploadBatch RPC of #ng" << ng_id + LOG(INFO) << "CcNodeService UploadBatchRecordCache RPC of #ng" << ng_id + << " for range#" << slices_info->range_ << ", new_range#" + << slices_info->new_range_ << " finished with error: " << static_cast(err); err = req.ErrorCode(); } else { - DLOG(INFO) << "CcNodeService UploadBatch RPC of #ng" << ng_id + DLOG(INFO) << "CcNodeService UploadBatchRecordCache RPC of #ng" << ng_id + << " for range#" << slices_info->range_ << ", new_range#" + << slices_info->new_range_ << " finished with error: " << static_cast(err); } response->set_error_code(ToRemoteType::ConvertCcErrorCode(err)); - response->set_ng_term(ng_term); + response->set_ng_term(req.CcNgTerm()); } void CcNodeService::FetchPayload( diff --git a/tx_service/src/remote/cc_stream_receiver.cpp b/tx_service/src/remote/cc_stream_receiver.cpp index 3a0166e7..e015881e 100644 --- a/tx_service/src/remote/cc_stream_receiver.cpp +++ b/tx_service/src/remote/cc_stream_receiver.cpp @@ -377,44 +377,14 @@ void CcStreamReceiver::PreProcessScanResp( ToLocalType::ConvertSlicePosition(msg->slice_position()); const char *tuple_cnt_info = msg->tuple_cnt().data(); - uint16_t remote_core_cnt = *((const uint16_t *) tuple_cnt_info); - tuple_cnt_info += sizeof(uint16_t); - range_scanner.ResetShards(remote_core_cnt); + size_t tuple_cnt = *((const size_t *) tuple_cnt_info); - const uint64_t *term_ptr = (const uint64_t *) msg->term().data(); - - // The offset_table stores the start postition of meta data like `key_ts` - // for all remote cores - std::vector offset_table; - size_t meta_offset = 0; - - range_scanner.SetPartitionNgTerm(-1); - - bool 
all_remote_core_no_more_data = true; - - for (uint16_t core_id = 0; core_id < remote_core_cnt; ++core_id) - { - size_t tuple_cnt = *((const size_t *) tuple_cnt_info); - tuple_cnt_info += sizeof(size_t); - - all_remote_core_no_more_data = - all_remote_core_no_more_data && (tuple_cnt == 0); - - // All term value are same. We only set `partition_ng_term` once. - if (range_scanner.PartitionNgTerm() == -1 && tuple_cnt != 0) - { - range_scanner.SetPartitionNgTerm(term_ptr[0]); - } + bool remote_no_more_data = tuple_cnt == 0; - offset_table.push_back(meta_offset); - meta_offset += tuple_cnt; - term_ptr += tuple_cnt; - } - - assert(offset_table.size() == remote_core_cnt); + const uint64_t *term_ptr = (const uint64_t *) msg->term().data(); // No more data. - if (all_remote_core_no_more_data) + if (remote_no_more_data) { if (msg->error_code() != 0) { @@ -430,21 +400,18 @@ void CcStreamReceiver::PreProcessScanResp( RecycleScanSliceResp(std::move(msg)); return; } - - // Worker count means how many tx processer to parallel deserialize msg. 
- // remote core count is not always equal to local core count - size_t worker_cnt = std::min((size_t) remote_core_cnt, - Sharder::Instance().GetLocalCcShardsCount()); + else + { + range_scanner.SetPartitionNgTerm(term_ptr[0]); + } ProcessRemoteScanRespCc *request = process_remote_scan_resp_pool_.NextRequest(); - request->Reset( - this, std::move(msg), std::move(offset_table), hd_res, worker_cnt); + request->Reset(this, std::move(msg), hd_res); - for (size_t idx = 0; idx < worker_cnt; ++idx) - { - local_shards_.EnqueueCcRequest(idx, request); - } + uint32_t core_rand = butil::fast_rand(); + uint16_t dest_core = core_rand % local_shards_.Count(); + local_shards_.EnqueueCcRequest(dest_core, request); } void CcStreamReceiver::OnReceiveCcMsg(std::unique_ptr msg) @@ -1283,9 +1250,8 @@ void CcStreamReceiver::OnReceiveCcMsg(std::unique_ptr msg) case CcMessage::MessageType::CcMessage_MessageType_ScanSliceRequest: { RemoteScanSlice *scan_slice_req = scan_slice_pool.NextRequest(); - uint32_t local_core_cnt = (uint32_t) local_shards_.Count(); TX_TRACE_ASSOCIATE(msg.get(), scan_slice_req); - scan_slice_req->Reset(std::move(msg), local_core_cnt); + scan_slice_req->Reset(std::move(msg)); // The scan slice request is enqueued into the first core, where it pins // the slice and sets the scan's end key. The request is then dispatched // to remaining cores to scan the slice in parallel. 
diff --git a/tx_service/src/remote/remote_cc_handler.cpp b/tx_service/src/remote/remote_cc_handler.cpp index 848ae8f7..7b863703 100644 --- a/tx_service/src/remote/remote_cc_handler.cpp +++ b/tx_service/src/remote/remote_cc_handler.cpp @@ -159,7 +159,9 @@ void txservice::remote::RemoteCcHandler::PostWrite( const TxRecord *record, OperationType operation_type, uint32_t key_shard_code, - CcHandlerResult &hres) + CcHandlerResult &hres, + int32_t partition_id, + bool on_dirty_range) { CcMessage send_msg; @@ -194,6 +196,8 @@ void txservice::remote::RemoteCcHandler::PostWrite( post_commit->set_commit_ts(commit_ts); post_commit->set_operation_type(static_cast(operation_type)); post_commit->set_key_shard_code(key_shard_code); + post_commit->set_partition_id(partition_id); + post_commit->set_on_dirty_range(on_dirty_range); stream_sender_.SendMessageToNg(cce_addr.NodeGroupId(), send_msg, &hres); } @@ -720,20 +724,15 @@ void txservice::remote::RemoteCcHandler::ScanNext( CcScanner &scanner = *hd_res.Value().ccm_scanner_; - scan_slice->clear_prior_cce_lock_vec(); + scan_slice->clear_prior_cce_lock(); // When the cc ng term is greater than 0, this scan resumes the last scan in // the range. Sets the cc entry addresses where last scan stops. if (cc_ng_term > 0) { - uint32_t remote_core_cnt = scanner.ShardCount(); - - for (uint32_t core_id = 0; core_id < remote_core_cnt; ++core_id) - { - ScanCache *cache = scanner.Cache(core_id); - const ScanTuple *last_tuple = cache->LastTuple(); - scan_slice->add_prior_cce_lock_vec( - last_tuple != nullptr ? last_tuple->cce_addr_.CceLockPtr() : 0); - } + ScanCache *cache = scanner.Cache(0); + const ScanTuple *last_tuple = cache->LastTuple(); + scan_slice->set_prior_cce_lock( + last_tuple != nullptr ? 
last_tuple->cce_addr_.CceLockPtr() : 0); scanner.ResetCaches(); } diff --git a/tx_service/src/remote/remote_cc_request.cpp b/tx_service/src/remote/remote_cc_request.cpp index 32fbb935..7b24630b 100644 --- a/tx_service/src/remote/remote_cc_request.cpp +++ b/tx_service/src/remote/remote_cc_request.cpp @@ -594,7 +594,9 @@ void txservice::remote::RemotePostWrite::Reset( rec_str, static_cast(post_commit.operation_type()), post_commit.key_shard_code(), - &cc_res_); + &cc_res_, + post_commit.partition_id(), + post_commit.on_dirty_range()); } else { @@ -1317,7 +1319,6 @@ bool txservice::remote::RemoteScanNextBatch::EndKeyInclusive() txservice::remote::RemoteScanSlice::RemoteScanSlice() { - parallel_req_ = true; res_ = &cc_res_; cc_res_.Value().is_local_ = false; @@ -1359,8 +1360,8 @@ txservice::remote::RemoteScanSlice::RemoteScanSlice() const RangeScanSliceResult &slice_result = cc_res_.Value(); output_msg_.clear_last_key(); - auto [last_key, key_set] = slice_result.PeekLastKey(); - assert(key_set || cc_res_.IsError()); + const TxKey *last_key = slice_result.LastKey(); + assert(last_key != nullptr || cc_res_.IsError()); // Only sends back the last key if this scan batch is not the last. The // next scan batch will use this last key as the beginning of the next // batch. @@ -1376,95 +1377,69 @@ txservice::remote::RemoteScanSlice::RemoteScanSlice() output_msg_.set_slice_position( ToRemoteType::ConvertSlicePosition(slice_result.slice_position_)); - uint16_t core_cnt = GetShardCount(); - // Add core cnt first - output_msg_.mutable_tuple_cnt()->append((const char *) &core_cnt, - sizeof(uint16_t)); - // Add tuple count for each core - for (size_t idx = 0; idx < core_cnt; ++idx) - { - size_t tuple_cnt; - if (send_cache) - { - tuple_cnt = scan_cache_vec_[idx].rec_status_.size(); - } - else - { - tuple_cnt = 0; - } - output_msg_.mutable_tuple_cnt()->append((const char *) &tuple_cnt, - sizeof(size_t)); - } + // Add tuple count + size_t tuple_cnt = + send_cache ? 
slice_result.remote_scan_caches_->Size() : 0; + output_msg_.mutable_tuple_cnt()->append((const char *) &tuple_cnt, + sizeof(size_t)); if (send_cache) { - // Merge scan cache info into a single byte array to reduce - // deserialization time on the receiver side. - for (size_t idx = 0; idx < core_cnt; ++idx) - { - RemoteScanSliceCache &cache = scan_cache_vec_[idx]; - - size_t keys_start_offset = output_msg_.keys().size(); - output_msg_.mutable_key_start_offsets()->append( - (const char *) &keys_start_offset, sizeof(size_t)); - size_t record_start_offset = output_msg_.records().size(); - output_msg_.mutable_record_start_offsets()->append( - (const char *) &record_start_offset, sizeof(size_t)); + output_msg_.mutable_keys()->append(scan_cache_.keys_); - output_msg_.mutable_keys()->append(cache.keys_); - - if (cache.archive_positions_.size() > 0) + if (scan_cache_.archive_positions_.size() > 0) + { + // Merge the backfilled archive records. + size_t rec_offset = 0; + for (size_t j = 0; j < scan_cache_.archive_positions_.size(); + j++) { - // Merge the backfilled archive records. 
- size_t rec_offset = 0; - for (size_t j = 0; j < cache.archive_positions_.size(); j++) - { - output_msg_.mutable_records()->append( - cache.records_.data() + rec_offset, - cache.records_.data() + - cache.archive_positions_[j].second); - rec_offset = cache.archive_positions_[j].second; - assert(cache.archive_records_[j].size() > 0); - output_msg_.mutable_records()->append( - cache.archive_records_[j]); - } output_msg_.mutable_records()->append( - cache.records_.data() + rec_offset, - cache.records_.data() + cache.records_.size()); - } - else - { - output_msg_.mutable_records()->append(cache.records_); + scan_cache_.records_.data() + rec_offset, + scan_cache_.records_.data() + + scan_cache_.archive_positions_[j].second); + rec_offset = scan_cache_.archive_positions_[j].second; + assert(scan_cache_.archive_records_[j].size() > 0); + output_msg_.mutable_records()->append( + scan_cache_.archive_records_[j]); } - - output_msg_.mutable_key_ts()->append( - (const char *) cache.key_ts_.data(), - cache.key_ts_.size() * sizeof(uint64_t)); - output_msg_.mutable_gap_ts()->append( - (const char *) cache.gap_ts_.data(), - cache.gap_ts_.size() * sizeof(uint64_t)); - output_msg_.mutable_term()->append( - (const char *) cache.term_.data(), - cache.term_.size() * sizeof(uint64_t)); - output_msg_.mutable_cce_lock_ptr()->append( - (const char *) cache.cce_lock_ptr_.data(), - cache.cce_lock_ptr_.size() * sizeof(uint64_t)); - output_msg_.mutable_rec_status()->append( - (const char *) cache.rec_status_.data(), - cache.rec_status_.size() * sizeof(RecordStatusType)); - - output_msg_.mutable_trailing_cnts()->append( - (const char *) &cache.trailing_cnt_, sizeof(size_t)); + output_msg_.mutable_records()->append( + scan_cache_.records_.data() + rec_offset, + scan_cache_.records_.data() + scan_cache_.records_.size()); } + else + { + output_msg_.mutable_records()->append(scan_cache_.records_); + } + + output_msg_.mutable_key_ts()->append( + (const char *) scan_cache_.key_ts_.data(), + 
scan_cache_.key_ts_.size() * sizeof(uint64_t)); + output_msg_.mutable_gap_ts()->append( + (const char *) scan_cache_.gap_ts_.data(), + scan_cache_.gap_ts_.size() * sizeof(uint64_t)); + output_msg_.mutable_term()->append( + (const char *) scan_cache_.term_.data(), + scan_cache_.term_.size() * sizeof(uint64_t)); + output_msg_.mutable_cce_lock_ptr()->append( + (const char *) scan_cache_.cce_lock_ptr_.data(), + scan_cache_.cce_lock_ptr_.size() * sizeof(uint64_t)); + output_msg_.mutable_rec_status()->append( + (const char *) scan_cache_.rec_status_.data(), + scan_cache_.rec_status_.size() * sizeof(RecordStatusType)); } const ScanSliceRequest &req = input_msg_->scan_slice_req(); + uint32_t range_id = req.range_id(); + uint32_t core_id = + (range_id & 0x3FF) % Sharder::Instance().GetLocalCcShardsCount(); + output_msg_.set_core_id(core_id); hd_->SendScanRespToNode(req.src_node_id(), output_msg_, false); hd_->RecycleCcMsg(std::move(input_msg_)); }; } void txservice::remote::RemoteScanSlice::Reset( - std::unique_ptr input_msg, uint16_t core_cnt) + std::unique_ptr input_msg) { assert(input_msg->has_scan_slice_req()); @@ -1508,30 +1483,13 @@ void txservice::remote::RemoteScanSlice::Reset( output_msg_.set_tx_term(input_msg->tx_term()); output_msg_.set_command_id(input_msg->command_id()); - SetShardCount(core_cnt); - - size_t vec_size = scan_slice_req.prior_cce_lock_vec_size(); - for (size_t core_id = 0; core_id < core_cnt; ++core_id) - { - uint64_t cce_lock_addr = - core_id < vec_size ? 
scan_slice_req.prior_cce_lock_vec(core_id) : 0; - SetPriorCceLockAddr(cce_lock_addr, core_id); - } + uint64_t cce_lock_addr = scan_slice_req.prior_cce_lock(); + SetPriorCceLockAddr(cce_lock_addr); RangeScanSliceResult &slice_result = cc_res_.Value(); - for (uint16_t core_id = 0; core_id < core_cnt; ++core_id) - { - if (core_id == scan_cache_vec_.size()) - { - scan_cache_vec_.emplace_back(core_cnt); - } - else - { - scan_cache_vec_[core_id].Reset(core_cnt); - } - } - slice_result.remote_scan_caches_ = &scan_cache_vec_; + scan_cache_.Reset(); + slice_result.remote_scan_caches_ = &scan_cache_; input_msg_ = std::move(input_msg); diff --git a/tx_service/src/sk_generator.cpp b/tx_service/src/sk_generator.cpp index e3fc928e..01ff589e 100644 --- a/tx_service/src/sk_generator.cpp +++ b/tx_service/src/sk_generator.cpp @@ -324,7 +324,6 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, scan_ts_, node_group_id_, ng_term, - core_cnt, scan_batch_size_, tx_number, start_key, @@ -336,12 +335,7 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, CcErrorCode scan_res = CcErrorCode::NO_ERROR; bool scan_data_drained = false; bool scan_pk_finished = false; - std::vector last_finished_pos; - last_finished_pos.reserve(core_cnt); - for (size_t i = 0; i < core_cnt; ++i) - { - last_finished_pos.emplace_back(start_key->Clone()); - } + TxKey last_finished_pos = start_key->Clone(); TxKey target_key; const TxRecord *target_rec = nullptr; @@ -355,11 +349,8 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, { batch_tuples = 0; - uint32_t core_rand = butil::fast_rand(); - // The scan slice request is dispatched to the first core. The first - // core tries to pin the slice if necessary and if succeeds, further - // dispatches the request to remaining cores for parallel scans. 
- cc_shards->EnqueueToCcShard(core_rand % core_cnt, &scan_req); + uint16_t dest_core = (partition_id_ & 0x3FF) % cc_shards->Count(); + cc_shards->EnqueueToCcShard(dest_core, &scan_req); scan_req.Wait(); if (scan_req.IsError()) @@ -381,17 +372,14 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, { std::this_thread::sleep_for(std::chrono::seconds(30)); // Reset the paused key. - for (size_t i = 0; i < core_cnt; ++i) + const TxKey &paused_key = scan_req.PausePos().first; + if (!scan_req.IsDrained()) { - const TxKey &paused_key = scan_req.PausePos(i).first; - if (!scan_req.IsDrained(i)) - { - // Should use one copy of the key, instead of move the - // ownership of the key, because this round of scan may - // failed again. - assert(paused_key.IsOwner()); - paused_key.Copy(last_finished_pos[i]); - } + // Should use one copy of the key, instead of move the + // ownership of the key, because this round of scan may + // failed again. + assert(paused_key.IsOwner()); + paused_key.Copy(last_finished_pos); } scan_req.Reset(); scan_pk_finished = false; @@ -431,71 +419,63 @@ void SkGenerator::ScanAndEncodeIndex(const TxKey *start_key, } sk_encoder = sk_encoder_vec_[vec_idx].get(); - for (size_t core_idx = 0; core_idx < core_cnt; ++core_idx) + for (size_t key_idx = 0; key_idx < scan_req.accumulated_scan_cnt_; + ++key_idx) { - for (size_t key_idx = 0; - key_idx < scan_req.accumulated_scan_cnt_.at(core_idx); - ++key_idx) + auto &tuple = scan_req.DataSyncVec().at(key_idx); + target_key = tuple.Key(); + target_rec = tuple.Payload(); + version_ts = tuple.commit_ts_; + if (tuple.payload_status_ == RecordStatus::Deleted) { - auto &tuple = scan_req.DataSyncVec(core_idx).at(key_idx); - target_key = tuple.Key(); - target_rec = tuple.Payload(); - version_ts = tuple.commit_ts_; - if (tuple.payload_status_ == RecordStatus::Deleted) - { - // Skip the deleted record. - continue; - } - assert(target_key.KeyPtr() != nullptr && - target_rec != nullptr); + // Skip the deleted record. 
+ continue; + } + assert(target_key.KeyPtr() != nullptr && target_rec != nullptr); - int32_t appended_sk_size = sk_encoder->AppendPackedSk( - &target_key, target_rec, version_ts, index_set); - if (appended_sk_size < 0) - { - LOG(ERROR) - << "ScanAndEncodeIndex: Failed to encode " - << "key for index: " << tbl_name_it->StringView() - << "of ng#" << node_group_id_; - // Finish the pack sk operation - task_result_ = CcErrorCode::PACK_SK_ERR; - pack_sk_err_ = std::move(sk_encoder->GetError()); - return; - } - } /* End of each key */ + int32_t appended_sk_size = sk_encoder->AppendPackedSk( + &target_key, target_rec, version_ts, index_set); + if (appended_sk_size < 0) + { + LOG(ERROR) << "ScanAndEncodeIndex: Failed to encode " + << "key for index: " << tbl_name_it->StringView() + << "of ng#" << node_group_id_; + // Finish the pack sk operation + task_result_ = CcErrorCode::PACK_SK_ERR; + pack_sk_err_ = std::move(sk_encoder->GetError()); + return; + } + } /* End of each key */ - if (tbl_name_it == new_indexes_name_->cbegin()) + if (tbl_name_it == new_indexes_name_->cbegin()) + { + batch_tuples += scan_req.accumulated_scan_cnt_; + if (batch_tuples % 10240 == 0 && + !task_status_->CheckTxTermStatus()) + { + LOG(WARNING) + << "ScanAndEncodeIndex: Terminate this task cause " + << "the tx leader transferred of ng#" << node_group_id_; + task_status_->TerminateGenerateSk(); + task_result_ = CcErrorCode::TX_NODE_NOT_LEADER; + return; + } + // Update the last finished key. 
+ auto &paused_key = scan_req.PausePos().first; + if (!scan_req.IsDrained()) { - batch_tuples += scan_req.accumulated_scan_cnt_.at(core_idx); - if (batch_tuples % 10240 == 0 && - !task_status_->CheckTxTermStatus()) + if (last_finished_pos.IsOwner()) { - LOG(WARNING) - << "ScanAndEncodeIndex: Terminate this task cause " - << "the tx leader transferred of ng#" - << node_group_id_; - task_status_->TerminateGenerateSk(); - task_result_ = CcErrorCode::TX_NODE_NOT_LEADER; - return; + last_finished_pos.Copy(paused_key); } - // Update the last finished key. - auto &paused_key = scan_req.PausePos(core_idx).first; - if (!scan_req.IsDrained(core_idx)) + else { - if (last_finished_pos[core_idx].IsOwner()) - { - last_finished_pos[core_idx].Copy(paused_key); - } - else - { - last_finished_pos[core_idx] = paused_key.Clone(); - } + last_finished_pos = paused_key.Clone(); } - // If the data is drained - scan_data_drained = - scan_req.IsDrained(core_idx) && scan_data_drained; } - } /* End of each core */ + // If the data is drained + scan_data_drained = scan_req.IsDrained(); + } } /* End of foreach new_indexes_name */ scan_pk_finished = scan_data_drained; @@ -680,37 +660,41 @@ CcErrorCode UploadIndexContext::UploadIndexInternal( size_t finished_upload_count = 0; CcErrorCode upload_res_code = CcErrorCode::NO_ERROR; size_t upload_req_count = 0; + for (auto &[table_name, ng_entries] : ng_index_set) { - for (auto &[ng_id, entry_vec] : ng_entries) + for (auto &[ng_id, range_entries] : ng_entries) { - entry_vec_size = entry_vec.size(); - batch_req_cnt = (entry_vec_size / upload_batch_size_ + - (entry_vec_size % upload_batch_size_ ? 1 : 0)); - int64_t &expected_term = leader_terms_.at(ng_id); - size_t start_idx = 0; - size_t end_idx = - (batch_req_cnt > 1 ? 
upload_batch_size_ : entry_vec_size); - for (size_t idx = 0; idx < batch_req_cnt; ++idx) + for (auto &[range_id, entry_vec] : range_entries) { - SendIndexes(table_name, - ng_id, - expected_term, - entry_vec, - (end_idx - start_idx), - start_idx, - req_mux, - req_cv, - finished_upload_count, - upload_res_code); - ++upload_req_count; - // Next batch - start_idx = end_idx; - end_idx = ((start_idx + upload_batch_size_) > entry_vec_size - ? entry_vec_size - : (start_idx + upload_batch_size_)); + entry_vec_size = entry_vec.size(); + batch_req_cnt = (entry_vec_size / upload_batch_size_ + + (entry_vec_size % upload_batch_size_ ? 1 : 0)); + + size_t start_idx = 0; + size_t end_idx = + (batch_req_cnt > 1 ? upload_batch_size_ : entry_vec_size); + for (size_t idx = 0; idx < batch_req_cnt; ++idx) + { + SendIndexes(table_name, + ng_id, + expected_term, + range_id, + entry_vec, + (end_idx - start_idx), + start_idx, + req_mux, + req_cv, + finished_upload_count, + upload_res_code); + ++upload_req_count; + start_idx = end_idx; + end_idx = ((start_idx + upload_batch_size_) > entry_vec_size + ? 
entry_vec_size + : (start_idx + upload_batch_size_)); + } } } } @@ -730,7 +714,8 @@ void UploadIndexContext::SendIndexes( const TableName &table_name, NodeGroupId dest_ng_id, int64_t &ng_term, - const std::vector &write_entry_vec, + int32_t partition_id, + const std::vector> &write_entry_vec, size_t batch_size, size_t start_key_idx, bthread::Mutex &req_mux, @@ -740,14 +725,13 @@ void UploadIndexContext::SendIndexes( { uint32_t dest_node_id = Sharder::Instance().LeaderNodeId(dest_ng_id); LocalCcShards *cc_shards = Sharder::Instance().GetLocalCcShards(); - size_t core_cnt = cc_shards->Count(); if (dest_node_id == cc_shards->NodeId()) { UploadBatchCc *req_ptr = NextRequest(); req_ptr->Reset(table_name, dest_ng_id, ng_term, - core_cnt, + partition_id, batch_size, start_key_idx, write_entry_vec, @@ -757,10 +741,9 @@ void UploadIndexContext::SendIndexes( res_code, UploadBatchType::SkIndexData); - for (size_t core = 0; core < core_cnt; ++core) - { - cc_shards->EnqueueToCcShard(core, req_ptr); - } + uint16_t dest_core = + static_cast((partition_id & 0x3FF) % cc_shards->Count()); + cc_shards->EnqueueToCcShard(dest_core, req_ptr); } else { @@ -834,6 +817,7 @@ void UploadIndexContext::SendIndexes( remote::ToRemoteType::ConvertTableType(table_name.Type())); req_ptr->set_table_engine( remote::ToRemoteType::ConvertTableEngine(table_name.Engine())); + req_ptr->set_partition_id(partition_id); size_t end_key_idx = start_key_idx + batch_size; req_ptr->set_kind(remote::UploadBatchKind::SK_DATA); req_ptr->set_batch_size(batch_size); @@ -853,15 +837,24 @@ void UploadIndexContext::SendIndexes( std::string *rec_status_str = req_ptr->mutable_rec_status(); // All generated sk should be normal status. 
const RecordStatus rec_status = RecordStatus::Normal; + // range_size_flags + req_ptr->clear_range_size_flags(); + std::string *range_size_flags_str = req_ptr->mutable_range_size_flags(); + for (size_t idx = start_key_idx; idx < end_key_idx; ++idx) { - write_entry_vec.at(idx)->key_.Serialize(*keys_str); - write_entry_vec.at(idx)->rec_->Serialize(*recs_str); - val_ptr = reinterpret_cast( - &(write_entry_vec.at(idx)->commit_ts_)); + uint8_t range_size_flags = write_entry_vec.at(idx).first; + WriteEntry *write_entry = write_entry_vec.at(idx).second; + write_entry->key_.Serialize(*keys_str); + write_entry->rec_->Serialize(*recs_str); + val_ptr = + reinterpret_cast(&(write_entry->commit_ts_)); commit_ts_str->append(val_ptr, len_sizeof); rec_status_str->append(reinterpret_cast(&rec_status), sizeof(rec_status)); + range_size_flags_str->append( + reinterpret_cast(&range_size_flags), + sizeof(range_size_flags)); } brpc::Controller *cntl_ptr = upload_batch_closure->Controller(); @@ -989,17 +982,24 @@ void UploadIndexContext::AdvanceWriteEntryForRangeInfo( size_t new_range_idx = 0; auto *range_info = range_record.GetRangeInfo(); + const int32_t range_id = range_info->PartitionId(); + const uint8_t default_flags = + 0x10 | static_cast(range_info->IsDirty()); while (cur_write_entry_it != next_range_start) { WriteEntry &write_entry = *cur_write_entry_it; - auto ng_it = ng_write_entrys.try_emplace(range_ng); - ng_it.first->second.push_back(&write_entry); + auto &range_vec = ng_write_entrys[range_ng][range_id]; + range_vec.emplace_back(default_flags, &write_entry); + uint8_t *old_range_flags_ptr = &range_vec.back().first; + + uint8_t *new_bucket_flags_ptr = nullptr; // If current range is migrating, forward to new range owner. 
if (new_bucket_ng != UINT32_MAX) { - ng_write_entrys.try_emplace(new_bucket_ng) - .first->second.push_back(&write_entry); + auto &new_bucket_vec = ng_write_entrys[new_bucket_ng][range_id]; + new_bucket_vec.emplace_back(default_flags, &write_entry); + new_bucket_flags_ptr = &new_bucket_vec.back().first; } // If range is splitting and the key will fall on a new range after @@ -1016,18 +1016,25 @@ void UploadIndexContext::AdvanceWriteEntryForRangeInfo( } if (new_range_ng != UINT32_MAX) { - if (new_range_ng != range_ng) - { - ng_write_entrys.try_emplace(new_range_ng) - .first->second.push_back(&write_entry); - } + const int32_t new_range_id = + range_info->NewPartitionId()->at(new_range_idx - 1); + + ng_write_entrys[new_range_ng][new_range_id].emplace_back( + default_flags, &write_entry); + // Only update range size on the new range + *old_range_flags_ptr &= 0x0F; + // If the new range is migrating, forward to the new owner of new // range. if (new_range_new_bucket_ng != UINT32_MAX && new_range_new_bucket_ng != range_ng) { - ng_write_entrys.try_emplace(new_range_new_bucket_ng) - .first->second.push_back(&write_entry); + ng_write_entrys[new_range_new_bucket_ng][new_range_id] + .emplace_back(default_flags, &write_entry); + if (new_bucket_flags_ptr) + { + *new_bucket_flags_ptr &= 0x0F; + } } } diff --git a/tx_service/src/tx_execution.cpp b/tx_service/src/tx_execution.cpp index 6e80dbba..f46fb46a 100644 --- a/tx_service/src/tx_execution.cpp +++ b/tx_service/src/tx_execution.cpp @@ -1963,13 +1963,14 @@ void TransactionExecution::Process(ReadOperation &read) // error to the tx read request. assert(!lock_range_bucket_result_.IsError()); - // Uses the lower 10 bits of the key's hash code to shard - // the key across CPU cores in a cc node. - uint32_t residual = key.Hash() & 0x3FF; + // Uses the partition id to shard the key across CPU cores + // in a cc node. 
+ partition_id = range_rec_.GetRangeInfo()->PartitionId(); + uint32_t residual = + static_cast((partition_id & 0x3FF)); NodeGroupId range_ng = range_rec_.GetRangeOwnerNg()->BucketOwner(); key_shard_code = range_ng << 10 | residual; - partition_id = range_rec_.GetRangeInfo()->PartitionId(); } } else @@ -4611,12 +4612,17 @@ bool TransactionExecution::FillDataLogRequest(WriteToLogOp &write_log) // ngs, write log for both ngs. uint32_t forward_ng_id = Sharder::Instance().ShardToCcNodeGroup(forward_shard_code); - auto table_rec_it = ng_table_set.try_emplace(forward_ng_id); + auto [table_rec_it, inserted] = + ng_table_set.try_emplace(forward_ng_id); + if (!inserted) + { + continue; + } std::unordered_map< TableName, std::vector< std::pair>> - &table_rec_set = table_rec_it.first->second.second; + &table_rec_set = table_rec_it->second.second; auto rec_vec_it = table_rec_set.emplace( std::piecewise_construct, @@ -5288,6 +5294,7 @@ void TransactionExecution::Process(PostProcessOp &post_process) { for (const auto &[key, write_entry] : pair.second) { + bool on_dirty_range = write_entry.on_dirty_range_; CcReqStatus ret = cc_handler_->PostWrite(tx_number, tx_term_, @@ -5297,10 +5304,12 @@ void TransactionExecution::Process(PostProcessOp &post_process) write_entry.rec_.get(), write_entry.op_, write_entry.key_shard_code_, - post_process.hd_result_); + post_process.hd_result_, + write_entry.partition_id_, + on_dirty_range); update_post_cnt(ret); - for (auto &[forward_shard_code, cce_addr] : + for (auto &[forward_shard_code, forward_pair] : write_entry.forward_addr_) { CcReqStatus ret = @@ -5308,11 +5317,13 @@ void TransactionExecution::Process(PostProcessOp &post_process) tx_term_, command_id, commit_ts_, - cce_addr, + forward_pair.second, write_entry.rec_.get(), write_entry.op_, forward_shard_code, - post_process.hd_result_); + post_process.hd_result_, + forward_pair.first, + on_dirty_range); update_post_cnt(ret); } } @@ -5394,9 +5405,10 @@ void 
TransactionExecution::Process(PostProcessOp &post_process) // Keys that were not successfully locked in the cc // map do not need post-processing. - for (const auto &[forward_shard_code, cce_addr] : + for (const auto &[forward_shard_code, forward_pair] : write_entry.forward_addr_) { + const CcEntryAddr &cce_addr = forward_pair.second; if (cce_addr.Term() >= 0) { assert(!cce_addr.Empty()); @@ -7763,17 +7775,19 @@ void TransactionExecution::Process(BatchReadOperation &batch_read_op) TxRecord &rec = *read_batch[idx].record_; uint32_t sharding_code = 0; - size_t key_hash = key.Hash(); - sharding_code = - read_batch[idx].cce_addr_.NodeGroupId() << 10 | (key_hash & 0x3FF); int32_t partition_id = -1; if (table_name.IsHashPartitioned()) { + size_t key_hash = key.Hash(); + sharding_code = read_batch[idx].cce_addr_.NodeGroupId() << 10 | + (key_hash & 0x3FF); partition_id = Sharder::MapKeyHashToHashPartitionId(key_hash); } else { partition_id = batch_read_op.range_ids_[idx]; + sharding_code = read_batch[idx].cce_addr_.NodeGroupId() << 10 | + (partition_id & 0x3FF); } cc_handler_->Read( table_name, diff --git a/tx_service/src/tx_operation.cpp b/tx_service/src/tx_operation.cpp index 926ff090..275309ae 100644 --- a/tx_service/src/tx_operation.cpp +++ b/tx_service/src/tx_operation.cpp @@ -464,19 +464,20 @@ void AcquireWriteOperation::AggregateAcquiredKeys(TransactionExecution *txm) } } - for (auto &[forward_shard_code, cce_addr] : write_entry->forward_addr_) + for (auto &[forward_shard_code, forward_pair] : + write_entry->forward_addr_) { AcquireKeyResult &acquire_key_res = acquire_key_vec[res_idx++]; CcEntryAddr &addr = acquire_key_res.cce_addr_; term = addr.Term(); if (term < 0) { - cce_addr.SetCceLock(0, -1, 0); + forward_pair.second.SetCceLock(0, -1, 0); } else if (acquire_key_res.commit_ts_ == 0) { // acqurie write failed on forward addr. 
- cce_addr.SetCceLock(0, -1, 0); + forward_pair.second.SetCceLock(0, -1, 0); // Set term to -1 so that post write will not be sent to this // addr. addr.SetTerm(-1); @@ -485,7 +486,7 @@ void AcquireWriteOperation::AggregateAcquiredKeys(TransactionExecution *txm) { // Assigns to the write entry the cc entry address obtained // in the acquire phase. - cce_addr = addr; + forward_pair.second = addr; } // No need to dedup forwarded req since they are not visible to read @@ -720,17 +721,23 @@ void LockWriteRangeBucketsOp::Advance(TransactionExecution *txm) size_t new_range_idx = 0; auto *range_info = txm->range_rec_.GetRangeInfo(); + int32_t range_id = range_info->PartitionId(); + uint32_t residual = static_cast(range_id & 0x3FF); + bool on_dirty_range = range_info->IsDirty(); while (write_key_it_ != next_range_start) { const TxKey &write_tx_key = write_key_it_->first; WriteSetEntry &write_entry = write_key_it_->second; - size_t hash = write_tx_key.Hash(); - write_entry.key_shard_code_ = (range_ng << 10) | (hash & 0x3FF); + write_entry.key_shard_code_ = (range_ng << 10) | residual; + write_entry.partition_id_ = range_id; + write_entry.on_dirty_range_ = on_dirty_range; // If current range is migrating, forward to new range owner. 
if (new_bucket_ng != UINT32_MAX) { - write_entry.forward_addr_.try_emplace((new_bucket_ng << 10) | - (hash & 0x3FF)); + assert(new_bucket_ng != range_ng); + write_entry.forward_addr_.try_emplace( + ((new_bucket_ng << 10) | residual), + std::make_pair(range_id, CcEntryAddr())); } // If range is splitting and the key will fall on a new range after @@ -748,18 +755,47 @@ void LockWriteRangeBucketsOp::Advance(TransactionExecution *txm) } if (new_range_ng != UINT32_MAX) { - if (new_range_ng != range_ng) + int32_t new_range_id = + range_info->NewPartitionId()->at(new_range_idx - 1); + uint32_t new_residual = + static_cast(new_range_id & 0x3FF); + uint16_t core_cnt = + Sharder::Instance().GetLocalCcShards()->Count(); + uint16_t new_range_shard = + static_cast(new_residual % core_cnt); + uint16_t range_shard = + static_cast(residual % core_cnt); + if (new_range_ng != range_ng || new_range_shard != range_shard) + { + write_entry.forward_addr_.try_emplace( + ((new_range_ng << 10) | new_residual), + std::make_pair(new_range_id, CcEntryAddr())); + // There is no need to update the range size of the old + // range. + write_entry.partition_id_ = -1; + } + else if (new_range_ng == range_ng && + new_range_shard == range_shard) { - write_entry.forward_addr_.try_emplace((new_range_ng << 10) | - (hash & 0x3FF)); + // Only update the range size on the new range id in case of + // the new range and the old range are located on the same + // shard. + write_entry.partition_id_ = new_range_id; } + // If the new range is migrating, forward to the new owner of // new range. - if (new_range_new_bucket_ng != UINT32_MAX && - new_range_new_bucket_ng != range_ng) + // TODO(ysw): double check the logic here. 
+ if (new_range_new_bucket_ng != UINT32_MAX) { - write_entry.forward_addr_.try_emplace( - (new_range_new_bucket_ng << 10) | (hash & 0x3FF)); + assert(new_range_new_bucket_ng != new_range_ng); + if (new_range_new_bucket_ng != range_ng || + new_range_shard != range_shard) + { + write_entry.forward_addr_.try_emplace( + ((new_range_new_bucket_ng << 10) | new_residual), + std::make_pair(new_range_id, CcEntryAddr())); + } } } @@ -4605,14 +4641,21 @@ void SplitFlushRangeOp::Forward(TransactionExecution *txm) int64_t tx_term = txm->TxTerm(); LocalCcShards *local_shards = Sharder::Instance().GetLocalCcShards(); - // The new ranges that still lands to the same ng after split. + // The new ranges that still lands to the same core of same ng + // after split. std::vector> ranges; ranges.reserve(new_ranges.size()); + uint16_t range_shard_id = + static_cast((range_info_->PartitionId() & 0x3FF) % + local_shards->Count()); for (auto iter = new_ranges.begin(); iter != new_ranges.end(); ++iter) { + uint16_t new_range_shard_id = static_cast( + (iter->second & 0x3FF) % local_shards->Count()); if (local_shards->GetRangeOwner(iter->second, node_group) - ->BucketOwner() == node_group) + ->BucketOwner() == node_group && + (new_range_shard_id == range_shard_id)) { const TxKey *start_key = &(iter->first); const TxKey *end_key = @@ -5132,8 +5175,13 @@ bool SplitFlushRangeOp::ForwardKickoutIterator(TransactionExecution *txm) NodeGroupId new_owner = new_range_bucket_info->BucketOwner(); NodeGroupId dirty_new_owner = new_range_bucket_info->DirtyBucketOwner(); - if (new_owner != txm->TxCcNodeId() && - dirty_new_owner != txm->TxCcNodeId()) + uint16_t range_shard_id = static_cast( + (range_info_->PartitionId() & 0x3FF) % local_shards->Count()); + uint16_t new_range_shard_id = static_cast( + (kickout_data_it_->second & 0x3FF) % local_shards->Count()); + if ((new_owner != txm->TxCcNodeId() && + dirty_new_owner != txm->TxCcNodeId()) || + (range_shard_id != new_range_shard_id)) { // Note that even if 
the new node group falls on the same node, // we still need to clean the cc entry from native ccmap since @@ -5152,11 +5200,14 @@ bool SplitFlushRangeOp::ForwardKickoutIterator(TransactionExecution *txm) } kickout_old_range_data_op_.clean_type_ = CleanType::CleanRangeData; + kickout_old_range_data_op_.range_id_ = + range_info_->PartitionId(); kickout_old_range_data_op_.node_group_ = txm->TxCcNodeId(); LOG(INFO) << "Split Flush transaction kickout old data in range " << kickout_data_it_->second << ", original range id " << range_info_->PartitionId() + << ", new range id: " << kickout_data_it_->second << ", txn: " << txm->TxNumber(); kickout_data_it_++; return false;