diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp index dfbf6ed..97e8c61 100644 --- a/data_store_service_client.cpp +++ b/data_store_service_client.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,6 @@ DataStoreServiceClient::~DataStoreServiceClient() void DataStoreServiceClient::SetupConfig( const DataStoreServiceClusterManager &cluster_manager) { - assert(cluster_manager.GetShardCount() == 1); auto current_version = dss_topology_version_.load(std::memory_order_acquire); auto new_version = cluster_manager.GetTopologyVersion(); @@ -149,62 +149,63 @@ void DataStoreServiceClient::SetupConfig( } void DataStoreServiceClient::TxConfigsToDssClusterConfig( - uint32_t dss_node_id, // = 0, - uint32_t ng_id, // = 0, + uint32_t node_id, const std::unordered_map> &ng_configs, - uint32_t dss_leader_node_id, // if no leader,set uint32t_max + const std::unordered_map &ng_leaders, DataStoreServiceClusterManager &cluster_manager) { - assert(ng_configs.size() == 1); - auto it = ng_configs.find(ng_id); - assert(it != ng_configs.end()); - auto &ng_member_configs = it->second; - - const txservice::NodeConfig *this_node = nullptr; - const txservice::NodeConfig *leader_node = nullptr; - for (auto &node_config : ng_member_configs) + std::unordered_map nodes_map; + for (auto &[ng_id, ng_members] : ng_configs) { - if (node_config.node_id_ == dss_node_id) + for (auto &node_config : ng_members) { - this_node = &node_config; - } - if (node_config.node_id_ == dss_leader_node_id) - { - leader_node = &node_config; + nodes_map.try_emplace(node_config.node_id_, + node_config.host_name_, + TxPort2DssPort(node_config.port_)); } } - assert(this_node != nullptr); - assert(dss_leader_node_id == UNKNOWN_DSS_LEADER_NODE_ID || - leader_node != nullptr); - cluster_manager.Initialize(this_node->host_name_, - TxPort2DssPort(this_node->port_)); - std::vector shard_nodes; - for (auto &node_config : ng_member_configs) + for (auto &[ng_id, ng_members] : ng_configs) { - if (node_config.node_id_ != dss_node_id) + // add nodes + bool contain_this_node = false; + for (auto &node_config : ng_members) { - DSSNode dss_node(node_config.host_name_, - TxPort2DssPort(node_config.port_)); - cluster_manager.AddShardMember(ng_id, dss_node); + if (node_config.is_candidate_) + { + if (node_config.node_id_ == node_id) + { + contain_this_node = true; + } + cluster_manager.AddShardMember( + ng_id, nodes_map.at(node_config.node_id_)); + } + } + // set primary node + if (ng_leaders.find(ng_id) != ng_leaders.end()) + { + uint32_t leader_id = ng_leaders.at(ng_id); + assert(nodes_map.find(leader_id) != nodes_map.end()); + if (nodes_map.find(leader_id) != nodes_map.end()) + { + cluster_manager.UpdatePrimaryNode(ng_id, + nodes_map.at(leader_id)); + } + if (leader_id == node_id) + { + contain_this_node = true; + cluster_manager.SwitchShardToReadWrite(ng_id, + DSShardStatus::Closed); + } } - } - if (dss_leader_node_id != dss_node_id) - { - LOG(INFO) << "cluster_manager change shard status " << ng_id << " from " - << static_cast( - cluster_manager.FetchDSShardStatus(ng_id)); - cluster_manager.SwitchShardToClosed(ng_id, DSShardStatus::ReadWrite); - LOG(INFO) << "cluster_manager change shard status " << ng_id << " to " - << static_cast( - cluster_manager.FetchDSShardStatus(ng_id)); - if (dss_leader_node_id != UNKNOWN_DSS_LEADER_NODE_ID) + // set this node + if (contain_this_node && nodes_map.find(node_id) != nodes_map.end()) { - DSSNode dss_node(leader_node->host_name_, - TxPort2DssPort(leader_node->port_)); - cluster_manager.UpdatePrimaryNode(ng_id, dss_node); + auto &this_node_config = nodes_map.at(node_id); + cluster_manager.SetThisNode(this_node_config.host_name_, + this_node_config.port_); } } } @@ -220,6 +221,10 @@ void DataStoreServiceClient::TxConfigsToDssClusterConfig( */ bool DataStoreServiceClient::Connect() { + if (!need_bootstrap_) + { + return true; + } bool succeed = false; for (int retry = 1; retry <= 5 && !succeed; retry++) { @@ -245,8 +250,7 @@ bool DataStoreServiceClient::Connect() */ void DataStoreServiceClient::ScheduleTimerTasks() { - LOG(ERROR) << "ScheduleTimerTasks not implemented"; - assert(false); + LOG(WARNING) << "ScheduleTimerTasks not implemented (noop)"; } /** @@ -328,10 +332,10 @@ bool DataStoreServiceClient::PutAll( { // All records in the batch are in the same partition for range // table - uint32_t parition_id = + int32_t partition_id = KvPartitionIdOf(batch[0].partition_id_, true); auto [it, inserted] = - range_partitions_map.try_emplace(parition_id); + range_partitions_map.try_emplace(partition_id); it->second.emplace_back(flush_task_entry_idx); } flush_task_entry_idx++; @@ -342,7 +346,7 @@ bool DataStoreServiceClient::PutAll( PoolableGuard sync_putall_guard(sync_putall); sync_putall->Reset(); - uint16_t parts_cnt_per_key = table_name.IsHashPartitioned() ? 2 : 1; + uint16_t parts_cnt_per_key = 1; uint16_t parts_cnt_per_record = 5; if (table_name.IsHashPartitioned() && table_name.IsObjectTable()) { @@ -397,6 +401,7 @@ bool DataStoreServiceClient::PutAll( // Set up global coordinator sync_putall->total_partitions_ = sync_putall->partition_states_.size(); + bool is_range_partitioned = !table_name.IsHashPartitioned(); // Start concurrent processing for each partition for (size_t i = 0; i < callback_data_list.size(); ++i) { @@ -407,18 +412,21 @@ bool DataStoreServiceClient::PutAll( auto &first_batch = callback_data->inflight_batch; if (partition_state->GetNextBatch(first_batch)) { - BatchWriteRecords(callback_data->table_name, - partition_state->partition_id, - std::move(first_batch.key_parts), - std::move(first_batch.record_parts), - std::move(first_batch.records_ts), - std::move(first_batch.records_ttl), - std::move(first_batch.op_types), - true, // skip_wal - callback_data, - PartitionBatchCallback, - first_batch.parts_cnt_per_key, - first_batch.parts_cnt_per_record); + BatchWriteRecords( + callback_data->table_name, + partition_state->partition_id, + GetShardIdByPartitionId(partition_state->partition_id, + is_range_partitioned), + std::move(first_batch.key_parts), + std::move(first_batch.record_parts), + std::move(first_batch.records_ts), + std::move(first_batch.records_ttl), + std::move(first_batch.op_types), + true, // skip_wal + callback_data, + PartitionBatchCallback, + first_batch.parts_cnt_per_key, + first_batch.parts_cnt_per_record); } else { @@ -585,10 +593,12 @@ void DataStoreServiceClient::FetchTableCatalog( txservice::FetchCatalogCc *fetch_cc) { int32_t kv_partition_id = 0; + uint32_t shard_id = GetShardIdByPartitionId(kv_partition_id, false); + std::string_view key = fetch_cc->CatalogName().StringView(); Read(kv_table_catalogs_name, kv_partition_id, - "", + shard_id, key, fetch_cc, &FetchTableCatalogCallback); @@ -612,11 +622,13 @@ void DataStoreServiceClient::FetchCurrentTableStatistics( { std::string_view sv = ccm_table_name.StringView(); fetch_cc->kv_partition_id_ = KvPartitionIdOf(ccm_table_name); + uint32_t shard_id = + GetShardIdByPartitionId(fetch_cc->kv_partition_id_, false); fetch_cc->SetStoreHandler(this); Read(kv_table_statistics_version_name, fetch_cc->kv_partition_id_, - "", + shard_id, sv, fetch_cc, &FetchCurrentTableStatsCallback); @@ -652,11 +664,14 @@ void DataStoreServiceClient::FetchTableStatistics( fetch_cc->kv_end_key_.back()++; fetch_cc->kv_partition_id_ = KvPartitionIdOf(ccm_table_name); + uint32_t data_shard_id = + GetShardIdByPartitionId(fetch_cc->kv_partition_id_, false); // NOTICE: here batch_size is 1, because the size of item in // {kv_table_statistics_name} may be more than MAX_WRITE_BATCH_SIZE. ScanNext(kv_table_statistics_name, fetch_cc->kv_partition_id_, + data_shard_id, fetch_cc->kv_start_key_, fetch_cc->kv_end_key_, fetch_cc->kv_session_id_, @@ -800,6 +815,7 @@ bool DataStoreServiceClient::UpsertTableStatistics( // 2- write the segments to storage int32_t kv_partition_id = KvPartitionIdOf(ccm_table_name); + uint32_t data_shard_id = GetShardIdByPartitionId(kv_partition_id, false); std::vector keys; std::vector records; std::vector records_ts; @@ -823,6 +839,7 @@ bool DataStoreServiceClient::UpsertTableStatistics( callback_data->Reset(); BatchWriteRecords(kv_table_statistics_name, kv_partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -852,6 +869,7 @@ bool DataStoreServiceClient::UpsertTableStatistics( op_types.emplace_back(WriteOpType::PUT); BatchWriteRecords(kv_table_statistics_version_name, kv_partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -885,6 +903,7 @@ bool DataStoreServiceClient::UpsertTableStatistics( callback_data->Reset(); DeleteRange(kv_table_statistics_name, kv_partition_id, + data_shard_id, start_key, end_key, true, @@ -916,6 +935,8 @@ void DataStoreServiceClient::FetchTableRanges( txservice::FetchTableRangesCc *fetch_cc) { fetch_cc->kv_partition_id_ = KvPartitionIdOf(fetch_cc->table_name_); + uint32_t data_shard_id = + GetShardIdByPartitionId(fetch_cc->kv_partition_id_, false); fetch_cc->kv_start_key_ = fetch_cc->table_name_.String(); fetch_cc->kv_end_key_ = fetch_cc->table_name_.String(); @@ -924,6 +945,7 @@ void DataStoreServiceClient::FetchTableRanges( ScanNext(kv_range_table_name, fetch_cc->kv_partition_id_, + data_shard_id, fetch_cc->kv_start_key_, fetch_cc->kv_end_key_, fetch_cc->kv_session_id_, @@ -961,6 +983,8 @@ void DataStoreServiceClient::FetchRangeSlices( return; } fetch_cc->kv_partition_id_ = KvPartitionIdOf(fetch_cc->table_name_); + uint32_t shard_id = + GetShardIdByPartitionId(fetch_cc->kv_partition_id_, false); // Also use segment_cnt to identify the step is fetch range or fetch slices. fetch_cc->SetSegmentCnt(0); @@ -974,7 +998,7 @@ void DataStoreServiceClient::FetchRangeSlices( Read(kv_range_table_name, fetch_cc->kv_partition_id_, - "", + shard_id, fetch_cc->kv_start_key_, fetch_cc, &FetchRangeSlicesCallback); @@ -1023,8 +1047,11 @@ bool DataStoreServiceClient::DeleteOutOfRangeData( SyncCallbackData *callback_data = sync_callback_data_pool_.NextObject(); PoolableGuard guard(callback_data); callback_data->Reset(); + int32_t kv_part_id = KvPartitionIdOf(partition_id, true); + uint32_t shard_id = GetShardIdByPartitionId(kv_part_id, true); DeleteRange(kv_table_name, - KvPartitionIdOf(partition_id, true), + kv_part_id, + shard_id, start_key_str, end_key_str, false, @@ -1116,10 +1143,13 @@ DataStoreServiceClient::LoadRangeSlice( load_slice_req->kv_table_name_ = &(kv_info->GetKvTableName(table_name)); load_slice_req->kv_partition_id_ = KvPartitionIdOf(range_partition_id, true); + uint32_t data_shard_id = + GetShardIdByPartitionId(load_slice_req->kv_partition_id_, true); load_slice_req->kv_session_id_.clear(); ScanNext(*load_slice_req->kv_table_name_, load_slice_req->kv_partition_id_, + data_shard_id, load_slice_req->kv_start_key_, load_slice_req->kv_end_key_, "", // session_id @@ -1351,6 +1381,7 @@ bool DataStoreServiceClient::UpdateRangeSlices( // 2- write the segments to storage // Calculate kv_partition_id based on table_name. int32_t kv_partition_id = KvPartitionIdOf(table_name); + uint32_t data_shard_id = GetShardIdByPartitionId(kv_partition_id, false); std::vector keys; std::vector records; std::vector records_ts; @@ -1373,6 +1404,7 @@ bool DataStoreServiceClient::UpdateRangeSlices( callback_data->Reset(); BatchWriteRecords(kv_range_slices_table_name, kv_partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -1412,6 +1444,7 @@ bool DataStoreServiceClient::UpdateRangeSlices( op_types.emplace_back(WriteOpType::PUT); BatchWriteRecords(kv_range_table_name, kv_partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -1506,9 +1539,11 @@ bool DataStoreServiceClient::FetchTable(const txservice::TableName &table_name, fetch_table_callback_data_pool_.NextObject(); PoolableGuard guard(callback_data); callback_data->Reset(schema_image, found, version_ts); + uint32_t shard_id = GetShardIdByPartitionId(0, false); + Read(kv_table_catalogs_name, 0, - "", + shard_id, table_name.StringView(), callback_data, &FetchTableCallback); @@ -1550,6 +1585,7 @@ bool DataStoreServiceClient::DiscoverAllTableNames( ScanNext(kv_table_catalogs_name, 0, // kv_partition_id + GetShardIdByPartitionId(0, false), "", "", callback_data->session_id_, @@ -1603,6 +1639,7 @@ bool DataStoreServiceClient::UpsertDatabase(std::string_view db, BatchWriteRecords(kv_database_catalogs_name, 0, + GetShardIdByPartitionId(0, false), std::move(keys), std::move(records), std::move(records_ts), @@ -1659,6 +1696,7 @@ bool DataStoreServiceClient::DropDatabase(std::string_view db) BatchWriteRecords(kv_database_catalogs_name, 0, + GetShardIdByPartitionId(0, false), std::move(keys), std::move(records), std::move(records_ts), @@ -1706,9 +1744,11 @@ bool DataStoreServiceClient::FetchDatabase( fetch_db_callback_data_pool_.NextObject(); PoolableGuard guard(callback_data); callback_data->Reset(definition, found, yield_fptr, resume_fptr); + uint32_t shard_id = GetShardIdByPartitionId(0, false); + Read(kv_database_catalogs_name, 0, - "", + shard_id, db, callback_data, &FetchDatabaseCallback); @@ -1729,6 +1769,7 @@ bool DataStoreServiceClient::FetchAllDatabase( ScanNext(kv_database_catalogs_name, 0, + GetShardIdByPartitionId(0, false), callback_data->start_key_, callback_data->end_key_, callback_data->session_id_, @@ -1767,8 +1808,8 @@ bool DataStoreServiceClient::DropKvTable(const std::string &kv_table_name) // NOTICE: this function is not atomic void DataStoreServiceClient::DropKvTableAsync(const std::string &kv_table_name) { - // FIXME(lzx): this function may not be used now. - assert(false); + // FIXME(lzx): this function may not be used now, delete it. + LOG(WARNING) << "DropKvTableAsync should not be used (noop)"; AsyncDropTableCallbackData *callback_data = new AsyncDropTableCallbackData(); @@ -1933,33 +1974,6 @@ uint32_t DataStoreServiceClient::HashArchiveKey( return partition_id; } -void DataStoreServiceClient::EncodeKvKeyForHashPart(uint16_t bucket_id, - std::string &key_out) -{ - uint16_t be_bucket_id = EloqShare::host_to_big_endian(bucket_id); - key_out.append(reinterpret_cast(&be_bucket_id), - sizeof(be_bucket_id)); -} - -void DataStoreServiceClient::EncodeKvKeyForHashPart( - uint16_t bucket_id, const std::string_view &tx_key, std::string &key_out) -{ - uint16_t be_bucket_id = EloqShare::host_to_big_endian(bucket_id); - key_out.reserve(sizeof(uint16_t) + tx_key.size()); - key_out.append(reinterpret_cast(&be_bucket_id), - sizeof(be_bucket_id)); - key_out.append(tx_key.data(), tx_key.size()); -} - -std::string_view DataStoreServiceClient::DecodeKvKeyForHashPart( - const char *data, size_t size) -{ - assert(size >= sizeof(uint16_t)); - const char *tx_key_start = data + sizeof(uint16_t); - size_t tx_key_len = size - sizeof(uint16_t); - return std::string_view(tx_key_start, tx_key_len); -} - std::string DataStoreServiceClient::EncodeArchiveKey( std::string_view table_name, std::string_view key, uint64_t be_commit_ts) { @@ -2125,10 +2139,10 @@ bool DataStoreServiceClient::PutArchivesAll( for (size_t i = 0; i < archive_vec.size(); ++i) { txservice::TxKey tx_key = archive_vec[i].Key(); - uint32_t partition_id = + int32_t partition_id = HashArchiveKey(kv_table_name.data(), tx_key); auto [it, inserted] = partitions_map.try_emplace( - KvPartitionIdOf(partition_id, true)); + KvPartitionIdOf(partition_id, false)); if (inserted) { it->second.reserve(archive_vec.size() / 1024 * 2 * @@ -2175,6 +2189,8 @@ bool DataStoreServiceClient::PutArchivesAll( records_ttl.reserve(recs_cnt); op_types.reserve(recs_cnt); + uint32_t data_shard_id = GetShardIdByPartitionId(partition_id, false); + for (size_t i = 0; i < archive_ptrs.size(); ++i) { // Start a new batch if done with current partition. @@ -2192,6 +2208,7 @@ bool DataStoreServiceClient::PutArchivesAll( } BatchWriteRecords(kv_mvcc_archive_name, partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -2268,6 +2285,7 @@ bool DataStoreServiceClient::PutArchivesAll( { BatchWriteRecords(kv_mvcc_archive_name, partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -2346,6 +2364,7 @@ bool DataStoreServiceClient::CopyBaseToArchive( auto &table_name = flush_task_entry.front()->data_sync_task_->table_name_; auto &table_schema = flush_task_entry.front()->table_schema_; + bool is_range_partitioned = !table_name.IsHashPartitioned(); auto *catalog_factory = GetCatalogFactory(table_name.Engine()); assert(catalog_factory != nullptr); @@ -2379,21 +2398,19 @@ bool DataStoreServiceClient::CopyBaseToArchive( txservice::TxKey &tx_key = base_vec[base_idx].first; assert(tx_key.Data() != nullptr && tx_key.Size() > 0); - uint32_t partition_id = base_vec[base_idx].second; + int32_t partition_id = base_vec[base_idx].second; + int32_t kv_part_id = + KvPartitionIdOf(partition_id, is_range_partitioned); + uint32_t shard_id = + GetShardIdByPartitionId(kv_part_id, is_range_partitioned); + auto *callback_data = &callback_datas[base_idx]; callback_data->ResetResult(); size_t flying_cnt = callback_data->AddFlyingReadCount(); - std::string_view be_bucket_id = - table_name.IsHashPartitioned() - ? EncodeBucketId( - txservice::Sharder::MapKeyHashToBucketId( - tx_key.Hash())) - : std::string_view(); - Read(base_kv_table_name, - KvPartitionIdOf(partition_id, true), - be_bucket_id, + kv_part_id, + shard_id, std::string_view(tx_key.Data(), tx_key.Size()), callback_data, &SyncBatchReadForArchiveCallback); @@ -2429,17 +2446,10 @@ bool DataStoreServiceClient::CopyBaseToArchive( for (size_t i = 0; i < base_vec.size(); i++) { auto &callback_data = callback_datas[i]; - std::string_view tx_key_view = callback_data.key_str_; - if (table_name.IsHashPartitioned()) - { - tx_key_view = DecodeKvKeyForHashPart(tx_key_view.data(), - tx_key_view.size()); - } - - txservice::TxKey tx_key = catalog_factory->CreateTxKey( - tx_key_view.data(), tx_key_view.size()); - - batch_size += tx_key_view.size(); + txservice::TxKey tx_key = + catalog_factory->CreateTxKey(callback_data.key_str_.data(), + callback_data.key_str_.size()); + batch_size += callback_data.key_str_.size(); batch_size += callback_data.value_str_.size(); std::string_view val = callback_data.value_str_; size_t offset = 0; @@ -2449,7 +2459,7 @@ bool DataStoreServiceClient::CopyBaseToArchive( if (table_name.Engine() == txservice::TableEngine::EloqKv) { // mvcc is not used for EloqKV - assert(false); + LOG(WARNING) << "EloqKv engine not support mvcc feature"; txservice::TxObject *tx_object = static_cast(record.get()); record = tx_object->DeserializeObject(val.data(), offset); @@ -2473,7 +2483,8 @@ bool DataStoreServiceClient::CopyBaseToArchive( if (table_name.Engine() == txservice::TableEngine::EloqKv) { // should not be here - assert(false); + LOG(WARNING) + << "EloqKv engine not support mvcc feature"; ref.SetNonVersionedPayload(record.get()); } else @@ -2538,7 +2549,7 @@ bool DataStoreServiceClient::CopyBaseToArchive( * @param key The key to fetch archive records for. * @param archives Output vector to store the fetched archive records. * @param from_ts Starting timestamp for the archive fetch. - * @return Currently always returns false (not implemented). + * @return True if the archives are successfully fetched, false otherwise. */ bool DataStoreServiceClient::FetchArchives( const txservice::TableName &table_name, @@ -2547,7 +2558,8 @@ bool DataStoreServiceClient::FetchArchives( std::vector &archives, uint64_t from_ts) { - assert(false); + LOG(WARNING) << "FetchArchives should not be used because all " + "archive versions are fetched from ccmap. (noop)"; LOG(INFO) << "FetchArchives: table_name: " << table_name.StringView(); const std::string &kv_table_name = kv_info->GetKvTableName(table_name); @@ -2556,8 +2568,10 @@ bool DataStoreServiceClient::FetchArchives( kv_table_name, std::string_view(key.Data(), key.Size()), be_from_ts); std::string upper_bound_key = EncodeArchiveKey( kv_table_name, std::string_view(key.Data(), key.Size()), UINT64_MAX); - uint32_t partition_id = HashArchiveKey(kv_table_name, key); - int32_t kv_partition_id = KvPartitionIdOf(partition_id, true); + int32_t partition_id = HashArchiveKey(kv_table_name, key); + int32_t kv_partition_id = KvPartitionIdOf(partition_id, false); + uint32_t data_shard_id = GetShardIdByPartitionId(kv_partition_id, false); + size_t batch_size = 100; FetchArchivesCallbackData callback_data(kv_mvcc_archive_name, kv_partition_id, @@ -2569,6 +2583,7 @@ bool DataStoreServiceClient::FetchArchives( ScanNext(kv_mvcc_archive_name, kv_partition_id, + data_shard_id, lower_bound_key, upper_bound_key, callback_data.session_id_, @@ -2613,7 +2628,7 @@ bool DataStoreServiceClient::FetchArchives( if (table_name.Engine() == txservice::TableEngine::EloqKv) { // should not be here - assert(false); + LOG(WARNING) << "EloqKv engine not support mvcc feature"; } else { @@ -2643,7 +2658,8 @@ bool DataStoreServiceClient::FetchArchives( * @param rec Output parameter for the fetched record. * @param rec_status Output parameter for the record status. * @param commit_ts Output parameter for the commit timestamp. - * @return Currently always returns false (not implemented). + * @return True if the visible archive record is successfully fetched, false + * otherwise. */ bool DataStoreServiceClient::FetchVisibleArchive( const txservice::TableName &table_name, @@ -2654,7 +2670,9 @@ bool DataStoreServiceClient::FetchVisibleArchive( txservice::RecordStatus &rec_status, uint64_t &commit_ts) { - assert(false); + // TODO(lzx): Remove this function if not needed. + LOG(WARNING) << "FetchVisibleArchive should not be used because all " + "archive versions are fetched from ccmap. (noop)"; const std::string &kv_table_name = kv_info->GetKvTableName(table_name); uint64_t be_upper_bound_ts = EloqShare::host_to_big_endian(upper_bound_ts); @@ -2664,8 +2682,9 @@ bool DataStoreServiceClient::FetchVisibleArchive( be_upper_bound_ts); std::string upper_bound_key = EncodeArchiveKey( kv_table_name, std::string_view(key.Data(), key.Size()), 0); - uint32_t partition_id = HashArchiveKey(kv_table_name, key); - int32_t kv_partition_id = KvPartitionIdOf(partition_id, true); + int32_t partition_id = HashArchiveKey(kv_table_name, key); + int32_t kv_partition_id = KvPartitionIdOf(partition_id, false); + uint32_t data_shard_id = GetShardIdByPartitionId(kv_partition_id, false); size_t batch_size = 1; FetchArchivesCallbackData callback_data(kv_mvcc_archive_name, kv_partition_id, @@ -2676,6 +2695,7 @@ bool DataStoreServiceClient::FetchVisibleArchive( false); ScanNext(kv_mvcc_archive_name, kv_partition_id, + data_shard_id, lower_bound_key, upper_bound_key, callback_data.session_id_, @@ -2719,7 +2739,7 @@ bool DataStoreServiceClient::FetchVisibleArchive( if (table_name.Engine() == txservice::TableEngine::EloqKv) { // should not be here - assert(false); + LOG(WARNING) << "EloqKv engine not support mvcc feature"; } else { @@ -2758,13 +2778,16 @@ DataStoreServiceClient::FetchArchives(txservice::FetchRecordCc *fetch_cc) kv_table_name, std::string_view(key.Data(), key.Size()), be_read_ts); fetch_cc->kv_end_key_ = EncodeArchiveKey( kv_table_name, std::string_view(key.Data(), key.Size()), 0); - uint32_t partition_id = HashArchiveKey(kv_table_name, key); + int32_t partition_id = HashArchiveKey(kv_table_name, key); // Also use the partion_id in fetch_cc to store kv partition - fetch_cc->partition_id_ = KvPartitionIdOf(partition_id, true); + fetch_cc->partition_id_ = KvPartitionIdOf(partition_id, false); + uint32_t data_shard_id = + GetShardIdByPartitionId(fetch_cc->partition_id_, false); fetch_cc->kv_session_id_.clear(); ScanNext(kv_mvcc_archive_name, fetch_cc->partition_id_, + data_shard_id, fetch_cc->kv_start_key_, fetch_cc->kv_end_key_, fetch_cc->kv_session_id_, @@ -2794,11 +2817,13 @@ DataStoreServiceClient::FetchVisibleArchive( kv_table_name, std::string_view(key.Data(), key.Size()), be_read_ts); fetch_cc->kv_end_key_ = EncodeArchiveKey( kv_table_name, std::string_view(key.Data(), key.Size()), 0); - uint32_t partition_id = HashArchiveKey(kv_table_name, key); - int32_t kv_partition_id = KvPartitionIdOf(partition_id, true); + int32_t partition_id = HashArchiveKey(kv_table_name, key); + int32_t kv_partition_id = KvPartitionIdOf(partition_id, false); + uint32_t data_shard_id = GetShardIdByPartitionId(kv_partition_id, false); ScanNext(kv_mvcc_archive_name, kv_partition_id, + data_shard_id, fetch_cc->kv_start_key_, fetch_cc->kv_end_key_, "", @@ -2834,14 +2859,8 @@ bool DataStoreServiceClient::CreateSnapshotForBackup( { CreateSnapshotForBackupClosure *closure = create_snapshot_for_backup_closure_pool_.NextObject(); - uint32_t shard_cnt = AllDataShardCount(); - std::vector shard_ids; - shard_ids.reserve(shard_cnt); - for (uint32_t shard_id = 0; shard_id < shard_cnt; shard_id++) - { - shard_ids.push_back(shard_id); - } + std::vector shard_ids = GetAllDataShards(); CreateSnapshotForBackupCallbackData *callback_data = create_snapshot_for_backup_callback_data_pool_.NextObject(); PoolableGuard guard(callback_data); @@ -2938,8 +2957,8 @@ bool DataStoreServiceClient::NeedCopyRange() const void DataStoreServiceClient::RestoreTxCache(txservice::NodeGroupId cc_ng_id, int64_t cc_ng_term) { - LOG(ERROR) << "RestoreTxCache not implemented"; - assert(false); + LOG(ERROR) << "RestoreTxCache not implemented - operation skipped"; + // TODO: Implement if needed } /** @@ -2951,15 +2970,18 @@ void DataStoreServiceClient::RestoreTxCache(txservice::NodeGroupId cc_ng_id, * @param next_leader_node Pointer to store the next leader node ID (unused). * @return Always returns true. */ -bool DataStoreServiceClient::OnLeaderStart(uint32_t *next_leader_node) +bool DataStoreServiceClient::OnLeaderStart(uint32_t ng_id, + uint32_t *next_leader_node) { - DLOG(INFO) - << "DataStoreServiceClient OnLeaderStart called data_store_service_:" - << data_store_service_; + if (!bind_data_shard_with_ng_) + { + return true; + } + if (data_store_service_ != nullptr) { - // Now, only support one shard. - data_store_service_->OpenDataStore(0); + // Binded data store shard with ng. + data_store_service_->OpenDataStore(ng_id); } Connect(); @@ -2967,16 +2989,17 @@ bool DataStoreServiceClient::OnLeaderStart(uint32_t *next_leader_node) return true; } -bool DataStoreServiceClient::OnLeaderStop(int64_t term) +bool DataStoreServiceClient::OnLeaderStop(uint32_t ng_id, int64_t term) { - DLOG(INFO) - << "DataStoreServiceClient OnLeaderStop called data_store_service_:" - << data_store_service_; - // swith to read only in case of data store status is read write + if (!bind_data_shard_with_ng_) + { + return true; + } + if (data_store_service_ != nullptr) { - // Now, only support one shard. - data_store_service_->CloseDataStore(0); + // Close the data store shard. + data_store_service_->CloseDataStore(ng_id); } return true; } @@ -2988,23 +3011,25 @@ bool DataStoreServiceClient::OnLeaderStop(int64_t term) * following another leader and can be used to perform follower-specific * initialization. */ -void DataStoreServiceClient::OnStartFollowing(uint32_t leader_node_id, +void DataStoreServiceClient::OnStartFollowing(uint32_t ng_id, + uint32_t leader_node_id, int64_t term, int64_t standby_term, bool resubscribe) { - DLOG(INFO) - << "DataStoreServiceClient OnStartFollowing called data_store_service_:" - << data_store_service_; + if (!bind_data_shard_with_ng_) + { + return; + } + if (data_store_service_ != nullptr) { - // Now, only support one shard. - data_store_service_->CloseDataStore(0); + data_store_service_->CloseDataStore(ng_id); } // Treat leader_node_id as dss_leader_node_id uint32_t dss_leader_node_id = leader_node_id; - uint32_t dss_shard_id = txservice::Sharder::Instance().NativeNodeGroup(); + uint32_t dss_shard_id = ng_id; // Update leader node in cluster_manager if necessary auto ng_configs = txservice::Sharder::Instance().GetNodeGroupConfigs(); @@ -3048,14 +3073,9 @@ void DataStoreServiceClient::OnShutdown() } /** - * @brief Checks if a shard is local to this node. - * - * Determines whether the specified shard is owned by this node using the - * cluster manager. This is used for scale-up scenarios where data needs to be - * migrated from smaller to larger nodes. - * - * @param shard_id The shard ID to check. - * @return true if the shard is local to this node, false otherwise. + * @brief Check if the owner of shard is the local DataStoreService node. + * @param shard_id + * @return true if the owner of shard is the local DataStoreService node. */ bool DataStoreServiceClient::IsLocalShard(uint32_t shard_id) { @@ -3067,36 +3087,123 @@ bool DataStoreServiceClient::IsLocalShard(uint32_t shard_id) return false; } -/** - * @brief Checks if a partition is local to this node. - * - * Determines whether the specified partition is owned by this node using the - * cluster manager. Used for determining whether operations should be performed - * locally or remotely. - * - * @param partition_id The partition ID to check. - * @return true if the partition is local to this node, false otherwise. - */ -bool DataStoreServiceClient::IsLocalPartition(int32_t partition_id) +uint32_t DataStoreServiceClient::GetShardIdByPartitionId( + int32_t partition_id, bool is_range_partition) const { - return IsLocalShard(GetShardIdByPartitionId(partition_id)); + uint16_t bucket_id; + if (is_range_partition) + { + bucket_id = txservice::Sharder::MapRangeIdToBucketId(partition_id); + } + else + { + bucket_id = + txservice::Sharder::MapHashPartitionIdToBucketId(partition_id); + } + + auto it = bucket_infos_.find(bucket_id); + assert(it != bucket_infos_.end()); + if (it != bucket_infos_.end()) + { + uint32_t shard_id = it->second->BucketOwner(); + assert(dss_shard_ids_.find(shard_id) != dss_shard_ids_.end()); + return shard_id; + } + LOG(ERROR) << "Bucket not found for partition_id=" << partition_id + << " (bucket_id=" << bucket_id << ")"; + return UINT32_MAX; } -uint32_t DataStoreServiceClient::GetShardIdByPartitionId( - int32_t partition_id) const +std::vector DataStoreServiceClient::GetAllDataShards() { - // Now, only support one shard. - return 0; + // Ensure that the access of dss_shard_ids_ is thread-safe after + // support shard scaling. + std::shared_lock lock(dss_shard_ids_mutex_); + std::vector shard_ids; + shard_ids.reserve(dss_shard_ids_.size()); + for (auto shard_id : dss_shard_ids_) + { + shard_ids.push_back(shard_id); + } + + return shard_ids; } -uint32_t DataStoreServiceClient::AllDataShardCount() const +void DataStoreServiceClient::InitBucketsInfo( + const std::set &node_groups, + uint64_t version, + std::unordered_map> + &ng_bucket_infos) { - return dss_shards_.size(); + // Construct bucket info map on startup + if (node_groups.empty()) + { + LOG(ERROR) << "InitBucketsInfo called with empty node_groups"; + ng_bucket_infos.clear(); + return; + } + // Generate 64 random numbers for each node group as virtual nodes on + // hashing ring. Each bucket id belongs to the first virtual node that is + // larger than the bucket id. + ng_bucket_infos.clear(); + std::map rand_num_to_ng; + // use ng id as seed to generate random numbers + for (auto ng : node_groups) + { + // Thread-safe and deterministic random generator + std::mt19937 rng(ng); + std::uniform_int_distribution dist( + 0, txservice::total_range_buckets - 1); + size_t generated = 0; + while (generated < 64) + { + uint16_t rand_num = dist(rng); + if (rand_num_to_ng.find(rand_num) == rand_num_to_ng.end()) + { + generated++; + rand_num_to_ng.emplace(rand_num, ng); + } + if (rand_num_to_ng.size() >= txservice::total_range_buckets) + { + LOG(WARNING) + << "Cluster has too many node groups, need to reduce the " + "number of buckets held by each node group"; + break; + } + } + } + + // Insert bucket ids into the map. + auto it = rand_num_to_ng.begin(); + for (uint16_t bucket_id = 0; bucket_id < txservice::total_range_buckets; + bucket_id++) + { + // The buckets larger than the last random number belongs to the + // first virtual node on the ring. + if (it != rand_num_to_ng.end() && bucket_id >= it->first) + { + it++; + } + uint32_t ng_id = it == rand_num_to_ng.end() + ? rand_num_to_ng.begin()->second + : it->second; + auto insert_res = ng_bucket_infos.try_emplace( + bucket_id, std::make_unique(ng_id, version)); + if (insert_res.second) + { + insert_res.first->second->Set(ng_id, version); + } + } } uint32_t DataStoreServiceClient::GetOwnerNodeIndexOfShard( uint32_t shard_id) const { + if (shard_id >= dss_shards_.size()) + { + LOG(ERROR) << "shard_id " << shard_id << " exceeds array bounds"; + return UINT32_MAX; + } assert(dss_shards_[shard_id].load(std::memory_order_acquire) != UINT32_MAX); return dss_shards_[shard_id].load(std::memory_order_acquire); } @@ -3104,6 +3211,11 @@ uint32_t DataStoreServiceClient::GetOwnerNodeIndexOfShard( bool DataStoreServiceClient::UpdateOwnerNodeIndexOfShard( uint32_t shard_id, uint32_t old_node_index, uint32_t &new_node_index) { + if (shard_id >= dss_shards_.size()) + { + LOG(ERROR) << "shard_id " << shard_id << " exceeds array bounds"; + return false; + } new_node_index = dss_shards_[shard_id].load(std::memory_order_acquire); if (new_node_index != old_node_index) { @@ -3204,7 +3316,6 @@ void DataStoreServiceClient::HandleShardingError( } else { - assert(false); // the whole node group has changed LOG(FATAL) << "The topology of data shards is changed"; // TODO(lzx): handle the topology of cluster change. @@ -3218,10 +3329,8 @@ bool DataStoreServiceClient::UpgradeShardVersion(uint32_t shard_id, { if (shard_id >= dss_shards_.size()) { - assert(false); - // Now only support one shard. - LOG(FATAL) << "Shard id not found, shard_id: " << shard_id; - return true; + LOG(ERROR) << "shard_id " << shard_id << " exceeds array bounds"; + return false; } uint32_t node_index = dss_shards_[shard_id].load(std::memory_order_acquire); @@ -3254,7 +3363,8 @@ bool DataStoreServiceClient::UpgradeShardVersion(uint32_t shard_id, if (!dss_shards_[shard_id].compare_exchange_strong(node_index, free_node_index)) { - assert(false); + LOG(WARNING) << "Other thread updated the data shard, shard_id: " + << shard_id; free_node_ref.expired_ts_.store(1, std::memory_order_release); } } @@ -3287,16 +3397,14 @@ DataStoreServiceClient::FetchRecord( return FetchArchives(fetch_cc); } - std::string_view be_bucket_id = - fetch_cc->table_name_.IsHashPartitioned() - ? EncodeBucketId(txservice::Sharder::MapKeyHashToBucketId( - fetch_cc->tx_key_.Hash())) - : std::string_view(); + int32_t kv_partition_id = KvPartitionIdOf( + fetch_cc->partition_id_, !fetch_cc->table_name_.IsHashPartitioned()); + uint32_t shard_id = GetShardIdByPartitionId( + kv_partition_id, !fetch_cc->table_name_.IsHashPartitioned()); Read(fetch_cc->kv_table_name_, - KvPartitionIdOf(fetch_cc->partition_id_, - !fetch_cc->table_name_.IsHashPartitioned()), - be_bucket_id, + kv_partition_id, + shard_id, std::string_view(fetch_cc->tx_key_.Data(), fetch_cc->tx_key_.Size()), fetch_cc, &FetchRecordCallback); @@ -3341,44 +3449,34 @@ DataStoreServiceClient::FetchBucketData( assert(fetch_bucket_data_cc->table_name_.IsHashPartitioned()); int32_t kv_partition_id = - KvPartitionIdOf(txservice::Sharder::MapBucketIdToKvPartitionId( + KvPartitionIdOf(txservice::Sharder::MapBucketIdToHashPartitionId( fetch_bucket_data_cc->bucket_id_), false); + uint32_t shard_id = GetShardIdByPartitionId( + kv_partition_id, + !fetch_bucket_data_cc->table_name_.IsHashPartitioned()); fetch_bucket_data_cc->kv_start_key_.clear(); fetch_bucket_data_cc->kv_end_key_.clear(); - if (fetch_bucket_data_cc->start_key_type_ == + if (fetch_bucket_data_cc->start_key_type_ != txservice::KeyType::NegativeInf) - { - EncodeKvKeyForHashPart(fetch_bucket_data_cc->bucket_id_, - fetch_bucket_data_cc->kv_start_key_); - } - else { assert(fetch_bucket_data_cc->start_key_type_ == txservice::KeyType::Normal); - EncodeKvKeyForHashPart(fetch_bucket_data_cc->bucket_id_, - fetch_bucket_data_cc->StartKey(), - fetch_bucket_data_cc->kv_start_key_); + fetch_bucket_data_cc->kv_start_key_ = fetch_bucket_data_cc->StartKey(); } - if (fetch_bucket_data_cc->end_key_type_ == txservice::KeyType::PositiveInf) - { - EncodeKvKeyForHashPart(fetch_bucket_data_cc->bucket_id_ + 1, - fetch_bucket_data_cc->kv_end_key_); - } - else + if (fetch_bucket_data_cc->end_key_type_ != txservice::KeyType::PositiveInf) { assert(fetch_bucket_data_cc->end_key_type_ == txservice::KeyType::Normal); - EncodeKvKeyForHashPart(fetch_bucket_data_cc->bucket_id_, - fetch_bucket_data_cc->EndKey(), - fetch_bucket_data_cc->kv_end_key_); + fetch_bucket_data_cc->kv_end_key_ = fetch_bucket_data_cc->EndKey(); } ScanNext(fetch_bucket_data_cc->kv_table_name_, kv_partition_id, + shard_id, fetch_bucket_data_cc->kv_start_key_, fetch_bucket_data_cc->kv_end_key_, "", @@ -3412,16 +3510,14 @@ DataStoreServiceClient::FetchSnapshot(txservice::FetchSnapshotCc *fetch_cc) return FetchVisibleArchive(fetch_cc); } - std::string_view be_bucket_id = - fetch_cc->table_name_.IsHashPartitioned() - ? EncodeBucketId(txservice::Sharder::MapKeyHashToBucketId( - fetch_cc->tx_key_.Hash())) - : std::string_view(); + int32_t kv_part_id = KvPartitionIdOf( + fetch_cc->partition_id_, !fetch_cc->table_name_.IsHashPartitioned()); + uint32_t shard_id = GetShardIdByPartitionId( + kv_part_id, !fetch_cc->table_name_.IsHashPartitioned()); Read(fetch_cc->kv_table_name_, - KvPartitionIdOf(fetch_cc->partition_id_, - !fetch_cc->table_name_.IsHashPartitioned()), - be_bucket_id, + kv_part_id, + shard_id, std::string_view(fetch_cc->tx_key_.Data(), fetch_cc->tx_key_.Size()), fetch_cc, &FetchSnapshotCallback); @@ -3430,8 +3526,8 @@ DataStoreServiceClient::FetchSnapshot(txservice::FetchSnapshotCc *fetch_cc) } void DataStoreServiceClient::Read(const std::string_view kv_table_name, - const uint32_t partition_id, - std::string_view be_bucket_id, + const int32_t partition_id, + const uint32_t shard_id, const std::string_view key, void *callback_data, DataStoreCallback callback) @@ -3440,7 +3536,7 @@ void DataStoreServiceClient::Read(const std::string_view kv_table_name, read_clouse->Reset(this, kv_table_name, partition_id, - be_bucket_id, + shard_id, key, callback_data, callback); @@ -3449,11 +3545,12 @@ void DataStoreServiceClient::Read(const std::string_view kv_table_name, void DataStoreServiceClient::ReadInternal(ReadClosure *read_closure) { - if (IsLocalPartition(read_closure->PartitionId())) + if (IsLocalShard(read_closure->ShardId())) { read_closure->PrepareRequest(true); data_store_service_->Read(read_closure->TableName(), read_closure->PartitionId(), + read_closure->ShardId(), read_closure->Key(), &read_closure->LocalValueRef(), &read_closure->LocalTsRef(), @@ -3464,8 +3561,7 @@ void DataStoreServiceClient::ReadInternal(ReadClosure *read_closure) else { read_closure->PrepareRequest(false); - uint32_t node_index = GetOwnerNodeIndexOfShard( - GetShardIdByPartitionId(read_closure->PartitionId())); + uint32_t node_index = GetOwnerNodeIndexOfShard(read_closure->ShardId()); read_closure->SetRemoteNodeIndex(node_index); auto *channel = dss_nodes_[node_index].Channel(); @@ -3480,6 +3576,7 @@ void DataStoreServiceClient::ReadInternal(ReadClosure *read_closure) void DataStoreServiceClient::DeleteRange(const std::string_view table_name, const int32_t partition_id, + const uint32_t shard_id, const std::string &start_key, const std::string &end_key, const bool skip_wal, @@ -3491,6 +3588,7 @@ void DataStoreServiceClient::DeleteRange(const std::string_view table_name, closure->Reset(*this, table_name, partition_id, + shard_id, start_key, end_key, skip_wal, @@ -3503,11 +3601,12 @@ void DataStoreServiceClient::DeleteRange(const std::string_view table_name, void DataStoreServiceClient::DeleteRangeInternal( DeleteRangeClosure *delete_range_clouse) { - if (IsLocalPartition(delete_range_clouse->PartitionId())) + if (IsLocalShard(delete_range_clouse->ShardId())) { delete_range_clouse->PrepareRequest(true); data_store_service_->DeleteRange(delete_range_clouse->TableName(), delete_range_clouse->PartitionId(), + delete_range_clouse->ShardId(), delete_range_clouse->StartKey(), delete_range_clouse->EndKey(), delete_range_clouse->SkipWal(), @@ -3517,8 +3616,8 @@ void DataStoreServiceClient::DeleteRangeInternal( else { delete_range_clouse->PrepareRequest(false); - uint32_t node_index = GetOwnerNodeIndexOfShard( - GetShardIdByPartitionId(delete_range_clouse->PartitionId())); + uint32_t node_index = + GetOwnerNodeIndexOfShard(delete_range_clouse->ShardId()); delete_range_clouse->SetRemoteNodeIndex(node_index); auto *channel = dss_nodes_[node_index].Channel(); @@ -3537,13 +3636,7 @@ void DataStoreServiceClient::FlushData( DataStoreCallback callback) { FlushDataClosure *closure = flush_data_closure_pool_.NextObject(); - uint32_t shard_cnt = AllDataShardCount(); - std::vector shard_ids; - shard_ids.reserve(shard_cnt); - for (uint32_t shard_id = 0; shard_id < shard_cnt; shard_id++) - { - shard_ids.push_back(shard_id); - } + std::vector shard_ids = GetAllDataShards(); closure->Reset( *this, &kv_table_names, std::move(shard_ids), callback_data, callback); @@ -3588,14 +3681,7 @@ void DataStoreServiceClient::DropTable(std::string_view table_name, DLOG(INFO) << "DropTableWithRetry for table: " << table_name; DropTableClosure *closure = drop_table_closure_pool_.NextObject(); - uint32_t shard_cnt = AllDataShardCount(); - std::vector shard_ids; - shard_ids.reserve(shard_cnt); - for (uint32_t shard_id = 0; shard_id < shard_cnt; shard_id++) - { - shard_ids.push_back(shard_id); - } - + std::vector shard_ids = GetAllDataShards(); closure->Reset( *this, table_name, std::move(shard_ids), callback_data, callback); @@ -3633,7 +3719,8 @@ void DataStoreServiceClient::DropTableInternal( void DataStoreServiceClient::ScanNext( const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, const std::string_view start_key, const std::string_view end_key, const std::string_view session_id, @@ -3650,6 +3737,7 @@ void DataStoreServiceClient::ScanNext( closure->Reset(*this, table_name, partition_id, + shard_id, start_key, end_key, inclusive_start, @@ -3667,12 +3755,13 @@ void DataStoreServiceClient::ScanNext( void DataStoreServiceClient::ScanNextInternal( ScanNextClosure *scan_next_closure) { - if (IsLocalPartition(scan_next_closure->PartitionId())) + if (IsLocalShard(scan_next_closure->ShardId())) { scan_next_closure->PrepareRequest(true); data_store_service_->ScanNext( scan_next_closure->TableName(), scan_next_closure->PartitionId(), + scan_next_closure->ShardId(), scan_next_closure->StartKey(), scan_next_closure->EndKey(), scan_next_closure->InclusiveStart(), @@ -3689,8 +3778,8 @@ void DataStoreServiceClient::ScanNextInternal( else { scan_next_closure->PrepareRequest(false); - uint32_t node_index = GetOwnerNodeIndexOfShard( - GetShardIdByPartitionId(scan_next_closure->PartitionId())); + uint32_t node_index = + GetOwnerNodeIndexOfShard(scan_next_closure->ShardId()); scan_next_closure->SetRemoteNodeIndex(node_index); auto *channel = dss_nodes_[node_index].Channel(); @@ -3704,7 +3793,8 @@ void DataStoreServiceClient::ScanNextInternal( } void DataStoreServiceClient::ScanClose(const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, std::string &session_id, void *callback_data, DataStoreCallback callback) @@ -3713,6 +3803,7 @@ void DataStoreServiceClient::ScanClose(const std::string_view table_name, closure->Reset(*this, table_name, partition_id, + shard_id, "", // start_key (empty for scan close) "", // end_key (empty for scan close) false, // inclusive_start @@ -3730,11 +3821,12 @@ void DataStoreServiceClient::ScanClose(const std::string_view table_name, void DataStoreServiceClient::ScanCloseInternal( ScanNextClosure *scan_next_closure) { - if (IsLocalPartition(scan_next_closure->PartitionId())) + if (IsLocalShard(scan_next_closure->ShardId())) { scan_next_closure->PrepareRequest(true); data_store_service_->ScanClose(scan_next_closure->TableName(), scan_next_closure->PartitionId(), + scan_next_closure->ShardId(), &scan_next_closure->LocalSessionIdRef(), &scan_next_closure->LocalResultRef(), scan_next_closure); @@ -3742,8 +3834,8 @@ void DataStoreServiceClient::ScanCloseInternal( else { scan_next_closure->PrepareRequest(false); - uint32_t node_index = GetOwnerNodeIndexOfShard( - GetShardIdByPartitionId(scan_next_closure->PartitionId())); + uint32_t node_index = + GetOwnerNodeIndexOfShard(scan_next_closure->ShardId()); scan_next_closure->SetRemoteNodeIndex(node_index); auto *channel = dss_nodes_[node_index].Channel(); @@ -3761,6 +3853,7 @@ bool DataStoreServiceClient::InitTableRanges( { // init_partition_id and kv_partition_id int32_t kv_partition_id = KvPartitionIdOf(table_name); + uint32_t data_shard_id = GetShardIdByPartitionId(kv_partition_id, false); int32_t init_range_id = txservice::Sequences::InitialRangePartitionIdOf(table_name); auto catalog_factory = GetCatalogFactory(table_name.Engine()); @@ -3789,6 +3882,7 @@ bool DataStoreServiceClient::InitTableRanges( op_types.emplace_back(WriteOpType::PUT); BatchWriteRecords(kv_range_table_name, kv_partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -3812,6 +3906,7 @@ bool DataStoreServiceClient::DeleteTableRanges( const txservice::TableName &table_name) { int32_t kv_partition_id = KvPartitionIdOf(table_name); + uint32_t data_shard_id = GetShardIdByPartitionId(kv_partition_id, false); // delete all slices info from {kv_range_slices_table_name} table std::string start_key = table_name.String(); std::string end_key = start_key; @@ -3822,6 +3917,7 @@ bool DataStoreServiceClient::DeleteTableRanges( callback_data->Reset(); DeleteRange(kv_range_slices_table_name, kv_partition_id, + data_shard_id, start_key, end_key, false, @@ -3841,6 +3937,7 @@ bool DataStoreServiceClient::DeleteTableRanges( callback_data->Reset(); DeleteRange(kv_range_table_name, kv_partition_id, + data_shard_id, start_key, end_key, false, @@ -3902,8 +3999,21 @@ bool DataStoreServiceClient::InitTableLastRangePartitionId( { encoded_tx_record = SerializeTxRecord(false, seq_pair.second.get()); } - int32_t kv_partition_id = - KvPartitionIdOf(txservice::Sequences::table_name_); + + int32_t kv_partition_id; + uint32_t data_shard_id; + + if (txservice::Sequences::table_name_.IsHashPartitioned()) + { + kv_partition_id = txservice::Sharder::MapKeyHashToHashPartitionId( + seq_pair.first.Hash()); + data_shard_id = GetShardIdByPartitionId(kv_partition_id, false); + } + else + { + LOG(ERROR) << "Sequences table must be hash partitioned"; + return false; + } for (int i = 0; i < 3; i++) { @@ -3919,6 +4029,7 @@ bool DataStoreServiceClient::InitTableLastRangePartitionId( BatchWriteRecords(txservice::Sequences::kv_table_name_sv_, kv_partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -3952,6 +4063,7 @@ bool DataStoreServiceClient::DeleteTableStatistics( const txservice::TableName &base_table_name) { int32_t kv_partition_id = KvPartitionIdOf(base_table_name); + uint32_t data_shard_id = GetShardIdByPartitionId(kv_partition_id, false); // delete all sample keys from {kv_table_statistics_name} table std::string start_key = base_table_name.String(); @@ -3963,6 +4075,7 @@ bool DataStoreServiceClient::DeleteTableStatistics( callback_data->Reset(); DeleteRange(kv_table_statistics_name, kv_partition_id, + data_shard_id, start_key, end_key, false, @@ -3983,6 +4096,7 @@ bool DataStoreServiceClient::DeleteTableStatistics( callback_data->Reset(); DeleteRange(kv_table_statistics_version_name, kv_partition_id, + data_shard_id, start_key, end_key, false, @@ -4004,6 +4118,7 @@ bool DataStoreServiceClient::DeleteTableStatistics( void DataStoreServiceClient::BatchWriteRecords( std::string_view kv_table_name, int32_t partition_id, + uint32_t shard_id, std::vector &&key_parts, std::vector &&record_parts, std::vector &&records_ts, @@ -4022,6 +4137,7 @@ void DataStoreServiceClient::BatchWriteRecords( closure->Reset(*this, kv_table_name, partition_id, + shard_id, std::move(key_parts), std::move(record_parts), std::move(records_ts), @@ -4040,13 +4156,13 @@ void DataStoreServiceClient::BatchWriteRecordsInternal( BatchWriteRecordsClosure *closure) { assert(closure != nullptr); - uint32_t req_shard_id = GetShardIdByPartitionId(closure->partition_id_); - if (IsLocalShard(req_shard_id)) + if (IsLocalShard(closure->shard_id_)) { closure->PrepareRequest(true); data_store_service_->BatchWriteRecords(closure->kv_table_name_, closure->partition_id_, + closure->shard_id_, closure->key_parts_, closure->record_parts_, closure->record_ts_, @@ -4062,7 +4178,7 @@ void DataStoreServiceClient::BatchWriteRecordsInternal( { // prepare request closure->PrepareRequest(false); - uint32_t node_index = GetOwnerNodeIndexOfShard(req_shard_id); + uint32_t node_index = GetOwnerNodeIndexOfShard(closure->shard_id_); closure->SetRemoteNodeIndex(node_index); auto *channel = dss_nodes_[node_index].Channel(); @@ -4158,7 +4274,6 @@ bool DataStoreServiceClient::DeserializeTxRecordStr( bool DataStoreServiceClient::InitPreBuiltTables() { - int32_t partition_id = 0; uint64_t table_version = 100U; std::vector keys; std::vector records; @@ -4218,8 +4333,12 @@ bool DataStoreServiceClient::InitPreBuiltTables() SyncCallbackData *callback_data = sync_callback_data_pool_.NextObject(); PoolableGuard guard(callback_data); callback_data->Reset(); + int32_t partition_id = 0; + int32_t kv_part_id = KvPartitionIdOf(partition_id, false); + uint32_t data_shard_id = GetShardIdByPartitionId(kv_part_id, false); BatchWriteRecords(kv_table_catalogs_name, - partition_id, + kv_part_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -4439,6 +4558,7 @@ bool DataStoreServiceClient::UpsertCatalog( table_schema->GetBaseTableName(); const std::string &catalog_image = table_schema->SchemaImage(); int32_t partition_id = 0; + uint32_t data_shard_id = GetShardIdByPartitionId(partition_id, false); keys.emplace_back(base_table_name.StringView()); records.emplace_back( @@ -4449,6 +4569,7 @@ bool DataStoreServiceClient::UpsertCatalog( BatchWriteRecords(kv_table_catalogs_name, partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -4484,6 +4605,7 @@ bool DataStoreServiceClient::DeleteCatalog( // Delete table catalog image int32_t partition_id = 0; + uint32_t data_shard_id = GetShardIdByPartitionId(partition_id, false); keys.emplace_back(base_table_name.StringView()); records.emplace_back(std::string_view()); @@ -4493,6 +4615,7 @@ bool DataStoreServiceClient::DeleteCatalog( BatchWriteRecords(kv_table_catalogs_name, partition_id, + data_shard_id, std::move(keys), std::move(records), std::move(records_ts), @@ -4540,11 +4663,9 @@ void DataStoreServiceClient::PreparePartitionBatches( if (ckpt_rec.payload_status_ == txservice::RecordStatus::Normal && (!ckpt_rec.Payload()->HasTTL() || ttl > now)) { - batch_request.key_parts.emplace_back(EncodeBucketId( - txservice::Sharder::MapKeyHashToBucketId(tx_key.Hash()))); batch_request.key_parts.emplace_back( std::string_view(tx_key.Data(), tx_key.Size())); - batch_size += tx_key.Size() + sizeof(uint16_t); + batch_size += tx_key.Size(); const txservice::TxRecord *rec = ckpt_rec.Payload(); batch_request.record_parts.emplace_back(std::string_view( @@ -4562,11 +4683,9 @@ void DataStoreServiceClient::PreparePartitionBatches( } else { - batch_request.key_parts.emplace_back(EncodeBucketId( - txservice::Sharder::MapKeyHashToBucketId(tx_key.Hash()))); batch_request.key_parts.emplace_back( std::string_view(tx_key.Data(), tx_key.Size())); - batch_size += tx_key.Size() + sizeof(uint16_t); + batch_size += tx_key.Size(); batch_request.record_parts.emplace_back(std::string_view()); batch_size += 0; @@ -4591,11 +4710,9 @@ void DataStoreServiceClient::PreparePartitionBatches( bool is_deleted = !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal); - batch_request.key_parts.emplace_back(EncodeBucketId( - txservice::Sharder::MapKeyHashToBucketId(tx_key.Hash()))); batch_request.key_parts.emplace_back( std::string_view(tx_key.Data(), tx_key.Size())); - batch_size += tx_key.Size() + sizeof(uint16_t); + batch_size += tx_key.Size(); const txservice::TxRecord *rec = ckpt_rec.Payload(); if (is_deleted) diff --git a/data_store_service_client.h b/data_store_service_client.h index 33c7501..1471d01 100644 --- a/data_store_service_client.h +++ b/data_store_service_client.h @@ -71,30 +71,44 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler ~DataStoreServiceClient(); DataStoreServiceClient( + bool is_bootstrap, txservice::CatalogFactory *catalog_factory[3], const DataStoreServiceClusterManager &cluster_manager, + bool bind_data_shard_with_ng, DataStoreService *data_store_service = nullptr) : catalog_factory_array_{catalog_factory[0], catalog_factory[1], catalog_factory[2], &range_catalog_factory_, &hash_catalog_factory_}, - data_store_service_(data_store_service) + data_store_service_(data_store_service), + need_bootstrap_(is_bootstrap), + bind_data_shard_with_ng_(bind_data_shard_with_ng) { // Init dss cluster config. dss_topology_version_ = cluster_manager.GetTopologyVersion(); auto all_shards = cluster_manager.GetAllShards(); - assert(all_shards.size() == 1); + for (auto &[shard_id, shard] : all_shards) { + if (shard_id >= dss_shards_.size()) + { + LOG(FATAL) << "Shard id " << shard_id + << " is out of range, should expand the hard-coded " + "dss_shards_ size."; + } uint32_t node_idx = FindFreeNodeIndex(); auto &node_ref = dss_nodes_[node_idx]; node_ref.Reset(shard.nodes_[0].host_name_, shard.nodes_[0].port_, shard.version_); - dss_shards_[shard_id].store(shard_id); + dss_shards_[shard_id].store(node_idx); + dss_shard_ids_.insert(shard_id); } + // init bucket infos + InitBucketsInfo(dss_shard_ids_, 0, bucket_infos_); + if (data_store_service_ != nullptr) { data_store_service_->AddListenerForUpdateConfig( @@ -107,15 +121,6 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler << txservice::Sequences::table_name_sv_; AppendPreBuiltTable(txservice::Sequences::table_name_); - - be_bucket_ids_.reserve(txservice::Sharder::TotalRangeBuckets()); - for (uint16_t bucket_id = 0; - bucket_id < txservice::Sharder::TotalRangeBuckets(); - ++bucket_id) - { - uint16_t be_bucket_id = EloqShare::host_to_big_endian(bucket_id); - be_bucket_ids_.push_back(be_bucket_id); - } } // The maximum number of retries for RPC requests. @@ -132,11 +137,10 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler } static void TxConfigsToDssClusterConfig( - uint32_t dss_node_id, // = 0, - uint32_t ng_id, // = 0, + uint32_t node_id, // = 0, const std::unordered_map> &ng_configs, - uint32_t dss_leader_node_id, // if no leader,set uint32t_max + const std::unordered_map &ng_leaders, DataStoreServiceClusterManager &cluster_manager); void ConnectToLocalDataStoreService( @@ -388,11 +392,12 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler void RestoreTxCache(txservice::NodeGroupId cc_ng_id, int64_t cc_ng_term) override; - bool OnLeaderStart(uint32_t *next_leader_node) override; + bool OnLeaderStart(uint32_t ng_id, uint32_t *next_leader_node) override; - bool OnLeaderStop(int64_t term) override; + bool OnLeaderStop(uint32_t ng_id, int64_t term) override; - void OnStartFollowing(uint32_t leader_node_id, + void OnStartFollowing(uint32_t ng_id, + uint32_t leader_node_id, int64_t term, int64_t standby_term, bool resubscribe) override; @@ -439,15 +444,6 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler static uint32_t HashArchiveKey(const std::string &kv_table_name, const txservice::TxKey &tx_key); - static void EncodeKvKeyForHashPart(uint16_t bucket_id, - std::string &key_out); - static void EncodeKvKeyForHashPart(uint16_t bucket_id, - const std::string_view &tx_key, - std::string &key_out); - - static std::string_view DecodeKvKeyForHashPart(const char *data, - size_t size); - // NOTICE: be_commit_ts is the big endian encode value of commit_ts static std::string EncodeArchiveKey(std::string_view table_name, std::string_view key, @@ -488,6 +484,9 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler bool DeleteCatalog(const txservice::TableName &base_table_name, uint64_t write_time); + uint32_t GetShardIdByPartitionId(int32_t partition_id, + bool is_range_partition) const; + private: int32_t MapKeyHashToPartitionId(const txservice::TxKey &key) const { @@ -500,8 +499,8 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler // ===================================================== void Read(const std::string_view kv_table_name, - const uint32_t partition_id, - const std::string_view be_bucket_id, + const int32_t partition_id, + const uint32_t shard_id, const std::string_view key, void *callback_data, DataStoreCallback callback); @@ -511,6 +510,7 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler void BatchWriteRecords( std::string_view kv_table_name, int32_t partition_id, + uint32_t shard_id, std::vector &&key_parts, std::vector &&record_parts, std::vector &&records_ts, @@ -553,6 +553,7 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler */ void DeleteRange(const std::string_view table_name, const int32_t partition_id, + uint32_t shard_id, const std::string &start_key, const std::string &end_key, const bool skip_wal, @@ -572,7 +573,8 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler void ScanNext( const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, const std::string_view start_key, const std::string_view end_key, const std::string_view session_id, @@ -588,7 +590,8 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler void ScanNextInternal(ScanNextClosure *scan_next_closure); void ScanClose(const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, std::string &session_id, void *callback_data, DataStoreCallback callback); @@ -625,29 +628,15 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler // statistics and etc.). int32_t KvPartitionIdOf(const txservice::TableName &table) const { -#ifdef USE_ONE_ELOQDSS_PARTITION - return 0; -#else std::string_view sv = table.StringView(); - return (std::hash()(sv)) & 0x3FF; -#endif + auto hash_code = std::hash()(sv); + return txservice::Sharder::MapKeyHashToHashPartitionId(hash_code); } int32_t KvPartitionIdOf(int32_t key_partition, bool is_range_partition = true) { -#ifdef USE_ONE_ELOQDSS_PARTITION - if (is_range_partition) - { - return key_partition; - } - else - { - return 0; - } -#else return key_partition; -#endif } const txservice::CatalogFactory *GetCatalogFactory( @@ -657,25 +646,29 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler } /** - * @brief Check if the shard_id is local to the current node. + * @brief Check if the owner of shard is the local DataStoreService node. * @param shard_id - * @return true if the shard_id is local to the current node. + * @return true if the owner of shard is the local DataStoreService node */ bool IsLocalShard(uint32_t shard_id); - /** - * @brief Check if the partition_id is local to the current node. - * @param partition_id - * @return true if the partition_id is local to the current node. + * @brief Get the index of the shard's owner node in dss_nodes_. + * @param shard_id + * @return uint32_t */ - bool IsLocalPartition(int32_t partition_id); - - uint32_t GetShardIdByPartitionId(int32_t partition_id) const; - uint32_t AllDataShardCount() const; uint32_t GetOwnerNodeIndexOfShard(uint32_t shard_id) const; + std::vector GetAllDataShards(); bool UpdateOwnerNodeIndexOfShard(uint32_t shard_id, uint32_t old_node_index, uint32_t &new_node_index); + void InitBucketsInfo( + const std::set &node_groups, + uint64_t version, + std::unordered_map> + &ng_bucket_infos); + + void UpdateShardOwner(uint32_t shard_id, uint32_t node_id); + uint32_t FindFreeNodeIndex(); void HandleShardingError(const ::EloqDS::remote::CommonResult &result); bool UpgradeShardVersion(uint32_t shard_id, @@ -683,25 +676,18 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler const std::string &host_name, uint16_t port); - std::string_view EncodeBucketId(uint16_t bucket_id) - { - uint16_t &be_bucket_id = be_bucket_ids_[bucket_id]; - return std::string_view(reinterpret_cast(&be_bucket_id), - sizeof(uint16_t)); - } - txservice::EloqHashCatalogFactory hash_catalog_factory_{}; txservice::EloqRangeCatalogFactory range_catalog_factory_{}; // TODO(lzx): define a global catalog factory array that used by // EngineServer TxService and DataStoreHandler std::array catalog_factory_array_; - // bthread::Mutex ds_service_mutex_; - // bthread::ConditionVariable ds_service_cv_; - // std::atomic ds_serv_shutdown_indicator_; // point to the data store service if it is colocated DataStoreService *data_store_service_; + bool need_bootstrap_{false}; + bool bind_data_shard_with_ng_{false}; + struct DssNode { DssNode() = default; @@ -760,20 +746,20 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler const uint64_t NodeExpiredTime = 10 * 1000 * 1000; // 10s // Now only support one shard. dss_shards_ caches the index in dss_nodes_ of // shard owner. - std::array, 1> dss_shards_; + std::array, 1000> dss_shards_; std::atomic dss_topology_version_{0}; - // std::atomic flying_remote_fetch_count_{0}; - // // Work queue for fetch records from primary node - // std::deque remote_fetch_cc_queue_; + std::shared_mutex dss_shard_ids_mutex_; + std::set dss_shard_ids_; + // key is bucket id, value is bucket info. + std::unordered_map> + bucket_infos_; // table names and their kv table names std::unordered_map pre_built_table_names_; ThreadWorkerPool upsert_table_worker_{1}; - std::vector be_bucket_ids_; - friend class ReadClosure; friend class BatchWriteRecordsClosure; friend class FlushDataClosure; diff --git a/data_store_service_client_closure.cpp b/data_store_service_client_closure.cpp index eb3fa17..8b60d7f 100644 --- a/data_store_service_client_closure.cpp +++ b/data_store_service_client_closure.cpp @@ -69,11 +69,8 @@ void SyncBatchReadForArchiveCallback(void *data, if (err_code == remote::DataStoreError::KEY_NOT_FOUND) { LOG(INFO) << "BatchReadForArchiveCallback, key not found: " - << read_closure->Key().back() << " , set as deleted"; - // callback_data->SetErrorCode( - // static_cast(txservice::CcErrorCode::DATA_STORE_ERR)); - // assert(false); - std::string_view key_str = read_closure->Key().back(); + << read_closure->Key() << " , set as deleted"; + std::string_view key_str = read_closure->Key(); uint64_t ts = 1U; uint64_t ttl = 0U; std::string value_str = client.SerializeTxRecord(true, nullptr); @@ -96,7 +93,7 @@ void SyncBatchReadForArchiveCallback(void *data, } else { - std::string_view key_str = read_closure->Key().back(); + std::string_view key_str = read_closure->Key(); std::string &value_str = read_closure->ValueStringRef(); uint64_t ts = read_closure->Ts(); uint64_t ttl = read_closure->Ttl(); @@ -178,7 +175,7 @@ void FetchRecordCallback(void *data, val, is_deleted, offset)) { LOG(ERROR) << "====fetch record===decode error==" << " key: " - << read_closure->Key().back() + << read_closure->Key() << " status: " << (int) fetch_cc->rec_status_; std::abort(); } @@ -264,10 +261,8 @@ void FetchBucketDataCallback(void *data, } else { - std::string tx_key( - client.DecodeKvKeyForHashPart(key_str.data(), key_str.size())); fetch_bucket_data_cc->AddDataItem( - std::move(tx_key), std::move(value_str), ts, false); + std::move(key_str), std::move(value_str), ts, false); } } @@ -280,12 +275,16 @@ void FetchBucketDataCallback(void *data, fetch_bucket_data_cc->is_drained_ = false; if (fetch_bucket_data_cc->bucket_data_items_.empty()) { - int32_t kv_partition_id = client.KvPartitionIdOf( - txservice::Sharder::MapBucketIdToKvPartitionId( - fetch_bucket_data_cc->bucket_id_), - false); + int32_t partition_id = + txservice::Sharder::MapBucketIdToHashPartitionId( + fetch_bucket_data_cc->bucket_id_); + int32_t kv_partition_id = + client.KvPartitionIdOf(partition_id, false); + uint32_t data_shard_id = + client.GetShardIdByPartitionId(kv_partition_id, false); client.ScanNext(fetch_bucket_data_cc->kv_table_name_, kv_partition_id, + data_shard_id, fetch_bucket_data_cc->kv_start_key_, fetch_bucket_data_cc->kv_end_key_, "", @@ -352,9 +351,7 @@ void FetchSnapshotCallback(void *data, val, is_deleted, offset)) { LOG(ERROR) << "====fetch snapshot===decode error==" - << " key: " - << std::string_view(fetch_cc->tx_key_.Data(), - fetch_cc->tx_key_.Size()) + << " key: " << read_closure->Key() << " status: " << (int) fetch_cc->rec_status_; std::abort(); } @@ -516,8 +513,12 @@ void PartitionBatchCallback(void *data, if (partition_state->GetNextBatch(next_batch)) { // Send the next batch + BatchWriteRecordsClosure *batch_closure = + static_cast(closure); + uint32_t data_shard_id = batch_closure->ShardId(); client.BatchWriteRecords(callback_data->table_name, partition_state->partition_id, + data_shard_id, std::move(next_batch.key_parts), std::move(next_batch.record_parts), std::move(next_batch.records_ts), @@ -615,6 +616,7 @@ void FetchAllDatabaseCallback(void *data, fetch_data->session_id_ = scan_next_closure->SessionId(); client.ScanNext(kv_database_catalogs_name, 0, + scan_next_closure->ShardId(), fetch_data->dbnames_->back(), fetch_data->end_key_, fetch_data->session_id_, @@ -683,6 +685,7 @@ void DiscoverAllTableNamesCallback(void *data, fetch_data->session_id_ = scan_next_closure->SessionId(); client.ScanNext(kv_table_catalogs_name, 0, + scan_next_closure->ShardId(), fetch_data->table_names_->back(), "", fetch_data->session_id_, @@ -815,6 +818,7 @@ void FetchTableRangesCallback(void *data, client.ScanNext(kv_range_table_name, fetch_range_cc->kv_partition_id_, + scan_next_closure->ShardId(), fetch_range_cc->kv_start_key_, fetch_range_cc->kv_end_key_, fetch_range_cc->kv_session_id_, @@ -912,7 +916,7 @@ void FetchRangeSlicesCallback(void *data, client.Read(kv_range_slices_table_name, fetch_req->kv_partition_id_, - "", + read_closure->ShardId(), fetch_req->kv_start_key_, fetch_req, &FetchRangeSlicesCallback); @@ -1009,7 +1013,7 @@ void FetchRangeSlicesCallback(void *data, fetch_req->CurrentSegmentId()); client.Read(kv_range_slices_table_name, fetch_req->kv_partition_id_, - "", + read_closure->ShardId(), fetch_req->kv_start_key_, fetch_req, @@ -1127,6 +1131,7 @@ void FetchTableStatsCallback(void *data, fetch_cc->kv_start_key_ = std::move(key); client.ScanNext(kv_table_statistics_name, fetch_cc->kv_partition_id_, + scan_next_closure->ShardId(), fetch_cc->kv_start_key_, fetch_cc->kv_end_key_, fetch_cc->kv_session_id_, @@ -1233,6 +1238,7 @@ void LoadRangeSliceCallback(void *data, // has more data, continue to scan. client.ScanNext(*fill_store_slice_req->kv_table_name_, fill_store_slice_req->kv_partition_id_, + scan_next_closure->ShardId(), fill_store_slice_req->kv_start_key_, fill_store_slice_req->kv_end_key_, fill_store_slice_req->kv_session_id_, @@ -1301,6 +1307,7 @@ void FetchArchivesCallback(void *data, { client.ScanNext(fetch_data->kv_table_name_, fetch_data->partition_id_, + scan_next_closure->ShardId(), fetch_data->start_key_, fetch_data->end_key_, scan_next_closure->SessionId(), @@ -1409,6 +1416,7 @@ void FetchRecordArchivesCallback(void *data, client.ScanNext(kv_mvcc_archive_name, fetch_cc->partition_id_, + scan_next_closure->ShardId(), fetch_cc->kv_start_key_, fetch_cc->kv_end_key_, fetch_cc->kv_session_id_, @@ -1434,6 +1442,7 @@ void FetchRecordArchivesCallback(void *data, client.ScanNext(kv_mvcc_archive_name, fetch_cc->partition_id_, + scan_next_closure->ShardId(), fetch_cc->kv_start_key_, fetch_cc->kv_end_key_, fetch_cc->kv_session_id_, diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h index ee0c7f3..3644638 100644 --- a/data_store_service_client_closure.h +++ b/data_store_service_client_closure.h @@ -496,7 +496,7 @@ struct ReadBaseForArchiveCallbackData return flying_read_cnt_; } - void AddResult(uint32_t partition_id, + void AddResult(int32_t partition_id, const std::string_view key, std::string &&value, uint64_t ts, @@ -529,7 +529,7 @@ struct ReadBaseForArchiveCallbackData bthread::ConditionVariable &cv_; size_t &flying_read_cnt_; int &error_code_; - uint32_t partition_id_; + int32_t partition_id_; std::string_view key_str_; std::string value_str_; uint64_t ts_; @@ -599,8 +599,8 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable void Reset(DataStoreServiceClient *client, const std::string_view table_name, - const uint32_t partition_id, - std::string_view be_bucket_id, + const int32_t partition_id, + const uint32_t shard_id, std::string_view key, void *callback_data, DataStoreCallback callback) @@ -610,11 +610,8 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable retry_count_ = 0; table_name_ = table_name; partition_id_ = partition_id; - if (!be_bucket_id.empty()) - { - key_parts_.emplace_back(be_bucket_id); - } - key_parts_.emplace_back(key); + shard_id_ = shard_id; + key_ = key; ds_service_client_ = client; callback_data_ = callback_data; callback_ = callback; @@ -631,8 +628,9 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable request_.Clear(); response_.Clear(); table_name_ = ""; - partition_id_ = 0; - key_parts_.clear(); + partition_id_ = INT32_MAX; + shard_id_ = UINT32_MAX; + key_ = ""; result_.Clear(); value_.clear(); ts_ = 0; @@ -659,15 +657,9 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable } request_.Clear(); request_.set_kv_table_name(table_name_.data(), table_name_.size()); + request_.set_shard_id(shard_id_); request_.set_partition_id(partition_id_); - - for (size_t idx = 0; idx < key_parts_.size(); ++idx) - { - std::string *key_part = request_.add_key_str(); - key_part->append(key_parts_[idx].data(), - key_parts_[idx].size()); - } - + request_.set_key_str(key_.data(), key_.size()); rpc_request_prepare_ = true; } } @@ -691,12 +683,9 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable cntl_.ErrorCode() != EAGAIN && cntl_.ErrorCode() != brpc::ERPCTIMEDOUT) { - uint32_t shard_id = - ds_service_client_->GetShardIdByPartitionId( - partition_id_); uint32_t new_node_index; ds_service_client_->UpdateOwnerNodeIndexOfShard( - shard_id, remote_node_index_, new_node_index); + shard_id_, remote_node_index_, new_node_index); // Retry if (retry_count_ < ds_service_client_->retry_limit_) @@ -765,14 +754,19 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable return table_name_; } - uint32_t PartitionId() + int32_t PartitionId() { return partition_id_; } + + uint32_t ShardId() + { + return shard_id_; + } - const std::vector &Key() + const std::string_view Key() { - return key_parts_; + return key_; } std::string &LocalValueRef() @@ -891,8 +885,9 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable // serve local call std::string_view table_name_; - uint32_t partition_id_; - std::vector key_parts_; + int32_t partition_id_; + uint32_t shard_id_; + std::string_view key_; ::EloqDS::remote::CommonResult result_; std::string value_; uint64_t ts_; @@ -1158,7 +1153,8 @@ class DeleteRangeClosure : public ::google::protobuf::Closure, public Poolable void Reset(DataStoreServiceClient &store_hd, const std::string_view table_name, - const uint32_t partition_id, + const int32_t partition_id, + const uint32_t shard_id, const std::string &start_key, const std::string &end_key, const bool skip_wal, @@ -1172,6 +1168,7 @@ class DeleteRangeClosure : public ::google::protobuf::Closure, public Poolable ds_service_client_ = &store_hd; table_name_ = table_name; partition_id_ = partition_id; + shard_id_ = shard_id; start_key_ = start_key; end_key_ = end_key; skip_wal_ = skip_wal; @@ -1200,6 +1197,7 @@ class DeleteRangeClosure : public ::google::protobuf::Closure, public Poolable request_.Clear(); request_.set_kv_table_name(table_name_.data(), table_name_.size()); request_.set_partition_id(partition_id_); + request_.set_shard_id(shard_id_); request_.set_start_key(start_key_.data(), start_key_.size()); request_.set_end_key(end_key_.data(), end_key_.size()); request_.set_skip_wal(skip_wal_); @@ -1225,12 +1223,9 @@ class DeleteRangeClosure : public ::google::protobuf::Closure, public Poolable cntl_.ErrorCode() != EAGAIN && cntl_.ErrorCode() != brpc::ERPCTIMEDOUT) { - uint32_t shard_id = - ds_service_client_->GetShardIdByPartitionId( - partition_id_); uint32_t new_node_index; ds_service_client_->UpdateOwnerNodeIndexOfShard( - shard_id, remote_node_index_, new_node_index); + shard_id_, remote_node_index_, new_node_index); // Retry if (retry_count_ < ds_service_client_->retry_limit_) @@ -1298,11 +1293,16 @@ class DeleteRangeClosure : public ::google::protobuf::Closure, public Poolable return table_name_; } - uint32_t PartitionId() + int32_t PartitionId() { return partition_id_; } + uint32_t ShardId() + { + return shard_id_; + } + std::string_view StartKey() { return start_key_; @@ -1353,7 +1353,8 @@ class DeleteRangeClosure : public ::google::protobuf::Closure, public Poolable bool is_local_request_{false}; bool rpc_request_prepare_{false}; std::string_view table_name_; - uint32_t partition_id_; + int32_t partition_id_; + uint32_t shard_id_; std::string_view start_key_; std::string_view end_key_; bool skip_wal_{false}; @@ -1601,7 +1602,8 @@ class BatchWriteRecordsClosure : public ::google::protobuf::Closure, is_local_request_ = false; kv_table_name_ = ""; - partition_id_ = 0; + partition_id_ = INT32_MAX; + shard_id_ = UINT32_MAX; key_parts_.clear(); record_parts_.clear(); record_ts_.clear(); @@ -1625,6 +1627,7 @@ class BatchWriteRecordsClosure : public ::google::protobuf::Closure, void Reset(DataStoreServiceClient &store_hd, std::string_view kv_table_name, int32_t partition_id, + uint32_t shard_id, std::string_view key, std::string_view record, uint64_t record_ts, @@ -1641,6 +1644,7 @@ class BatchWriteRecordsClosure : public ::google::protobuf::Closure, ds_service_client_ = &store_hd; kv_table_name_ = kv_table_name; partition_id_ = partition_id; + shard_id_ = shard_id; key_parts_.emplace_back(key); record_parts_.emplace_back(record); record_ts_.emplace_back(record_ts); @@ -1656,6 +1660,7 @@ class BatchWriteRecordsClosure : public ::google::protobuf::Closure, void Reset(DataStoreServiceClient &store_hd, std::string_view kv_table_name, int32_t partition_id, + uint32_t shard_id, std::vector &&key_parts, std::vector &&record_parts, std::vector &&record_ts, @@ -1672,6 +1677,7 @@ class BatchWriteRecordsClosure : public ::google::protobuf::Closure, ds_service_client_ = &store_hd; kv_table_name_ = kv_table_name; partition_id_ = partition_id; + shard_id_ = shard_id; key_parts_ = std::move(key_parts); record_parts_ = std::move(record_parts); @@ -1704,12 +1710,9 @@ class BatchWriteRecordsClosure : public ::google::protobuf::Closure, cntl_.ErrorCode() != EAGAIN && cntl_.ErrorCode() != brpc::ERPCTIMEDOUT) { - uint32_t req_shard_id = - ds_service_client_->GetShardIdByPartitionId( - partition_id_); uint32_t new_node_index; ds_service_client_->UpdateOwnerNodeIndexOfShard( - req_shard_id, remote_node_index_, new_node_index); + shard_id_, remote_node_index_, new_node_index); need_retry = true; } @@ -1771,6 +1774,7 @@ class BatchWriteRecordsClosure : public ::google::protobuf::Closure, request_.set_kv_table_name(kv_table_name_.data(), kv_table_name_.size()); request_.set_partition_id(partition_id_); + request_.set_shard_id(shard_id_); request_.set_skip_wal(skip_wal_); assert(record_ts_.size() * parts_cnt_per_key_ == key_parts_.size()); assert(record_ts_.size() * parts_cnt_per_record_ == @@ -1843,6 +1847,16 @@ class BatchWriteRecordsClosure : public ::google::protobuf::Closure, remote_node_index_ = remote_node_index; } + int32_t PartitionId() const + { + return partition_id_; + } + + uint32_t ShardId() const + { + return shard_id_; + } + private: brpc::Controller cntl_; EloqDS::remote::BatchWriteRecordsRequest request_; @@ -1853,6 +1867,7 @@ class BatchWriteRecordsClosure : public ::google::protobuf::Closure, std::string_view kv_table_name_; int32_t partition_id_; + uint32_t shard_id_; std::vector key_parts_; std::vector record_parts_; std::vector record_ts_; @@ -1895,7 +1910,8 @@ class ScanNextClosure : public ::google::protobuf::Closure, public Poolable is_local_request_ = false; rpc_request_prepare_ = false; table_name_ = ""; - partition_id_ = 0; + partition_id_ = INT32_MAX; + shard_id_ = UINT32_MAX; start_key_ = ""; end_key_ = ""; inclusive_start_ = false; @@ -1917,7 +1933,8 @@ class ScanNextClosure : public ::google::protobuf::Closure, public Poolable void Reset( DataStoreServiceClient &store_hd, const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, const std::string_view start_key, const std::string_view end_key, bool inclusive_start, @@ -1937,6 +1954,7 @@ class ScanNextClosure : public ::google::protobuf::Closure, public Poolable ds_service_client_ = &store_hd; table_name_ = table_name; partition_id_ = partition_id; + shard_id_ = shard_id; start_key_ = start_key; end_key_ = end_key; inclusive_start_ = inclusive_start; @@ -1987,7 +2005,7 @@ class ScanNextClosure : public ::google::protobuf::Closure, public Poolable request_.set_kv_table_name_str(table_name_.data(), table_name_.size()); request_.set_partition_id(partition_id_); - + request_.set_shard_id(shard_id_); request_.set_start_key(start_key_.data(), start_key_.size()); request_.set_inclusive_start(inclusive_start_); request_.set_inclusive_end(inclusive_end_); @@ -2032,12 +2050,9 @@ class ScanNextClosure : public ::google::protobuf::Closure, public Poolable cntl_.ErrorCode() != EAGAIN && cntl_.ErrorCode() != brpc::ERPCTIMEDOUT) { - uint32_t shard_id = - ds_service_client_->GetShardIdByPartitionId( - partition_id_); uint32_t new_node_index; ds_service_client_->UpdateOwnerNodeIndexOfShard( - shard_id, remote_node_index_, new_node_index); + shard_id_, remote_node_index_, new_node_index); // Retry if (retry_count_ < ds_service_client_->retry_limit_) @@ -2105,11 +2120,16 @@ class ScanNextClosure : public ::google::protobuf::Closure, public Poolable return table_name_; } - uint32_t PartitionId() + int32_t PartitionId() { return partition_id_; } + uint32_t ShardId() + { + return shard_id_; + } + const std::string_view StartKey() { return start_key_; @@ -2252,7 +2272,8 @@ class ScanNextClosure : public ::google::protobuf::Closure, public Poolable bool is_local_request_{false}; bool rpc_request_prepare_{false}; std::string_view table_name_; - uint32_t partition_id_; + int32_t partition_id_; + uint32_t shard_id_; std::string_view start_key_; std::string_view end_key_; bool inclusive_start_{false}; @@ -2907,7 +2928,7 @@ void FetchTableStatsCallback(void *data, struct FetchArchivesCallbackData : public SyncCallbackData { FetchArchivesCallbackData(const std::string_view kv_table_name, - uint32_t partition_id, + int32_t partition_id, std::string &start_key, const std::string &end_key, const size_t batch_size, @@ -2925,7 +2946,7 @@ struct FetchArchivesCallbackData : public SyncCallbackData } const std::string_view kv_table_name_; - const uint32_t partition_id_; + const int32_t partition_id_; std::string &start_key_; const std::string &end_key_; const size_t batch_size_; @@ -3011,4 +3032,4 @@ void CreateSnapshotForBackupCallback(void *data, DataStoreServiceClient &client, const remote::CommonResult &result); -} // namespace EloqDS +} // namespace EloqDS \ No newline at end of file diff --git a/data_store_service_scanner.cpp b/data_store_service_scanner.cpp index 405eebb..8322981 100644 --- a/data_store_service_scanner.cpp +++ b/data_store_service_scanner.cpp @@ -64,6 +64,7 @@ bool SinglePartitionScanner::FetchNextBatch() client->ScanNext( scanner_->GetTableName(), partition_id_, + data_shard_id_, last_key_, scanner_->GetEndKey(), session_id_, @@ -107,6 +108,7 @@ bool SinglePartitionScanner::ScanClose() { client->ScanClose(scanner_->GetTableName(), partition_id_, + data_shard_id_, session_id_, this, ProcessScanCloseResult); @@ -412,7 +414,9 @@ bool DataStoreServiceHashPartitionScanner::Init() for (uint32_t part_cnt = 0; part_cnt < HASH_PARTITION_COUNT; part_cnt++) { auto *part_scanner = single_partition_scanner_pool_.NextObject(); - part_scanner->Reset(this, part_cnt, start_key_); + uint32_t data_shard_id = client_->GetShardIdByPartitionId( + static_cast(part_cnt), false); + part_scanner->Reset(this, part_cnt, data_shard_id, start_key_); partition_scanners_.push_back(part_scanner); // ignore the return value, since the scanner is not used part_scanner->FetchNextBatch(); diff --git a/data_store_service_scanner.h b/data_store_service_scanner.h index 73162aa..c5ed621 100644 --- a/data_store_service_scanner.h +++ b/data_store_service_scanner.h @@ -205,11 +205,13 @@ class SinglePartitionScanner : public Poolable } void Reset(DataStoreServiceScanner *scanner, - uint32_t partition_id, + int32_t partition_id, + uint32_t data_shard_id, const std::string_view start_key) { scanner_ = scanner; partition_id_ = partition_id; + data_shard_id_ = data_shard_id; last_key_ = start_key; session_id_ = ""; last_batch_size_ = scanner_->GetBatchSize(); @@ -247,6 +249,7 @@ class SinglePartitionScanner : public Poolable { scanner_ = nullptr; partition_id_ = 0; + data_shard_id_ = 0; last_key_ = ""; session_id_ = ""; last_batch_size_ = 0; @@ -259,7 +262,8 @@ class SinglePartitionScanner : public Poolable void ResetCache(); DataStoreServiceScanner *scanner_; - uint32_t partition_id_; + int32_t partition_id_; + uint32_t data_shard_id_{0}; std::string last_key_; uint32_t last_batch_size_; std::string session_id_; @@ -277,11 +281,8 @@ class DataStoreServiceHashPartitionScanner : public txservice::store::DataStoreScanner, public DataStoreServiceScanner { -#ifdef USE_ONE_ELOQDSS_PARTITION - static constexpr uint32_t HASH_PARTITION_COUNT = 1; -#else - static constexpr uint32_t HASH_PARTITION_COUNT = 0x400; // 1024 -#endif + static constexpr uint32_t HASH_PARTITION_COUNT = + txservice::total_hash_partitions; public: DataStoreServiceHashPartitionScanner( diff --git a/eloq_data_store_service/CMakeLists.txt b/eloq_data_store_service/CMakeLists.txt index 99a0253..b3a9b7a 100644 --- a/eloq_data_store_service/CMakeLists.txt +++ b/eloq_data_store_service/CMakeLists.txt @@ -57,11 +57,6 @@ set(WITH_DATA_STORE "ELOQDSS_ROCKSDB_CLOUD_S3" CACHE STRING "The KV data store t set_property(CACHE WITH_DATA_STORE PROPERTY STRINGS "ELOQDSS_ROCKSDB_CLOUD_S3" "ELOQDSS_ROCKSDB_CLOUD_GCS" "ELOQDSS_ROCKSDB" "ELOQDSS_ELOQSTORE") message(NOTICE "With DATA_STORE: ${WITH_DATA_STORE}") -option(USE_ONE_ELOQDSS_PARTITION_ENABLED "Whether use one partition for kv store" ON) -message(NOTICE "USE_ONE_ELOQDSS_PARTITION_ENABLED : ${USE_ONE_ELOQDSS_PARTITION_ENABLED}") -if (USE_ONE_ELOQDSS_PARTITION_ENABLED) - add_compile_definitions(USE_ONE_ELOQDSS_PARTITION) -endif() # Add compile flags for KV stores if (WITH_DATA_STORE STREQUAL "ELOQDSS_ROCKSDB_CLOUD_S3") diff --git a/eloq_data_store_service/data_store_service.cpp b/eloq_data_store_service/data_store_service.cpp index d97d4b9..749cada 100644 --- a/eloq_data_store_service/data_store_service.cpp +++ b/eloq_data_store_service/data_store_service.cpp @@ -219,6 +219,12 @@ DataStoreService::DataStoreService( assert(data_store_factory_ != nullptr); // Create the directory if it doesn't exist std::filesystem::create_directories(migration_log_path_); + + for (size_t i = 0; i < data_shards_.size(); i++) + { + data_shards_[i].shard_id_ = i; + data_shards_[i].shard_status_.store(DSShardStatus::Closed); + } } DataStoreService::~DataStoreService() @@ -250,19 +256,13 @@ DataStoreService::~DataStoreService() } // shutdown all data_store - if (shard_status_.load(std::memory_order_acquire) != DSShardStatus::Closed) + for (auto &it : data_shards_) { - if (data_store_ != nullptr) - { - data_store_->Shutdown(); - } - data_store_ = nullptr; + it.ShutDown(); } } -bool DataStoreService::StartService(bool create_db_if_missing, - uint32_t dss_leader_node_id, - uint32_t dss_node_id) +bool DataStoreService::StartService(bool create_db_if_missing) { if (server_ != nullptr) { @@ -270,49 +270,45 @@ bool DataStoreService::StartService(bool create_db_if_missing, } auto dss_shards = cluster_manager_.GetShardsForThisNode(); - assert(dss_shards.size() <= 1); - assert(shard_status_.load(std::memory_order_acquire) == - DSShardStatus::Closed); + LOG(INFO) << "DataStoreService start with shards: " << dss_shards.size(); + if (!dss_shards.empty()) { - shard_id_ = dss_shards.at(0); - auto open_mode = cluster_manager_.FetchDSShardStatus(shard_id_); - DLOG(INFO) << "StartService data store shard id:" << shard_id_ - << ", open_mode:" << static_cast(open_mode) - << ", create_db_if_missing:" << create_db_if_missing - << ", dss_leader_node_id:" << dss_leader_node_id - << ", dss_node_id:" << dss_node_id; - if (open_mode == DSShardStatus::ReadOnly || - open_mode == DSShardStatus::ReadWrite) - { - auto expect_status = DSShardStatus::Closed; - if (shard_status_.compare_exchange_strong(expect_status, - DSShardStatus::Starting)) + for (uint32_t shard_id : dss_shards) + { + auto &ds_ref = data_shards_[shard_id]; + auto open_mode = cluster_manager_.FetchDSShardStatus(shard_id); + if (open_mode == DSShardStatus::ReadOnly || + open_mode == DSShardStatus::ReadWrite) { - // start underling db if this dss node is the - // leader dss node - data_store_ = data_store_factory_->CreateDataStore( - create_db_if_missing, - shard_id_, - this, - dss_leader_node_id == dss_node_id); - if (data_store_ == nullptr) + auto expect_status = DSShardStatus::Closed; + if (ds_ref.shard_status_.compare_exchange_strong( + expect_status, DSShardStatus::Starting)) { - LOG(ERROR) << "Failed to create data store on starting " - "DataStoreService."; - return false; - } + ds_ref.data_store_ = data_store_factory_->CreateDataStore( + create_db_if_missing, shard_id, this, true); + if (ds_ref.data_store_ == nullptr) + { + LOG(ERROR) << "Failed to create data store on starting " + "DataStoreService, shard id: " + << shard_id; + return false; + } + ds_ref.scan_iter_cache_ = + std::make_unique(); - if (open_mode == DSShardStatus::ReadOnly) - { - data_store_->SwitchToReadOnly(); + if (open_mode == DSShardStatus::ReadOnly) + { + ds_ref.data_store_->SwitchToReadOnly(); + } + ds_ref.shard_status_.store(open_mode, + std::memory_order_release); + LOG(INFO) << "Created data store on starting " + "DataStoreService, shard id: " + << shard_id; } - shard_status_.store(open_mode, std::memory_order_release); } } - - DLOG(INFO) << "Created data store shard id:" << shard_id_ - << ", shard_status:" << static_cast(shard_status_); } server_ = std::make_unique(); @@ -325,13 +321,13 @@ bool DataStoreService::StartService(bool create_db_if_missing, brpc::ServerOptions options; options.num_threads = 0; options.has_builtin_services = true; - if (server_->Start(cluster_manager_.GetThisNode().port_, &options) != 0) + auto this_node = cluster_manager_.GetThisNode(); + if (server_->Start(this_node.port_, &options) != 0) { LOG(ERROR) << "Failed to start DataStoreService"; return false; } - LOG(INFO) << "DataStoreService started on port " - << cluster_manager_.GetThisNode().port_; + LOG(INFO) << "DataStoreService started on port " << this_node.port_; #ifdef DATA_STORE_TYPE_ELOQDSS_ROCKSDB_CLOUD_S3 // Start file cache sync worker (for primary node to send file cache to standby) @@ -362,10 +358,11 @@ bool DataStoreService::ConnectAndStartDataStore(uint32_t data_shard_id, return true; } // assert(open_mode == DSShardStatus::ReadOnly); + auto &shard_ref = data_shards_.at(data_shard_id); DSShardStatus expect_status = DSShardStatus::Closed; - if (!shard_status_.compare_exchange_strong(expect_status, - DSShardStatus::Starting)) + if (!shard_ref.shard_status_.compare_exchange_strong( + expect_status, DSShardStatus::Starting)) { if (expect_status == open_mode) { @@ -374,29 +371,29 @@ bool DataStoreService::ConnectAndStartDataStore(uint32_t data_shard_id, while (expect_status == DSShardStatus::Starting) { bthread_usleep(10000); - expect_status = shard_status_.load(std::memory_order_acquire); + expect_status = + shard_ref.shard_status_.load(std::memory_order_acquire); } return expect_status == open_mode; } // Make sure file sync is not running - while (is_file_sync_running_.load(std::memory_order_relaxed)) + while (shard_ref.is_file_sync_running_.load(std::memory_order_relaxed)) { bthread_usleep(10000); } DLOG(INFO) << "Connecting and starting data store for shard id:" - << data_shard_id - << ", open_mode:" << static_cast(open_mode) - << ", create_db_if_missing:" << create_db_if_missing - << ", data_store_ is null:" << (data_store_ == nullptr); + << data_shard_id << ", open_mode:" << static_cast(open_mode) + << ", create_db_if_missing:" << create_db_if_missing + << ", data_store_ is null:" + << (shard_ref.data_store_ == nullptr); assert(data_store_factory_ != nullptr); - if (data_store_ == nullptr) + if (shard_ref.data_store_ == nullptr) { - shard_id_ = data_shard_id; - data_store_ = data_store_factory_->CreateDataStore( + shard_ref.data_store_ = data_store_factory_->CreateDataStore( create_db_if_missing, data_shard_id, this, true); - if (data_store_ == nullptr) + if (shard_ref.data_store_ == nullptr) { LOG(ERROR) << "Failed to create data store"; return false; @@ -404,24 +401,29 @@ bool DataStoreService::ConnectAndStartDataStore(uint32_t data_shard_id, } else { - assert(shard_id_ == data_shard_id); - bool res = data_store_->Initialize(); + bool res = shard_ref.data_store_->Initialize(); if (!res) { LOG(ERROR) << "Failed to initialize data store"; return false; } - res = data_store_->StartDB(); + res = shard_ref.data_store_->StartDB(); if (!res) { LOG(ERROR) << "Failed to start db instance in data store service"; return false; } } + + if (shard_ref.scan_iter_cache_ == nullptr) + { + shard_ref.scan_iter_cache_ = std::make_unique(); + } + if (open_mode == DSShardStatus::ReadOnly) { - data_store_->SwitchToReadOnly(); + shard_ref.data_store_->SwitchToReadOnly(); cluster_manager_.SwitchShardToReadOnly(data_shard_id, DSShardStatus::Closed); } @@ -433,7 +435,7 @@ bool DataStoreService::ConnectAndStartDataStore(uint32_t data_shard_id, } expect_status = DSShardStatus::Starting; - shard_status_.compare_exchange_strong( + shard_ref.shard_status_.compare_exchange_strong( expect_status, open_mode, std::memory_order_release); return true; } @@ -443,18 +445,18 @@ void DataStoreService::Read(::google::protobuf::RpcController *controller, ::EloqDS::remote::ReadResponse *response, ::google::protobuf::Closure *done) { - uint32_t partition_id = request->partition_id(); - uint32_t shard_id = GetShardIdByPartitionId(partition_id); + uint32_t shard_id = request->shard_id(); if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); auto *result = response->mutable_result(); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadOnly && shard_status != DSShardStatus::ReadWrite) { @@ -465,33 +467,33 @@ void DataStoreService::Read(::google::protobuf::RpcController *controller, return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); // decrease read req count when read done ReadRpcRequest *req = rpc_read_request_pool_.NextObject(); req->Reset(this, request, response, done); - data_store_->Read(req); + ds_ref.data_store_->Read(req); } void DataStoreService::Read(const std::string_view table_name, - const uint32_t partition_id, - const std::vector &key, + const int32_t partition_id, + const uint32_t shard_id, + const std::string_view key, std::string *record, uint64_t *ts, uint64_t *ttl, ::EloqDS::remote::CommonResult *result, ::google::protobuf::Closure *done) { - uint32_t shard_id = GetShardIdByPartitionId(partition_id); - if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadOnly && shard_status != DSShardStatus::ReadWrite) { @@ -503,11 +505,19 @@ void DataStoreService::Read(const std::string_view table_name, return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); ReadLocalRequest *req = local_read_request_pool_.NextObject(); - req->Reset( - this, table_name, partition_id, &key, record, ts, ttl, result, done); - data_store_->Read(req); + req->Reset(this, + table_name, + partition_id, + shard_id, + key, + record, + ts, + ttl, + result, + done); + ds_ref.data_store_->Read(req); } void DataStoreService::FlushData( @@ -521,16 +531,17 @@ void DataStoreService::FlushData( { brpc::ClosureGuard done_guard(done); ::EloqDS::remote::CommonResult *result = response->mutable_result(); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); + DataShard &ds_ref = data_shards_.at(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); ::EloqDS::remote::CommonResult *result = response->mutable_result(); if (shard_status == DSShardStatus::Closed) @@ -547,13 +558,13 @@ void DataStoreService::FlushData( return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); FlushDataRpcRequest *req = rpc_flush_data_req_pool_.NextObject(); req->Reset(this, request, response, done); // Process request async. - data_store_->FlushData(req); + ds_ref.data_store_->FlushData(req); } void DataStoreService::FlushData(const std::vector &kv_table_names, @@ -564,16 +575,17 @@ void DataStoreService::FlushData(const std::vector &kv_table_names, if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, &result); + PrepareShardingError(shard_id, &result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); + DataShard &ds_ref = data_shards_.at(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); if (shard_status == DSShardStatus::Closed) { @@ -589,13 +601,13 @@ void DataStoreService::FlushData(const std::vector &kv_table_names, return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); FlushDataLocalRequest *req = local_flush_data_req_pool_.NextObject(); - req->Reset(this, &kv_table_names, result, done); + req->Reset(this, &kv_table_names, shard_id, result, done); // Process request async. - data_store_->FlushData(req); + ds_ref.data_store_->FlushData(req); } void DataStoreService::DeleteRange( @@ -604,21 +616,22 @@ void DataStoreService::DeleteRange( ::EloqDS::remote::DeleteRangeResponse *response, ::google::protobuf::Closure *done) { - uint32_t shard_id = GetShardIdByPartitionId(request->partition_id()); + uint32_t shard_id = request->shard_id(); if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); ::EloqDS::remote::CommonResult *result = response->mutable_result(); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); // This object helps to call done->Run() in RAII style. If you need to // process the request asynchronously, pass done_guard.release(). brpc::ClosureGuard done_guard(done); @@ -637,38 +650,38 @@ void DataStoreService::DeleteRange( return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); DeleteRangeRpcRequest *req = rpc_delete_range_req_pool_.NextObject(); req->Reset(this, request, response, done); // Process request async. - data_store_->DeleteRange(req); + ds_ref.data_store_->DeleteRange(req); } void DataStoreService::DeleteRange(const std::string_view table_name, - const uint32_t partition_id, + const int32_t partition_id, + const uint32_t shard_id, const std::string_view start_key, const std::string_view end_key, const bool skip_wal, remote::CommonResult &result, ::google::protobuf::Closure *done) { - uint32_t shard_id = GetShardIdByPartitionId(partition_id); - if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, &result); + PrepareShardingError(shard_id, &result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); if (shard_status == DSShardStatus::Closed) { @@ -684,12 +697,13 @@ void DataStoreService::DeleteRange(const std::string_view table_name, return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); DeleteRangeLocalRequest *req = local_delete_range_req_pool_.NextObject(); req->Reset(this, table_name, partition_id, + shard_id, start_key, end_key, skip_wal, @@ -697,7 +711,7 @@ void DataStoreService::DeleteRange(const std::string_view table_name, done); // Process request async. - data_store_->DeleteRange(req); + ds_ref.data_store_->DeleteRange(req); } void DataStoreService::CreateTable( @@ -711,16 +725,17 @@ void DataStoreService::CreateTable( { brpc::ClosureGuard done_guard(done); ::EloqDS::remote::CommonResult *result = response->mutable_result(); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); ::EloqDS::remote::CommonResult *result = response->mutable_result(); if (shard_status == DSShardStatus::Closed) @@ -737,13 +752,13 @@ void DataStoreService::CreateTable( return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); CreateTableRpcRequest *req = rpc_create_table_req_pool_.NextObject(); req->Reset(this, request, response, done); // Process request async. - data_store_->CreateTable(req); + ds_ref.data_store_->CreateTable(req); } void DataStoreService::CreateTable(const std::string_view table_name, @@ -754,16 +769,17 @@ void DataStoreService::CreateTable(const std::string_view table_name, if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, &result); + PrepareShardingError(shard_id, &result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); if (shard_status == DSShardStatus::Closed) { @@ -779,13 +795,13 @@ void DataStoreService::CreateTable(const std::string_view table_name, return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); CreateTableLocalRequest *req = local_create_table_req_pool_.NextObject(); - req->Reset(this, table_name, result, done); + req->Reset(this, table_name, shard_id, result, done); // Process request async. - data_store_->CreateTable(req); + ds_ref.data_store_->CreateTable(req); } void DataStoreService::DropTable( @@ -799,16 +815,17 @@ void DataStoreService::DropTable( { brpc::ClosureGuard done_guard(done); ::EloqDS::remote::CommonResult *result = response->mutable_result(); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); ::EloqDS::remote::CommonResult *result = response->mutable_result(); if (shard_status == DSShardStatus::Closed) @@ -825,13 +842,13 @@ void DataStoreService::DropTable( return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); DropTableRpcRequest *req = rpc_drop_table_req_pool_.NextObject(); req->Reset(this, request, response, done); // Process request async. - data_store_->DropTable(req); + ds_ref.data_store_->DropTable(req); } void DataStoreService::DropTable(const std::string_view table_name, @@ -842,16 +859,17 @@ void DataStoreService::DropTable(const std::string_view table_name, if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, &result); + PrepareShardingError(shard_id, &result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); if (shard_status == DSShardStatus::Closed) { @@ -867,13 +885,13 @@ void DataStoreService::DropTable(const std::string_view table_name, return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); DropTableLocalRequest *req = local_drop_table_req_pool_.NextObject(); - req->Reset(this, table_name, result, done); + req->Reset(this, table_name, shard_id, result, done); // Process request async. - data_store_->DropTable(req); + ds_ref.data_store_->DropTable(req); } void DataStoreService::BatchWriteRecords( @@ -882,23 +900,23 @@ void DataStoreService::BatchWriteRecords( ::EloqDS::remote::BatchWriteRecordsResponse *response, ::google::protobuf::Closure *done) { - uint32_t partition_id = request->partition_id(); - uint32_t shard_id = GetShardIdByPartitionId(partition_id); + uint32_t shard_id = request->shard_id(); if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); auto *result = response->mutable_result(); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); auto *result = response->mutable_result(); if (shard_status == DSShardStatus::Closed) @@ -917,18 +935,19 @@ void DataStoreService::BatchWriteRecords( return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); WriteRecordsRpcRequest *batch_write_req = rpc_write_records_request_pool_.NextObject(); batch_write_req->Reset(this, request, response, done); - data_store_->BatchWriteRecords(batch_write_req); + ds_ref.data_store_->BatchWriteRecords(batch_write_req); } void DataStoreService::ScanNext( const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, const std::string_view start_key, const std::string_view end_key, bool inclusive_start, @@ -942,16 +961,15 @@ void DataStoreService::ScanNext( ::EloqDS::remote::CommonResult *result, ::google::protobuf::Closure *done) { - uint32_t shard_id = GetShardIdByPartitionId(partition_id); - if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite && shard_status != DSShardStatus::ReadOnly) { @@ -961,12 +979,13 @@ void DataStoreService::ScanNext( return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); ScanLocalRequest *req = local_scan_request_pool_.NextObject(); req->Reset(this, table_name, partition_id, + shard_id, start_key, end_key, inclusive_start, @@ -980,7 +999,7 @@ void DataStoreService::ScanNext( result, done); - data_store_->ScanNext(req); + ds_ref.data_store_->ScanNext(req); } void DataStoreService::ScanNext(::google::protobuf::RpcController *controller, @@ -988,18 +1007,17 @@ void DataStoreService::ScanNext(::google::protobuf::RpcController *controller, ::EloqDS::remote::ScanResponse *response, ::google::protobuf::Closure *done) { - uint32_t partition_id = request->partition_id(); - uint32_t shard_id = GetShardIdByPartitionId(partition_id); + uint32_t shard_id = request->shard_id(); if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, - response->mutable_result()); + PrepareShardingError(shard_id, response->mutable_result()); return; } - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite && shard_status != DSShardStatus::ReadOnly) { @@ -1010,12 +1028,12 @@ void DataStoreService::ScanNext(::google::protobuf::RpcController *controller, return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); ScanRpcRequest *req = rpc_scan_request_pool_.NextObject(); req->Reset(this, request, response, done); - data_store_->ScanNext(req); + ds_ref.data_store_->ScanNext(req); } void DataStoreService::ScanClose(::google::protobuf::RpcController *controller, @@ -1023,18 +1041,17 @@ void DataStoreService::ScanClose(::google::protobuf::RpcController *controller, ::EloqDS::remote::ScanResponse *response, ::google::protobuf::Closure *done) { - uint32_t partition_id = request->partition_id(); - uint32_t shard_id = GetShardIdByPartitionId(partition_id); + uint32_t shard_id = request->shard_id(); if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, - response->mutable_result()); + PrepareShardingError(shard_id, response->mutable_result()); return; } - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite && shard_status != DSShardStatus::ReadOnly) { @@ -1046,30 +1063,30 @@ void DataStoreService::ScanClose(::google::protobuf::RpcController *controller, return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); ScanRpcRequest *req = rpc_scan_request_pool_.NextObject(); req->Reset(this, request, response, done); - data_store_->ScanClose(req); + ds_ref.data_store_->ScanClose(req); } void DataStoreService::ScanClose(const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, std::string *session_id, remote::CommonResult *result, ::google::protobuf::Closure *done) { - uint32_t shard_id = GetShardIdByPartitionId(partition_id); - if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite && shard_status != DSShardStatus::ReadOnly) { @@ -1080,12 +1097,12 @@ void DataStoreService::ScanClose(const std::string_view table_name, return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); ScanLocalRequest *req = local_scan_request_pool_.NextObject(); - req->Reset(this, table_name, partition_id, session_id, false, result, done); + req->Reset(this, table_name, partition_id, shard_id, session_id, false, result, done); - data_store_->ScanClose(req); + ds_ref.data_store_->ScanClose(req); } void DataStoreService::AppendThisNodeKey(std::stringstream &ss) @@ -1111,37 +1128,57 @@ void DataStoreService::EmplaceScanIter(uint32_t shard_id, std::string &session_id, std::unique_ptr iter) { - assert(shard_id == shard_id_); - scan_iter_cache_.Emplace(session_id, std::move(iter)); + DataShard &ds_ref = data_shards_.at(shard_id); + if (ds_ref.scan_iter_cache_ != nullptr) + { + ds_ref.scan_iter_cache_->Emplace(session_id, std::move(iter)); + } } TTLWrapper *DataStoreService::BorrowScanIter(uint32_t shard_id, const std::string &session_id) { - assert(shard_id == shard_id_); - auto *scan_iter_wrapper = scan_iter_cache_.Borrow(session_id); - return scan_iter_wrapper; + DataShard &ds_ref = data_shards_.at(shard_id); + if (ds_ref.scan_iter_cache_ != nullptr) + { + auto *scan_iter_wrapper = ds_ref.scan_iter_cache_->Borrow(session_id); + return scan_iter_wrapper; + } + return nullptr; } void DataStoreService::ReturnScanIter(uint32_t shard_id, TTLWrapper *iter) { - scan_iter_cache_.Return(iter); + DataShard &ds_ref = data_shards_.at(shard_id); + if (ds_ref.scan_iter_cache_ != nullptr) + { + ds_ref.scan_iter_cache_->Return(iter); + } } void DataStoreService::EraseScanIter(uint32_t shard_id, const std::string &session_id) { - scan_iter_cache_.Erase(session_id); + DataShard &ds_ref = data_shards_.at(shard_id); + if (ds_ref.scan_iter_cache_ != nullptr) + { + ds_ref.scan_iter_cache_->Erase(session_id); + } } void DataStoreService::ForceEraseScanIters(uint32_t shard_id) { - scan_iter_cache_.ForceEraseIters(); + DataShard &ds_ref = data_shards_.at(shard_id); + if (ds_ref.scan_iter_cache_ != nullptr) + { + ds_ref.scan_iter_cache_->ForceEraseIters(); + } } void DataStoreService::BatchWriteRecords( std::string_view table_name, int32_t partition_id, + uint32_t shard_id, const std::vector &key_parts, const std::vector &record_parts, const std::vector &ts, @@ -1153,21 +1190,20 @@ void DataStoreService::BatchWriteRecords( const uint16_t parts_cnt_per_key, const uint16_t parts_cnt_per_record) { - uint32_t shard_id = GetShardIdByPartitionId(partition_id); - if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, &result); + PrepareShardingError(shard_id, &result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); if (shard_status == DSShardStatus::Closed) { @@ -1185,12 +1221,13 @@ void DataStoreService::BatchWriteRecords( return; } - assert(data_store_ != nullptr); + assert(ds_ref.data_store_ != nullptr); WriteRecordsLocalRequest *batch_write_req = local_write_records_request_pool_.NextObject(); batch_write_req->Reset(this, table_name, partition_id, + shard_id, key_parts, record_parts, ts, @@ -1202,7 +1239,7 @@ void DataStoreService::BatchWriteRecords( parts_cnt_per_key, parts_cnt_per_record); - data_store_->BatchWriteRecords(batch_write_req); + ds_ref.data_store_->BatchWriteRecords(batch_write_req); } void DataStoreService::CreateSnapshotForBackup( @@ -1217,16 +1254,17 @@ void DataStoreService::CreateSnapshotForBackup( if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); if (shard_status == DSShardStatus::Closed) { @@ -1248,7 +1286,7 @@ void DataStoreService::CreateSnapshotForBackup( rpc_create_snapshot_req_pool_.NextObject(); req->Reset(this, request, response, done); - data_store_->CreateSnapshotForBackup(req); + ds_ref.data_store_->CreateSnapshotForBackup(req); } void DataStoreService::CreateSnapshotForBackup( @@ -1262,16 +1300,17 @@ void DataStoreService::CreateSnapshotForBackup( if (!IsOwnerOfShard(shard_id)) { brpc::ClosureGuard done_guard(done); - cluster_manager_.PrepareShardingError(shard_id, result); + PrepareShardingError(shard_id, result); return; } - IncreaseWriteReqCount(); + IncreaseWriteReqCount(shard_id); - auto shard_status = shard_status_.load(std::memory_order_acquire); + DataShard &ds_ref = data_shards_.at(shard_id); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); if (shard_status != DSShardStatus::ReadWrite) { - DecreaseWriteReqCount(); + DecreaseWriteReqCount(shard_id); brpc::ClosureGuard done_guard(done); if (shard_status == DSShardStatus::Closed) { @@ -1292,10 +1331,11 @@ void DataStoreService::CreateSnapshotForBackup( CreateSnapshotForBackupLocalRequest *req = local_create_snapshot_req_pool_.NextObject(); - req->Reset(this, backup_name, backup_ts, backup_files, result, done); + req->Reset( + this, shard_id, backup_name, backup_ts, backup_files, result, done); // Process request async - data_store_->CreateSnapshotForBackup(req); + ds_ref.data_store_->CreateSnapshotForBackup(req); } void DataStoreService::SyncFileCache( @@ -1305,23 +1345,28 @@ void DataStoreService::SyncFileCache( ::google::protobuf::Closure *done) { brpc::ClosureGuard done_guard(done); - + // Validate shard ID - if (request->shard_id() != shard_id_) - { - LOG(WARNING) << "Invalid shard ID in SyncFileCache request: " - << request->shard_id() << " (expected " << shard_id_ << ")"; - // Note: Since response is Empty, we can't return error code. - // Errors are logged and RPC completes successfully. - // The primary node can check logs if needed. - return; - } - + // if (request->shard_id() != shard_id_) + // { + // LOG(WARNING) << "Invalid shard ID in SyncFileCache request: " + // << request->shard_id() << " (expected " << shard_id_ << + // ")"; + // // Note: Since response is Empty, we can't return error code. + // // Errors are logged and RPC completes successfully. + // // The primary node can check logs if needed. + // return; + // } + + // TODO(lzx): validate this node is the follower of the shard. + auto &ds_ref = data_shards_.at(request->shard_id()); + auto shard_status = ds_ref.shard_status_.load(std::memory_order_acquire); + // Only process if we're a standby node (closed status) - if (shard_status_.load(std::memory_order_acquire) != DSShardStatus::Closed) + if (shard_status != DSShardStatus::Closed) { - LOG(WARNING) << "SyncFileCache called on non-standby node (status: " - << static_cast(shard_status_.load()) << ")"; + LOG(WARNING) << "SyncFileCache called on non-standby node (status: " + << static_cast(shard_status) << ")"; return; } @@ -1357,7 +1402,11 @@ void DataStoreService::ProcessSyncFileCache(SyncFileCacheLocalRequest *req) const auto *request = req->GetRequest(); - if (shard_status_.load(std::memory_order_acquire) != DSShardStatus::Closed) + uint32_t shard_id = request->shard_id(); + auto &ds_ref = data_shards_.at(shard_id); + + if (ds_ref.shard_status_.load(std::memory_order_acquire) != + DSShardStatus::Closed) { LOG(WARNING) << "Shard status is not closed, skipping file sync"; req->Finish(); @@ -1381,7 +1430,7 @@ void DataStoreService::ProcessSyncFileCache(SyncFileCacheLocalRequest *req) } // Construct full storage path with shard ID: {storage_path}/ds_{shard_id}/db/ - std::string db_path = storage_path + "/ds_" + std::to_string(shard_id_) + "/db/"; + std::string db_path = storage_path + "/ds_" + std::to_string(shard_id) + "/db/"; // Create local db_path if not exists if (!std::filesystem::exists(db_path)) @@ -1413,14 +1462,16 @@ void DataStoreService::ProcessSyncFileCache(SyncFileCacheLocalRequest *req) req->Finish(); return; } - - is_file_sync_running_.store(true, std::memory_order_release); + + ds_ref.is_file_sync_running_.store(true, std::memory_order_release); for (const auto &entry : dir_ite) { - if (shard_status_.load(std::memory_order_acquire) != DSShardStatus::Closed) + if (ds_ref.shard_status_.load(std::memory_order_acquire) != + DSShardStatus::Closed) { LOG(WARNING) << "Shard status is not closed, skipping file sync"; - is_file_sync_running_.store(false, std::memory_order_release); + ds_ref.is_file_sync_running_.store(false, + std::memory_order_release); req->Finish(); return; } @@ -1455,7 +1506,7 @@ void DataStoreService::ProcessSyncFileCache(SyncFileCacheLocalRequest *req) if (downloader == nullptr) { LOG(ERROR) << "Failed to create S3 downloader, skipping downloads"; - is_file_sync_running_.store(false, std::memory_order_release); + ds_ref.is_file_sync_running_.store(false, std::memory_order_release); req->Finish(); return; } @@ -1478,7 +1529,8 @@ void DataStoreService::ProcessSyncFileCache(SyncFileCacheLocalRequest *req) for (const auto &filename : files_to_keep) { - if (shard_status_.load(std::memory_order_acquire) != DSShardStatus::Closed) + if (ds_ref.shard_status_.load(std::memory_order_acquire) != + DSShardStatus::Closed) { LOG(WARNING) << "Shard status is not closed, skipping file sync"; break; @@ -1523,7 +1575,7 @@ void DataStoreService::ProcessSyncFileCache(SyncFileCacheLocalRequest *req) // Finish the RPC (response is Empty, so just call done) req->Finish(); - is_file_sync_running_.store(false, std::memory_order_release); + ds_ref.is_file_sync_running_.store(false, std::memory_order_release); } void DataStoreService::FetchDSSClusterConfig( @@ -2085,51 +2137,77 @@ bool DataStoreService::FetchConfigFromPeer( void DataStoreService::CloseDataStore(uint32_t shard_id) { - if (shard_id_ == UINT32_MAX) - { - DLOG(INFO) << "CloseDataStore no-op for DSS has no shard assigned" - << ", shard " << shard_id - << ", shard_id_: " << shard_id_; - return; - } - assert(shard_id == shard_id_); + auto &ds_ref = data_shards_.at(shard_id); if (!IsOwnerOfShard(shard_id)) { + LOG(INFO) + << "CloseDataStore no-op for DSS shard is not owned by this node, " + << shard_id << ", shard_id_: " << ds_ref.shard_id_ + << ", shard_status_: " << ds_ref.shard_status_.load(); return; } - if (shard_status_.load() == DSShardStatus::ReadWrite) + if (ds_ref.shard_status_.load() == DSShardStatus::ReadWrite) { - SwitchReadWriteToReadOnly(shard_id); + bool res = SwitchReadWriteToReadOnly(shard_id); + if (!res) + { + LOG(ERROR) << "SwitchReadWriteToReadOnly failed for DSS shard " + << shard_id << ", shard_id_: " << ds_ref.shard_id_ + << ", shard_status_: " << ds_ref.shard_status_.load(); + } } - - if (shard_status_.load() == DSShardStatus::ReadOnly) + if (ds_ref.shard_status_.load() == DSShardStatus::ReadOnly) { - SwitchReadOnlyToClosed(shard_id); + bool res = SwitchReadOnlyToClosed(shard_id); + if (!res) + { + LOG(ERROR) << "SwitchReadOnlyToClosed failed for DSS shard " + << shard_id << ", shard_id_: " << ds_ref.shard_id_ + << ", shard_status_: " << ds_ref.shard_status_.load(); + } + else + { + LOG(INFO) << "SwitchReadOnlyToClosed success for DSS shard " + << shard_id << ", shard_id_: " << ds_ref.shard_id_ + << ", shard_status_: " << ds_ref.shard_status_.load(); + } } } void DataStoreService::OpenDataStore(uint32_t shard_id) { - // no-op if this DSS does not own any shard - if (shard_id_ == UINT32_MAX) - { - DLOG(INFO) << "OpenDataStore no-op for non-owner DSS" - << ", shard " << shard_id - << ", shard_id_: " << shard_id_; - return; - } - - assert(shard_id == shard_id_); - - DLOG(INFO) << "OpenDataStore for shard " << shard_id - << ", current status: " << static_cast(shard_status_.load()); - if (shard_status_.load() != DSShardStatus::Closed) + auto start_time = std::chrono::steady_clock::now(); + auto &ds_ref = data_shards_.at(shard_id); + if (ds_ref.shard_status_.load() != DSShardStatus::Closed) { + LOG(INFO) << "OpenDataStore no-op for DSS shard status is not closed, " + << shard_id << ", shard_id_: " << ds_ref.shard_id_ + << ", shard_status_: " + << static_cast(ds_ref.shard_status_.load()); return; } DSShardStatus open_mode = DSShardStatus::ReadWrite; bool create_db_if_missing = false; - ConnectAndStartDataStore(shard_id, open_mode, create_db_if_missing); + auto res = + ConnectAndStartDataStore(shard_id, open_mode, create_db_if_missing); + auto end_time = std::chrono::steady_clock::now(); + auto use_time = std::chrono::duration_cast( + end_time - start_time) + .count(); + if (!res) + { + LOG(ERROR) << "OpenDataStore failed for DSS shard " << shard_id + << ", shard_id_: " << ds_ref.shard_id_ << ", shard_status_: " + << static_cast(ds_ref.shard_status_.load()) + << ", use time: " << use_time << " ms"; + } + else + { + LOG(INFO) << "OpenDataStore success for DSS shard " << shard_id + << ", shard_id_: " << ds_ref.shard_id_ << ", shard_status_: " + << static_cast(ds_ref.shard_status_.load()) + << ", use time: " << use_time << " ms"; + } } std::pair @@ -2514,9 +2592,10 @@ bool DataStoreService::SwitchReadWriteToReadOnly(uint32_t shard_id) return false; } + auto &ds_ref = data_shards_.at(shard_id); DSShardStatus expected = DSShardStatus::ReadWrite; - if (!shard_status_.compare_exchange_strong(expected, - DSShardStatus::ReadOnly) && + if (!ds_ref.shard_status_.compare_exchange_strong( + expected, DSShardStatus::ReadOnly) && expected != DSShardStatus::ReadOnly) { DLOG(ERROR) << "SwitchReadWriteToReadOnly failed, shard status is not " @@ -2525,15 +2604,15 @@ bool DataStoreService::SwitchReadWriteToReadOnly(uint32_t shard_id) } // wait for all write requests to finish - while (ongoing_write_requests_.load(std::memory_order_acquire) > 0) + while (ds_ref.ongoing_write_requests_.load(std::memory_order_acquire) > 0) { bthread_usleep(1000); } - if (shard_status_.load(std::memory_order_acquire) == + if (ds_ref.shard_status_.load(std::memory_order_acquire) == DSShardStatus::ReadOnly) { cluster_manager_.SwitchShardToReadOnly(shard_id, expected); - data_store_->SwitchToReadOnly(); + ds_ref.data_store_->SwitchToReadOnly(); return true; } else @@ -2550,10 +2629,10 @@ bool DataStoreService::SwitchReadOnlyToClosed(uint32_t shard_id) { return false; } - + auto &ds_ref = data_shards_.at(shard_id); DSShardStatus expected = DSShardStatus::ReadOnly; - if (!shard_status_.compare_exchange_strong(expected, - DSShardStatus::Closed) && + if (!ds_ref.shard_status_.compare_exchange_strong(expected, + DSShardStatus::Closed) && expected != DSShardStatus::Closed) { DLOG(ERROR) << "SwitchReadOnlyToClosed failed, shard status is not " @@ -2564,8 +2643,7 @@ bool DataStoreService::SwitchReadOnlyToClosed(uint32_t shard_id) if (expected == DSShardStatus::ReadOnly) { cluster_manager_.SwitchShardToClosed(shard_id, expected); - data_store_->Shutdown(); - data_store_ = nullptr; + ds_ref.data_store_->Shutdown(); } return true; } @@ -2578,10 +2656,10 @@ bool DataStoreService::SwitchReadOnlyToReadWrite(uint32_t shard_id) << " is not owner"; return false; } - + auto &ds_ref = data_shards_.at(shard_id); DSShardStatus expected = DSShardStatus::ReadOnly; - if (!shard_status_.compare_exchange_strong(expected, - DSShardStatus::ReadWrite) && + if (!ds_ref.shard_status_.compare_exchange_strong( + expected, DSShardStatus::ReadWrite) && expected != DSShardStatus::ReadWrite) { DLOG(ERROR) << "SwitchReadOnlyToReadWrite failed, shard status is not " @@ -2589,7 +2667,7 @@ bool DataStoreService::SwitchReadOnlyToReadWrite(uint32_t shard_id) return false; } - data_store_->SwitchToReadWrite(); + ds_ref.data_store_->SwitchToReadWrite(); cluster_manager_.SwitchShardToReadWrite(shard_id, expected); return true; } @@ -2825,80 +2903,86 @@ void DataStoreService::FileCacheSyncWorker(uint32_t interval_sec) } } - // Only sync if we're the primary node - if (shard_status_.load(std::memory_order_acquire) != - DSShardStatus::ReadWrite) - { - continue; - } - - if (data_store_ == nullptr) - { - continue; - } - - // Collect file cache - std::vector<::EloqDS::remote::FileInfo> file_infos; - auto *cloud_store = - dynamic_cast(data_store_.get()); - if (cloud_store == nullptr) + for (uint32_t shard_id = 0; shard_id < data_shards_.size(); ++shard_id) { - continue; // Not a RocksDB Cloud store - } - - if (!cloud_store->CollectCachedSstFiles(file_infos)) - { - LOG(WARNING) << "Failed to collect file cache for sync"; - continue; - } - - // Get standby nodes from cluster manager - uint32_t shard_id = shard_id_; - const auto shard = cluster_manager_.GetShard(shard_id); - const auto &members = shard.nodes_; // Access nodes_ vector directly + auto &ds_ref = data_shards_.at(shard_id); - // Send to each standby node - for (const auto &member : members) - { - if (member == cluster_manager_.GetThisNode()) + // Only sync if we're the primary node + if (ds_ref.shard_status_.load(std::memory_order_acquire) != + DSShardStatus::ReadWrite) { - continue; // Skip self + continue; } - // Get channel to standby node by node (not by shard id) - DSSNode standby_node(member.host_name_, member.port_); - auto channel = - cluster_manager_.GetDataStoreServiceChannel(standby_node); - if (channel == nullptr) + if (ds_ref.data_store_ == nullptr) { - LOG(WARNING) << "Failed to get channel to standby node " - << member.host_name_ << ":" << member.port_; continue; } - // Create RPC stub and send - ::EloqDS::remote::DataStoreRpcService_Stub stub(channel.get()); - ::EloqDS::remote::SyncFileCacheRequest request; - google::protobuf::Empty response; - brpc::Controller cntl; - - request.set_shard_id(shard_id); - for (const auto &file_info : file_infos) + // Collect file cache + std::vector<::EloqDS::remote::FileInfo> file_infos; + auto *cloud_store = + dynamic_cast(ds_ref.data_store_.get()); + if (cloud_store == nullptr) { - *request.add_files() = file_info; + continue; // Not a RocksDB Cloud store } - stub.SyncFileCache(&cntl, &request, &response, nullptr); - - if (cntl.Failed()) + if (!cloud_store->CollectCachedSstFiles(file_infos)) { - LOG(WARNING) << "Failed to sync file cache to standby: " - << cntl.ErrorText(); + LOG(WARNING) << "Failed to collect file cache for sync"; + continue; } - else + + // Get standby nodes from cluster manager + const auto shard = cluster_manager_.GetShard(shard_id); + const auto &members = + shard.nodes_; // Access nodes_ vector directly + + // Send to each standby node + for (const auto &member : members) { - DLOG(INFO) << "Synced " << file_infos.size() - << " files to standby node " << member.host_name_; + if (member == cluster_manager_.GetThisNode()) + { + continue; // Skip self + } + + // Get channel to standby node by node (not by shard id) + DSSNode standby_node(member.host_name_, member.port_); + auto channel = + cluster_manager_.GetDataStoreServiceChannel(standby_node); + if (channel == nullptr) + { + LOG(WARNING) << "Failed to get channel to standby node " + << member.host_name_ << ":" << member.port_; + continue; + } + + // Create RPC stub and send + ::EloqDS::remote::DataStoreRpcService_Stub stub(channel.get()); + ::EloqDS::remote::SyncFileCacheRequest request; + google::protobuf::Empty response; + brpc::Controller cntl; + + request.set_shard_id(shard_id); + for (const auto &file_info : file_infos) + { + *request.add_files() = file_info; + } + + stub.SyncFileCache(&cntl, &request, &response, nullptr); + + if (cntl.Failed()) + { + LOG(WARNING) << "Failed to sync file cache to standby: " + << cntl.ErrorText(); + } + else + { + DLOG(INFO) + << "Synced " << file_infos.size() + << " files to standby node " << member.host_name_; + } } } } diff --git a/eloq_data_store_service/data_store_service.h b/eloq_data_store_service/data_store_service.h index 6e51c2f..a5ed8fa 100644 --- a/eloq_data_store_service/data_store_service.h +++ b/eloq_data_store_service/data_store_service.h @@ -195,9 +195,7 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService ~DataStoreService(); - bool StartService(bool create_db_if_missing, - uint32_t dss_leader_node_id, - uint32_t dss_node_id); + bool StartService(bool create_db_if_missing); brpc::Server *GetBrpcServer() { @@ -220,6 +218,7 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService * @brief Point read operation * @param table_name Table name * @param partition_id Partition id + * @param shard_id Shard id * @param key Key * @param record Record (output) * @param ts Timestamp (output) @@ -227,8 +226,9 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService * @param done Callback function */ void Read(const std::string_view table_name, - const uint32_t partition_id, - const std::vector &key, + int32_t partition_id, + uint32_t shard_id, + const std::string_view key, std::string *record, uint64_t *ts, uint64_t *ttl, @@ -252,6 +252,7 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService * @brief Batch write operation * @param table_name Table name * @param partition_id Partition id + * @param shard_id Shard id * @param keys Keys * @param records Records * @param ts Timestamps @@ -262,6 +263,7 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService */ void BatchWriteRecords(std::string_view table_name, int32_t partition_id, + uint32_t shard_id, const std::vector &key_parts, const std::vector &record_parts, const std::vector &ts, @@ -315,6 +317,7 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService * @brief Delete range of data operation * @param table_name Table name * @param partition_id Partition id + * @param shard_id Shard id * @param start_key Start key, * if empty, delete from the beginning of the table * @param end_key End key @@ -323,7 +326,8 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService * @param done Callback function */ void DeleteRange(const std::string_view table_name, - const uint32_t partition_id, + const int32_t partition_id, + uint32_t shard_id, const std::string_view start_key, const std::string_view end_key, const bool skip_wal, @@ -392,6 +396,7 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService * @brief Scan next operation * @param table_name Table name * @param partition_id Partition id + * @param shard_id Shard id * @param start_key Start key * @param end_key End key * @param inclusive_start Inclusive start @@ -404,7 +409,8 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService * @param done Callback function */ void ScanNext(const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, const std::string_view start_key, const std::string_view end_key, bool inclusive_start, @@ -434,12 +440,14 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService * @brief Scan close operation * @param table_name Table name * @param partition_id Partition id + * @param shard_id Shard id * @param session_id Session ID * @param result Result (output) * @param done Callback function */ void ScanClose(const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, std::string *session_id, ::EloqDS::remote::CommonResult *result, ::google::protobuf::Closure *done); @@ -535,11 +543,9 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService * @brief Preapre sharding error * Fill the error code and the topology change in the result */ - void PrepareShardingError(uint32_t partition_id, + void PrepareShardingError(uint32_t shard_id, ::EloqDS::remote::CommonResult *result) { - uint32_t shard_id = - cluster_manager_.GetShardIdByPartitionId(partition_id); cluster_manager_.PrepareShardingError(shard_id, result); } @@ -596,9 +602,10 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService // ======================================================================= DSShardStatus FetchDSShardStatus(uint32_t shard_id) { - if (shard_id_ == shard_id) + if (data_shards_.at(shard_id).shard_id_ == shard_id) { - return shard_status_; + return data_shards_.at(shard_id).shard_status_.load( + std::memory_order_acquire); } return DSShardStatus::Closed; } @@ -614,21 +621,23 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService return data_store_factory_.get(); } - void IncreaseWriteReqCount() + void IncreaseWriteReqCount(uint32_t shard_id) { - ongoing_write_requests_.fetch_add(1, std::memory_order_release); + data_shards_.at(shard_id).ongoing_write_requests_.fetch_add( + 1, std::memory_order_release); } - void DecreaseWriteReqCount() + void DecreaseWriteReqCount(uint32_t shard_id) { - ongoing_write_requests_.fetch_sub(1, std::memory_order_release); + data_shards_.at(shard_id).ongoing_write_requests_.fetch_sub( + 1, std::memory_order_release); } bool IsOwnerOfShard(uint32_t shard_id) const { - return shard_status_.load(std::memory_order_acquire) != - DSShardStatus::Closed && - shard_id_ == shard_id; + const auto &ds_ref = data_shards_.at(shard_id); + return ds_ref.shard_status_.load(std::memory_order_acquire) != + DSShardStatus::Closed; } void CloseDataStore(uint32_t shard_id); @@ -640,18 +649,11 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService } private: - uint32_t GetShardIdByPartitionId(int32_t partition_id) - { - // Now only support single data shard - return 0; - // return cluster_manager_.GetShardIdByPartitionId(partition_id); - } - DataStore *GetDataStore(uint32_t shard_id) { - if (shard_id_ == shard_id) + if (data_shards_.at(shard_id).shard_id_ == shard_id) { - return data_store_.get(); + return data_shards_.at(shard_id).data_store_.get(); } else { @@ -681,32 +683,49 @@ class DataStoreService : EloqDS::remote::DataStoreRpcService uint32_t &migration_status, uint64_t &shard_next_version); - // std::shared_mutex serv_mux_; - int32_t service_port_; std::unique_ptr server_; DataStoreServiceClusterManager cluster_manager_; std::string config_file_path_; std::string migration_log_path_; - // Now, there is only one data store shard in a DataStoreService. - // To avoid using mutex in read or write APIs, use a atomic variable - // (shard_status_) to control concurrency conflicts. - // - During migraion, we change the shard_status_ firstly, then change the - // data_store_ after all read/write requests are finished. - // - In write functions, we increase the ongoing_write_requests_ firstly and - // then check the shard_status_. After the request is executed or if - // shard_status_ is not required, decrease them. - uint32_t shard_id_{UINT32_MAX}; - std::unique_ptr data_store_{nullptr}; - std::atomic shard_status_{DSShardStatus::Closed}; - std::atomic ongoing_write_requests_{0}; - // Whether the file cache sync is running. Used to avoid concurrent local ssd file operations - // between db and file sync worker. - std::atomic is_file_sync_running_{false}; + /** + * @brief Per-shard data structure encapsulating all shard-specific state. + * Each DataShard manages its own data store, status, and scan cache. + * Thread-safety: shard_status_ and ongoing_write_requests_ are atomic. + * data_store_ and scan_iter_cache_ access is protected by shard_status_ + * state machine. + */ + struct DataShard + { + void ShutDown() + { + if (data_store_ != nullptr) + { + data_store_->Shutdown(); + // Don't set data_store_ to nullptr here, as it may be used + // in read operations because we don't use read counter. + } + + if (scan_iter_cache_ != nullptr) + { + scan_iter_cache_->Clear(); + scan_iter_cache_ = nullptr; + } + } - // scan iterator cache - TTLWrapperCache scan_iter_cache_; + uint32_t shard_id_{UINT32_MAX}; + std::unique_ptr data_store_{nullptr}; + std::atomic shard_status_{DSShardStatus::Closed}; + std::atomic ongoing_write_requests_{0}; + std::unique_ptr scan_iter_cache_{nullptr}; + + // Whether the file cache sync is running. Used to avoid concurrent + // local ssd file operations between db and file sync worker. + std::atomic is_file_sync_running_{false}; + }; + + std::array data_shards_; std::unique_ptr data_store_factory_; diff --git a/eloq_data_store_service/data_store_service_util.h b/eloq_data_store_service/data_store_service_util.h index cd1d6d7..66ef76c 100644 --- a/eloq_data_store_service/data_store_service_util.h +++ b/eloq_data_store_service/data_store_service_util.h @@ -21,7 +21,12 @@ */ #pragma once +#include +#include +#include +#include #include +#include #include #include diff --git a/eloq_data_store_service/ds_request.proto b/eloq_data_store_service/ds_request.proto index 1ea3875..f9cdc8b 100644 --- a/eloq_data_store_service/ds_request.proto +++ b/eloq_data_store_service/ds_request.proto @@ -93,9 +93,10 @@ message CommonResult { } message ReadRequest { - uint32 partition_id = 1; + int32 partition_id = 1; string kv_table_name = 2; - repeated bytes key_str = 3; + bytes key_str = 3; + uint32 shard_id = 4; } message ReadResponse { @@ -142,8 +143,9 @@ message BatchWriteRecordsRequest { string kv_table_name = 1; int32 partition_id = 2; - bool skip_wal = 3; - repeated Item items = 4; + uint32 shard_id = 3; + bool skip_wal = 4; + repeated Item items = 5; } message BatchWriteRecordsResponse { @@ -165,6 +167,7 @@ message DeleteRangeRequest { bytes start_key = 3; bytes end_key = 4; bool skip_wal = 5; + uint32 shard_id = 6; } message DeleteRangeResponse { @@ -303,7 +306,7 @@ message SearchCondition { message ScanRequest { string session_id = 1; string kv_table_name_str = 2; - uint32 partition_id = 3; + int32 partition_id = 3; // empty start_key will be treated as negitive infinity or // positive infinity depends on scan_forward bytes start_key = 4; @@ -316,6 +319,7 @@ message ScanRequest { uint32 batch_size = 9; // Number of items to return in scan repeated SearchCondition search_conditions = 10; bool generate_session_id = 11; + uint32 shard_id = 12; } // Response message for scan operations diff --git a/eloq_data_store_service/internal_request.h b/eloq_data_store_service/internal_request.h index 564aa96..1e9e636 100644 --- a/eloq_data_store_service/internal_request.h +++ b/eloq_data_store_service/internal_request.h @@ -50,6 +50,8 @@ class WriteRecordsRequest : public Poolable virtual int32_t GetPartitionId() const = 0; + virtual uint32_t GetShardId() const = 0; + virtual const std::string_view GetRecordPart(size_t index) const = 0; virtual uint16_t PartsCountPerRecord() const = 0; @@ -116,6 +118,11 @@ class WriteRecordsRpcRequest : public WriteRecordsRequest return req_->partition_id(); } + uint32_t GetShardId() const override + { + return req_->shard_id(); + } + const std::string_view GetRecordPart(size_t index) const override { return req_->items(index).value(); @@ -157,7 +164,7 @@ class WriteRecordsRpcRequest : public WriteRecordsRequest void SetFinish(const remote::CommonResult &result) override { - data_store_service_->DecreaseWriteReqCount(); + data_store_service_->DecreaseWriteReqCount(GetShardId()); brpc::ClosureGuard done_guard(done_); // Set error code and error message resp_->mutable_result()->set_error_code(result.error_code()); @@ -183,6 +190,7 @@ class WriteRecordsLocalRequest : public WriteRecordsRequest { table_name_ = ""; partition_id_ = 0; + shard_id_ = UINT32_MAX; key_parts_ = nullptr; record_parts_ = nullptr; ts_ = nullptr; @@ -198,6 +206,7 @@ class WriteRecordsLocalRequest : public WriteRecordsRequest void Reset(DataStoreService *ds_service, std::string_view table_name, int32_t partition_id, + uint32_t shard_id, const std::vector &key_parts, const std::vector &record_parts, const std::vector &ts, @@ -212,6 +221,7 @@ class WriteRecordsLocalRequest : public WriteRecordsRequest data_store_service_ = ds_service; table_name_ = table_name; partition_id_ = partition_id; + shard_id_ = shard_id; key_parts_ = &key_parts; record_parts_ = &record_parts; ts_ = &ts; @@ -255,6 +265,11 @@ class WriteRecordsLocalRequest : public WriteRecordsRequest return partition_id_; } + uint32_t GetShardId() const override + { + return shard_id_; + } + const std::string_view GetRecordPart(size_t index) const override { return record_parts_->at(index); @@ -287,7 +302,7 @@ class WriteRecordsLocalRequest : public WriteRecordsRequest void SetFinish(const remote::CommonResult &result) override { - data_store_service_->DecreaseWriteReqCount(); + data_store_service_->DecreaseWriteReqCount(shard_id_); brpc::ClosureGuard done_guard(done_); // Set error code and error message result_->set_error_code(result.error_code()); @@ -298,6 +313,7 @@ class WriteRecordsLocalRequest : public WriteRecordsRequest DataStoreService *data_store_service_{nullptr}; std::string_view table_name_; int32_t partition_id_; + uint32_t shard_id_{UINT32_MAX}; const std::vector *key_parts_{nullptr}; const std::vector *record_parts_{nullptr}; const std::vector *ts_{nullptr}; @@ -322,6 +338,8 @@ class FlushDataRequest : public Poolable // parameters in virtual const std::vector &GetKvTableNames() const = 0; + virtual uint32_t GetShardId() const = 0; + // finish virtual void SetFinish(const remote::CommonResult &result) = 0; }; @@ -362,9 +380,14 @@ class FlushDataRpcRequest : public FlushDataRequest return kv_table_names_; } + uint32_t GetShardId() const override + { + return req_->shard_id(); + } + void SetFinish(const remote::CommonResult &result) override { - data_store_service_->DecreaseWriteReqCount(); + data_store_service_->DecreaseWriteReqCount(req_->shard_id()); brpc::ClosureGuard done_guard(done_); ::EloqDS::remote::CommonResult *res = resp_->mutable_result(); @@ -401,17 +424,20 @@ class FlushDataLocalRequest : public FlushDataRequest void Clear() override { kv_table_names_ = nullptr; + shard_id_ = UINT32_MAX; result_ = nullptr; done_ = nullptr; } void Reset(DataStoreService *ds_service, const std::vector *kv_table_names, + uint32_t shard_id, remote::CommonResult &result, google::protobuf::Closure *done) { data_store_service_ = ds_service; kv_table_names_ = kv_table_names; + shard_id_ = shard_id; result_ = &result; done_ = done; } @@ -421,9 +447,14 @@ class FlushDataLocalRequest : public FlushDataRequest return *kv_table_names_; } + uint32_t GetShardId() const override + { + return shard_id_; + } + void SetFinish(const remote::CommonResult &result) override { - data_store_service_->DecreaseWriteReqCount(); + data_store_service_->DecreaseWriteReqCount(shard_id_); brpc::ClosureGuard done_guard(done_); result_->set_error_code(result.error_code()); result_->set_error_msg(result.error_msg()); @@ -432,6 +463,7 @@ class FlushDataLocalRequest : public FlushDataRequest private: DataStoreService *data_store_service_{nullptr}; const std::vector *kv_table_names_{nullptr}; + uint32_t shard_id_{UINT32_MAX}; remote::CommonResult *result_{nullptr}; google::protobuf::Closure *done_{nullptr}; }; @@ -447,7 +479,8 @@ class DeleteRangeRequest : public Poolable // parameters in virtual const std::string_view GetTableName() const = 0; - virtual uint32_t GetPartitionId() const = 0; + virtual int32_t GetPartitionId() const = 0; + virtual uint32_t GetShardId() const = 0; virtual const std::string_view GetStartKey() const = 0; virtual const std::string_view GetEndKey() const = 0; virtual bool SkipWal() const = 0; @@ -487,11 +520,16 @@ class DeleteRangeRpcRequest : public DeleteRangeRequest return req_->kv_table_name(); } - uint32_t GetPartitionId() const override + int32_t GetPartitionId() const override { return req_->partition_id(); } + uint32_t GetShardId() const override + { + return req_->shard_id(); + } + const std::string_view GetStartKey() const override { return req_->start_key(); @@ -509,7 +547,7 @@ class DeleteRangeRpcRequest : public DeleteRangeRequest void SetFinish(const remote::CommonResult &result) override { - data_store_service_->DecreaseWriteReqCount(); + data_store_service_->DecreaseWriteReqCount(req_->shard_id()); brpc::ClosureGuard done_guard(done_); ::EloqDS::remote::CommonResult *res = resp_->mutable_result(); @@ -536,6 +574,7 @@ class DeleteRangeLocalRequest : public DeleteRangeRequest { table_name_ = ""; partition_id_ = 0; + shard_id_ = UINT32_MAX; start_key_ = ""; end_key_ = ""; skip_wal_ = false; @@ -545,7 +584,8 @@ class DeleteRangeLocalRequest : public DeleteRangeRequest void Reset(DataStoreService *ds_service, const std::string_view table_name, - const uint32_t partition_id, + const int32_t partition_id, + const uint32_t shard_id, const std::string_view start_key, const std::string_view end_key, const bool skip_wal, @@ -555,6 +595,7 @@ class DeleteRangeLocalRequest : public DeleteRangeRequest data_store_service_ = ds_service; table_name_ = table_name; partition_id_ = partition_id; + shard_id_ = shard_id; start_key_ = start_key; end_key_ = end_key; skip_wal_ = skip_wal; @@ -567,11 +608,16 @@ class DeleteRangeLocalRequest : public DeleteRangeRequest return table_name_; } - uint32_t GetPartitionId() const override + int32_t GetPartitionId() const override { return partition_id_; } + uint32_t GetShardId() const override + { + return shard_id_; + } + const std::string_view GetStartKey() const override { return start_key_; @@ -589,7 +635,7 @@ class DeleteRangeLocalRequest : public DeleteRangeRequest void SetFinish(const remote::CommonResult &result) override { - data_store_service_->DecreaseWriteReqCount(); + data_store_service_->DecreaseWriteReqCount(shard_id_); brpc::ClosureGuard done_guard(done_); result_->set_error_code(result.error_code()); result_->set_error_msg(result.error_msg()); @@ -598,7 +644,8 @@ class DeleteRangeLocalRequest : public DeleteRangeRequest private: DataStoreService *data_store_service_{nullptr}; std::string_view table_name_{""}; - uint32_t partition_id_{0}; + int32_t partition_id_{0}; + uint32_t shard_id_{UINT32_MAX}; std::string_view start_key_{""}; std::string_view end_key_{""}; bool skip_wal_{false}; @@ -625,11 +672,11 @@ class ReadRequest : public Poolable // paramters in virtual const std::string_view GetTableName() const = 0; - virtual std::string_view GetKey(size_t index) const = 0; + virtual const std::string_view GetKey() const = 0; - virtual size_t PartsCountPerKey() const = 0; + virtual int32_t GetPartitionId() const = 0; - virtual uint32_t GetPartitionId() const = 0; + virtual uint32_t GetShardId() const = 0; // parameters out virtual void SetRecord(std::string &&record) = 0; @@ -674,19 +721,19 @@ class ReadRpcRequest : public ReadRequest return req_->kv_table_name(); } - std::string_view GetKey(size_t index) const override + const std::string_view GetKey() const override { - return req_->key_str(index); + return req_->key_str(); } - size_t PartsCountPerKey() const override + int32_t GetPartitionId() const override { - return req_->key_str_size(); + return req_->partition_id(); } - uint32_t GetPartitionId() const override + uint32_t GetShardId() const override { - return req_->partition_id(); + return req_->shard_id(); } void SetRecord(std::string &&record) override @@ -707,13 +754,6 @@ class ReadRpcRequest : public ReadRequest void SetFinish(const ::EloqDS::remote::DataStoreError error_code) override { brpc::ClosureGuard done_guard(done_); - if (error_code == remote::DataStoreError::REQUESTED_NODE_NOT_OWNER) - { - ds_service_->PrepareShardingError(req_->partition_id(), - resp_->mutable_result()); - return; - } - resp_->mutable_result()->set_error_code(error_code); } @@ -733,8 +773,9 @@ class ReadLocalRequest : public ReadRequest void Reset(DataStoreService *ds_service, const std::string_view table_name, - const uint32_t partition_id, - const std::vector *key_parts, + const int32_t partition_id, + const uint32_t shard_id, + const std::string_view key, std::string *record, uint64_t *record_ts, uint64_t *record_ttl, @@ -743,8 +784,9 @@ class ReadLocalRequest : public ReadRequest { ds_service_ = ds_service; table_name_ = table_name; - key_parts_ = key_parts; + key_ = key; partition_id_ = partition_id; + shard_id_ = shard_id; record_ = record; record_ts_ = record_ts; record_ttl_ = record_ttl; @@ -756,8 +798,9 @@ class ReadLocalRequest : public ReadRequest { ds_service_ = nullptr; table_name_ = ""; - key_parts_ = nullptr; + key_ = ""; partition_id_ = 0; + shard_id_ = UINT32_MAX; record_ = nullptr; record_ts_ = nullptr; record_ttl_ = nullptr; @@ -770,19 +813,19 @@ class ReadLocalRequest : public ReadRequest return table_name_; } - std::string_view GetKey(size_t index) const override + const std::string_view GetKey() const override { - return (*key_parts_)[index]; + return key_; } - size_t PartsCountPerKey() const override + int32_t GetPartitionId() const override { - return key_parts_->size(); + return partition_id_; } - uint32_t GetPartitionId() const override + uint32_t GetShardId() const override { - return partition_id_; + return shard_id_; } void SetRecord(std::string &&record) override @@ -803,20 +846,15 @@ class ReadLocalRequest : public ReadRequest void SetFinish(const ::EloqDS::remote::DataStoreError error_code) override { brpc::ClosureGuard done_guard(done_); - if (error_code == remote::DataStoreError::REQUESTED_NODE_NOT_OWNER) - { - ds_service_->PrepareShardingError(partition_id_, result_); - return; - } - result_->set_error_code(error_code); } private: DataStoreService *ds_service_{nullptr}; std::string_view table_name_{""}; - const std::vector *key_parts_{nullptr}; - uint32_t partition_id_{0}; + std::string_view key_{""}; + int32_t partition_id_{0}; + uint32_t shard_id_{UINT32_MAX}; std::string *record_{nullptr}; uint64_t *record_ts_{nullptr}; uint64_t *record_ttl_{nullptr}; @@ -836,6 +874,8 @@ class CreateTableRequest : public Poolable // parameters in virtual const std::string_view GetTableName() const = 0; + virtual uint32_t GetShardId() const = 0; + // finish virtual void SetFinish(const remote::CommonResult &result) = 0; }; @@ -871,9 +911,14 @@ class CreateTableRpcRequest : public CreateTableRequest return req_->kv_table_name(); } + uint32_t GetShardId() const override + { + return req_->shard_id(); + } + void SetFinish(const remote::CommonResult &result) override { - ds_service_->DecreaseWriteReqCount(); + ds_service_->DecreaseWriteReqCount(req_->shard_id()); brpc::ClosureGuard done_guard(done_); ::EloqDS::remote::CommonResult *res = resp_->mutable_result(); @@ -909,17 +954,20 @@ class CreateTableLocalRequest : public CreateTableRequest void Clear() override { table_name_ = ""; + shard_id_ = UINT32_MAX; result_ = nullptr; done_ = nullptr; } void Reset(DataStoreService *ds_service, const std::string_view table_name, + uint32_t shard_id, remote::CommonResult &result, google::protobuf::Closure *done) { ds_service_ = ds_service; table_name_ = table_name; + shard_id_ = shard_id; result_ = &result; done_ = done; } @@ -929,9 +977,14 @@ class CreateTableLocalRequest : public CreateTableRequest return table_name_; } + uint32_t GetShardId() const override + { + return shard_id_; + } + void SetFinish(const remote::CommonResult &result) override { - ds_service_->DecreaseWriteReqCount(); + ds_service_->DecreaseWriteReqCount(shard_id_); brpc::ClosureGuard done_guard(done_); result_->set_error_code(result.error_code()); result_->set_error_msg(result.error_msg()); @@ -940,6 +993,7 @@ class CreateTableLocalRequest : public CreateTableRequest private: DataStoreService *ds_service_{nullptr}; std::string_view table_name_{""}; + uint32_t shard_id_{UINT32_MAX}; remote::CommonResult *result_{nullptr}; google::protobuf::Closure *done_{nullptr}; }; @@ -956,6 +1010,8 @@ class DropTableRequest : public Poolable // parameters in virtual const std::string_view GetTableName() const = 0; + virtual uint32_t GetShardId() const = 0; + // finish virtual void SetFinish(const remote::CommonResult &result) = 0; }; @@ -990,9 +1046,14 @@ class DropTableRpcRequest : public DropTableRequest return req_->kv_table_name(); } + uint32_t GetShardId() const override + { + return req_->shard_id(); + } + void SetFinish(const remote::CommonResult &result) override { - ds_service_->DecreaseWriteReqCount(); + ds_service_->DecreaseWriteReqCount(req_->shard_id()); brpc::ClosureGuard done_guard(done_); ::EloqDS::remote::CommonResult *res = resp_->mutable_result(); @@ -1028,17 +1089,20 @@ class DropTableLocalRequest : public DropTableRequest void Clear() override { table_name_ = ""; + shard_id_ = UINT32_MAX; result_ = nullptr; done_ = nullptr; } void Reset(DataStoreService *ds_service, const std::string_view table_name, + uint32_t shard_id, remote::CommonResult &result, google::protobuf::Closure *done) { ds_service_ = ds_service; table_name_ = table_name; + shard_id_ = shard_id; result_ = &result; done_ = done; } @@ -1048,9 +1112,14 @@ class DropTableLocalRequest : public DropTableRequest return table_name_; } + uint32_t GetShardId() const override + { + return shard_id_; + } + void SetFinish(const remote::CommonResult &result) override { - ds_service_->DecreaseWriteReqCount(); + ds_service_->DecreaseWriteReqCount(shard_id_); brpc::ClosureGuard done_guard(done_); result_->set_error_code(result.error_code()); result_->set_error_msg(result.error_msg()); @@ -1059,6 +1128,7 @@ class DropTableLocalRequest : public DropTableRequest private: DataStoreService *ds_service_{nullptr}; std::string_view table_name_{""}; + uint32_t shard_id_{UINT32_MAX}; remote::CommonResult *result_{nullptr}; google::protobuf::Closure *done_{nullptr}; }; @@ -1075,7 +1145,9 @@ class ScanRequest : public Poolable // parameters in virtual const std::string_view GetTableName() const = 0; - virtual uint32_t GetPartitionId() const = 0; + virtual int32_t GetPartitionId() const = 0; + + virtual uint32_t GetShardId() const = 0; virtual const std::string_view GetStartKey() const = 0; @@ -1147,11 +1219,16 @@ class ScanRpcRequest : public ScanRequest return req_->kv_table_name_str(); } - uint32_t GetPartitionId() const override + int32_t GetPartitionId() const override { return req_->partition_id(); } + uint32_t GetShardId() const override + { + return req_->shard_id(); + } + const std::string_view GetStartKey() const override { return req_->start_key(); @@ -1232,12 +1309,6 @@ class ScanRpcRequest : public ScanRequest const std::string error_message) override { brpc::ClosureGuard done_guard(done_); - if (error_code == remote::DataStoreError::REQUESTED_NODE_NOT_OWNER) - { - ds_service_->PrepareShardingError(req_->partition_id(), - resp_->mutable_result()); - return; - } ::EloqDS::remote::CommonResult *result = resp_->mutable_result(); result->set_error_code(error_code); result->set_error_msg(error_message); @@ -1260,7 +1331,8 @@ class ScanLocalRequest : public ScanRequest void Reset(DataStoreService *ds_service, const std::string_view table_name, - uint32_t partition_id, + int32_t partition_id, + uint32_t shard_id, const std::string_view start_key, const std::string_view end_key, bool inclusive_start, @@ -1277,6 +1349,7 @@ class ScanLocalRequest : public ScanRequest ds_service_ = ds_service; table_name_ = table_name; partition_id_ = partition_id; + shard_id_ = shard_id; start_key_ = start_key; end_key_ = end_key; inclusive_start_ = inclusive_start; @@ -1293,7 +1366,8 @@ class ScanLocalRequest : public ScanRequest void Reset(DataStoreService *ds_service, const std::string_view table_name, - const uint32_t partition_id, + const int32_t partition_id, + const uint32_t shard_id, std::string *session_id, bool generate_session_id, ::EloqDS::remote::CommonResult *result, @@ -1302,6 +1376,7 @@ class ScanLocalRequest : public ScanRequest ds_service_ = ds_service; table_name_ = table_name; partition_id_ = partition_id; + shard_id_ = shard_id; session_id_ = session_id; generate_session_id_ = generate_session_id; result_ = result; @@ -1313,6 +1388,7 @@ class ScanLocalRequest : public ScanRequest ds_service_ = nullptr; table_name_ = ""; partition_id_ = 0; + shard_id_ = UINT32_MAX; start_key_ = ""; end_key_ = ""; inclusive_start_ = false; @@ -1332,11 +1408,16 @@ class ScanLocalRequest : public ScanRequest return table_name_; } - uint32_t GetPartitionId() const override + int32_t GetPartitionId() const override { return partition_id_; } + uint32_t GetShardId() const override + { + return shard_id_; + } + const std::string_view GetStartKey() const override { return start_key_; @@ -1413,11 +1494,6 @@ class ScanLocalRequest : public ScanRequest const std::string error_message) override { brpc::ClosureGuard done_guard(done_); - if (error_code == remote::DataStoreError::REQUESTED_NODE_NOT_OWNER) - { - ds_service_->PrepareShardingError(partition_id_, result_); - return; - } result_->set_error_code(error_code); result_->set_error_msg(error_message); } @@ -1425,7 +1501,8 @@ class ScanLocalRequest : public ScanRequest private: DataStoreService *ds_service_{nullptr}; std::string_view table_name_{""}; - uint32_t partition_id_{0}; + int32_t partition_id_{0}; + uint32_t shard_id_{UINT32_MAX}; std::string_view start_key_{""}; std::string_view end_key_{""}; bool inclusive_start_{false}; @@ -1451,6 +1528,8 @@ class CreateSnapshotForBackupRequest : public Poolable virtual ~CreateSnapshotForBackupRequest() = default; + virtual uint32_t GetShardId() const = 0; + virtual std::string_view GetBackupName() const = 0; virtual uint64_t GetBackupTs() const = 0; virtual void AddBackupFile(const std::string &file) = 0; @@ -1488,6 +1567,11 @@ class CreateSnapshotForBackupRpcRequest : public CreateSnapshotForBackupRequest done_ = nullptr; } + uint32_t GetShardId() const override + { + return req_->shard_id(); + } + std::string_view GetBackupName() const override { return req_->backup_name(); @@ -1506,7 +1590,7 @@ class CreateSnapshotForBackupRpcRequest : public CreateSnapshotForBackupRequest void SetFinish(const ::EloqDS::remote::DataStoreError error_code, const std::string error_message) override { - ds_service_->DecreaseWriteReqCount(); + ds_service_->DecreaseWriteReqCount(req_->shard_id()); brpc::ClosureGuard done_guard(done_); ::EloqDS::remote::CommonResult *result = resp_->mutable_result(); result->set_error_code(error_code); @@ -1531,6 +1615,7 @@ class CreateSnapshotForBackupLocalRequest const CreateSnapshotForBackupLocalRequest &other) = delete; void Reset(DataStoreService *ds_service, + uint32_t shard_id, std::string_view backup_name, const uint64_t backup_ts, std::vector *backup_files, @@ -1538,6 +1623,7 @@ class CreateSnapshotForBackupLocalRequest google::protobuf::Closure *done) { ds_service_ = ds_service; + shard_id_ = shard_id; backup_name_ = backup_name; backup_files_ = backup_files; backup_ts_ = backup_ts; @@ -1548,6 +1634,7 @@ class CreateSnapshotForBackupLocalRequest void Clear() override { ds_service_ = nullptr; + shard_id_ = UINT32_MAX; backup_name_ = ""; backup_files_ = nullptr; backup_ts_ = 0; @@ -1555,6 +1642,11 @@ class CreateSnapshotForBackupLocalRequest done_ = nullptr; } + uint32_t GetShardId() const override + { + return shard_id_; + } + std::string_view GetBackupName() const override { return backup_name_; @@ -1573,7 +1665,7 @@ class CreateSnapshotForBackupLocalRequest void SetFinish(const ::EloqDS::remote::DataStoreError error_code, const std::string error_message) override { - ds_service_->DecreaseWriteReqCount(); + ds_service_->DecreaseWriteReqCount(shard_id_); brpc::ClosureGuard done_guard(done_); result_->set_error_code(error_code); result_->set_error_msg(error_message); @@ -1581,7 +1673,8 @@ class CreateSnapshotForBackupLocalRequest private: DataStoreService *ds_service_{nullptr}; - EloqDS::remote::CommonResult *result_{nullptr}; + uint32_t shard_id_{UINT32_MAX}; + ::EloqDS::remote::CommonResult *result_{nullptr}; std::string_view backup_name_{""}; std::vector *backup_files_{nullptr}; uint64_t backup_ts_{0}; diff --git a/eloq_data_store_service/main.cpp b/eloq_data_store_service/main.cpp index fa222ca..7c6c59d 100644 --- a/eloq_data_store_service/main.cpp +++ b/eloq_data_store_service/main.cpp @@ -318,8 +318,8 @@ int main(int argc, char *argv[]) std::move(ds_factory)); // setup local data store service - bool ret = data_store_service_->StartService( - FLAGS_bootstrap || is_single_node, 0, 0); + bool ret = + data_store_service_->StartService(FLAGS_bootstrap || is_single_node); if (!ret) { LOG(ERROR) << "Failed to start data store service"; diff --git a/eloq_data_store_service/rocksdb_cloud_data_store.cpp b/eloq_data_store_service/rocksdb_cloud_data_store.cpp index 34d452a..8748bec 100644 --- a/eloq_data_store_service/rocksdb_cloud_data_store.cpp +++ b/eloq_data_store_service/rocksdb_cloud_data_store.cpp @@ -158,8 +158,7 @@ RocksDBCloudDataStore::RocksDBCloudDataStore( RocksDBCloudDataStore::~RocksDBCloudDataStore() { - if (query_worker_pool_ != nullptr || data_store_service_ != nullptr || - db_ != nullptr) + if (query_worker_pool_ != nullptr || db_ != nullptr) { Shutdown(); } @@ -167,15 +166,9 @@ RocksDBCloudDataStore::~RocksDBCloudDataStore() void RocksDBCloudDataStore::Shutdown() { - std::unique_lock db_lk(db_mux_); - - // shutdown query worker pool - query_worker_pool_->Shutdown(); - query_worker_pool_ = nullptr; - - data_store_service_->ForceEraseScanIters(shard_id_); - data_store_service_ = nullptr; + RocksDBDataStoreCommon::Shutdown(); + std::unique_lock db_lk(db_mux_); if (db_ != nullptr) { DLOG(INFO) << "RocksDBCloudDataStore Shutdown, db->Close()"; @@ -361,6 +354,7 @@ bool RocksDBCloudDataStore::StartDB() #endif DLOG(INFO) << "DBCloud Open"; + auto start_time = std::chrono::steady_clock::now(); rocksdb::CloudFileSystem *cfs; // Open the cloud file system status = EloqDS::NewCloudFileSystem(cfs_options_, &cfs); @@ -378,6 +372,11 @@ bool RocksDBCloudDataStore::StartDB() return false; } + auto end_time = std::chrono::steady_clock::now(); + auto use_time = std::chrono::duration_cast( + end_time - start_time) + .count(); + LOG(INFO) << "DBCloud open, NewCloudFileSystem took " << use_time << " ms"; std::string cookie_on_open = ""; std::string new_cookie_on_open = ""; @@ -632,7 +631,7 @@ bool RocksDBCloudDataStore::OpenCloudDB( // Disable auto compactions before blocking purger options.disable_auto_compactions = true; - auto start = std::chrono::system_clock::now(); + auto start = std::chrono::steady_clock::now(); std::unique_lock db_lk(db_mux_); rocksdb::Status status; uint32_t retry_num = 0; @@ -652,10 +651,10 @@ bool RocksDBCloudDataStore::OpenCloudDB( bthread_usleep(retry_num * 200000); } - auto end = std::chrono::system_clock::now(); + auto end = std::chrono::steady_clock::now(); auto duration = std::chrono::duration_cast(end - start); - DLOG(INFO) << "DBCloud Open took " << duration.count() << " ms"; + LOG(INFO) << "DBCloud Open took " << duration.count() << " ms"; if (!status.ok()) { diff --git a/eloq_data_store_service/rocksdb_cloud_data_store_factory.h b/eloq_data_store_service/rocksdb_cloud_data_store_factory.h index c545973..97df5e6 100644 --- a/eloq_data_store_service/rocksdb_cloud_data_store_factory.h +++ b/eloq_data_store_service/rocksdb_cloud_data_store_factory.h @@ -51,8 +51,21 @@ class RocksDBCloudDataStoreFactory : public DataStoreFactory DataStoreService *data_store_service, bool start_db = true) override { + // Add shard_id to object_path_ + auto shard_cloud_config = cloud_config_; + if (shard_cloud_config.object_path_.empty()) + { + shard_cloud_config.object_path_ = "rocksdb_cloud_object_path/"; + } + else if (shard_cloud_config.object_path_.back() != '/') + { + shard_cloud_config.object_path_.append("/"); + } + shard_cloud_config.object_path_.append("ds_"); + shard_cloud_config.object_path_.append(std::to_string(shard_id)); + auto ds = std::make_unique( - cloud_config_, + shard_cloud_config, config_, create_if_missing, tx_enable_cache_replacement_, diff --git a/eloq_data_store_service/rocksdb_data_store.cpp b/eloq_data_store_service/rocksdb_data_store.cpp index d230886..c4d3cf1 100644 --- a/eloq_data_store_service/rocksdb_data_store.cpp +++ b/eloq_data_store_service/rocksdb_data_store.cpp @@ -68,16 +68,10 @@ RocksDBDataStore::~RocksDBDataStore() void RocksDBDataStore::Shutdown() { - std::unique_lock db_lk(db_mux_); DLOG(INFO) << "Shutting down RocksDBDataStore"; + RocksDBDataStoreCommon::Shutdown(); - // shutdown query worker pool - query_worker_pool_->Shutdown(); - query_worker_pool_ = nullptr; - - data_store_service_->ForceEraseScanIters(shard_id_); - data_store_service_ = nullptr; - + std::unique_lock db_lk(db_mux_); if (db_ != nullptr) { DLOG(INFO) << "Closing RocksDB at path: " << storage_path_; diff --git a/eloq_data_store_service/rocksdb_data_store_common.cpp b/eloq_data_store_service/rocksdb_data_store_common.cpp index 1e517bb..f0dc94c 100644 --- a/eloq_data_store_service/rocksdb_data_store_common.cpp +++ b/eloq_data_store_service/rocksdb_data_store_common.cpp @@ -276,10 +276,30 @@ bool RocksDBDataStoreCommon::Initialize() query_worker_pool_ = std::make_unique(query_worker_number_); } + else + { + query_worker_pool_->Initialize(); + } return true; } +void RocksDBDataStoreCommon::Shutdown() +{ + // shutdown query worker pool + if (query_worker_pool_ != nullptr) + { + query_worker_pool_->Shutdown(); + // If the data store be reused, query_worker_pool_ will be re-created in + // Initialize(). + } + + if (data_store_service_ != nullptr) + { + data_store_service_->ForceEraseScanIters(shard_id_); + } +} + void RocksDBDataStoreCommon::FlushData(FlushDataRequest *flush_data_req) { bool res = query_worker_pool_->SubmitWork( @@ -424,6 +444,7 @@ void RocksDBDataStoreCommon::Read(ReadRequest *req) auto table_name = req->GetTableName(); uint32_t partition_id = req->GetPartitionId(); + auto key = req->GetKey(); std::shared_lock db_lk(db_mux_); @@ -434,7 +455,7 @@ void RocksDBDataStoreCommon::Read(ReadRequest *req) return; } - std::string key_str = this->BuildKey(table_name, partition_id, req); + std::string key_str = this->BuildKey(table_name, partition_id, key); std::string value; rocksdb::ReadOptions read_options; rocksdb::Status status = db->Get(read_options, key_str, &value); @@ -1140,32 +1161,6 @@ std::string RocksDBDataStoreCommon::BuildKey(const std::string_view table_name, return tmp_key; } -std::string RocksDBDataStoreCommon::BuildKey(const std::string_view table_name, - uint32_t partition_id, - const ReadRequest *read_request) -{ - size_t total_key_size = 0; - for (size_t idx = 0; idx < read_request->PartsCountPerKey(); ++idx) - { - total_key_size += read_request->GetKey(idx).size(); - } - - total_key_size += table_name.size() + 2; - - std::string tmp_key; - tmp_key.reserve(total_key_size); - tmp_key.append(table_name); - tmp_key.append(KEY_SEPARATOR); - tmp_key.append(std::to_string(partition_id)); - tmp_key.append(KEY_SEPARATOR); - - for (size_t idx = 0; idx < read_request->PartsCountPerKey(); ++idx) - { - tmp_key.append(read_request->GetKey(idx)); - } - return tmp_key; -} - const std::string RocksDBDataStoreCommon::BuildKeyForDebug( const std::unique_ptr &key_slices, size_t slice_size) { diff --git a/eloq_data_store_service/rocksdb_data_store_common.h b/eloq_data_store_service/rocksdb_data_store_common.h index f2663cb..b205dbc 100644 --- a/eloq_data_store_service/rocksdb_data_store_common.h +++ b/eloq_data_store_service/rocksdb_data_store_common.h @@ -162,6 +162,8 @@ class RocksDBDataStoreCommon : public DataStore */ bool Initialize() override; + void Shutdown() override; + /** * @brief indicate end of flush entries in a single ckpt for \@param batch * to base table or skindex table in data store, stop and return false if @@ -238,10 +240,6 @@ class RocksDBDataStoreCommon : public DataStore uint32_t partition_id, const std::string_view key); - std::string BuildKey(const std::string_view table_name, - uint32_t partition_id, - const ReadRequest *read_request); - const std::string BuildKeyForDebug( const std::unique_ptr &key_slices, size_t slice_size); diff --git a/eloq_data_store_service/thread_worker_pool.cpp b/eloq_data_store_service/thread_worker_pool.cpp index 3585111..a34ccdc 100644 --- a/eloq_data_store_service/thread_worker_pool.cpp +++ b/eloq_data_store_service/thread_worker_pool.cpp @@ -28,45 +28,58 @@ namespace EloqDS ThreadWorkerPool::ThreadWorkerPool(size_t max_workers_num) : max_workers_num_(max_workers_num) { - for (size_t i = 0; i < max_workers_num_; i++) + Initialize(); +} + +void ThreadWorkerPool::Initialize() +{ + while (shutdown_indicator_.load(std::memory_order_acquire)) { - std::thread worker = std::thread( - [this] - { - while (true) + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + // start all worker threads. + if (workers_.empty()) + { + for (size_t i = 0; i < max_workers_num_; i++) + { + std::thread worker = std::thread( + [this] { - // Acquire work queue mutex - std::unique_lock lk(work_queue_mutex_); - // Wait for new work come in or shutdown happen - work_queue_cv_.wait( - lk, - [this] - { - return !work_queue_.empty() || - shutdown_indicator_.load( - std::memory_order_acquire); - }); - // Take work if work queue is not empty - if (!work_queue_.empty()) + while (true) { - std::function work = - std::move(work_queue_.front()); - work_queue_.pop_front(); - lk.unlock(); - // Do work - work(); - } - else - { - // Quit loop if shutdown - assert(shutdown_indicator_.load( - std::memory_order_acquire)); - lk.unlock(); - break; + // Acquire work queue mutex + std::unique_lock lk(work_queue_mutex_); + // Wait for new work come in or shutdown happen + work_queue_cv_.wait( + lk, + [this] + { + return !work_queue_.empty() || + shutdown_indicator_.load( + std::memory_order_acquire); + }); + // Take work if work queue is not empty + if (!work_queue_.empty()) + { + std::function work = + std::move(work_queue_.front()); + work_queue_.pop_front(); + lk.unlock(); + // Do work + work(); + } + else + { + // Quit loop if shutdown + assert(shutdown_indicator_.load( + std::memory_order_acquire)); + lk.unlock(); + break; + } } - } - }); - workers_.push_back(std::move(worker)); + }); + workers_.push_back(std::move(worker)); + } } } @@ -103,5 +116,8 @@ void ThreadWorkerPool::Shutdown() worker.join(); } } + + workers_.clear(); + shutdown_indicator_.store(false, std::memory_order_release); } } // namespace EloqDS diff --git a/eloq_data_store_service/thread_worker_pool.h b/eloq_data_store_service/thread_worker_pool.h index 95ea212..372e055 100644 --- a/eloq_data_store_service/thread_worker_pool.h +++ b/eloq_data_store_service/thread_worker_pool.h @@ -39,6 +39,7 @@ class ThreadWorkerPool ThreadWorkerPool(size_t max_workers_num = 1); ~ThreadWorkerPool() = default; + void Initialize(); bool SubmitWork(std::function work); size_t WorkQueueSize(); void Shutdown(); diff --git a/rocksdb_handler.cpp b/rocksdb_handler.cpp index 5266b4d..e6648c6 100644 --- a/rocksdb_handler.cpp +++ b/rocksdb_handler.cpp @@ -2102,7 +2102,7 @@ void RocksDBHandler::RestoreTxCache(txservice::NodeGroupId cc_ng_id, }); } -bool RocksDBHandler::OnLeaderStart(uint32_t *next_leader_node) +bool RocksDBHandler::OnLeaderStart(uint32_t ng_id, uint32_t *next_leader_node) { bthread::Mutex mux; bthread::ConditionVariable cv; @@ -2171,7 +2171,8 @@ uint16_t RocksDBHandler::DecodeBucketIdFromKvKey(const char *data, size_t size) return EloqShare::big_endian_to_host(be_bucket_id); } -void RocksDBHandler::OnStartFollowing(uint32_t leader_node_id, +void RocksDBHandler::OnStartFollowing(uint32_t ng_id, + uint32_t leader_node_id, int64_t term, int64_t standby_term, bool resubscribe) diff --git a/rocksdb_handler.h b/rocksdb_handler.h index 55529e8..ee4b174 100644 --- a/rocksdb_handler.h +++ b/rocksdb_handler.h @@ -528,9 +528,10 @@ class RocksDBHandler : public txservice::store::DataStoreHandler static std::string DecodeTxKeyFromKvKey(const char *data, size_t size); static uint16_t DecodeBucketIdFromKvKey(const char *data, size_t size); - bool OnLeaderStart(uint32_t *next_leader_node) override; + bool OnLeaderStart(uint32_t ng_id, uint32_t *next_leader_node) override; - void OnStartFollowing(uint32_t leader_node_id, + void OnStartFollowing(uint32_t ng_id, + uint32_t leader_node_id, int64_t term, int64_t standby_term, bool resubscribe) override;