-
Notifications
You must be signed in to change notification settings - Fork 591
perf(cluster): use multithreads to optimize sendSnapshotByRawKV #3299
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: unstable
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -723,13 +723,17 @@ migrate-batch-size-kb 16 | |||||
| # Default: 16M | ||||||
| migrate-batch-rate-limit-mb 16 | ||||||
|
|
||||||
|
|
||||||
| # If it is set to yes, kvrocks will skip the deallocation of block cache | ||||||
| # while closing the database to speed up the shutdown | ||||||
| # | ||||||
| # Default: no | ||||||
| # skip-block-cache-deallocation-on-close no | ||||||
|
|
||||||
| # The parallelism of slot migration passing SST files | ||||||
| # | ||||||
| # Default: the number of Kvrocks node cores | ||||||
|
||||||
| # Default: the number of Kvrocks node cores | |
| # Default: 0 (which uses the number of Kvrocks node cores) |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -20,13 +20,16 @@ | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "slot_migrate.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include <future> | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include <memory> | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include <utility> | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "arpa/inet.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "db_util.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "event_util.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "fmt/format.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "io_util.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "netinet/tcp.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "storage/batch_extractor.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "storage/iterator.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| #include "storage/redis_metadata.h" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -52,7 +55,8 @@ SlotMigrator::SlotMigrator(Server *srv) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| max_pipeline_size_(srv->GetConfig()->pipeline_size), | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| seq_gap_limit_(srv->GetConfig()->sequence_gap), | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| migrate_batch_bytes_per_sec_(srv->GetConfig()->migrate_batch_rate_limit_mb * MiB), | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| migrate_batch_size_bytes_(srv->GetConfig()->migrate_batch_size_kb * KiB) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| migrate_batch_size_bytes_(srv->GetConfig()->migrate_batch_size_kb * KiB), | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| migrate_slots_send_snapshots_parallelism_(srv->GetConfig()->migrate_slots_send_snapshots_parallelism) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // Let metadata_cf_handle_ be nullptr, and get them in real time to avoid accessing invalid pointer, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // because metadata_cf_handle_ and db_ will be destroyed if DB is reopened. | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // [Situation]: | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -69,6 +73,7 @@ SlotMigrator::SlotMigrator(Server *srv) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // [Note]: | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // This problem may exist in all functions of Database called in slot migration process. | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| metadata_cf_handle_ = nullptr; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| global_rate_limiter_.reset(rocksdb::NewGenericRateLimiter(static_cast<int64_t>(migrate_batch_bytes_per_sec_))); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if (srv->IsSlave()) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| SetStopMigrationFlag(true); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -1251,7 +1256,6 @@ void SlotMigrator::resumeSyncCtx(const Status &migrate_result) { | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| Status SlotMigrator::sendMigrationBatch(BatchSender *batch) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| // user may dynamically change some configs, apply it when send data | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| batch->SetMaxBytes(migrate_batch_size_bytes_); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| batch->SetMaxBytes(migrate_batch_size_bytes_); | |
| batch->SetMaxBytes(migrate_batch_size_bytes_); | |
| batch->SetBytesPerSecond(migrate_batch_bytes_per_sec_); |
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If parallelism equals 0 (when migrate_slots_send_snapshots_parallelism_ is not properly initialized), std::min will return 0, and no threads will be created. This would result in silent failure to migrate any data. Add explicit validation to ensure parallelism is at least 1 before proceeding with the migration.
| int parallelism = std::min(migrate_slots_send_snapshots_parallelism_, total_slots); | |
| int parallelism = std::min(migrate_slots_send_snapshots_parallelism_, total_slots); | |
| if (parallelism < 1) { | |
| parallelism = 1; | |
| } |
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The lambda captures variables by value (using [=]), including the loop variable 'i'. However, 'i' is only used in the error message on line 1281, which makes it valuable. The other captured variables (cur_start, cur_end) are correctly captured by value since they change in each iteration. This is correct, but consider being explicit about what's captured for better code clarity.
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Each parallel thread creates its own connection to the destination node via createConnectToDstNode(), but there's no mechanism to ensure these connections don't overwhelm the destination node. Consider adding configuration or documentation about the impact of parallel connections, or implementing connection pooling/throttling to prevent resource exhaustion on the destination.
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When any thread fails during parallel migration, the function returns immediately without waiting for other threads to complete. This could leave running threads that continue to execute and potentially access shared resources after the migration has been marked as failed. Consider implementing proper cleanup or cancellation of remaining threads when one fails, or at minimum wait for all threads to finish before returning the error.
| for (auto &result : results) { | |
| auto s = result.get(); | |
| if (!s.IsOK()) { | |
| return {Status::NotOK, fmt::format("[migrate] Parallel migrate get result error: {}", s.Msg())}; | |
| } | |
| } | |
| Status first_error; | |
| bool has_error = false; | |
| for (auto &result : results) { | |
| auto s = result.get(); | |
| if (!s.IsOK() && !has_error) { | |
| first_error = s; | |
| has_error = true; | |
| } | |
| } | |
| if (has_error) { | |
| return {Status::NotOK, | |
| fmt::format("[migrate] Parallel migrate get result error: {}", first_error.Msg())}; | |
| } |
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The namespace_ member variable is accessed by multiple threads in parallel without synchronization. While it's likely set before migration begins and not modified during migration, this should be verified. Consider documenting thread-safety assumptions for member variables accessed in parallel contexts.
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The slot_snapshot_ member variable is accessed by multiple threads without synchronization. While RocksDB snapshots are immutable and thread-safe to read from, the pointer itself should be properly synchronized or documented as being set before parallel access begins. Verify that slot_snapshot_ is fully initialized and won't change during the parallel migration phase.
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The logging information about migration progress (bytes sent, rate, batches, entries) has been removed from the individual thread migrations. This makes debugging and monitoring parallel migrations more difficult, as there's no per-thread visibility. Consider adding aggregate logging or at least debug-level logs for each thread's progress to aid troubleshooting.
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The dst_ip_ and dst_port_ member variables are accessed by multiple threads without synchronization. While these are set before the parallel migration begins and are not modified during migration, they should be documented as thread-safe or protected. Consider making them const or adding documentation that they must not be modified during parallel operations.
| // NOTE: dst_ip_ and dst_port_ are configured before any parallel migration begins | |
| // and are not modified during migration. They must not be mutated while parallel | |
| // operations are in progress, so concurrent reads from multiple threads here are | |
| // considered thread-safe by design. |
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The error handling returns -1 but doesn't close the file descriptor that was successfully created by SockConnect. When authentication fails, the established connection is leaked. The file descriptor from SockConnect should be closed before returning on authentication failure.
Copilot
AI
Dec 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The function signature returns an int that can be negative to indicate failure, but the return type should more clearly express this. Consider using a StatusOr or Result type pattern instead of returning a raw int where negative values mean error. This would make the API more consistent with the rest of the codebase which uses Status objects.
| int SlotMigrator::createConnectToDstNode() { | |
| // Connect to the destination node | |
| auto fd = util::SockConnect(dst_ip_, dst_port_); | |
| if (!fd.IsOK()) { | |
| error("failed to connect to the node error: {}", fd.Msg()); | |
| return -1; | |
| } | |
| std::string pass = srv_->GetConfig()->requirepass; | |
| if (!pass.empty()) { | |
| auto s = authOnDstNode(*fd, pass); | |
| if (!s.IsOK()) { | |
| error("failed to authenticate on destination node error: {}", s.Msg()); | |
| return -1; | |
| } | |
| } | |
| return *fd; | |
| Status SlotMigrator::createConnectToDstNode(int *out_fd) { | |
| // Connect to the destination node | |
| auto fd = util::SockConnect(dst_ip_, dst_port_); | |
| if (!fd.IsOK()) { | |
| auto msg = fmt::format("failed to connect to the node error: {}", fd.Msg()); | |
| error("{}", msg); | |
| return {Status::NotOK, msg}; | |
| } | |
| std::string pass = srv_->GetConfig()->requirepass; | |
| if (!pass.empty()) { | |
| auto s = authOnDstNode(*fd, pass); | |
| if (!s.IsOK()) { | |
| auto msg = fmt::format("failed to authenticate on destination node error: {}", s.Msg()); | |
| error("{}", msg); | |
| return {Status::NotOK, msg}; | |
| } | |
| } | |
| *out_fd = *fd; | |
| return Status::OK(); |
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |||||||||||
| #pragma once | ||||||||||||
|
|
||||||||||||
| #include <rocksdb/db.h> | ||||||||||||
| #include <rocksdb/rate_limiter.h> | ||||||||||||
| #include <rocksdb/status.h> | ||||||||||||
| #include <rocksdb/transaction_log.h> | ||||||||||||
| #include <rocksdb/write_batch.h> | ||||||||||||
|
|
@@ -99,6 +100,9 @@ class SlotMigrator : public redis::Database { | |||||||||||
| void SetSequenceGapLimit(int value) { | ||||||||||||
| if (value > 0) seq_gap_limit_ = value; | ||||||||||||
| } | ||||||||||||
| void SetMigrateSlotsSendSnapshotsParallelism(int value) { | ||||||||||||
| if (value > 0) migrate_slots_send_snapshots_parallelism_ = value; | ||||||||||||
| } | ||||||||||||
|
Comment on lines
+103
to
+105
|
||||||||||||
| void SetMigrateBatchRateLimit(size_t bytes_per_sec) { migrate_batch_bytes_per_sec_ = bytes_per_sec; } | ||||||||||||
| void SetMigrateBatchSize(size_t size) { migrate_batch_size_bytes_ = size; } | ||||||||||||
| void SetStopMigrationFlag(bool value) { stop_migration_ = value; } | ||||||||||||
|
|
@@ -148,6 +152,8 @@ class SlotMigrator : public redis::Database { | |||||||||||
|
|
||||||||||||
| Status sendMigrationBatch(BatchSender *batch); | ||||||||||||
| Status sendSnapshotByRawKV(); | ||||||||||||
| Status migrateSlotRange(int start_slot, int end_slot, int fd); | ||||||||||||
| int createConnectToDstNode(); | ||||||||||||
| Status syncWALByRawKV(); | ||||||||||||
| bool catchUpIncrementalWAL(); | ||||||||||||
| Status migrateIncrementalDataByRawKV(uint64_t end_seq, BatchSender *batch_sender); | ||||||||||||
|
|
@@ -173,6 +179,9 @@ class SlotMigrator : public redis::Database { | |||||||||||
| uint64_t seq_gap_limit_ = kDefaultSequenceGapLimit; | ||||||||||||
| std::atomic<size_t> migrate_batch_bytes_per_sec_ = 1 * GiB; | ||||||||||||
| std::atomic<size_t> migrate_batch_size_bytes_; | ||||||||||||
| int migrate_slots_send_snapshots_parallelism_ = 0; | ||||||||||||
|
||||||||||||
| int migrate_slots_send_snapshots_parallelism_ = 0; | |
| int migrate_slots_send_snapshots_parallelism_ = | |
| std::thread::hardware_concurrency() == 0 | |
| ? 1 | |
| : static_cast<int>(std::thread::hardware_concurrency()); |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -228,6 +228,8 @@ Config::Config() { | |||||||||||||||||||||||||||
| new EnumField<MigrationType>(&migrate_type, migration_types, MigrationType::kRawKeyValue)}, | ||||||||||||||||||||||||||||
| {"migrate-batch-size-kb", false, new IntField(&migrate_batch_size_kb, 16, 1, INT_MAX)}, | ||||||||||||||||||||||||||||
| {"migrate-batch-rate-limit-mb", false, new IntField(&migrate_batch_rate_limit_mb, 16, 1, INT_MAX)}, | ||||||||||||||||||||||||||||
| {"migrate-slots-send-snapshots-parallelism", false, | ||||||||||||||||||||||||||||
| new IntField(&migrate_slots_send_snapshots_parallelism, 0, 0, INT_MAX)}, | ||||||||||||||||||||||||||||
| {"unixsocket", true, new StringField(&unixsocket, "")}, | ||||||||||||||||||||||||||||
| {"unixsocketperm", true, new OctalField(&unixsocketperm, 0777, 1, INT_MAX)}, | ||||||||||||||||||||||||||||
| {"log-retention-days", true, new IntField(&log_retention_days, -1, -1, INT_MAX)}, | ||||||||||||||||||||||||||||
|
|
@@ -610,6 +612,16 @@ void Config::initFieldCallback() { | |||||||||||||||||||||||||||
| srv->slot_migrator->SetMigrateBatchSize(migrate_batch_size_kb * KiB); | ||||||||||||||||||||||||||||
| return Status::OK(); | ||||||||||||||||||||||||||||
| }}, | ||||||||||||||||||||||||||||
| {"migrate-slots-send-snapshots-parallelism", | ||||||||||||||||||||||||||||
| [this](Server *srv, [[maybe_unused]] const std::string &k, [[maybe_unused]] const std::string &v) -> Status { | ||||||||||||||||||||||||||||
| if (migrate_slots_send_snapshots_parallelism == 0) { | ||||||||||||||||||||||||||||
| unsigned int max_parallelism = std::thread::hardware_concurrency(); | ||||||||||||||||||||||||||||
| migrate_slots_send_snapshots_parallelism = static_cast<int>(max_parallelism); | ||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
| if (!srv) return Status::OK(); | ||||||||||||||||||||||||||||
| srv->slot_migrator->SetMigrateSlotsSendSnapshotsParallelism(migrate_slots_send_snapshots_parallelism); | ||||||||||||||||||||||||||||
|
Comment on lines
+617
to
+622
|
||||||||||||||||||||||||||||
| if (migrate_slots_send_snapshots_parallelism == 0) { | |
| unsigned int max_parallelism = std::thread::hardware_concurrency(); | |
| migrate_slots_send_snapshots_parallelism = static_cast<int>(max_parallelism); | |
| } | |
| if (!srv) return Status::OK(); | |
| srv->slot_migrator->SetMigrateSlotsSendSnapshotsParallelism(migrate_slots_send_snapshots_parallelism); | |
| int effective_parallelism = migrate_slots_send_snapshots_parallelism; | |
| if (effective_parallelism == 0) { | |
| unsigned int max_parallelism = std::thread::hardware_concurrency(); | |
| effective_parallelism = static_cast<int>(max_parallelism); | |
| } | |
| if (!srv) return Status::OK(); | |
| srv->slot_migrator->SetMigrateSlotsSendSnapshotsParallelism(effective_parallelism); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment states "passing SST files" but the actual implementation sends snapshots by raw key-value pairs, not SST files. This is misleading. The comment should accurately describe that this setting controls the parallelism of sending snapshot data during slot migration.