-
Notifications
You must be signed in to change notification settings - Fork 582
[VL] Support multiple segments per partition in columnar shuffle #11722
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9b67bbb
0be5dae
74e9692
15f2226
54d9dd1
c072f0e
d534477
f96ca3e
4d1a015
d6159c1
6286505
f24c45a
5a2ea3e
9ceb7fd
66d3742
e8f1b54
9cac127
93b3f53
5857915
ecde73f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -356,6 +356,25 @@ class LocalPartitionWriter::PayloadCache { | |
| return arrow::Status::OK(); | ||
| } | ||
|
|
||
| arrow::Result<bool> writeIncremental(uint32_t partitionId, arrow::io::OutputStream *os) { | ||
| GLUTEN_CHECK(!enableDictionary_, "Incremental write is not supported when dictionary is enabled."); | ||
| if ((partitionInUse_.has_value() && partitionInUse_.value() == partitionId) || !hasCachedPayloads(partitionId)) { | ||
| return false; | ||
| } | ||
|
|
||
| auto& payloads = partitionCachedPayload_[partitionId]; | ||
| while (!payloads.empty()) { | ||
| const auto payload = std::move(payloads.front()); | ||
| payloads.pop_front(); | ||
| uint8_t blockType = static_cast<uint8_t>(BlockType::kPlainPayload); | ||
| RETURN_NOT_OK(os->Write(&blockType, sizeof(blockType))); | ||
| RETURN_NOT_OK(payload->serialize(os)); | ||
| compressTime_ += payload->getCompressTime(); | ||
| writeTime_ += payload->getWriteTime(); | ||
| } | ||
| return true; | ||
| } | ||
|
|
||
| bool canSpill() { | ||
| for (auto pid = 0; pid < numPartitions_; ++pid) { | ||
| if (partitionInUse_.has_value() && partitionInUse_.value() == pid) { | ||
|
|
@@ -506,10 +525,12 @@ LocalPartitionWriter::LocalPartitionWriter( | |
| MemoryManager* memoryManager, | ||
| const std::shared_ptr<LocalPartitionWriterOptions>& options, | ||
| const std::string& dataFile, | ||
| std::vector<std::string> localDirs) | ||
| std::vector<std::string> localDirs, | ||
| const std::string& indexFile) | ||
| : PartitionWriter(numPartitions, std::move(codec), memoryManager), | ||
| options_(options), | ||
| dataFile_(dataFile), | ||
| indexFile_(indexFile), | ||
| localDirs_(std::move(localDirs)) { | ||
| init(); | ||
| } | ||
|
|
@@ -562,6 +583,56 @@ void LocalPartitionWriter::init() { | |
| std::default_random_engine engine(rd()); | ||
| std::shuffle(localDirs_.begin(), localDirs_.end(), engine); | ||
| subDirSelection_.assign(localDirs_.size(), 0); | ||
|
|
||
| if (!indexFile_.empty()) { | ||
| usePartitionMultipleSegments_ = true; | ||
| partitionSegments_.resize(numPartitions_); | ||
| } | ||
| } | ||
|
|
||
// Helper for big-endian conversion (network order).
// NOTE(review): this include should move to the top of the file with the
// other headers; kept local here because the file's include block is outside
// this change.
#include <arpa/inet.h>

// Converts a host-order 64-bit integer to network (big-endian) byte order.
//
// The previous implementation tested `__BYTE_ORDER == __LITTLE_ENDIAN`, which
// are glibc-specific macros: on toolchains where they are undefined the
// preprocessor silently evaluates `0 == 0` and byte-swaps even on big-endian
// hosts, producing corrupt index files. A runtime probe of htonl() is portable
// wherever htonl itself is available, and the branch is trivially predictable.
static uint64_t htonll(uint64_t value) {
  if (htonl(1) == 1) {
    // Big-endian host: already in network order.
    return value;
  }
  // Little-endian host: swap the two 32-bit halves and byte-swap each half.
  const auto lo = static_cast<uint64_t>(htonl(static_cast<uint32_t>(value & 0xFFFFFFFFULL)));
  const auto hi = static_cast<uint64_t>(htonl(static_cast<uint32_t>(value >> 32)));
  return (lo << 32) | hi;
}
|
|
||
// Writes the shuffle index file mapping each partition to its list of
// (startOffset, bytesWritten) segments in the data file.
//
// On-disk layout (all integers are 8-byte big-endian):
//   1. numPartitions_ + 1 offsets into THIS index file: entry i points at the
//      start of partition i's segment list; the final entry points one past
//      the last list.
//   2. For each partition, its segments as (startOffset, bytesWritten) pairs,
//      both referring to positions in the data file (see the emplace_back in
//      flushCachedPayloads/writeMemoryPayload for where pairs are recorded).
//   3. A single trailing marker byte with value 1.
//
// No-op (returns OK) unless multiple segments per partition is enabled.
arrow::Status LocalPartitionWriter::writeIndexFile() {
  if (!usePartitionMultipleSegments_) {
    return arrow::Status::OK();
  }
  ARROW_ASSIGN_OR_RAISE(auto indexFileOs, openFile(indexFile_, options_->shuffleFileBufferSize));
  // The first segment list starts right after the (numPartitions_ + 1)
  // header offsets written below.
  uint64_t segmentOffset = (numPartitions_ + 1) * sizeof(int64_t);
  // write segment index of each partition in big-endian
  for (uint32_t pid = 0; pid < numPartitions_; ++pid) {
    uint64_t beOffset = htonll(segmentOffset);
    RETURN_NOT_OK(indexFileOs->Write(reinterpret_cast<const uint8_t*>(&beOffset), sizeof(beOffset)));
    const auto& segments = partitionSegments_[pid];
    // Each segment occupies two 8-byte values in section 2.
    segmentOffset += (segments.size() * 2 * sizeof(int64_t));
  }
  // Final header entry: end position of the last partition's segment list.
  uint64_t beOffset = htonll(segmentOffset);
  RETURN_NOT_OK(indexFileOs->Write(reinterpret_cast<const uint8_t*>(&beOffset), sizeof(beOffset)));
  // Write partition segments info in big-endian
  for (uint32_t pid = 0; pid < numPartitions_; ++pid) {
    const auto& segments = partitionSegments_[pid];
    for (const auto& segment : segments) {
      // segment.first = start offset in the data file,
      // segment.second = byte length of the segment.
      uint64_t beFirst = htonll(segment.first);
      uint64_t beSecond = htonll(segment.second);
      RETURN_NOT_OK(indexFileOs->Write(reinterpret_cast<const uint8_t*>(&beFirst), sizeof(beFirst)));
      RETURN_NOT_OK(indexFileOs->Write(reinterpret_cast<const uint8_t*>(&beSecond), sizeof(beSecond)));
    }
  }
  // Write an ending marker byte with value 1
  const uint8_t marker = 1;
  RETURN_NOT_OK(indexFileOs->Write(&marker, 1));
  RETURN_NOT_OK(indexFileOs->Close());
  return arrow::Status::OK();
}
|
|
||
| arrow::Result<int64_t> LocalPartitionWriter::mergeSpills(uint32_t partitionId, arrow::io::OutputStream* os) { | ||
|
|
@@ -600,13 +671,61 @@ arrow::Status LocalPartitionWriter::writeCachedPayloads(uint32_t partitionId, ar | |
| return arrow::Status::OK(); | ||
| } | ||
|
|
||
| arrow::Status LocalPartitionWriter::flushCachedPayloads() { | ||
| if (dataFileOs_ == nullptr) { | ||
| ARROW_ASSIGN_OR_RAISE(dataFileOs_, openFile(dataFile_, options_->shuffleFileBufferSize)); | ||
| } | ||
| ARROW_ASSIGN_OR_RAISE(int64_t endInDataFile, dataFileOs_->Tell()); | ||
| for (auto pid = 0; pid < numPartitions_; ++pid) { | ||
| auto startInDataFile = endInDataFile; | ||
| ARROW_ASSIGN_OR_RAISE(int64_t spillWrittenBytes, mergeSpills(pid, dataFileOs_.get())); | ||
| ARROW_ASSIGN_OR_RAISE(bool cachePayloadWritten, payloadCache_->writeIncremental(pid, dataFileOs_.get())); | ||
| if (spillWrittenBytes > 0 || cachePayloadWritten) { | ||
| ARROW_ASSIGN_OR_RAISE(endInDataFile, dataFileOs_->Tell()); | ||
| auto bytesWritten = endInDataFile - startInDataFile; | ||
| partitionSegments_[pid].emplace_back(startInDataFile, bytesWritten); | ||
| partitionLengths_[pid] += bytesWritten; | ||
| } | ||
| } | ||
|
|
||
| return arrow::Status::OK(); | ||
| } | ||
|
|
||
| arrow::Status LocalPartitionWriter::writeMemoryPayload(uint32_t partitionId, std::unique_ptr<InMemoryPayload> payload) { | ||
| if (dataFileOs_ == nullptr) { | ||
| ARROW_ASSIGN_OR_RAISE(dataFileOs_, openFile(dataFile_, options_->shuffleFileBufferSize)); | ||
| } | ||
|
|
||
| ARROW_ASSIGN_OR_RAISE(int64_t startOffset, dataFileOs_->Tell()); | ||
| if (codec_ != nullptr) { | ||
| ARROW_ASSIGN_OR_RAISE(auto compressOs, ShuffleCompressedOutputStream::Make(codec_.get(), options_->compressionBufferSize, dataFileOs_, arrow::default_memory_pool())); | ||
| RETURN_NOT_OK(payload->serialize(compressOs.get())); | ||
| RETURN_NOT_OK(compressOs->Flush()); | ||
| compressTime_ += compressOs->compressTime(); | ||
| RETURN_NOT_OK(compressOs->Close()); | ||
| } else { | ||
| RETURN_NOT_OK(payload->serialize(dataFileOs_.get())); | ||
| } | ||
| ARROW_ASSIGN_OR_RAISE(int64_t endOffset, dataFileOs_->Tell()); | ||
| auto bytesWritten = endOffset - startOffset; | ||
| partitionSegments_[partitionId].emplace_back(startOffset, bytesWritten); | ||
| partitionLengths_[partitionId] += bytesWritten; | ||
|
|
||
| return arrow::Status::OK(); | ||
| } | ||
|
|
||
| arrow::Status LocalPartitionWriter::stop(ShuffleWriterMetrics* metrics, int64_t& evictBytes) { | ||
| if (stopped_) { | ||
| return arrow::Status::OK(); | ||
| } | ||
| stopped_ = true; | ||
|
|
||
| if (useSpillFileAsDataFile_) { | ||
| if (usePartitionMultipleSegments_) { | ||
| RETURN_NOT_OK(finishSpill()); | ||
| RETURN_NOT_OK(finishMerger()); | ||
| RETURN_NOT_OK(flushCachedPayloads()); | ||
| RETURN_NOT_OK(writeIndexFile()); | ||
| } else if (useSpillFileAsDataFile_) { | ||
| ARROW_ASSIGN_OR_RAISE(auto spill, spiller_->finish()); | ||
|
|
||
| // Merge the remaining partitions from spills. | ||
|
|
@@ -750,6 +869,9 @@ arrow::Status LocalPartitionWriter::hashEvict( | |
| for (auto& payload : merged) { | ||
| RETURN_NOT_OK(payloadCache_->cache(partitionId, std::move(payload))); | ||
| } | ||
| if (usePartitionMultipleSegments_) { | ||
| RETURN_NOT_OK(flushCachedPayloads()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The hashEvict is not only called for spilling. When the evictType is And when the evictType is |
||
| } | ||
| merged.clear(); | ||
| } | ||
| return arrow::Status::OK(); | ||
|
|
@@ -759,6 +881,12 @@ arrow::Status | |
| LocalPartitionWriter::sortEvict(uint32_t partitionId, std::unique_ptr<InMemoryPayload> inMemoryPayload, bool isFinal, int64_t& evictBytes) { | ||
| rawPartitionLengths_[partitionId] += inMemoryPayload->rawSize(); | ||
|
|
||
| if (usePartitionMultipleSegments_) { | ||
| // If multiple segments per partition is enabled, write directly to the final data file. | ||
| RETURN_NOT_OK(writeMemoryPayload(partitionId, std::move(inMemoryPayload))); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you explain a bit more on how this can reduce the memory usage? Looks like the memory is still only get reclaimed by OOM and spilling. |
||
| return arrow::Status::OK(); | ||
| } | ||
|
|
||
| if (lastEvictPid_ != -1 && (partitionId < lastEvictPid_ || (isFinal && !dataFileOs_))) { | ||
| lastEvictPid_ = -1; | ||
| RETURN_NOT_OK(finishSpill()); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a configuration to enable this feature.