From 3c509abff48ad667fa69d77b4ed1bb71a2e7af1a Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 15 Apr 2024 15:43:35 +0800 Subject: [PATCH 01/32] [Build] Fix build issue. --- src/layers/mlp_llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/mlp_llama.cpp b/src/layers/mlp_llama.cpp index f650a2f5..c457a344 100644 --- a/src/layers/mlp_llama.cpp +++ b/src/layers/mlp_llama.cpp @@ -64,7 +64,7 @@ void invokeMLPLLaMA(DataType dt, int numTokens, int hiddenSize, int intermediate || (ctx != nullptr && (ctx->hiddenSize != hiddenSize || ctx->intermediateSize != intermediateSize))) { if (ctx != nullptr) delete ctx; printf(">> create context: %d %d\n", hiddenSize, intermediateSize); - ctx = new DecoderContext(1, hiddenSize, 1, 1, intermediateSize, "silu", 1e-6, 0, 0, 0, 0, 0, 0, 1); + ctx = new DecoderContext(1, hiddenSize, 1, 1, 1, intermediateSize, "silu", 1e-6, 0, 0, 0, 0, 0, 0, 1); ctx->mmHelper = new MMHelper(Env::getInstance().getEngineKind(), Env::getInstance().getEngineIndex()); } From 24de368530410e90489319419354d6c32a05cc6b Mon Sep 17 00:00:00 2001 From: changqi1 Date: Thu, 18 Apr 2024 22:14:06 +0800 Subject: [PATCH 02/32] [Model] Init pipeline parallel. --- examples/cpp/example.cpp | 2 +- src/common/transformer_ctx.h | 20 +-- src/models/common_decoder.h | 79 ++++++++++- src/models/models.cpp | 19 ++- src/models/prompt.h | 232 ++++++++++++++++++++++++++++++++ src/searchers/greedy_search.cpp | 43 +++++- src/searchers/greedy_search.h | 2 + src/utils/thread_util.h | 67 ++++++++- 8 files changed, 442 insertions(+), 22 deletions(-) create mode 100644 src/models/prompt.h diff --git a/examples/cpp/example.cpp b/examples/cpp/example.cpp index 34cf4eb0..f6a68ce7 100644 --- a/examples/cpp/example.cpp +++ b/examples/cpp/example.cpp @@ -467,7 +467,7 @@ int main(int argc, char **argv) { } auto result = model.finalize(); - if (isMaster) { + if (true) { std::cout << "\n[INFO] Final output is: " << std::endl; std::vector sent = tokenizer->batchDecode(result, batchSize); for (auto str : sent) { diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h index cb29ee11..b01d363c 100644 --- a/src/common/transformer_ctx.h +++ b/src/common/transformer_ctx.h @@ -18,15 +18,17 @@ #include #include #include -#include - -#include "allocator.h" #include +#include #include "INIReader.h" +#include "allocator.h" #include "my_types.h" #include "simple_mem_pool.h" #include "split_util.h" +#include "float16.h" +#include "singleton.h" +#include "kvcache_manager.h" namespace fs = std::filesystem; @@ -126,10 +128,10 @@ struct DecoderContext { uint64_t size3; public: - DecoderContext(int _layers, int _hiddenSize, int _headSize, int _attHeadNum, int _kvHeadNum, int _imSize, const std::string &act, - float epsilon, int _vocabSize, int _embeddingSize, int _maxPositions, int _maxPosEmbed, int _maxSeqLength, - int _splitIdx, int _splits, int _ppSize = 1, int _ppRank = 0, RopeParams *_ropeParamsPtr = nullptr, - bool _useLogN = true, bool _useNTK = true, int numThreads = 0) + DecoderContext(int _layers, int _hiddenSize, int _headSize, int _attHeadNum, int _kvHeadNum, int _imSize, + const std::string &act, float epsilon, int _vocabSize, int _embeddingSize, int _maxPositions, + int _maxPosEmbed, int _maxSeqLength, int _splitIdx, int _splits, int _ppSize = 1, int _ppRank = 0, + RopeParams *_ropeParamsPtr = nullptr, bool _useLogN = true, bool _useNTK = true, int numThreads = 0) : layers(_layers) , hiddenSize(_hiddenSize) , attHeadSize(_headSize) @@ -151,9 +153,7 @@ struct DecoderContext { , tpSize(_splits) , tpRank(_splitIdx) , epsilon(epsilon) { - if (attHeadNum != 0) { - this->attFactor = 1 / sqrtf(attHeadSize); - } + if (attHeadNum != 0) { this->attFactor = 1 / sqrtf(attHeadSize); } // Set the default value (don't worry, it can be changed later) this->batchSize = 1; diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index f51cdc25..59df24b6 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -34,6 +34,7 @@ #include "transformer_ctx.h" #include "transpose_util.h" #include "weight_util.h" +#include "prompt.h" using namespace xft; @@ -224,6 +225,7 @@ class CommonDecoder : public AbstractDecoder { DecoderContext *ctx = getDecoderContext(layers, hiddenSize, size_per_head, attHeadNum, kvHeadNum, imSize, act, epsilon, vocabSize, embeddingSize, maxPositions, maxPosEmbed, maxSeqLength, useLogN, useNTK, ropeParamsPtr); + pool = new ThreadPool(4); ctx->ResetConfigReader(configPath); @@ -285,6 +287,7 @@ class CommonDecoder : public AbstractDecoder { // Prepare context DecoderContext *ctx = this->getContext(); ctx->resize(batchSize, seqLen, pastSeqLen); + int hiddenSize = ctx->hiddenSize; if (step == 0) { // Reset initial and accumulated sequence length at the first step @@ -341,14 +344,70 @@ class CommonDecoder : public AbstractDecoder { int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank; int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * ctx->hiddenSize; - MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // TODO: Error: different scope when dynamic loading so file - // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); + if (TaskWaitingQueue::getInstance().empty()) { + TimeLine t("Decoder.MPI_Recv"); + printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank); + int32_t promptID; + MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // TODO: Error: different scope when dynamic loading so file + // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); + printf("%.6f\n", embBuf[0]); + if (!PromptPool::getInstance().has(promptID)) { + PromptMeta *prompt = new PromptMeta(promptID, 0, batchSize, seqLen, hiddenSize); + prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); + PromptPool::getInstance().insert(prompt->promptID, prompt); + } + TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); + printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID); + fflush(stdout); + } else { + pool->enqueue([=, &embBuf] { + TimeLine t("Decoder.MPI_Recv"); + printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank); + int32_t promptID; + MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // TODO: Error: different scope when dynamic loading so file + // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); + printf("%.6f\n", embBuf[0]); + if (!PromptPool::getInstance().has(promptID)) { + PromptMeta *prompt = new PromptMeta(promptID, 0, batchSize, seqLen, hiddenSize); + prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); + PromptPool::getInstance().insert(prompt->promptID, prompt); + } + TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); + printf("%d: Decoder.MPI_Recv.ASyncDone %d\n", ctx->ppRank, promptID); + fflush(stdout); + }); + } } #endif - // Decoder: forward - int hiddenSize = ctx->hiddenSize; + if (!TaskWaitingQueue::getInstance().isFull()) { + // for (const auto& prompt : PromptPool::getInstance().getAll()) { + // if (!TaskWaitingQueue::getInstance().isFull()) { + // if (prompt->hiddenStatesReceived) { + // TaskWaitingQueue::getInstance().push(prompt); + // } + // } + // } + + if (!InputQueue::getInstance().empty()) { + if (!TaskWaitingQueue::getInstance().isFull()) { + auto prompt = InputQueue::getInstance().pop(); + prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); + PromptPool::getInstance().insert(prompt->promptID, prompt); + TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(prompt->promptID)); + } + } + } + + PromptMeta *runningTask; + while (!TaskWaitingQueue::getInstance().empty()) { + runningTask = TaskWaitingQueue::getInstance().pop(); + printf("%d: Decoder.forward\n", ctx->ppRank); + // Decoder: forward from runningTask int layers_per_pp_stage = this->decoders.size(); for (int i = 0; i < layers_per_pp_stage; ++i) { int workers = this->messenger.getSize(); @@ -399,15 +458,23 @@ class CommonDecoder : public AbstractDecoder { } } } + } + // else { + // return std::tuple(nullptr, 0, 0); + // } #ifdef PIPELINE_PARALLEL // If current pipeline stage isn't the end of stage, should send data to next stage and return nullptr if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) { + TimeLine t("Decoder.MPI_Send"); int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * ctx->hiddenSize; + MPI_Send(&runningTask->promptID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank); + printf("%d: Decoder.MPI_Send %d\n", ctx->ppRank, runningTask->promptID); + fflush(stdout); return std::tuple(nullptr, 0, 0); } #endif @@ -967,6 +1034,8 @@ class CommonDecoder : public AbstractDecoder { // Activation buffers (declared as float, but the actual data type may be different) std::shared_ptr> actBuffers; + ThreadPool *pool; + protected: // Components most LLMs may use std::vector decoders; diff --git a/src/models/models.cpp b/src/models/models.cpp index e7fd4b70..f6568164 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -31,6 +31,7 @@ #include "searcher.h" #include "timeline.h" #include "yarn_llama.h" +#include "prompt.h" namespace xft { enum class GenerationMode { GREEDY_SEARCH, BEAM_SEARCH, SAMPLE }; @@ -136,9 +137,25 @@ std::vector Model::generate() { } if (isNewInput) { - isNewInput = false; + static int i = 0; + i++; + if (i > 3) { + isNewInput = false; + i = 0; + } + // TODO: Create it when request input + if (this->isMaster()) { + int promptID = InputQueue::getInstance().createPromptID(); + int tokenID = InputQueue::getInstance().createTokenID(); + InputQueue::getInstance().push(new PromptMeta(promptID, tokenID, batchSize, seqLen, inputIds)); + } return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); } else { + static int i = 0; + i++; + if (i == 10) { + isNewInput = true; + } return searcher->getNextToken(); } } diff --git a/src/models/prompt.h b/src/models/prompt.h new file mode 100644 index 00000000..c10b1ab7 --- /dev/null +++ b/src/models/prompt.h @@ -0,0 +1,232 @@ +// Copyright (c) 2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================ +#pragma once + +#include +#include + +namespace xft { + +template +class PromptMeta { +public: + PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, std::vector _inputs) { + promptID = _promptID; + tokenID = _tokenID; + batchSize = _batchSize; + inputSeqLen = _inputSeqLen; + inputs = _inputs; + hiddenStatesReceived = false; + } + + PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, int32_t _hiddenSize) { + promptID = _promptID; + tokenID = _tokenID; + batchSize = _batchSize; + inputSeqLen = _inputSeqLen; + hiddenSize = _hiddenSize; + hiddenStatesReceived = false; + hiddenStates.Resize(batchSize * inputSeqLen, hiddenSize, hiddenSize); + } + + void ResetKVCache(int32_t _hiddenSize, int32_t _pastSeqLen, int32_t _layerIdx, void *_hiddenStates, void *_kvm) { + hiddenSize = _hiddenSize; + pastSeqLen = _pastSeqLen; + layerIdx = _layerIdx; + hiddenStates.Resize(batchSize * inputSeqLen, hiddenSize, hiddenSize); + memcpy(hiddenStates.Data(), _hiddenStates, sizeof(AttnInT) * batchSize * inputSeqLen * hiddenSize); + kvm = _kvm; + } + + int32_t promptID; + int32_t tokenID; + bool hiddenStatesReceived; + +private: + int32_t batchSize; + int32_t inputSeqLen; + int32_t hiddenSize; + int32_t pastSeqLen; + std::vector inputs; + std::vector outputs; + int32_t layerIdx; + hpj::Matrix hiddenStates; + void *kvm; //KVCacheManager +}; + +template +class InputQueue { +public: + static InputQueue &getInstance() { + static InputQueue instance; + return instance; + } + + int32_t createPromptID() { + int32_t id = promptID++; + if (id > 1000) { + promptID = 0; + id = promptID++; + } + return id; + } + + int32_t createTokenID() { + int32_t id = tokenID++; + if (id > 1000) { + tokenID = 0; + id = tokenID++; + } + return id; + } + + bool empty() { + return queue.empty(); + } + + PromptMeta *pop() { + auto buffer = queue.front(); + queue.pop(); + return buffer; + } + + void push(PromptMeta *buffer) { + queue.push(buffer); + } + +private: + InputQueue() {} + + static int32_t promptID; + static int32_t tokenID; + std::queue *> queue; +}; + +template +int32_t InputQueue::promptID = 0; + +template +int32_t InputQueue::tokenID = 0; + +template +class TaskWaitingQueue { +public: + static TaskWaitingQueue &getInstance() { + static TaskWaitingQueue instance; + return instance; + } + + bool empty() { + return queue.empty(); + } + + int32_t size() { + return queue.size(); + } + + bool isFull() { + bool full = false; + if (this->size() >= 4) + full = true; + return full; + } + + PromptMeta *pop() { + auto buffer = queue.front(); + queue.pop(); + return buffer; + } + + void push(PromptMeta *buffer) { + queue.push(buffer); + } + +private: + TaskWaitingQueue() {} + + std::queue *> queue; +}; + +template +class PromptPool { +public: + static PromptPool &getInstance() { + static PromptPool instance; + return instance; + } + + void insert(int32_t key, PromptMeta *prompt) { + hub[key] = prompt; + } + + bool has(int32_t key) const { + return hub.find(key) != hub.end(); + } + + PromptMeta *get(int32_t key) const { + auto it = hub.find(key); + if (it != hub.end()) { + return it->second; + } else { + return nullptr; + } + } + + std::vector *> getAll() { + std::vector *> metas; + for (const auto& pair : hub) { + metas.push_back(pair.second); + } + return metas; + } + + void remove(int32_t key) { + hub.erase(key); + } + + void modify(int32_t oldKey, PromptMeta *newPrompt) { + auto it = hub.find(oldKey); + if (it != hub.end()) { + it->second = newPrompt; + } + } + + bool isUpdated(int32_t key) const { + auto it = hub.find(key); + if (it != hub.end()) { + return it->second.hiddenStatesReceived; + } else { + printf("error: key not found\n"); + return false; + } + } + + bool setOld(int32_t key) { + auto it = hub.find(key); + if (it != hub.end()) { + it->second.hiddenStatesReceived = false; + } else { + printf("error: key not found\n"); + return false; + } + } + +private: + PromptPool() {} + + std::unordered_map *> hub; +}; + +} // namespace xft \ No newline at end of file diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index 0e55648e..aa221c91 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -15,6 +15,11 @@ #include "greedy_search.h" #include "messenger.h" #include "search_utils.h" +#include "prompt.h" + +#include + +using namespace xft; GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config) : decoder(dec), maxLen(config.maxLen), step(0), repetitionPenalty(config.repetitionPenalty) { @@ -26,6 +31,8 @@ GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config) } stopWordsList = {}; stopWordsIndex = {}; + + pool = new ThreadPool(4); } std::vector GreedySearch::syncToken(std::tuple &result) { @@ -38,18 +45,46 @@ std::vector GreedySearch::syncToken(std::tuple &result) this->nextTokens = std::vector(batchSize, 0); if (ctx->ppSize > 1 && ctx->ppRank == 0) { int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; - MPI_Recv(this->nextTokens.data(), batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, - MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // TODO: Error: different scope when dynamic loading so file - // messenger.worldRecvINT32(this->nextTokens.data(), batchSize, predictor_world_rank, predictor_world_rank); + // std::thread feedbackWaitingLastPP([&, predictor_world_rank, this](){ + // TimeLine t("GreedySearch.MPI_Recv"); + // MPI_Recv(this->nextTokens.data(), batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, + // MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // // TODO: Error: different scope when dynamic loading so file + // // messenger.worldRecvINT32(this->nextTokens.data(), batchSize, predictor_world_rank, predictor_world_rank); + // printf("%d\n", this->nextTokens[0]); + // }); + // feedbackWaitingLastPP.detach(); + pool->enqueue([predictor_world_rank, this] { + TimeLine t("GreedySearch.MPI_Recv"); + printf("0: GreedySearch.MPI_Recv.AsyncStart\n"); + int32_t promptID; + MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + printf("%d\n", this->nextTokens[0]); + if (PromptPool::getInstance().has(promptID)) { + TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); + } else { + printf("error: should have promptID\n"); + } + printf("0: GreedySearch.MPI_Recv.AsyncDone %d\n", promptID); + fflush(stdout); + }); } } else { // The last predictor pipeline parallel stage this->nextTokens = this->search(result); if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) { + TimeLine t("GreedySearch.MPI_Send"); int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; + static int32_t promptID = 0; + MPI_Send(&promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); + printf("%d: GreedySearch.MPI_Send %d\n", ctx->ppRank, promptID); + fflush(stdout); + promptID++; // TODO: Error: different scope when dynamic loading so file // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank); } diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h index 607d9737..bf2709a5 100644 --- a/src/searchers/greedy_search.h +++ b/src/searchers/greedy_search.h @@ -18,6 +18,7 @@ #include "messenger.h" #include "timeline.h" #include "transformer_ctx.h" +#include "thread_util.h" class GreedySearch : public AbstractSearcher { public: @@ -40,6 +41,7 @@ class GreedySearch : public AbstractSearcher { std::vector search(std::tuple &result); AbstractDecoder &decoder; + ThreadPool *pool; // Predicted token IDs std::vector nextTokens; diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h index c6826051..1d2d0dfe 100644 --- a/src/utils/thread_util.h +++ b/src/utils/thread_util.h @@ -1,6 +1,14 @@ #pragma once #include +#include +#include +#include +#include +#include +#include +#include + template void parallel_for(int tasks, const Lambda &fn) { #pragma omp parallel for @@ -15,4 +23,61 @@ void parallel_for_dschedule(int tasks, const Lambda &fn) { for (int i = 0; i < tasks; i++) { fn(i); } -} \ No newline at end of file +} + +class ThreadPool { +public: + ThreadPool(size_t numThreads) : stop(false) { + for (size_t i = 0; i < numThreads; ++i) { + workers.emplace_back([this] { + while (true) { + std::function task; + + { + std::unique_lock lock(this->queueMutex); + this->condition.wait(lock, [this] { return this->stop || !this->tasks.empty(); }); + + if (this->stop && this->tasks.empty()) { return; } + + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + }); + } + } + + template + void enqueue(F &&f, Args &&...args) { + { + std::unique_lock lock(queueMutex); + + tasks.emplace([f, args...] { f(args...); }); + } + + condition.notify_one(); + } + + ~ThreadPool() { + { + std::unique_lock lock(queueMutex); + stop = true; + } + + condition.notify_all(); + + for (std::thread &worker : workers) { + worker.join(); + } + } + +private: + std::vector workers; + std::queue> tasks; + + std::mutex queueMutex; + std::condition_variable condition; + bool stop; +}; \ No newline at end of file From 7f23d07d78832d9ea56e02c4ecb8d8c2a4517b0a Mon Sep 17 00:00:00 2001 From: changqi1 Date: Fri, 19 Apr 2024 12:37:34 +0800 Subject: [PATCH 03/32] could run input 3 and 4 in mpi 2 and 4. --- src/common/transformer_ctx.h | 2 ++ src/models/common_decoder.h | 63 ++++++++++++++++++--------------- src/models/models.cpp | 10 +++--- src/searchers/greedy_search.cpp | 44 +++++++++++++---------- 4 files changed, 66 insertions(+), 53 deletions(-) diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h index b01d363c..5f750fe8 100644 --- a/src/common/transformer_ctx.h +++ b/src/common/transformer_ctx.h @@ -66,6 +66,8 @@ struct DecoderContext { int inputSeqLen; // For custom usage int reserved1; + // promptID + int32_t promptID; // Model structure configuration int vocabSize; diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 59df24b6..14931a24 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -274,7 +274,6 @@ class CommonDecoder : public AbstractDecoder { std::tuple forward(int *ids, int64_t *dims, int step, bool logitsAll = false) { // Assume input has been synced with master in higher level. // Assume the 1st step input's shape is [userSideBS][1][seqLen]. - TimeLine t("Decoder.forward"); TimeLine t1("Decoder.embedding"); int userSideBS = dims[0]; @@ -345,8 +344,9 @@ class CommonDecoder : public AbstractDecoder { int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * ctx->hiddenSize; if (TaskWaitingQueue::getInstance().empty()) { - TimeLine t("Decoder.MPI_Recv"); + TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->promptID)); printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank); + fflush(stdout); int32_t promptID; MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); @@ -362,37 +362,36 @@ class CommonDecoder : public AbstractDecoder { printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID); fflush(stdout); } else { - pool->enqueue([=, &embBuf] { - TimeLine t("Decoder.MPI_Recv"); - printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank); - int32_t promptID; - MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // TODO: Error: different scope when dynamic loading so file - // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); - printf("%.6f\n", embBuf[0]); - if (!PromptPool::getInstance().has(promptID)) { - PromptMeta *prompt = new PromptMeta(promptID, 0, batchSize, seqLen, hiddenSize); - prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - PromptPool::getInstance().insert(prompt->promptID, prompt); - } - TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); - printf("%d: Decoder.MPI_Recv.ASyncDone %d\n", ctx->ppRank, promptID); - fflush(stdout); - }); + static bool init = false; + if (init == false) { + init = true; + pool->enqueue([curr_world_rank, prev_world_rank, count, batchSize, seqLen, hiddenSize, pastSeqLen, &embBuf, &ctx, this] { + while (true) { + printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank); + fflush(stdout); + int32_t promptID; + MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // TODO: Error: different scope when dynamic loading so file + // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); + printf("%.6f\n", embBuf[0]); + TimeLine t("Decoder.MPI_Recv"); + if (!PromptPool::getInstance().has(promptID)) { + PromptMeta *prompt = new PromptMeta(promptID, 0, batchSize, seqLen, hiddenSize); + prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); + PromptPool::getInstance().insert(prompt->promptID, prompt); + } + TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); + printf("%d: Decoder.MPI_Recv.ASyncDone %d\n", ctx->ppRank, promptID); + fflush(stdout); + } + }); + } } } #endif if (!TaskWaitingQueue::getInstance().isFull()) { - // for (const auto& prompt : PromptPool::getInstance().getAll()) { - // if (!TaskWaitingQueue::getInstance().isFull()) { - // if (prompt->hiddenStatesReceived) { - // TaskWaitingQueue::getInstance().push(prompt); - // } - // } - // } - if (!InputQueue::getInstance().empty()) { if (!TaskWaitingQueue::getInstance().isFull()) { auto prompt = InputQueue::getInstance().pop(); @@ -403,10 +402,16 @@ class CommonDecoder : public AbstractDecoder { } } + while(TaskWaitingQueue::getInstance().empty()); + PromptMeta *runningTask; - while (!TaskWaitingQueue::getInstance().empty()) { + if (!TaskWaitingQueue::getInstance().empty()) { runningTask = TaskWaitingQueue::getInstance().pop(); + ctx->promptID = runningTask->promptID; + TimeLine t("Decoder.forward." + std::to_string(ctx->promptID)); printf("%d: Decoder.forward\n", ctx->ppRank); + fflush(stdout); + // Decoder: forward from runningTask int layers_per_pp_stage = this->decoders.size(); for (int i = 0; i < layers_per_pp_stage; ++i) { diff --git a/src/models/models.cpp b/src/models/models.cpp index f6568164..2ed3c1cd 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -151,11 +151,11 @@ std::vector Model::generate() { } return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); } else { - static int i = 0; - i++; - if (i == 10) { - isNewInput = true; - } + // static int i = 0; + // i++; + // if (i == 10) { + // isNewInput = true; + // } return searcher->getNextToken(); } } diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index aa221c91..be34ada9 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -42,8 +42,10 @@ std::vector GreedySearch::syncToken(std::tuple &result) // Messenger &messenger = decoder.getMessenger(); if (std::get<0>(result) == nullptr) { // The first embedding pipeline parallel stage + static bool init = false; this->nextTokens = std::vector(batchSize, 0); - if (ctx->ppSize > 1 && ctx->ppRank == 0) { + if (ctx->ppSize > 1 && ctx->ppRank == 0 && init == false) { + init = true; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; // std::thread feedbackWaitingLastPP([&, predictor_world_rank, this](){ // TimeLine t("GreedySearch.MPI_Recv"); @@ -55,36 +57,40 @@ std::vector GreedySearch::syncToken(std::tuple &result) // }); // feedbackWaitingLastPP.detach(); pool->enqueue([predictor_world_rank, this] { - TimeLine t("GreedySearch.MPI_Recv"); - printf("0: GreedySearch.MPI_Recv.AsyncStart\n"); - int32_t promptID; - MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, - MPI_COMM_WORLD, MPI_STATUS_IGNORE); - MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, - MPI_COMM_WORLD, MPI_STATUS_IGNORE); - printf("%d\n", this->nextTokens[0]); - if (PromptPool::getInstance().has(promptID)) { - TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); - } else { - printf("error: should have promptID\n"); + while (true) { + printf("0: GreedySearch.MPI_Recv.AsyncStart\n"); + fflush(stdout); + int32_t promptID; + MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + printf("%d\n", this->nextTokens[0]); + TimeLine t("GreedySearch.MPI_Recv"); + if (PromptPool::getInstance().has(promptID)) { + auto prompt = PromptPool::getInstance().get(promptID); + TaskWaitingQueue::getInstance().push(prompt); + printf("0: GreedySearch.MPI_Recv.AsyncDone %d\n", promptID); + fflush(stdout); + } else { + printf("error: should have promptID\n"); + fflush(stdout); + } } - printf("0: GreedySearch.MPI_Recv.AsyncDone %d\n", promptID); - fflush(stdout); }); } } else { // The last predictor pipeline parallel stage this->nextTokens = this->search(result); if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) { TimeLine t("GreedySearch.MPI_Send"); + fflush(stdout); int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; - static int32_t promptID = 0; - MPI_Send(&promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); + MPI_Send(&ctx->promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); - printf("%d: GreedySearch.MPI_Send %d\n", ctx->ppRank, promptID); + printf("%d: GreedySearch.MPI_Send %d\n", ctx->ppRank, ctx->promptID); fflush(stdout); - promptID++; // TODO: Error: different scope when dynamic loading so file // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank); } From 4f67f28e7fa4c64597670b9f295cfa255eff469d Mon Sep 17 00:00:00 2001 From: changqi1 Date: Fri, 19 Apr 2024 14:28:40 +0800 Subject: [PATCH 04/32] use threadpool singleton --- src/models/common_decoder.h | 5 +-- src/searchers/greedy_search.cpp | 4 +-- src/searchers/greedy_search.h | 1 - src/utils/thread_util.h | 61 ++++++++++++++++----------------- 4 files changed, 31 insertions(+), 40 deletions(-) diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 14931a24..c4f15c16 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -225,7 +225,6 @@ class CommonDecoder : public AbstractDecoder { DecoderContext *ctx = getDecoderContext(layers, hiddenSize, size_per_head, attHeadNum, kvHeadNum, imSize, act, epsilon, vocabSize, embeddingSize, maxPositions, maxPosEmbed, maxSeqLength, useLogN, useNTK, ropeParamsPtr); - pool = new ThreadPool(4); ctx->ResetConfigReader(configPath); @@ -365,7 +364,7 @@ class CommonDecoder : public AbstractDecoder { static bool init = false; if (init == false) { init = true; - pool->enqueue([curr_world_rank, prev_world_rank, count, batchSize, seqLen, hiddenSize, pastSeqLen, &embBuf, &ctx, this] { + ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, count, batchSize, seqLen, hiddenSize, pastSeqLen, &embBuf, &ctx, this] { while (true) { printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank); fflush(stdout); @@ -1039,8 +1038,6 @@ class CommonDecoder : public AbstractDecoder { // Activation buffers (declared as float, but the actual data type may be different) std::shared_ptr> actBuffers; - ThreadPool *pool; - protected: // Components most LLMs may use std::vector decoders; diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index be34ada9..d5b3733e 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -31,8 +31,6 @@ GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config) } stopWordsList = {}; stopWordsIndex = {}; - - pool = new ThreadPool(4); } std::vector GreedySearch::syncToken(std::tuple &result) { @@ -56,7 +54,7 @@ std::vector GreedySearch::syncToken(std::tuple &result) // printf("%d\n", this->nextTokens[0]); // }); // feedbackWaitingLastPP.detach(); - pool->enqueue([predictor_world_rank, this] { + ThreadPool::getInstance().addTask([predictor_world_rank, this] { while (true) { printf("0: GreedySearch.MPI_Recv.AsyncStart\n"); fflush(stdout); diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h index bf2709a5..69784c5e 100644 --- a/src/searchers/greedy_search.h +++ b/src/searchers/greedy_search.h @@ -41,7 +41,6 @@ class GreedySearch : public AbstractSearcher { std::vector search(std::tuple &result); AbstractDecoder &decoder; - ThreadPool *pool; // Predicted token IDs std::vector nextTokens; diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h index 1d2d0dfe..17f49453 100644 --- a/src/utils/thread_util.h +++ b/src/utils/thread_util.h @@ -27,53 +27,50 @@ void parallel_for_dschedule(int tasks, const Lambda &fn) { class ThreadPool { public: - ThreadPool(size_t numThreads) : stop(false) { - for (size_t i = 0; i < numThreads; ++i) { - workers.emplace_back([this] { - while (true) { - std::function task; - - { - std::unique_lock lock(this->queueMutex); - this->condition.wait(lock, [this] { return this->stop || !this->tasks.empty(); }); - - if (this->stop && this->tasks.empty()) { return; } - - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - - task(); - } - }); - } + static ThreadPool& getInstance() { + static ThreadPool instance; + return instance; } - template - void enqueue(F &&f, Args &&...args) { + template + void addTask(F&& f, Args&&... args) { { std::unique_lock lock(queueMutex); - - tasks.emplace([f, args...] { f(args...); }); + tasks.emplace(std::bind(std::forward(f), std::forward(args)...)); } - condition.notify_one(); } ~ThreadPool() { - { - std::unique_lock lock(queueMutex); - stop = true; - } - + stop = true; condition.notify_all(); - - for (std::thread &worker : workers) { + for (std::thread& worker : workers) { worker.join(); } } private: + ThreadPool() : stop(false) { + for (size_t i = 0; i < numThreads; ++i) { + workers.emplace_back([this] { + while (true) { + std::function task; + { + std::unique_lock lock(queueMutex); + condition.wait(lock, [this] { return stop || !tasks.empty(); }); + if (stop && tasks.empty()) { + return; + } + task = std::move(tasks.front()); + tasks.pop(); + } + task(); + } + }); + } + } + + static constexpr size_t numThreads = 1; std::vector workers; std::queue> tasks; From d7a43045b20de4c9d08e80c3f5c3db7c689b4c69 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Fri, 19 Apr 2024 14:32:42 +0800 Subject: [PATCH 05/32] format code --- src/models/prompt.h | 44 ++++++++++++----------------------------- src/utils/thread_util.h | 26 +++++++++++++++++------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/src/models/prompt.h b/src/models/prompt.h index c10b1ab7..e9a044dd 100644 --- a/src/models/prompt.h +++ b/src/models/prompt.h @@ -22,7 +22,8 @@ namespace xft { template class PromptMeta { public: - PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, std::vector _inputs) { + PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, + std::vector _inputs) { promptID = _promptID; tokenID = _tokenID; batchSize = _batchSize; @@ -92,9 +93,7 @@ class InputQueue { return id; } - bool empty() { - return queue.empty(); - } + bool empty() { return queue.empty(); } PromptMeta *pop() { auto buffer = queue.front(); @@ -102,9 +101,7 @@ class InputQueue { return buffer; } - void push(PromptMeta *buffer) { - queue.push(buffer); - } + void push(PromptMeta *buffer) { queue.push(buffer); } private: InputQueue() {} @@ -128,18 +125,13 @@ class TaskWaitingQueue { return instance; } - bool empty() { - return queue.empty(); - } + bool empty() { return queue.empty(); } - int32_t size() { - return queue.size(); - } + int32_t size() { return queue.size(); } bool isFull() { bool full = false; - if (this->size() >= 4) - full = true; + if (this->size() >= 4) full = true; return full; } @@ -149,9 +141,7 @@ class TaskWaitingQueue { return buffer; } - void push(PromptMeta *buffer) { - queue.push(buffer); - } + void push(PromptMeta *buffer) { queue.push(buffer); } private: TaskWaitingQueue() {} @@ -167,13 +157,9 @@ class PromptPool { return instance; } - void insert(int32_t key, PromptMeta *prompt) { - hub[key] = prompt; - } + void insert(int32_t key, PromptMeta *prompt) { hub[key] = prompt; } - bool has(int32_t key) const { - return hub.find(key) != hub.end(); - } + bool has(int32_t key) const { return hub.find(key) != hub.end(); } PromptMeta *get(int32_t key) const { auto it = hub.find(key); @@ -186,21 +172,17 @@ class PromptPool { std::vector *> getAll() { std::vector *> metas; - for (const auto& pair : hub) { + for (const auto &pair : hub) { metas.push_back(pair.second); } return metas; } - void remove(int32_t key) { - hub.erase(key); - } + void remove(int32_t key) { hub.erase(key); } void modify(int32_t oldKey, PromptMeta *newPrompt) { auto it = hub.find(oldKey); - if (it != hub.end()) { - it->second = newPrompt; - } + if (it != hub.end()) { it->second = newPrompt; } } bool isUpdated(int32_t key) const { diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h index 17f49453..b7209384 100644 --- a/src/utils/thread_util.h +++ b/src/utils/thread_util.h @@ -1,3 +1,17 @@ +// Copyright (c) 2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================ #pragma once #include @@ -27,13 +41,13 @@ void parallel_for_dschedule(int tasks, const Lambda &fn) { class ThreadPool { public: - static ThreadPool& getInstance() { + static ThreadPool &getInstance() { static ThreadPool instance; return instance; } - template - void addTask(F&& f, Args&&... args) { + template + void addTask(F &&f, Args &&...args) { { std::unique_lock lock(queueMutex); tasks.emplace(std::bind(std::forward(f), std::forward(args)...)); @@ -44,7 +58,7 @@ class ThreadPool { ~ThreadPool() { stop = true; condition.notify_all(); - for (std::thread& worker : workers) { + for (std::thread &worker : workers) { worker.join(); } } @@ -58,9 +72,7 @@ class ThreadPool { { std::unique_lock lock(queueMutex); condition.wait(lock, [this] { return stop || !tasks.empty(); }); - if (stop && tasks.empty()) { - return; - } + if (stop && tasks.empty()) { return; } task = std::move(tasks.front()); tasks.pop(); } From 70c17cd8b07024042d242804d81fcb05d53fc636 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Fri, 19 Apr 2024 14:40:01 +0800 Subject: [PATCH 06/32] format code --- src/common/transformer_ctx.h | 25 ++++++++++++------------- src/utils/thread_util.h | 6 +++++- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h index 5f750fe8..d6f858af 100644 --- a/src/common/transformer_ctx.h +++ b/src/common/transformer_ctx.h @@ -18,17 +18,15 @@ #include #include #include -#include #include -#include "INIReader.h" #include "allocator.h" +#include + +#include "INIReader.h" #include "my_types.h" #include "simple_mem_pool.h" #include "split_util.h" -#include "float16.h" -#include "singleton.h" -#include "kvcache_manager.h" namespace fs = std::filesystem; @@ -66,8 +64,7 @@ struct DecoderContext { int inputSeqLen; // For custom usage int reserved1; - // promptID - int32_t promptID; + int promptID; // Model structure configuration int vocabSize; @@ -130,10 +127,10 @@ struct DecoderContext { uint64_t size3; public: - DecoderContext(int _layers, int _hiddenSize, int _headSize, int _attHeadNum, int _kvHeadNum, int _imSize, - const std::string &act, float epsilon, int _vocabSize, int _embeddingSize, int _maxPositions, - int _maxPosEmbed, int _maxSeqLength, int _splitIdx, int _splits, int _ppSize = 1, int _ppRank = 0, - RopeParams *_ropeParamsPtr = nullptr, bool _useLogN = true, bool _useNTK = true, int numThreads = 0) + DecoderContext(int _layers, int _hiddenSize, int _headSize, int _attHeadNum, int _kvHeadNum, int _imSize, const std::string &act, + float epsilon, int _vocabSize, int _embeddingSize, int _maxPositions, int _maxPosEmbed, int _maxSeqLength, + int _splitIdx, int _splits, int _ppSize = 1, int _ppRank = 0, RopeParams *_ropeParamsPtr = nullptr, + bool _useLogN = true, bool _useNTK = true, int numThreads = 0) : layers(_layers) , hiddenSize(_hiddenSize) , attHeadSize(_headSize) @@ -155,7 +152,9 @@ struct DecoderContext { , tpSize(_splits) , tpRank(_splitIdx) , epsilon(epsilon) { - if (attHeadNum != 0) { this->attFactor = 1 / sqrtf(attHeadSize); } + if (attHeadNum != 0) { + this->attFactor = 1 / sqrtf(attHeadSize); + } // Set the default value (don't worry, it can be changed later) this->batchSize = 1; @@ -328,4 +327,4 @@ struct DecoderContext { } ~DecoderContext() { free(this->rawBuffer); } -}; +}; \ No newline at end of file diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h index b7209384..75507274 100644 --- a/src/utils/thread_util.h +++ b/src/utils/thread_util.h @@ -23,6 +23,8 @@ #include #include +namespace xft { + template void parallel_for(int tasks, const Lambda &fn) { #pragma omp parallel for @@ -89,4 +91,6 @@ class ThreadPool { std::mutex queueMutex; std::condition_variable condition; bool stop; -}; \ No newline at end of file +}; + +} // namespace xft \ No newline at end of file From 3eca8ef54025c3fec193b9690800285b193c960e Mon Sep 17 00:00:00 2001 From: changqi1 Date: Fri, 19 Apr 2024 15:07:50 +0800 Subject: [PATCH 07/32] remove non-Master thread code --- src/models/common_decoder.h | 70 +++++++++++-------------------------- 1 file changed, 21 insertions(+), 49 deletions(-) diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 7ede3316..5e5067f9 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -274,6 +274,7 @@ class CommonDecoder : public AbstractDecoder { std::tuple forward(int *ids, int64_t *dims, int step, bool logitsAll = false) { // Assume input has been synced with master in higher level. // Assume the 1st step input's shape is [userSideBS][1][seqLen]. + TimeLine t("Decoder.forward"); TimeLine t1("Decoder.embedding"); int userSideBS = dims[0]; @@ -315,7 +316,7 @@ class CommonDecoder : public AbstractDecoder { } AttnInT *embBuf = (AttnInT *)actBuffers->Data(); - MlpOutT *outBuf = (MlpOutT *)(embBuf + batchSize * inputSeqLen * ctx->hiddenSize); + MlpOutT *outBuf = (MlpOutT *)(embBuf + batchSize * inputSeqLen * hiddenSize); // Embedding this->embeddingForward(ids, embBuf, batchSize, inputSeqLen); @@ -326,8 +327,8 @@ class CommonDecoder : public AbstractDecoder { dbg.debugPrint("ids:\n"); dbg.dumpMatrix(ids, batchSize, inputSeqLen, inputSeqLen); dbg.debugPrint( - "embBuf(rows: %d, cols: %d, stride: %d):\n", batchSize * inputSeqLen, ctx->hiddenSize, ctx->hiddenSize); - dbg.dumpMatrix(embBuf, batchSize * inputSeqLen, ctx->hiddenSize, ctx->hiddenSize); + "embBuf(rows: %d, cols: %d, stride: %d):\n", batchSize * inputSeqLen, hiddenSize, hiddenSize); + dbg.dumpMatrix(embBuf, batchSize * inputSeqLen, hiddenSize, hiddenSize); #endif // Prepare attention mask @@ -342,52 +343,23 @@ class CommonDecoder : public AbstractDecoder { if (ctx->ppSize > 1 && ctx->ppRank > 0) { int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank; int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; - int count = batchSize * inputSeqLen * ctx->hiddenSize; - if (TaskWaitingQueue::getInstance().empty()) { - TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->promptID)); - printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank); - fflush(stdout); - int32_t promptID; - MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // TODO: Error: different scope when dynamic loading so file - // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); - printf("%.6f\n", embBuf[0]); - if (!PromptPool::getInstance().has(promptID)) { - PromptMeta *prompt = new PromptMeta(promptID, 0, batchSize, seqLen, hiddenSize); - prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - PromptPool::getInstance().insert(prompt->promptID, prompt); - } - TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); - printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID); - fflush(stdout); - } else { - static bool init = false; - if (init == false) { - init = true; - ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, count, batchSize, seqLen, hiddenSize, pastSeqLen, &embBuf, &ctx, this] { - while (true) { - printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank); - fflush(stdout); - int32_t promptID; - MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // TODO: Error: different scope when dynamic loading so file - // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); - printf("%.6f\n", embBuf[0]); - TimeLine t("Decoder.MPI_Recv"); - if (!PromptPool::getInstance().has(promptID)) { - PromptMeta *prompt = new PromptMeta(promptID, 0, batchSize, seqLen, hiddenSize); - prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - PromptPool::getInstance().insert(prompt->promptID, prompt); - } - TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); - printf("%d: Decoder.MPI_Recv.ASyncDone %d\n", ctx->ppRank, promptID); - fflush(stdout); - } - }); - } + int count = batchSize * inputSeqLen * hiddenSize; + TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->promptID)); + printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank); + fflush(stdout); + int32_t promptID; + MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // TODO: Error: different scope when dynamic loading so file + // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); + if (!PromptPool::getInstance().has(promptID)) { + PromptMeta *prompt = new PromptMeta(promptID, 0, batchSize, seqLen, hiddenSize); + prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); + PromptPool::getInstance().insert(prompt->promptID, prompt); } + TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); + printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID); + fflush(stdout); } #endif @@ -473,7 +445,7 @@ class CommonDecoder : public AbstractDecoder { if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) { TimeLine t("Decoder.MPI_Send"); int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank; - int count = batchSize * inputSeqLen * ctx->hiddenSize; + int count = batchSize * inputSeqLen * hiddenSize; MPI_Send(&runningTask->promptID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file From 2216a4082c2f1d7465ccfc6a0ebeb760c6d259b4 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Fri, 19 Apr 2024 15:36:00 +0800 Subject: [PATCH 08/32] format code --- src/models/common_decoder.h | 38 +++++++++++++++------------------ src/models/models.cpp | 5 ----- src/searchers/greedy_search.cpp | 2 -- src/utils/thread_util.h | 1 - 4 files changed, 17 insertions(+), 29 deletions(-) diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 5e5067f9..c02ed020 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -361,28 +361,26 @@ class CommonDecoder : public AbstractDecoder { printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID); fflush(stdout); } -#endif - if (!TaskWaitingQueue::getInstance().isFull()) { - if (!InputQueue::getInstance().empty()) { - if (!TaskWaitingQueue::getInstance().isFull()) { - auto prompt = InputQueue::getInstance().pop(); - prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - PromptPool::getInstance().insert(prompt->promptID, prompt); - TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(prompt->promptID)); - } + if (!InputQueue::getInstance().empty()) { + if (!TaskWaitingQueue::getInstance().isFull()) { + auto prompt = InputQueue::getInstance().pop(); + prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); + PromptPool::getInstance().insert(prompt->promptID, prompt); + TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(prompt->promptID)); } } - while(TaskWaitingQueue::getInstance().empty()); + while(TaskWaitingQueue::getInstance().empty()); - PromptMeta *runningTask; - if (!TaskWaitingQueue::getInstance().empty()) { - runningTask = TaskWaitingQueue::getInstance().pop(); - ctx->promptID = runningTask->promptID; - TimeLine t("Decoder.forward." + std::to_string(ctx->promptID)); - printf("%d: Decoder.forward\n", ctx->ppRank); - fflush(stdout); + PromptMeta *runningTask; + if (!TaskWaitingQueue::getInstance().empty()) { + runningTask = TaskWaitingQueue::getInstance().pop(); + ctx->promptID = runningTask->promptID; + TimeLine t("Decoder.forward." + std::to_string(ctx->promptID)); + printf("%d: Decoder.forward\n", ctx->ppRank); + fflush(stdout); +#endif // Decoder: forward from runningTask int layers_per_pp_stage = this->decoders.size(); @@ -435,12 +433,10 @@ class CommonDecoder : public AbstractDecoder { } } } - } - // else { - // return std::tuple(nullptr, 0, 0); - // } #ifdef PIPELINE_PARALLEL + } + // If current pipeline stage isn't the end of stage, should send data to next stage and return nullptr if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) { TimeLine t("Decoder.MPI_Send"); diff --git a/src/models/models.cpp b/src/models/models.cpp index ebbe4f7d..80e7c28c 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -157,11 +157,6 @@ std::vector Model::generate() { } return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); } else { - // static int i = 0; - // i++; - // if (i == 10) { - // isNewInput = true; - // } return searcher->getNextToken(); } } diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index d5b3733e..2747cb13 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -17,8 +17,6 @@ #include "search_utils.h" #include "prompt.h" -#include - using namespace xft; GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config) diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h index 75507274..a44b08d7 100644 --- a/src/utils/thread_util.h +++ b/src/utils/thread_util.h @@ -16,7 +16,6 @@ #include #include -#include #include #include #include From 143294b74b5f734a9bbcfd294c2b7685bc393ac7 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Fri, 19 Apr 2024 15:39:41 +0800 Subject: [PATCH 09/32] format code --- src/models/common_decoder.h | 2 +- src/searchers/greedy_search.cpp | 1 + src/searchers/greedy_search.h | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index c02ed020..44252ad6 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -382,7 +382,7 @@ class CommonDecoder : public AbstractDecoder { fflush(stdout); #endif - // Decoder: forward from runningTask + // Decoder: forward int layers_per_pp_stage = this->decoders.size(); for (int i = 0; i < layers_per_pp_stage; ++i) { int workers = this->messenger.getSize(); diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index 2747cb13..ed8c90d4 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -15,6 +15,7 @@ #include "greedy_search.h" #include "messenger.h" #include "search_utils.h" +#include "thread_util.h" #include "prompt.h" using namespace xft; diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h index 69784c5e..607d9737 100644 --- a/src/searchers/greedy_search.h +++ b/src/searchers/greedy_search.h @@ -18,7 +18,6 @@ #include "messenger.h" #include "timeline.h" #include "transformer_ctx.h" -#include "thread_util.h" class GreedySearch : public AbstractSearcher { public: From 700d0c8f7a2aa8764d3f307f54d6a1521343a550 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Fri, 19 Apr 2024 15:43:35 +0800 Subject: [PATCH 10/32] move prompt.h --- src/{models => utils}/prompt.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{models => utils}/prompt.h (100%) diff --git a/src/models/prompt.h b/src/utils/prompt.h similarity index 100% rename from src/models/prompt.h rename to src/utils/prompt.h From 8295d0bfc2e5f7c5f6b0b113356921820840596a Mon Sep 17 00:00:00 2001 From: changqi1 Date: Fri, 19 Apr 2024 17:43:50 +0800 Subject: [PATCH 11/32] Add comments --- src/utils/prompt.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/utils/prompt.h b/src/utils/prompt.h index e9a044dd..05213f82 100644 --- a/src/utils/prompt.h +++ b/src/utils/prompt.h @@ -14,9 +14,29 @@ // ============================================================================ #pragma once +#include #include #include +/* + PromptPool + ┌──────┬──────┬──────┐ + │ │ │ ◄───┼──┬─ PromptMeta + ├──────┼──────┼──────┤ │ + │ │ │ ◄───┼──┘ + └▲─┬─▲─┴──────┴──────┘ + │ │ └───────────────────────────────────┐ + ┌──┬──┬──┬──┐ │ │ ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐ │ + │ │ │ │ ├──┘ └─────►│ │ │ │ │ │ │ │ │ ├─┐ │ + └──┴──┴──┴──┘ └──┴──┴──┴──┴──┴──┴──┴──┴──┘ │ │ + InputQueue TaskWaitingQueue0 │ │ + ┌───────────────────────────────┘ │ + │ ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐ │ + └─►│ │ │ │ │ │ │ │ │ ├───┘ + └──┴──┴──┴──┴──┴──┴──┴──┴──┘ + TaskWaitingQueue1 +*/ + namespace xft { template From 93cbade9bdec945f10bcf8239375b6e6b3b95a95 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 10:51:12 +0800 Subject: [PATCH 12/32] input 2 request --- examples/cpp/example.cpp | 2 +- src/models/models.cpp | 2 +- src/searchers/greedy_search.cpp | 28 +++++----------------------- src/searchers/greedy_search.h | 1 + src/utils/prompt.h | 29 ++--------------------------- 5 files changed, 10 insertions(+), 52 deletions(-) diff --git a/examples/cpp/example.cpp b/examples/cpp/example.cpp index f6a68ce7..34cf4eb0 100644 --- a/examples/cpp/example.cpp +++ b/examples/cpp/example.cpp @@ -467,7 +467,7 @@ int main(int argc, char **argv) { } auto result = model.finalize(); - if (true) { + if (isMaster) { std::cout << "\n[INFO] Final output is: " << std::endl; std::vector sent = tokenizer->batchDecode(result, batchSize); for (auto str : sent) { diff --git a/src/models/models.cpp b/src/models/models.cpp index 80e7c28c..0d43c765 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -145,7 +145,7 @@ std::vector Model::generate() { if (isNewInput) { static int i = 0; i++; - if (i > 3) { + if (i > 1) { isNewInput = false; i = 0; } diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index ed8c90d4..5bd5ec0c 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -39,38 +39,23 @@ std::vector GreedySearch::syncToken(std::tuple &result) // Messenger &messenger = decoder.getMessenger(); if (std::get<0>(result) == nullptr) { // The first embedding pipeline parallel stage - static bool init = false; this->nextTokens = std::vector(batchSize, 0); - if (ctx->ppSize > 1 && ctx->ppRank == 0 && init == false) { - init = true; + if (ctx->ppSize > 1 && ctx->ppRank == 0 && enabledBackgroundSync == false) { + enabledBackgroundSync = true; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; - // std::thread feedbackWaitingLastPP([&, predictor_world_rank, this](){ - // TimeLine t("GreedySearch.MPI_Recv"); - // MPI_Recv(this->nextTokens.data(), batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, - // MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // // TODO: Error: different scope when dynamic loading so file - // // messenger.worldRecvINT32(this->nextTokens.data(), batchSize, predictor_world_rank, predictor_world_rank); - // printf("%d\n", this->nextTokens[0]); - // }); - // feedbackWaitingLastPP.detach(); ThreadPool::getInstance().addTask([predictor_world_rank, this] { while (true) { - printf("0: GreedySearch.MPI_Recv.AsyncStart\n"); - fflush(stdout); int32_t promptID; MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(promptID)); MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - printf("%d\n", this->nextTokens[0]); - TimeLine t("GreedySearch.MPI_Recv"); if (PromptPool::getInstance().has(promptID)) { auto prompt = PromptPool::getInstance().get(promptID); TaskWaitingQueue::getInstance().push(prompt); - printf("0: GreedySearch.MPI_Recv.AsyncDone %d\n", promptID); - fflush(stdout); } else { - printf("error: should have promptID\n"); + printf("Error: should have promptID\n"); fflush(stdout); } } @@ -79,15 +64,12 @@ std::vector GreedySearch::syncToken(std::tuple &result) } else { // The last predictor pipeline parallel stage this->nextTokens = this->search(result); if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) { - TimeLine t("GreedySearch.MPI_Send"); - fflush(stdout); + TimeLine t("GreedySearch.MPI_Send.prompt" + std::to_string(ctx->promptID)); int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; MPI_Send(&ctx->promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); - printf("%d: GreedySearch.MPI_Send %d\n", ctx->ppRank, ctx->promptID); - fflush(stdout); // TODO: Error: different scope when dynamic loading so file // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank); } diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h index 607d9737..5b4ec164 100644 --- a/src/searchers/greedy_search.h +++ b/src/searchers/greedy_search.h @@ -47,6 +47,7 @@ class GreedySearch : public AbstractSearcher { std::vector> cachedRepetVec; std::vector doneBatch; + bool enabledBackgroundSync; int batchSize; int step; int curLen; diff --git a/src/utils/prompt.h b/src/utils/prompt.h index 05213f82..753e4bd1 100644 --- a/src/utils/prompt.h +++ b/src/utils/prompt.h @@ -190,38 +190,13 @@ class PromptPool { } } - std::vector *> getAll() { - std::vector *> metas; - for (const auto &pair : hub) { - metas.push_back(pair.second); - } - return metas; - } - void remove(int32_t key) { hub.erase(key); } void modify(int32_t oldKey, PromptMeta *newPrompt) { auto it = hub.find(oldKey); - if (it != hub.end()) { it->second = newPrompt; } - } - - bool isUpdated(int32_t key) const { - auto it = hub.find(key); if (it != hub.end()) { - return it->second.hiddenStatesReceived; - } else { - printf("error: key not found\n"); - return false; - } - } - - bool setOld(int32_t key) { - auto it = hub.find(key); - if (it != hub.end()) { - it->second.hiddenStatesReceived = false; - } else { - printf("error: key not found\n"); - return false; + delete it->second; + it->second = newPrompt; } } From dadc69261abdca841059dde5182e7e69230a1ec4 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 10:53:28 +0800 Subject: [PATCH 13/32] format code --- src/searchers/greedy_search.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index 5bd5ec0c..ca21a9d7 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -14,9 +14,9 @@ // ============================================================================ #include "greedy_search.h" #include "messenger.h" +#include "prompt.h" #include "search_utils.h" #include "thread_util.h" -#include "prompt.h" using namespace xft; @@ -46,11 +46,11 @@ std::vector GreedySearch::syncToken(std::tuple &result) ThreadPool::getInstance().addTask([predictor_world_rank, this] { while (true) { int32_t promptID; - MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, - MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(promptID)); - MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, - MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, + predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); if (PromptPool::getInstance().has(promptID)) { auto prompt = PromptPool::getInstance().get(promptID); TaskWaitingQueue::getInstance().push(prompt); From 0d0c74b162867d0b8f0af92f39a6498fd883b643 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 13:19:57 +0800 Subject: [PATCH 14/32] modify promptID to sampleID. --- src/common/transformer_ctx.h | 2 +- src/models/common_decoder.h | 31 +++---- src/models/models.cpp | 5 +- src/searchers/greedy_search.cpp | 16 ++-- src/utils/prompt.h | 151 ++++++++++++++++---------------- 5 files changed, 103 insertions(+), 102 deletions(-) diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h index cac52aa0..67611e2b 100644 --- a/src/common/transformer_ctx.h +++ b/src/common/transformer_ctx.h @@ -64,7 +64,7 @@ struct DecoderContext { int inputSeqLen; // For custom usage int reserved1; - int promptID; + int sampleID; // Model structure configuration int vocabSize; diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 44252ad6..87ccdfd6 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -344,21 +344,21 @@ class CommonDecoder : public AbstractDecoder { int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank; int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * hiddenSize; - TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->promptID)); + TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->sampleID)); printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank); fflush(stdout); - int32_t promptID; - MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + int32_t sampleID; + MPI_Recv(&sampleID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); - if (!PromptPool::getInstance().has(promptID)) { - PromptMeta *prompt = new PromptMeta(promptID, 0, batchSize, seqLen, hiddenSize); + if (!SamplePool::getInstance().has(sampleID)) { + SampleMeta *prompt = new SampleMeta(sampleID, seqLen, hiddenSize); prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - PromptPool::getInstance().insert(prompt->promptID, prompt); + SamplePool::getInstance().insert(prompt->getSampleID(), prompt); } - TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(promptID)); - printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID); + TaskWaitingQueue::getInstance().push(SamplePool::getInstance().get(sampleID)); + printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, sampleID); fflush(stdout); } @@ -366,18 +366,18 @@ class CommonDecoder : public AbstractDecoder { if (!TaskWaitingQueue::getInstance().isFull()) { auto prompt = InputQueue::getInstance().pop(); prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - PromptPool::getInstance().insert(prompt->promptID, prompt); - TaskWaitingQueue::getInstance().push(PromptPool::getInstance().get(prompt->promptID)); + SamplePool::getInstance().insert(prompt->getSampleID(), prompt); + TaskWaitingQueue::getInstance().push(SamplePool::getInstance().get(prompt->getSampleID())); } } while(TaskWaitingQueue::getInstance().empty()); - PromptMeta *runningTask; + SampleMeta *runningTask; if (!TaskWaitingQueue::getInstance().empty()) { runningTask = TaskWaitingQueue::getInstance().pop(); - ctx->promptID = runningTask->promptID; - TimeLine t("Decoder.forward." + std::to_string(ctx->promptID)); + ctx->sampleID = runningTask->getSampleID(); + TimeLine t("Decoder.forward." + std::to_string(ctx->sampleID)); printf("%d: Decoder.forward\n", ctx->ppRank); fflush(stdout); #endif @@ -442,11 +442,12 @@ class CommonDecoder : public AbstractDecoder { TimeLine t("Decoder.MPI_Send"); int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * hiddenSize; - MPI_Send(&runningTask->promptID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); + int32_t sampleID = runningTask->getSampleID(); + MPI_Send(&sampleID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank); - printf("%d: Decoder.MPI_Send %d\n", ctx->ppRank, runningTask->promptID); + printf("%d: Decoder.MPI_Send %d\n", ctx->ppRank, sampleID); fflush(stdout); return std::tuple(nullptr, 0, 0); } diff --git a/src/models/models.cpp b/src/models/models.cpp index 0d43c765..da76348e 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -151,9 +151,8 @@ std::vector Model::generate() { } // TODO: Create it when request input if (this->isMaster()) { - int promptID = InputQueue::getInstance().createPromptID(); - int tokenID = InputQueue::getInstance().createTokenID(); - InputQueue::getInstance().push(new PromptMeta(promptID, tokenID, batchSize, seqLen, inputIds)); + int sampleID = InputQueue::getInstance().createSampleID(); + InputQueue::getInstance().push(new SampleMeta(sampleID, seqLen, inputIds)); } return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); } else { diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index ca21a9d7..3d718bd0 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -45,17 +45,17 @@ std::vector GreedySearch::syncToken(std::tuple &result) int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; ThreadPool::getInstance().addTask([predictor_world_rank, this] { while (true) { - int32_t promptID; - MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, + int32_t sampleID; + MPI_Recv(&sampleID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(promptID)); + TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(sampleID)); MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - if (PromptPool::getInstance().has(promptID)) { - auto prompt = PromptPool::getInstance().get(promptID); + if (SamplePool::getInstance().has(sampleID)) { + auto prompt = SamplePool::getInstance().get(sampleID); TaskWaitingQueue::getInstance().push(prompt); } else { - printf("Error: should have promptID\n"); + printf("Error: should have sampleID\n"); fflush(stdout); } } @@ -64,10 +64,10 @@ std::vector GreedySearch::syncToken(std::tuple &result) } else { // The last predictor pipeline parallel stage this->nextTokens = this->search(result); if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) { - TimeLine t("GreedySearch.MPI_Send.prompt" + std::to_string(ctx->promptID)); + TimeLine t("GreedySearch.MPI_Send.prompt" + std::to_string(ctx->sampleID)); int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; - MPI_Send(&ctx->promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); + MPI_Send(&ctx->sampleID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file diff --git a/src/utils/prompt.h b/src/utils/prompt.h index 753e4bd1..2a25c9d3 100644 --- a/src/utils/prompt.h +++ b/src/utils/prompt.h @@ -19,72 +19,82 @@ #include /* - PromptPool - ┌──────┬──────┬──────┐ - │ │ │ ◄───┼──┬─ PromptMeta - ├──────┼──────┼──────┤ │ - │ │ │ ◄───┼──┘ - └▲─┬─▲─┴──────┴──────┘ - │ │ └───────────────────────────────────┐ - ┌──┬──┬──┬──┐ │ │ ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐ │ - │ │ │ │ ├──┘ └─────►│ │ │ │ │ │ │ │ │ ├─┐ │ - └──┴──┴──┴──┘ └──┴──┴──┴──┴──┴──┴──┴──┴──┘ │ │ - InputQueue TaskWaitingQueue0 │ │ - ┌───────────────────────────────┘ │ - │ ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐ │ - └─►│ │ │ │ │ │ │ │ │ ├───┘ - └──┴──┴──┴──┴──┴──┴──┴──┴──┘ - TaskWaitingQueue1 + SamplePool + ┌──────┬──────┬──────┐ + │ │ │ ◄───┼──┬─ SampleMeta + ├──────┼──────┼──────┤ │ + BatchInputs │ │ │ ◄───┼──┘ + │ └▲─┬─▲─┴──────┴──────┘ + │ │ │ └───────────────────────────────────┐ + ▼ ┌──┬──┬──┬──┐ │ │ ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐ │ + Input ─►│ │ │ │ ├──┘ └─────►│ │ │ │ │ │ │ │ │ ├─┐ │ + └──┴──┴──┴──┘ └──┴──┴──┴──┴──┴──┴──┴──┴──┘ │ │ + InputQueue TaskWaitingQueue0 │ │ + ┌───────────────────────────────┘ │ + │ ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐ │ + └─►│ │ │ │ │ │ │ │ │ ├───┘ + └──┴──┴──┴──┴──┴──┴──┴──┴──┘ + TaskWaitingQueue1 */ namespace xft { -template -class PromptMeta { +// The Sample is one of batch inputs +template +class SampleMeta { public: - PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, - std::vector _inputs) { - promptID = _promptID; - tokenID = _tokenID; - batchSize = _batchSize; - inputSeqLen = _inputSeqLen; - inputs = _inputs; - hiddenStatesReceived = false; + SampleMeta(int32_t _sampleID, int32_t _inputSeqLen, std::vector _inputTokens) + : sampleID(_sampleID), inputSeqLen(_inputSeqLen), bePrefill(true) { + inputTokens.resize(_inputSeqLen); + inputTokens.assign(_inputTokens.begin(), _inputTokens.end()); + pastTokens.resize(_inputSeqLen); } - PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, int32_t _hiddenSize) { - promptID = _promptID; - tokenID = _tokenID; - batchSize = _batchSize; - inputSeqLen = _inputSeqLen; - hiddenSize = _hiddenSize; - hiddenStatesReceived = false; - hiddenStates.Resize(batchSize * inputSeqLen, hiddenSize, hiddenSize); + SampleMeta(int32_t _sampleID, int32_t _inputSeqLen, int32_t _hiddenSize) + : sampleID(_sampleID), inputSeqLen(_inputSeqLen), hiddenSize(_hiddenSize), bePrefill(true) { + inputTokens.resize(_inputSeqLen); + pastTokens.resize(_inputSeqLen); } void ResetKVCache(int32_t _hiddenSize, int32_t _pastSeqLen, int32_t _layerIdx, void *_hiddenStates, void *_kvm) { hiddenSize = _hiddenSize; pastSeqLen = _pastSeqLen; layerIdx = _layerIdx; - hiddenStates.Resize(batchSize * inputSeqLen, hiddenSize, hiddenSize); - memcpy(hiddenStates.Data(), _hiddenStates, sizeof(AttnInT) * batchSize * inputSeqLen * hiddenSize); + hiddenStates.Resize(inputSeqLen, hiddenSize, hiddenSize); + memcpy(hiddenStates.Data(), _hiddenStates, sizeof(T) * inputSeqLen * hiddenSize); kvm = _kvm; } - int32_t promptID; - int32_t tokenID; - bool hiddenStatesReceived; + int32_t getSampleID() const { return sampleID; } + + // Get the input tokens in sample + int32_t *getInputTokens() const { return inputTokens.data(); } + + // For generated tokens + void addGeneratedToken(int32_t token) { pastTokens.push_back(token); } + + int32_t getLatestToken() const { return pastTokens.back(); } + + int32_t *getTotalTokens() const { return pastTokens.data(); } + + bool isPrefill() const { return bePrefill; } + + void setPrefill(bool _bePrefill) { bePrefill = _bePrefill; } private: - int32_t batchSize; + int32_t sampleID; int32_t inputSeqLen; int32_t hiddenSize; int32_t pastSeqLen; - std::vector inputs; - std::vector outputs; + std::vector inputTokens; + std::vector pastTokens; // generated tokens + + // Indicates whether the sample is in the prefill phase + bool bePrefill; + int32_t layerIdx; - hpj::Matrix hiddenStates; - void *kvm; //KVCacheManager + hpj::Matrix hiddenStates; + void *kvm; // KVCacheManager }; template @@ -95,44 +105,35 @@ class InputQueue { return instance; } - int32_t createPromptID() { - int32_t id = promptID++; - if (id > 1000) { - promptID = 0; - id = promptID++; - } - return id; - } - - int32_t createTokenID() { - int32_t id = tokenID++; - if (id > 1000) { - tokenID = 0; - id = tokenID++; + int32_t createSampleID() { + int32_t id = sampleID++; + if (id >= 10 * 1024) { + sampleID = 0; + id = sampleID++; } return id; } bool empty() { return queue.empty(); } - PromptMeta *pop() { + SampleMeta *pop() { auto buffer = queue.front(); queue.pop(); return buffer; } - void push(PromptMeta *buffer) { queue.push(buffer); } + void push(SampleMeta *buffer) { queue.push(buffer); } private: InputQueue() {} - static int32_t promptID; + static int32_t sampleID; static int32_t tokenID; - std::queue *> queue; + std::queue *> queue; }; template -int32_t InputQueue::promptID = 0; +int32_t InputQueue::sampleID = 0; template int32_t InputQueue::tokenID = 0; @@ -155,33 +156,33 @@ class TaskWaitingQueue { return full; } - PromptMeta *pop() { + SampleMeta *pop() { auto buffer = queue.front(); queue.pop(); return buffer; } - void push(PromptMeta *buffer) { queue.push(buffer); } + void push(SampleMeta *buffer) { queue.push(buffer); } private: TaskWaitingQueue() {} - std::queue *> queue; + std::queue *> queue; }; template -class PromptPool { +class SamplePool { public: - static PromptPool &getInstance() { - static PromptPool instance; + static SamplePool &getInstance() { + static SamplePool instance; return instance; } - void insert(int32_t key, PromptMeta *prompt) { hub[key] = prompt; } + void insert(int32_t key, SampleMeta *sample) { hub[key] = sample; } bool has(int32_t key) const { return hub.find(key) != hub.end(); } - PromptMeta *get(int32_t key) const { + SampleMeta *get(int32_t key) const { auto it = hub.find(key); if (it != hub.end()) { return it->second; @@ -192,18 +193,18 @@ class PromptPool { void remove(int32_t key) { hub.erase(key); } - void modify(int32_t oldKey, PromptMeta *newPrompt) { + void modify(int32_t oldKey, SampleMeta *newSample) { auto it = hub.find(oldKey); if (it != hub.end()) { delete it->second; - it->second = newPrompt; + it->second = newSample; } } private: - PromptPool() {} + SamplePool() {} - std::unordered_map *> hub; + std::unordered_map *> hub; }; } // namespace xft \ No newline at end of file From 291bd28338c3fa6bee06b4b9306f9bd43efc7216 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 13:21:56 +0800 Subject: [PATCH 15/32] rename filename --- src/{utils/prompt.h => common/sample_info.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{utils/prompt.h => common/sample_info.h} (100%) diff --git a/src/utils/prompt.h b/src/common/sample_info.h similarity index 100% rename from src/utils/prompt.h rename to src/common/sample_info.h From 6ffe206429e58dd882fb228a42a4128cd82586bb Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 13:25:54 +0800 Subject: [PATCH 16/32] format code --- src/models/common_decoder.h | 8 -------- src/models/models.cpp | 12 +----------- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 87ccdfd6..20f2b8e5 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -345,8 +345,6 @@ class CommonDecoder : public AbstractDecoder { int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * hiddenSize; TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->sampleID)); - printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank); - fflush(stdout); int32_t sampleID; MPI_Recv(&sampleID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); @@ -358,8 +356,6 @@ class CommonDecoder : public AbstractDecoder { SamplePool::getInstance().insert(prompt->getSampleID(), prompt); } TaskWaitingQueue::getInstance().push(SamplePool::getInstance().get(sampleID)); - printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, sampleID); - fflush(stdout); } if (!InputQueue::getInstance().empty()) { @@ -378,8 +374,6 @@ class CommonDecoder : public AbstractDecoder { runningTask = TaskWaitingQueue::getInstance().pop(); ctx->sampleID = runningTask->getSampleID(); TimeLine t("Decoder.forward." + std::to_string(ctx->sampleID)); - printf("%d: Decoder.forward\n", ctx->ppRank); - fflush(stdout); #endif // Decoder: forward @@ -447,8 +441,6 @@ class CommonDecoder : public AbstractDecoder { MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank); - printf("%d: Decoder.MPI_Send %d\n", ctx->ppRank, sampleID); - fflush(stdout); return std::tuple(nullptr, 0, 0); } #endif diff --git a/src/models/models.cpp b/src/models/models.cpp index da76348e..45a1e51b 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -143,17 +143,7 @@ std::vector Model::generate() { } if (isNewInput) { - static int i = 0; - i++; - if (i > 1) { - isNewInput = false; - i = 0; - } - // TODO: Create it when request input - if (this->isMaster()) { - int sampleID = InputQueue::getInstance().createSampleID(); - InputQueue::getInstance().push(new SampleMeta(sampleID, seqLen, inputIds)); - } + isNewInput = false; return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); } else { return searcher->getNextToken(); From 6051c842af9606a6af748afc0b1cd7bb31ce6333 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 13:31:30 +0800 Subject: [PATCH 17/32] format code --- src/models/common_decoder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 20f2b8e5..a0a9fc35 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -344,9 +344,9 @@ class CommonDecoder : public AbstractDecoder { int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank; int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * hiddenSize; - TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->sampleID)); int32_t sampleID; MPI_Recv(&sampleID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + TimeLine t("Decoder.MPI_Recv." + std::to_string(sampleID)); MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); From d4bd7dcecb8bfdce0a02e4d8e77c43cc5cf1c5bf Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 13:34:28 +0800 Subject: [PATCH 18/32] format code --- src/models/common_decoder.h | 2 +- src/models/models.cpp | 2 +- src/searchers/greedy_search.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index a0a9fc35..02d41cb5 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -35,7 +35,7 @@ #include "transformer_ctx.h" #include "transpose_util.h" #include "weight_util.h" -#include "prompt.h" +#include "sample_info.h" using namespace xft; diff --git a/src/models/models.cpp b/src/models/models.cpp index 45a1e51b..d7eaa4ec 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -32,7 +32,7 @@ #include "searcher.h" #include "timeline.h" #include "yarn_llama.h" -#include "prompt.h" +#include "sample_info.h" namespace xft { enum class GenerationMode { GREEDY_SEARCH, BEAM_SEARCH, SAMPLE }; diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index 3d718bd0..75ba689e 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -14,7 +14,7 @@ // ============================================================================ #include "greedy_search.h" #include "messenger.h" -#include "prompt.h" +#include "sample_info.h" #include "search_utils.h" #include "thread_util.h" From f7708afa365b135bc649000967003aebaada08b2 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 14:35:41 +0800 Subject: [PATCH 19/32] update samplePool --- src/common/sample_info.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/common/sample_info.h b/src/common/sample_info.h index 2a25c9d3..e0b789db 100644 --- a/src/common/sample_info.h +++ b/src/common/sample_info.h @@ -178,7 +178,23 @@ class SamplePool { return instance; } - void insert(int32_t key, SampleMeta *sample) { hub[key] = sample; } + bool add(int32_t key, SampleMeta *sample) { + bool exist = has(key); + if (!exist) { + hub[key] = sample; + } + + return exist; + } + + void forceAdd(int32_t key, SampleMeta *sample) { + auto it = hub.find(key); + if (it != hub.end()) { + delete it->second; + } + + hub[key] = sample; + } bool has(int32_t key) const { return hub.find(key) != hub.end(); } @@ -191,9 +207,13 @@ class SamplePool { } } - void remove(int32_t key) { hub.erase(key); } + void remove(int32_t key) { + if (has(key)) { + hub.erase(key); + } + } - void modify(int32_t oldKey, SampleMeta *newSample) { + void replace(int32_t oldKey, SampleMeta *newSample) { auto it = hub.find(oldKey); if (it != hub.end()) { delete it->second; From cf4ff411c90d488f892e3b584b28ec87a937b5d5 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 14:38:37 +0800 Subject: [PATCH 20/32] update samplePool --- src/common/sample_info.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/common/sample_info.h b/src/common/sample_info.h index e0b789db..6d383a74 100644 --- a/src/common/sample_info.h +++ b/src/common/sample_info.h @@ -213,12 +213,16 @@ class SamplePool { } } - void replace(int32_t oldKey, SampleMeta *newSample) { + bool replace(int32_t oldKey, SampleMeta *newSample) { + bool ret = false; auto it = hub.find(oldKey); if (it != hub.end()) { delete it->second; it->second = newSample; + ret = true; } + + return ret; } private: From e96c1a83cc5190e1eb7db3138c184e1d2e601c5d Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 15:01:35 +0800 Subject: [PATCH 21/32] update code --- src/common/sample_info.h | 14 ++++---------- src/models/common_decoder.h | 14 +++++++------- src/utils/environment.h | 21 +++++++++++++++++++++ 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/common/sample_info.h b/src/common/sample_info.h index 6d383a74..e9322900 100644 --- a/src/common/sample_info.h +++ b/src/common/sample_info.h @@ -152,7 +152,7 @@ class TaskWaitingQueue { bool isFull() { bool full = false; - if (this->size() >= 4) full = true; + if (this->size() >= Env::getInstance().getMaxRequestNum()) { full = true; } return full; } @@ -180,18 +180,14 @@ class SamplePool { bool add(int32_t key, SampleMeta *sample) { bool exist = has(key); - if (!exist) { - hub[key] = sample; - } + if (!exist) { hub[key] = sample; } return exist; } void forceAdd(int32_t key, SampleMeta *sample) { auto it = hub.find(key); - if (it != hub.end()) { - delete it->second; - } + if (it != hub.end()) { delete it->second; } hub[key] = sample; } @@ -208,9 +204,7 @@ class SamplePool { } void remove(int32_t key) { - if (has(key)) { - hub.erase(key); - } + if (has(key)) { hub.erase(key); } } bool replace(int32_t oldKey, SampleMeta *newSample) { diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 02d41cb5..4cdb1cef 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -351,19 +351,19 @@ class CommonDecoder : public AbstractDecoder { // TODO: Error: different scope when dynamic loading so file // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); if (!SamplePool::getInstance().has(sampleID)) { - SampleMeta *prompt = new SampleMeta(sampleID, seqLen, hiddenSize); - prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - SamplePool::getInstance().insert(prompt->getSampleID(), prompt); + SampleMeta *sample = new SampleMeta(sampleID, seqLen, hiddenSize); + sample->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); + SamplePool::getInstance().forceAdd(sample->getSampleID(), sample); } TaskWaitingQueue::getInstance().push(SamplePool::getInstance().get(sampleID)); } if (!InputQueue::getInstance().empty()) { if (!TaskWaitingQueue::getInstance().isFull()) { - auto prompt = InputQueue::getInstance().pop(); - prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - SamplePool::getInstance().insert(prompt->getSampleID(), prompt); - TaskWaitingQueue::getInstance().push(SamplePool::getInstance().get(prompt->getSampleID())); + auto sample = InputQueue::getInstance().pop(); + sample->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); + SamplePool::getInstance().forceAdd(sample->getSampleID(), sample); + TaskWaitingQueue::getInstance().push(SamplePool::getInstance().get(sample->getSampleID())); } } diff --git a/src/utils/environment.h b/src/utils/environment.h index e6d94338..2630ff74 100644 --- a/src/utils/environment.h +++ b/src/utils/environment.h @@ -41,6 +41,9 @@ class Env { // get Engine Kind and Index int getPipelineStage() { return pipelineStageValue; } + // get Engine Kind and Index + int getMaxRequestNum() { return maxRequestNumValue; } + // get AMX Threshold M int getAMXThresholdM() { return AMXThresholdMValue; } @@ -73,6 +76,9 @@ class Env { // init Pipeline Parallel initPipelineStage(); + // init Max request number + initMaxRequestNum(); + // init Engine Kind and Index initEngineKindIndex(); @@ -173,6 +179,21 @@ class Env { } } + // Max request number + int maxRequestNumValue = 1; + void initMaxRequestNum() { + char *xft_max_request_num_value = getenv("XFT_MAX_REQUEST_NUM"); + if (xft_max_request_num_value != NULL) { + int value = atoi(xft_max_request_num_value); + if (value >= 1) + maxRequestNumValue = value; + else + printf("[ERROR] XFT_MAX_REQUEST_NUM value need to be greater than 0.\n"); + } else { + maxRequestNumValue = 1; + } + } + // AMX Threshold M int AMXThresholdMValue = 1; void initAMXThresholdM() { From dc1a9e55c249d2b69be92fb4da4448782c4043d7 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 19:40:27 +0800 Subject: [PATCH 22/32] sampleID to seqenceID --- src/common/{sample_info.h => sequence.h} | 147 ++++++++++++----------- src/common/transformer_ctx.h | 2 +- src/models/common_decoder.h | 50 ++++---- src/models/models.cpp | 2 +- src/searchers/greedy_search.cpp | 20 +-- 5 files changed, 115 insertions(+), 106 deletions(-) rename src/common/{sample_info.h => sequence.h} (59%) diff --git a/src/common/sample_info.h b/src/common/sequence.h similarity index 59% rename from src/common/sample_info.h rename to src/common/sequence.h index e9322900..cef2f895 100644 --- a/src/common/sample_info.h +++ b/src/common/sequence.h @@ -19,9 +19,9 @@ #include /* - SamplePool + SequencePool ┌──────┬──────┬──────┐ - │ │ │ ◄───┼──┬─ SampleMeta + │ │ │ ◄───┼──┬─ SequenceMeta ├──────┼──────┼──────┤ │ BatchInputs │ │ │ ◄───┼──┘ │ └▲─┬─▲─┴──────┴──────┘ @@ -39,65 +39,77 @@ namespace xft { -// The Sample is one of batch inputs -template -class SampleMeta { +// The Sequence is one sequence of batch inputs and includes the generated tokens. +class SequenceMeta { public: - SampleMeta(int32_t _sampleID, int32_t _inputSeqLen, std::vector _inputTokens) - : sampleID(_sampleID), inputSeqLen(_inputSeqLen), bePrefill(true) { + SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector _inputTokens) + : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), step(0) { inputTokens.resize(_inputSeqLen); inputTokens.assign(_inputTokens.begin(), _inputTokens.end()); - pastTokens.resize(_inputSeqLen); + nextTokens.resize(_inputSeqLen); } - SampleMeta(int32_t _sampleID, int32_t _inputSeqLen, int32_t _hiddenSize) - : sampleID(_sampleID), inputSeqLen(_inputSeqLen), hiddenSize(_hiddenSize), bePrefill(true) { - inputTokens.resize(_inputSeqLen); - pastTokens.resize(_inputSeqLen); + SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen) + : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), inputTokens(_inputSeqLen, 0), step(0) { + nextTokens.resize(_inputSeqLen); } - void ResetKVCache(int32_t _hiddenSize, int32_t _pastSeqLen, int32_t _layerIdx, void *_hiddenStates, void *_kvm) { - hiddenSize = _hiddenSize; - pastSeqLen = _pastSeqLen; - layerIdx = _layerIdx; - hiddenStates.Resize(inputSeqLen, hiddenSize, hiddenSize); - memcpy(hiddenStates.Data(), _hiddenStates, sizeof(T) * inputSeqLen * hiddenSize); - kvm = _kvm; - } + ~SequenceMeta() {} + + int32_t getSequenceID() const { return sequenceID; } + + // Get the input tokens in sequence + int32_t getInputSeqLen() const { return inputSeqLen; } - int32_t getSampleID() const { return sampleID; } + const int32_t *getInputTokens() const { return inputTokens.data(); } - // Get the input tokens in sample - int32_t *getInputTokens() const { return inputTokens.data(); } + int32_t getPastSeqLen() const { return pastSeqLen; } - // For generated tokens - void addGeneratedToken(int32_t token) { pastTokens.push_back(token); } + void setPastSeqLen(int32_t _pastSeqLen) { pastSeqLen = _pastSeqLen; } - int32_t getLatestToken() const { return pastTokens.back(); } + // For next tokens + void addNextToken(int32_t token) { nextTokens.push_back(token); } - int32_t *getTotalTokens() const { return pastTokens.data(); } + int32_t getLatestToken() const { return nextTokens.back(); } - bool isPrefill() const { return bePrefill; } + const int32_t *getTotalTokens() const { return nextTokens.data(); } - void setPrefill(bool _bePrefill) { bePrefill = _bePrefill; } + int32_t getStep() const { return step; } + + void setStep(int32_t _step) { step = _step; } private: - int32_t sampleID; + int32_t sequenceID; int32_t inputSeqLen; - int32_t hiddenSize; int32_t pastSeqLen; - std::vector inputTokens; - std::vector pastTokens; // generated tokens + std::vector inputTokens; // input tokens + next tokens + std::vector nextTokens; // next tokens - // Indicates whether the sample is in the prefill phase - bool bePrefill; + // Indicates whether the sequence is in the prefill phase + int32_t step; + +#ifdef PIPELINE_PARALLEL +public: + template + void allocBuffer(int32_t hiddenSize, void *_hiddenStates) { + hiddenStates = xft::alloc(sizeof(T) * getInputSeqLen() * hiddenSize); + memcpy(hiddenStates, _hiddenStates, sizeof(T) * getInputSeqLen() * hiddenSize); + } - int32_t layerIdx; - hpj::Matrix hiddenStates; - void *kvm; // KVCacheManager +private: + int32_t hiddenSize; + void* hiddenStates; +#endif }; -template +// For beam searcher +// class SequenceGroupMeta { +// public: +// SequenceGroupMeta(int32_t num_beams) { sequence = new SequenceMeta[num_beams]; } + +// SequenceMeta *sequence; +// }; + class InputQueue { public: static InputQueue &getInstance() { @@ -105,40 +117,33 @@ class InputQueue { return instance; } - int32_t createSampleID() { - int32_t id = sampleID++; + int32_t createSequenceID() { + int32_t id = sequenceID++; if (id >= 10 * 1024) { - sampleID = 0; - id = sampleID++; + sequenceID = 0; + id = sequenceID++; } return id; } bool empty() { return queue.empty(); } - SampleMeta *pop() { + SequenceMeta *pop() { auto buffer = queue.front(); queue.pop(); return buffer; } - void push(SampleMeta *buffer) { queue.push(buffer); } + void push(SequenceMeta *buffer) { queue.push(buffer); } private: InputQueue() {} - static int32_t sampleID; - static int32_t tokenID; - std::queue *> queue; + int32_t sequenceID = 0; + std::queue queue; }; -template -int32_t InputQueue::sampleID = 0; - -template -int32_t InputQueue::tokenID = 0; -template class TaskWaitingQueue { public: static TaskWaitingQueue &getInstance() { @@ -156,45 +161,45 @@ class TaskWaitingQueue { return full; } - SampleMeta *pop() { + SequenceMeta *pop() { auto buffer = queue.front(); queue.pop(); return buffer; } - void push(SampleMeta *buffer) { queue.push(buffer); } + void push(SequenceMeta *buffer) { queue.push(buffer); } private: TaskWaitingQueue() {} - std::queue *> queue; + std::queue queue; }; -template -class SamplePool { + +class SequencePool { public: - static SamplePool &getInstance() { - static SamplePool instance; + static SequencePool &getInstance() { + static SequencePool instance; return instance; } - bool add(int32_t key, SampleMeta *sample) { + bool add(int32_t key, SequenceMeta *sequence) { bool exist = has(key); - if (!exist) { hub[key] = sample; } + if (!exist) { hub[key] = sequence; } return exist; } - void forceAdd(int32_t key, SampleMeta *sample) { + void forceAdd(int32_t key, SequenceMeta *sequence) { auto it = hub.find(key); if (it != hub.end()) { delete it->second; } - hub[key] = sample; + hub[key] = sequence; } bool has(int32_t key) const { return hub.find(key) != hub.end(); } - SampleMeta *get(int32_t key) const { + SequenceMeta *get(int32_t key) const { auto it = hub.find(key); if (it != hub.end()) { return it->second; @@ -207,12 +212,12 @@ class SamplePool { if (has(key)) { hub.erase(key); } } - bool replace(int32_t oldKey, SampleMeta *newSample) { + bool replace(int32_t oldKey, SequenceMeta *newSequence) { bool ret = false; auto it = hub.find(oldKey); if (it != hub.end()) { delete it->second; - it->second = newSample; + it->second = newSequence; ret = true; } @@ -220,9 +225,11 @@ class SamplePool { } private: - SamplePool() {} + SequencePool() {} + + std::unordered_map hub; - std::unordered_map *> hub; + //mgr }; } // namespace xft \ No newline at end of file diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h index 67611e2b..5aa39501 100644 --- a/src/common/transformer_ctx.h +++ b/src/common/transformer_ctx.h @@ -64,7 +64,7 @@ struct DecoderContext { int inputSeqLen; // For custom usage int reserved1; - int sampleID; + int sequenceID; // Model structure configuration int vocabSize; diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 4cdb1cef..2eaf9317 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -35,7 +35,7 @@ #include "transformer_ctx.h" #include "transpose_util.h" #include "weight_util.h" -#include "sample_info.h" +#include "sequence.h" using namespace xft; @@ -279,7 +279,7 @@ class CommonDecoder : public AbstractDecoder { int userSideBS = dims[0]; int beamSize = dims[1]; - int batchSize = (step == 0 ? userSideBS : userSideBS * beamSize); // as samples are duplicated at step 0 + int batchSize = (step == 0 ? userSideBS : userSideBS * beamSize); // as sequence are duplicated at step 0 int seqLen = dims[2]; int pastSeqLen = step == 0 ? 0 : this->accSeqLen; int inputSeqLen = seqLen; @@ -344,36 +344,38 @@ class CommonDecoder : public AbstractDecoder { int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank; int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * hiddenSize; - int32_t sampleID; - MPI_Recv(&sampleID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - TimeLine t("Decoder.MPI_Recv." + std::to_string(sampleID)); + int32_t sequenceID; + MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + TimeLine t("Decoder.MPI_Recv." + std::to_string(sequenceID)); MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); - if (!SamplePool::getInstance().has(sampleID)) { - SampleMeta *sample = new SampleMeta(sampleID, seqLen, hiddenSize); - sample->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - SamplePool::getInstance().forceAdd(sample->getSampleID(), sample); + if (!SequencePool::getInstance().has(sequenceID)) { + SequenceMeta *sequence = new SequenceMeta(sequenceID, seqLen); + sequence->setPastSeqLen(pastSeqLen); + sequence->allocBuffer(hiddenSize, embBuf); + SequencePool::getInstance().forceAdd(sequence->getSequenceID(), sequence); } - TaskWaitingQueue::getInstance().push(SamplePool::getInstance().get(sampleID)); + TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID)); } - if (!InputQueue::getInstance().empty()) { - if (!TaskWaitingQueue::getInstance().isFull()) { - auto sample = InputQueue::getInstance().pop(); - sample->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get()); - SamplePool::getInstance().forceAdd(sample->getSampleID(), sample); - TaskWaitingQueue::getInstance().push(SamplePool::getInstance().get(sample->getSampleID())); + if (!InputQueue::getInstance().empty()) { + if (!TaskWaitingQueue::getInstance().isFull()) { + auto sequence = InputQueue::getInstance().pop(); + sequence->setPastSeqLen(pastSeqLen); + sequence->allocBuffer(hiddenSize, embBuf); + SequencePool::getInstance().forceAdd(sequence->getSequenceID(), sequence); + TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequence->getSequenceID())); } } - while(TaskWaitingQueue::getInstance().empty()); + while(TaskWaitingQueue::getInstance().empty()); - SampleMeta *runningTask; - if (!TaskWaitingQueue::getInstance().empty()) { - runningTask = TaskWaitingQueue::getInstance().pop(); - ctx->sampleID = runningTask->getSampleID(); - TimeLine t("Decoder.forward." + std::to_string(ctx->sampleID)); + SequenceMeta *runningTask; + if (!TaskWaitingQueue::getInstance().empty()) { + runningTask = TaskWaitingQueue::getInstance().pop(); + ctx->sequenceID = runningTask->getSequenceID(); + TimeLine t("Decoder.step." + std::to_string(ctx->sequenceID)); #endif // Decoder: forward @@ -436,8 +438,8 @@ class CommonDecoder : public AbstractDecoder { TimeLine t("Decoder.MPI_Send"); int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * hiddenSize; - int32_t sampleID = runningTask->getSampleID(); - MPI_Send(&sampleID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); + int32_t sequenceID = runningTask->getSequenceID(); + MPI_Send(&sequenceID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank); diff --git a/src/models/models.cpp b/src/models/models.cpp index d7eaa4ec..6e45b0de 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -32,7 +32,7 @@ #include "searcher.h" #include "timeline.h" #include "yarn_llama.h" -#include "sample_info.h" +#include "sequence.h" namespace xft { enum class GenerationMode { GREEDY_SEARCH, BEAM_SEARCH, SAMPLE }; diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index 75ba689e..620a89c5 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -14,7 +14,7 @@ // ============================================================================ #include "greedy_search.h" #include "messenger.h" -#include "sample_info.h" +#include "sequence.h" #include "search_utils.h" #include "thread_util.h" @@ -45,17 +45,17 @@ std::vector GreedySearch::syncToken(std::tuple &result) int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; ThreadPool::getInstance().addTask([predictor_world_rank, this] { while (true) { - int32_t sampleID; - MPI_Recv(&sampleID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, + int32_t sequenceID; + MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(sampleID)); + TimeLine t("GreedySearch.MPI_Recv.sequence" + std::to_string(sequenceID)); MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - if (SamplePool::getInstance().has(sampleID)) { - auto prompt = SamplePool::getInstance().get(sampleID); - TaskWaitingQueue::getInstance().push(prompt); + if (SequencePool::getInstance().has(sequenceID)) { + auto sequence = SequencePool::getInstance().get(sequenceID); + TaskWaitingQueue::getInstance().push(sequence); } else { - printf("Error: should have sampleID\n"); + printf("Error: should have sequenceID\n"); fflush(stdout); } } @@ -64,10 +64,10 @@ std::vector GreedySearch::syncToken(std::tuple &result) } else { // The last predictor pipeline parallel stage this->nextTokens = this->search(result); if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) { - TimeLine t("GreedySearch.MPI_Send.prompt" + std::to_string(ctx->sampleID)); + TimeLine t("GreedySearch.MPI_Send.sequence" + std::to_string(ctx->sequenceID)); int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; - MPI_Send(&ctx->sampleID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); + MPI_Send(&ctx->sequenceID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file From 171f45176eb44e1d7eff8644599e2806ec400f22 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 19:57:15 +0800 Subject: [PATCH 23/32] update --- src/common/sequence.h | 27 ++++++++++++++++----------- src/models/common_decoder.h | 4 ++-- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/common/sequence.h b/src/common/sequence.h index cef2f895..e2055e7f 100644 --- a/src/common/sequence.h +++ b/src/common/sequence.h @@ -183,18 +183,23 @@ class SequencePool { return instance; } - bool add(int32_t key, SequenceMeta *sequence) { - bool exist = has(key); - if (!exist) { hub[key] = sequence; } - - return exist; - } - - void forceAdd(int32_t key, SequenceMeta *sequence) { - auto it = hub.find(key); - if (it != hub.end()) { delete it->second; } + bool add(int32_t key, SequenceMeta *sequence, bool force = false) { + bool isSuccess = false; + if (force) { + auto it = hub.find(key); + if (it != hub.end()) { delete it->second; } + + hub[key] = sequence; + isSuccess = true; + } else { + bool exist = has(key); + if (!exist) { + hub[key] = sequence; + isSuccess = true; + } + } - hub[key] = sequence; + return isSuccess; } bool has(int32_t key) const { return hub.find(key) != hub.end(); } diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 2eaf9317..60820936 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -354,7 +354,7 @@ class CommonDecoder : public AbstractDecoder { SequenceMeta *sequence = new SequenceMeta(sequenceID, seqLen); sequence->setPastSeqLen(pastSeqLen); sequence->allocBuffer(hiddenSize, embBuf); - SequencePool::getInstance().forceAdd(sequence->getSequenceID(), sequence); + SequencePool::getInstance().add(sequence->getSequenceID(), sequence); } TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID)); } @@ -364,7 +364,7 @@ class CommonDecoder : public AbstractDecoder { auto sequence = InputQueue::getInstance().pop(); sequence->setPastSeqLen(pastSeqLen); sequence->allocBuffer(hiddenSize, embBuf); - SequencePool::getInstance().forceAdd(sequence->getSequenceID(), sequence); + SequencePool::getInstance().add(sequence->getSequenceID(), sequence); TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequence->getSequenceID())); } } From 1df416ee99d6ac8d7d28faccf524cc4c62982b08 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 20:43:34 +0800 Subject: [PATCH 24/32] udpate --- src/common/sequence.h | 152 +++++++++++++++++++++--------------- src/models/common_decoder.h | 2 +- 2 files changed, 90 insertions(+), 64 deletions(-) diff --git a/src/common/sequence.h b/src/common/sequence.h index e2055e7f..fb46bdd7 100644 --- a/src/common/sequence.h +++ b/src/common/sequence.h @@ -39,7 +39,7 @@ namespace xft { -// The Sequence is one sequence of batch inputs and includes the generated tokens. +// The SequenceMeta is one sequence of batch inputs and includes the generated tokens. class SequenceMeta { public: SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector _inputTokens) @@ -110,6 +110,94 @@ class SequenceMeta { // SequenceMeta *sequence; // }; + +// SequencePool +// ┌──────┬──────┬──────┐ +// │ │ │ ◄───┼──┬─ SequenceMeta +// ├──────┼──────┼──────┤ │ +// │ │ │ ◄───┼──┘ +// └──────┴──────┴──────┘ +class SequencePool { +public: + static SequencePool &getInstance() { + static SequencePool instance; + return instance; + } + + SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen, + std::vector inputTokens = std::vector()) { + auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen, inputTokens); + return sequenceMeta; + } + + bool add(int32_t sequenceID, SequenceMeta *sequence, bool force = false) { + bool isSuccess = false; + if (force) { + auto it = hub.find(sequenceID); + if (it != hub.end()) { + remove(it->first); + } + + hub[sequenceID] = sequence; + isSuccess = true; + } else { + bool exist = has(sequenceID); + if (!exist) { + hub[sequenceID] = sequence; + isSuccess = true; + } + } + + return isSuccess; + } + + bool has(int32_t sequenceID) const { return hub.find(sequenceID) != hub.end(); } + + SequenceMeta *get(int32_t sequenceID) const { + auto it = hub.find(sequenceID); + if (it != hub.end()) { + return it->second; + } else { + return nullptr; + } + } + + bool remove(int32_t sequenceID, bool deep = false) { + bool isSuccess = false; + if (has(sequenceID)) { + if (deep == true) { + auto it = hub.find(sequenceID); + if (it != hub.end()) { + delete it->second; + } + } + isSuccess = hub.erase(sequenceID); + } + + return isSuccess; + } + + bool replace(int32_t sequenceID, SequenceMeta *newSequenceMeta) { + bool isSuccess = false; + auto it = hub.find(sequenceID); + if (it != hub.end()) { + remove(it->first); + hub[sequenceID] = newSequenceMeta; + isSuccess = true; + } + + return isSuccess; + } + +private: + SequencePool() {} + + std::unordered_map hub; + + //mgr +}; + + class InputQueue { public: static InputQueue &getInstance() { @@ -175,66 +263,4 @@ class TaskWaitingQueue { std::queue queue; }; - -class SequencePool { -public: - static SequencePool &getInstance() { - static SequencePool instance; - return instance; - } - - bool add(int32_t key, SequenceMeta *sequence, bool force = false) { - bool isSuccess = false; - if (force) { - auto it = hub.find(key); - if (it != hub.end()) { delete it->second; } - - hub[key] = sequence; - isSuccess = true; - } else { - bool exist = has(key); - if (!exist) { - hub[key] = sequence; - isSuccess = true; - } - } - - return isSuccess; - } - - bool has(int32_t key) const { return hub.find(key) != hub.end(); } - - SequenceMeta *get(int32_t key) const { - auto it = hub.find(key); - if (it != hub.end()) { - return it->second; - } else { - return nullptr; - } - } - - void remove(int32_t key) { - if (has(key)) { hub.erase(key); } - } - - bool replace(int32_t oldKey, SequenceMeta *newSequence) { - bool ret = false; - auto it = hub.find(oldKey); - if (it != hub.end()) { - delete it->second; - it->second = newSequence; - ret = true; - } - - return ret; - } - -private: - SequencePool() {} - - std::unordered_map hub; - - //mgr -}; - } // namespace xft \ No newline at end of file diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 60820936..3e06555d 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -351,7 +351,7 @@ class CommonDecoder : public AbstractDecoder { // TODO: Error: different scope when dynamic loading so file // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); if (!SequencePool::getInstance().has(sequenceID)) { - SequenceMeta *sequence = new SequenceMeta(sequenceID, seqLen); + SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen); sequence->setPastSeqLen(pastSeqLen); sequence->allocBuffer(hiddenSize, embBuf); SequencePool::getInstance().add(sequence->getSequenceID(), sequence); From 3b287dfb8ca40fefd466380505a4b1dce68123d8 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 21:13:17 +0800 Subject: [PATCH 25/32] udpate --- src/common/sequence.h | 45 ++++++++++++++++++++------------- src/models/common_decoder.h | 8 +++--- src/searchers/greedy_search.cpp | 4 +-- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/common/sequence.h b/src/common/sequence.h index fb46bdd7..204f543f 100644 --- a/src/common/sequence.h +++ b/src/common/sequence.h @@ -42,7 +42,7 @@ namespace xft { // The SequenceMeta is one sequence of batch inputs and includes the generated tokens. class SequenceMeta { public: - SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector _inputTokens) + SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector &_inputTokens) : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), step(0) { inputTokens.resize(_inputSeqLen); inputTokens.assign(_inputTokens.begin(), _inputTokens.end()); @@ -102,13 +102,19 @@ class SequenceMeta { #endif }; + // For beam searcher -// class SequenceGroupMeta { -// public: -// SequenceGroupMeta(int32_t num_beams) { sequence = new SequenceMeta[num_beams]; } +class SequenceGroupMeta { +public: + SequenceGroupMeta(int32_t _num_beams, std::vector &seq) { + num_beams = _num_beams; + sequences = seq; + } -// SequenceMeta *sequence; -// }; +private: + int32_t num_beams; + std::vector sequences; +}; // SequencePool @@ -125,17 +131,22 @@ class SequencePool { } SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen, - std::vector inputTokens = std::vector()) { + std::vector &inputTokens) { auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen, inputTokens); return sequenceMeta; } + SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen) { + auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen); + return sequenceMeta; + } + bool add(int32_t sequenceID, SequenceMeta *sequence, bool force = false) { bool isSuccess = false; if (force) { auto it = hub.find(sequenceID); if (it != hub.end()) { - remove(it->first); + remove(it->first, true); } hub[sequenceID] = sequence; @@ -181,7 +192,7 @@ class SequencePool { bool isSuccess = false; auto it = hub.find(sequenceID); if (it != hub.end()) { - remove(it->first); + remove(it->first, true); hub[sequenceID] = newSequenceMeta; isSuccess = true; } @@ -193,11 +204,10 @@ class SequencePool { SequencePool() {} std::unordered_map hub; - - //mgr }; +// Manage input sequenceMeta class InputQueue { public: static InputQueue &getInstance() { @@ -217,12 +227,12 @@ class InputQueue { bool empty() { return queue.empty(); } SequenceMeta *pop() { - auto buffer = queue.front(); + auto seq = queue.front(); queue.pop(); - return buffer; + return seq; } - void push(SequenceMeta *buffer) { queue.push(buffer); } + void push(SequenceMeta *seq) { queue.push(seq); } private: InputQueue() {} @@ -232,6 +242,7 @@ class InputQueue { }; +// Manage executive sequenceMeta class TaskWaitingQueue { public: static TaskWaitingQueue &getInstance() { @@ -250,12 +261,12 @@ class TaskWaitingQueue { } SequenceMeta *pop() { - auto buffer = queue.front(); + auto seq = queue.front(); queue.pop(); - return buffer; + return seq; } - void push(SequenceMeta *buffer) { queue.push(buffer); } + void push(SequenceMeta *seq) { queue.push(seq); } private: TaskWaitingQueue() {} diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 3e06555d..90d1590c 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -346,7 +346,7 @@ class CommonDecoder : public AbstractDecoder { int count = batchSize * inputSeqLen * hiddenSize; int32_t sequenceID; MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - TimeLine t("Decoder.MPI_Recv." + std::to_string(sequenceID)); + TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); @@ -375,7 +375,7 @@ class CommonDecoder : public AbstractDecoder { if (!TaskWaitingQueue::getInstance().empty()) { runningTask = TaskWaitingQueue::getInstance().pop(); ctx->sequenceID = runningTask->getSequenceID(); - TimeLine t("Decoder.step." + std::to_string(ctx->sequenceID)); + TimeLine t("Decoder.Seq" + std::to_string(ctx->sequenceID) + ".Step"); #endif // Decoder: forward @@ -435,10 +435,10 @@ class CommonDecoder : public AbstractDecoder { // If current pipeline stage isn't the end of stage, should send data to next stage and return nullptr if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) { - TimeLine t("Decoder.MPI_Send"); + int32_t sequenceID = runningTask->getSequenceID(); + TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Send"); int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * hiddenSize; - int32_t sequenceID = runningTask->getSequenceID(); MPI_Send(&sequenceID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index 620a89c5..d0611b0f 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -48,7 +48,7 @@ std::vector GreedySearch::syncToken(std::tuple &result) int32_t sequenceID; MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - TimeLine t("GreedySearch.MPI_Recv.sequence" + std::to_string(sequenceID)); + TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); if (SequencePool::getInstance().has(sequenceID)) { @@ -64,7 +64,7 @@ std::vector GreedySearch::syncToken(std::tuple &result) } else { // The last predictor pipeline parallel stage this->nextTokens = this->search(result); if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) { - TimeLine t("GreedySearch.MPI_Send.sequence" + std::to_string(ctx->sequenceID)); + TimeLine t("GreedySearch.Seq" + std::to_string(ctx->sequenceID) + ".MPI_Send"); int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; MPI_Send(&ctx->sequenceID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); From 8d5dfe18e3d319bfe1d803d767138f35e398f0e0 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 21:18:28 +0800 Subject: [PATCH 26/32] update --- src/common/sequence.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/common/sequence.h b/src/common/sequence.h index 204f543f..af32ead9 100644 --- a/src/common/sequence.h +++ b/src/common/sequence.h @@ -84,8 +84,6 @@ class SequenceMeta { int32_t pastSeqLen; std::vector inputTokens; // input tokens + next tokens std::vector nextTokens; // next tokens - - // Indicates whether the sequence is in the prefill phase int32_t step; #ifdef PIPELINE_PARALLEL From d77e7c12216796c1aef7a0e8755703366c6a8b0a Mon Sep 17 00:00:00 2001 From: changqi1 Date: Mon, 22 Apr 2024 21:23:39 +0800 Subject: [PATCH 27/32] update --- src/common/sequence.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/common/sequence.h b/src/common/sequence.h index af32ead9..bc696840 100644 --- a/src/common/sequence.h +++ b/src/common/sequence.h @@ -128,6 +128,15 @@ class SequencePool { return instance; } + int32_t createSequenceID() { + int32_t id = globalSequenceID++; + if (id >= 10 * 1024) { + globalSequenceID = 0; + id = globalSequenceID++; + } + return id; + } + SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen, std::vector &inputTokens) { auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen, inputTokens); @@ -201,6 +210,7 @@ class SequencePool { private: SequencePool() {} + int32_t globalSequenceID = 0; std::unordered_map hub; }; @@ -213,15 +223,6 @@ class InputQueue { return instance; } - int32_t createSequenceID() { - int32_t id = sequenceID++; - if (id >= 10 * 1024) { - sequenceID = 0; - id = sequenceID++; - } - return id; - } - bool empty() { return queue.empty(); } SequenceMeta *pop() { @@ -235,7 +236,6 @@ class InputQueue { private: InputQueue() {} - int32_t sequenceID = 0; std::queue queue; }; From fffd7b6ad9d7dcf161bbbcdfbccb3aaaa257fdcc Mon Sep 17 00:00:00 2001 From: changqi1 Date: Tue, 23 Apr 2024 10:19:12 +0800 Subject: [PATCH 28/32] add PIPELINE_PARALLEL macro --- src/common/sequence.h | 45 ++++++++++++++++++++++-------------- src/common/transformer_ctx.h | 3 +++ src/models/common_decoder.h | 8 +++---- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/common/sequence.h b/src/common/sequence.h index bc696840..d036e042 100644 --- a/src/common/sequence.h +++ b/src/common/sequence.h @@ -43,14 +43,15 @@ namespace xft { class SequenceMeta { public: SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector &_inputTokens) - : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), step(0) { + : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), pastSeqLen(0), step(0) { inputTokens.resize(_inputSeqLen); inputTokens.assign(_inputTokens.begin(), _inputTokens.end()); nextTokens.resize(_inputSeqLen); + setPastSeqLen(getPastSeqLen()); } SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen) - : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), inputTokens(_inputSeqLen, 0), step(0) { + : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), inputTokens(_inputSeqLen, 0), pastSeqLen(0), step(0) { nextTokens.resize(_inputSeqLen); } @@ -58,6 +59,21 @@ class SequenceMeta { int32_t getSequenceID() const { return sequenceID; } + // For first tokens + void stepForward() { + if (getStep() == 0) { + setPastSeqLen(inputTokens.size()); + setStep(getStep() + 1); + } + } + + // For next token + void stepForward(int32_t token) { + addNextToken(token); + setPastSeqLen(getPastSeqLen() + 1); + setStep(getStep() + 1); + } + // Get the input tokens in sequence int32_t getInputSeqLen() const { return inputSeqLen; } @@ -68,11 +84,15 @@ class SequenceMeta { void setPastSeqLen(int32_t _pastSeqLen) { pastSeqLen = _pastSeqLen; } // For next tokens - void addNextToken(int32_t token) { nextTokens.push_back(token); } + void addNextToken(int32_t token) { + nextTokens.clear(); + nextTokens.push_back(token); + inputTokens.push_back(token); + } int32_t getLatestToken() const { return nextTokens.back(); } - const int32_t *getTotalTokens() const { return nextTokens.data(); } + const int32_t *getTotalTokens() const { return getInputTokens(); } int32_t getStep() const { return step; } @@ -96,11 +116,10 @@ class SequenceMeta { private: int32_t hiddenSize; - void* hiddenStates; + void *hiddenStates; #endif }; - // For beam searcher class SequenceGroupMeta { public: @@ -114,7 +133,6 @@ class SequenceGroupMeta { std::vector sequences; }; - // SequencePool // ┌──────┬──────┬──────┐ // │ │ │ ◄───┼──┬─ SequenceMeta @@ -137,8 +155,7 @@ class SequencePool { return id; } - SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen, - std::vector &inputTokens) { + SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen, std::vector &inputTokens) { auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen, inputTokens); return sequenceMeta; } @@ -152,9 +169,7 @@ class SequencePool { bool isSuccess = false; if (force) { auto it = hub.find(sequenceID); - if (it != hub.end()) { - remove(it->first, true); - } + if (it != hub.end()) { remove(it->first, true); } hub[sequenceID] = sequence; isSuccess = true; @@ -185,9 +200,7 @@ class SequencePool { if (has(sequenceID)) { if (deep == true) { auto it = hub.find(sequenceID); - if (it != hub.end()) { - delete it->second; - } + if (it != hub.end()) { delete it->second; } } isSuccess = hub.erase(sequenceID); } @@ -214,7 +227,6 @@ class SequencePool { std::unordered_map hub; }; - // Manage input sequenceMeta class InputQueue { public: @@ -239,7 +251,6 @@ class InputQueue { std::queue queue; }; - // Manage executive sequenceMeta class TaskWaitingQueue { public: diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h index 5aa39501..0a48c88b 100644 --- a/src/common/transformer_ctx.h +++ b/src/common/transformer_ctx.h @@ -64,7 +64,10 @@ struct DecoderContext { int inputSeqLen; // For custom usage int reserved1; + +#ifdef PIPELINE_PARALLEL int sequenceID; +#endif // Model structure configuration int vocabSize; diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 90d1590c..2382a73f 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -371,11 +371,12 @@ class CommonDecoder : public AbstractDecoder { while(TaskWaitingQueue::getInstance().empty()); - SequenceMeta *runningTask; + SequenceMeta *runningTask = nullptr; + int32_t sequenceID = -1; if (!TaskWaitingQueue::getInstance().empty()) { runningTask = TaskWaitingQueue::getInstance().pop(); - ctx->sequenceID = runningTask->getSequenceID(); - TimeLine t("Decoder.Seq" + std::to_string(ctx->sequenceID) + ".Step"); + sequenceID = runningTask->getSequenceID(); + TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".Step"); #endif // Decoder: forward @@ -435,7 +436,6 @@ class CommonDecoder : public AbstractDecoder { // If current pipeline stage isn't the end of stage, should send data to next stage and return nullptr if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) { - int32_t sequenceID = runningTask->getSequenceID(); TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Send"); int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * hiddenSize; From dcff843e642fd226ebd30a2efa6739946726ff3f Mon Sep 17 00:00:00 2001 From: changqi1 Date: Thu, 25 Apr 2024 09:31:46 +0800 Subject: [PATCH 29/32] Update pp inputs --- CMakeLists.txt | 2 +- examples/cpp/example.cpp | 6 +++- src/common/sequence.h | 4 ++- src/models/common_decoder.h | 50 +++++++++++++++++---------- src/models/models.cpp | 60 ++++++++++++++++++++++++++++++--- src/searchers/greedy_search.cpp | 4 +++ 6 files changed, 101 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index acc21129..08b5b90c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,7 +179,7 @@ add_definitions(-DAVX512_FP16_WEIGHT_ONLY_INT4=true) add_definitions(-DAVX512_FP32_WEIGHT_ONLY_NF4=true) # add_definitions(-DAVX512_FP16_WEIGHT_ONLY_NF4=true) -# add_definitions(-DDEBUG=true) +add_definitions(-DDEBUG=true) # add_definitions(-DSTEP_BY_STEP_ATTN=true) add_definitions(-DUSE_SHM=true) option(XFT_BUILD_TESTS "Build xfastertransformer unit tests" OFF) diff --git a/examples/cpp/example.cpp b/examples/cpp/example.cpp index 34cf4eb0..7040edfb 100644 --- a/examples/cpp/example.cpp +++ b/examples/cpp/example.cpp @@ -441,11 +441,15 @@ int main(int argc, char **argv) { if (!model.isDone()) { Timer t(isMaster, "[INFO] First token"); firstIds = model.generate(); + printf("firstIds[0]: %d\n", firstIds[0]); + fflush(stdout); } Timer timerSecond; if (!model.isDone()) { secondIds = model.generate(); + printf("secondIds[0]: %d\n", secondIds[0]); + fflush(stdout); secondIdCount++; } @@ -467,7 +471,7 @@ int main(int argc, char **argv) { } auto result = model.finalize(); - if (isMaster) { + if (true) { std::cout << "\n[INFO] Final output is: " << std::endl; std::vector sent = tokenizer->batchDecode(result, batchSize); for (auto str : sent) { diff --git a/src/common/sequence.h b/src/common/sequence.h index d036e042..a3d79bab 100644 --- a/src/common/sequence.h +++ b/src/common/sequence.h @@ -69,7 +69,7 @@ class SequenceMeta { // For next token void stepForward(int32_t token) { - addNextToken(token); + // addNextToken(token); setPastSeqLen(getPastSeqLen() + 1); setStep(getStep() + 1); } @@ -269,6 +269,8 @@ class TaskWaitingQueue { return full; } + SequenceMeta *front() { return queue.front(); } + SequenceMeta *pop() { auto seq = queue.front(); queue.pop(); diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 2382a73f..8cb70120 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -31,11 +31,12 @@ #include "mlp_chatglm2.h" #include "mlp_standard.h" #include "model_factory.h" +#include "sequence.h" +#include "thread_util.h" #include "timeline.h" #include "transformer_ctx.h" #include "transpose_util.h" #include "weight_util.h" -#include "sequence.h" using namespace xft; @@ -326,8 +327,7 @@ class CommonDecoder : public AbstractDecoder { dbg.debugPrint("---- embedding.forward ----\n"); dbg.debugPrint("ids:\n"); dbg.dumpMatrix(ids, batchSize, inputSeqLen, inputSeqLen); - dbg.debugPrint( - "embBuf(rows: %d, cols: %d, stride: %d):\n", batchSize * inputSeqLen, hiddenSize, hiddenSize); + dbg.debugPrint("embBuf(rows: %d, cols: %d, stride: %d):\n", batchSize * inputSeqLen, hiddenSize, hiddenSize); dbg.dumpMatrix(embBuf, batchSize * inputSeqLen, hiddenSize, hiddenSize); #endif @@ -344,22 +344,28 @@ class CommonDecoder : public AbstractDecoder { int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank; int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; int count = batchSize * inputSeqLen * hiddenSize; - int32_t sequenceID; - MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); - MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // TODO: Error: different scope when dynamic loading so file - // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); - if (!SequencePool::getInstance().has(sequenceID)) { - SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen); - sequence->setPastSeqLen(pastSeqLen); - sequence->allocBuffer(hiddenSize, embBuf); - SequencePool::getInstance().add(sequence->getSequenceID(), sequence); - } - TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID)); + ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, count, &embBuf, seqLen, hiddenSize, pastSeqLen] { + while (true) { + int32_t sequenceID; + MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); + MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // TODO: Error: different scope when dynamic loading so file + // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); + printf("Decoder.Seq%d.MPI_Recv\n", sequenceID); + fflush(stdout); + if (!SequencePool::getInstance().has(sequenceID)) { + SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen); + sequence->setPastSeqLen(pastSeqLen); + sequence->allocBuffer(hiddenSize, embBuf); + SequencePool::getInstance().add(sequence->getSequenceID(), sequence); + } + TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID)); + } + }); } - if (!InputQueue::getInstance().empty()) { + while (!InputQueue::getInstance().empty()) { if (!TaskWaitingQueue::getInstance().isFull()) { auto sequence = InputQueue::getInstance().pop(); sequence->setPastSeqLen(pastSeqLen); @@ -369,13 +375,19 @@ class CommonDecoder : public AbstractDecoder { } } - while(TaskWaitingQueue::getInstance().empty()); + while (TaskWaitingQueue::getInstance().empty()); SequenceMeta *runningTask = nullptr; int32_t sequenceID = -1; if (!TaskWaitingQueue::getInstance().empty()) { runningTask = TaskWaitingQueue::getInstance().pop(); sequenceID = runningTask->getSequenceID(); + ctx->sequenceID = runningTask->getSequenceID(); + runningTask->setPastSeqLen(pastSeqLen); + runningTask->allocBuffer(hiddenSize, embBuf); + printf("Decoder.Seq%d.step\n", sequenceID); + fflush(stdout); + TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".Step"); #endif @@ -443,6 +455,8 @@ class CommonDecoder : public AbstractDecoder { MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank); + printf("Decoder.Seq%d.MPI_Send\n", sequenceID); + fflush(stdout); return std::tuple(nullptr, 0, 0); } #endif diff --git a/src/models/models.cpp b/src/models/models.cpp index 6e45b0de..dfbafdf3 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -85,6 +85,21 @@ void Model::input(std::vector &inputIds_, int batchSize_) { inputIds.resize(dims[1]); if (decoder->getRank() == 0) { inputIds = inputIds_; } messenger.broadcast(inputIds.data(), dims[1]); + + if (this->isMaster()) { + for (int i = 0; i < 2; ++i) { + int sequenceID = SequencePool::getInstance().createSequenceID(); + InputQueue::getInstance().push(SequencePool::getInstance().createMeta(sequenceID, seqLen, inputIds)); + } + + while (!InputQueue::getInstance().empty()) { + if (!TaskWaitingQueue::getInstance().isFull()) { + auto sequence = InputQueue::getInstance().pop(); + SequencePool::getInstance().add(sequence->getSequenceID(), sequence); + TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequence->getSequenceID())); + } + } + } } void Model::config(int maxLen_, int numBeams_, int numBeamHypsToKeep_, float lenPenalty_, bool doEarlyStopping_, @@ -142,12 +157,49 @@ std::vector Model::generate() { exit(-1); } - if (isNewInput) { - isNewInput = false; - return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); + if (this->isMaster()) { + while(TaskWaitingQueue::getInstance().empty()); + + if (TaskWaitingQueue::getInstance().front()->getStep() == 0) { + auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); + TaskWaitingQueue::getInstance().front()->stepForward(); + return token; + } else { + auto token = searcher->getNextToken(); + TaskWaitingQueue::getInstance().front()->stepForward(token[0]); + return token; + } } else { - return searcher->getNextToken(); + if (!isNewInput) { + while(TaskWaitingQueue::getInstance().empty()); + + if (TaskWaitingQueue::getInstance().front()->getStep() == 0) { + auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); + TaskWaitingQueue::getInstance().front()->stepForward(); + return token; + } else { + auto token = searcher->getNextToken(); + TaskWaitingQueue::getInstance().front()->stepForward(token[0]); + return token; + } + } else { + isNewInput = false; + auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); + TaskWaitingQueue::getInstance().front()->stepForward(); + return token; + } } + + // if (isNewInput) { + // printf("1st\n"); + // // TaskWaitingQueue::getInstance().front()->getStep() == 0 + // // TaskWaitingQueue::getInstance().front()->stepForward(); + // isNewInput = false; + // return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); + // } else { + // printf("2nd\n"); + // return searcher->getNextToken(); + // } } void Model::createSearcher(SearcherConfig &config_) { diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index d0611b0f..cbfbcc5a 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -51,6 +51,8 @@ std::vector GreedySearch::syncToken(std::tuple &result) TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + printf("GreedySearch.Seq%d.MPI_Recv\n", sequenceID); + fflush(stdout); if (SequencePool::getInstance().has(sequenceID)) { auto sequence = SequencePool::getInstance().get(sequenceID); TaskWaitingQueue::getInstance().push(sequence); @@ -72,6 +74,8 @@ std::vector GreedySearch::syncToken(std::tuple &result) MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank); + printf("GreedySearch.Seq%d.MPI_Send\n", ctx->sequenceID); + fflush(stdout); } } #else From 90cabf882b7b7c843c01236b7012d54b69632430 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Sun, 28 Apr 2024 13:46:58 +0800 Subject: [PATCH 30/32] run good --- src/common/transformer_ctx.h | 2 +- src/models/common_decoder.h | 61 ++++++++++++++++++++++----------- src/models/models.cpp | 5 +++ src/searchers/greedy_search.cpp | 22 +++++++----- src/searchers/greedy_search.h | 2 +- 5 files changed, 62 insertions(+), 30 deletions(-) diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h index 0a48c88b..8c11815d 100644 --- a/src/common/transformer_ctx.h +++ b/src/common/transformer_ctx.h @@ -66,7 +66,7 @@ struct DecoderContext { int reserved1; #ifdef PIPELINE_PARALLEL - int sequenceID; + int32_t sequenceID; #endif // Model structure configuration diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index 8cb70120..d96ac699 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -340,24 +340,39 @@ class CommonDecoder : public AbstractDecoder { #ifdef PIPELINE_PARALLEL // if current pipeline parallel stage rank isn't the first stage, should receive previous stage data - if (ctx->ppSize > 1 && ctx->ppRank > 0) { + if (ctx->ppSize > 1 && ctx->ppRank > 0 && enabledBackgroundSync == false) { + enabledBackgroundSync = true; int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank; int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; - int count = batchSize * inputSeqLen * hiddenSize; - ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, count, &embBuf, seqLen, hiddenSize, pastSeqLen] { + // int64_t count = batchSize * inputSeqLen * hiddenSize; + ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, seqLen, hiddenSize, pastSeqLen, this] { while (true) { - int32_t sequenceID; - MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); - MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + int64_t recvBuf[2] = {0, 0}; + MPI_Recv(&recvBuf, 2, MPI_INT64_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + int32_t sequenceID = recvBuf[0]; + int64_t count = recvBuf[1]; + MPI_Recv(this->actBuffers->Data(), count, MPI_FLOAT, prev_world_rank, curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + // MPI_Status status; + // MPI_Probe(prev_world_rank, curr_world_rank, MPI_COMM_WORLD, &status); + // int number_amount; + // MPI_Get_count(&status, MPI_FLOAT, &number_amount); + // printf("Decoder.probe.%d\n", number_amount); + // fflush(stdout); + // float recvBuf[number_amount] = {0.0f}; + // MPI_Recv(&recvBuf, number_amount, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // int32_t sequenceID = recvBuf[0]; + // int64_t count = recvBuf[1]; + printf("Decoder.Seq%d.MPI_Recv%d\n", sequenceID, count); + fflush(stdout); + // memcpy(this->actBuffers->Data(), recvBuf + 2, count * sizeof(float)); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); - printf("Decoder.Seq%d.MPI_Recv\n", sequenceID); - fflush(stdout); + TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); if (!SequencePool::getInstance().has(sequenceID)) { SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen); - sequence->setPastSeqLen(pastSeqLen); - sequence->allocBuffer(hiddenSize, embBuf); + // sequence->setPastSeqLen(pastSeqLen); + // sequence->allocBuffer(hiddenSize, embBuf); SequencePool::getInstance().add(sequence->getSequenceID(), sequence); } TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID)); @@ -368,8 +383,8 @@ class CommonDecoder : public AbstractDecoder { while (!InputQueue::getInstance().empty()) { if (!TaskWaitingQueue::getInstance().isFull()) { auto sequence = InputQueue::getInstance().pop(); - sequence->setPastSeqLen(pastSeqLen); - sequence->allocBuffer(hiddenSize, embBuf); + // sequence->setPastSeqLen(pastSeqLen); + // sequence->allocBuffer(hiddenSize, embBuf); SequencePool::getInstance().add(sequence->getSequenceID(), sequence); TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequence->getSequenceID())); } @@ -380,11 +395,11 @@ class CommonDecoder : public AbstractDecoder { SequenceMeta *runningTask = nullptr; int32_t sequenceID = -1; if (!TaskWaitingQueue::getInstance().empty()) { - runningTask = TaskWaitingQueue::getInstance().pop(); + runningTask = TaskWaitingQueue::getInstance().front(); sequenceID = runningTask->getSequenceID(); ctx->sequenceID = runningTask->getSequenceID(); - runningTask->setPastSeqLen(pastSeqLen); - runningTask->allocBuffer(hiddenSize, embBuf); + // runningTask->setPastSeqLen(pastSeqLen); + // runningTask->allocBuffer(hiddenSize, embBuf); printf("Decoder.Seq%d.step\n", sequenceID); fflush(stdout); @@ -450,12 +465,16 @@ class CommonDecoder : public AbstractDecoder { if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) { TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Send"); int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank; - int count = batchSize * inputSeqLen * hiddenSize; - MPI_Send(&sequenceID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); - MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); + int64_t count = batchSize * inputSeqLen * hiddenSize; + int64_t sendBuf[2] = {sequenceID, count}; + MPI_Send(&sendBuf, 2, MPI_INT64_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); + MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank + 1000, MPI_COMM_WORLD); + // float sendBuf[2 + count] = {(float)sequenceID, (float)count}; + // memcpy(sendBuf + 2, embBuf, count * sizeof(float)); + // MPI_Send(&sendBuf, 2 + count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank); - printf("Decoder.Seq%d.MPI_Send\n", sequenceID); + printf("Decoder.Seq%d.MPI_Send%d\n", sequenceID, count); fflush(stdout); return std::tuple(nullptr, 0, 0); } @@ -1030,6 +1049,8 @@ class CommonDecoder : public AbstractDecoder { int startId; int endId; + bool enabledBackgroundSync = false; + #ifdef DEBUG Debugger dbg; #endif diff --git a/src/models/models.cpp b/src/models/models.cpp index dfbafdf3..54f2421f 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -163,10 +163,12 @@ std::vector Model::generate() { if (TaskWaitingQueue::getInstance().front()->getStep() == 0) { auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); TaskWaitingQueue::getInstance().front()->stepForward(); + TaskWaitingQueue::getInstance().pop(); return token; } else { auto token = searcher->getNextToken(); TaskWaitingQueue::getInstance().front()->stepForward(token[0]); + TaskWaitingQueue::getInstance().pop(); return token; } } else { @@ -176,16 +178,19 @@ std::vector Model::generate() { if (TaskWaitingQueue::getInstance().front()->getStep() == 0) { auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); TaskWaitingQueue::getInstance().front()->stepForward(); + TaskWaitingQueue::getInstance().pop(); return token; } else { auto token = searcher->getNextToken(); TaskWaitingQueue::getInstance().front()->stepForward(token[0]); + TaskWaitingQueue::getInstance().pop(); return token; } } else { isNewInput = false; auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); TaskWaitingQueue::getInstance().front()->stepForward(); + TaskWaitingQueue::getInstance().pop(); return token; } } diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index cbfbcc5a..3ad6ba1f 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -45,12 +45,16 @@ std::vector GreedySearch::syncToken(std::tuple &result) int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; ThreadPool::getInstance().addTask([predictor_world_rank, this] { while (true) { - int32_t sequenceID; - MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, + int32_t recvBuf[2]; + MPI_Recv(&recvBuf, 2, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); - MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, - predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + int32_t sequenceID = recvBuf[0]; + this->nextTokens[0] = recvBuf[1]; + // MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, + // MPI_STATUS_IGNORE); + // TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); + // MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, + // predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); printf("GreedySearch.Seq%d.MPI_Recv\n", sequenceID); fflush(stdout); if (SequencePool::getInstance().has(sequenceID)) { @@ -69,9 +73,11 @@ std::vector GreedySearch::syncToken(std::tuple &result) TimeLine t("GreedySearch.Seq" + std::to_string(ctx->sequenceID) + ".MPI_Send"); int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; - MPI_Send(&ctx->sequenceID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); - MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank, - MPI_COMM_WORLD); + int32_t sendBuf[2] = {ctx->sequenceID, nextTokens[0]}; + MPI_Send(&sendBuf, 2, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); + // MPI_Send(&ctx->sequenceID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD); + // MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank, + // MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank); printf("GreedySearch.Seq%d.MPI_Send\n", ctx->sequenceID); diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h index 5b4ec164..22a9c419 100644 --- a/src/searchers/greedy_search.h +++ b/src/searchers/greedy_search.h @@ -47,7 +47,7 @@ class GreedySearch : public AbstractSearcher { std::vector> cachedRepetVec; std::vector doneBatch; - bool enabledBackgroundSync; + bool enabledBackgroundSync = false; int batchSize; int step; int curLen; From e9d543735d10cd8810f1516fcf28b7e5054d5cb7 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Sun, 28 Apr 2024 14:52:09 +0800 Subject: [PATCH 31/32] run good --- CMakeLists.txt | 2 +- src/common/sequence.h | 5 +++++ src/models/common_decoder.h | 7 ++++--- src/models/models.cpp | 1 + 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 08b5b90c..acc21129 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,7 +179,7 @@ add_definitions(-DAVX512_FP16_WEIGHT_ONLY_INT4=true) add_definitions(-DAVX512_FP32_WEIGHT_ONLY_NF4=true) # add_definitions(-DAVX512_FP16_WEIGHT_ONLY_NF4=true) -add_definitions(-DDEBUG=true) +# add_definitions(-DDEBUG=true) # add_definitions(-DSTEP_BY_STEP_ATTN=true) add_definitions(-DUSE_SHM=true) option(XFT_BUILD_TESTS "Build xfastertransformer unit tests" OFF) diff --git a/src/common/sequence.h b/src/common/sequence.h index a3d79bab..ae10ac2f 100644 --- a/src/common/sequence.h +++ b/src/common/sequence.h @@ -114,8 +114,13 @@ class SequenceMeta { memcpy(hiddenStates, _hiddenStates, sizeof(T) * getInputSeqLen() * hiddenSize); } + int32_t getHiddenStatesSize() const { return hiddenStatesSize; } + + void setHiddenStatesSize(int32_t _hiddenStatesSize) { hiddenStatesSize = _hiddenStatesSize; } + private: int32_t hiddenSize; + int64_t hiddenStatesSize; void *hiddenStates; #endif }; diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index d96ac699..cff904a1 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -339,11 +339,11 @@ class CommonDecoder : public AbstractDecoder { t1.release(); #ifdef PIPELINE_PARALLEL + int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank; + int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; // if current pipeline parallel stage rank isn't the first stage, should receive previous stage data if (ctx->ppSize > 1 && ctx->ppRank > 0 && enabledBackgroundSync == false) { enabledBackgroundSync = true; - int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank; - int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank; // int64_t count = batchSize * inputSeqLen * hiddenSize; ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, seqLen, hiddenSize, pastSeqLen, this] { while (true) { @@ -351,7 +351,6 @@ class CommonDecoder : public AbstractDecoder { MPI_Recv(&recvBuf, 2, MPI_INT64_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); int32_t sequenceID = recvBuf[0]; int64_t count = recvBuf[1]; - MPI_Recv(this->actBuffers->Data(), count, MPI_FLOAT, prev_world_rank, curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // MPI_Status status; // MPI_Probe(prev_world_rank, curr_world_rank, MPI_COMM_WORLD, &status); @@ -371,6 +370,7 @@ class CommonDecoder : public AbstractDecoder { TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); if (!SequencePool::getInstance().has(sequenceID)) { SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen); + sequence->setHiddenStatesSize(count); // sequence->setPastSeqLen(pastSeqLen); // sequence->allocBuffer(hiddenSize, embBuf); SequencePool::getInstance().add(sequence->getSequenceID(), sequence); @@ -400,6 +400,7 @@ class CommonDecoder : public AbstractDecoder { ctx->sequenceID = runningTask->getSequenceID(); // runningTask->setPastSeqLen(pastSeqLen); // runningTask->allocBuffer(hiddenSize, embBuf); + MPI_Recv(embBuf, TaskWaitingQueue::getInstance().front()->getHiddenStatesSize(), MPI_FLOAT, prev_world_rank, curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE); printf("Decoder.Seq%d.step\n", sequenceID); fflush(stdout); diff --git a/src/models/models.cpp b/src/models/models.cpp index 54f2421f..4ed419c1 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -166,6 +166,7 @@ std::vector Model::generate() { TaskWaitingQueue::getInstance().pop(); return token; } else { + isNewInput = false; auto token = searcher->getNextToken(); TaskWaitingQueue::getInstance().front()->stepForward(token[0]); TaskWaitingQueue::getInstance().pop(); From 9e1b7705ec85f19e89d43a146bd72244cd6e1b60 Mon Sep 17 00:00:00 2001 From: changqi1 Date: Sun, 28 Apr 2024 22:11:22 +0800 Subject: [PATCH 32/32] could run --- examples/cpp/example.cpp | 4 --- src/common/sequence.h | 20 ++++++++++++ src/models/common_decoder.h | 27 +++------------- src/models/models.cpp | 56 ++++++++++----------------------- src/searchers/greedy_search.cpp | 13 ++++---- src/searchers/greedy_search.h | 2 +- src/utils/thread_util.h | 2 +- 7 files changed, 48 insertions(+), 76 deletions(-) diff --git a/examples/cpp/example.cpp b/examples/cpp/example.cpp index 7040edfb..f6a68ce7 100644 --- a/examples/cpp/example.cpp +++ b/examples/cpp/example.cpp @@ -441,15 +441,11 @@ int main(int argc, char **argv) { if (!model.isDone()) { Timer t(isMaster, "[INFO] First token"); firstIds = model.generate(); - printf("firstIds[0]: %d\n", firstIds[0]); - fflush(stdout); } Timer timerSecond; if (!model.isDone()) { secondIds = model.generate(); - printf("secondIds[0]: %d\n", secondIds[0]); - fflush(stdout); secondIdCount++; } diff --git a/src/common/sequence.h b/src/common/sequence.h index ae10ac2f..84049436 100644 --- a/src/common/sequence.h +++ b/src/common/sequence.h @@ -225,6 +225,14 @@ class SequencePool { return isSuccess; } + void clear() { + for (auto &it : hub) { + delete it.second; + } + hub.clear(); + globalSequenceID = 0; + } + private: SequencePool() {} @@ -250,6 +258,12 @@ class InputQueue { void push(SequenceMeta *seq) { queue.push(seq); } + void clear() { + while (!queue.empty()) { + queue.pop(); + } + } + private: InputQueue() {} @@ -284,6 +298,12 @@ class TaskWaitingQueue { void push(SequenceMeta *seq) { queue.push(seq); } + void clear() { + while (!queue.empty()) { + queue.pop(); + } + } + private: TaskWaitingQueue() {} diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h index cff904a1..cbe06e82 100644 --- a/src/models/common_decoder.h +++ b/src/models/common_decoder.h @@ -348,23 +348,10 @@ class CommonDecoder : public AbstractDecoder { ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, seqLen, hiddenSize, pastSeqLen, this] { while (true) { int64_t recvBuf[2] = {0, 0}; - MPI_Recv(&recvBuf, 2, MPI_INT64_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&recvBuf, 2, MPI_INT64_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); int32_t sequenceID = recvBuf[0]; int64_t count = recvBuf[1]; - - // MPI_Status status; - // MPI_Probe(prev_world_rank, curr_world_rank, MPI_COMM_WORLD, &status); - // int number_amount; - // MPI_Get_count(&status, MPI_FLOAT, &number_amount); - // printf("Decoder.probe.%d\n", number_amount); - // fflush(stdout); - // float recvBuf[number_amount] = {0.0f}; - // MPI_Recv(&recvBuf, number_amount, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // int32_t sequenceID = recvBuf[0]; - // int64_t count = recvBuf[1]; - printf("Decoder.Seq%d.MPI_Recv%d\n", sequenceID, count); - fflush(stdout); - // memcpy(this->actBuffers->Data(), recvBuf + 2, count * sizeof(float)); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank); TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); @@ -400,9 +387,8 @@ class CommonDecoder : public AbstractDecoder { ctx->sequenceID = runningTask->getSequenceID(); // runningTask->setPastSeqLen(pastSeqLen); // runningTask->allocBuffer(hiddenSize, embBuf); - MPI_Recv(embBuf, TaskWaitingQueue::getInstance().front()->getHiddenStatesSize(), MPI_FLOAT, prev_world_rank, curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - printf("Decoder.Seq%d.step\n", sequenceID); - fflush(stdout); + MPI_Recv(embBuf, TaskWaitingQueue::getInstance().front()->getHiddenStatesSize(), MPI_FLOAT, prev_world_rank, + curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE); TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".Step"); #endif @@ -470,13 +456,8 @@ class CommonDecoder : public AbstractDecoder { int64_t sendBuf[2] = {sequenceID, count}; MPI_Send(&sendBuf, 2, MPI_INT64_T, next_world_rank, next_world_rank, MPI_COMM_WORLD); MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank + 1000, MPI_COMM_WORLD); - // float sendBuf[2 + count] = {(float)sequenceID, (float)count}; - // memcpy(sendBuf + 2, embBuf, count * sizeof(float)); - // MPI_Send(&sendBuf, 2 + count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank); - printf("Decoder.Seq%d.MPI_Send%d\n", sequenceID, count); - fflush(stdout); return std::tuple(nullptr, 0, 0); } #endif diff --git a/src/models/models.cpp b/src/models/models.cpp index 4ed419c1..18f08236 100644 --- a/src/models/models.cpp +++ b/src/models/models.cpp @@ -130,6 +130,11 @@ void Model::config(SearcherConfig &config_, const std::vector> // Slaves get exit flags and exit directly if (decoder->getRank() > 0 && configuration.numBeams == 0) { exit(0); } + InputQueue::getInstance().clear(); + TaskWaitingQueue::getInstance().clear(); + SequencePool::getInstance().clear(); + // ThreadPool::getInstance().clear(); + createSearcher(configuration); setStopWords(stopWordsList_); } @@ -157,55 +162,26 @@ std::vector Model::generate() { exit(-1); } - if (this->isMaster()) { + std::vector token; + if (!this->isMaster() && isNewInput) { + isNewInput = false; + token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); + TaskWaitingQueue::getInstance().front()->stepForward(); + } else { while(TaskWaitingQueue::getInstance().empty()); if (TaskWaitingQueue::getInstance().front()->getStep() == 0) { - auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); + isNewInput = false; + token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); TaskWaitingQueue::getInstance().front()->stepForward(); - TaskWaitingQueue::getInstance().pop(); - return token; } else { - isNewInput = false; - auto token = searcher->getNextToken(); + token = searcher->getNextToken(); TaskWaitingQueue::getInstance().front()->stepForward(token[0]); - TaskWaitingQueue::getInstance().pop(); - return token; - } - } else { - if (!isNewInput) { - while(TaskWaitingQueue::getInstance().empty()); - - if (TaskWaitingQueue::getInstance().front()->getStep() == 0) { - auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); - TaskWaitingQueue::getInstance().front()->stepForward(); - TaskWaitingQueue::getInstance().pop(); - return token; - } else { - auto token = searcher->getNextToken(); - TaskWaitingQueue::getInstance().front()->stepForward(token[0]); - TaskWaitingQueue::getInstance().pop(); - return token; - } - } else { - isNewInput = false; - auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); - TaskWaitingQueue::getInstance().front()->stepForward(); - TaskWaitingQueue::getInstance().pop(); - return token; } } - // if (isNewInput) { - // printf("1st\n"); - // // TaskWaitingQueue::getInstance().front()->getStep() == 0 - // // TaskWaitingQueue::getInstance().front()->stepForward(); - // isNewInput = false; - // return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize); - // } else { - // printf("2nd\n"); - // return searcher->getNextToken(); - // } + TaskWaitingQueue::getInstance().pop(); + return token; } void Model::createSearcher(SearcherConfig &config_) { diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp index 3ad6ba1f..493fb717 100644 --- a/src/searchers/greedy_search.cpp +++ b/src/searchers/greedy_search.cpp @@ -21,7 +21,7 @@ using namespace xft; GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config) - : decoder(dec), maxLen(config.maxLen), step(0), repetitionPenalty(config.repetitionPenalty) { + : decoder(dec), maxLen(config.maxLen), step(0), repetitionPenalty(config.repetitionPenalty), enabledBackgroundSync(false) { eosTokenId = config.eosTokenId == -1 ? decoder.getEndId() : config.eosTokenId; padTokenId = config.padTokenId == -1 ? eosTokenId : config.padTokenId; if (repetitionPenalty <= 0) { @@ -39,7 +39,6 @@ std::vector GreedySearch::syncToken(std::tuple &result) // Messenger &messenger = decoder.getMessenger(); if (std::get<0>(result) == nullptr) { // The first embedding pipeline parallel stage - this->nextTokens = std::vector(batchSize, 0); if (ctx->ppSize > 1 && ctx->ppRank == 0 && enabledBackgroundSync == false) { enabledBackgroundSync = true; int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank; @@ -52,11 +51,9 @@ std::vector GreedySearch::syncToken(std::tuple &result) this->nextTokens[0] = recvBuf[1]; // MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD, // MPI_STATUS_IGNORE); - // TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); + TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv"); // MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, // predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - printf("GreedySearch.Seq%d.MPI_Recv\n", sequenceID); - fflush(stdout); if (SequencePool::getInstance().has(sequenceID)) { auto sequence = SequencePool::getInstance().get(sequenceID); TaskWaitingQueue::getInstance().push(sequence); @@ -80,8 +77,6 @@ std::vector GreedySearch::syncToken(std::tuple &result) // MPI_COMM_WORLD); // TODO: Error: different scope when dynamic loading so file // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank); - printf("GreedySearch.Seq%d.MPI_Send\n", ctx->sequenceID); - fflush(stdout); } } #else @@ -112,6 +107,8 @@ std::vector GreedySearch::getNextToken(int *ids, int batchSize, int seqLen) std::copy(ids, ids + batchSize * seqLen, output.begin()); int64_t dims[3] = {batchSize, 1, seqLen}; + if (this->nextTokens.size() != batchSize) + this->nextTokens.resize(batchSize, 0); std::tuple result = decoder.forward(ids, dims, this->step++); @@ -122,6 +119,8 @@ std::vector GreedySearch::getNextToken(int *ids, int batchSize, int seqLen) std::vector GreedySearch::getNextToken() { TimeLine t("Next Token"); int64_t dims[3] = {batchSize, 1, 1}; + if (this->nextTokens.size() != batchSize) + this->nextTokens.resize(batchSize, 0); std::tuple result = decoder.forward(nextTokens.data(), dims, this->step++); diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h index 22a9c419..5b4ec164 100644 --- a/src/searchers/greedy_search.h +++ b/src/searchers/greedy_search.h @@ -47,7 +47,7 @@ class GreedySearch : public AbstractSearcher { std::vector> cachedRepetVec; std::vector doneBatch; - bool enabledBackgroundSync = false; + bool enabledBackgroundSync; int batchSize; int step; int curLen; diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h index a44b08d7..f22c59ac 100644 --- a/src/utils/thread_util.h +++ b/src/utils/thread_util.h @@ -56,7 +56,7 @@ class ThreadPool { condition.notify_one(); } - ~ThreadPool() { + void clear() { stop = true; condition.notify_all(); for (std::thread &worker : workers) {