From 3c509abff48ad667fa69d77b4ed1bb71a2e7af1a Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 15 Apr 2024 15:43:35 +0800
Subject: [PATCH 01/32] [Build] Fix build issue.

---
 src/layers/mlp_llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/layers/mlp_llama.cpp b/src/layers/mlp_llama.cpp
index f650a2f5..c457a344 100644
--- a/src/layers/mlp_llama.cpp
+++ b/src/layers/mlp_llama.cpp
@@ -64,7 +64,7 @@ void invokeMLPLLaMA(DataType dt, int numTokens, int hiddenSize, int intermediate
                 || (ctx != nullptr && (ctx->hiddenSize != hiddenSize || ctx->intermediateSize != intermediateSize))) {
             if (ctx != nullptr) delete ctx;
             printf(">> create context: %d %d\n", hiddenSize, intermediateSize);
-            ctx = new DecoderContext(1, hiddenSize, 1, 1, intermediateSize, "silu", 1e-6, 0, 0, 0, 0, 0, 0, 1);
+            ctx = new DecoderContext(1, hiddenSize, 1, 1, 1, intermediateSize, "silu", 1e-6, 0, 0, 0, 0, 0, 0, 1);
             ctx->mmHelper = new MMHelper(Env::getInstance().getEngineKind(), Env::getInstance().getEngineIndex());
         }
 

From 24de368530410e90489319419354d6c32a05cc6b Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Thu, 18 Apr 2024 22:14:06 +0800
Subject: [PATCH 02/32] [Model] Init pipeline parallel.

---
 examples/cpp/example.cpp        |   2 +-
 src/common/transformer_ctx.h    |  20 +--
 src/models/common_decoder.h     |  79 ++++++++++-
 src/models/models.cpp           |  19 ++-
 src/models/prompt.h             | 232 ++++++++++++++++++++++++++++++++
 src/searchers/greedy_search.cpp |  43 +++++-
 src/searchers/greedy_search.h   |   2 +
 src/utils/thread_util.h         |  67 ++++++++-
 8 files changed, 442 insertions(+), 22 deletions(-)
 create mode 100644 src/models/prompt.h

diff --git a/examples/cpp/example.cpp b/examples/cpp/example.cpp
index 34cf4eb0..f6a68ce7 100644
--- a/examples/cpp/example.cpp
+++ b/examples/cpp/example.cpp
@@ -467,7 +467,7 @@ int main(int argc, char **argv) {
         }
         auto result = model.finalize();
 
-        if (isMaster) {
+        if (true) {
             std::cout << "\n[INFO] Final output is: " << std::endl;
             std::vector<std::string> sent = tokenizer->batchDecode(result, batchSize);
             for (auto str : sent) {
diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h
index cb29ee11..b01d363c 100644
--- a/src/common/transformer_ctx.h
+++ b/src/common/transformer_ctx.h
@@ -18,15 +18,17 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <string>
-
-#include "allocator.h"
 #include <filesystem>
+#include <string>
 
 #include "INIReader.h"
+#include "allocator.h"
 #include "my_types.h"
 #include "simple_mem_pool.h"
 #include "split_util.h"
+#include "float16.h"
+#include "singleton.h"
+#include "kvcache_manager.h"
 
 namespace fs = std::filesystem;
 
@@ -126,10 +128,10 @@ struct DecoderContext {
     uint64_t size3;
 
 public:
-    DecoderContext(int _layers, int _hiddenSize, int _headSize, int _attHeadNum, int _kvHeadNum, int _imSize, const std::string &act,
-            float epsilon, int _vocabSize, int _embeddingSize, int _maxPositions, int _maxPosEmbed, int _maxSeqLength,
-            int _splitIdx, int _splits, int _ppSize = 1, int _ppRank = 0, RopeParams *_ropeParamsPtr = nullptr,
-            bool _useLogN = true, bool _useNTK = true, int numThreads = 0)
+    DecoderContext(int _layers, int _hiddenSize, int _headSize, int _attHeadNum, int _kvHeadNum, int _imSize,
+            const std::string &act, float epsilon, int _vocabSize, int _embeddingSize, int _maxPositions,
+            int _maxPosEmbed, int _maxSeqLength, int _splitIdx, int _splits, int _ppSize = 1, int _ppRank = 0,
+            RopeParams *_ropeParamsPtr = nullptr, bool _useLogN = true, bool _useNTK = true, int numThreads = 0)
         : layers(_layers)
         , hiddenSize(_hiddenSize)
         , attHeadSize(_headSize)
@@ -151,9 +153,7 @@ struct DecoderContext {
         , tpSize(_splits)
         , tpRank(_splitIdx)
         , epsilon(epsilon) {
-        if (attHeadNum != 0) {
-            this->attFactor = 1 / sqrtf(attHeadSize);
-        }
+        if (attHeadNum != 0) { this->attFactor = 1 / sqrtf(attHeadSize); }
 
         // Set the default value (don't worry, it can be changed later)
         this->batchSize = 1;
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index f51cdc25..59df24b6 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -34,6 +34,7 @@
 #include "transformer_ctx.h"
 #include "transpose_util.h"
 #include "weight_util.h"
+#include "prompt.h"
 
 using namespace xft;
 
@@ -224,6 +225,7 @@ class CommonDecoder : public AbstractDecoder {
         DecoderContext *ctx = getDecoderContext(layers, hiddenSize, size_per_head, attHeadNum, kvHeadNum, imSize, act,
                 epsilon, vocabSize, embeddingSize, maxPositions, maxPosEmbed, maxSeqLength, useLogN, useNTK,
                 ropeParamsPtr);
+        pool = new ThreadPool(4);
 
         ctx->ResetConfigReader(configPath);
 
@@ -285,6 +287,7 @@ class CommonDecoder : public AbstractDecoder {
         // Prepare context
         DecoderContext *ctx = this->getContext();
         ctx->resize(batchSize, seqLen, pastSeqLen);
+        int hiddenSize = ctx->hiddenSize;
 
         if (step == 0) {
             // Reset initial and accumulated sequence length at the first step
@@ -341,14 +344,70 @@ class CommonDecoder : public AbstractDecoder {
             int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank;
             int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * ctx->hiddenSize;
-            MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            // TODO: Error: different scope when dynamic loading so file
-            // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
+            if (TaskWaitingQueue<AttnInT>::getInstance().empty()) {
+                TimeLine t("Decoder.MPI_Recv");
+                printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank);
+                int32_t promptID;
+                MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                // TODO: Error: different scope when dynamic loading so file
+                // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
+                printf("%.6f\n", embBuf[0]);
+                if (!PromptPool<AttnInT>::getInstance().has(promptID)) {
+                    PromptMeta<AttnInT> *prompt = new PromptMeta<AttnInT>(promptID, 0, batchSize, seqLen, hiddenSize);
+                    prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
+                    PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
+                }
+                TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(promptID));
+                printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID);
+                fflush(stdout);
+            } else {
+                pool->enqueue([=, &embBuf] {
+                    TimeLine t("Decoder.MPI_Recv");
+                    printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank);
+                    int32_t promptID;
+                    MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    // TODO: Error: different scope when dynamic loading so file
+                    // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
+                    printf("%.6f\n", embBuf[0]);
+                    if (!PromptPool<AttnInT>::getInstance().has(promptID)) {
+                        PromptMeta<AttnInT> *prompt = new PromptMeta<AttnInT>(promptID, 0, batchSize, seqLen, hiddenSize);
+                        prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
+                        PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
+                    }
+                    TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(promptID));
+                    printf("%d: Decoder.MPI_Recv.ASyncDone %d\n", ctx->ppRank, promptID);
+                    fflush(stdout);
+                });
+            }
         }
 #endif
 
-        // Decoder: forward
-        int hiddenSize = ctx->hiddenSize;
+        if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
+            // for (const auto& prompt : PromptPool<AttnInT>::getInstance().getAll()) {
+            //     if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
+            //         if (prompt->hiddenStatesReceived) {
+            //             TaskWaitingQueue<AttnInT>::getInstance().push(prompt);
+            //         }
+            //     }
+            // }
+
+            if (!InputQueue<AttnInT>::getInstance().empty()) {
+                if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
+                    auto prompt = InputQueue<AttnInT>::getInstance().pop();
+                    prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
+                    PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
+                    TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(prompt->promptID));
+                }
+            }
+        }
+
+    PromptMeta<AttnInT> *runningTask;
+    while (!TaskWaitingQueue<AttnInT>::getInstance().empty()) {
+        runningTask = TaskWaitingQueue<AttnInT>::getInstance().pop();
+        printf("%d: Decoder.forward\n", ctx->ppRank);
+        // Decoder: forward from runningTask
         int layers_per_pp_stage = this->decoders.size();
         for (int i = 0; i < layers_per_pp_stage; ++i) {
             int workers = this->messenger.getSize();
@@ -399,15 +458,23 @@ class CommonDecoder : public AbstractDecoder {
                 }
             }
         }
+    } 
+    // else {
+    //     return std::tuple<float *, int, int>(nullptr, 0, 0);
+    // }
 
 #ifdef PIPELINE_PARALLEL
         // If current pipeline stage isn't the end of stage, should send data to next stage and return nullptr
         if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) {
+            TimeLine t("Decoder.MPI_Send");
             int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * ctx->hiddenSize;
+            MPI_Send(&runningTask->promptID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank);
+            printf("%d: Decoder.MPI_Send %d\n", ctx->ppRank, runningTask->promptID);
+            fflush(stdout);
             return std::tuple<float *, int, int>(nullptr, 0, 0);
         }
 #endif
@@ -967,6 +1034,8 @@ class CommonDecoder : public AbstractDecoder {
     // Activation buffers (declared as float, but the actual data type may be different)
     std::shared_ptr<hpj::Matrix<float>> actBuffers;
 
+    ThreadPool *pool;
+
 protected:
     // Components most LLMs may use
     std::vector<DECODER *> decoders;
diff --git a/src/models/models.cpp b/src/models/models.cpp
index e7fd4b70..f6568164 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -31,6 +31,7 @@
 #include "searcher.h"
 #include "timeline.h"
 #include "yarn_llama.h"
+#include "prompt.h"
 
 namespace xft {
 enum class GenerationMode { GREEDY_SEARCH, BEAM_SEARCH, SAMPLE };
@@ -136,9 +137,25 @@ std::vector<int32_t> Model::generate() {
     }
 
     if (isNewInput) {
-        isNewInput = false;
+        static int i = 0;
+        i++;
+        if (i > 3) {
+            isNewInput = false;
+            i = 0;
+        }
+        // TODO: Create it when request input
+        if (this->isMaster()) {
+            int promptID = InputQueue<float>::getInstance().createPromptID();
+            int tokenID = InputQueue<float>::getInstance().createTokenID();
+            InputQueue<float>::getInstance().push(new PromptMeta<float>(promptID, tokenID, batchSize, seqLen, inputIds));
+        }
         return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
     } else {
+        static int i = 0;
+        i++;
+        if (i == 10) {
+            isNewInput = true;
+        }
         return searcher->getNextToken();
     }
 }
diff --git a/src/models/prompt.h b/src/models/prompt.h
new file mode 100644
index 00000000..c10b1ab7
--- /dev/null
+++ b/src/models/prompt.h
@@ -0,0 +1,232 @@
+// Copyright (c) 2024 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// ============================================================================
+#pragma once
+
+#include <queue>
+#include <unordered_map>
+
+namespace xft {
+
+template <typename AttnInT>
+class PromptMeta {
+public:
+    PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, std::vector<int32_t> _inputs) {
+        promptID = _promptID;
+        tokenID = _tokenID;
+        batchSize = _batchSize;
+        inputSeqLen = _inputSeqLen;
+        inputs = _inputs;
+        hiddenStatesReceived = false;
+    }
+
+    PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, int32_t _hiddenSize) {
+        promptID = _promptID;
+        tokenID = _tokenID;
+        batchSize = _batchSize;
+        inputSeqLen = _inputSeqLen;
+        hiddenSize = _hiddenSize;
+        hiddenStatesReceived = false;
+        hiddenStates.Resize(batchSize * inputSeqLen, hiddenSize, hiddenSize);
+    }
+
+    void ResetKVCache(int32_t _hiddenSize, int32_t _pastSeqLen, int32_t _layerIdx, void *_hiddenStates, void *_kvm) {
+        hiddenSize = _hiddenSize;
+        pastSeqLen = _pastSeqLen;
+        layerIdx = _layerIdx;
+        hiddenStates.Resize(batchSize * inputSeqLen, hiddenSize, hiddenSize);
+        memcpy(hiddenStates.Data(), _hiddenStates, sizeof(AttnInT) * batchSize * inputSeqLen * hiddenSize);
+        kvm = _kvm;
+    }
+
+    int32_t promptID;
+    int32_t tokenID;
+    bool hiddenStatesReceived;
+
+private:
+    int32_t batchSize;
+    int32_t inputSeqLen;
+    int32_t hiddenSize;
+    int32_t pastSeqLen;
+    std::vector<int32_t> inputs;
+    std::vector<int32_t> outputs;
+    int32_t layerIdx;
+    hpj::Matrix<AttnInT> hiddenStates;
+    void *kvm; //KVCacheManager<KVCacheT>
+};
+
+template <typename T>
+class InputQueue {
+public:
+    static InputQueue &getInstance() {
+        static InputQueue instance;
+        return instance;
+    }
+
+    int32_t createPromptID() {
+        int32_t id = promptID++;
+        if (id > 1000) {
+            promptID = 0;
+            id = promptID++;
+        }
+        return id;
+    }
+
+    int32_t createTokenID() {
+        int32_t id = tokenID++;
+        if (id > 1000) {
+            tokenID = 0;
+            id = tokenID++;
+        }
+        return id;
+    }
+
+    bool empty() {
+        return queue.empty();
+    }
+
+    PromptMeta<T> *pop() {
+        auto buffer = queue.front();
+        queue.pop();
+        return buffer;
+    }
+
+    void push(PromptMeta<T> *buffer) {
+        queue.push(buffer);
+    }
+
+private:
+    InputQueue() {}
+
+    static int32_t promptID;
+    static int32_t tokenID;
+    std::queue<PromptMeta<T> *> queue;
+};
+
+template <typename T>
+int32_t InputQueue<T>::promptID = 0;
+
+template <typename T>
+int32_t InputQueue<T>::tokenID = 0;
+
+template <typename T>
+class TaskWaitingQueue {
+public:
+    static TaskWaitingQueue &getInstance() {
+        static TaskWaitingQueue instance;
+        return instance;
+    }
+
+    bool empty() {
+        return queue.empty();
+    }
+
+    int32_t size() {
+        return queue.size();
+    }
+
+    bool isFull() {
+        bool full = false;
+        if (this->size() >= 4)
+            full = true;
+        return full;
+    }
+
+    PromptMeta<T> *pop() {
+        auto buffer = queue.front();
+        queue.pop();
+        return buffer;
+    }
+
+    void push(PromptMeta<T> *buffer) {
+        queue.push(buffer);
+    }
+
+private:
+    TaskWaitingQueue() {}
+
+    std::queue<PromptMeta<T> *> queue;
+};
+
+template <typename T>
+class PromptPool {
+public:
+    static PromptPool &getInstance() {
+        static PromptPool instance;
+        return instance;
+    }
+
+    void insert(int32_t key, PromptMeta<T> *prompt) {
+        hub[key] = prompt;
+    }
+
+    bool has(int32_t key) const {
+        return hub.find(key) != hub.end();
+    }
+
+    PromptMeta<T> *get(int32_t key) const {
+        auto it = hub.find(key);
+        if (it != hub.end()) {
+            return it->second;
+        } else {
+            return nullptr;
+        }
+    }
+
+    std::vector<PromptMeta<T> *> getAll() {
+        std::vector<PromptMeta<T> *> metas;
+        for (const auto& pair : hub) {
+            metas.push_back(pair.second);
+        }
+        return metas;
+    }
+
+    void remove(int32_t key) {
+        hub.erase(key);
+    }
+
+    void modify(int32_t oldKey, PromptMeta<T> *newPrompt) {
+        auto it = hub.find(oldKey);
+        if (it != hub.end()) {
+            it->second = newPrompt;
+        }
+    }
+
+    bool isUpdated(int32_t key) const {
+        auto it = hub.find(key);
+        if (it != hub.end()) {
+            return it->second.hiddenStatesReceived;
+        } else {
+            printf("error: key not found\n");
+            return false;
+        }
+    }
+
+    bool setOld(int32_t key) {
+        auto it = hub.find(key);
+        if (it != hub.end()) {
+            it->second.hiddenStatesReceived = false;
+        } else {
+            printf("error: key not found\n");
+            return false;
+        }
+    }
+
+private:
+    PromptPool() {}
+
+    std::unordered_map<int32_t, PromptMeta<T> *> hub;
+};
+
+} // namespace xft
\ No newline at end of file
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index 0e55648e..aa221c91 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -15,6 +15,11 @@
 #include "greedy_search.h"
 #include "messenger.h"
 #include "search_utils.h"
+#include "prompt.h"
+
+#include <thread>
+
+using namespace xft;
 
 GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config)
     : decoder(dec), maxLen(config.maxLen), step(0), repetitionPenalty(config.repetitionPenalty) {
@@ -26,6 +31,8 @@ GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config)
     }
     stopWordsList = {};
     stopWordsIndex = {};
+
+    pool = new ThreadPool(4);
 }
 
 std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result) {
@@ -38,18 +45,46 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
         this->nextTokens = std::vector<int>(batchSize, 0);
         if (ctx->ppSize > 1 && ctx->ppRank == 0) {
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
-            MPI_Recv(this->nextTokens.data(), batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
-                    MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            // TODO: Error: different scope when dynamic loading so file
-            // messenger.worldRecvINT32(this->nextTokens.data(), batchSize, predictor_world_rank, predictor_world_rank);
+            // std::thread feedbackWaitingLastPP([&, predictor_world_rank, this](){
+            //     TimeLine t("GreedySearch.MPI_Recv");
+            //     MPI_Recv(this->nextTokens.data(), batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
+            //             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            //     // TODO: Error: different scope when dynamic loading so file
+            //     // messenger.worldRecvINT32(this->nextTokens.data(), batchSize, predictor_world_rank, predictor_world_rank);
+            //     printf("%d\n", this->nextTokens[0]);
+            // });
+            // feedbackWaitingLastPP.detach();
+            pool->enqueue([predictor_world_rank, this] {
+                TimeLine t("GreedySearch.MPI_Recv");
+                printf("0: GreedySearch.MPI_Recv.AsyncStart\n");
+                int32_t promptID;
+                MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
+                        MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
+                        MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                printf("%d\n", this->nextTokens[0]);
+                if (PromptPool<float>::getInstance().has(promptID)) {
+                    TaskWaitingQueue<float>::getInstance().push(PromptPool<float>::getInstance().get(promptID));
+                } else {
+                    printf("error: should have promptID\n");
+                }
+                printf("0: GreedySearch.MPI_Recv.AsyncDone %d\n", promptID);
+                fflush(stdout);
+            });
         }
     } else { // The last predictor pipeline parallel stage
         this->nextTokens = this->search(result);
         if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) {
+            TimeLine t("GreedySearch.MPI_Send");
             int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
+            static int32_t promptID = 0;
+            MPI_Send(&promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
             MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank,
                     MPI_COMM_WORLD);
+            printf("%d: GreedySearch.MPI_Send %d\n", ctx->ppRank, promptID);
+            fflush(stdout);
+            promptID++;
             // TODO: Error: different scope when dynamic loading so file
             // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank);
         }
diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h
index 607d9737..bf2709a5 100644
--- a/src/searchers/greedy_search.h
+++ b/src/searchers/greedy_search.h
@@ -18,6 +18,7 @@
 #include "messenger.h"
 #include "timeline.h"
 #include "transformer_ctx.h"
+#include "thread_util.h"
 
 class GreedySearch : public AbstractSearcher {
 public:
@@ -40,6 +41,7 @@ class GreedySearch : public AbstractSearcher {
     std::vector<int> search(std::tuple<float *, int, int> &result);
 
     AbstractDecoder &decoder;
+    ThreadPool *pool;
 
     // Predicted token IDs
     std::vector<int> nextTokens;
diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h
index c6826051..1d2d0dfe 100644
--- a/src/utils/thread_util.h
+++ b/src/utils/thread_util.h
@@ -1,6 +1,14 @@
 #pragma once
 #include <omp.h>
 
+#include <functional>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+#include <condition_variable>
+
 template <typename Lambda>
 void parallel_for(int tasks, const Lambda &fn) {
 #pragma omp parallel for
@@ -15,4 +23,61 @@ void parallel_for_dschedule(int tasks, const Lambda &fn) {
     for (int i = 0; i < tasks; i++) {
         fn(i);
     }
-}
\ No newline at end of file
+}
+
+class ThreadPool {
+public:
+    ThreadPool(size_t numThreads) : stop(false) {
+        for (size_t i = 0; i < numThreads; ++i) {
+            workers.emplace_back([this] {
+                while (true) {
+                    std::function<void()> task;
+
+                    {
+                        std::unique_lock<std::mutex> lock(this->queueMutex);
+                        this->condition.wait(lock, [this] { return this->stop || !this->tasks.empty(); });
+
+                        if (this->stop && this->tasks.empty()) { return; }
+
+                        task = std::move(this->tasks.front());
+                        this->tasks.pop();
+                    }
+
+                    task();
+                }
+            });
+        }
+    }
+
+    template <class F, class... Args>
+    void enqueue(F &&f, Args &&...args) {
+        {
+            std::unique_lock<std::mutex> lock(queueMutex);
+
+            tasks.emplace([f, args...] { f(args...); });
+        }
+
+        condition.notify_one();
+    }
+
+    ~ThreadPool() {
+        {
+            std::unique_lock<std::mutex> lock(queueMutex);
+            stop = true;
+        }
+
+        condition.notify_all();
+
+        for (std::thread &worker : workers) {
+            worker.join();
+        }
+    }
+
+private:
+    std::vector<std::thread> workers;
+    std::queue<std::function<void()>> tasks;
+
+    std::mutex queueMutex;
+    std::condition_variable condition;
+    bool stop;
+};
\ No newline at end of file

From 7f23d07d78832d9ea56e02c4ecb8d8c2a4517b0a Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Fri, 19 Apr 2024 12:37:34 +0800
Subject: [PATCH 03/32] could run input 3 and 4 in mpi 2 and 4.

---
 src/common/transformer_ctx.h    |  2 ++
 src/models/common_decoder.h     | 63 ++++++++++++++++++---------------
 src/models/models.cpp           | 10 +++---
 src/searchers/greedy_search.cpp | 44 +++++++++++++----------
 4 files changed, 66 insertions(+), 53 deletions(-)

diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h
index b01d363c..5f750fe8 100644
--- a/src/common/transformer_ctx.h
+++ b/src/common/transformer_ctx.h
@@ -66,6 +66,8 @@ struct DecoderContext {
     int inputSeqLen;
     // For custom usage
     int reserved1;
+    // promptID
+    int32_t promptID;
 
     // Model structure configuration
     int vocabSize;
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 59df24b6..14931a24 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -274,7 +274,6 @@ class CommonDecoder : public AbstractDecoder {
     std::tuple<float *, int, int> forward(int *ids, int64_t *dims, int step, bool logitsAll = false) {
         // Assume input has been synced with master in higher level.
         // Assume the 1st step input's shape is [userSideBS][1][seqLen].
-        TimeLine t("Decoder.forward");
         TimeLine t1("Decoder.embedding");
 
         int userSideBS = dims[0];
@@ -345,8 +344,9 @@ class CommonDecoder : public AbstractDecoder {
             int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * ctx->hiddenSize;
             if (TaskWaitingQueue<AttnInT>::getInstance().empty()) {
-                TimeLine t("Decoder.MPI_Recv");
+                TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->promptID));
                 printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank);
+                fflush(stdout);
                 int32_t promptID;
                 MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                 MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
@@ -362,37 +362,36 @@ class CommonDecoder : public AbstractDecoder {
                 printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID);
                 fflush(stdout);
             } else {
-                pool->enqueue([=, &embBuf] {
-                    TimeLine t("Decoder.MPI_Recv");
-                    printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank);
-                    int32_t promptID;
-                    MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                    MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                    // TODO: Error: different scope when dynamic loading so file
-                    // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
-                    printf("%.6f\n", embBuf[0]);
-                    if (!PromptPool<AttnInT>::getInstance().has(promptID)) {
-                        PromptMeta<AttnInT> *prompt = new PromptMeta<AttnInT>(promptID, 0, batchSize, seqLen, hiddenSize);
-                        prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                        PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
-                    }
-                    TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(promptID));
-                    printf("%d: Decoder.MPI_Recv.ASyncDone %d\n", ctx->ppRank, promptID);
-                    fflush(stdout);
-                });
+                static bool init = false;
+                if (init == false) {
+                    init = true;
+                    pool->enqueue([curr_world_rank, prev_world_rank, count, batchSize, seqLen, hiddenSize, pastSeqLen, &embBuf, &ctx, this] {
+                        while (true) {
+                            printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank);
+                            fflush(stdout);
+                            int32_t promptID;
+                            MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                            MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                            // TODO: Error: different scope when dynamic loading so file
+                            // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
+                            printf("%.6f\n", embBuf[0]);
+                            TimeLine t("Decoder.MPI_Recv");
+                            if (!PromptPool<AttnInT>::getInstance().has(promptID)) {
+                                PromptMeta<AttnInT> *prompt = new PromptMeta<AttnInT>(promptID, 0, batchSize, seqLen, hiddenSize);
+                                prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
+                                PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
+                            }
+                            TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(promptID));
+                            printf("%d: Decoder.MPI_Recv.ASyncDone %d\n", ctx->ppRank, promptID);
+                            fflush(stdout);
+                        }
+                    });
+                }
             }
         }
 #endif
 
         if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
-            // for (const auto& prompt : PromptPool<AttnInT>::getInstance().getAll()) {
-            //     if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
-            //         if (prompt->hiddenStatesReceived) {
-            //             TaskWaitingQueue<AttnInT>::getInstance().push(prompt);
-            //         }
-            //     }
-            // }
-
             if (!InputQueue<AttnInT>::getInstance().empty()) {
                 if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
                     auto prompt = InputQueue<AttnInT>::getInstance().pop();
@@ -403,10 +402,16 @@ class CommonDecoder : public AbstractDecoder {
             }
         }
 
+    while(TaskWaitingQueue<float>::getInstance().empty());
+
     PromptMeta<AttnInT> *runningTask;
-    while (!TaskWaitingQueue<AttnInT>::getInstance().empty()) {
+    if (!TaskWaitingQueue<AttnInT>::getInstance().empty()) {
         runningTask = TaskWaitingQueue<AttnInT>::getInstance().pop();
+        ctx->promptID = runningTask->promptID;
+        TimeLine t("Decoder.forward." + std::to_string(ctx->promptID));
         printf("%d: Decoder.forward\n", ctx->ppRank);
+        fflush(stdout);
+
         // Decoder: forward from runningTask
         int layers_per_pp_stage = this->decoders.size();
         for (int i = 0; i < layers_per_pp_stage; ++i) {
diff --git a/src/models/models.cpp b/src/models/models.cpp
index f6568164..2ed3c1cd 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -151,11 +151,11 @@ std::vector<int32_t> Model::generate() {
         }
         return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
     } else {
-        static int i = 0;
-        i++;
-        if (i == 10) {
-            isNewInput = true;
-        }
+        // static int i = 0;
+        // i++;
+        // if (i == 10) {
+        //     isNewInput = true;
+        // }
         return searcher->getNextToken();
     }
 }
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index aa221c91..be34ada9 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -42,8 +42,10 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
     // Messenger &messenger = decoder.getMessenger();
 
     if (std::get<0>(result) == nullptr) { // The first embedding pipeline parallel stage
+        static bool init = false;
         this->nextTokens = std::vector<int>(batchSize, 0);
-        if (ctx->ppSize > 1 && ctx->ppRank == 0) {
+        if (ctx->ppSize > 1 && ctx->ppRank == 0 && init == false) {
+            init = true;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
             // std::thread feedbackWaitingLastPP([&, predictor_world_rank, this](){
             //     TimeLine t("GreedySearch.MPI_Recv");
@@ -55,36 +57,40 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
             // });
             // feedbackWaitingLastPP.detach();
             pool->enqueue([predictor_world_rank, this] {
-                TimeLine t("GreedySearch.MPI_Recv");
-                printf("0: GreedySearch.MPI_Recv.AsyncStart\n");
-                int32_t promptID;
-                MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
-                        MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
-                        MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                printf("%d\n", this->nextTokens[0]);
-                if (PromptPool<float>::getInstance().has(promptID)) {
-                    TaskWaitingQueue<float>::getInstance().push(PromptPool<float>::getInstance().get(promptID));
-                } else {
-                    printf("error: should have promptID\n");
+                while (true) {
+                    printf("0: GreedySearch.MPI_Recv.AsyncStart\n");
+                    fflush(stdout);
+                    int32_t promptID;
+                    MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
+                            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
+                            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    printf("%d\n", this->nextTokens[0]);
+                    TimeLine t("GreedySearch.MPI_Recv");
+                    if (PromptPool<float>::getInstance().has(promptID)) {
+                        auto prompt = PromptPool<float>::getInstance().get(promptID);
+                        TaskWaitingQueue<float>::getInstance().push(prompt);
+                        printf("0: GreedySearch.MPI_Recv.AsyncDone %d\n", promptID);
+                        fflush(stdout);
+                    } else {
+                        printf("error: should have promptID\n");
+                        fflush(stdout);
+                    }
                 }
-                printf("0: GreedySearch.MPI_Recv.AsyncDone %d\n", promptID);
-                fflush(stdout);
             });
         }
     } else { // The last predictor pipeline parallel stage
         this->nextTokens = this->search(result);
         if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) {
             TimeLine t("GreedySearch.MPI_Send");
+            fflush(stdout);
             int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
-            static int32_t promptID = 0;
-            MPI_Send(&promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
+            MPI_Send(&ctx->promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
             MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank,
                     MPI_COMM_WORLD);
-            printf("%d: GreedySearch.MPI_Send %d\n", ctx->ppRank, promptID);
+            printf("%d: GreedySearch.MPI_Send %d\n", ctx->ppRank, ctx->promptID);
             fflush(stdout);
-            promptID++;
             // TODO: Error: different scope when dynamic loading so file
             // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank);
         }

From 4f67f28e7fa4c64597670b9f295cfa255eff469d Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Fri, 19 Apr 2024 14:28:40 +0800
Subject: [PATCH 04/32] use threadpool singleton

---
 src/models/common_decoder.h     |  5 +--
 src/searchers/greedy_search.cpp |  4 +--
 src/searchers/greedy_search.h   |  1 -
 src/utils/thread_util.h         | 61 ++++++++++++++++-----------------
 4 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 14931a24..c4f15c16 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -225,7 +225,6 @@ class CommonDecoder : public AbstractDecoder {
         DecoderContext *ctx = getDecoderContext(layers, hiddenSize, size_per_head, attHeadNum, kvHeadNum, imSize, act,
                 epsilon, vocabSize, embeddingSize, maxPositions, maxPosEmbed, maxSeqLength, useLogN, useNTK,
                 ropeParamsPtr);
-        pool = new ThreadPool(4);
 
         ctx->ResetConfigReader(configPath);
 
@@ -365,7 +364,7 @@ class CommonDecoder : public AbstractDecoder {
                 static bool init = false;
                 if (init == false) {
                     init = true;
-                    pool->enqueue([curr_world_rank, prev_world_rank, count, batchSize, seqLen, hiddenSize, pastSeqLen, &embBuf, &ctx, this] {
+                    ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, count, batchSize, seqLen, hiddenSize, pastSeqLen, &embBuf, &ctx, this] {
                         while (true) {
                             printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank);
                             fflush(stdout);
@@ -1039,8 +1038,6 @@ class CommonDecoder : public AbstractDecoder {
     // Activation buffers (declared as float, but the actual data type may be different)
     std::shared_ptr<hpj::Matrix<float>> actBuffers;
 
-    ThreadPool *pool;
-
 protected:
     // Components most LLMs may use
     std::vector<DECODER *> decoders;
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index be34ada9..d5b3733e 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -31,8 +31,6 @@ GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config)
     }
     stopWordsList = {};
     stopWordsIndex = {};
-
-    pool = new ThreadPool(4);
 }
 
 std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result) {
@@ -56,7 +54,7 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
             //     printf("%d\n", this->nextTokens[0]);
             // });
             // feedbackWaitingLastPP.detach();
-            pool->enqueue([predictor_world_rank, this] {
+            ThreadPool::getInstance().addTask([predictor_world_rank, this] {
                 while (true) {
                     printf("0: GreedySearch.MPI_Recv.AsyncStart\n");
                     fflush(stdout);
diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h
index bf2709a5..69784c5e 100644
--- a/src/searchers/greedy_search.h
+++ b/src/searchers/greedy_search.h
@@ -41,7 +41,6 @@ class GreedySearch : public AbstractSearcher {
     std::vector<int> search(std::tuple<float *, int, int> &result);
 
     AbstractDecoder &decoder;
-    ThreadPool *pool;
 
     // Predicted token IDs
     std::vector<int> nextTokens;
diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h
index 1d2d0dfe..17f49453 100644
--- a/src/utils/thread_util.h
+++ b/src/utils/thread_util.h
@@ -27,53 +27,50 @@ void parallel_for_dschedule(int tasks, const Lambda &fn) {
 
 class ThreadPool {
 public:
-    ThreadPool(size_t numThreads) : stop(false) {
-        for (size_t i = 0; i < numThreads; ++i) {
-            workers.emplace_back([this] {
-                while (true) {
-                    std::function<void()> task;
-
-                    {
-                        std::unique_lock<std::mutex> lock(this->queueMutex);
-                        this->condition.wait(lock, [this] { return this->stop || !this->tasks.empty(); });
-
-                        if (this->stop && this->tasks.empty()) { return; }
-
-                        task = std::move(this->tasks.front());
-                        this->tasks.pop();
-                    }
-
-                    task();
-                }
-            });
-        }
+    static ThreadPool& getInstance() {
+        static ThreadPool instance;
+        return instance;
     }
 
-    template <class F, class... Args>
-    void enqueue(F &&f, Args &&...args) {
+    template<typename F, typename... Args>
+    void addTask(F&& f, Args&&... args) {
         {
             std::unique_lock<std::mutex> lock(queueMutex);
-
-            tasks.emplace([f, args...] { f(args...); });
+            tasks.emplace(std::bind(std::forward<F>(f), std::forward<Args>(args)...));
         }
-
         condition.notify_one();
     }
 
     ~ThreadPool() {
-        {
-            std::unique_lock<std::mutex> lock(queueMutex);
-            stop = true;
-        }
-
+        stop = true;
         condition.notify_all();
-
-        for (std::thread &worker : workers) {
+        for (std::thread& worker : workers) {
             worker.join();
         }
     }
 
 private:
+    ThreadPool() : stop(false) {
+        for (size_t i = 0; i < numThreads; ++i) {
+            workers.emplace_back([this] {
+                while (true) {
+                    std::function<void()> task;
+                    {
+                        std::unique_lock<std::mutex> lock(queueMutex);
+                        condition.wait(lock, [this] { return stop || !tasks.empty(); });
+                        if (stop && tasks.empty()) {
+                            return;
+                        }
+                        task = std::move(tasks.front());
+                        tasks.pop();
+                    }
+                    task();
+                }
+            });
+        }
+    }
+
+    static constexpr size_t numThreads = 1;
     std::vector<std::thread> workers;
     std::queue<std::function<void()>> tasks;
 

From d7a43045b20de4c9d08e80c3f5c3db7c689b4c69 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Fri, 19 Apr 2024 14:32:42 +0800
Subject: [PATCH 05/32] format code

---
 src/models/prompt.h     | 44 ++++++++++++-----------------------------
 src/utils/thread_util.h | 26 +++++++++++++++++-------
 2 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/src/models/prompt.h b/src/models/prompt.h
index c10b1ab7..e9a044dd 100644
--- a/src/models/prompt.h
+++ b/src/models/prompt.h
@@ -22,7 +22,8 @@ namespace xft {
 template <typename AttnInT>
 class PromptMeta {
 public:
-    PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, std::vector<int32_t> _inputs) {
+    PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen,
+            std::vector<int32_t> _inputs) {
         promptID = _promptID;
         tokenID = _tokenID;
         batchSize = _batchSize;
@@ -92,9 +93,7 @@ class InputQueue {
         return id;
     }
 
-    bool empty() {
-        return queue.empty();
-    }
+    bool empty() { return queue.empty(); }
 
     PromptMeta<T> *pop() {
         auto buffer = queue.front();
@@ -102,9 +101,7 @@ class InputQueue {
         return buffer;
     }
 
-    void push(PromptMeta<T> *buffer) {
-        queue.push(buffer);
-    }
+    void push(PromptMeta<T> *buffer) { queue.push(buffer); }
 
 private:
     InputQueue() {}
@@ -128,18 +125,13 @@ class TaskWaitingQueue {
         return instance;
     }
 
-    bool empty() {
-        return queue.empty();
-    }
+    bool empty() { return queue.empty(); }
 
-    int32_t size() {
-        return queue.size();
-    }
+    int32_t size() { return queue.size(); }
 
     bool isFull() {
         bool full = false;
-        if (this->size() >= 4)
-            full = true;
+        if (this->size() >= 4) full = true;
         return full;
     }
 
@@ -149,9 +141,7 @@ class TaskWaitingQueue {
         return buffer;
     }
 
-    void push(PromptMeta<T> *buffer) {
-        queue.push(buffer);
-    }
+    void push(PromptMeta<T> *buffer) { queue.push(buffer); }
 
 private:
     TaskWaitingQueue() {}
@@ -167,13 +157,9 @@ class PromptPool {
         return instance;
     }
 
-    void insert(int32_t key, PromptMeta<T> *prompt) {
-        hub[key] = prompt;
-    }
+    void insert(int32_t key, PromptMeta<T> *prompt) { hub[key] = prompt; }
 
-    bool has(int32_t key) const {
-        return hub.find(key) != hub.end();
-    }
+    bool has(int32_t key) const { return hub.find(key) != hub.end(); }
 
     PromptMeta<T> *get(int32_t key) const {
         auto it = hub.find(key);
@@ -186,21 +172,17 @@ class PromptPool {
 
     std::vector<PromptMeta<T> *> getAll() {
         std::vector<PromptMeta<T> *> metas;
-        for (const auto& pair : hub) {
+        for (const auto &pair : hub) {
             metas.push_back(pair.second);
         }
         return metas;
     }
 
-    void remove(int32_t key) {
-        hub.erase(key);
-    }
+    void remove(int32_t key) { hub.erase(key); }
 
     void modify(int32_t oldKey, PromptMeta<T> *newPrompt) {
         auto it = hub.find(oldKey);
-        if (it != hub.end()) {
-            it->second = newPrompt;
-        }
+        if (it != hub.end()) { it->second = newPrompt; }
     }
 
     bool isUpdated(int32_t key) const {
diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h
index 17f49453..b7209384 100644
--- a/src/utils/thread_util.h
+++ b/src/utils/thread_util.h
@@ -1,3 +1,17 @@
+// Copyright (c) 2024 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// ============================================================================
 #pragma once
 #include <omp.h>
 
@@ -27,13 +41,13 @@ void parallel_for_dschedule(int tasks, const Lambda &fn) {
 
 class ThreadPool {
 public:
-    static ThreadPool& getInstance() {
+    static ThreadPool &getInstance() {
         static ThreadPool instance;
         return instance;
     }
 
-    template<typename F, typename... Args>
-    void addTask(F&& f, Args&&... args) {
+    template <typename F, typename... Args>
+    void addTask(F &&f, Args &&...args) {
         {
             std::unique_lock<std::mutex> lock(queueMutex);
             tasks.emplace(std::bind(std::forward<F>(f), std::forward<Args>(args)...));
@@ -44,7 +58,7 @@ class ThreadPool {
     ~ThreadPool() {
         stop = true;
         condition.notify_all();
-        for (std::thread& worker : workers) {
+        for (std::thread &worker : workers) {
             worker.join();
         }
     }
@@ -58,9 +72,7 @@ class ThreadPool {
                     {
                         std::unique_lock<std::mutex> lock(queueMutex);
                         condition.wait(lock, [this] { return stop || !tasks.empty(); });
-                        if (stop && tasks.empty()) {
-                            return;
-                        }
+                        if (stop && tasks.empty()) { return; }
                         task = std::move(tasks.front());
                         tasks.pop();
                     }

From 70c17cd8b07024042d242804d81fcb05d53fc636 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Fri, 19 Apr 2024 14:40:01 +0800
Subject: [PATCH 06/32] format code

---
 src/common/transformer_ctx.h | 25 ++++++++++++-------------
 src/utils/thread_util.h      |  6 +++++-
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h
index 5f750fe8..d6f858af 100644
--- a/src/common/transformer_ctx.h
+++ b/src/common/transformer_ctx.h
@@ -18,17 +18,15 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <filesystem>
 #include <string>
 
-#include "INIReader.h"
 #include "allocator.h"
+#include <filesystem>
+
+#include "INIReader.h"
 #include "my_types.h"
 #include "simple_mem_pool.h"
 #include "split_util.h"
-#include "float16.h"
-#include "singleton.h"
-#include "kvcache_manager.h"
 
 namespace fs = std::filesystem;
 
@@ -66,8 +64,7 @@ struct DecoderContext {
     int inputSeqLen;
     // For custom usage
     int reserved1;
-    // promptID
-    int32_t promptID;
+    int promptID;
 
     // Model structure configuration
     int vocabSize;
@@ -130,10 +127,10 @@ struct DecoderContext {
     uint64_t size3;
 
 public:
-    DecoderContext(int _layers, int _hiddenSize, int _headSize, int _attHeadNum, int _kvHeadNum, int _imSize,
-            const std::string &act, float epsilon, int _vocabSize, int _embeddingSize, int _maxPositions,
-            int _maxPosEmbed, int _maxSeqLength, int _splitIdx, int _splits, int _ppSize = 1, int _ppRank = 0,
-            RopeParams *_ropeParamsPtr = nullptr, bool _useLogN = true, bool _useNTK = true, int numThreads = 0)
+    DecoderContext(int _layers, int _hiddenSize, int _headSize, int _attHeadNum, int _kvHeadNum, int _imSize, const std::string &act,
+            float epsilon, int _vocabSize, int _embeddingSize, int _maxPositions, int _maxPosEmbed, int _maxSeqLength,
+            int _splitIdx, int _splits, int _ppSize = 1, int _ppRank = 0, RopeParams *_ropeParamsPtr = nullptr,
+            bool _useLogN = true, bool _useNTK = true, int numThreads = 0)
         : layers(_layers)
         , hiddenSize(_hiddenSize)
         , attHeadSize(_headSize)
@@ -155,7 +152,9 @@ struct DecoderContext {
         , tpSize(_splits)
         , tpRank(_splitIdx)
         , epsilon(epsilon) {
-        if (attHeadNum != 0) { this->attFactor = 1 / sqrtf(attHeadSize); }
+        if (attHeadNum != 0) {
+            this->attFactor = 1 / sqrtf(attHeadSize);
+        }
 
         // Set the default value (don't worry, it can be changed later)
         this->batchSize = 1;
@@ -328,4 +327,4 @@ struct DecoderContext {
     }
 
     ~DecoderContext() { free(this->rawBuffer); }
-};
+};
\ No newline at end of file
diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h
index b7209384..75507274 100644
--- a/src/utils/thread_util.h
+++ b/src/utils/thread_util.h
@@ -23,6 +23,8 @@
 #include <vector>
 #include <condition_variable>
 
+namespace xft {
+
 template <typename Lambda>
 void parallel_for(int tasks, const Lambda &fn) {
 #pragma omp parallel for
@@ -89,4 +91,6 @@ class ThreadPool {
     std::mutex queueMutex;
     std::condition_variable condition;
     bool stop;
-};
\ No newline at end of file
+};
+
+} // namespace xft
\ No newline at end of file

From 3eca8ef54025c3fec193b9690800285b193c960e Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Fri, 19 Apr 2024 15:07:50 +0800
Subject: [PATCH 07/32] remove non-Master thread code

---
 src/models/common_decoder.h | 70 +++++++++++--------------------------
 1 file changed, 21 insertions(+), 49 deletions(-)

diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 7ede3316..5e5067f9 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -274,6 +274,7 @@ class CommonDecoder : public AbstractDecoder {
     std::tuple<float *, int, int> forward(int *ids, int64_t *dims, int step, bool logitsAll = false) {
         // Assume input has been synced with master in higher level.
         // Assume the 1st step input's shape is [userSideBS][1][seqLen].
+        TimeLine t("Decoder.forward");
         TimeLine t1("Decoder.embedding");
 
         int userSideBS = dims[0];
@@ -315,7 +316,7 @@ class CommonDecoder : public AbstractDecoder {
         }
 
         AttnInT *embBuf = (AttnInT *)actBuffers->Data();
-        MlpOutT *outBuf = (MlpOutT *)(embBuf + batchSize * inputSeqLen * ctx->hiddenSize);
+        MlpOutT *outBuf = (MlpOutT *)(embBuf + batchSize * inputSeqLen * hiddenSize);
 
         // Embedding
         this->embeddingForward(ids, embBuf, batchSize, inputSeqLen);
@@ -326,8 +327,8 @@ class CommonDecoder : public AbstractDecoder {
         dbg.debugPrint("ids:\n");
         dbg.dumpMatrix(ids, batchSize, inputSeqLen, inputSeqLen);
         dbg.debugPrint(
-                "embBuf(rows: %d, cols: %d, stride: %d):\n", batchSize * inputSeqLen, ctx->hiddenSize, ctx->hiddenSize);
-        dbg.dumpMatrix(embBuf, batchSize * inputSeqLen, ctx->hiddenSize, ctx->hiddenSize);
+                "embBuf(rows: %d, cols: %d, stride: %d):\n", batchSize * inputSeqLen, hiddenSize, hiddenSize);
+        dbg.dumpMatrix(embBuf, batchSize * inputSeqLen, hiddenSize, hiddenSize);
 #endif
 
         // Prepare attention mask
@@ -342,52 +343,23 @@ class CommonDecoder : public AbstractDecoder {
         if (ctx->ppSize > 1 && ctx->ppRank > 0) {
             int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank;
             int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
-            int count = batchSize * inputSeqLen * ctx->hiddenSize;
-            if (TaskWaitingQueue<AttnInT>::getInstance().empty()) {
-                TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->promptID));
-                printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank);
-                fflush(stdout);
-                int32_t promptID;
-                MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                // TODO: Error: different scope when dynamic loading so file
-                // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
-                printf("%.6f\n", embBuf[0]);
-                if (!PromptPool<AttnInT>::getInstance().has(promptID)) {
-                    PromptMeta<AttnInT> *prompt = new PromptMeta<AttnInT>(promptID, 0, batchSize, seqLen, hiddenSize);
-                    prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                    PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
-                }
-                TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(promptID));
-                printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID);
-                fflush(stdout);
-            } else {
-                static bool init = false;
-                if (init == false) {
-                    init = true;
-                    ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, count, batchSize, seqLen, hiddenSize, pastSeqLen, &embBuf, &ctx, this] {
-                        while (true) {
-                            printf("%d: Decoder.MPI_Recv.ASyncStart\n", ctx->ppRank);
-                            fflush(stdout);
-                            int32_t promptID;
-                            MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                            MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                            // TODO: Error: different scope when dynamic loading so file
-                            // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
-                            printf("%.6f\n", embBuf[0]);
-                            TimeLine t("Decoder.MPI_Recv");
-                            if (!PromptPool<AttnInT>::getInstance().has(promptID)) {
-                                PromptMeta<AttnInT> *prompt = new PromptMeta<AttnInT>(promptID, 0, batchSize, seqLen, hiddenSize);
-                                prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                                PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
-                            }
-                            TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(promptID));
-                            printf("%d: Decoder.MPI_Recv.ASyncDone %d\n", ctx->ppRank, promptID);
-                            fflush(stdout);
-                        }
-                    });
-                }
+            int count = batchSize * inputSeqLen * hiddenSize;
+            TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->promptID));
+            printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank);
+            fflush(stdout);
+            int32_t promptID;
+            MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            // TODO: Error: different scope when dynamic loading so file
+            // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
+            if (!PromptPool<AttnInT>::getInstance().has(promptID)) {
+                PromptMeta<AttnInT> *prompt = new PromptMeta<AttnInT>(promptID, 0, batchSize, seqLen, hiddenSize);
+                prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
+                PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
             }
+            TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(promptID));
+            printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID);
+            fflush(stdout);
         }
 #endif
 
@@ -473,7 +445,7 @@ class CommonDecoder : public AbstractDecoder {
         if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) {
             TimeLine t("Decoder.MPI_Send");
             int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank;
-            int count = batchSize * inputSeqLen * ctx->hiddenSize;
+            int count = batchSize * inputSeqLen * hiddenSize;
             MPI_Send(&runningTask->promptID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file

From 2216a4082c2f1d7465ccfc6a0ebeb760c6d259b4 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Fri, 19 Apr 2024 15:36:00 +0800
Subject: [PATCH 08/32] format code

---
 src/models/common_decoder.h     | 38 +++++++++++++++------------------
 src/models/models.cpp           |  5 -----
 src/searchers/greedy_search.cpp |  2 --
 src/utils/thread_util.h         |  1 -
 4 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 5e5067f9..c02ed020 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -361,28 +361,26 @@ class CommonDecoder : public AbstractDecoder {
             printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID);
             fflush(stdout);
         }
-#endif
 
-        if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
-            if (!InputQueue<AttnInT>::getInstance().empty()) {
-                if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
-                    auto prompt = InputQueue<AttnInT>::getInstance().pop();
-                    prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                    PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
-                    TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(prompt->promptID));
-                }
+        if (!InputQueue<AttnInT>::getInstance().empty()) {
+            if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
+                auto prompt = InputQueue<AttnInT>::getInstance().pop();
+                prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
+                PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
+                TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(prompt->promptID));
             }
         }
 
-    while(TaskWaitingQueue<float>::getInstance().empty());
+        while(TaskWaitingQueue<float>::getInstance().empty());
 
-    PromptMeta<AttnInT> *runningTask;
-    if (!TaskWaitingQueue<AttnInT>::getInstance().empty()) {
-        runningTask = TaskWaitingQueue<AttnInT>::getInstance().pop();
-        ctx->promptID = runningTask->promptID;
-        TimeLine t("Decoder.forward." + std::to_string(ctx->promptID));
-        printf("%d: Decoder.forward\n", ctx->ppRank);
-        fflush(stdout);
+        PromptMeta<AttnInT> *runningTask;
+        if (!TaskWaitingQueue<AttnInT>::getInstance().empty()) {
+            runningTask = TaskWaitingQueue<AttnInT>::getInstance().pop();
+            ctx->promptID = runningTask->promptID;
+            TimeLine t("Decoder.forward." + std::to_string(ctx->promptID));
+            printf("%d: Decoder.forward\n", ctx->ppRank);
+            fflush(stdout);
+#endif
 
         // Decoder: forward from runningTask
         int layers_per_pp_stage = this->decoders.size();
@@ -435,12 +433,10 @@ class CommonDecoder : public AbstractDecoder {
                 }
             }
         }
-    } 
-    // else {
-    //     return std::tuple<float *, int, int>(nullptr, 0, 0);
-    // }
 
 #ifdef PIPELINE_PARALLEL
+        }
+
         // If current pipeline stage isn't the end of stage, should send data to next stage and return nullptr
         if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) {
             TimeLine t("Decoder.MPI_Send");
diff --git a/src/models/models.cpp b/src/models/models.cpp
index ebbe4f7d..80e7c28c 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -157,11 +157,6 @@ std::vector<int32_t> Model::generate() {
         }
         return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
     } else {
-        // static int i = 0;
-        // i++;
-        // if (i == 10) {
-        //     isNewInput = true;
-        // }
         return searcher->getNextToken();
     }
 }
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index d5b3733e..2747cb13 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -17,8 +17,6 @@
 #include "search_utils.h"
 #include "prompt.h"
 
-#include <thread>
-
 using namespace xft;
 
 GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config)
diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h
index 75507274..a44b08d7 100644
--- a/src/utils/thread_util.h
+++ b/src/utils/thread_util.h
@@ -16,7 +16,6 @@
 #include <omp.h>
 
 #include <functional>
-#include <iostream>
 #include <mutex>
 #include <queue>
 #include <thread>

From 143294b74b5f734a9bbcfd294c2b7685bc393ac7 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Fri, 19 Apr 2024 15:39:41 +0800
Subject: [PATCH 09/32] format code

---
 src/models/common_decoder.h     | 2 +-
 src/searchers/greedy_search.cpp | 1 +
 src/searchers/greedy_search.h   | 1 -
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index c02ed020..44252ad6 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -382,7 +382,7 @@ class CommonDecoder : public AbstractDecoder {
             fflush(stdout);
 #endif
 
-        // Decoder: forward from runningTask
+        // Decoder: forward
         int layers_per_pp_stage = this->decoders.size();
         for (int i = 0; i < layers_per_pp_stage; ++i) {
             int workers = this->messenger.getSize();
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index 2747cb13..ed8c90d4 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -15,6 +15,7 @@
 #include "greedy_search.h"
 #include "messenger.h"
 #include "search_utils.h"
+#include "thread_util.h"
 #include "prompt.h"
 
 using namespace xft;
diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h
index 69784c5e..607d9737 100644
--- a/src/searchers/greedy_search.h
+++ b/src/searchers/greedy_search.h
@@ -18,7 +18,6 @@
 #include "messenger.h"
 #include "timeline.h"
 #include "transformer_ctx.h"
-#include "thread_util.h"
 
 class GreedySearch : public AbstractSearcher {
 public:

From 700d0c8f7a2aa8764d3f307f54d6a1521343a550 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Fri, 19 Apr 2024 15:43:35 +0800
Subject: [PATCH 10/32] move prompt.h

---
 src/{models => utils}/prompt.h | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/{models => utils}/prompt.h (100%)

diff --git a/src/models/prompt.h b/src/utils/prompt.h
similarity index 100%
rename from src/models/prompt.h
rename to src/utils/prompt.h

From 8295d0bfc2e5f7c5f6b0b113356921820840596a Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Fri, 19 Apr 2024 17:43:50 +0800
Subject: [PATCH 11/32] Add comments

---
 src/utils/prompt.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/utils/prompt.h b/src/utils/prompt.h
index e9a044dd..05213f82 100644
--- a/src/utils/prompt.h
+++ b/src/utils/prompt.h
@@ -14,9 +14,29 @@
 // ============================================================================
 #pragma once
 
+#include <cstdint>
 #include <queue>
 #include <unordered_map>
 
+/*
+                  PromptPool
+                 ┌──────┬──────┬──────┐
+                 │      │      │  ◄───┼──┬─ PromptMeta
+                 ├──────┼──────┼──────┤  │
+                 │      │      │  ◄───┼──┘
+                 └▲─┬─▲─┴──────┴──────┘
+                  │ │ └───────────────────────────────────┐
+   ┌──┬──┬──┬──┐  │ │      ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐   │
+   │  │  │  │  ├──┘ └─────►│  │  │  │  │  │  │  │  │  ├─┐ │
+   └──┴──┴──┴──┘           └──┴──┴──┴──┴──┴──┴──┴──┴──┘ │ │
+    InputQueue              TaskWaitingQueue0           │ │
+                        ┌───────────────────────────────┘ │
+                        │  ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐   │
+                        └─►│  │  │  │  │  │  │  │  │  ├───┘
+                           └──┴──┴──┴──┴──┴──┴──┴──┴──┘
+                            TaskWaitingQueue1
+*/
+
 namespace xft {
 
 template <typename AttnInT>

From 93cbade9bdec945f10bcf8239375b6e6b3b95a95 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 10:51:12 +0800
Subject: [PATCH 12/32] input 2 request

---
 examples/cpp/example.cpp        |  2 +-
 src/models/models.cpp           |  2 +-
 src/searchers/greedy_search.cpp | 28 +++++-----------------------
 src/searchers/greedy_search.h   |  1 +
 src/utils/prompt.h              | 29 ++---------------------------
 5 files changed, 10 insertions(+), 52 deletions(-)

diff --git a/examples/cpp/example.cpp b/examples/cpp/example.cpp
index f6a68ce7..34cf4eb0 100644
--- a/examples/cpp/example.cpp
+++ b/examples/cpp/example.cpp
@@ -467,7 +467,7 @@ int main(int argc, char **argv) {
         }
         auto result = model.finalize();
 
-        if (true) {
+        if (isMaster) {
             std::cout << "\n[INFO] Final output is: " << std::endl;
             std::vector<std::string> sent = tokenizer->batchDecode(result, batchSize);
             for (auto str : sent) {
diff --git a/src/models/models.cpp b/src/models/models.cpp
index 80e7c28c..0d43c765 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -145,7 +145,7 @@ std::vector<int32_t> Model::generate() {
     if (isNewInput) {
         static int i = 0;
         i++;
-        if (i > 3) {
+        if (i > 1) {
             isNewInput = false;
             i = 0;
         }
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index ed8c90d4..5bd5ec0c 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -39,38 +39,23 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
     // Messenger &messenger = decoder.getMessenger();
 
     if (std::get<0>(result) == nullptr) { // The first embedding pipeline parallel stage
-        static bool init = false;
         this->nextTokens = std::vector<int>(batchSize, 0);
-        if (ctx->ppSize > 1 && ctx->ppRank == 0 && init == false) {
-            init = true;
+        if (ctx->ppSize > 1 && ctx->ppRank == 0 && enabledBackgroundSync == false) {
+            enabledBackgroundSync = true;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
-            // std::thread feedbackWaitingLastPP([&, predictor_world_rank, this](){
-            //     TimeLine t("GreedySearch.MPI_Recv");
-            //     MPI_Recv(this->nextTokens.data(), batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
-            //             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            //     // TODO: Error: different scope when dynamic loading so file
-            //     // messenger.worldRecvINT32(this->nextTokens.data(), batchSize, predictor_world_rank, predictor_world_rank);
-            //     printf("%d\n", this->nextTokens[0]);
-            // });
-            // feedbackWaitingLastPP.detach();
             ThreadPool::getInstance().addTask([predictor_world_rank, this] {
                 while (true) {
-                    printf("0: GreedySearch.MPI_Recv.AsyncStart\n");
-                    fflush(stdout);
                     int32_t promptID;
                     MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(promptID));
                     MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                    printf("%d\n", this->nextTokens[0]);
-                    TimeLine t("GreedySearch.MPI_Recv");
                     if (PromptPool<float>::getInstance().has(promptID)) {
                         auto prompt = PromptPool<float>::getInstance().get(promptID);
                         TaskWaitingQueue<float>::getInstance().push(prompt);
-                        printf("0: GreedySearch.MPI_Recv.AsyncDone %d\n", promptID);
-                        fflush(stdout);
                     } else {
-                        printf("error: should have promptID\n");
+                        printf("Error: should have promptID\n");
                         fflush(stdout);
                     }
                 }
@@ -79,15 +64,12 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
     } else { // The last predictor pipeline parallel stage
         this->nextTokens = this->search(result);
         if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) {
-            TimeLine t("GreedySearch.MPI_Send");
-            fflush(stdout);
+            TimeLine t("GreedySearch.MPI_Send.prompt" + std::to_string(ctx->promptID));
             int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
             MPI_Send(&ctx->promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
             MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank,
                     MPI_COMM_WORLD);
-            printf("%d: GreedySearch.MPI_Send %d\n", ctx->ppRank, ctx->promptID);
-            fflush(stdout);
             // TODO: Error: different scope when dynamic loading so file
             // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank);
         }
diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h
index 607d9737..5b4ec164 100644
--- a/src/searchers/greedy_search.h
+++ b/src/searchers/greedy_search.h
@@ -47,6 +47,7 @@ class GreedySearch : public AbstractSearcher {
     std::vector<std::vector<int>> cachedRepetVec;
     std::vector<int> doneBatch;
 
+    bool enabledBackgroundSync;
     int batchSize;
     int step;
     int curLen;
diff --git a/src/utils/prompt.h b/src/utils/prompt.h
index 05213f82..753e4bd1 100644
--- a/src/utils/prompt.h
+++ b/src/utils/prompt.h
@@ -190,38 +190,13 @@ class PromptPool {
         }
     }
 
-    std::vector<PromptMeta<T> *> getAll() {
-        std::vector<PromptMeta<T> *> metas;
-        for (const auto &pair : hub) {
-            metas.push_back(pair.second);
-        }
-        return metas;
-    }
-
     void remove(int32_t key) { hub.erase(key); }
 
     void modify(int32_t oldKey, PromptMeta<T> *newPrompt) {
         auto it = hub.find(oldKey);
-        if (it != hub.end()) { it->second = newPrompt; }
-    }
-
-    bool isUpdated(int32_t key) const {
-        auto it = hub.find(key);
         if (it != hub.end()) {
-            return it->second.hiddenStatesReceived;
-        } else {
-            printf("error: key not found\n");
-            return false;
-        }
-    }
-
-    bool setOld(int32_t key) {
-        auto it = hub.find(key);
-        if (it != hub.end()) {
-            it->second.hiddenStatesReceived = false;
-        } else {
-            printf("error: key not found\n");
-            return false;
+            delete it->second;
+            it->second = newPrompt;
         }
     }
 

From dadc69261abdca841059dde5182e7e69230a1ec4 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 10:53:28 +0800
Subject: [PATCH 13/32] format code

---
 src/searchers/greedy_search.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index 5bd5ec0c..ca21a9d7 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -14,9 +14,9 @@
 // ============================================================================
 #include "greedy_search.h"
 #include "messenger.h"
+#include "prompt.h"
 #include "search_utils.h"
 #include "thread_util.h"
-#include "prompt.h"
 
 using namespace xft;
 
@@ -46,11 +46,11 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
             ThreadPool::getInstance().addTask([predictor_world_rank, this] {
                 while (true) {
                     int32_t promptID;
-                    MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
-                            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
+                            MPI_STATUS_IGNORE);
                     TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(promptID));
-                    MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank, predictor_world_rank,
-                            MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank,
+                            predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                     if (PromptPool<float>::getInstance().has(promptID)) {
                         auto prompt = PromptPool<float>::getInstance().get(promptID);
                         TaskWaitingQueue<float>::getInstance().push(prompt);

From 0d0c74b162867d0b8f0af92f39a6498fd883b643 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 13:19:57 +0800
Subject: [PATCH 14/32] modify promptID to sampleID.

---
 src/common/transformer_ctx.h    |   2 +-
 src/models/common_decoder.h     |  31 +++----
 src/models/models.cpp           |   5 +-
 src/searchers/greedy_search.cpp |  16 ++--
 src/utils/prompt.h              | 151 ++++++++++++++++----------------
 5 files changed, 103 insertions(+), 102 deletions(-)

diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h
index cac52aa0..67611e2b 100644
--- a/src/common/transformer_ctx.h
+++ b/src/common/transformer_ctx.h
@@ -64,7 +64,7 @@ struct DecoderContext {
     int inputSeqLen;
     // For custom usage
     int reserved1;
-    int promptID;
+    int sampleID;
 
     // Model structure configuration
     int vocabSize;
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 44252ad6..87ccdfd6 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -344,21 +344,21 @@ class CommonDecoder : public AbstractDecoder {
             int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank;
             int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * hiddenSize;
-            TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->promptID));
+            TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->sampleID));
             printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank);
             fflush(stdout);
-            int32_t promptID;
-            MPI_Recv(&promptID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            int32_t sampleID;
+            MPI_Recv(&sampleID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
             MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
-            if (!PromptPool<AttnInT>::getInstance().has(promptID)) {
-                PromptMeta<AttnInT> *prompt = new PromptMeta<AttnInT>(promptID, 0, batchSize, seqLen, hiddenSize);
+            if (!SamplePool<AttnInT>::getInstance().has(sampleID)) {
+                SampleMeta<AttnInT> *prompt = new SampleMeta<AttnInT>(sampleID, seqLen, hiddenSize);
                 prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
+                SamplePool<AttnInT>::getInstance().insert(prompt->getSampleID(), prompt);
             }
-            TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(promptID));
-            printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, promptID);
+            TaskWaitingQueue<AttnInT>::getInstance().push(SamplePool<AttnInT>::getInstance().get(sampleID));
+            printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, sampleID);
             fflush(stdout);
         }
 
@@ -366,18 +366,18 @@ class CommonDecoder : public AbstractDecoder {
             if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
                 auto prompt = InputQueue<AttnInT>::getInstance().pop();
                 prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                PromptPool<AttnInT>::getInstance().insert(prompt->promptID, prompt);
-                TaskWaitingQueue<AttnInT>::getInstance().push(PromptPool<AttnInT>::getInstance().get(prompt->promptID));
+                SamplePool<AttnInT>::getInstance().insert(prompt->getSampleID(), prompt);
+                TaskWaitingQueue<AttnInT>::getInstance().push(SamplePool<AttnInT>::getInstance().get(prompt->getSampleID()));
             }
         }
 
         while(TaskWaitingQueue<float>::getInstance().empty());
 
-        PromptMeta<AttnInT> *runningTask;
+        SampleMeta<AttnInT> *runningTask;
         if (!TaskWaitingQueue<AttnInT>::getInstance().empty()) {
             runningTask = TaskWaitingQueue<AttnInT>::getInstance().pop();
-            ctx->promptID = runningTask->promptID;
-            TimeLine t("Decoder.forward." + std::to_string(ctx->promptID));
+            ctx->sampleID = runningTask->getSampleID();
+            TimeLine t("Decoder.forward." + std::to_string(ctx->sampleID));
             printf("%d: Decoder.forward\n", ctx->ppRank);
             fflush(stdout);
 #endif
@@ -442,11 +442,12 @@ class CommonDecoder : public AbstractDecoder {
             TimeLine t("Decoder.MPI_Send");
             int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * hiddenSize;
-            MPI_Send(&runningTask->promptID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
+            int32_t sampleID = runningTask->getSampleID();
+            MPI_Send(&sampleID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank);
-            printf("%d: Decoder.MPI_Send %d\n", ctx->ppRank, runningTask->promptID);
+            printf("%d: Decoder.MPI_Send %d\n", ctx->ppRank, sampleID);
             fflush(stdout);
             return std::tuple<float *, int, int>(nullptr, 0, 0);
         }
diff --git a/src/models/models.cpp b/src/models/models.cpp
index 0d43c765..da76348e 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -151,9 +151,8 @@ std::vector<int32_t> Model::generate() {
         }
         // TODO: Create it when request input
         if (this->isMaster()) {
-            int promptID = InputQueue<float>::getInstance().createPromptID();
-            int tokenID = InputQueue<float>::getInstance().createTokenID();
-            InputQueue<float>::getInstance().push(new PromptMeta<float>(promptID, tokenID, batchSize, seqLen, inputIds));
+            int sampleID = InputQueue<float>::getInstance().createSampleID();
+            InputQueue<float>::getInstance().push(new SampleMeta<float>(sampleID, seqLen, inputIds));
         }
         return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
     } else {
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index ca21a9d7..3d718bd0 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -45,17 +45,17 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
             ThreadPool::getInstance().addTask([predictor_world_rank, this] {
                 while (true) {
-                    int32_t promptID;
-                    MPI_Recv(&promptID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
+                    int32_t sampleID;
+                    MPI_Recv(&sampleID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
                             MPI_STATUS_IGNORE);
-                    TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(promptID));
+                    TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(sampleID));
                     MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank,
                             predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                    if (PromptPool<float>::getInstance().has(promptID)) {
-                        auto prompt = PromptPool<float>::getInstance().get(promptID);
+                    if (SamplePool<float>::getInstance().has(sampleID)) {
+                        auto prompt = SamplePool<float>::getInstance().get(sampleID);
                         TaskWaitingQueue<float>::getInstance().push(prompt);
                     } else {
-                        printf("Error: should have promptID\n");
+                        printf("Error: should have sampleID\n");
                         fflush(stdout);
                     }
                 }
@@ -64,10 +64,10 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
     } else { // The last predictor pipeline parallel stage
         this->nextTokens = this->search(result);
         if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) {
-            TimeLine t("GreedySearch.MPI_Send.prompt" + std::to_string(ctx->promptID));
+            TimeLine t("GreedySearch.MPI_Send.prompt" + std::to_string(ctx->sampleID));
             int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
-            MPI_Send(&ctx->promptID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
+            MPI_Send(&ctx->sampleID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
             MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank,
                     MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
diff --git a/src/utils/prompt.h b/src/utils/prompt.h
index 753e4bd1..2a25c9d3 100644
--- a/src/utils/prompt.h
+++ b/src/utils/prompt.h
@@ -19,72 +19,82 @@
 #include <unordered_map>
 
 /*
-                  PromptPool
-                 ┌──────┬──────┬──────┐
-                 │      │      │  ◄───┼──┬─ PromptMeta
-                 ├──────┼──────┼──────┤  │
-                 │      │      │  ◄───┼──┘
-                 └▲─┬─▲─┴──────┴──────┘
-                  │ │ └───────────────────────────────────┐
-   ┌──┬──┬──┬──┐  │ │      ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐   │
-   │  │  │  │  ├──┘ └─────►│  │  │  │  │  │  │  │  │  ├─┐ │
-   └──┴──┴──┴──┘           └──┴──┴──┴──┴──┴──┴──┴──┴──┘ │ │
-    InputQueue              TaskWaitingQueue0           │ │
-                        ┌───────────────────────────────┘ │
-                        │  ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐   │
-                        └─►│  │  │  │  │  │  │  │  │  ├───┘
-                           └──┴──┴──┴──┴──┴──┴──┴──┴──┘
-                            TaskWaitingQueue1
+                           SamplePool
+                          ┌──────┬──────┬──────┐
+                          │      │      │  ◄───┼──┬─ SampleMeta
+                          ├──────┼──────┼──────┤  │
+    BatchInputs           │      │      │  ◄───┼──┘
+      │                   └▲─┬─▲─┴──────┴──────┘
+      │                    │ │ └───────────────────────────────────┐
+      ▼     ┌──┬──┬──┬──┐  │ │      ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐   │
+    Input ─►│  │  │  │  ├──┘ └─────►│  │  │  │  │  │  │  │  │  ├─┐ │
+            └──┴──┴──┴──┘           └──┴──┴──┴──┴──┴──┴──┴──┴──┘ │ │
+            InputQueue              TaskWaitingQueue0            │ │
+                                 ┌───────────────────────────────┘ │
+                                 │  ┌──┬──┬──┬──┬──┬──┬──┬──┬──┐   │
+                                 └─►│  │  │  │  │  │  │  │  │  ├───┘
+                                    └──┴──┴──┴──┴──┴──┴──┴──┴──┘
+                                    TaskWaitingQueue1
 */
 
 namespace xft {
 
-template <typename AttnInT>
-class PromptMeta {
+// The Sample is one of batch inputs
+template <typename T>
+class SampleMeta {
 public:
-    PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen,
-            std::vector<int32_t> _inputs) {
-        promptID = _promptID;
-        tokenID = _tokenID;
-        batchSize = _batchSize;
-        inputSeqLen = _inputSeqLen;
-        inputs = _inputs;
-        hiddenStatesReceived = false;
+    SampleMeta(int32_t _sampleID, int32_t _inputSeqLen, std::vector<int32_t> _inputTokens)
+        : sampleID(_sampleID), inputSeqLen(_inputSeqLen), bePrefill(true) {
+        inputTokens.resize(_inputSeqLen);
+        inputTokens.assign(_inputTokens.begin(), _inputTokens.end());
+        pastTokens.resize(_inputSeqLen);
     }
 
-    PromptMeta(int32_t _promptID, int32_t _tokenID, int32_t _batchSize, int32_t _inputSeqLen, int32_t _hiddenSize) {
-        promptID = _promptID;
-        tokenID = _tokenID;
-        batchSize = _batchSize;
-        inputSeqLen = _inputSeqLen;
-        hiddenSize = _hiddenSize;
-        hiddenStatesReceived = false;
-        hiddenStates.Resize(batchSize * inputSeqLen, hiddenSize, hiddenSize);
+    SampleMeta(int32_t _sampleID, int32_t _inputSeqLen, int32_t _hiddenSize)
+        : sampleID(_sampleID), inputSeqLen(_inputSeqLen), hiddenSize(_hiddenSize), bePrefill(true) {
+        inputTokens.resize(_inputSeqLen);
+        pastTokens.resize(_inputSeqLen);
     }
 
     void ResetKVCache(int32_t _hiddenSize, int32_t _pastSeqLen, int32_t _layerIdx, void *_hiddenStates, void *_kvm) {
         hiddenSize = _hiddenSize;
         pastSeqLen = _pastSeqLen;
         layerIdx = _layerIdx;
-        hiddenStates.Resize(batchSize * inputSeqLen, hiddenSize, hiddenSize);
-        memcpy(hiddenStates.Data(), _hiddenStates, sizeof(AttnInT) * batchSize * inputSeqLen * hiddenSize);
+        hiddenStates.Resize(inputSeqLen, hiddenSize, hiddenSize);
+        memcpy(hiddenStates.Data(), _hiddenStates, sizeof(T) * inputSeqLen * hiddenSize);
         kvm = _kvm;
     }
 
-    int32_t promptID;
-    int32_t tokenID;
-    bool hiddenStatesReceived;
+    int32_t getSampleID() const { return sampleID; }
+
+    // Get the input tokens in sample
+    int32_t *getInputTokens() const { return inputTokens.data(); }
+
+    // For generated tokens
+    void addGeneratedToken(int32_t token) { pastTokens.push_back(token); }
+
+    int32_t getLatestToken() const { return pastTokens.back(); }
+
+    int32_t *getTotalTokens() const { return pastTokens.data(); }
+
+    bool isPrefill() const { return bePrefill; }
+
+    void setPrefill(bool _bePrefill) { bePrefill = _bePrefill; }
 
 private:
-    int32_t batchSize;
+    int32_t sampleID;
     int32_t inputSeqLen;
     int32_t hiddenSize;
     int32_t pastSeqLen;
-    std::vector<int32_t> inputs;
-    std::vector<int32_t> outputs;
+    std::vector<int32_t> inputTokens;
+    std::vector<int32_t> pastTokens; // generated tokens
+
+    // Indicates whether the sample is in the prefill phase
+    bool bePrefill;
+
     int32_t layerIdx;
-    hpj::Matrix<AttnInT> hiddenStates;
-    void *kvm; //KVCacheManager<KVCacheT>
+    hpj::Matrix<T> hiddenStates;
+    void *kvm; // KVCacheManager<KVCacheT>
 };
 
 template <typename T>
@@ -95,44 +105,35 @@ class InputQueue {
         return instance;
     }
 
-    int32_t createPromptID() {
-        int32_t id = promptID++;
-        if (id > 1000) {
-            promptID = 0;
-            id = promptID++;
-        }
-        return id;
-    }
-
-    int32_t createTokenID() {
-        int32_t id = tokenID++;
-        if (id > 1000) {
-            tokenID = 0;
-            id = tokenID++;
+    int32_t createSampleID() {
+        int32_t id = sampleID++;
+        if (id >= 10 * 1024) {
+            sampleID = 0;
+            id = sampleID++;
         }
         return id;
     }
 
     bool empty() { return queue.empty(); }
 
-    PromptMeta<T> *pop() {
+    SampleMeta<T> *pop() {
         auto buffer = queue.front();
         queue.pop();
         return buffer;
     }
 
-    void push(PromptMeta<T> *buffer) { queue.push(buffer); }
+    void push(SampleMeta<T> *buffer) { queue.push(buffer); }
 
 private:
     InputQueue() {}
 
-    static int32_t promptID;
+    static int32_t sampleID;
     static int32_t tokenID;
-    std::queue<PromptMeta<T> *> queue;
+    std::queue<SampleMeta<T> *> queue;
 };
 
 template <typename T>
-int32_t InputQueue<T>::promptID = 0;
+int32_t InputQueue<T>::sampleID = 0;
 
 template <typename T>
 int32_t InputQueue<T>::tokenID = 0;
@@ -155,33 +156,33 @@ class TaskWaitingQueue {
         return full;
     }
 
-    PromptMeta<T> *pop() {
+    SampleMeta<T> *pop() {
         auto buffer = queue.front();
         queue.pop();
         return buffer;
     }
 
-    void push(PromptMeta<T> *buffer) { queue.push(buffer); }
+    void push(SampleMeta<T> *buffer) { queue.push(buffer); }
 
 private:
     TaskWaitingQueue() {}
 
-    std::queue<PromptMeta<T> *> queue;
+    std::queue<SampleMeta<T> *> queue;
 };
 
 template <typename T>
-class PromptPool {
+class SamplePool {
 public:
-    static PromptPool &getInstance() {
-        static PromptPool instance;
+    static SamplePool &getInstance() {
+        static SamplePool instance;
         return instance;
     }
 
-    void insert(int32_t key, PromptMeta<T> *prompt) { hub[key] = prompt; }
+    void insert(int32_t key, SampleMeta<T> *sample) { hub[key] = sample; }
 
     bool has(int32_t key) const { return hub.find(key) != hub.end(); }
 
-    PromptMeta<T> *get(int32_t key) const {
+    SampleMeta<T> *get(int32_t key) const {
         auto it = hub.find(key);
         if (it != hub.end()) {
             return it->second;
@@ -192,18 +193,18 @@ class PromptPool {
 
     void remove(int32_t key) { hub.erase(key); }
 
-    void modify(int32_t oldKey, PromptMeta<T> *newPrompt) {
+    void modify(int32_t oldKey, SampleMeta<T> *newSample) {
         auto it = hub.find(oldKey);
         if (it != hub.end()) {
             delete it->second;
-            it->second = newPrompt;
+            it->second = newSample;
         }
     }
 
 private:
-    PromptPool() {}
+    SamplePool() {}
 
-    std::unordered_map<int32_t, PromptMeta<T> *> hub;
+    std::unordered_map<int32_t, SampleMeta<T> *> hub;
 };
 
 } // namespace xft
\ No newline at end of file

From 291bd28338c3fa6bee06b4b9306f9bd43efc7216 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 13:21:56 +0800
Subject: [PATCH 15/32] rename filename

---
 src/{utils/prompt.h => common/sample_info.h} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/{utils/prompt.h => common/sample_info.h} (100%)

diff --git a/src/utils/prompt.h b/src/common/sample_info.h
similarity index 100%
rename from src/utils/prompt.h
rename to src/common/sample_info.h

From 6ffe206429e58dd882fb228a42a4128cd82586bb Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 13:25:54 +0800
Subject: [PATCH 16/32] format code

---
 src/models/common_decoder.h |  8 --------
 src/models/models.cpp       | 12 +-----------
 2 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 87ccdfd6..20f2b8e5 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -345,8 +345,6 @@ class CommonDecoder : public AbstractDecoder {
             int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * hiddenSize;
             TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->sampleID));
-            printf("%d: Decoder.MPI_Recv.SyncStart\n", ctx->ppRank);
-            fflush(stdout);
             int32_t sampleID;
             MPI_Recv(&sampleID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
             MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
@@ -358,8 +356,6 @@ class CommonDecoder : public AbstractDecoder {
                 SamplePool<AttnInT>::getInstance().insert(prompt->getSampleID(), prompt);
             }
             TaskWaitingQueue<AttnInT>::getInstance().push(SamplePool<AttnInT>::getInstance().get(sampleID));
-            printf("%d: Decoder.MPI_Recv.SyncDone %d\n", ctx->ppRank, sampleID);
-            fflush(stdout);
         }
 
         if (!InputQueue<AttnInT>::getInstance().empty()) {
@@ -378,8 +374,6 @@ class CommonDecoder : public AbstractDecoder {
             runningTask = TaskWaitingQueue<AttnInT>::getInstance().pop();
             ctx->sampleID = runningTask->getSampleID();
             TimeLine t("Decoder.forward." + std::to_string(ctx->sampleID));
-            printf("%d: Decoder.forward\n", ctx->ppRank);
-            fflush(stdout);
 #endif
 
         // Decoder: forward
@@ -447,8 +441,6 @@ class CommonDecoder : public AbstractDecoder {
             MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank);
-            printf("%d: Decoder.MPI_Send %d\n", ctx->ppRank, sampleID);
-            fflush(stdout);
             return std::tuple<float *, int, int>(nullptr, 0, 0);
         }
 #endif
diff --git a/src/models/models.cpp b/src/models/models.cpp
index da76348e..45a1e51b 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -143,17 +143,7 @@ std::vector<int32_t> Model::generate() {
     }
 
     if (isNewInput) {
-        static int i = 0;
-        i++;
-        if (i > 1) {
-            isNewInput = false;
-            i = 0;
-        }
-        // TODO: Create it when request input
-        if (this->isMaster()) {
-            int sampleID = InputQueue<float>::getInstance().createSampleID();
-            InputQueue<float>::getInstance().push(new SampleMeta<float>(sampleID, seqLen, inputIds));
-        }
+        isNewInput = false;
         return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
     } else {
         return searcher->getNextToken();

From 6051c842af9606a6af748afc0b1cd7bb31ce6333 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 13:31:30 +0800
Subject: [PATCH 17/32] format code

---
 src/models/common_decoder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 20f2b8e5..a0a9fc35 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -344,9 +344,9 @@ class CommonDecoder : public AbstractDecoder {
             int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank;
             int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * hiddenSize;
-            TimeLine t("Decoder.MPI_Recv." + std::to_string(ctx->sampleID));
             int32_t sampleID;
             MPI_Recv(&sampleID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            TimeLine t("Decoder.MPI_Recv." + std::to_string(sampleID));
             MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);

From d4bd7dcecb8bfdce0a02e4d8e77c43cc5cf1c5bf Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 13:34:28 +0800
Subject: [PATCH 18/32] format  code

---
 src/models/common_decoder.h     | 2 +-
 src/models/models.cpp           | 2 +-
 src/searchers/greedy_search.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index a0a9fc35..02d41cb5 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -35,7 +35,7 @@
 #include "transformer_ctx.h"
 #include "transpose_util.h"
 #include "weight_util.h"
-#include "prompt.h"
+#include "sample_info.h"
 
 using namespace xft;
 
diff --git a/src/models/models.cpp b/src/models/models.cpp
index 45a1e51b..d7eaa4ec 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -32,7 +32,7 @@
 #include "searcher.h"
 #include "timeline.h"
 #include "yarn_llama.h"
-#include "prompt.h"
+#include "sample_info.h"
 
 namespace xft {
 enum class GenerationMode { GREEDY_SEARCH, BEAM_SEARCH, SAMPLE };
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index 3d718bd0..75ba689e 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -14,7 +14,7 @@
 // ============================================================================
 #include "greedy_search.h"
 #include "messenger.h"
-#include "prompt.h"
+#include "sample_info.h"
 #include "search_utils.h"
 #include "thread_util.h"
 

From f7708afa365b135bc649000967003aebaada08b2 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 14:35:41 +0800
Subject: [PATCH 19/32] update samplePool

---
 src/common/sample_info.h | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/common/sample_info.h b/src/common/sample_info.h
index 2a25c9d3..e0b789db 100644
--- a/src/common/sample_info.h
+++ b/src/common/sample_info.h
@@ -178,7 +178,23 @@ class SamplePool {
         return instance;
     }
 
-    void insert(int32_t key, SampleMeta<T> *sample) { hub[key] = sample; }
+    bool add(int32_t key, SampleMeta<T> *sample) {
+        bool exist = has(key);
+        if (!exist) {
+            hub[key] = sample;
+        }
+
+        return exist;
+    }
+
+    void forceAdd(int32_t key, SampleMeta<T> *sample) {
+        auto it = hub.find(key);
+        if (it != hub.end()) {
+            delete it->second;
+        }
+
+        hub[key] = sample;
+    }
 
     bool has(int32_t key) const { return hub.find(key) != hub.end(); }
 
@@ -191,9 +207,13 @@ class SamplePool {
         }
     }
 
-    void remove(int32_t key) { hub.erase(key); }
+    void remove(int32_t key) {
+        if (has(key)) {
+            hub.erase(key);
+        }
+    }
 
-    void modify(int32_t oldKey, SampleMeta<T> *newSample) {
+    void replace(int32_t oldKey, SampleMeta<T> *newSample) {
         auto it = hub.find(oldKey);
         if (it != hub.end()) {
             delete it->second;

From cf4ff411c90d488f892e3b584b28ec87a937b5d5 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 14:38:37 +0800
Subject: [PATCH 20/32] update samplePool

---
 src/common/sample_info.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/common/sample_info.h b/src/common/sample_info.h
index e0b789db..6d383a74 100644
--- a/src/common/sample_info.h
+++ b/src/common/sample_info.h
@@ -213,12 +213,16 @@ class SamplePool {
         }
     }
 
-    void replace(int32_t oldKey, SampleMeta<T> *newSample) {
+    bool replace(int32_t oldKey, SampleMeta<T> *newSample) {
+        bool ret = false;
         auto it = hub.find(oldKey);
         if (it != hub.end()) {
             delete it->second;
             it->second = newSample;
+            ret = true;
         }
+
+        return ret;
     }
 
 private:

From e96c1a83cc5190e1eb7db3138c184e1d2e601c5d Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 15:01:35 +0800
Subject: [PATCH 21/32] update code

---
 src/common/sample_info.h    | 14 ++++----------
 src/models/common_decoder.h | 14 +++++++-------
 src/utils/environment.h     | 21 +++++++++++++++++++++
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/src/common/sample_info.h b/src/common/sample_info.h
index 6d383a74..e9322900 100644
--- a/src/common/sample_info.h
+++ b/src/common/sample_info.h
@@ -152,7 +152,7 @@ class TaskWaitingQueue {
 
     bool isFull() {
         bool full = false;
-        if (this->size() >= 4) full = true;
+        if (this->size() >= Env::getInstance().getMaxRequestNum()) { full = true; }
         return full;
     }
 
@@ -180,18 +180,14 @@ class SamplePool {
 
     bool add(int32_t key, SampleMeta<T> *sample) {
         bool exist = has(key);
-        if (!exist) {
-            hub[key] = sample;
-        }
+        if (!exist) { hub[key] = sample; }
 
         return exist;
     }
 
     void forceAdd(int32_t key, SampleMeta<T> *sample) {
         auto it = hub.find(key);
-        if (it != hub.end()) {
-            delete it->second;
-        }
+        if (it != hub.end()) { delete it->second; }
 
         hub[key] = sample;
     }
@@ -208,9 +204,7 @@ class SamplePool {
     }
 
     void remove(int32_t key) {
-        if (has(key)) {
-            hub.erase(key);
-        }
+        if (has(key)) { hub.erase(key); }
     }
 
     bool replace(int32_t oldKey, SampleMeta<T> *newSample) {
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 02d41cb5..4cdb1cef 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -351,19 +351,19 @@ class CommonDecoder : public AbstractDecoder {
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
             if (!SamplePool<AttnInT>::getInstance().has(sampleID)) {
-                SampleMeta<AttnInT> *prompt = new SampleMeta<AttnInT>(sampleID, seqLen, hiddenSize);
-                prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                SamplePool<AttnInT>::getInstance().insert(prompt->getSampleID(), prompt);
+                SampleMeta<AttnInT> *sample = new SampleMeta<AttnInT>(sampleID, seqLen, hiddenSize);
+                sample->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
+                SamplePool<AttnInT>::getInstance().forceAdd(sample->getSampleID(), sample);
             }
             TaskWaitingQueue<AttnInT>::getInstance().push(SamplePool<AttnInT>::getInstance().get(sampleID));
         }
 
         if (!InputQueue<AttnInT>::getInstance().empty()) {
             if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
-                auto prompt = InputQueue<AttnInT>::getInstance().pop();
-                prompt->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                SamplePool<AttnInT>::getInstance().insert(prompt->getSampleID(), prompt);
-                TaskWaitingQueue<AttnInT>::getInstance().push(SamplePool<AttnInT>::getInstance().get(prompt->getSampleID()));
+                auto sample = InputQueue<AttnInT>::getInstance().pop();
+                sample->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
+                SamplePool<AttnInT>::getInstance().forceAdd(sample->getSampleID(), sample);
+                TaskWaitingQueue<AttnInT>::getInstance().push(SamplePool<AttnInT>::getInstance().get(sample->getSampleID()));
             }
         }
 
diff --git a/src/utils/environment.h b/src/utils/environment.h
index e6d94338..2630ff74 100644
--- a/src/utils/environment.h
+++ b/src/utils/environment.h
@@ -41,6 +41,9 @@ class Env {
     // get Engine Kind and Index
     int getPipelineStage() { return pipelineStageValue; }
 
+    // get Engine Kind and Index
+    int getMaxRequestNum() { return maxRequestNumValue; }
+
     // get AMX Threshold M
     int getAMXThresholdM() { return AMXThresholdMValue; }
 
@@ -73,6 +76,9 @@ class Env {
         // init Pipeline Parallel
         initPipelineStage();
 
+        // init Max request number
+        initMaxRequestNum();
+
         // init Engine Kind and Index
         initEngineKindIndex();
 
@@ -173,6 +179,21 @@ class Env {
         }
     }
 
+    // Max request number
+    int maxRequestNumValue = 1;
+    void initMaxRequestNum() {
+        char *xft_max_request_num_value = getenv("XFT_MAX_REQUEST_NUM");
+        if (xft_max_request_num_value != NULL) {
+            int value = atoi(xft_max_request_num_value);
+            if (value >= 1)
+                maxRequestNumValue = value;
+            else
+                printf("[ERROR] XFT_MAX_REQUEST_NUM value need to be greater than 0.\n");
+        } else {
+            maxRequestNumValue = 1;
+        }
+    }
+
     // AMX Threshold M
     int AMXThresholdMValue = 1;
     void initAMXThresholdM() {

From dc1a9e55c249d2b69be92fb4da4448782c4043d7 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 19:40:27 +0800
Subject: [PATCH 22/32] sampleID to seqenceID

---
 src/common/{sample_info.h => sequence.h} | 147 ++++++++++++-----------
 src/common/transformer_ctx.h             |   2 +-
 src/models/common_decoder.h              |  50 ++++----
 src/models/models.cpp                    |   2 +-
 src/searchers/greedy_search.cpp          |  20 +--
 5 files changed, 115 insertions(+), 106 deletions(-)
 rename src/common/{sample_info.h => sequence.h} (59%)

diff --git a/src/common/sample_info.h b/src/common/sequence.h
similarity index 59%
rename from src/common/sample_info.h
rename to src/common/sequence.h
index e9322900..cef2f895 100644
--- a/src/common/sample_info.h
+++ b/src/common/sequence.h
@@ -19,9 +19,9 @@
 #include <unordered_map>
 
 /*
-                           SamplePool
+                           SequencePool
                           ┌──────┬──────┬──────┐
-                          │      │      │  ◄───┼──┬─ SampleMeta
+                          │      │      │  ◄───┼──┬─ SequenceMeta
                           ├──────┼──────┼──────┤  │
     BatchInputs           │      │      │  ◄───┼──┘
       │                   └▲─┬─▲─┴──────┴──────┘
@@ -39,65 +39,77 @@
 
 namespace xft {
 
-// The Sample is one of batch inputs
-template <typename T>
-class SampleMeta {
+// The Sequence is one sequence of batch inputs and includes the generated tokens.
+class SequenceMeta {
 public:
-    SampleMeta(int32_t _sampleID, int32_t _inputSeqLen, std::vector<int32_t> _inputTokens)
-        : sampleID(_sampleID), inputSeqLen(_inputSeqLen), bePrefill(true) {
+    SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector<int32_t> _inputTokens)
+        : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), step(0) {
         inputTokens.resize(_inputSeqLen);
         inputTokens.assign(_inputTokens.begin(), _inputTokens.end());
-        pastTokens.resize(_inputSeqLen);
+        nextTokens.resize(_inputSeqLen);
     }
 
-    SampleMeta(int32_t _sampleID, int32_t _inputSeqLen, int32_t _hiddenSize)
-        : sampleID(_sampleID), inputSeqLen(_inputSeqLen), hiddenSize(_hiddenSize), bePrefill(true) {
-        inputTokens.resize(_inputSeqLen);
-        pastTokens.resize(_inputSeqLen);
+    SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen)
+        : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), inputTokens(_inputSeqLen, 0), step(0) {
+        nextTokens.resize(_inputSeqLen);
     }
 
-    void ResetKVCache(int32_t _hiddenSize, int32_t _pastSeqLen, int32_t _layerIdx, void *_hiddenStates, void *_kvm) {
-        hiddenSize = _hiddenSize;
-        pastSeqLen = _pastSeqLen;
-        layerIdx = _layerIdx;
-        hiddenStates.Resize(inputSeqLen, hiddenSize, hiddenSize);
-        memcpy(hiddenStates.Data(), _hiddenStates, sizeof(T) * inputSeqLen * hiddenSize);
-        kvm = _kvm;
-    }
+    ~SequenceMeta() {}
+
+    int32_t getSequenceID() const { return sequenceID; }
+
+    // Get the input tokens in sequence
+    int32_t getInputSeqLen() const { return inputSeqLen; }
 
-    int32_t getSampleID() const { return sampleID; }
+    const int32_t *getInputTokens() const { return inputTokens.data(); }
 
-    // Get the input tokens in sample
-    int32_t *getInputTokens() const { return inputTokens.data(); }
+    int32_t getPastSeqLen() const { return pastSeqLen; }
 
-    // For generated tokens
-    void addGeneratedToken(int32_t token) { pastTokens.push_back(token); }
+    void setPastSeqLen(int32_t _pastSeqLen) { pastSeqLen = _pastSeqLen; }
 
-    int32_t getLatestToken() const { return pastTokens.back(); }
+    // For next tokens
+    void addNextToken(int32_t token) { nextTokens.push_back(token); }
 
-    int32_t *getTotalTokens() const { return pastTokens.data(); }
+    int32_t getLatestToken() const { return nextTokens.back(); }
 
-    bool isPrefill() const { return bePrefill; }
+    const int32_t *getTotalTokens() const { return nextTokens.data(); }
 
-    void setPrefill(bool _bePrefill) { bePrefill = _bePrefill; }
+    int32_t getStep() const { return step; }
+
+    void setStep(int32_t _step) { step = _step; }
 
 private:
-    int32_t sampleID;
+    int32_t sequenceID;
     int32_t inputSeqLen;
-    int32_t hiddenSize;
     int32_t pastSeqLen;
-    std::vector<int32_t> inputTokens;
-    std::vector<int32_t> pastTokens; // generated tokens
+    std::vector<int32_t> inputTokens; // input tokens + next tokens
+    std::vector<int32_t> nextTokens; // next tokens
 
-    // Indicates whether the sample is in the prefill phase
-    bool bePrefill;
+    // Indicates whether the sequence is in the prefill phase
+    int32_t step;
+
+#ifdef PIPELINE_PARALLEL
+public:
+    template <typename T>
+    void allocBuffer(int32_t hiddenSize, void *_hiddenStates) {
+        hiddenStates = xft::alloc(sizeof(T) * getInputSeqLen() * hiddenSize);
+        memcpy(hiddenStates, _hiddenStates, sizeof(T) * getInputSeqLen() * hiddenSize);
+    }
 
-    int32_t layerIdx;
-    hpj::Matrix<T> hiddenStates;
-    void *kvm; // KVCacheManager<KVCacheT>
+private:
+    int32_t hiddenSize;
+    void* hiddenStates;
+#endif
 };
 
-template <typename T>
+// For beam searcher
+// class SequenceGroupMeta {
+// public:
+//     SequenceGroupMeta(int32_t num_beams) { sequence = new SequenceMeta[num_beams]; }
+
+//     SequenceMeta *sequence;
+// };
+
 class InputQueue {
 public:
     static InputQueue &getInstance() {
@@ -105,40 +117,33 @@ class InputQueue {
         return instance;
     }
 
-    int32_t createSampleID() {
-        int32_t id = sampleID++;
+    int32_t createSequenceID() {
+        int32_t id = sequenceID++;
         if (id >= 10 * 1024) {
-            sampleID = 0;
-            id = sampleID++;
+            sequenceID = 0;
+            id = sequenceID++;
         }
         return id;
     }
 
     bool empty() { return queue.empty(); }
 
-    SampleMeta<T> *pop() {
+    SequenceMeta *pop() {
         auto buffer = queue.front();
         queue.pop();
         return buffer;
     }
 
-    void push(SampleMeta<T> *buffer) { queue.push(buffer); }
+    void push(SequenceMeta *buffer) { queue.push(buffer); }
 
 private:
     InputQueue() {}
 
-    static int32_t sampleID;
-    static int32_t tokenID;
-    std::queue<SampleMeta<T> *> queue;
+    int32_t sequenceID = 0;
+    std::queue<SequenceMeta *> queue;
 };
 
-template <typename T>
-int32_t InputQueue<T>::sampleID = 0;
-
-template <typename T>
-int32_t InputQueue<T>::tokenID = 0;
 
-template <typename T>
 class TaskWaitingQueue {
 public:
     static TaskWaitingQueue &getInstance() {
@@ -156,45 +161,45 @@ class TaskWaitingQueue {
         return full;
     }
 
-    SampleMeta<T> *pop() {
+    SequenceMeta *pop() {
         auto buffer = queue.front();
         queue.pop();
         return buffer;
     }
 
-    void push(SampleMeta<T> *buffer) { queue.push(buffer); }
+    void push(SequenceMeta *buffer) { queue.push(buffer); }
 
 private:
     TaskWaitingQueue() {}
 
-    std::queue<SampleMeta<T> *> queue;
+    std::queue<SequenceMeta *> queue;
 };
 
-template <typename T>
-class SamplePool {
+
+class SequencePool {
 public:
-    static SamplePool &getInstance() {
-        static SamplePool instance;
+    static SequencePool &getInstance() {
+        static SequencePool instance;
         return instance;
     }
 
-    bool add(int32_t key, SampleMeta<T> *sample) {
+    bool add(int32_t key, SequenceMeta *sequence) {
         bool exist = has(key);
-        if (!exist) { hub[key] = sample; }
+        if (!exist) { hub[key] = sequence; }
 
         return exist;
     }
 
-    void forceAdd(int32_t key, SampleMeta<T> *sample) {
+    void forceAdd(int32_t key, SequenceMeta *sequence) {
         auto it = hub.find(key);
         if (it != hub.end()) { delete it->second; }
 
-        hub[key] = sample;
+        hub[key] = sequence;
     }
 
     bool has(int32_t key) const { return hub.find(key) != hub.end(); }
 
-    SampleMeta<T> *get(int32_t key) const {
+    SequenceMeta *get(int32_t key) const {
         auto it = hub.find(key);
         if (it != hub.end()) {
             return it->second;
@@ -207,12 +212,12 @@ class SamplePool {
         if (has(key)) { hub.erase(key); }
     }
 
-    bool replace(int32_t oldKey, SampleMeta<T> *newSample) {
+    bool replace(int32_t oldKey, SequenceMeta *newSequence) {
         bool ret = false;
         auto it = hub.find(oldKey);
         if (it != hub.end()) {
             delete it->second;
-            it->second = newSample;
+            it->second = newSequence;
             ret = true;
         }
 
@@ -220,9 +225,11 @@ class SamplePool {
     }
 
 private:
-    SamplePool() {}
+    SequencePool() {}
+
+    std::unordered_map<int32_t, SequenceMeta *> hub;
 
-    std::unordered_map<int32_t, SampleMeta<T> *> hub;
+    //mgr
 };
 
 } // namespace xft
\ No newline at end of file
diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h
index 67611e2b..5aa39501 100644
--- a/src/common/transformer_ctx.h
+++ b/src/common/transformer_ctx.h
@@ -64,7 +64,7 @@ struct DecoderContext {
     int inputSeqLen;
     // For custom usage
     int reserved1;
-    int sampleID;
+    int sequenceID;
 
     // Model structure configuration
     int vocabSize;
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 4cdb1cef..2eaf9317 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -35,7 +35,7 @@
 #include "transformer_ctx.h"
 #include "transpose_util.h"
 #include "weight_util.h"
-#include "sample_info.h"
+#include "sequence.h"
 
 using namespace xft;
 
@@ -279,7 +279,7 @@ class CommonDecoder : public AbstractDecoder {
 
         int userSideBS = dims[0];
         int beamSize = dims[1];
-        int batchSize = (step == 0 ? userSideBS : userSideBS * beamSize); // as samples are duplicated at step 0
+        int batchSize = (step == 0 ? userSideBS : userSideBS * beamSize); // as sequence are duplicated at step 0
         int seqLen = dims[2];
         int pastSeqLen = step == 0 ? 0 : this->accSeqLen;
         int inputSeqLen = seqLen;
@@ -344,36 +344,38 @@ class CommonDecoder : public AbstractDecoder {
             int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank;
             int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * hiddenSize;
-            int32_t sampleID;
-            MPI_Recv(&sampleID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            TimeLine t("Decoder.MPI_Recv." + std::to_string(sampleID));
+            int32_t sequenceID;
+            MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            TimeLine t("Decoder.MPI_Recv." + std::to_string(sequenceID));
             MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
-            if (!SamplePool<AttnInT>::getInstance().has(sampleID)) {
-                SampleMeta<AttnInT> *sample = new SampleMeta<AttnInT>(sampleID, seqLen, hiddenSize);
-                sample->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                SamplePool<AttnInT>::getInstance().forceAdd(sample->getSampleID(), sample);
+            if (!SequencePool::getInstance().has(sequenceID)) {
+                SequenceMeta *sequence = new SequenceMeta(sequenceID, seqLen);
+                sequence->setPastSeqLen(pastSeqLen);
+                sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
+                SequencePool::getInstance().forceAdd(sequence->getSequenceID(), sequence);
             }
-            TaskWaitingQueue<AttnInT>::getInstance().push(SamplePool<AttnInT>::getInstance().get(sampleID));
+            TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID));
         }
 
-        if (!InputQueue<AttnInT>::getInstance().empty()) {
-            if (!TaskWaitingQueue<AttnInT>::getInstance().isFull()) {
-                auto sample = InputQueue<AttnInT>::getInstance().pop();
-                sample->ResetKVCache(hiddenSize, pastSeqLen, 0, embBuf, this->kvCacheMgr.get());
-                SamplePool<AttnInT>::getInstance().forceAdd(sample->getSampleID(), sample);
-                TaskWaitingQueue<AttnInT>::getInstance().push(SamplePool<AttnInT>::getInstance().get(sample->getSampleID()));
+        if (!InputQueue::getInstance().empty()) {
+            if (!TaskWaitingQueue::getInstance().isFull()) {
+                auto sequence = InputQueue::getInstance().pop();
+                sequence->setPastSeqLen(pastSeqLen);
+                sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
+                SequencePool::getInstance().forceAdd(sequence->getSequenceID(), sequence);
+                TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequence->getSequenceID()));
             }
         }
 
-        while(TaskWaitingQueue<float>::getInstance().empty());
+        while(TaskWaitingQueue::getInstance().empty());
 
-        SampleMeta<AttnInT> *runningTask;
-        if (!TaskWaitingQueue<AttnInT>::getInstance().empty()) {
-            runningTask = TaskWaitingQueue<AttnInT>::getInstance().pop();
-            ctx->sampleID = runningTask->getSampleID();
-            TimeLine t("Decoder.forward." + std::to_string(ctx->sampleID));
+        SequenceMeta *runningTask;
+        if (!TaskWaitingQueue::getInstance().empty()) {
+            runningTask = TaskWaitingQueue::getInstance().pop();
+            ctx->sequenceID = runningTask->getSequenceID();
+            TimeLine t("Decoder.step." + std::to_string(ctx->sequenceID));
 #endif
 
         // Decoder: forward
@@ -436,8 +438,8 @@ class CommonDecoder : public AbstractDecoder {
             TimeLine t("Decoder.MPI_Send");
             int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * hiddenSize;
-            int32_t sampleID = runningTask->getSampleID();
-            MPI_Send(&sampleID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
+            int32_t sequenceID = runningTask->getSequenceID();
+            MPI_Send(&sequenceID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank);
diff --git a/src/models/models.cpp b/src/models/models.cpp
index d7eaa4ec..6e45b0de 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -32,7 +32,7 @@
 #include "searcher.h"
 #include "timeline.h"
 #include "yarn_llama.h"
-#include "sample_info.h"
+#include "sequence.h"
 
 namespace xft {
 enum class GenerationMode { GREEDY_SEARCH, BEAM_SEARCH, SAMPLE };
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index 75ba689e..620a89c5 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -14,7 +14,7 @@
 // ============================================================================
 #include "greedy_search.h"
 #include "messenger.h"
-#include "sample_info.h"
+#include "sequence.h"
 #include "search_utils.h"
 #include "thread_util.h"
 
@@ -45,17 +45,17 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
             ThreadPool::getInstance().addTask([predictor_world_rank, this] {
                 while (true) {
-                    int32_t sampleID;
-                    MPI_Recv(&sampleID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
+                    int32_t sequenceID;
+                    MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
                             MPI_STATUS_IGNORE);
-                    TimeLine t("GreedySearch.MPI_Recv.prompt" + std::to_string(sampleID));
+                    TimeLine t("GreedySearch.MPI_Recv.sequence" + std::to_string(sequenceID));
                     MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank,
                             predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                    if (SamplePool<float>::getInstance().has(sampleID)) {
-                        auto prompt = SamplePool<float>::getInstance().get(sampleID);
-                        TaskWaitingQueue<float>::getInstance().push(prompt);
+                    if (SequencePool::getInstance().has(sequenceID)) {
+                        auto sequence = SequencePool::getInstance().get(sequenceID);
+                        TaskWaitingQueue::getInstance().push(sequence);
                     } else {
-                        printf("Error: should have sampleID\n");
+                        printf("Error: should have sequenceID\n");
                         fflush(stdout);
                     }
                 }
@@ -64,10 +64,10 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
     } else { // The last predictor pipeline parallel stage
         this->nextTokens = this->search(result);
         if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) {
-            TimeLine t("GreedySearch.MPI_Send.prompt" + std::to_string(ctx->sampleID));
+            TimeLine t("GreedySearch.MPI_Send.sequence" + std::to_string(ctx->sequenceID));
             int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
-            MPI_Send(&ctx->sampleID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
+            MPI_Send(&ctx->sequenceID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
             MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank,
                     MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file

From 171f45176eb44e1d7eff8644599e2806ec400f22 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 19:57:15 +0800
Subject: [PATCH 23/32] update

---
 src/common/sequence.h       | 27 ++++++++++++++++-----------
 src/models/common_decoder.h |  4 ++--
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/common/sequence.h b/src/common/sequence.h
index cef2f895..e2055e7f 100644
--- a/src/common/sequence.h
+++ b/src/common/sequence.h
@@ -183,18 +183,23 @@ class SequencePool {
         return instance;
     }
 
-    bool add(int32_t key, SequenceMeta *sequence) {
-        bool exist = has(key);
-        if (!exist) { hub[key] = sequence; }
-
-        return exist;
-    }
-
-    void forceAdd(int32_t key, SequenceMeta *sequence) {
-        auto it = hub.find(key);
-        if (it != hub.end()) { delete it->second; }
+    bool add(int32_t key, SequenceMeta *sequence, bool force = false) {
+        bool isSuccess = false;
+        if (force) {
+            auto it = hub.find(key);
+            if (it != hub.end()) { delete it->second; }
+
+            hub[key] = sequence;
+            isSuccess = true;
+        } else {
+            bool exist = has(key);
+            if (!exist) {
+                hub[key] = sequence;
+                isSuccess = true;
+            }
+        }
 
-        hub[key] = sequence;
+        return isSuccess;
     }
 
     bool has(int32_t key) const { return hub.find(key) != hub.end(); }
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 2eaf9317..60820936 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -354,7 +354,7 @@ class CommonDecoder : public AbstractDecoder {
                 SequenceMeta *sequence = new SequenceMeta(sequenceID, seqLen);
                 sequence->setPastSeqLen(pastSeqLen);
                 sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
-                SequencePool::getInstance().forceAdd(sequence->getSequenceID(), sequence);
+                SequencePool::getInstance().add(sequence->getSequenceID(), sequence);
             }
             TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID));
         }
@@ -364,7 +364,7 @@ class CommonDecoder : public AbstractDecoder {
                 auto sequence = InputQueue::getInstance().pop();
                 sequence->setPastSeqLen(pastSeqLen);
                 sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
-                SequencePool::getInstance().forceAdd(sequence->getSequenceID(), sequence);
+                SequencePool::getInstance().add(sequence->getSequenceID(), sequence);
                 TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequence->getSequenceID()));
             }
         }

From 1df416ee99d6ac8d7d28faccf524cc4c62982b08 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 20:43:34 +0800
Subject: [PATCH 24/32] udpate

---
 src/common/sequence.h       | 152 +++++++++++++++++++++---------------
 src/models/common_decoder.h |   2 +-
 2 files changed, 90 insertions(+), 64 deletions(-)

diff --git a/src/common/sequence.h b/src/common/sequence.h
index e2055e7f..fb46bdd7 100644
--- a/src/common/sequence.h
+++ b/src/common/sequence.h
@@ -39,7 +39,7 @@
 
 namespace xft {
 
-// The Sequence is one sequence of batch inputs and includes the generated tokens.
+// The SequenceMeta is one sequence of batch inputs and includes the generated tokens.
 class SequenceMeta {
 public:
     SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector<int32_t> _inputTokens)
@@ -110,6 +110,94 @@ class SequenceMeta {
 //     SequenceMeta *sequence;
 // };
 
+
+//    SequencePool
+//    ┌──────┬──────┬──────┐
+//    │      │      │  ◄───┼──┬─ SequenceMeta
+//    ├──────┼──────┼──────┤  │
+//    │      │      │  ◄───┼──┘
+//    └──────┴──────┴──────┘
+class SequencePool {
+public:
+    static SequencePool &getInstance() {
+        static SequencePool instance;
+        return instance;
+    }
+
+    SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen,
+            std::vector<int32_t> inputTokens = std::vector<int32_t>()) {
+        auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen, inputTokens);
+        return sequenceMeta;
+    }
+
+    bool add(int32_t sequenceID, SequenceMeta *sequence, bool force = false) {
+        bool isSuccess = false;
+        if (force) {
+            auto it = hub.find(sequenceID);
+            if (it != hub.end()) { 
+                remove(it->first);
+            }
+
+            hub[sequenceID] = sequence;
+            isSuccess = true;
+        } else {
+            bool exist = has(sequenceID);
+            if (!exist) {
+                hub[sequenceID] = sequence;
+                isSuccess = true;
+            }
+        }
+
+        return isSuccess;
+    }
+
+    bool has(int32_t sequenceID) const { return hub.find(sequenceID) != hub.end(); }
+
+    SequenceMeta *get(int32_t sequenceID) const {
+        auto it = hub.find(sequenceID);
+        if (it != hub.end()) {
+            return it->second;
+        } else {
+            return nullptr;
+        }
+    }
+
+    bool remove(int32_t sequenceID, bool deep = false) {
+        bool isSuccess = false;
+        if (has(sequenceID)) {
+            if (deep == true) {
+                auto it = hub.find(sequenceID);
+                if (it != hub.end()) { 
+                    delete it->second;
+                }
+            }
+            isSuccess = hub.erase(sequenceID);
+        }
+
+        return isSuccess;
+    }
+
+    bool replace(int32_t sequenceID, SequenceMeta *newSequenceMeta) {
+        bool isSuccess = false;
+        auto it = hub.find(sequenceID);
+        if (it != hub.end()) {
+            remove(it->first);
+            hub[sequenceID] = newSequenceMeta;
+            isSuccess = true;
+        }
+
+        return isSuccess;
+    }
+
+private:
+    SequencePool() {}
+
+    std::unordered_map<int32_t, SequenceMeta *> hub;
+
+    //mgr
+};
+
+
 class InputQueue {
 public:
     static InputQueue &getInstance() {
@@ -175,66 +263,4 @@ class TaskWaitingQueue {
     std::queue<SequenceMeta *> queue;
 };
 
-
-class SequencePool {
-public:
-    static SequencePool &getInstance() {
-        static SequencePool instance;
-        return instance;
-    }
-
-    bool add(int32_t key, SequenceMeta *sequence, bool force = false) {
-        bool isSuccess = false;
-        if (force) {
-            auto it = hub.find(key);
-            if (it != hub.end()) { delete it->second; }
-
-            hub[key] = sequence;
-            isSuccess = true;
-        } else {
-            bool exist = has(key);
-            if (!exist) {
-                hub[key] = sequence;
-                isSuccess = true;
-            }
-        }
-
-        return isSuccess;
-    }
-
-    bool has(int32_t key) const { return hub.find(key) != hub.end(); }
-
-    SequenceMeta *get(int32_t key) const {
-        auto it = hub.find(key);
-        if (it != hub.end()) {
-            return it->second;
-        } else {
-            return nullptr;
-        }
-    }
-
-    void remove(int32_t key) {
-        if (has(key)) { hub.erase(key); }
-    }
-
-    bool replace(int32_t oldKey, SequenceMeta *newSequence) {
-        bool ret = false;
-        auto it = hub.find(oldKey);
-        if (it != hub.end()) {
-            delete it->second;
-            it->second = newSequence;
-            ret = true;
-        }
-
-        return ret;
-    }
-
-private:
-    SequencePool() {}
-
-    std::unordered_map<int32_t, SequenceMeta *> hub;
-
-    //mgr
-};
-
 } // namespace xft
\ No newline at end of file
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 60820936..3e06555d 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -351,7 +351,7 @@ class CommonDecoder : public AbstractDecoder {
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
             if (!SequencePool::getInstance().has(sequenceID)) {
-                SequenceMeta *sequence = new SequenceMeta(sequenceID, seqLen);
+                SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen);
                 sequence->setPastSeqLen(pastSeqLen);
                 sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
                 SequencePool::getInstance().add(sequence->getSequenceID(), sequence);

From 3b287dfb8ca40fefd466380505a4b1dce68123d8 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 21:13:17 +0800
Subject: [PATCH 25/32] udpate

---
 src/common/sequence.h           | 45 ++++++++++++++++++++-------------
 src/models/common_decoder.h     |  8 +++---
 src/searchers/greedy_search.cpp |  4 +--
 3 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/src/common/sequence.h b/src/common/sequence.h
index fb46bdd7..204f543f 100644
--- a/src/common/sequence.h
+++ b/src/common/sequence.h
@@ -42,7 +42,7 @@ namespace xft {
 // The SequenceMeta is one sequence of batch inputs and includes the generated tokens.
 class SequenceMeta {
 public:
-    SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector<int32_t> _inputTokens)
+    SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector<int32_t> &_inputTokens)
         : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), step(0) {
         inputTokens.resize(_inputSeqLen);
         inputTokens.assign(_inputTokens.begin(), _inputTokens.end());
@@ -102,13 +102,19 @@ class SequenceMeta {
 #endif
 };
 
+
 // For beam searcher
-// class SequenceGroupMeta {
-// public:
-//     SequenceGroupMeta(int32_t num_beams) { sequence = new SequenceMeta[num_beams]; }
+class SequenceGroupMeta {
+public:
+    SequenceGroupMeta(int32_t _num_beams, std::vector<SequenceMeta *> &seq) {
+        num_beams = _num_beams;
+        sequences = seq;
+    }
 
-//     SequenceMeta *sequence;
-// };
+private:
+    int32_t num_beams;
+    std::vector<SequenceMeta *> sequences;
+};
 
 
 //    SequencePool
@@ -125,17 +131,22 @@ class SequencePool {
     }
 
     SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen,
-            std::vector<int32_t> inputTokens = std::vector<int32_t>()) {
+            std::vector<int32_t> &inputTokens) {
         auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen, inputTokens);
         return sequenceMeta;
     }
 
+    SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen) {
+        auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen);
+        return sequenceMeta;
+    }
+
     bool add(int32_t sequenceID, SequenceMeta *sequence, bool force = false) {
         bool isSuccess = false;
         if (force) {
             auto it = hub.find(sequenceID);
             if (it != hub.end()) { 
-                remove(it->first);
+                remove(it->first, true);
             }
 
             hub[sequenceID] = sequence;
@@ -181,7 +192,7 @@ class SequencePool {
         bool isSuccess = false;
         auto it = hub.find(sequenceID);
         if (it != hub.end()) {
-            remove(it->first);
+            remove(it->first, true);
             hub[sequenceID] = newSequenceMeta;
             isSuccess = true;
         }
@@ -193,11 +204,10 @@ class SequencePool {
     SequencePool() {}
 
     std::unordered_map<int32_t, SequenceMeta *> hub;
-
-    //mgr
 };
 
 
+// Manage input sequenceMeta
 class InputQueue {
 public:
     static InputQueue &getInstance() {
@@ -217,12 +227,12 @@ class InputQueue {
     bool empty() { return queue.empty(); }
 
     SequenceMeta *pop() {
-        auto buffer = queue.front();
+        auto seq = queue.front();
         queue.pop();
-        return buffer;
+        return seq;
     }
 
-    void push(SequenceMeta *buffer) { queue.push(buffer); }
+    void push(SequenceMeta *seq) { queue.push(seq); }
 
 private:
     InputQueue() {}
@@ -232,6 +242,7 @@ class InputQueue {
 };
 
 
+// Manage executive sequenceMeta
 class TaskWaitingQueue {
 public:
     static TaskWaitingQueue &getInstance() {
@@ -250,12 +261,12 @@ class TaskWaitingQueue {
     }
 
     SequenceMeta *pop() {
-        auto buffer = queue.front();
+        auto seq = queue.front();
         queue.pop();
-        return buffer;
+        return seq;
     }
 
-    void push(SequenceMeta *buffer) { queue.push(buffer); }
+    void push(SequenceMeta *seq) { queue.push(seq); }
 
 private:
     TaskWaitingQueue() {}
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 3e06555d..90d1590c 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -346,7 +346,7 @@ class CommonDecoder : public AbstractDecoder {
             int count = batchSize * inputSeqLen * hiddenSize;
             int32_t sequenceID;
             MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            TimeLine t("Decoder.MPI_Recv." + std::to_string(sequenceID));
+            TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
             MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
@@ -375,7 +375,7 @@ class CommonDecoder : public AbstractDecoder {
         if (!TaskWaitingQueue::getInstance().empty()) {
             runningTask = TaskWaitingQueue::getInstance().pop();
             ctx->sequenceID = runningTask->getSequenceID();
-            TimeLine t("Decoder.step." + std::to_string(ctx->sequenceID));
+            TimeLine t("Decoder.Seq" + std::to_string(ctx->sequenceID) + ".Step");
 #endif
 
         // Decoder: forward
@@ -435,10 +435,10 @@ class CommonDecoder : public AbstractDecoder {
 
         // If current pipeline stage isn't the end of stage, should send data to next stage and return nullptr
         if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) {
-            TimeLine t("Decoder.MPI_Send");
+            int32_t sequenceID = runningTask->getSequenceID();
+            TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Send");
             int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * hiddenSize;
-            int32_t sequenceID = runningTask->getSequenceID();
             MPI_Send(&sequenceID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index 620a89c5..d0611b0f 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -48,7 +48,7 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
                     int32_t sequenceID;
                     MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
                             MPI_STATUS_IGNORE);
-                    TimeLine t("GreedySearch.MPI_Recv.sequence" + std::to_string(sequenceID));
+                    TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
                     MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank,
                             predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                     if (SequencePool::getInstance().has(sequenceID)) {
@@ -64,7 +64,7 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
     } else { // The last predictor pipeline parallel stage
         this->nextTokens = this->search(result);
         if (ctx->ppSize > 1 && ctx->ppRank == ctx->ppSize - 1) {
-            TimeLine t("GreedySearch.MPI_Send.sequence" + std::to_string(ctx->sequenceID));
+            TimeLine t("GreedySearch.Seq" + std::to_string(ctx->sequenceID) + ".MPI_Send");
             int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
             MPI_Send(&ctx->sequenceID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);

From 8d5dfe18e3d319bfe1d803d767138f35e398f0e0 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 21:18:28 +0800
Subject: [PATCH 26/32] update

---
 src/common/sequence.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/common/sequence.h b/src/common/sequence.h
index 204f543f..af32ead9 100644
--- a/src/common/sequence.h
+++ b/src/common/sequence.h
@@ -84,8 +84,6 @@ class SequenceMeta {
     int32_t pastSeqLen;
     std::vector<int32_t> inputTokens; // input tokens + next tokens
     std::vector<int32_t> nextTokens; // next tokens
-
-    // Indicates whether the sequence is in the prefill phase
     int32_t step;
 
 #ifdef PIPELINE_PARALLEL

From d77e7c12216796c1aef7a0e8755703366c6a8b0a Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Mon, 22 Apr 2024 21:23:39 +0800
Subject: [PATCH 27/32] update

---
 src/common/sequence.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/common/sequence.h b/src/common/sequence.h
index af32ead9..bc696840 100644
--- a/src/common/sequence.h
+++ b/src/common/sequence.h
@@ -128,6 +128,15 @@ class SequencePool {
         return instance;
     }
 
+    int32_t createSequenceID() {
+        int32_t id = globalSequenceID++;
+        if (id >= 10 * 1024) {
+            globalSequenceID = 0;
+            id = globalSequenceID++;
+        }
+        return id;
+    }
+
     SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen,
             std::vector<int32_t> &inputTokens) {
         auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen, inputTokens);
@@ -201,6 +210,7 @@ class SequencePool {
 private:
     SequencePool() {}
 
+    int32_t globalSequenceID = 0;
     std::unordered_map<int32_t, SequenceMeta *> hub;
 };
 
@@ -213,15 +223,6 @@ class InputQueue {
         return instance;
     }
 
-    int32_t createSequenceID() {
-        int32_t id = sequenceID++;
-        if (id >= 10 * 1024) {
-            sequenceID = 0;
-            id = sequenceID++;
-        }
-        return id;
-    }
-
     bool empty() { return queue.empty(); }
 
     SequenceMeta *pop() {
@@ -235,7 +236,6 @@ class InputQueue {
 private:
     InputQueue() {}
 
-    int32_t sequenceID = 0;
     std::queue<SequenceMeta *> queue;
 };
 

From fffd7b6ad9d7dcf161bbbcdfbccb3aaaa257fdcc Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Tue, 23 Apr 2024 10:19:12 +0800
Subject: [PATCH 28/32] add PIPELINE_PARALLEL macro

---
 src/common/sequence.h        | 45 ++++++++++++++++++++++--------------
 src/common/transformer_ctx.h |  3 +++
 src/models/common_decoder.h  |  8 +++----
 3 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/src/common/sequence.h b/src/common/sequence.h
index bc696840..d036e042 100644
--- a/src/common/sequence.h
+++ b/src/common/sequence.h
@@ -43,14 +43,15 @@ namespace xft {
 class SequenceMeta {
 public:
     SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen, std::vector<int32_t> &_inputTokens)
-        : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), step(0) {
+        : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), pastSeqLen(0), step(0) {
         inputTokens.resize(_inputSeqLen);
         inputTokens.assign(_inputTokens.begin(), _inputTokens.end());
         nextTokens.resize(_inputSeqLen);
+        setPastSeqLen(getPastSeqLen());
     }
 
     SequenceMeta(int32_t _sequenceID, int32_t _inputSeqLen)
-        : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), inputTokens(_inputSeqLen, 0), step(0) {
+        : sequenceID(_sequenceID), inputSeqLen(_inputSeqLen), inputTokens(_inputSeqLen, 0), pastSeqLen(0), step(0) {
         nextTokens.resize(_inputSeqLen);
     }
 
@@ -58,6 +59,21 @@ class SequenceMeta {
 
     int32_t getSequenceID() const { return sequenceID; }
 
+    // For first tokens
+    void stepForward() {
+        if (getStep() == 0) {
+            setPastSeqLen(inputTokens.size());
+            setStep(getStep() + 1);
+        }
+    }
+
+    // For next token
+    void stepForward(int32_t token) {
+        addNextToken(token);
+        setPastSeqLen(getPastSeqLen() + 1);
+        setStep(getStep() + 1);
+    }
+
     // Get the input tokens in sequence
     int32_t getInputSeqLen() const { return inputSeqLen; }
 
@@ -68,11 +84,15 @@ class SequenceMeta {
     void setPastSeqLen(int32_t _pastSeqLen) { pastSeqLen = _pastSeqLen; }
 
     // For next tokens
-    void addNextToken(int32_t token) { nextTokens.push_back(token); }
+    void addNextToken(int32_t token) {
+        nextTokens.clear();
+        nextTokens.push_back(token);
+        inputTokens.push_back(token);
+    }
 
     int32_t getLatestToken() const { return nextTokens.back(); }
 
-    const int32_t *getTotalTokens() const { return nextTokens.data(); }
+    const int32_t *getTotalTokens() const { return getInputTokens(); }
 
     int32_t getStep() const { return step; }
 
@@ -96,11 +116,10 @@ class SequenceMeta {
 
 private:
     int32_t hiddenSize;
-    void* hiddenStates;
+    void *hiddenStates;
 #endif
 };
 
-
 // For beam searcher
 class SequenceGroupMeta {
 public:
@@ -114,7 +133,6 @@ class SequenceGroupMeta {
     std::vector<SequenceMeta *> sequences;
 };
 
-
 //    SequencePool
 //    ┌──────┬──────┬──────┐
 //    │      │      │  ◄───┼──┬─ SequenceMeta
@@ -137,8 +155,7 @@ class SequencePool {
         return id;
     }
 
-    SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen,
-            std::vector<int32_t> &inputTokens) {
+    SequenceMeta *createMeta(int32_t sequenceID, int32_t inputSeqLen, std::vector<int32_t> &inputTokens) {
         auto *sequenceMeta = new SequenceMeta(sequenceID, inputSeqLen, inputTokens);
         return sequenceMeta;
     }
@@ -152,9 +169,7 @@ class SequencePool {
         bool isSuccess = false;
         if (force) {
             auto it = hub.find(sequenceID);
-            if (it != hub.end()) { 
-                remove(it->first, true);
-            }
+            if (it != hub.end()) { remove(it->first, true); }
 
             hub[sequenceID] = sequence;
             isSuccess = true;
@@ -185,9 +200,7 @@ class SequencePool {
         if (has(sequenceID)) {
             if (deep == true) {
                 auto it = hub.find(sequenceID);
-                if (it != hub.end()) { 
-                    delete it->second;
-                }
+                if (it != hub.end()) { delete it->second; }
             }
             isSuccess = hub.erase(sequenceID);
         }
@@ -214,7 +227,6 @@ class SequencePool {
     std::unordered_map<int32_t, SequenceMeta *> hub;
 };
 
-
 // Manage input sequenceMeta
 class InputQueue {
 public:
@@ -239,7 +251,6 @@ class InputQueue {
     std::queue<SequenceMeta *> queue;
 };
 
-
 // Manage executive sequenceMeta
 class TaskWaitingQueue {
 public:
diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h
index 5aa39501..0a48c88b 100644
--- a/src/common/transformer_ctx.h
+++ b/src/common/transformer_ctx.h
@@ -64,7 +64,10 @@ struct DecoderContext {
     int inputSeqLen;
     // For custom usage
     int reserved1;
+
+#ifdef PIPELINE_PARALLEL
     int sequenceID;
+#endif
 
     // Model structure configuration
     int vocabSize;
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 90d1590c..2382a73f 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -371,11 +371,12 @@ class CommonDecoder : public AbstractDecoder {
 
         while(TaskWaitingQueue::getInstance().empty());
 
-        SequenceMeta *runningTask;
+        SequenceMeta *runningTask = nullptr;
+        int32_t sequenceID = -1;
         if (!TaskWaitingQueue::getInstance().empty()) {
             runningTask = TaskWaitingQueue::getInstance().pop();
-            ctx->sequenceID = runningTask->getSequenceID();
-            TimeLine t("Decoder.Seq" + std::to_string(ctx->sequenceID) + ".Step");
+            sequenceID = runningTask->getSequenceID();
+            TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".Step");
 #endif
 
         // Decoder: forward
@@ -435,7 +436,6 @@ class CommonDecoder : public AbstractDecoder {
 
         // If current pipeline stage isn't the end of stage, should send data to next stage and return nullptr
         if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) {
-            int32_t sequenceID = runningTask->getSequenceID();
             TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Send");
             int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * hiddenSize;

From dcff843e642fd226ebd30a2efa6739946726ff3f Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Thu, 25 Apr 2024 09:31:46 +0800
Subject: [PATCH 29/32] Update pp inputs

---
 CMakeLists.txt                  |  2 +-
 examples/cpp/example.cpp        |  6 +++-
 src/common/sequence.h           |  4 ++-
 src/models/common_decoder.h     | 50 +++++++++++++++++----------
 src/models/models.cpp           | 60 ++++++++++++++++++++++++++++++---
 src/searchers/greedy_search.cpp |  4 +++
 6 files changed, 101 insertions(+), 25 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index acc21129..08b5b90c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -179,7 +179,7 @@ add_definitions(-DAVX512_FP16_WEIGHT_ONLY_INT4=true)
 add_definitions(-DAVX512_FP32_WEIGHT_ONLY_NF4=true)
 # add_definitions(-DAVX512_FP16_WEIGHT_ONLY_NF4=true)
 
-# add_definitions(-DDEBUG=true)
+add_definitions(-DDEBUG=true)
 # add_definitions(-DSTEP_BY_STEP_ATTN=true)
 add_definitions(-DUSE_SHM=true)
 option(XFT_BUILD_TESTS "Build xfastertransformer unit tests" OFF)
diff --git a/examples/cpp/example.cpp b/examples/cpp/example.cpp
index 34cf4eb0..7040edfb 100644
--- a/examples/cpp/example.cpp
+++ b/examples/cpp/example.cpp
@@ -441,11 +441,15 @@ int main(int argc, char **argv) {
         if (!model.isDone()) {
             Timer t(isMaster, "[INFO] First token");
             firstIds = model.generate();
+            printf("firstIds[0]: %d\n", firstIds[0]);
+            fflush(stdout);
         }
 
         Timer timerSecond;
         if (!model.isDone()) {
             secondIds = model.generate();
+            printf("secondIds[0]: %d\n", secondIds[0]);
+            fflush(stdout);
             secondIdCount++;
         }
 
@@ -467,7 +471,7 @@ int main(int argc, char **argv) {
         }
         auto result = model.finalize();
 
-        if (isMaster) {
+        if (true) {
             std::cout << "\n[INFO] Final output is: " << std::endl;
             std::vector<std::string> sent = tokenizer->batchDecode(result, batchSize);
             for (auto str : sent) {
diff --git a/src/common/sequence.h b/src/common/sequence.h
index d036e042..a3d79bab 100644
--- a/src/common/sequence.h
+++ b/src/common/sequence.h
@@ -69,7 +69,7 @@ class SequenceMeta {
 
     // For next token
     void stepForward(int32_t token) {
-        addNextToken(token);
+        // addNextToken(token);
         setPastSeqLen(getPastSeqLen() + 1);
         setStep(getStep() + 1);
     }
@@ -269,6 +269,8 @@ class TaskWaitingQueue {
         return full;
     }
 
+    SequenceMeta *front() { return queue.front(); }
+
     SequenceMeta *pop() {
         auto seq = queue.front();
         queue.pop();
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 2382a73f..8cb70120 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -31,11 +31,12 @@
 #include "mlp_chatglm2.h"
 #include "mlp_standard.h"
 #include "model_factory.h"
+#include "sequence.h"
+#include "thread_util.h"
 #include "timeline.h"
 #include "transformer_ctx.h"
 #include "transpose_util.h"
 #include "weight_util.h"
-#include "sequence.h"
 
 using namespace xft;
 
@@ -326,8 +327,7 @@ class CommonDecoder : public AbstractDecoder {
         dbg.debugPrint("---- embedding.forward ----\n");
         dbg.debugPrint("ids:\n");
         dbg.dumpMatrix(ids, batchSize, inputSeqLen, inputSeqLen);
-        dbg.debugPrint(
-                "embBuf(rows: %d, cols: %d, stride: %d):\n", batchSize * inputSeqLen, hiddenSize, hiddenSize);
+        dbg.debugPrint("embBuf(rows: %d, cols: %d, stride: %d):\n", batchSize * inputSeqLen, hiddenSize, hiddenSize);
         dbg.dumpMatrix(embBuf, batchSize * inputSeqLen, hiddenSize, hiddenSize);
 #endif
 
@@ -344,22 +344,28 @@ class CommonDecoder : public AbstractDecoder {
             int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank;
             int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
             int count = batchSize * inputSeqLen * hiddenSize;
-            int32_t sequenceID;
-            MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
-            MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            // TODO: Error: different scope when dynamic loading so file
-            // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
-            if (!SequencePool::getInstance().has(sequenceID)) {
-                SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen);
-                sequence->setPastSeqLen(pastSeqLen);
-                sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
-                SequencePool::getInstance().add(sequence->getSequenceID(), sequence);
-            }
-            TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID));
+            ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, count, &embBuf, seqLen, hiddenSize, pastSeqLen] {
+                while (true) {
+                    int32_t sequenceID;
+                    MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
+                    MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    // TODO: Error: different scope when dynamic loading so file
+                    // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
+                    printf("Decoder.Seq%d.MPI_Recv\n", sequenceID);
+                    fflush(stdout);
+                    if (!SequencePool::getInstance().has(sequenceID)) {
+                        SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen);
+                        sequence->setPastSeqLen(pastSeqLen);
+                        sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
+                        SequencePool::getInstance().add(sequence->getSequenceID(), sequence);
+                    }
+                    TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID));
+                }
+            });
         }
 
-        if (!InputQueue::getInstance().empty()) {
+        while (!InputQueue::getInstance().empty()) {
             if (!TaskWaitingQueue::getInstance().isFull()) {
                 auto sequence = InputQueue::getInstance().pop();
                 sequence->setPastSeqLen(pastSeqLen);
@@ -369,13 +375,19 @@ class CommonDecoder : public AbstractDecoder {
             }
         }
 
-        while(TaskWaitingQueue::getInstance().empty());
+        while (TaskWaitingQueue::getInstance().empty());
 
         SequenceMeta *runningTask = nullptr;
         int32_t sequenceID = -1;
         if (!TaskWaitingQueue::getInstance().empty()) {
             runningTask = TaskWaitingQueue::getInstance().pop();
             sequenceID = runningTask->getSequenceID();
+            ctx->sequenceID = runningTask->getSequenceID();
+            runningTask->setPastSeqLen(pastSeqLen);
+            runningTask->allocBuffer<AttnInT>(hiddenSize, embBuf);
+            printf("Decoder.Seq%d.step\n", sequenceID);
+            fflush(stdout);
+
             TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".Step");
 #endif
 
@@ -443,6 +455,8 @@ class CommonDecoder : public AbstractDecoder {
             MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank);
+            printf("Decoder.Seq%d.MPI_Send\n", sequenceID);
+            fflush(stdout);
             return std::tuple<float *, int, int>(nullptr, 0, 0);
         }
 #endif
diff --git a/src/models/models.cpp b/src/models/models.cpp
index 6e45b0de..dfbafdf3 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -85,6 +85,21 @@ void Model::input(std::vector<int32_t> &inputIds_, int batchSize_) {
     inputIds.resize(dims[1]);
     if (decoder->getRank() == 0) { inputIds = inputIds_; }
     messenger.broadcast(inputIds.data(), dims[1]);
+
+    if (this->isMaster()) {
+        for (int i = 0; i < 2; ++i) {
+            int sequenceID = SequencePool::getInstance().createSequenceID();
+            InputQueue::getInstance().push(SequencePool::getInstance().createMeta(sequenceID, seqLen, inputIds));
+        }
+
+        while (!InputQueue::getInstance().empty()) {
+            if (!TaskWaitingQueue::getInstance().isFull()) {
+                auto sequence = InputQueue::getInstance().pop();
+                SequencePool::getInstance().add(sequence->getSequenceID(), sequence);
+                TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequence->getSequenceID()));
+            }
+        }
+    }
 }
 
 void Model::config(int maxLen_, int numBeams_, int numBeamHypsToKeep_, float lenPenalty_, bool doEarlyStopping_,
@@ -142,12 +157,49 @@ std::vector<int32_t> Model::generate() {
         exit(-1);
     }
 
-    if (isNewInput) {
-        isNewInput = false;
-        return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
+    if (this->isMaster()) {
+        while(TaskWaitingQueue::getInstance().empty());
+
+        if (TaskWaitingQueue::getInstance().front()->getStep() == 0) {
+            auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
+            TaskWaitingQueue::getInstance().front()->stepForward();
+            return token;
+        } else {
+            auto token = searcher->getNextToken();
+            TaskWaitingQueue::getInstance().front()->stepForward(token[0]);
+            return token;
+        }
     } else {
-        return searcher->getNextToken();
+        if (!isNewInput) {
+            while(TaskWaitingQueue::getInstance().empty());
+
+            if (TaskWaitingQueue::getInstance().front()->getStep() == 0) {
+                auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
+                TaskWaitingQueue::getInstance().front()->stepForward();
+                return token;
+            } else {
+                auto token = searcher->getNextToken();
+                TaskWaitingQueue::getInstance().front()->stepForward(token[0]);
+                return token;
+            }
+        } else {
+            isNewInput = false;
+            auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
+            TaskWaitingQueue::getInstance().front()->stepForward();
+            return token;
+        }
     }
+
+    // if (isNewInput) {
+    //     printf("1st\n");
+    //     // TaskWaitingQueue::getInstance().front()->getStep() == 0
+    //     // TaskWaitingQueue::getInstance().front()->stepForward();
+    //     isNewInput = false;
+    //     return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
+    // } else {
+    //     printf("2nd\n");
+    //     return searcher->getNextToken();
+    // }
 }
 
 void Model::createSearcher(SearcherConfig &config_) {
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index d0611b0f..cbfbcc5a 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -51,6 +51,8 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
                     TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
                     MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank,
                             predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    printf("GreedySearch.Seq%d.MPI_Recv\n", sequenceID);
+                    fflush(stdout);
                     if (SequencePool::getInstance().has(sequenceID)) {
                         auto sequence = SequencePool::getInstance().get(sequenceID);
                         TaskWaitingQueue::getInstance().push(sequence);
@@ -72,6 +74,8 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
                     MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank);
+            printf("GreedySearch.Seq%d.MPI_Send\n", ctx->sequenceID);
+            fflush(stdout);
         }
     }
 #else

From 90cabf882b7b7c843c01236b7012d54b69632430 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Sun, 28 Apr 2024 13:46:58 +0800
Subject: [PATCH 30/32] run good

---
 src/common/transformer_ctx.h    |  2 +-
 src/models/common_decoder.h     | 61 ++++++++++++++++++++++-----------
 src/models/models.cpp           |  5 +++
 src/searchers/greedy_search.cpp | 22 +++++++-----
 src/searchers/greedy_search.h   |  2 +-
 5 files changed, 62 insertions(+), 30 deletions(-)

diff --git a/src/common/transformer_ctx.h b/src/common/transformer_ctx.h
index 0a48c88b..8c11815d 100644
--- a/src/common/transformer_ctx.h
+++ b/src/common/transformer_ctx.h
@@ -66,7 +66,7 @@ struct DecoderContext {
     int reserved1;
 
 #ifdef PIPELINE_PARALLEL
-    int sequenceID;
+    int32_t sequenceID;
 #endif
 
     // Model structure configuration
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index 8cb70120..d96ac699 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -340,24 +340,39 @@ class CommonDecoder : public AbstractDecoder {
 
 #ifdef PIPELINE_PARALLEL
         // if current pipeline parallel stage rank isn't the first stage, should receive previous stage data
-        if (ctx->ppSize > 1 && ctx->ppRank > 0) {
+        if (ctx->ppSize > 1 && ctx->ppRank > 0 && enabledBackgroundSync == false) {
+            enabledBackgroundSync = true;
             int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank;
             int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
-            int count = batchSize * inputSeqLen * hiddenSize;
-            ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, count, &embBuf, seqLen, hiddenSize, pastSeqLen] {
+            // int64_t count = batchSize * inputSeqLen * hiddenSize;
+            ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, seqLen, hiddenSize, pastSeqLen, this] {
                 while (true) {
-                    int32_t sequenceID;
-                    MPI_Recv(&sequenceID, 1, MPI_INT32_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                    TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
-                    MPI_Recv(embBuf, count, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    int64_t recvBuf[2] = {0, 0};
+                    MPI_Recv(&recvBuf, 2, MPI_INT64_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    int32_t sequenceID = recvBuf[0];
+                    int64_t count = recvBuf[1];
+                    MPI_Recv(this->actBuffers->Data(), count, MPI_FLOAT, prev_world_rank, curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+                    // MPI_Status status;
+                    // MPI_Probe(prev_world_rank, curr_world_rank, MPI_COMM_WORLD, &status);
+                    // int number_amount;
+                    // MPI_Get_count(&status, MPI_FLOAT, &number_amount);
+                    // printf("Decoder.probe.%d\n", number_amount);
+                    // fflush(stdout);
+                    // float recvBuf[number_amount] = {0.0f};
+                    // MPI_Recv(&recvBuf, number_amount, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    // int32_t sequenceID = recvBuf[0];
+                    // int64_t count = recvBuf[1];
+                    printf("Decoder.Seq%d.MPI_Recv%d\n", sequenceID, count);
+                    fflush(stdout);
+                    // memcpy(this->actBuffers->Data(), recvBuf + 2, count * sizeof(float));
                     // TODO: Error: different scope when dynamic loading so file
                     // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
-                    printf("Decoder.Seq%d.MPI_Recv\n", sequenceID);
-                    fflush(stdout);
+                    TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
                     if (!SequencePool::getInstance().has(sequenceID)) {
                         SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen);
-                        sequence->setPastSeqLen(pastSeqLen);
-                        sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
+                        // sequence->setPastSeqLen(pastSeqLen);
+                        // sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
                         SequencePool::getInstance().add(sequence->getSequenceID(), sequence);
                     }
                     TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequenceID));
@@ -368,8 +383,8 @@ class CommonDecoder : public AbstractDecoder {
         while (!InputQueue::getInstance().empty()) {
             if (!TaskWaitingQueue::getInstance().isFull()) {
                 auto sequence = InputQueue::getInstance().pop();
-                sequence->setPastSeqLen(pastSeqLen);
-                sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
+                // sequence->setPastSeqLen(pastSeqLen);
+                // sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
                 SequencePool::getInstance().add(sequence->getSequenceID(), sequence);
                 TaskWaitingQueue::getInstance().push(SequencePool::getInstance().get(sequence->getSequenceID()));
             }
@@ -380,11 +395,11 @@ class CommonDecoder : public AbstractDecoder {
         SequenceMeta *runningTask = nullptr;
         int32_t sequenceID = -1;
         if (!TaskWaitingQueue::getInstance().empty()) {
-            runningTask = TaskWaitingQueue::getInstance().pop();
+            runningTask = TaskWaitingQueue::getInstance().front();
             sequenceID = runningTask->getSequenceID();
             ctx->sequenceID = runningTask->getSequenceID();
-            runningTask->setPastSeqLen(pastSeqLen);
-            runningTask->allocBuffer<AttnInT>(hiddenSize, embBuf);
+            // runningTask->setPastSeqLen(pastSeqLen);
+            // runningTask->allocBuffer<AttnInT>(hiddenSize, embBuf);
             printf("Decoder.Seq%d.step\n", sequenceID);
             fflush(stdout);
 
@@ -450,12 +465,16 @@ class CommonDecoder : public AbstractDecoder {
         if (ctx->ppSize > 1 && ctx->ppRank < ctx->ppSize - 1) {
             TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Send");
             int next_world_rank = (ctx->ppRank + 1) * ctx->tpSize + ctx->tpRank;
-            int count = batchSize * inputSeqLen * hiddenSize;
-            MPI_Send(&sequenceID, 1, MPI_INT32_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
-            MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
+            int64_t count = batchSize * inputSeqLen * hiddenSize;
+            int64_t sendBuf[2] = {sequenceID, count};
+            MPI_Send(&sendBuf, 2, MPI_INT64_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
+            MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank + 1000, MPI_COMM_WORLD);
+            // float sendBuf[2 + count] = {(float)sequenceID, (float)count};
+            // memcpy(sendBuf + 2, embBuf, count * sizeof(float));
+            // MPI_Send(&sendBuf, 2 + count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank);
-            printf("Decoder.Seq%d.MPI_Send\n", sequenceID);
+            printf("Decoder.Seq%d.MPI_Send%d\n", sequenceID, count);
             fflush(stdout);
             return std::tuple<float *, int, int>(nullptr, 0, 0);
         }
@@ -1030,6 +1049,8 @@ class CommonDecoder : public AbstractDecoder {
     int startId;
     int endId;
 
+    bool enabledBackgroundSync = false;
+
 #ifdef DEBUG
     Debugger dbg;
 #endif
diff --git a/src/models/models.cpp b/src/models/models.cpp
index dfbafdf3..54f2421f 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -163,10 +163,12 @@ std::vector<int32_t> Model::generate() {
         if (TaskWaitingQueue::getInstance().front()->getStep() == 0) {
             auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
             TaskWaitingQueue::getInstance().front()->stepForward();
+            TaskWaitingQueue::getInstance().pop();
             return token;
         } else {
             auto token = searcher->getNextToken();
             TaskWaitingQueue::getInstance().front()->stepForward(token[0]);
+            TaskWaitingQueue::getInstance().pop();
             return token;
         }
     } else {
@@ -176,16 +178,19 @@ std::vector<int32_t> Model::generate() {
             if (TaskWaitingQueue::getInstance().front()->getStep() == 0) {
                 auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
                 TaskWaitingQueue::getInstance().front()->stepForward();
+                TaskWaitingQueue::getInstance().pop();
                 return token;
             } else {
                 auto token = searcher->getNextToken();
                 TaskWaitingQueue::getInstance().front()->stepForward(token[0]);
+                TaskWaitingQueue::getInstance().pop();
                 return token;
             }
         } else {
             isNewInput = false;
             auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
             TaskWaitingQueue::getInstance().front()->stepForward();
+            TaskWaitingQueue::getInstance().pop();
             return token;
         }
     }
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index cbfbcc5a..3ad6ba1f 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -45,12 +45,16 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
             ThreadPool::getInstance().addTask([predictor_world_rank, this] {
                 while (true) {
-                    int32_t sequenceID;
-                    MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
+                    int32_t recvBuf[2];
+                    MPI_Recv(&recvBuf, 2, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
                             MPI_STATUS_IGNORE);
-                    TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
-                    MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank,
-                            predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    int32_t sequenceID = recvBuf[0];
+                    this->nextTokens[0] = recvBuf[1];
+                    // MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
+                    //         MPI_STATUS_IGNORE);
+                    // TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
+                    // MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank,
+                    //         predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                     printf("GreedySearch.Seq%d.MPI_Recv\n", sequenceID);
                     fflush(stdout);
                     if (SequencePool::getInstance().has(sequenceID)) {
@@ -69,9 +73,11 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
             TimeLine t("GreedySearch.Seq" + std::to_string(ctx->sequenceID) + ".MPI_Send");
             int embedding_world_rank = 0 * ctx->tpSize + ctx->tpRank;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
-            MPI_Send(&ctx->sequenceID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
-            MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank,
-                    MPI_COMM_WORLD);
+            int32_t sendBuf[2] = {ctx->sequenceID, nextTokens[0]};
+            MPI_Send(&sendBuf, 2, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
+            // MPI_Send(&ctx->sequenceID, 1, MPI_INT32_T, embedding_world_rank, predictor_world_rank, MPI_COMM_WORLD);
+            // MPI_Send(this->nextTokens.data(), batchSize, MPI_INT32_T, embedding_world_rank, predictor_world_rank,
+            //         MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank);
             printf("GreedySearch.Seq%d.MPI_Send\n", ctx->sequenceID);
diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h
index 5b4ec164..22a9c419 100644
--- a/src/searchers/greedy_search.h
+++ b/src/searchers/greedy_search.h
@@ -47,7 +47,7 @@ class GreedySearch : public AbstractSearcher {
     std::vector<std::vector<int>> cachedRepetVec;
     std::vector<int> doneBatch;
 
-    bool enabledBackgroundSync;
+    bool enabledBackgroundSync = false;
     int batchSize;
     int step;
     int curLen;

From e9d543735d10cd8810f1516fcf28b7e5054d5cb7 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Sun, 28 Apr 2024 14:52:09 +0800
Subject: [PATCH 31/32] run good

---
 CMakeLists.txt              | 2 +-
 src/common/sequence.h       | 5 +++++
 src/models/common_decoder.h | 7 ++++---
 src/models/models.cpp       | 1 +
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 08b5b90c..acc21129 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -179,7 +179,7 @@ add_definitions(-DAVX512_FP16_WEIGHT_ONLY_INT4=true)
 add_definitions(-DAVX512_FP32_WEIGHT_ONLY_NF4=true)
 # add_definitions(-DAVX512_FP16_WEIGHT_ONLY_NF4=true)
 
-add_definitions(-DDEBUG=true)
+# add_definitions(-DDEBUG=true)
 # add_definitions(-DSTEP_BY_STEP_ATTN=true)
 add_definitions(-DUSE_SHM=true)
 option(XFT_BUILD_TESTS "Build xfastertransformer unit tests" OFF)
diff --git a/src/common/sequence.h b/src/common/sequence.h
index a3d79bab..ae10ac2f 100644
--- a/src/common/sequence.h
+++ b/src/common/sequence.h
@@ -114,8 +114,13 @@ class SequenceMeta {
         memcpy(hiddenStates, _hiddenStates, sizeof(T) * getInputSeqLen() * hiddenSize);
     }
 
+    int32_t getHiddenStatesSize() const { return hiddenStatesSize; }
+
+    void setHiddenStatesSize(int32_t _hiddenStatesSize) { hiddenStatesSize = _hiddenStatesSize; }
+
 private:
     int32_t hiddenSize;
+    int64_t hiddenStatesSize;
     void *hiddenStates;
 #endif
 };
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index d96ac699..cff904a1 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -339,11 +339,11 @@ class CommonDecoder : public AbstractDecoder {
         t1.release();
 
 #ifdef PIPELINE_PARALLEL
+        int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank;
+        int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
         // if current pipeline parallel stage rank isn't the first stage, should receive previous stage data
         if (ctx->ppSize > 1 && ctx->ppRank > 0 && enabledBackgroundSync == false) {
             enabledBackgroundSync = true;
-            int curr_world_rank = ctx->ppRank * ctx->tpSize + ctx->tpRank;
-            int prev_world_rank = (ctx->ppRank - 1) * ctx->tpSize + ctx->tpRank;
             // int64_t count = batchSize * inputSeqLen * hiddenSize;
             ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, seqLen, hiddenSize, pastSeqLen, this] {
                 while (true) {
@@ -351,7 +351,6 @@ class CommonDecoder : public AbstractDecoder {
                     MPI_Recv(&recvBuf, 2, MPI_INT64_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                     int32_t sequenceID = recvBuf[0];
                     int64_t count = recvBuf[1];
-                    MPI_Recv(this->actBuffers->Data(), count, MPI_FLOAT, prev_world_rank, curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
 
                     // MPI_Status status;
                     // MPI_Probe(prev_world_rank, curr_world_rank, MPI_COMM_WORLD, &status);
@@ -371,6 +370,7 @@ class CommonDecoder : public AbstractDecoder {
                     TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
                     if (!SequencePool::getInstance().has(sequenceID)) {
                         SequenceMeta *sequence = SequencePool::getInstance().createMeta(sequenceID, seqLen);
+                        sequence->setHiddenStatesSize(count);
                         // sequence->setPastSeqLen(pastSeqLen);
                         // sequence->allocBuffer<AttnInT>(hiddenSize, embBuf);
                         SequencePool::getInstance().add(sequence->getSequenceID(), sequence);
@@ -400,6 +400,7 @@ class CommonDecoder : public AbstractDecoder {
             ctx->sequenceID = runningTask->getSequenceID();
             // runningTask->setPastSeqLen(pastSeqLen);
             // runningTask->allocBuffer<AttnInT>(hiddenSize, embBuf);
+            MPI_Recv(embBuf, TaskWaitingQueue::getInstance().front()->getHiddenStatesSize(), MPI_FLOAT, prev_world_rank, curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
             printf("Decoder.Seq%d.step\n", sequenceID);
             fflush(stdout);
 
diff --git a/src/models/models.cpp b/src/models/models.cpp
index 54f2421f..4ed419c1 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -166,6 +166,7 @@ std::vector<int32_t> Model::generate() {
             TaskWaitingQueue::getInstance().pop();
             return token;
         } else {
+            isNewInput = false;
             auto token = searcher->getNextToken();
             TaskWaitingQueue::getInstance().front()->stepForward(token[0]);
             TaskWaitingQueue::getInstance().pop();

From 9e1b7705ec85f19e89d43a146bd72244cd6e1b60 Mon Sep 17 00:00:00 2001
From: changqi1 <changqing.li@intel.com>
Date: Sun, 28 Apr 2024 22:11:22 +0800
Subject: [PATCH 32/32] could run

---
 examples/cpp/example.cpp        |  4 ---
 src/common/sequence.h           | 20 ++++++++++++
 src/models/common_decoder.h     | 27 +++-------------
 src/models/models.cpp           | 56 ++++++++++-----------------------
 src/searchers/greedy_search.cpp | 13 ++++----
 src/searchers/greedy_search.h   |  2 +-
 src/utils/thread_util.h         |  2 +-
 7 files changed, 48 insertions(+), 76 deletions(-)

diff --git a/examples/cpp/example.cpp b/examples/cpp/example.cpp
index 7040edfb..f6a68ce7 100644
--- a/examples/cpp/example.cpp
+++ b/examples/cpp/example.cpp
@@ -441,15 +441,11 @@ int main(int argc, char **argv) {
         if (!model.isDone()) {
             Timer t(isMaster, "[INFO] First token");
             firstIds = model.generate();
-            printf("firstIds[0]: %d\n", firstIds[0]);
-            fflush(stdout);
         }
 
         Timer timerSecond;
         if (!model.isDone()) {
             secondIds = model.generate();
-            printf("secondIds[0]: %d\n", secondIds[0]);
-            fflush(stdout);
             secondIdCount++;
         }
 
diff --git a/src/common/sequence.h b/src/common/sequence.h
index ae10ac2f..84049436 100644
--- a/src/common/sequence.h
+++ b/src/common/sequence.h
@@ -225,6 +225,14 @@ class SequencePool {
         return isSuccess;
     }
 
+    void clear() {
+        for (auto &it : hub) {
+            delete it.second;
+        }
+        hub.clear();
+        globalSequenceID = 0;
+    }
+
 private:
     SequencePool() {}
 
@@ -250,6 +258,12 @@ class InputQueue {
 
     void push(SequenceMeta *seq) { queue.push(seq); }
 
+    void clear() {
+        while (!queue.empty()) {
+            queue.pop();
+        }
+    }
+
 private:
     InputQueue() {}
 
@@ -284,6 +298,12 @@ class TaskWaitingQueue {
 
     void push(SequenceMeta *seq) { queue.push(seq); }
 
+    void clear() {
+        while (!queue.empty()) {
+            queue.pop();
+        }
+    }
+
 private:
     TaskWaitingQueue() {}
 
diff --git a/src/models/common_decoder.h b/src/models/common_decoder.h
index cff904a1..cbe06e82 100644
--- a/src/models/common_decoder.h
+++ b/src/models/common_decoder.h
@@ -348,23 +348,10 @@ class CommonDecoder : public AbstractDecoder {
             ThreadPool::getInstance().addTask([curr_world_rank, prev_world_rank, seqLen, hiddenSize, pastSeqLen, this] {
                 while (true) {
                     int64_t recvBuf[2] = {0, 0};
-                    MPI_Recv(&recvBuf, 2, MPI_INT64_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    MPI_Recv(&recvBuf, 2, MPI_INT64_T, prev_world_rank, curr_world_rank, MPI_COMM_WORLD,
+                            MPI_STATUS_IGNORE);
                     int32_t sequenceID = recvBuf[0];
                     int64_t count = recvBuf[1];
-
-                    // MPI_Status status;
-                    // MPI_Probe(prev_world_rank, curr_world_rank, MPI_COMM_WORLD, &status);
-                    // int number_amount;
-                    // MPI_Get_count(&status, MPI_FLOAT, &number_amount);
-                    // printf("Decoder.probe.%d\n", number_amount);
-                    // fflush(stdout);
-                    // float recvBuf[number_amount] = {0.0f};
-                    // MPI_Recv(&recvBuf, number_amount, MPI_FLOAT, prev_world_rank, curr_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                    // int32_t sequenceID = recvBuf[0];
-                    // int64_t count = recvBuf[1];
-                    printf("Decoder.Seq%d.MPI_Recv%d\n", sequenceID, count);
-                    fflush(stdout);
-                    // memcpy(this->actBuffers->Data(), recvBuf + 2, count * sizeof(float));
                     // TODO: Error: different scope when dynamic loading so file
                     // this->messenger.worldRecvFP32(embBuf, count, prev_world_rank, curr_world_rank);
                     TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
@@ -400,9 +387,8 @@ class CommonDecoder : public AbstractDecoder {
             ctx->sequenceID = runningTask->getSequenceID();
             // runningTask->setPastSeqLen(pastSeqLen);
             // runningTask->allocBuffer<AttnInT>(hiddenSize, embBuf);
-            MPI_Recv(embBuf, TaskWaitingQueue::getInstance().front()->getHiddenStatesSize(), MPI_FLOAT, prev_world_rank, curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            printf("Decoder.Seq%d.step\n", sequenceID);
-            fflush(stdout);
+            MPI_Recv(embBuf, TaskWaitingQueue::getInstance().front()->getHiddenStatesSize(), MPI_FLOAT, prev_world_rank,
+                    curr_world_rank + 1000, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
 
             TimeLine t("Decoder.Seq" + std::to_string(sequenceID) + ".Step");
 #endif
@@ -470,13 +456,8 @@ class CommonDecoder : public AbstractDecoder {
             int64_t sendBuf[2] = {sequenceID, count};
             MPI_Send(&sendBuf, 2, MPI_INT64_T, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             MPI_Send(embBuf, count, MPI_FLOAT, next_world_rank, next_world_rank + 1000, MPI_COMM_WORLD);
-            // float sendBuf[2 + count] = {(float)sequenceID, (float)count};
-            // memcpy(sendBuf + 2, embBuf, count * sizeof(float));
-            // MPI_Send(&sendBuf, 2 + count, MPI_FLOAT, next_world_rank, next_world_rank, MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // this->messenger.worldSendFP32(embBuf, count, next_world_rank, next_world_rank);
-            printf("Decoder.Seq%d.MPI_Send%d\n", sequenceID, count);
-            fflush(stdout);
             return std::tuple<float *, int, int>(nullptr, 0, 0);
         }
 #endif
diff --git a/src/models/models.cpp b/src/models/models.cpp
index 4ed419c1..18f08236 100644
--- a/src/models/models.cpp
+++ b/src/models/models.cpp
@@ -130,6 +130,11 @@ void Model::config(SearcherConfig &config_, const std::vector<std::vector<int>>
     // Slaves get exit flags and exit directly
     if (decoder->getRank() > 0 && configuration.numBeams == 0) { exit(0); }
 
+    InputQueue::getInstance().clear();
+    TaskWaitingQueue::getInstance().clear();
+    SequencePool::getInstance().clear();
+    // ThreadPool::getInstance().clear();
+
     createSearcher(configuration);
     setStopWords(stopWordsList_);
 }
@@ -157,55 +162,26 @@ std::vector<int32_t> Model::generate() {
         exit(-1);
     }
 
-    if (this->isMaster()) {
+    std::vector<int> token;
+    if (!this->isMaster() && isNewInput) {
+        isNewInput = false;
+        token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
+        TaskWaitingQueue::getInstance().front()->stepForward();
+    } else {
         while(TaskWaitingQueue::getInstance().empty());
 
         if (TaskWaitingQueue::getInstance().front()->getStep() == 0) {
-            auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
+            isNewInput = false;
+            token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
             TaskWaitingQueue::getInstance().front()->stepForward();
-            TaskWaitingQueue::getInstance().pop();
-            return token;
         } else {
-            isNewInput = false;
-            auto token = searcher->getNextToken();
+            token = searcher->getNextToken();
             TaskWaitingQueue::getInstance().front()->stepForward(token[0]);
-            TaskWaitingQueue::getInstance().pop();
-            return token;
-        }
-    } else {
-        if (!isNewInput) {
-            while(TaskWaitingQueue::getInstance().empty());
-
-            if (TaskWaitingQueue::getInstance().front()->getStep() == 0) {
-                auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
-                TaskWaitingQueue::getInstance().front()->stepForward();
-                TaskWaitingQueue::getInstance().pop();
-                return token;
-            } else {
-                auto token = searcher->getNextToken();
-                TaskWaitingQueue::getInstance().front()->stepForward(token[0]);
-                TaskWaitingQueue::getInstance().pop();
-                return token;
-            }
-        } else {
-            isNewInput = false;
-            auto token = searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
-            TaskWaitingQueue::getInstance().front()->stepForward();
-            TaskWaitingQueue::getInstance().pop();
-            return token;
         }
     }
 
-    // if (isNewInput) {
-    //     printf("1st\n");
-    //     // TaskWaitingQueue::getInstance().front()->getStep() == 0
-    //     // TaskWaitingQueue::getInstance().front()->stepForward();
-    //     isNewInput = false;
-    //     return searcher->getNextToken(inputIds.data(), batchSize, inputIds.size() / batchSize);
-    // } else {
-    //     printf("2nd\n");
-    //     return searcher->getNextToken();
-    // }
+    TaskWaitingQueue::getInstance().pop();
+    return token;
 }
 
 void Model::createSearcher(SearcherConfig &config_) {
diff --git a/src/searchers/greedy_search.cpp b/src/searchers/greedy_search.cpp
index 3ad6ba1f..493fb717 100644
--- a/src/searchers/greedy_search.cpp
+++ b/src/searchers/greedy_search.cpp
@@ -21,7 +21,7 @@
 using namespace xft;
 
 GreedySearch::GreedySearch(AbstractDecoder &dec, const SearcherConfig &config)
-    : decoder(dec), maxLen(config.maxLen), step(0), repetitionPenalty(config.repetitionPenalty) {
+    : decoder(dec), maxLen(config.maxLen), step(0), repetitionPenalty(config.repetitionPenalty), enabledBackgroundSync(false) {
     eosTokenId = config.eosTokenId == -1 ? decoder.getEndId() : config.eosTokenId;
     padTokenId = config.padTokenId == -1 ? eosTokenId : config.padTokenId;
     if (repetitionPenalty <= 0) {
@@ -39,7 +39,6 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
     // Messenger &messenger = decoder.getMessenger();
 
     if (std::get<0>(result) == nullptr) { // The first embedding pipeline parallel stage
-        this->nextTokens = std::vector<int>(batchSize, 0);
         if (ctx->ppSize > 1 && ctx->ppRank == 0 && enabledBackgroundSync == false) {
             enabledBackgroundSync = true;
             int predictor_world_rank = (ctx->ppSize - 1) * ctx->tpSize + ctx->tpRank;
@@ -52,11 +51,9 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
                     this->nextTokens[0] = recvBuf[1];
                     // MPI_Recv(&sequenceID, 1, MPI_INT32_T, predictor_world_rank, predictor_world_rank, MPI_COMM_WORLD,
                     //         MPI_STATUS_IGNORE);
-                    // TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
+                    TimeLine t("GreedySearch.Seq" + std::to_string(sequenceID) + ".MPI_Recv");
                     // MPI_Recv(this->nextTokens.data(), this->batchSize, MPI_INT32_T, predictor_world_rank,
                     //         predictor_world_rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-                    printf("GreedySearch.Seq%d.MPI_Recv\n", sequenceID);
-                    fflush(stdout);
                     if (SequencePool::getInstance().has(sequenceID)) {
                         auto sequence = SequencePool::getInstance().get(sequenceID);
                         TaskWaitingQueue::getInstance().push(sequence);
@@ -80,8 +77,6 @@ std::vector<int> GreedySearch::syncToken(std::tuple<float *, int, int> &result)
             //         MPI_COMM_WORLD);
             // TODO: Error: different scope when dynamic loading so file
             // messenger.worldSendINT32(this->nextTokens.data(), batchSize, embedding_world_rank, predictor_world_rank);
-            printf("GreedySearch.Seq%d.MPI_Send\n", ctx->sequenceID);
-            fflush(stdout);
         }
     }
 #else
@@ -112,6 +107,8 @@ std::vector<int> GreedySearch::getNextToken(int *ids, int batchSize, int seqLen)
     std::copy(ids, ids + batchSize * seqLen, output.begin());
 
     int64_t dims[3] = {batchSize, 1, seqLen};
+    if (this->nextTokens.size() != batchSize)
+        this->nextTokens.resize(batchSize, 0);
 
     std::tuple<float *, int, int> result = decoder.forward(ids, dims, this->step++);
 
@@ -122,6 +119,8 @@ std::vector<int> GreedySearch::getNextToken(int *ids, int batchSize, int seqLen)
 std::vector<int> GreedySearch::getNextToken() {
     TimeLine t("Next Token");
     int64_t dims[3] = {batchSize, 1, 1};
+    if (this->nextTokens.size() != batchSize)
+        this->nextTokens.resize(batchSize, 0);
 
     std::tuple<float *, int, int> result = decoder.forward(nextTokens.data(), dims, this->step++);
 
diff --git a/src/searchers/greedy_search.h b/src/searchers/greedy_search.h
index 22a9c419..5b4ec164 100644
--- a/src/searchers/greedy_search.h
+++ b/src/searchers/greedy_search.h
@@ -47,7 +47,7 @@ class GreedySearch : public AbstractSearcher {
     std::vector<std::vector<int>> cachedRepetVec;
     std::vector<int> doneBatch;
 
-    bool enabledBackgroundSync = false;
+    bool enabledBackgroundSync;
     int batchSize;
     int step;
     int curLen;
diff --git a/src/utils/thread_util.h b/src/utils/thread_util.h
index a44b08d7..f22c59ac 100644
--- a/src/utils/thread_util.h
+++ b/src/utils/thread_util.h
@@ -56,7 +56,7 @@ class ThreadPool {
         condition.notify_one();
     }
 
-    ~ThreadPool() {
+    void clear() {
         stop = true;
         condition.notify_all();
         for (std::thread &worker : workers) {