From a1b9a04b748fae681cb1ad9e936b65b495ee4aeb Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 24 Sep 2023 10:05:34 -0500 Subject: [PATCH 01/63] Refactor MPI for heterogenous cluster support. Adds support for different options and number of layers per node. The per-node options are implemented as parsing command-line options from a file instead of from the command-line itself. This allows each node to have its own version of this options file. The different number of layers per-node is implemented as a new option, `mpi-layer-split`, that takes a list of percentages. These percentages are used to calculate the range of layers to delegate to each node. The ranges are calculated on the head node and then scattered to the other nodes to maintain a single source of truth. --- common/common.cpp | 17 + common/common.h | 1 + examples/mpi/CMakeLists.txt | 8 + examples/mpi/README.md | 80 +++ examples/mpi/mpi.cpp | 945 ++++++++++++++++++++++++++++++++++++ ggml-mpi.c | 122 ++++- ggml-mpi.h | 15 +- llama.cpp | 40 +- llama.h | 4 +- 9 files changed, 1198 insertions(+), 34 deletions(-) create mode 100644 examples/mpi/CMakeLists.txt create mode 100644 examples/mpi/README.md create mode 100644 examples/mpi/mpi.cpp diff --git a/common/common.cpp b/common/common.cpp index 1dcc235eac0..f56ba760d96 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -540,6 +540,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { #else fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); #endif + } else if (arg == "--mpi-layer-split") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + params.mpi_layer_split.resize(split_arg.size()); + for (size_t i = 0; i < split_arg.size(); ++i) { + params.mpi_layer_split[i] = std::stof(split_arg[i]); + } + + } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; diff --git a/common/common.h b/common/common.h index 2f6fe48ab53..176bd23d976 100644 --- a/common/common.h +++ b/common/common.h @@ -60,6 +60,7 @@ struct gpt_params { int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + std::vector mpi_layer_split = {1.0}; // list of percentages of the total number of layers float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. 
float rope_freq_base = 0.0f; // RoPE base frequency diff --git a/examples/mpi/CMakeLists.txt b/examples/mpi/CMakeLists.txt new file mode 100644 index 00000000000..07d83b61d99 --- /dev/null +++ b/examples/mpi/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET mpi) +add_executable(${TARGET} mpi.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/mpi/README.md b/examples/mpi/README.md new file mode 100644 index 00000000000..44a04791567 --- /dev/null +++ b/examples/mpi/README.md @@ -0,0 +1,80 @@ +# llama.cpp/example/mpi + +This example program allows you to use various LLaMA language models in an easy and efficient way across an MPI cluster. +It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. + +## Table of Contents + +1. [Quick Start](#quick-start) +2. [Common Options](#common-options) + +## Quick Start + +To get started right away, write the following to a file on each node, making sure to use the correct path for the model you have: +```bash +--mpi-layer-split 0.8,0.2 -t 4 -m ~/llm-local/codellama-7b.Q3_K_M.gguf --color -c 512 --temp 0.0 --repeat_penalty 1.0 -n 128 -p "double fast_inverse_square_root(double x" +``` + +Each node may have different options, currently they must have the same number of arguments to the mpi-layer-split option and the same +model path, but that will eventually be synchronized from the head node. + +Next, write the hostsfile on the head node. Make sure there is only one slot on each node. + +Finally, run the following command on the head node to start the program across the cluster: + +#### Unix-based systems (Linux, macOS, etc.): + +```bash +mpirun -hostfile hostsfile -mca orte_keep_fqdn_hostnames t --bind-to none ./mpi options.txt +``` + +Where `hostsfile` is the file containing the cluster hostname configuration and `options.txt` is the path +where each node can find its own options. Storing the model on a network filesystem has not yet been +tested and optimized for. + +#### Windows: +Not supported currently. + +For an interactive experience, try this command: + +#### Unix-based systems (Linux, macOS, etc.): + +```bash +./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \ +'User: Hi +AI: Hello. I am an AI chatbot. Would you like to talk? +User: Sure! +AI: What would you like to talk about? +User:' +``` + +#### Windows: + +```powershell +main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. 
Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" +``` + +The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): + +#### Unix-based systems (Linux, macOS, etc.): + +```bash +./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt +``` + +#### Windows: + +```powershell +main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt +``` + +## Common Options + +In this section, we cover the most commonly used options for running the `mpi` program with the LLaMA models: + +- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. +- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. +- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. +- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `--mpi-layer-split`: Set the percentage of layers to distribute to each node. Must have the same number of arguments as the number of nodes in the cluster. Only the layer split percentages passed to the head node are used, they are scattered to all other nodes in the cluster. diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp new file mode 100644 index 00000000000..8d14d8e6186 --- /dev/null +++ b/examples/mpi/mpi.cpp @@ -0,0 +1,945 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "common.h" +#include "console.h" +#include "llama.h" +#include "build-info.h" +#include "grammar-parser.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + + +static llama_context ** g_ctx; +static llama_model ** g_model; +static gpt_params * g_params; +static std::vector * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector * g_output_tokens; +static bool is_interacting = false; + +void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector input_tokens, const std::string output, const std::vector output_tokens) { + + if (params.logdir.empty()) { + return; + } + + const std::string timestamp = get_sortable_timestamp(); + + const bool success = create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + 
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + dump_string_yaml_multiline(logfile, "output", output.c_str()); + dump_vector_int_yaml(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting) { + is_interacting = true; + } else { + console::cleanup(); + printf("\n"); + llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + _exit(130); + } + } +} +#endif + +int main(int argc, char ** argv) { + + gpt_params params; + g_params = ¶ms; + + if (argc > 2) { + fprintf(stderr, "Must only have one argument, the file to read options from.\n"); + return 2; + } + + std::string rawOptions = argv[0]; + rawOptions += ' '; + std::ifstream optionsFile(argv[1]); + if (optionsFile.is_open()) { + std::ostringstream buf; + buf << optionsFile.rdbuf(); + rawOptions += buf.str(); + optionsFile.close(); + + } else { + fprintf(stderr, "Cannot open options file at path %s\n", argv[1]); + return 3; + } + + rawOptions.erase(rawOptions.find_last_not_of(" \t\n\r\f\v") + 1); + + printf("%s", rawOptions.c_str()); + + wordexp_t splitOptions; + wordexp(rawOptions.c_str(), &splitOptions, WRDE_NOCMD); + //char** loadedArgs = (char **) malloc(1 + sizeof(char*) * splitOptions.we_wordc); + //loadedArgs[0] = argv[0]; + //memcpy(&loadedArgs[1], splitOptions.we_wordv, sizeof(char*) * splitOptions.we_wordc); + printf("Loaded argc: %d", splitOptions.we_wordc); + for (int i = 0; i < splitOptions.we_wordc; i++) { + + printf(" %s", splitOptions.we_wordv[i]); + } + printf("\n"); + + if (gpt_params_parse(splitOptions.we_wordc, splitOptions.we_wordv, params) == false) { + wordfree(&splitOptions); + return 1; + } + wordfree(&splitOptions); + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + if (params.perplexity) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.embedding) { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.rope_freq_base != 10000.0) { + fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 1.0) { + fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + } + + if (params.n_ctx > 2048) { + // TODO: determine the actual max context of the model (e.g. 
4096 for LLaMA v2) and use that instead of 2048 + fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); + } else if (params.n_ctx < 8) { + fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_backend_init(params.numa); + + llama_model * model; + llama_context * ctx; + llama_context * ctx_guidance = NULL; + g_model = &model; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (params.cfg_scale > 1.f) { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } + + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters + // uncomment the "used_mem" line in llama.cpp to see the results + if (params.mem_test) { + { + fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); + + const std::vector tmp(params.n_batch, llama_token_bos(ctx)); + llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); + } + + llama_print_timings(ctx); + llama_free(ctx); + llama_free_model(model); + + return 0; + } + + // export the cgraph and exit + if (params.export_cgraph) { + llama_eval_export(ctx, "llama.ggml"); + llama_free(ctx); + llama_free_model(model); + + return 0; + } + llama_split_layers_weighted(ctx, params.mpi_layer_split); + + std::string path_session = params.path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) { + fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + + // fopen to check for existing session + FILE * fp = std::fopen(path_session.c_str(), "rb"); + if (fp != NULL) { + std::fclose(fp); + + session_tokens.resize(params.n_ctx); + size_t n_token_count_out = 0; + if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } + session_tokens.resize(n_token_count_out); + llama_set_rng_seed(ctx, params.seed); + + fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); + } else { + fprintf(stderr, "%s: session file does not exist, will create\n", __func__); + } + } + + // Add BOS if SPM tokenizer + const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + + // tokenize the prompt + std::vector embd_inp; + + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + } else { + 
embd_inp = session_tokens; + } + + // Should not run without any tokens + if (embd_inp.empty()) { + embd_inp.push_back(llama_token_bos(ctx)); + } + + // Tokenize negative prompt + std::vector guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) { + guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); + + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + } + + const int n_ctx = llama_n_ctx(ctx); + + if ((int) embd_inp.size() > n_ctx - 4) { + fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + // debug message about similarity of saved session, if applicable + size_t n_matching_session_tokens = 0; + if (session_tokens.size()) { + for (llama_token id : session_tokens) { + if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { + break; + } + n_matching_session_tokens++; + } + if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { + fprintf(stderr, "%s: using full prompt from session file\n", __func__); + } else if (n_matching_session_tokens >= embd_inp.size()) { + fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__); + } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { + fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } else { + fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } + } + + // if we will use the cache for the full prompt without reaching the end of the cache, force + // reevaluation of the last token token to recalculate the cached logits + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && + session_tokens.size() > embd_inp.size()) { + session_tokens.resize(embd_inp.size() - 1); + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) { + params.n_keep = (int)embd_inp.size(); + } + + // prefix & suffix for instruct mode + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + + // in instruct mode, we inject a prefix and a suffix to each input by the user + if (params.instruct) { + params.interactive_first = true; + params.antiprompt.push_back("### Instruction:\n\n"); + } + + // enable interactive mode if interactive start is specified + if (params.interactive_first) { + params.interactive = true; + } + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (ctx_guidance) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int) guidance_inp.size(); i++) { + 
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + } + } + + if (params.n_keep > 0) { + fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + fprintf(stderr, "'\n"); + } + fprintf(stderr, "\n"); + } + + if (params.interactive) { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + + fprintf(stderr, "%s: interactive mode on.\n", __func__); + + if (params.antiprompt.size()) { + for (auto antiprompt : params.antiprompt) { + fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); + } + } + + if (params.input_prefix_bos) { + fprintf(stderr, "Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) { + fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); + } + + if (!params.input_suffix.empty()) { + fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str()); + } + } + fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", + params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); + fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + fprintf(stderr, "\n\n"); + + grammar_parser::parse_state parsed_grammar; + llama_grammar * grammar = NULL; + if (!params.grammar.empty()) { + parsed_grammar = grammar_parser::parse(params.grammar.c_str()); + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + return 1; + } + fprintf(stderr, "%s: grammar:\n", __func__); + grammar_parser::print_grammar(stderr, parsed_grammar); + fprintf(stderr, "\n"); + + { + auto it = params.logit_bias.find(llama_token_eos(ctx)); + if (it != params.logit_bias.end() && it->second == -INFINITY) { + fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); + } + } + + std::vector grammar_rules(parsed_grammar.c_rules()); + grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + } + + // TODO: replace with ring-buffer + std::vector last_n_tokens(n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + if (params.interactive) { + const char *control_message; + if (params.multiline_input) { + control_message = " - To return control to LLaMa, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to LLaMa.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input 
with '\\'.\n"; + } + fprintf(stderr, "== Running in interactive mode. ==\n" +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + " - Press Ctrl+C to interject at any time.\n" +#endif + "%s\n", control_message); + + is_interacting = params.interactive_first; + } + + bool is_antiprompt = false; + bool input_echo = true; + bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_session_consumed = 0; + int n_past_guidance = 0; + + std::vector input_tokens; g_input_tokens = &input_tokens; + std::vector output_tokens; g_output_tokens = &output_tokens; + std::ostringstream output_ss; g_output_ss = &output_ss; + + // the first thing we will do is to output the prompt, so set color accordingly + console::set_display(console::prompt); + + std::vector embd; + std::vector embd_guidance; + + // do one empty run to warm up the model + { + const std::vector tmp = { llama_token_bos(ctx), }; + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + llama_reset_timings(ctx); + } + + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { + // predict + if (embd.size() > 0) { + // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + auto max_embd_size = n_ctx - 4; + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int)embd.size() > max_embd_size) { + auto skipped_tokens = embd.size() - max_embd_size; + console::set_display(console::error); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + console::set_display(console::reset); + fflush(stdout); + embd.resize(max_embd_size); + } + + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (params.n_predict == -2) { + fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__); + break; + } + + const int n_left = n_past - params.n_keep; + // always keep the first token - BOS + n_past = std::max(1, params.n_keep); + n_past_guidance = std::max(1, params.n_keep + guidance_offset); + + // insert n_left/2 tokens at the start of embd from last_n_tokens + embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); + + // stop saving session if we run out of context + path_session.clear(); + + //printf("\n---\n"); + //printf("resetting: '"); + //for (int i = 0; i < (int) embd.size(); i++) { + // printf("%s", llama_token_to_piece(ctx, embd[i])); + //} + //printf("'\n"); + //printf("\n---\n"); + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int) session_tokens.size()) { + size_t i = 0; + for ( ; i < embd.size(); i++) { + if (embd[i] != session_tokens[n_session_consumed]) { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int) session_tokens.size()) { + ++i; + break; + } + } + if (i > 0) { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + + if 
(ctx_guidance) { + int input_size = 0; + llama_token* input_buf = NULL; + + if (n_past_guidance < (int) guidance_inp.size()) { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + //fprintf(stderr, "\n---------------------\n"); + //for (int i = 0; i < (int) embd_guidance.size(); i++) { + //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i])); + //} + //fprintf(stderr, "\n---------------------\n"); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { + int n_eval = (int) embd.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + n_past += n_eval; + } + + if (embd.size() > 0 && !path_session.empty()) { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + embd_guidance.clear(); + + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + + // optionally save the session on first sample (for faster prompt loading next time) + if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { + need_to_save_session = false; + llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + llama_token id = 0; + + { + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + if (ctx_guidance) { + llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale); + } + + // Apply penalties + float nl_logit = logits[llama_token_nl(ctx)]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) { + for (size_t idx = 0; idx < candidates_p.size; idx++) { + if (candidates_p.data[idx].id == llama_token_nl(ctx)) { + candidates_p.data[idx].logit = nl_logit; + break; + } + } + } + + if (grammar != NULL) { + llama_sample_grammar(ctx, &candidates_p, grammar); + } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + // printf("`%d`", candidates_p.size); + + if (grammar != NULL) { + llama_grammar_accept_token(ctx, grammar, id); + } + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + } + + // add it to the context + embd.push_back(id); + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + } else { + // some user input remains from prompt or 
interaction, forward it to processing + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(embd_inp[n_consumed]); + ++n_consumed; + if ((int) embd.size() >= params.n_batch) { + break; + } + } + } + + // display text + if (input_echo) { + for (auto id : embd) { + const std::string token_str = llama_token_to_piece(ctx, id); + printf("%s", token_str.c_str()); + + if (embd.size() > 1) { + input_tokens.push_back(id); + } else { + output_tokens.push_back(id); + output_ss << token_str; + } + } + fflush(stdout); + } + // reset color to default if we there is no pending user input + if (input_echo && (int)embd_inp.size() == n_consumed) { + console::set_display(console::reset); + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + + // check for reverse prompt + if (params.antiprompt.size()) { + std::string last_output; + for (auto id : last_n_tokens) { + last_output += llama_token_to_piece(ctx, id); + } + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (std::string & antiprompt : params.antiprompt) { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { + if (params.interactive) { + is_interacting = true; + console::set_display(console::user_input); + } + is_antiprompt = true; + fflush(stdout); + break; + } + } + } + + // deal with end of text token in interactive mode + if (last_n_tokens.back() == llama_token_eos(ctx)) { + if (params.interactive) { + if (params.antiprompt.size() != 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + is_antiprompt = true; + } + + is_interacting = true; + printf("\n"); + console::set_display(console::user_input); + fflush(stdout); + } else if (params.instruct) { + is_interacting = true; + } + } + + if (n_past > 0 && is_interacting) { + if (params.instruct) { + printf("\n> "); + } + + if (params.input_prefix_bos) { + embd_inp.push_back(llama_token_bos(ctx)); + } + + std::string buffer; + if (!params.input_prefix.empty()) { + buffer += params.input_prefix; + printf("%s", buffer.c_str()); + } + + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // done taking input, reset color + console::set_display(console::reset); + + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) { + // append input suffix if any + if (!params.input_suffix.empty()) { + buffer += params.input_suffix; + printf("%s", params.input_suffix.c_str()); + } + + const size_t original_size = embd_inp.size(); + + // instruct mode: insert instruction prefix + if (params.instruct && !is_antiprompt) { + n_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), 
inp_pfx.begin(), inp_pfx.end()); + } + + auto line_inp = ::llama_tokenize(ctx, buffer, false); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + + // instruct mode: insert response suffix + if (params.instruct) { + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + } + + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << llama_token_to_piece(ctx, token); + } + + n_remain -= line_inp.size(); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0) { + if (is_interacting) { + // reset grammar state if we're restarting generation + if (grammar != NULL) { + llama_grammar_free(grammar); + + std::vector grammar_rules( parsed_grammar.c_rules()); + grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), + parsed_grammar.symbol_ids.at("root")); + } + } + is_interacting = false; + } + } + + // end of text token + if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { + fprintf(stderr, " [end of text]\n"); + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). + if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { + n_remain = params.n_predict; + is_interacting = true; + } + } + + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { + fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + + if (ctx_guidance) { llama_free(ctx_guidance); } + llama_free(ctx); + llama_free_model(model); + + if (grammar != NULL) { + llama_grammar_free(grammar); + } + llama_backend_free(); + + return 0; +} diff --git a/ggml-mpi.c b/ggml-mpi.c index ae176d70758..a908978298d 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -14,10 +14,14 @@ struct ggml_mpi_context { int rank; int size; + MPI_Comm comm; + int layer_start; + int layer_end; }; void ggml_mpi_backend_init(void) { - MPI_Init(NULL, NULL); + int ret; + MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &ret); } void ggml_mpi_backend_free(void) { @@ -29,10 +33,19 @@ struct ggml_mpi_context * ggml_mpi_init(void) { MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); + ctx->comm = MPI_COMM_WORLD; return ctx; } +struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key) { + struct ggml_mpi_context * newCtx = calloc(1, sizeof(struct ggml_mpi_context)); + MPI_Comm_split(ctx->comm, color, key, &newCtx->comm); + MPI_Comm_rank(newCtx->comm, &newCtx->rank); + MPI_Comm_size(newCtx->comm, &newCtx->size); + return newCtx; +} + void ggml_mpi_free(struct ggml_mpi_context * ctx) { free(ctx); } @@ -41,19 +54,21 @@ int ggml_mpi_rank(struct ggml_mpi_context * ctx) { return ctx->rank; } +int ggml_mpi_size(struct ggml_mpi_context * ctx) { + return ctx->size; +} + void ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int * n_tokens, int * n_past, int * n_threads) { - UNUSED(ctx_mpi); - // synchronize the worker node parameters with the root node - MPI_Barrier(MPI_COMM_WORLD); - MPI_Bcast(n_tokens, 1, MPI_INT, 0, MPI_COMM_WORLD); - 
MPI_Bcast(n_past, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Barrier(ctx_mpi->comm); + + MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); + MPI_Bcast(n_past, 1, MPI_INT, 0, ctx_mpi->comm); } static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { @@ -73,7 +88,8 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { return -1; } -static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) { + +static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst, MPI_Comm comm) { MPI_Datatype mpi_type; switch (t->type) { @@ -82,11 +98,11 @@ static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) { default: GGML_ASSERT(false && "not implemented"); } - const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD); + const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, comm); GGML_ASSERT(retval == MPI_SUCCESS); } -static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) { +static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_Comm comm) { MPI_Datatype mpi_type; switch (t->type) { @@ -97,10 +113,72 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) { MPI_Status status; UNUSED(status); - const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status); GGML_ASSERT(retval == MPI_SUCCESS); } +uint16_t** ggml_mpi_split_range( + struct ggml_mpi_context * ctx_mpi, + uint16_t start, + uint16_t end, + float node_weights[] +) { + // Splits the range given by start and end + // over the available nodes. This implementation + // assumes that node 0 handles the final part of the range + // while node 1 handles the beginning, to form a ring pipeline + + // Only node 0 deals with the device splits, other nodes + // get the splits from the scatter layers operation + + if (ctx_mpi->rank != 0) { + return NULL; + } + + uint16_t range_length = end - start + 1; + uint16_t ** ranges = (uint16_t**) malloc(sizeof(uint16_t*) * ctx_mpi->size); + for (int i = 0; i < ctx_mpi->size; i++) { + ranges[i] = (uint16_t*) malloc(sizeof(uint16_t) * 2); + } + uint16_t next_layer = 0; + for (int i=1; i < ctx_mpi->size; i++) { + ranges[i][0] = next_layer; + ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start); + next_layer = ranges[i][1]; + } + + ranges[0][0] = next_layer; + ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start); + return ranges; + +} + +void ggml_mpi_scatter_layers( + struct ggml_mpi_context * ctx_mpi, + uint16_t ** layer_ranges +) { + // Layer ranges is a 2d array with the first dimension + // having a length of the number of nodes and the second + // dimension having a length of 2. The inner arrays contain + // the start and end layer ID for a node. 
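+ //
+ // The ranges are flattened to {start_0, end_0, start_1, end_1, ...} so
+ // that a single MPI_Scatter can hand each rank its own {start, end}
+ // pair (two MPI_UINT16_T elements per rank, rooted at rank 0).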
+ uint16_t flattened_ranges[ctx_mpi->size * 2]; + + if (layer_ranges != NULL) { + for (int i = 0; i < ctx_mpi->size * 2; i += 2) { + fprintf(stderr, "In iteration %d\n", i); + flattened_ranges[i] = layer_ranges[i/2][0]; + fprintf(stderr, "Got first element\n"); + flattened_ranges[i + 1] = layer_ranges[i/2][1]; + } + } + + uint16_t received_range[2]; + MPI_Scatter(flattened_ranges, 2, MPI_UINT16_T, received_range, 2, MPI_UINT16_T, 0, ctx_mpi->comm); + ctx_mpi->layer_start = received_range[0]; + ctx_mpi->layer_end = received_range[1]; + fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end); +} + // TODO: there are many improvements that can be done to this implementation void ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, @@ -134,29 +212,36 @@ void ggml_mpi_graph_compute_pre( // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node) // node 0: [(n-1) * n_per_node, n_nodes) // + + + if (mpi_rank > 0) { if (mpi_rank == 1) { // the first node (1) receives the input tokens from the main node (0) - ggml_mpi_tensor_recv(inp_tokens, 0); + ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm); } else { // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) - ggml_mpi_tensor_recv(inp0, mpi_rank - 1); + ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); } } else if (mpi_size > 1) { // node 0 sends the input tokens to node 1 - ggml_mpi_tensor_send(inp_tokens, 1); + ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm); // recv the output data from the last node - ggml_mpi_tensor_recv(inp0, mpi_size - 1); + ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm); } { + + const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; const int mpi_idx = mpi_rank > 0 ? 
mpi_rank - 1 : mpi_size - 1; - const int il0 = (mpi_idx + 0) * n_per_node; - const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); + //const int il0 = (mpi_idx + 0) * n_per_node; + //const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); + int il0 = ctx_mpi->layer_start; + int il1 = MIN(n_layers, ctx_mpi->layer_end); char name_l0[GGML_MAX_NAME]; char name_l1[GGML_MAX_NAME]; @@ -196,7 +281,6 @@ void ggml_mpi_graph_compute_pre( gf->n_nodes = idx_l1 - idx_l0; - //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1); } } @@ -211,6 +295,6 @@ void ggml_mpi_graph_compute_post( // send the output data to the next node if (mpi_rank > 0) { - ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size); + ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm); } } diff --git a/ggml-mpi.h b/ggml-mpi.h index eda119d4498..2224943dc3d 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -1,4 +1,5 @@ #pragma once +#include struct ggml_context; struct ggml_tensor; @@ -14,15 +15,27 @@ void ggml_mpi_backend_init(void); void ggml_mpi_backend_free(void); struct ggml_mpi_context * ggml_mpi_init(void); +struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key); void ggml_mpi_free(struct ggml_mpi_context * ctx); int ggml_mpi_rank(struct ggml_mpi_context * ctx); - +int ggml_mpi_size(struct ggml_mpi_context * ctx); void ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int * n_tokens, int * n_past, int * n_threads); +uint16_t** ggml_mpi_split_range( + struct ggml_mpi_context * ctx_mpi, + uint16_t start, + uint16_t end, + float node_weights[] +); + +void ggml_mpi_scatter_layers( + struct ggml_mpi_context * ctx_mpi, + uint16_t ** layer_ranges +); void ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, diff --git a/llama.cpp b/llama.cpp index f2b5967d791..3b8c09786c1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -850,9 +850,14 @@ struct llama_mmap { int flags = MAP_SHARED; // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } + +#ifdef GGML_USE_MPI + prefetch = 0; +#endif #ifdef __linux__ if (prefetch) { flags |= MAP_POPULATE; } #endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { throw std::runtime_error(format("mmap failed: %s", strerror(errno))); @@ -8437,10 +8442,6 @@ void llama_backend_init(bool numa) { if (numa) { ggml_numa_init(); } - -#ifdef GGML_USE_MPI - ggml_mpi_backend_init(); -#endif } void llama_backend_free(void) { @@ -8676,20 +8677,21 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_MPI ctx->ctx_mpi = ggml_mpi_init(); - if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { - // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - // TODO: needs fix after #3228 - GGML_ASSERT(false && "not implemented"); - //const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); - //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; - llama_backend_free(); - exit(1); - } #endif return ctx; } +void llama_split_layers_weighted(struct llama_context * ctx, std::vector device_weights) { +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != device_weights.size()) { + GGML_ASSERT(false && "Must have same number of split percentages as devices"); + } + uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights.data()); + ggml_mpi_scatter_layers(ctx->ctx_mpi, 
ranges); +#endif +} + void llama_free(struct llama_context * ctx) { delete ctx; } @@ -9376,6 +9378,18 @@ int llama_eval( int n_past) { llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + // Enter a blocking eval loop with dummy input, letting rank=0 drive the process + const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); + while (llama_decode_internal(*ctx, tmp.data(), nullptr, tmp.size(), 0, n_threads, nullptr)) {}; + llama_backend_free(); + exit(1); + } +#endif + + + const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); if (ret < 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); diff --git a/llama.h b/llama.h index 89cb6198e84..52e664a2a0f 100644 --- a/llama.h +++ b/llama.h @@ -12,7 +12,7 @@ #include #include #include - +#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef LLAMA_BUILD @@ -273,6 +273,8 @@ extern "C" { const char * path_model, struct llama_model_params params); + LLAMA_API void llama_split_layers_weighted(struct llama_context * ctx, std::vector device_weights); + LLAMA_API void llama_free_model(struct llama_model * model); LLAMA_API struct llama_context * llama_new_context_with_model( From 1f3febc35114aa9885ed0643b92bd2499b44ca10 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 24 Sep 2023 23:34:00 -0500 Subject: [PATCH 02/63] Add documentation for ggml-mpi functions --- ggml-mpi.c | 1 + ggml-mpi.h | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/ggml-mpi.c b/ggml-mpi.c index a908978298d..cef5ca6da4d 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -47,6 +47,7 @@ struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int } void ggml_mpi_free(struct ggml_mpi_context * ctx) { + MPI_Comm_free(ctx->comm); free(ctx); } diff --git a/ggml-mpi.h b/ggml-mpi.h index 2224943dc3d..7eeb3856f24 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -9,22 +9,133 @@ struct ggml_cgraph; extern "C" { #endif +/** + * The context used for MPI operations, + * a program may make use of more than one + * context but must always have at least one. + * + * The context stores required information like the + * node rank and a communicator to use for MPI operations. + * A context is guaranteed to be internally consistent, + * meaning that a context's stored rank is valid within + * the context's communicator. + */ struct ggml_mpi_context; + +/** + * Initialize the MPI library and the GGML MPI backend. + * Calling more than once during the lifetime of the program + * leads to undefined behavior. This function must be called before + * any MPI operations. + */ void ggml_mpi_backend_init(void); + +/** + * Frees the MPI backend, must be called only once at termination + * of the program. No MPI operations may be completed after calling this function, + * and attempting to do so will lead to undefined behavior. + */ void ggml_mpi_backend_free(void); +/** + * Construct a new MPI context using the MPI_WORLD + * communicator. This is useful only to create the + * initial context, as calling multiple times + * will only create effective copies of the same data. + * + * @return A context for us in the global communicator. + */ struct ggml_mpi_context * ggml_mpi_init(void); + +/** + * Create a new context by splitting the given context's + * communicator, creating a "sub-communicator." 
This is a collective + * operation and must be performed by all nodes within the same communicator. + * The color and key have the same meaning as in MPI_Comm_split(), i.e. + * the color is used to determine the sub-communicator this node will belong to, + * and the key is the relative rank of this node in the new communicator. + * + * An example: if a node passes a color of 1, and a different node passes a color of 2, + * the nodes will belong to two different sub-communicators. If two nodes pass the same + * color, then their ranks will be ordered by the order of their keys. If they pass the same + * key, then the tie will be broken by the nodes' ranks in the old communicator. + * + * The communicator used by the given context remains entirely valid, so it is advisable + * to store both the old and new contexts. This allows an application to + * select at runtime which communicator to perform MPI operations with. An example + * would be to segregate the nodes into multiple domains categorized by the functions + * they perform, and use the original context to broadcast to all nodes in the cluster. + * + * @param ctx The context containing the communicator to split. + * @param color The sub-communicator that this node will belong to. + * @param key The relative rank of this node in the new communicator. + * @return A new context with all values referencing the newly-created communicator. + */ struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key); + +/** + * Frees the given context, including the communicator. No MPI + * operations besides ggml_mpi_backend_freee(void) should be executed after + * running this function. + * + * @param ctx The context to free. + */ void ggml_mpi_free(struct ggml_mpi_context * ctx); +/** + * Get the rank of this node in the given context's communicator. + * + * @param ctx The context to use to determine the rank with regards to. + * @return The rank of this node. + */ int ggml_mpi_rank(struct ggml_mpi_context * ctx); + +/** + * Get the number of nodes that are a part of + * the communicator referenced by the given context. + * + * @param ctx The context containing the communicator used for this size check. + * @return The number of nodes that are a part of the given context's communicator. + */ int ggml_mpi_size(struct ggml_mpi_context * ctx); + +/** + * Synchronize needed information among the nodes + * to prepare for running an evaluation iteration. + * This is a collective operation and all nodes must + * call this function. It will block until all + * nodes have entered it, to prevent any desync + * between nodes. + * + * @param ctx_mpi The context in which to prepare for evaluation. + * @param n_tokens A pointer to the n_tokens, which will be synchronized after this function. + * @param n_past A pointer to the n_past, which will be synchronized after this function. + * @param n_threads A pointer to the n_threads, which is unused currently. + */ void ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int * n_tokens, int * n_past, int * n_threads); + +/** + * Split a range across all nodes within the given + * context, weighting the allocations by the given weights. + * The dimensions of the returned 2d array are (number of nodes in the context, 2). + * The first element in the inner array is the starting point of the range allocated + * to the node indicated by the index into the outer array, + * and the second element is the end point of the allocated range, inclusive. 
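+ *
+ * Illustrative example (not from a real run): splitting the layer range
+ * [0, 31] across two nodes with node_weights = {0.8, 0.2} yields roughly
+ * {{6, 31}, {0, 6}}, i.e. node 1 runs the first ~20% of the layers and
+ * node 0 runs the remainder, since node 0 handles the final part of the
+ * range to close the ring pipeline.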
+ * + * @param ctx_mpi The context used to determine the number of nodes + * to split the range across. + * @param start The starting point of the range. + * @param end The end point of the range, inclusive. + * @param node_weights How to weight the allocations across the nodes, + * must sum to 1.0. + * @return A 2d array, the first dimension is the number of nodes in the context + * and the second dimension is 2. + */ uint16_t** ggml_mpi_split_range( struct ggml_mpi_context * ctx_mpi, uint16_t start, @@ -32,16 +143,42 @@ uint16_t** ggml_mpi_split_range( float node_weights[] ); +/** + * Scatter the layer ranges across all nodes + * in the given context. This is a collective operation + * and must be called by all nodes that are within the same + * communicator. The given layer ranges must be in the same + * format as created by the ggml_mpi_split_range(). + * + * @param ctx_mpi The context to scatter the layers across. + * @param layer_ranges The pre-split ranges to scatter to the nodes. + */ void ggml_mpi_scatter_layers( struct ggml_mpi_context * ctx_mpi, uint16_t ** layer_ranges ); +/** + * Modify compute graph to only process allocated + * layers. + * + * @param ctx_mpi The context containing the allocated layer range. + * @param gf The compute graph to modify + * @param n_layers The number of layers in the model, used as an upper bound in the layer ranges. + */ void ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers); +/** + * Sends the output tensor to the next node for processing + * of later layers. + * + * @param ctx_mpi The context to use for MPI operations. + * @param gf The graph used in the computations + * @param n_layers The number of layers in the model. + */ void ggml_mpi_graph_compute_post( struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, From d70f26c3405fc71a6cb8bffcc69a4495838d1141 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 09:58:44 -0500 Subject: [PATCH 03/63] Add code comments in MPI --- examples/mpi/mpi.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 8d14d8e6186..38ed93746f7 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -20,6 +20,8 @@ #include #include #include + +// TODO add Windows support #include #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) @@ -116,10 +118,13 @@ int main(int argc, char ** argv) { return 2; } + // Manually add the path used to launch this program to the + // options std::string rawOptions = argv[0]; rawOptions += ' '; std::ifstream optionsFile(argv[1]); if (optionsFile.is_open()) { + // Read in the options file, appending to the launch path std::ostringstream buf; buf << optionsFile.rdbuf(); rawOptions += buf.str(); @@ -130,22 +135,21 @@ int main(int argc, char ** argv) { return 3; } + // wordexp doesn't work right if there's a trailing newline, so strip it rawOptions.erase(rawOptions.find_last_not_of(" \t\n\r\f\v") + 1); printf("%s", rawOptions.c_str()); wordexp_t splitOptions; wordexp(rawOptions.c_str(), &splitOptions, WRDE_NOCMD); - //char** loadedArgs = (char **) malloc(1 + sizeof(char*) * splitOptions.we_wordc); - //loadedArgs[0] = argv[0]; - //memcpy(&loadedArgs[1], splitOptions.we_wordv, sizeof(char*) * splitOptions.we_wordc); - printf("Loaded argc: %d", splitOptions.we_wordc); + fprintf(stderr, "Loaded arguments: "); for (int i = 0; i < splitOptions.we_wordc; i++) { - printf(" %s", splitOptions.we_wordv[i]); + 
fprintf(stderr, " %s", splitOptions.we_wordv[i]); } - printf("\n"); + fprintf(stderr, "\n"); + // Now we can parse like normal, but using the loaded options instead of the passed argv if (gpt_params_parse(splitOptions.we_wordc, splitOptions.we_wordv, params) == false) { wordfree(&splitOptions); return 1; From 4bd95aec5475ac65642f14c58ca4c8164bc1e8a5 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 10:05:42 -0500 Subject: [PATCH 04/63] Remove mtest (#3177) --- examples/mpi/mpi.cpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 38ed93746f7..393ef1b2ab0 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -233,23 +233,6 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } - // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters - // uncomment the "used_mem" line in llama.cpp to see the results - if (params.mem_test) { - { - fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); - - const std::vector tmp(params.n_batch, llama_token_bos(ctx)); - llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); - } - - llama_print_timings(ctx); - llama_free(ctx); - llama_free_model(model); - - return 0; - } - // export the cgraph and exit if (params.export_cgraph) { llama_eval_export(ctx, "llama.ggml"); From f691b6179c9ff4d789b533dab1f6a6ddc8442c41 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 10:15:30 -0500 Subject: [PATCH 05/63] Revert accidental removal of ggml_mpi_backend_init --- llama.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama.cpp b/llama.cpp index 3b8c09786c1..7cb47578416 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8442,6 +8442,10 @@ void llama_backend_init(bool numa) { if (numa) { ggml_numa_init(); } + +#ifdef GGML_USE_MPI + ggml_mpi_backend_init(); +#endif } void llama_backend_free(void) { From d7dbb6bb4a3a70e743f7061895345efbc9989beb Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 17:41:57 -0500 Subject: [PATCH 06/63] Disable warmup under MPI --- common/common.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index f56ba760d96..14da09f993a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1009,9 +1009,16 @@ std::tuple llama_init_from_gpt_par { LOG("warming up the model with an empty run\n"); +#ifndef GGML_USE_MPI + // When using MPI, llama_eval() enters into an infinite loop + // on non-head nodes. Thus, we only want to warmup the model here + // if we aren't using MPI. 
+ // FIXME have a way to terminate the infinite loop so we can warmup the model + // in MPI mode std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_clear(lctx); +#endif llama_reset_timings(lctx); } From 1ff69c4fe9a75c7e35be35660881c51a0b88d0b2 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 17:42:41 -0500 Subject: [PATCH 07/63] Update MPI example to follow main changes --- examples/mpi/mpi.cpp | 416 ++++++++++++++++++++----------------------- ggml-mpi.c | 2 - 2 files changed, 195 insertions(+), 223 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 393ef1b2ab0..84f15a82de8 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -1,9 +1,5 @@ -// Defines sigaction on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "common.h" + #include "console.h" #include "llama.h" #include "build-info.h" @@ -40,7 +36,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif - static llama_context ** g_ctx; static llama_model ** g_model; static gpt_params * g_params; @@ -49,10 +44,12 @@ static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; -void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, - const std::vector input_tokens, const std::string output, const std::vector output_tokens) { +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector & input_tokens, const std::string & output, + const std::vector & output_tokens +) { if (params.logdir.empty()) { return; } @@ -93,7 +90,7 @@ void write_logfile( } #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -void sigint_handler(int signo) { +static void sigint_handler(int signo) { if (signo == SIGINT) { if (!is_interacting) { is_interacting = true; @@ -109,7 +106,6 @@ void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { - gpt_params params; g_params = ¶ms; @@ -156,6 +152,15 @@ int main(int argc, char ** argv) { } wordfree(&splitOptions); +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("main", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + // TODO: Dump params ? + //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); + // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); @@ -178,34 +183,28 @@ int main(int argc, char ** argv) { } if (params.rope_freq_base != 10000.0) { - fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 1.0) { - fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); - } - - if (params.n_ctx > 2048) { - // TODO: determine the actual max context of the model (e.g. 
4096 for LLaMA v2) and use that instead of 2048 - fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); - } else if (params.n_ctx < 8) { - fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; + LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + LOG_TEE("%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { params.prompt = gpt_random_prompt(rng); } + LOG("%s: llama backend init\n", __func__); llama_backend_init(params.numa); llama_model * model; @@ -215,6 +214,7 @@ int main(int argc, char ** argv) { g_ctx = &ctx; // load the model and apply lora adapter, if any + LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); if (params.cfg_scale > 1.f) { struct llama_context_params lparams = llama_context_params_from_gpt_params(params); @@ -222,14 +222,23 @@ int main(int argc, char ** argv) { } if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_TEE("%s: error: unable to load model\n", __func__); return 1; } + const int n_ctx_train = llama_n_ctx_train(ctx); + if (params.n_ctx > n_ctx_train) { + LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } else if (params.n_ctx < 8) { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + LOG_TEE("\n"); + LOG_TEE("system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } @@ -241,13 +250,14 @@ int main(int argc, char ** argv) { return 0; } + llama_split_layers_weighted(ctx, params.mpi_layer_split); std::string path_session = params.path_prompt_cache; std::vector session_tokens; if (!path_session.empty()) { - fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); // fopen to check for existing session FILE * fp = std::fopen(path_session.c_str(), "rb"); @@ -257,33 +267,38 @@ int main(int argc, char ** argv) { session_tokens.resize(params.n_ctx); size_t n_token_count_out = 0; if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); return 1; } session_tokens.resize(n_token_count_out); llama_set_rng_seed(ctx, params.seed); - fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); + LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) 
session_tokens.size()); } else { - fprintf(stderr, "%s: session file does not exist, will create\n", __func__); + LOG_TEE("%s: session file does not exist, will create\n", __func__); } } - // Add BOS if SPM tokenizer const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + LOG("add_bos: %d\n", add_bos); - // tokenize the prompt std::vector embd_inp; if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + LOG("tokenize the prompt\n"); embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); } else { + LOG("use session tokens\n"); embd_inp = session_tokens; } + LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + // Should not run without any tokens if (embd_inp.empty()) { embd_inp.push_back(llama_token_bos(ctx)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); } // Tokenize negative prompt @@ -291,23 +306,31 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt)); + guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp)); + original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; + LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); + LOG("guidance_offset: %s", log_tostr(guidance_offset)); } const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); if ((int) embd_inp.size() > n_ctx - 4) { - fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } // debug message about similarity of saved session, if applicable size_t n_matching_session_tokens = 0; - if (session_tokens.size()) { + if (!session_tokens.empty()) { for (llama_token id : session_tokens) { if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { break; @@ -315,22 +338,27 @@ int main(int argc, char ** argv) { n_matching_session_tokens++; } if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - fprintf(stderr, "%s: using full prompt from session file\n", __func__); + LOG_TEE("%s: using full prompt from session file\n", __func__); } else if (n_matching_session_tokens >= embd_inp.size()) { - fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__); + LOG_TEE("%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); } else { - fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_TEE("%s: session file matches %zu / %zu tokens of 
prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); } } + LOGLN( + "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", + log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); + // if we will use the cache for the full prompt without reaching the end of the cache, force // reevaluation of the last token token to recalculate the cached logits - if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && - session_tokens.size() > embd_inp.size()) { + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { + LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + session_tokens.resize(embd_inp.size() - 1); } @@ -343,6 +371,9 @@ int main(int argc, char ** argv) { const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx)); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx)); + // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { params.interactive_first = true; @@ -355,30 +386,30 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_TEE("\n"); + LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (ctx_guidance) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); - fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + LOG_TEE("\n"); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } } if (params.n_keep > 0) { - fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); + LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - fprintf(stderr, "'\n"); + LOG_TEE("'\n"); } - fprintf(stderr, "\n"); + LOG_TEE("\n"); } if (params.interactive) { @@ -395,58 +426,59 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - fprintf(stderr, "%s: interactive mode on.\n", __func__); + LOG_TEE("%s: interactive mode on.\n", __func__); - if (params.antiprompt.size()) { - for (auto antiprompt : params.antiprompt) { - 
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); + if (!params.antiprompt.empty()) { + for (const auto & antiprompt : params.antiprompt) { + LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); } } if (params.input_prefix_bos) { - fprintf(stderr, "Input prefix with BOS\n"); + LOG_TEE("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); } if (!params.input_suffix.empty()) { - fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", + LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); - fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - fprintf(stderr, "\n\n"); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("\n\n"); + struct llama_grammar * grammar = NULL; grammar_parser::parse_state parsed_grammar; - llama_grammar * grammar = NULL; + if (!params.grammar.empty()) { parsed_grammar = grammar_parser::parse(params.grammar.c_str()); // will be empty (default) if there are parse errors if (parsed_grammar.rules.empty()) { return 1; } - fprintf(stderr, "%s: grammar:\n", __func__); + LOG_TEE("%s: grammar:\n", __func__); grammar_parser::print_grammar(stderr, parsed_grammar); - fprintf(stderr, "\n"); + LOG_TEE("\n"); { auto it = params.logit_bias.find(llama_token_eos(ctx)); if (it != params.logit_bias.end() && it->second == -INFINITY) { - fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); + LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); } } std::vector grammar_rules(parsed_grammar.c_rules()); grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); } // TODO: replace with ring-buffer - std::vector last_n_tokens(n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + std::vector last_tokens(n_ctx); + std::fill(last_tokens.begin(), last_tokens.end(), 0); if (params.interactive) { const char *control_message; @@ -458,11 +490,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - fprintf(stderr, "== Running in interactive mode. ==\n" + LOG_TEE("== Running in interactive mode. 
==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - " - Press Ctrl+C to interject at any time.\n" + LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); #endif - "%s\n", control_message); + LOG_TEE( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -487,27 +519,27 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - // do one empty run to warm up the model - { - const std::vector tmp = { llama_token_bos(ctx), }; - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); - llama_reset_timings(ctx); - } + const int n_vocab = llama_n_vocab(ctx); + + std::vector candidates; + candidates.reserve(n_vocab); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict - if (embd.size() > 0) { + if (!embd.empty()) { // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via // --prompt or --file which uses the same value. - auto max_embd_size = n_ctx - 4; + int max_embd_size = n_ctx - 4; + // Ensure the input doesn't exceed the context size by truncating embd if necessary. - if ((int)embd.size() > max_embd_size) { - auto skipped_tokens = embd.size() - max_embd_size; + if ((int) embd.size() > max_embd_size) { + const int skipped_tokens = (int) embd.size() - max_embd_size; + embd.resize(max_embd_size); + console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); console::set_display(console::reset); fflush(stdout); - embd.resize(max_embd_size); } // infinite text generation via context swapping @@ -516,28 +548,26 @@ int main(int argc, char ** argv) { // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { if (params.n_predict == -2) { - fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__); + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep; + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep); + // always keep the first token - BOS - n_past = std::max(1, params.n_keep); + n_past = std::max(1, params.n_keep); n_past_guidance = std::max(1, params.n_keep + guidance_offset); - // insert n_left/2 tokens at the start of embd from last_n_tokens - embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - // stop saving session if we run out of context - path_session.clear(); + // insert n_left/2 tokens at the start of embd from last_tokens + embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size()); - //printf("\n---\n"); - //printf("resetting: '"); - //for (int i = 0; i < (int) embd.size(); i++) { - // printf("%s", llama_token_to_piece(ctx, embd[i])); - //} - //printf("'\n"); - //printf("\n---\n"); + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + + LOG("clear session path\n"); + path_session.clear(); } // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) @@ -567,7 +597,7 @@ int main(int argc, char ** argv) { if (ctx_guidance) { int input_size = 0; - llama_token* input_buf = NULL; + llama_token * input_buf = NULL; if 
(n_past_guidance < (int) guidance_inp.size()) { // Guidance context should have the same data with these modifications: @@ -577,28 +607,25 @@ int main(int argc, char ** argv) { embd_guidance = guidance_inp; if (embd.begin() + original_prompt_len < embd.end()) { embd_guidance.insert( - embd_guidance.end(), - embd.begin() + original_prompt_len, - embd.end() + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() ); } - input_buf = embd_guidance.data(); + input_buf = embd_guidance.data(); input_size = embd_guidance.size(); - //fprintf(stderr, "\n---------------------\n"); - //for (int i = 0; i < (int) embd_guidance.size(); i++) { - //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i])); - //} - //fprintf(stderr, "\n---------------------\n"); + + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); } else { - input_buf = embd.data(); + input_buf = embd.data(); input_size = embd.size(); } for (int i = 0; i < input_size; i += params.n_batch) { int n_eval = std::min(input_size - i, params.n_batch); if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -611,14 +638,20 @@ int main(int argc, char ** argv) { if (n_eval > params.n_batch) { n_eval = params.n_batch; } + + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_TEE("%s : failed to eval\n", __func__); return 1; } + n_past += n_eval; + + LOG("n_past = %d\n", n_past); } - if (embd.size() > 0 && !path_session.empty()) { + if (!embd.empty() && !path_session.empty()) { session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); n_session_consumed = session_tokens.size(); } @@ -628,106 +661,21 @@ int main(int argc, char ** argv) { embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - // optionally save the session on first sample (for faster prompt loading next time) if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { need_to_save_session = false; llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - } - - llama_token id = 0; - - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - if (ctx_guidance) { - llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale); - } - - // Apply penalties - float nl_logit = logits[llama_token_nl(ctx)]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) { - for (size_t idx = 0; idx < candidates_p.size; idx++) { - if (candidates_p.data[idx].id == llama_token_nl(ctx)) { - candidates_p.data[idx].logit = nl_logit; - break; - } - } - } - if (grammar != NULL) { - llama_sample_grammar(ctx, &candidates_p, grammar); - } + LOG("saved session to %s\n", path_session.c_str()); + } - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - // printf("`%d`", candidates_p.size); + const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); - if (grammar != NULL) { - llama_grammar_accept_token(ctx, grammar, id); - } + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(id); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - } + 
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); - // add it to the context embd.push_back(id); // echo this to console @@ -735,12 +683,15 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; + + LOG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[n_consumed]); + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(embd_inp[n_consumed]); ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -764,17 +715,16 @@ int main(int argc, char ** argv) { fflush(stdout); } // reset color to default if we there is no pending user input - if (input_echo && (int)embd_inp.size() == n_consumed) { + if (input_echo && (int) embd_inp.size() == n_consumed) { console::set_display(console::reset); } // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt - if (params.antiprompt.size()) { + if (!params.antiprompt.empty()) { std::string last_output; - for (auto id : last_n_tokens) { + for (auto id : last_tokens) { last_output += llama_token_to_piece(ctx, id); } @@ -785,10 +735,10 @@ int main(int argc, char ** argv) { for (std::string & antiprompt : params.antiprompt) { size_t extra_padding = params.interactive ? 0 : 2; size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) - ? last_output.length() - static_cast(antiprompt.length() + extra_padding) - : 0; + ? last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; - if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { + if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { if (params.interactive) { is_interacting = true; console::set_display(console::user_input); @@ -798,12 +748,18 @@ int main(int argc, char ** argv) { break; } } + + if (is_antiprompt) { + LOG("found antiprompt: %s\n", last_output.c_str()); + } } // deal with end of text token in interactive mode - if (last_n_tokens.back() == llama_token_eos(ctx)) { + if (last_tokens.back() == llama_token_eos(ctx)) { + LOG("found EOS token\n"); + if (params.interactive) { - if (params.antiprompt.size() != 0) { + if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); @@ -820,16 +776,20 @@ int main(int argc, char ** argv) { } if (n_past > 0 && is_interacting) { + LOG("waiting for user input\n"); + if (params.instruct) { printf("\n> "); } if (params.input_prefix_bos) { + LOG("adding input prefix BOS token\n"); embd_inp.push_back(llama_token_bos(ctx)); } std::string buffer; if (!params.input_prefix.empty()) { + LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); buffer += params.input_prefix; printf("%s", buffer.c_str()); } @@ -849,23 +809,30 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty()) { + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); buffer += params.input_suffix; printf("%s", params.input_suffix.c_str()); } + LOG("buffer: '%s'\n", buffer.c_str()); 
+ const size_t original_size = embd_inp.size(); // instruct mode: insert instruction prefix if (params.instruct && !is_antiprompt) { + LOG("inserting instruction prefix\n"); n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } - auto line_inp = ::llama_tokenize(ctx, buffer, false); + const auto line_inp = ::llama_tokenize(ctx, buffer, false); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); // instruct mode: insert response suffix if (params.instruct) { + LOG("inserting instruction suffix\n"); embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } @@ -876,6 +843,9 @@ int main(int argc, char ** argv) { } n_remain -= line_inp.size(); + LOG("n_remain: %d\n", n_remain); + } else { + LOG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -887,10 +857,10 @@ int main(int argc, char ** argv) { if (grammar != NULL) { llama_grammar_free(grammar); - std::vector grammar_rules( parsed_grammar.c_rules()); + std::vector grammar_rules(parsed_grammar.c_rules()); grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); + grammar_rules.data(), grammar_rules.size(), + parsed_grammar.symbol_ids.at("root")); } } is_interacting = false; @@ -899,7 +869,7 @@ int main(int argc, char ** argv) { // end of text token if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { - fprintf(stderr, " [end of text]\n"); + LOG_TEE(" [end of text]\n"); break; } @@ -912,7 +882,7 @@ int main(int argc, char ** argv) { } if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } @@ -928,5 +898,9 @@ int main(int argc, char ** argv) { } llama_backend_free(); +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n") +#endif // LOG_DISABLE_LOGS + return 0; } diff --git a/ggml-mpi.c b/ggml-mpi.c index cef5ca6da4d..9217651d683 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -166,9 +166,7 @@ void ggml_mpi_scatter_layers( if (layer_ranges != NULL) { for (int i = 0; i < ctx_mpi->size * 2; i += 2) { - fprintf(stderr, "In iteration %d\n", i); flattened_ranges[i] = layer_ranges[i/2][0]; - fprintf(stderr, "Got first element\n"); flattened_ranges[i + 1] = layer_ranges[i/2][1]; } } From 33185edd26153696efa5f3b2d5e121e8535f8fb1 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 17:52:57 -0500 Subject: [PATCH 08/63] Remove fprintf logs from mpi main --- examples/mpi/mpi.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 84f15a82de8..0bf8f2f804a 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -134,16 +134,8 @@ int main(int argc, char ** argv) { // wordexp doesn't work right if there's a trailing newline, so strip it rawOptions.erase(rawOptions.find_last_not_of(" \t\n\r\f\v") + 1); - printf("%s", rawOptions.c_str()); - wordexp_t splitOptions; wordexp(rawOptions.c_str(), &splitOptions, WRDE_NOCMD); - fprintf(stderr, "Loaded arguments: "); - for (int i = 0; i < splitOptions.we_wordc; i++) { - - fprintf(stderr, " %s", splitOptions.we_wordv[i]); - } - fprintf(stderr, "\n"); 
// Now we can parse like normal, but using the loaded options instead of the passed argv if (gpt_params_parse(splitOptions.we_wordc, splitOptions.we_wordv, params) == false) { From f67fcbcf9aae65ebf0f32b70083f04628dd62622 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 18:59:14 -0500 Subject: [PATCH 09/63] Remove unrelated sections from mpi readme --- examples/mpi/README.md | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/examples/mpi/README.md b/examples/mpi/README.md index 44a04791567..4b934b0edbc 100644 --- a/examples/mpi/README.md +++ b/examples/mpi/README.md @@ -48,26 +48,6 @@ AI: What would you like to talk about? User:' ``` -#### Windows: - -```powershell -main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" -``` - -The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): - -#### Unix-based systems (Linux, macOS, etc.): - -```bash -./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt -``` - -#### Windows: - -```powershell -main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt -``` - ## Common Options In this section, we cover the most commonly used options for running the `mpi` program with the LLaMA models: From 907f8075b982be1f91889d5ed721504298aed258 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 28 Sep 2023 12:39:34 -0500 Subject: [PATCH 10/63] Replace vector with C-style array and length in llama_split_layers_weighted --- examples/mpi/mpi.cpp | 2 +- llama.cpp | 6 +++--- llama.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 0bf8f2f804a..5dfa70f5cf2 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -243,7 +243,7 @@ int main(int argc, char ** argv) { return 0; } - llama_split_layers_weighted(ctx, params.mpi_layer_split); + llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size()); std::string path_session = params.path_prompt_cache; std::vector session_tokens; diff --git a/llama.cpp b/llama.cpp index 7cb47578416..e3d656f1ce6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8686,12 +8686,12 @@ struct llama_context * llama_new_context_with_model( return ctx; } -void llama_split_layers_weighted(struct llama_context * ctx, std::vector device_weights) { +void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) { #ifdef GGML_USE_MPI - if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != device_weights.size()) { + if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) { GGML_ASSERT(false && "Must have same number of split percentages as devices"); } - uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights.data()); + uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights); ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges); #endif } diff --git a/llama.h b/llama.h index 52e664a2a0f..c94e9a5ef19 100644 --- a/llama.h +++ b/llama.h @@ -273,7 +273,7 @@ extern "C" { const char * path_model, struct llama_model_params params); - LLAMA_API void llama_split_layers_weighted(struct llama_context * ctx, std::vector device_weights); + LLAMA_API void llama_split_layers_weighted(struct llama_context * ctx, 
float device_weights[], size_t num_weights); LLAMA_API void llama_free_model(struct llama_model * model); From 6b1c471986660562e9e502f591d408173f803392 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 24 Oct 2023 12:00:52 -0500 Subject: [PATCH 11/63] Fix minor rebase errors --- llama.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.h b/llama.h index c94e9a5ef19..f0b34bf2ec9 100644 --- a/llama.h +++ b/llama.h @@ -12,7 +12,6 @@ #include #include #include -#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef LLAMA_BUILD From afc2cc41e5865aa4573e2b1b3b612b05207c8622 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 25 Oct 2023 17:15:11 -0500 Subject: [PATCH 12/63] Fix MPI compilation errors --- examples/mpi/mpi.cpp | 232 ++++++++++++++++++++----------------------- llama.cpp | 5 +- 2 files changed, 108 insertions(+), 129 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 5dfa70f5cf2..3030918bf24 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -3,7 +3,6 @@ #include "console.h" #include "llama.h" #include "build-info.h" -#include "grammar-parser.h" #include #include @@ -143,6 +142,7 @@ int main(int argc, char ** argv) { return 1; } wordfree(&splitOptions); + llama_sampling_params & sparams = params.sparams; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("main", "log")); @@ -158,7 +158,7 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); - if (params.perplexity) { + if (params.logits_all) { printf("\n************\n"); printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); printf("************\n\n"); @@ -174,12 +174,17 @@ int main(int argc, char ** argv) { return 0; } - if (params.rope_freq_base != 10000.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + if (params.n_ctx != 0 && params.n_ctx < 8) { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + if (params.rope_freq_base != 0.0) { + LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } - if (params.rope_freq_scale != 1.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + if (params.rope_freq_scale != 0.0) { + LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); @@ -208,7 +213,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (params.cfg_scale > 1.f) { + if (sparams.cfg_scale > 1.f) { struct llama_context_params lparams = llama_context_params_from_gpt_params(params); ctx_guidance = llama_new_context_with_model(model, lparams); } @@ -218,29 +223,19 @@ int main(int argc, char ** argv) { return 1; } - const int n_ctx_train = llama_n_ctx_train(ctx); - if (params.n_ctx > n_ctx_train) { + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); + + if (n_ctx > n_ctx_train) { LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, params.n_ctx); - } else if (params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum 
context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; + __func__, n_ctx_train, n_ctx); } // print system information { LOG_TEE("\n"); - LOG_TEE("system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); - } - - // export the cgraph and exit - if (params.export_cgraph) { - llama_eval_export(ctx, "llama.ggml"); - llama_free(ctx); - llama_free_model(model); - - return 0; + LOG_TEE("%s\n", get_system_info(params).c_str()); } llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size()); @@ -256,7 +251,7 @@ int main(int argc, char ** argv) { if (fp != NULL) { std::fclose(fp); - session_tokens.resize(params.n_ctx); + session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); @@ -271,26 +266,26 @@ int main(int argc, char ** argv) { } } - const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM; LOG("add_bos: %d\n", add_bos); std::vector embd_inp; if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); } else { LOG("use session tokens\n"); embd_inp = session_tokens; } LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { embd_inp.push_back(llama_token_bos(ctx)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } // Tokenize negative prompt @@ -298,13 +293,13 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt)); + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp)); + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; @@ -312,9 +307,6 @@ int main(int argc, char ** argv) { LOG("guidance_offset: %s", log_tostr(guidance_offset)); } - const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); - if ((int) embd_inp.size() > n_ctx - 4) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) 
embd_inp.size(), n_ctx - 4); return 1; @@ -340,6 +332,9 @@ int main(int argc, char ** argv) { LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", __func__, n_matching_session_tokens, embd_inp.size()); } + + // remove any "future" tokens that we might have inherited from the previous session + llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1); } LOGLN( @@ -360,11 +355,11 @@ int main(int argc, char ** argv) { } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx)); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx)); + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { @@ -387,7 +382,7 @@ int main(int argc, char ** argv) { if (ctx_guidance) { LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); @@ -423,6 +418,12 @@ int main(int argc, char ** argv) { if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } } } @@ -432,46 +433,28 @@ int main(int argc, char ** argv) { if (!params.input_prefix.empty()) { LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } } if (!params.input_suffix.empty()) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } } } - LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); - struct llama_grammar * grammar = 
NULL; - grammar_parser::parse_state parsed_grammar; - - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } - LOG_TEE("%s: grammar:\n", __func__); - grammar_parser::print_grammar(stderr, parsed_grammar); - LOG_TEE("\n"); - - { - auto it = params.logit_bias.find(llama_token_eos(ctx)); - if (it != params.logit_bias.end() && it->second == -INFINITY) { - LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); - } - } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - } - - // TODO: replace with ring-buffer - std::vector last_tokens(n_ctx); - std::fill(last_tokens.begin(), last_tokens.end(), 0); - if (params.interactive) { const char *control_message; if (params.multiline_input) { @@ -511,10 +494,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - const int n_vocab = llama_n_vocab(ctx); - - std::vector candidates; - candidates.reserve(n_vocab); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict @@ -544,19 +524,24 @@ int main(int argc, char ** argv) { break; } - const int n_left = n_past - params.n_keep; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep); + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; - // always keep the first token - BOS - n_past = std::max(1, params.n_keep); - n_past_guidance = std::max(1, params.n_keep + guidance_offset); + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); - LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); - // insert n_left/2 tokens at the start of embd from last_tokens - embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size()); + n_past -= n_discard; - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + if (ctx_guidance) { + n_past_guidance -= n_discard; + } + + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); LOG("clear session path\n"); path_session.clear(); @@ -586,7 +571,6 @@ int main(int argc, char ** argv) { // evaluate tokens in batches // embd is typically prepared beforehand to fit within a batch, but not always - if (ctx_guidance) { int input_size = 0; llama_token * input_buf = NULL; @@ -608,7 +592,7 @@ int main(int argc, char ** argv) { input_buf = embd_guidance.data(); input_size = embd_guidance.size(); - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); } else { input_buf = embd.data(); input_size = embd.size(); @@ -616,7 +600,7 @@ int main(int argc, char ** argv) { for (int i = 0; i < input_size; i += params.n_batch) { int n_eval = std::min(input_size - i, params.n_batch); - if (llama_eval(ctx_guidance, input_buf + i, 
n_eval, n_past_guidance, params.n_threads)) { + if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -631,9 +615,9 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -661,12 +645,11 @@ int main(int argc, char ** argv) { LOG("saved session to %s\n", path_session.c_str()); } - const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); + llama_sampling_accept(ctx_sampling, ctx, id, true); - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); embd.push_back(id); @@ -682,8 +665,11 @@ int main(int argc, char ** argv) { LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -706,19 +692,17 @@ int main(int argc, char ** argv) { } fflush(stdout); } - // reset color to default if we there is no pending user input + // reset color to default if there is no pending user input if (input_echo && (int) embd_inp.size() == n_consumed) { console::set_display(console::reset); } // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt + // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { - std::string last_output; - for (auto id : last_tokens) { - last_output += llama_token_to_piece(ctx, id); - } + const int n_prev = 32; + const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. 
@@ -733,10 +717,8 @@ int main(int argc, char ** argv) { if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { if (params.interactive) { is_interacting = true; - console::set_display(console::user_input); } is_antiprompt = true; - fflush(stdout); break; } } @@ -747,21 +729,19 @@ int main(int argc, char ** argv) { } // deal with end of text token in interactive mode - if (last_tokens.back() == llama_token_eos(ctx)) { + if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { LOG("found EOS token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); is_antiprompt = true; } is_interacting = true; printf("\n"); - console::set_display(console::user_input); - fflush(stdout); } else if (params.instruct) { is_interacting = true; } @@ -782,10 +762,12 @@ int main(int argc, char ** argv) { std::string buffer; if (!params.input_prefix.empty()) { LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - buffer += params.input_prefix; - printf("%s", buffer.c_str()); + printf("%s", params.input_prefix.c_str()); } + // color user input only + console::set_display(console::user_input); + std::string line; bool another_line = true; do { @@ -802,7 +784,6 @@ int main(int argc, char ** argv) { // append input suffix if any if (!params.input_suffix.empty()) { LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - buffer += params.input_suffix; printf("%s", params.input_suffix.c_str()); } @@ -816,11 +797,18 @@ int main(int argc, char ** argv) { n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } + if (params.escape) { + process_escapes(buffer); + } - const auto line_inp = ::llama_tokenize(ctx, buffer, false); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); // instruct mode: insert response suffix if (params.instruct) { @@ -845,15 +833,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - // reset grammar state if we're restarting generation - if (grammar != NULL) { - llama_grammar_free(grammar); - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); - } + llama_sampling_reset(ctx_sampling); } is_interacting = false; } @@ -885,13 +865,11 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - if (grammar != NULL) { - llama_grammar_free(grammar); - } + llama_sampling_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n") + LOG_TEE("Log end\n"); #endif // LOG_DISABLE_LOGS return 0; diff --git a/llama.cpp b/llama.cpp index e3d656f1ce6..9f877116cf9 100644 --- a/llama.cpp 
+++ b/llama.cpp @@ -9385,8 +9385,9 @@ int llama_eval( #ifdef GGML_USE_MPI if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); - while (llama_decode_internal(*ctx, tmp.data(), nullptr, tmp.size(), 0, n_threads, nullptr)) {}; + const int n_ctx = llama_n_ctx(ctx); + std::vector tmp(n_ctx, llama_token_bos(ctx)); + while (llama_decode_internal(*ctx, llama_batch_get_one(tmp.data(), tmp.size(), n_past, 0))) {}; llama_backend_free(); exit(1); } From efd73fe80793a95923586e607f3f5c43f9d5140f Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 29 Oct 2023 15:16:16 -0500 Subject: [PATCH 13/63] Synchronize batch sequence info, fixing MPI for llama_decode() --- common/common.cpp | 2 +- ggml-mpi.c | 63 ++++++++++++++++++++++++++++++++++++++++++----- ggml-mpi.h | 21 +++++++++++----- llama.cpp | 36 +++++++++++++++++---------- 4 files changed, 96 insertions(+), 26 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 14da09f993a..ed25964ccd1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1010,7 +1010,7 @@ std::tuple llama_init_from_gpt_par LOG("warming up the model with an empty run\n"); #ifndef GGML_USE_MPI - // When using MPI, llama_eval() enters into an infinite loop + // When using MPI, llama_decode() enters into an infinite loop // on non-head nodes. Thus, we only want to warmup the model here // if we aren't using MPI. // FIXME have a way to terminate the infinite loop so we can warmup the model diff --git a/ggml-mpi.c b/ggml-mpi.c index 9217651d683..1e4d0b376fe 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -60,16 +60,67 @@ int ggml_mpi_size(struct ggml_mpi_context * ctx) { } void ggml_mpi_eval_init( - struct ggml_mpi_context * ctx_mpi, - int * n_tokens, - int * n_past, - int * n_threads) { + struct ggml_mpi_context * ctx_mpi, + int32_t * n_tokens, + int32_t ** pos, + int32_t ** n_seq_ids, + int32_t *** seq_id, + int8_t ** logits) { MPI_Barrier(ctx_mpi->comm); - MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); - MPI_Bcast(n_past, 1, MPI_INT, 0, ctx_mpi->comm); + MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); + + if (ctx_mpi->rank != 0) { + *pos = calloc(*n_tokens, sizeof(int32_t)); + *n_seq_ids = calloc(*n_tokens, sizeof(int32_t)); + *logits = calloc(*n_tokens, sizeof(int8_t)); + } + + int32_t total_n_seq_ids = 0; + for (size_t i = 0; i < *n_tokens; i++) { + total_n_seq_ids += (*n_seq_ids)[i]; + } + + MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + + int32_t * flattened_seq_ids = calloc(total_n_seq_ids, sizeof(int32_t)); + + int32_t current_index = 0; + + if (ctx_mpi->rank == 0) { + for (size_t i = 0; i < *n_tokens; i++) { + for (size_t j = 0; j < (*n_seq_ids)[i]; j++) { + flattened_seq_ids[current_index] = (*seq_id)[i][j]; + current_index++; + } + } + } + + + MPI_Bcast(*pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm); + //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm); + int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*)); + current_index = 0; + for (size_t i = 0; i < *n_tokens; i++) { + new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t)); + for (size_t j = 0; j < (*n_seq_ids)[i]; j++) { + new_seq_id[i][j] = flattened_seq_ids[current_index]; + current_index++; + } + } + free(flattened_seq_ids); + *seq_id = new_seq_id; +} + +void 
ggml_mpi_synch_int( + struct ggml_mpi_context * ctx_mpi, + int32_t * val +) { + MPI_Bcast(val, 1, MPI_INT32_T, 0, ctx_mpi->comm); } static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { diff --git a/ggml-mpi.h b/ggml-mpi.h index 7eeb3856f24..f3c4bf2aa45 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -110,14 +110,23 @@ int ggml_mpi_size(struct ggml_mpi_context * ctx); * * @param ctx_mpi The context in which to prepare for evaluation. * @param n_tokens A pointer to the n_tokens, which will be synchronized after this function. - * @param n_past A pointer to the n_past, which will be synchronized after this function. - * @param n_threads A pointer to the n_threads, which is unused currently. + * @param pos A pointer to the pos array, which will be synchronized after this function. + * @param n_seq_ids A pointer to the n_seq_ids array, which will be synchronized after this function. + * @param seq_id A pointer to the seq_id 2D array, which will be synchronized after this function. + * @param logits A pointer to the logits array, which is unused currently since only node 0 needs them. */ void ggml_mpi_eval_init( - struct ggml_mpi_context * ctx_mpi, - int * n_tokens, - int * n_past, - int * n_threads); + struct ggml_mpi_context * ctx_mpi, + int32_t * n_tokens, + int32_t ** pos, + int32_t ** n_seq_ids, + int32_t *** seq_id, + int8_t ** logits); + +void ggml_mpi_synch_int( + struct ggml_mpi_context * ctx_mpi, + int32_t * val + ); /** * Split a range across all nodes within the given diff --git a/llama.cpp b/llama.cpp index 9f877116cf9..6f6e896ed42 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5423,8 +5423,7 @@ static struct ggml_cgraph * llama_build_graph( static int llama_decode_internal( llama_context & lctx, llama_batch batch) { - const uint32_t n_tokens = batch.n_tokens; - + uint32_t n_tokens = batch.n_tokens; if (n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); return -1; @@ -5443,12 +5442,6 @@ static int llama_decode_internal( const int64_t t_start_us = ggml_time_us(); -#ifdef GGML_USE_MPI - // TODO: needs fix after #3228 - GGML_ASSERT(false && "not implemented"); - //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); -#endif - GGML_ASSERT(n_threads > 0); auto & kv_self = lctx.kv_self; @@ -5496,6 +5489,11 @@ static int llama_decode_internal( kv_self.head = 0; } +#ifdef GGML_USE_MPI + // TODO: needs fix after #3228 + ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits)); + n_tokens = batch.n_tokens; +#endif if (!llama_kv_cache_find_slot(kv_self, batch)) { return 1; } @@ -5509,7 +5507,6 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_allocr_reset(lctx.alloc); - ggml_cgraph * gf = llama_build_graph(lctx, batch); ggml_allocr_alloc_graph(lctx.alloc, gf); @@ -5569,7 +5566,6 @@ static int llama_decode_internal( if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { n_threads = 1; } - #if GGML_USE_MPI const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); @@ -9380,21 +9376,24 @@ int llama_eval( llama_token * tokens, int32_t n_tokens, int n_past) { - llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); + #ifdef GGML_USE_MPI if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { // Enter a blocking eval loop with dummy input, letting rank=0 drive the process const int n_ctx = llama_n_ctx(ctx); std::vector tmp(n_ctx, llama_token_bos(ctx)); - 
while (llama_decode_internal(*ctx, llama_batch_get_one(tmp.data(), tmp.size(), n_past, 0))) {}; + do { + //ggml_mpi_synch_int(ctx->ctx_mpi, &n_past); + llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); + } while (llama_decode_internal(*ctx, llama_batch_get_one(tmp.data(), tmp.size(), n_past, 0)) >= 0); llama_backend_free(); exit(1); } #endif - + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); if (ret < 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); @@ -9481,6 +9480,17 @@ void llama_batch_free(struct llama_batch batch) { int llama_decode( struct llama_context * ctx, struct llama_batch batch) { + +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + // Enter a blocking eval loop with dummy input, letting rank=0 drive the process + const int n_ctx = llama_n_ctx(ctx); + std::vector tmp(n_ctx, llama_token_bos(ctx)); + while (llama_decode_internal(*ctx, batch) >= 0){}; + llama_backend_free(); + exit(1); + } +#endif const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); From 3fa2527de3e829a40fcefd5b78c73dfe5208b16b Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 30 Oct 2023 10:50:20 -0500 Subject: [PATCH 14/63] Update MPI code to new KV seq rm and bos/eos model APIs --- examples/mpi/mpi.cpp | 10 +++++----- llama.cpp | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 3030918bf24..b4944099eaa 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -284,7 +284,7 @@ int main(int argc, char ** argv) { // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } @@ -334,7 +334,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1); + llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } LOGLN( @@ -729,7 +729,7 @@ int main(int argc, char ** argv) { } // deal with end of text token in interactive mode - if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { + if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { LOG("found EOS token\n"); if (params.interactive) { @@ -756,7 +756,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); } std::string buffer; @@ -840,7 +840,7 @@ int main(int argc, char ** argv) { } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { + if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) { LOG_TEE(" [end of text]\n"); break; } diff --git a/llama.cpp b/llama.cpp index 6f6e896ed42..1a1295b9b5c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9382,10 +9382,10 @@ int llama_eval( if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { // Enter a blocking eval loop with dummy input, letting rank=0 drive the process const int n_ctx = llama_n_ctx(ctx); - std::vector tmp(n_ctx, llama_token_bos(ctx)); + std::vector tmp(n_ctx, llama_token_bos(&(ctx->model))); do { 
//ggml_mpi_synch_int(ctx->ctx_mpi, &n_past); - llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); } while (llama_decode_internal(*ctx, llama_batch_get_one(tmp.data(), tmp.size(), n_past, 0)) >= 0); llama_backend_free(); exit(1); @@ -9485,7 +9485,7 @@ int llama_decode( if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { // Enter a blocking eval loop with dummy input, letting rank=0 drive the process const int n_ctx = llama_n_ctx(ctx); - std::vector tmp(n_ctx, llama_token_bos(ctx)); + std::vector tmp(n_ctx, llama_token_bos(&(ctx->model))); while (llama_decode_internal(*ctx, batch) >= 0){}; llama_backend_free(); exit(1); From 33b88d6b41c5c676b7a9909b21d4f9554737eaca Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 31 Oct 2023 15:55:15 -0500 Subject: [PATCH 15/63] Fix some mpi mem leaks, add mpi-layer-split to help when using mpi --- common/common.cpp | 3 +++ ggml-mpi.c | 52 +++++++++++++++++++++++++++++------------------ ggml-mpi.h | 3 ++- llama.cpp | 1 + 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ed25964ccd1..da8188169b9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -853,6 +853,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); printf(" Not recommended since this is both slower and uses more VRAM.\n"); #endif // GGML_USE_CUBLAS +#endif +#ifdef GGML_USE_MPI + printf(" --mpi-layer-split N percentiles to split the layers by across nodes\n"); #endif printf(" --verbose-prompt print prompt before generation\n"); printf(" -dkvc, --dump-kv-cache\n"); diff --git a/ggml-mpi.c b/ggml-mpi.c index 1e4d0b376fe..fd88eab1fd0 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -47,7 +47,7 @@ struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int } void ggml_mpi_free(struct ggml_mpi_context * ctx) { - MPI_Comm_free(ctx->comm); + MPI_Comm_free(&(ctx->comm)); free(ctx); } @@ -55,7 +55,7 @@ int ggml_mpi_rank(struct ggml_mpi_context * ctx) { return ctx->rank; } -int ggml_mpi_size(struct ggml_mpi_context * ctx) { +size_t ggml_mpi_size(struct ggml_mpi_context * ctx) { return ctx->size; } @@ -69,30 +69,41 @@ void ggml_mpi_eval_init( MPI_Barrier(ctx_mpi->comm); - + int32_t old_n_tokens = *n_tokens; MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); - if (ctx_mpi->rank != 0) { - *pos = calloc(*n_tokens, sizeof(int32_t)); - *n_seq_ids = calloc(*n_tokens, sizeof(int32_t)); - *logits = calloc(*n_tokens, sizeof(int8_t)); + // If what was passed in differs from what was broadcast, + // we can't guarantee the allocated sizes are correct + // TODO check how often this is done and if it's a problem, + // try to allocate ahead of time + if (old_n_tokens != *n_tokens) { + *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); + *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); + *logits = realloc(*logits, *n_tokens * sizeof(int32_t)); } + + +// MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + + // We need to know the total number of sequence + // ids, so we count them all up int32_t total_n_seq_ids = 0; - for (size_t i = 0; i < *n_tokens; i++) { + for (int32_t i = 0; i < *n_tokens; i++) { total_n_seq_ids += (*n_seq_ids)[i]; } - MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm); - MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); - + // MPI 
can't chase the pointers for multidimensional arrays, so we flatten them first + // for transit int32_t * flattened_seq_ids = calloc(total_n_seq_ids, sizeof(int32_t)); int32_t current_index = 0; + // Only rank 0 needs to flatten since the others don't have the real seq_id if (ctx_mpi->rank == 0) { - for (size_t i = 0; i < *n_tokens; i++) { - for (size_t j = 0; j < (*n_seq_ids)[i]; j++) { + for (int32_t i = 0; i < *n_tokens; i++) { + for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { flattened_seq_ids[current_index] = (*seq_id)[i][j]; current_index++; } @@ -100,25 +111,26 @@ void ggml_mpi_eval_init( } - MPI_Bcast(*pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); - MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast( *pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm); //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm); int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*)); current_index = 0; - for (size_t i = 0; i < *n_tokens; i++) { + for (int32_t i = 0; i < *n_tokens; i++) { new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t)); - for (size_t j = 0; j < (*n_seq_ids)[i]; j++) { + for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { new_seq_id[i][j] = flattened_seq_ids[current_index]; current_index++; } } free(flattened_seq_ids); + //free(*seq_id); // <- something is still holding onto this, need to investigate *seq_id = new_seq_id; } void ggml_mpi_synch_int( - struct ggml_mpi_context * ctx_mpi, - int32_t * val + struct ggml_mpi_context * ctx_mpi, + int32_t * val ) { MPI_Bcast(val, 1, MPI_INT32_T, 0, ctx_mpi->comm); } @@ -284,7 +296,7 @@ void ggml_mpi_graph_compute_pre( { - const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; + //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1; diff --git a/ggml-mpi.h b/ggml-mpi.h index f3c4bf2aa45..62b15faefe3 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -1,5 +1,6 @@ #pragma once #include +#include struct ggml_context; struct ggml_tensor; @@ -98,7 +99,7 @@ int ggml_mpi_rank(struct ggml_mpi_context * ctx); * @param ctx The context containing the communicator used for this size check. * @return The number of nodes that are a part of the given context's communicator. 
*/ -int ggml_mpi_size(struct ggml_mpi_context * ctx); +size_t ggml_mpi_size(struct ggml_mpi_context * ctx); /** * Synchronize needed information among the nodes diff --git a/llama.cpp b/llama.cpp index 1a1295b9b5c..b8668c28aa6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8689,6 +8689,7 @@ void llama_split_layers_weighted(struct llama_context * ctx, float device_weight } uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights); ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges); + free(ranges); #endif } From da37edc5b8cc82661d25146a1ba6860e1e147e67 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 1 Nov 2023 12:23:30 -0500 Subject: [PATCH 16/63] Fix missing layer_inp_i names --- llama.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llama.cpp b/llama.cpp index b8668c28aa6..aa8893d5983 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3878,6 +3878,7 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * inpSA = inpL; // norm @@ -3988,6 +3989,7 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -4108,6 +4110,7 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * attn_norm; attn_norm = llm_build_norm(ctx0, inpL, hparams, @@ -4232,6 +4235,7 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -4328,6 +4332,7 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * residual = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -4530,6 +4535,7 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -4627,6 +4633,7 @@ struct llm_build_context { cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -4715,6 +4722,7 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * attn_norm; attn_norm = llm_build_norm(ctx0, inpL, hparams, From 51f3f8fd22e7bc8c629e582621fc11ce860f3458 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 1 Nov 2023 14:55:32 -0500 Subject: [PATCH 17/63] Allow per-node threads to be set in command-line args, add mpi support to main --- common/common.cpp | 58 ++++++++++++++++++++++++++++++++---------- common/common.h | 4 +-- examples/main/main.cpp | 2 ++ llama.cpp | 8 ++++++ llama.h | 5 ++++ 5 files changed, 61 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index da8188169b9..9a68067e05b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -154,18 +154,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - 
params.n_threads = std::thread::hardware_concurrency(); + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + params.n_threads.resize(split_arg.size()); + for (size_t i = 0; i < split_arg.size(); ++i) { + params.n_threads[i] = std::stoi(split_arg[i]); + if (params.n_threads[i] <= 0) { + params.n_threads[i] = std::thread::hardware_concurrency(); + } } + } else if (arg == "-tb" || arg == "--threads-batch") { if (++i >= argc) { invalid_param = true; break; } - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + params.n_threads_batch.resize(split_arg.size()); + for (size_t i = 0; i < split_arg.size(); ++i) { + params.n_threads_batch[i] = std::stoi(split_arg[i]); + if (params.n_threads_batch[i] <= 0) { + params.n_threads_batch[i] = std::thread::hardware_concurrency(); + } } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { @@ -759,7 +778,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" (can be specified more than once for multiple prompts).\n"); printf(" --color colorise output to distinguish prompt and user input from generations\n"); printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); + printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads[0]); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); @@ -879,9 +898,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { std::string get_system_info(const gpt_params & params) { std::ostringstream os; - os << "system_info: n_threads = " << params.n_threads; - if (params.n_threads_batch != -1) { - os << " (n_threads_batch = " << params.n_threads_batch << ")"; + os << "system_info: n_threads = " << params.n_threads[0]; + if (params.n_threads_batch[0] != -1) { + os << " (n_threads_batch = " << params.n_threads_batch[0] << ")"; } os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); @@ -929,8 +948,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_ctx = params.n_ctx; cparams.n_batch = params.n_batch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.n_threads = params.n_threads[0]; + cparams.n_threads_batch = params.n_threads_batch[0] == -1 ? 
params.n_threads[0] : params.n_threads_batch[0]; cparams.mul_mat_q = params.mul_mat_q; cparams.seed = params.seed; cparams.f16_kv = params.memory_f16; @@ -970,6 +989,7 @@ void llama_batch_add( } std::tuple llama_init_from_gpt_params(gpt_params & params) { + int32_t n_threads = params.n_threads[0]; auto mparams = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); @@ -987,6 +1007,16 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } +#ifdef GGML_USE_MPI + int node_id = llama_node_id(lctx); + n_threads = (node_id >= params.n_threads.size()) ? get_num_physical_cores() : params.n_threads[node_id]; + int32_t n_threads_batch = (node_id >= params.n_threads_batch.size()) ? -1 : params.n_threads_batch[node_id]; + + params.n_threads[0] = n_threads; // So we can treat index 0 as what our n_threads is elsewhere + params.n_threads_batch[0] = n_threads_batch; + llama_set_n_threads(lctx, n_threads, (n_threads_batch > 0) ? n_threads_batch : get_num_physical_cores()); +#endif + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); @@ -996,7 +1026,7 @@ std::tuple llama_init_from_gpt_par ((i > 0) || params.lora_base.empty()) ? NULL : params.lora_base.c_str(), - params.n_threads); + n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); @@ -1411,7 +1441,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "threads: %d # default: %d\n", params.n_threads[0], std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); diff --git a/common/common.h b/common/common.h index 176bd23d976..5287305143c 100644 --- a/common/common.h +++ b/common/common.h @@ -45,8 +45,8 @@ int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = -1; // RNG seed - int32_t n_threads = get_num_physical_cores(); - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + std::vector n_threads = {get_num_physical_cores()}; + std::vector n_threads_batch = {-1}; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 31ec8cade19..7235e12caf1 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -188,6 +188,8 @@ int main(int argc, char ** argv) { return 1; } + llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size()); + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG("n_ctx: %d\n", n_ctx); diff --git a/llama.cpp b/llama.cpp index aa8893d5983..a8be816f713 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8421,6 +8421,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { return result; } +int 
llama_node_id(struct llama_context * ctx) { +#ifdef GGML_USE_MPI + return ggml_mpi_rank(ctx->ctx_mpi); + +#endif + return 0; +} + int llama_max_devices(void) { return LLAMA_MAX_DEVICES; } diff --git a/llama.h b/llama.h index f0b34bf2ec9..43dcadefcb7 100644 --- a/llama.h +++ b/llama.h @@ -285,6 +285,11 @@ extern "C" { LLAMA_API int64_t llama_time_us(void); + // Get the ID of this compute node, usually 0 + // unless running MPI, in which case it is the rank of the node + LLAMA_API int llama_node_id(struct llama_context * ctx); + + LLAMA_API int llama_max_devices (void); LLAMA_API bool llama_mmap_supported (void); LLAMA_API bool llama_mlock_supported(void); From 4cf1c769bc9b4e9af76f5d5207cfea1c48ee1464 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 8 Nov 2023 17:21:06 -0600 Subject: [PATCH 18/63] Support running speculation with two processes --- examples/speculative/speculative.cpp | 61 ++++++++++++++++++++-------- ggml-mpi.c | 43 ++++++++++++++++++-- ggml-mpi.h | 9 +++- llama.cpp | 32 +++++++++++++++ llama.h | 8 ++++ 5 files changed, 131 insertions(+), 22 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index ace755c51d8..137db84f2d0 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -62,11 +62,24 @@ int main(int argc, char ** argv) { params.logits_all = true; std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); + llama_split_comm(ctx_tgt, (llama_node_id(ctx_tgt) == 0) ? 0 : -1); + // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + llama_split_comm(ctx_dft, (llama_node_id(ctx_dft) == 1) ? 0 : -1); + + + llama_split_layers_weighted(ctx_dft, new float[] {1.0}, 1); + llama_split_layers_weighted(ctx_tgt, new float[] {1.0}, 1); + + { + LOG_TEE("\n"); + LOG_TEE("%s\n", get_system_info(params).c_str()); + } + { const int n_vocab_tgt = llama_n_vocab(model_tgt); const int n_vocab_dft = llama_n_vocab(model_dft); @@ -81,17 +94,17 @@ int main(int argc, char ** argv) { return 1; } - for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { - const char * token_text_tgt = llama_token_get_text(model_tgt, i); - const char * token_text_dft = llama_token_get_text(model_dft, i); - if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); - fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, - llama_token_to_piece(ctx_tgt, i).c_str(), - llama_token_to_piece(ctx_dft, i).c_str()); - return 1; - } - } +// for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { +// const char * token_text_tgt = llama_token_get_text(model_tgt, i); +// const char * token_text_dft = llama_token_get_text(model_dft, i); +// if (std::strcmp(token_text_tgt, token_text_dft) != 0) { +// fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); +// fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, +// llama_token_to_piece(ctx_tgt, i).c_str(), +// llama_token_to_piece(ctx_dft, i).c_str()); +// return 1; +// } +// } } @@ -197,14 +210,21 @@ int main(int argc, char ** argv) { // sample from the target model llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + 
llama_swap_comm(ctx_tgt); + llama_sync_token(ctx_tgt, &id, 0); + + llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); const std::string token_str = llama_token_to_piece(ctx_tgt, id); + if (llama_node_id(ctx_tgt) == 0) { + printf("%s", token_str.c_str()); + fflush(stdout); + } - printf("%s", token_str.c_str()); - fflush(stdout); + llama_swap_comm(ctx_tgt); if (id == llama_token_eos(model_tgt)) { has_eos = true; @@ -269,8 +289,8 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_dft); llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + llama_kv_cache_seq_rm(ctx_dft, -1, n_past_dft, -1); + LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode (ctx_dft, batch_dft); ++n_past_dft; @@ -313,17 +333,24 @@ int main(int argc, char ** argv) { llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); - const auto & cur_p = drafts[s].ctx_sampling->cur; + llama_swap_comm(ctx_dft); + llama_sync_token(ctx_dft, &(drafts[s].i_batch_dft), 1); + + auto & cur_p = drafts[s].ctx_sampling->cur; for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { + llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); } + llama_swap_comm(ctx_dft); + + if (cur_p[0].p < p_accept) { LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); drafts[s].drafting = false; - continue; +// continue; } std::vector sa(1, s); diff --git a/ggml-mpi.c b/ggml-mpi.c index fd88eab1fd0..8056f16cb70 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -39,8 +39,16 @@ struct ggml_mpi_context * ggml_mpi_init(void) { } struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key) { + if (color < 0) { + color = MPI_UNDEFINED; + } struct ggml_mpi_context * newCtx = calloc(1, sizeof(struct ggml_mpi_context)); MPI_Comm_split(ctx->comm, color, key, &newCtx->comm); + if(newCtx->comm == MPI_COMM_NULL) { + newCtx->rank = -1; + newCtx->size = -1; + return newCtx; + } MPI_Comm_rank(newCtx->comm, &newCtx->rank); MPI_Comm_size(newCtx->comm, &newCtx->size); return newCtx; @@ -66,7 +74,9 @@ void ggml_mpi_eval_init( int32_t ** n_seq_ids, int32_t *** seq_id, int8_t ** logits) { - + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } MPI_Barrier(ctx_mpi->comm); int32_t old_n_tokens = *n_tokens; @@ -130,9 +140,24 @@ void ggml_mpi_eval_init( void ggml_mpi_synch_int( struct ggml_mpi_context * ctx_mpi, - int32_t * val + int32_t * val, + int root +) { + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } + MPI_Bcast(val, 1, MPI_INT32_T, root, ctx_mpi->comm); +} + +void ggml_mpi_synch_float( + struct ggml_mpi_context * ctx_mpi, + float * val, + int root ) { - MPI_Bcast(val, 1, MPI_INT32_T, 0, ctx_mpi->comm); + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } + MPI_Bcast(val, 1, MPI_FLOAT, root, ctx_mpi->comm); } static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { @@ -154,6 +179,9 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst, MPI_Comm comm) { + if(comm == MPI_COMM_NULL) { + return; + } MPI_Datatype 
mpi_type; switch (t->type) { @@ -167,6 +195,9 @@ static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst, MPI_C } static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_Comm comm) { + if(comm == MPI_COMM_NULL) { + return; + } MPI_Datatype mpi_type; switch (t->type) { @@ -195,7 +226,7 @@ uint16_t** ggml_mpi_split_range( // Only node 0 deals with the device splits, other nodes // get the splits from the scatter layers operation - if (ctx_mpi->rank != 0) { + if (ctx_mpi->comm == MPI_COMM_NULL || ctx_mpi->rank != 0) { return NULL; } @@ -221,6 +252,10 @@ void ggml_mpi_scatter_layers( struct ggml_mpi_context * ctx_mpi, uint16_t ** layer_ranges ) { + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } + // Layer ranges is a 2d array with the first dimension // having a length of the number of nodes and the second // dimension having a length of 2. The inner arrays contain diff --git a/ggml-mpi.h b/ggml-mpi.h index 62b15faefe3..b6a38a44447 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -126,9 +126,16 @@ void ggml_mpi_eval_init( void ggml_mpi_synch_int( struct ggml_mpi_context * ctx_mpi, - int32_t * val + int32_t * val, + int root ); +void ggml_mpi_synch_float( + struct ggml_mpi_context * ctx_mpi, + float * val, + int root +); + /** * Split a range across all nodes within the given * context, weighting the allocations by the given weights. diff --git a/llama.cpp b/llama.cpp index a8be816f713..aafca5a9355 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1493,6 +1493,7 @@ struct llama_context { #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; + ggml_mpi_context * ctx_mpi_orig = NULL; #endif }; @@ -8698,6 +8699,35 @@ struct llama_context * llama_new_context_with_model( return ctx; } +void llama_sync_token(struct llama_context * ctx, llama_token * token, int root) { +#ifdef GGML_USE_MPI + ggml_mpi_synch_int(ctx->ctx_mpi, token, root); +#endif +} + +void llama_sync_token_data(struct llama_context * ctx, llama_token_data * data, int root) { +#ifdef GGML_USE_MPI + ggml_mpi_synch_int(ctx->ctx_mpi, &(data->id), root); + ggml_mpi_synch_float(ctx->ctx_mpi, &(data->logit), root); + ggml_mpi_synch_float(ctx->ctx_mpi, &(data->p), root); +#endif +} + +void llama_swap_comm(struct llama_context * ctx) { +#ifdef GGML_USE_MPI + ggml_mpi_context * temp = ctx->ctx_mpi; + ctx->ctx_mpi = ctx->ctx_mpi_orig; + ctx->ctx_mpi_orig = temp; +#endif +} + +void llama_split_comm(struct llama_context * ctx, int color) { +#ifdef GGML_USE_MPI + ctx->ctx_mpi_orig = ctx->ctx_mpi; + ctx->ctx_mpi = ggml_mpi_split_comm(ctx->ctx_mpi, color, ggml_mpi_rank(ctx->ctx_mpi)); +#endif +} + void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) { #ifdef GGML_USE_MPI if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) { @@ -9506,6 +9536,8 @@ int llama_decode( while (llama_decode_internal(*ctx, batch) >= 0){}; llama_backend_free(); exit(1); + } else if (ggml_mpi_rank(ctx->ctx_mpi) < 0) { + return 0; } #endif const int ret = llama_decode_internal(*ctx, batch); diff --git a/llama.h b/llama.h index 43dcadefcb7..8865f158aba 100644 --- a/llama.h +++ b/llama.h @@ -274,6 +274,14 @@ extern "C" { LLAMA_API void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights); + LLAMA_API void llama_swap_comm(struct llama_context * ctx); + + LLAMA_API void llama_sync_token(struct llama_context * ctx, llama_token * token, int root);\ + + LLAMA_API void llama_sync_token_data(struct llama_context * ctx, 
llama_token_data * data, int root); + + LLAMA_API void llama_split_comm(struct llama_context * ctx, int color); + LLAMA_API void llama_free_model(struct llama_model * model); LLAMA_API struct llama_context * llama_new_context_with_model( From 8ccaf96dc58ab45db4246b0d5ccdb0b9bdd803e5 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 8 Nov 2023 18:55:24 -0600 Subject: [PATCH 19/63] Support setting layer splits per comm/model --- common/common.cpp | 11 +++++++++-- common/common.h | 2 +- examples/speculative/speculative.cpp | 13 ++++++++----- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9a68067e05b..61d69f129c3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -567,12 +567,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::string arg_next = argv[i]; // split string by , and / - const std::regex regex{R"([,/]+)"}; + const std::regex regex{R"([\/]+)"}; + const std::regex inner_regex{R"([,]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; params.mpi_layer_split.resize(split_arg.size()); for (size_t i = 0; i < split_arg.size(); ++i) { - params.mpi_layer_split[i] = std::stof(split_arg[i]); + std::sregex_token_iterator it_inner{split_arg[i].begin(), split_arg[i].end(), regex, -1}; + std::vector split_arg_inner{it_inner, {}}; + params.mpi_layer_split[i].resize(split_arg_inner.size()); + for (size_t j = 0; j < split_arg_inner.size(); ++j) { + params.mpi_layer_split[i][j] = std::stof(split_arg_inner[j]); + } } diff --git a/common/common.h b/common/common.h index 5287305143c..20572f5c70c 100644 --- a/common/common.h +++ b/common/common.h @@ -60,7 +60,7 @@ struct gpt_params { int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - std::vector mpi_layer_split = {1.0}; // list of percentages of the total number of layers + std::vector> mpi_layer_split = {{1.0}}; // list of percentages of the total number of layers float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. float rope_freq_base = 0.0f; // RoPE base frequency diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 137db84f2d0..4b5cee4c859 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -62,18 +62,21 @@ int main(int argc, char ** argv) { params.logits_all = true; std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); - llama_split_comm(ctx_tgt, (llama_node_id(ctx_tgt) == 0) ? 0 : -1); + llama_split_comm(ctx_tgt, (llama_node_id(ctx_tgt) < params.mpi_layer_split[0].size()) ? 0 : -1); + printf("Size of first split: %lu, element: %f\n", params.mpi_layer_split[0].size(), params.mpi_layer_split[0][0]); // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); - llama_split_comm(ctx_dft, (llama_node_id(ctx_dft) == 1) ? 0 : -1); + llama_split_comm(ctx_dft, (llama_node_id(ctx_dft) >= params.mpi_layer_split[0].size()) ? 
0 : -1); + printf("Size of second split: %lu, element: %f\n", params.mpi_layer_split[1].size(), params.mpi_layer_split[1][0]); - llama_split_layers_weighted(ctx_dft, new float[] {1.0}, 1); - llama_split_layers_weighted(ctx_tgt, new float[] {1.0}, 1); + + llama_split_layers_weighted(ctx_tgt, params.mpi_layer_split[0].data(), params.mpi_layer_split[0].size()); + llama_split_layers_weighted(ctx_dft, params.mpi_layer_split[1].data(), params.mpi_layer_split[1].size()); { LOG_TEE("\n"); @@ -350,7 +353,7 @@ int main(int argc, char ** argv) { if (cur_p[0].p < p_accept) { LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); drafts[s].drafting = false; -// continue; + continue; } std::vector sa(1, s); From 5f21688e67c0b14e6e62fd46b8d3cfd64db9e2df Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 8 Nov 2023 19:11:24 -0600 Subject: [PATCH 20/63] Fix incorrect layer split parsing --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 61d69f129c3..6d1ee6a0c83 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -574,7 +574,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::vector split_arg{it, {}}; params.mpi_layer_split.resize(split_arg.size()); for (size_t i = 0; i < split_arg.size(); ++i) { - std::sregex_token_iterator it_inner{split_arg[i].begin(), split_arg[i].end(), regex, -1}; + std::sregex_token_iterator it_inner{split_arg[i].begin(), split_arg[i].end(), inner_regex, -1}; std::vector split_arg_inner{it_inner, {}}; params.mpi_layer_split[i].resize(split_arg_inner.size()); for (size_t j = 0; j < split_arg_inner.size(); ++j) { From 2ddf0feee0ec0375df37ccf4fbe818a920397bb6 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 8 Nov 2023 19:40:58 -0600 Subject: [PATCH 21/63] Split orig comm to only contain root nodes of the two subnets --- examples/speculative/speculative.cpp | 6 ++++++ llama.cpp | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 4b5cee4c859..cb1ace01d06 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -62,6 +62,9 @@ int main(int argc, char ** argv) { params.logits_all = true; std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); + llama_split_comm(ctx_tgt, (llama_node_id(ctx_tgt) == 0 || llama_node_id(ctx_tgt) == params.mpi_layer_split[0].size()) ? 0 : -1); + llama_swap_comm(ctx_tgt); + llama_split_comm(ctx_tgt, (llama_node_id(ctx_tgt) < params.mpi_layer_split[0].size()) ? 0 : -1); printf("Size of first split: %lu, element: %f\n", params.mpi_layer_split[0].size(), params.mpi_layer_split[0][0]); @@ -70,6 +73,9 @@ int main(int argc, char ** argv) { params.n_gpu_layers = params.n_gpu_layers_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + llama_split_comm(ctx_dft, (llama_node_id(ctx_dft) == 0 || llama_node_id(ctx_dft) == params.mpi_layer_split[0].size()) ? 0 : -1); + llama_swap_comm(ctx_dft); + llama_split_comm(ctx_dft, (llama_node_id(ctx_dft) >= params.mpi_layer_split[0].size()) ? 
0 : -1); printf("Size of second split: %lu, element: %f\n", params.mpi_layer_split[1].size(), params.mpi_layer_split[1][0]); diff --git a/llama.cpp b/llama.cpp index aafca5a9355..2d658cc3ef0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8693,6 +8693,8 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_MPI ctx->ctx_mpi = ggml_mpi_init(); + ctx->ctx_mpi_orig = ctx->ctx_mpi; + #endif @@ -8723,7 +8725,6 @@ void llama_swap_comm(struct llama_context * ctx) { void llama_split_comm(struct llama_context * ctx, int color) { #ifdef GGML_USE_MPI - ctx->ctx_mpi_orig = ctx->ctx_mpi; ctx->ctx_mpi = ggml_mpi_split_comm(ctx->ctx_mpi, color, ggml_mpi_rank(ctx->ctx_mpi)); #endif } From 2166a1268ba6691a9b7bd63e0a911e3008b1c83f Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 9 Nov 2023 12:26:31 -0600 Subject: [PATCH 22/63] Fix main layer split and fix speculative prompt tokenization --- examples/main/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7235e12caf1..939bea23264 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -188,7 +188,7 @@ int main(int argc, char ** argv) { return 1; } - llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size()); + llama_split_layers_weighted(ctx, params.mpi_layer_split[0].data(), params.mpi_layer_split[0].size()); const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); From fbc3d4d85f1aaa68321bf0a455d6144b7109522c Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 9 Nov 2023 16:30:02 -0600 Subject: [PATCH 23/63] Fix kv desync --- ggml-mpi.c | 62 +++++++++++++++++++++++++++++++++++++++++--------- ggml-mpi.h | 14 ++++++++++++ llama.cpp | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 131 insertions(+), 11 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index 8056f16cb70..7c6900bbc3b 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -17,6 +17,7 @@ struct ggml_mpi_context { MPI_Comm comm; int layer_start; int layer_end; + MPI_Status status; }; void ggml_mpi_backend_init(void) { @@ -34,6 +35,7 @@ struct ggml_mpi_context * ggml_mpi_init(void) { MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); ctx->comm = MPI_COMM_WORLD; +// ctx->status = *MPI_STATUS_IGNORE; return ctx; } @@ -67,6 +69,40 @@ size_t ggml_mpi_size(struct ggml_mpi_context * ctx) { return ctx->size; } +void ggml_mpi_barrier(struct ggml_mpi_context * ctx_mpi) { + MPI_Barrier(ctx_mpi->comm); +} + +void ggml_mpi_probe(struct ggml_mpi_context * ctx_mpi, int src, int tag) { + MPI_Probe((src >= 0) ? src : MPI_ANY_SOURCE, (tag >= 0) ? 
tag : MPI_ANY_TAG, ctx_mpi->comm, &(ctx_mpi->status)); +} + +int ggml_mpi_status_tag(struct ggml_mpi_context * ctx_mpi) { + return ctx_mpi->status.MPI_TAG; +} + +int ggml_mpi_next_node(struct ggml_mpi_context * ctx_mpi) { + return (ctx_mpi->rank + 1) % ctx_mpi->size; +} + +void ggml_mpi_sync_pipelined( + struct ggml_mpi_context * ctx_mpi, + void * val, + int count, + MPI_Datatype datatype, + int tag + ) { + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } + if (ctx_mpi->rank != 0) { + MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); + } + if(ctx_mpi->rank < ctx_mpi->size - 1) { + MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); + } +} + void ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int32_t * n_tokens, @@ -77,10 +113,9 @@ void ggml_mpi_eval_init( if(ctx_mpi->comm == MPI_COMM_NULL) { return; } - - MPI_Barrier(ctx_mpi->comm); int32_t old_n_tokens = *n_tokens; - MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); + + ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, 0); // If what was passed in differs from what was broadcast, // we can't guarantee the allocated sizes are correct @@ -95,7 +130,7 @@ void ggml_mpi_eval_init( // MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm); - MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, 0); // We need to know the total number of sequence // ids, so we count them all up @@ -121,8 +156,8 @@ void ggml_mpi_eval_init( } - MPI_Bcast( *pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); - MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm); + ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, 0); + ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0); //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm); int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*)); current_index = 0; @@ -138,6 +173,15 @@ void ggml_mpi_eval_init( *seq_id = new_seq_id; } +void ggml_mpi_sync_ints_pipelined( + struct ggml_mpi_context * ctx_mpi, + int32_t * vals, + int count, + int tag +) { + ggml_mpi_sync_pipelined(ctx_mpi, vals, count, MPI_INT32_T, tag); +} + void ggml_mpi_synch_int( struct ggml_mpi_context * ctx_mpi, int32_t * val, @@ -206,9 +250,7 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_C default: GGML_ASSERT(false && "not implemented"); } - MPI_Status status; UNUSED(status); - - const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status); + const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, MPI_STATUS_IGNORE); GGML_ASSERT(retval == MPI_SUCCESS); } @@ -392,6 +434,6 @@ void ggml_mpi_graph_compute_post( // send the output data to the next node if (mpi_rank > 0) { - ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm); + ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], ggml_mpi_next_node(ctx_mpi), ctx_mpi->comm); } } diff --git a/ggml-mpi.h b/ggml-mpi.h index b6a38a44447..f843165e997 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -75,6 +75,20 @@ struct ggml_mpi_context * ggml_mpi_init(void); */ struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key); +void ggml_mpi_barrier(struct ggml_mpi_context * ctx); + +int ggml_mpi_next_node(struct ggml_mpi_context * ctx_mpi); + +void ggml_mpi_sync_ints_pipelined( + struct 
ggml_mpi_context * ctx_mpi, + int32_t * vals, + int count, + int tag +); +// clear = 1, rm = 2, cp = 3, keep = 4, seq_shift = 5 +void ggml_mpi_probe(struct ggml_mpi_context * ctx_mpi, int src, int tag); +int ggml_mpi_status_tag(struct ggml_mpi_context * ctx_mpi); + /** * Frees the given context, including the communicator. No MPI * operations besides ggml_mpi_backend_freee(void) should be executed after diff --git a/llama.cpp b/llama.cpp index 2d658cc3ef0..06a69498b7f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8973,10 +8973,20 @@ int llama_get_kv_cache_used_cells(const struct llama_context * ctx) { } void llama_kv_cache_clear(struct llama_context * ctx) { +#ifdef GGML_USE_MPI + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, NULL, 0, 1); +#endif llama_kv_cache_clear(ctx->kv_self); } void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { +#ifdef GGML_USE_MPI + int32_t vals[3] = {seq_id, p0, p1}; + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 3, 2); + seq_id = vals[0]; + p0 = vals[1]; + p1 = vals[2]; +#endif llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); } @@ -8984,14 +8994,36 @@ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, if (seq_id_src == seq_id_dst) { return; } +#ifdef GGML_USE_MPI + int32_t vals[4] = {seq_id_src, seq_id_dst, p0, p1}; + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 4, 3); + seq_id_src = vals[0]; + seq_id_dst = vals[1]; + p0 = vals[2]; + p1 = vals[3]; +#endif llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); } void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { +#ifdef GGML_USE_MPI + int32_t vals[1] = {seq_id}; + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 1, 4); + seq_id = vals[0]; +#endif llama_kv_cache_seq_keep(ctx->kv_self, seq_id); } void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { +#ifdef GGML_USE_MPI + int32_t vals[4] = {seq_id, p0, p1, delta}; + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 4, 5); + seq_id = vals[0]; + p0 = vals[1]; + p1 = vals[2]; + delta = vals[3]; +#endif + llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); } @@ -9525,6 +9557,38 @@ void llama_batch_free(struct llama_batch batch) { if (batch.logits) free(batch.logits); } +#ifdef GGML_USE_MPI + +int llama_process_mpi_worker( + struct llama_context * ctx, + struct llama_batch batch) { + ggml_mpi_probe(ctx->ctx_mpi, -1, -1); + int tag = ggml_mpi_status_tag(ctx->ctx_mpi); + switch (tag) { + case 0: + return llama_decode_internal(*ctx, batch); + break; + case 1: + llama_kv_cache_clear(ctx); + break; + case 2: + llama_kv_cache_seq_rm(ctx, 0, 0, 0); + break; + case 3: + llama_kv_cache_seq_cp(ctx, 0, 0, 0, 0); + break; + case 4: + llama_kv_cache_seq_keep(ctx, 0); + break; + case 5: + llama_kv_cache_seq_shift(ctx, 0, 0, 0, 0); + break; + } + return 0; +} + +#endif + int llama_decode( struct llama_context * ctx, struct llama_batch batch) { @@ -9534,7 +9598,7 @@ int llama_decode( // Enter a blocking eval loop with dummy input, letting rank=0 drive the process const int n_ctx = llama_n_ctx(ctx); std::vector tmp(n_ctx, llama_token_bos(&(ctx->model))); - while (llama_decode_internal(*ctx, batch) >= 0){}; + while (llama_process_mpi_worker(ctx, batch) >= 0){}; llama_backend_free(); exit(1); } else if (ggml_mpi_rank(ctx->ctx_mpi) < 0) { From 4dc25d3f05a98083690320bc99ddf6d950f29459 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 9 Nov 2023 17:43:14 -0600 Subject: [PATCH 
24/63] Propagate exit to worker nodes --- ggml-mpi.c | 12 ++++++++++++ llama.cpp | 8 ++++++++ 2 files changed, 20 insertions(+) diff --git a/ggml-mpi.c b/ggml-mpi.c index 7c6900bbc3b..9e308ccd42d 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -20,6 +20,14 @@ struct ggml_mpi_context { MPI_Status status; }; +void ggml_mpi_sync_pipelined( + struct ggml_mpi_context * ctx_mpi, + void * val, + int count, + MPI_Datatype datatype, + int tag +); + void ggml_mpi_backend_init(void) { int ret; MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &ret); @@ -57,6 +65,10 @@ struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int } void ggml_mpi_free(struct ggml_mpi_context * ctx) { + if(ctx->comm == MPI_COMM_NULL) { + return; + } + ggml_mpi_sync_pipelined(ctx, NULL, 0, MPI_INT8_T, 6); MPI_Comm_free(&(ctx->comm)); free(ctx); } diff --git a/llama.cpp b/llama.cpp index 06a69498b7f..d715dd8d7db 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8741,6 +8741,9 @@ void llama_split_layers_weighted(struct llama_context * ctx, float device_weight } void llama_free(struct llama_context * ctx) { +#ifdef GGML_USE_MPI + ggml_mpi_free(ctx->ctx_mpi); +#endif delete ctx; } @@ -9583,6 +9586,11 @@ int llama_process_mpi_worker( case 5: llama_kv_cache_seq_shift(ctx, 0, 0, 0, 0); break; + case 6: + llama_free(ctx); + llama_backend_free(); + exit(0); + break; } return 0; } From ba31377a3efb31422e7eecf8ad384ac4ddde6e90 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 12 Nov 2023 17:49:06 -0600 Subject: [PATCH 25/63] Add async decoding --- examples/speculative/speculative.cpp | 40 +++- ggml-mpi.c | 213 +++++++++++++----- ggml-mpi.h | 13 +- llama.cpp | 324 ++++++++++++++++----------- llama.h | 9 +- 5 files changed, 404 insertions(+), 195 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index cb1ace01d06..908bcc93cfb 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -5,6 +5,8 @@ #include #include #include +#include + #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 @@ -22,6 +24,11 @@ struct seq_draft { struct llama_sampling_context * ctx_sampling; }; +struct seq_async_run { + struct ggml_cgraph * cgraph; + struct llama_batch batch; +}; + int main(int argc, char ** argv) { gpt_params params; @@ -192,6 +199,9 @@ int main(int argc, char ** argv) { llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + std::deque dft_cgraphs; + std::deque tgt_cgraphs; + const auto t_dec_start = ggml_time_us(); // sample from the last token of the prompt @@ -216,9 +226,15 @@ int main(int argc, char ** argv) { while (true) { LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); + if (!tgt_cgraphs.empty()) { + llama_finish_async_decode(*ctx_tgt, batch_tgt, tgt_cgraphs.back()); + tgt_cgraphs.pop_back(); + } + // sample from the target model llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + // Swap to pipeline roots llama_swap_comm(ctx_tgt); llama_sync_token(ctx_tgt, &id, 0); @@ -228,13 +244,18 @@ int main(int argc, char ** argv) { //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); const std::string token_str = llama_token_to_piece(ctx_tgt, id); + // Root of WORLD if (llama_node_id(ctx_tgt) == 0) { printf("%s", token_str.c_str()); fflush(stdout); } + // Switch back to target 
pipeline only llama_swap_comm(ctx_tgt); + // We can start the target pipeline now without needing to wait for speculation +// tgt_cgraphs.push_front(llama_start_async_decode(*ctx_tgt, batch_tgt)); + if (id == llama_token_eos(model_tgt)) { has_eos = true; } @@ -276,6 +297,7 @@ int main(int argc, char ** argv) { { LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + // Pipeline syncing cache ops llama_kv_cache_seq_keep(ctx_dft, s_keep); llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); llama_kv_cache_seq_keep(ctx_dft, 0); @@ -298,9 +320,13 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_dft); llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + // Pipeline sync on draft pipeline llama_kv_cache_seq_rm(ctx_dft, -1, n_past_dft, -1); - LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); - llama_decode (ctx_dft, batch_dft); + + // Kick off drafting pipeline but don't need it just yet + dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); + //llama_decode(ctx_dft, batch_dft); + // DON'T FORGET THE MATCHING DECODE WHEN NEEDED ++n_past_dft; @@ -327,6 +353,14 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + // We need the draft now, so wait for it + if (!dft_cgraphs.empty()) { + llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); + dft_cgraphs.pop_back(); + } + LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + + // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { batch_dft.n_tokens = 0; @@ -451,7 +485,7 @@ int main(int argc, char ** argv) { } // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); - llama_decode(ctx_tgt, batch_tgt); + tgt_cgraphs.push_front(llama_start_async_decode(*ctx_tgt, batch_tgt)); ++n_past_tgt; } diff --git a/ggml-mpi.c b/ggml-mpi.c index 9e308ccd42d..3cb80e203d7 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -6,6 +6,7 @@ #include #include +#include #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -18,6 +19,17 @@ struct ggml_mpi_context { int layer_start; int layer_end; MPI_Status status; + MPI_Request asyncSendRequest; + struct ggml_tensor * duped_send_tensor; + MPI_Request asyncRecvRequest; + struct ggml_tensor * duped_recv_tensor; + bool asyncSendWaiting; + bool asyncRecvWaiting; + struct ggml_cgraph * cgraph; + bool async; + bool running_decode; + bool res; + bool embed; }; void ggml_mpi_sync_pipelined( @@ -43,6 +55,10 @@ struct ggml_mpi_context * ggml_mpi_init(void) { MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); ctx->comm = MPI_COMM_WORLD; + ctx->asyncSendWaiting = false; + ctx->asyncRecvWaiting = false; + ctx->running_decode = false; + ctx->async = false; // ctx->status = *MPI_STATUS_IGNORE; return ctx; @@ -73,6 +89,18 @@ void ggml_mpi_free(struct ggml_mpi_context * ctx) { free(ctx); } +bool ggml_mpi_is_decoding(struct ggml_mpi_context * ctx_mpi) { + return ctx_mpi->running_decode; +} + +struct ggml_cgraph * ggml_mpi_get_cgraph(struct ggml_mpi_context * ctx_mpi) { + return ctx_mpi->cgraph; +} + +void ggml_mpi_set_cgraph(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph) { + ctx_mpi->cgraph = cgraph; +} + int ggml_mpi_rank(struct ggml_mpi_context * ctx) { return ctx->rank; } @@ -233,9 +261,19 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { return -1; } +struct ggml_tensor * ggml_mpi_dup_tensor(struct ggml_tensor * t) { + struct ggml_tensor * duped = malloc(sizeof(struct ggml_tensor)); + for (int i = 0; i < 4; i++) { + duped->ne[i] = t->ne[i]; + } + size_t data_size = ggml_element_size(t) * ggml_nelements(t); + duped->data = malloc(data_size); + memcpy(duped->data, t->data, data_size); + return duped; +} -static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst, MPI_Comm comm) { - if(comm == MPI_COMM_NULL) { +static void ggml_mpi_tensor_send(struct ggml_mpi_context * ctx_mpi, struct ggml_tensor * t, int mpi_rank_dst) { + if(ctx_mpi->comm == MPI_COMM_NULL) { return; } MPI_Datatype mpi_type; @@ -246,12 +284,51 @@ static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst, MPI_C default: GGML_ASSERT(false && "not implemented"); } - const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, comm); + if (ctx_mpi->asyncSendWaiting) { + MPI_Wait(&(ctx_mpi->asyncSendRequest), MPI_STATUS_IGNORE); + ctx_mpi->asyncSendWaiting = false; + free(ctx_mpi->duped_send_tensor->data); + free(ctx_mpi->duped_send_tensor); + } + ctx_mpi->duped_send_tensor = ggml_mpi_dup_tensor(t); + ctx_mpi->asyncSendWaiting = true; + + const int retval = MPI_Isend(ctx_mpi->duped_send_tensor->data, ggml_nelements(ctx_mpi->duped_send_tensor), mpi_type, mpi_rank_dst, 0, ctx_mpi->comm, &(ctx_mpi->asyncSendRequest)); + GGML_ASSERT(retval == MPI_SUCCESS); +} + +static void ggml_mpi_tensor_recv(struct ggml_mpi_context * ctx_mpi, struct ggml_tensor * t, int mpi_rank_src) { + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } + MPI_Datatype mpi_type; + + switch (t->type) { + case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break; + case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break; + default: GGML_ASSERT(false && "not implemented"); + } + const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, 0, ctx_mpi->comm, MPI_STATUS_IGNORE); GGML_ASSERT(retval == MPI_SUCCESS); } -static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_Comm comm) { - if(comm == MPI_COMM_NULL) { +void ggml_mpi_wait_recv(struct ggml_mpi_context * ctx_mpi) { + 
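+    // Completes any receive previously posted with MPI_Irecv (see
+    // ggml_mpi_async_tensor_recv below) so the destination tensor is safe to read;
+    // it is a no-op when this rank has a NULL communicator or no receive is pending.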
if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } + if (ctx_mpi->asyncRecvWaiting) { + MPI_Wait(&(ctx_mpi->asyncRecvRequest), MPI_STATUS_IGNORE); + ctx_mpi->asyncRecvWaiting = false; + } +} + +struct ggml_tensor * ggml_mpi_async_received_tensor(struct ggml_mpi_context * ctx_mpi) { + ggml_mpi_wait_recv(ctx_mpi); + return ctx_mpi->duped_recv_tensor; +} + +static void ggml_mpi_async_tensor_recv(struct ggml_mpi_context * ctx_mpi, struct ggml_tensor * t, int mpi_rank_src) { + if(ctx_mpi->comm == MPI_COMM_NULL) { return; } MPI_Datatype mpi_type; @@ -262,7 +339,11 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_C default: GGML_ASSERT(false && "not implemented"); } - const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, MPI_STATUS_IGNORE); + ggml_mpi_wait_recv(ctx_mpi); +// ctx_mpi->duped_recv_tensor = t; + ctx_mpi->asyncRecvWaiting = true; + const int retval = MPI_Irecv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, 0, ctx_mpi->comm, &(ctx_mpi->asyncRecvRequest)); + GGML_ASSERT(retval == MPI_SUCCESS); } @@ -330,11 +411,15 @@ void ggml_mpi_scatter_layers( fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end); } +void ggml_set_async(struct ggml_mpi_context * ctx_mpi, bool async) { + ctx_mpi->async = async; +} + // TODO: there are many improvements that can be done to this implementation -void ggml_mpi_graph_compute_pre( +void ggml_mpi_graph_creation_post( struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, - int n_layers) { + const int n_layers) { const int mpi_rank = ctx_mpi->rank; const int mpi_size = ctx_mpi->size; @@ -369,83 +454,105 @@ void ggml_mpi_graph_compute_pre( if (mpi_rank > 0) { if (mpi_rank == 1) { // the first node (1) receives the input tokens from the main node (0) - ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm); + ggml_mpi_tensor_recv(ctx_mpi, inp_tokens, 0); } else { // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) - ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); + ggml_mpi_tensor_recv(ctx_mpi, inp0, mpi_rank - 1); } } else if (mpi_size > 1) { // node 0 sends the input tokens to node 1 - ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm); - // recv the output data from the last node - ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm); + ggml_mpi_tensor_send(ctx_mpi, inp_tokens, 1); + ggml_mpi_tensor_recv(ctx_mpi, inp0, mpi_size - 1); } - { + //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; + const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1; - //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; + //const int il0 = (mpi_idx + 0) * n_per_node; + //const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); + int il0 = ctx_mpi->layer_start; + int il1 = MIN(n_layers, ctx_mpi->layer_end); - const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1; + char name_l0[GGML_MAX_NAME]; + char name_l1[GGML_MAX_NAME]; - //const int il0 = (mpi_idx + 0) * n_per_node; - //const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); - int il0 = ctx_mpi->layer_start; - int il1 = MIN(n_layers, ctx_mpi->layer_end); + snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0); + snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1); - char name_l0[GGML_MAX_NAME]; - char name_l1[GGML_MAX_NAME]; + const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0); + const int idx_l1 = mpi_rank > 0 ? 
ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes; - snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0); - snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1); + struct ggml_tensor *res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 2]; - const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0); - const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes; + if (idx_l0 < 0 || idx_l1 < 0) { + fprintf(stderr, "%s: layer input nodes not found\n", __func__); + return; + } - if (idx_l0 < 0 || idx_l1 < 0) { - fprintf(stderr, "%s: layer input nodes not found\n", __func__); - return; + // attach the input data to all nodes that need it + // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below) + for (int i = idx_l0; i < idx_l1; i++) { + if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) { + gf->nodes[i]->src[0] = inp0; } - - // attach the input data to all nodes that need it - // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below) - for (int i = idx_l0; i < idx_l1; i++) { - if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) { - gf->nodes[i]->src[0] = inp0; - } - if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) { - gf->nodes[i]->src[1] = inp0; - } + if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) { + gf->nodes[i]->src[1] = inp0; } + } + + // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph + for (int i = 1; i < idx_l1 - idx_l0; i++) { + gf->nodes[i] = gf->nodes[idx_l0 + i]; + gf->grads[i] = gf->grads[idx_l0 + i]; + } + + // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node + if (mpi_idx != 0) { + gf->nodes[0]->op = GGML_OP_NONE; + } + + gf->n_nodes = idx_l1 - idx_l0; - // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph - for (int i = 1; i < idx_l1 - idx_l0; i++) { - gf->nodes[i] = gf->nodes[idx_l0 + i]; - gf->grads[i] = gf->grads[idx_l0 + i]; - } - // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node - if (mpi_idx != 0) { - gf->nodes[0]->op = GGML_OP_NONE; +} + +void ggml_mpi_graph_compute_pre(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf) { + const int mpi_rank = ctx_mpi->rank; + const int mpi_size = ctx_mpi->size; + + struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens"); + if (inp_tokens == NULL) { + fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__); + return; + } + + struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0"); + if (inp0 == NULL) { + fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); + return; + } + + GGML_ASSERT(inp0 == gf->nodes[0]); + { + if (mpi_rank == 0 && mpi_size > 1) { +// ggml_mpi_wait_recv(ctx_mpi); } - gf->n_nodes = idx_l1 - idx_l0; } } void ggml_mpi_graph_compute_post( struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf, - int n_layers) { - UNUSED(n_layers); + struct ggml_cgraph * gf) { const int mpi_rank = ctx_mpi->rank; - const int mpi_size = ctx_mpi->size; // send the output data to the next node if (mpi_rank > 0) { - ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], ggml_mpi_next_node(ctx_mpi), ctx_mpi->comm); + ggml_mpi_tensor_send(ctx_mpi, gf->nodes[gf->n_nodes - 1], ggml_mpi_next_node(ctx_mpi)); } } diff --git a/ggml-mpi.h b/ggml-mpi.h index f843165e997..42735bb5366 100644 --- a/ggml-mpi.h +++ 
b/ggml-mpi.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include struct ggml_context; struct ggml_tensor; @@ -32,6 +33,12 @@ struct ggml_mpi_context; */ void ggml_mpi_backend_init(void); +bool ggml_mpi_is_decoding(struct ggml_mpi_context * ctx_mpi); + +void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers); + +void ggml_mpi_wait_recv(struct ggml_mpi_context * ctx_mpi); + /** * Frees the MPI backend, must be called only once at termination * of the program. No MPI operations may be completed after calling this function, @@ -199,8 +206,7 @@ void ggml_mpi_scatter_layers( */ void ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf, - int n_layers); + struct ggml_cgraph * gf); /** * Sends the output tensor to the next node for processing @@ -212,8 +218,7 @@ void ggml_mpi_graph_compute_pre( */ void ggml_mpi_graph_compute_post( struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf, - int n_layers); + struct ggml_cgraph * gf); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index d715dd8d7db..7695bca7efc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5420,22 +5420,17 @@ static struct ggml_cgraph * llama_build_graph( return result; } -// decode a batch of tokens by evaluating the transformer -// -// - lctx: llama context -// - batch: batch to evaluate -// -// return 0 on success -// return positive int on warning -// return negative int on error -// -static int llama_decode_internal( - llama_context & lctx, - llama_batch batch) { + + +static struct ggml_cgraph * llama_decode_internal_phased( + llama_context & lctx, + llama_batch batch, + uint8_t phase, + ggml_cgraph * cgraph) { uint32_t n_tokens = batch.n_tokens; if (n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); - return -1; + return nullptr; } const auto & model = lctx.model; @@ -5492,74 +5487,92 @@ static int llama_decode_internal( batch.seq_id = seq_id_arr.data(); } - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*n_tokens) { - kv_self.head = 0; - } + if (phase == 0) { + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*n_tokens) { + kv_self.head = 0; + } #ifdef GGML_USE_MPI - // TODO: needs fix after #3228 - ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits)); - n_tokens = batch.n_tokens; + // TODO: needs fix after #3228 + ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits)); + n_tokens = batch.n_tokens; #endif - if (!llama_kv_cache_find_slot(kv_self, batch)) { - return 1; - } + if (!llama_kv_cache_find_slot(kv_self, batch)) { + printf("Cannot find cache slot\n"); + return nullptr; + } - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA? 
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self))); + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA? + kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self))); - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_allocr_reset(lctx.alloc); - ggml_cgraph * gf = llama_build_graph(lctx, batch); + ggml_allocr_reset(lctx.alloc); + ggml_cgraph * gf = llama_build_graph(lctx, batch); - ggml_allocr_alloc_graph(lctx.alloc, gf); + ggml_allocr_alloc_graph(lctx.alloc, gf); - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + struct ggml_tensor *res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 2]; - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + GGML_ASSERT(strcmp(res->name, "result_output") == 0); + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); +#ifdef GGML_USE_MPI + const int64_t n_layer = hparams.n_layer; + ggml_mpi_graph_creation_post(lctx.ctx_mpi, gf, n_layer); +#endif #ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); - ggml_cuda_copy_to_device(node); + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + ggml_cuda_copy_to_device(node); + } } - } - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + } } - } - // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed - if (!lctx.embedding.empty()) { - embeddings->backend = GGML_BACKEND_CPU; - } - res->backend = GGML_BACKEND_CPU; + // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed + if (!lctx.embedding.empty()) { + embeddings->backend = GGML_BACKEND_CPU; + } + res->backend = GGML_BACKEND_CPU; #endif + return gf; + } else if (phase == 1) { - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - // TODO: this is 
mostly important for Apple Silicon where CBLAS is still performing very well - // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering - // with the BLAS calls. need a better solution - if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - n_threads = std::min(4, n_threads); - } + ggml_cgraph * gf = cgraph; + struct ggml_tensor *res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 2]; + +#ifdef GGML_USE_MPI + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf); +#endif + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well + // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering + // with the BLAS calls. need a better solution + if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + n_threads = std::min(4, n_threads); + } // If all tensors can be run on the GPU then using more than 1 thread is detrimental. const bool full_offload_supported = @@ -5571,107 +5584,150 @@ static int llama_decode_internal( model.arch == LLM_ARCH_STARCODER || model.arch == LLM_ARCH_STABLELM; - const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; - if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { - n_threads = 1; - } -#if GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; + if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { + n_threads = 1; + } #ifdef GGML_USE_METAL - if (lctx.ctx_metal) { - ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); - ggml_metal_graph_compute(lctx.ctx_metal, gf); - } else { - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); - } + if (lctx.ctx_metal) { + ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); + ggml_metal_graph_compute(lctx.ctx_metal, gf); + } else { + ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + } #else - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); #endif #if GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); + ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf); #endif - // update the kv ring buffer - { - if (kv_self.has_shift) { - kv_self.has_shift = false; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + // update the kv ring buffer + { + if (kv_self.has_shift) { + kv_self.has_shift = false; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } } - } - kv_self.head += n_tokens; + kv_self.head += n_tokens; - // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; + // Ensure kv cache head points to a valid index. 
+ if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } } - } #ifdef GGML_PERF - // print timing information per ggml operation (for debugging purposes) - // requires GGML_PERF to be defined - ggml_graph_print(gf); + // print timing information per ggml operation (for debugging purposes) + // requires GGML_PERF to be defined + ggml_graph_print(gf); #endif - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} - // extract logits - // TODO: do not compute and extract logits if only embeddings are needed - // need to update the graphs to skip "result_output" - { - auto & logits_out = lctx.logits; + // extract logits + // TODO: do not compute and extract logits if only embeddings are needed + // need to update the graphs to skip "result_output" + { + auto & logits_out = lctx.logits; - if (batch.logits) { - logits_out.resize(n_vocab * n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - if (batch.logits[i] == 0) { - continue; + if (batch.logits) { + logits_out.resize(n_vocab * n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + if (batch.logits[i] == 0) { + continue; + } + memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); } - memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); + } else if (lctx.logits_all) { + logits_out.resize(n_vocab * n_tokens); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); + } else { + logits_out.resize(n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); } - } else if (lctx.logits_all) { - logits_out.resize(n_vocab * n_tokens); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); - } else { - logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); } - } - // extract embeddings - if (!lctx.embedding.empty()) { - auto & embedding_out = lctx.embedding; + // extract embeddings + if (!lctx.embedding.empty()) { + auto & embedding_out = lctx.embedding; - embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); - } + embedding_out.resize(n_embd); + memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); + } + + // measure the performance only for the single-token evals + if (n_tokens == 1) { + lctx.t_eval_us += ggml_time_us() - t_start_us; + lctx.n_eval++; + } + else if (n_tokens > 1) { + lctx.t_p_eval_us += ggml_time_us() - t_start_us; + lctx.n_p_eval += n_tokens; + } + + // get a more accurate load time, upon first eval + // TODO: fix this + if (!lctx.has_evaluated_once) { + lctx.t_load_us = ggml_time_us() - lctx.t_start_us; + lctx.has_evaluated_once = true; + } + return gf; - // measure the performance only for the single-token evals - if (n_tokens == 1) { - lctx.t_eval_us += ggml_time_us() - t_start_us; - lctx.n_eval++; } - else if (n_tokens > 1) { - lctx.t_p_eval_us += ggml_time_us() - t_start_us; - lctx.n_p_eval += n_tokens; + return nullptr; +} + +// decode a batch of tokens by evaluating the transformer +// +// - lctx: llama 
context +// - batch: batch to evaluate +// +// return 0 on success +// return positive int on warning +// return negative int on error +// +static int llama_decode_internal( + llama_context & lctx, + llama_batch batch) { + struct ggml_cgraph * gf = llama_decode_internal_phased(lctx, batch, 0, nullptr); + if (gf != nullptr) { + return llama_decode_internal_phased(lctx, batch, 1, gf) != nullptr; + } else { + printf("Graph is null\n"); + return -1; } +} + +struct ggml_cgraph * llama_start_async_decode( + llama_context & lctx, + llama_batch batch) { + return llama_decode_internal_phased(lctx, batch, 0, nullptr); + +} + +int llama_finish_async_decode( + struct llama_context & lctx, + struct llama_batch batch, + struct ggml_cgraph * cgraph) { - // get a more accurate load time, upon first eval - // TODO: fix this - if (!lctx.has_evaluated_once) { - lctx.t_load_us = ggml_time_us() - lctx.t_start_us; - lctx.has_evaluated_once = true; + int ret; + if (cgraph != nullptr) { + + ret = llama_decode_internal_phased(lctx, batch, 1, cgraph) != nullptr; + } else { + ret = -1; } - return 0; + return ret; + } // diff --git a/llama.h b/llama.h index 8865f158aba..9f5c81615d7 100644 --- a/llama.h +++ b/llama.h @@ -276,7 +276,14 @@ extern "C" { LLAMA_API void llama_swap_comm(struct llama_context * ctx); - LLAMA_API void llama_sync_token(struct llama_context * ctx, llama_token * token, int root);\ + LLAMA_API void llama_sync_token(struct llama_context * ctx, llama_token * token, int root); + + LLAMA_API struct ggml_cgraph * llama_start_async_decode(struct llama_context & lctx, + struct llama_batch batch); + + LLAMA_API int llama_finish_async_decode(struct llama_context & lctx, + struct llama_batch batch, + struct ggml_cgraph * cgraph); LLAMA_API void llama_sync_token_data(struct llama_context * ctx, llama_token_data * data, int root); From 1b6f75dbe4e5947ab1c3c72a06e2b388af2b8b71 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 12 Nov 2023 18:15:06 -0600 Subject: [PATCH 26/63] Fix draft nodes accidentally running target --- examples/speculative/speculative.cpp | 4 +++- ggml-mpi.c | 5 +++-- ggml-mpi.h | 2 +- llama.cpp | 4 +++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 908bcc93cfb..61caa93f217 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -376,6 +376,7 @@ int main(int argc, char ** argv) { llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + // Swap back to pipeline roots llama_swap_comm(ctx_dft); llama_sync_token(ctx_dft, &(drafts[s].i_batch_dft), 1); @@ -387,6 +388,7 @@ int main(int argc, char ** argv) { k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); } + // Back to draft pipeline only llama_swap_comm(ctx_dft); @@ -489,7 +491,7 @@ int main(int argc, char ** argv) { ++n_past_tgt; } - // the first token is always proposed by the traget model before the speculation loop so we erase it here + // the first token is always proposed by the target model before the speculation loop so we erase it here for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { continue; diff --git a/ggml-mpi.c b/ggml-mpi.c index 3cb80e203d7..c957e2b072c 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -143,7 +143,7 @@ void ggml_mpi_sync_pipelined( } } -void ggml_mpi_eval_init( +bool ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int32_t * n_tokens, int32_t ** pos, @@ -151,7 +151,7 @@ void 
ggml_mpi_eval_init( int32_t *** seq_id, int8_t ** logits) { if(ctx_mpi->comm == MPI_COMM_NULL) { - return; + return false; } int32_t old_n_tokens = *n_tokens; @@ -211,6 +211,7 @@ void ggml_mpi_eval_init( free(flattened_seq_ids); //free(*seq_id); // <- something is still holding onto this, need to investigate *seq_id = new_seq_id; + return true; } void ggml_mpi_sync_ints_pipelined( diff --git a/ggml-mpi.h b/ggml-mpi.h index 42735bb5366..5f7c5584fca 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -137,7 +137,7 @@ size_t ggml_mpi_size(struct ggml_mpi_context * ctx); * @param seq_id A pointer to the seq_id 2D array, which will be synchronized after this function. * @param logits A pointer to the logits array, which is unused currently since only node 0 needs them. */ -void ggml_mpi_eval_init( +bool ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int32_t * n_tokens, int32_t ** pos, diff --git a/llama.cpp b/llama.cpp index 7695bca7efc..fe4340a3596 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5497,7 +5497,9 @@ static struct ggml_cgraph * llama_decode_internal_phased( #ifdef GGML_USE_MPI // TODO: needs fix after #3228 - ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits)); + if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits))) { + return nullptr; + } n_tokens = batch.n_tokens; #endif if (!llama_kv_cache_find_slot(kv_self, batch)) { From 71c69473b89b4248ca82fd6db9e853ed6a20d51b Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 12 Nov 2023 20:15:27 -0600 Subject: [PATCH 27/63] Re-enable async tensor send --- ggml-mpi.c | 18 ++++++++++-------- ggml-mpi.h | 2 +- llama.cpp | 4 +++- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index c957e2b072c..964100d53a9 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -412,9 +412,6 @@ void ggml_mpi_scatter_layers( fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end); } -void ggml_set_async(struct ggml_mpi_context * ctx_mpi, bool async) { - ctx_mpi->async = async; -} // TODO: there are many improvements that can be done to this implementation void ggml_mpi_graph_creation_post( @@ -464,7 +461,7 @@ void ggml_mpi_graph_creation_post( // node 0 sends the input tokens to node 1 // recv the output data from the last node ggml_mpi_tensor_send(ctx_mpi, inp_tokens, 1); - ggml_mpi_tensor_recv(ctx_mpi, inp0, mpi_size - 1); + ggml_mpi_async_tensor_recv(ctx_mpi, inp0, mpi_size - 1); } //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; @@ -520,30 +517,35 @@ void ggml_mpi_graph_creation_post( } -void ggml_mpi_graph_compute_pre(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf) { +bool ggml_mpi_graph_compute_pre(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf) { + if (ctx_mpi->comm == MPI_COMM_NULL) { + return false; + } + const int mpi_rank = ctx_mpi->rank; const int mpi_size = ctx_mpi->size; struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens"); if (inp_tokens == NULL) { fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__); - return; + return false; } struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0"); if (inp0 == NULL) { fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); - return; + return false; } GGML_ASSERT(inp0 == gf->nodes[0]); { if (mpi_rank == 0 && mpi_size > 1) { -// ggml_mpi_wait_recv(ctx_mpi); + ggml_mpi_wait_recv(ctx_mpi); } } + return true; } 
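The async-send and async-recv hunks above replace the blocking MPI_Send/MPI_Recv pair between pipeline stages with a non-blocking hand-off: the sender copies the tensor data and posts MPI_Isend so it can keep computing, while the receiver posts MPI_Irecv early and only waits (via ggml_mpi_wait_recv) right before the data is needed in ggml_mpi_graph_compute_pre. A minimal sketch of that pattern, assuming a plain two-rank MPI program rather than the actual ggml-mpi context (buffer sizes, tags, and names here are illustrative only, not part of ggml-mpi):

```c
// Non-blocking tensor hand-off sketch: Isend from a duplicated buffer on the sender,
// Irecv posted early on the receiver, MPI_Wait deferred until the data is needed.
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define N 8

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    float data[N];
    MPI_Request req;

    if (rank == 0 && size > 1) {
        // sender: duplicate the buffer so the original can be reused immediately,
        // then send without blocking (mirrors ggml_mpi_tensor_send + ggml_mpi_dup_tensor)
        for (int i = 0; i < N; ++i) data[i] = (float) i;
        float * dup = malloc(sizeof(data));
        memcpy(dup, data, sizeof(data));
        MPI_Isend(dup, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &req);

        // ... overlap other work here while the send is in flight ...

        MPI_Wait(&req, MPI_STATUS_IGNORE);   // the copy may only be freed after completion
        free(dup);
    } else if (rank == 1) {
        // receiver: post the receive early (as ggml_mpi_async_tensor_recv does), then
        // wait only when the tensor is actually needed (as ggml_mpi_wait_recv does)
        MPI_Irecv(data, N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &req);

        // ... build/prepare the compute graph here ...

        MPI_Wait(&req, MPI_STATUS_IGNORE);
        printf("rank 1 received, data[N-1] = %f\n", data[N - 1]);
    }

    MPI_Finalize();
    return 0;
}
```

Under these assumptions it builds and runs with the usual MPI toolchain, e.g. `mpicc handoff.c -o handoff && mpirun -np 2 ./handoff`.
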
void ggml_mpi_graph_compute_post( diff --git a/ggml-mpi.h b/ggml-mpi.h index 5f7c5584fca..df9c10366b8 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -204,7 +204,7 @@ void ggml_mpi_scatter_layers( * @param gf The compute graph to modify * @param n_layers The number of layers in the model, used as an upper bound in the layer ranges. */ -void ggml_mpi_graph_compute_pre( +bool ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf); diff --git a/llama.cpp b/llama.cpp index fe4340a3596..ecb1d8598c0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5562,7 +5562,9 @@ static struct ggml_cgraph * llama_decode_internal_phased( struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 2]; #ifdef GGML_USE_MPI - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf); + if (!ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf)) { + return nullptr; + } #endif // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); From d73f944eaf03552b4e2137fe2c674c95e7041c8d Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 11:43:58 -0600 Subject: [PATCH 28/63] Begin work on decoupling tgt and dft pipelines --- examples/speculative/speculative.cpp | 72 ++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 61caa93f217..fc4c88aa7d2 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 @@ -27,6 +28,12 @@ struct seq_draft { struct seq_async_run { struct ggml_cgraph * cgraph; struct llama_batch batch; + int n_past_tgt; + int n_past_dft; + int s_keep; + std::vector drafts; + llama_sampling_context * ctx_sampling; + uint8_t * state; }; int main(int argc, char ** argv) { @@ -200,7 +207,7 @@ int main(int argc, char ** argv) { llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); std::deque dft_cgraphs; - std::deque tgt_cgraphs; + std::deque tgt_cgraphs; const auto t_dec_start = ggml_time_us(); @@ -223,13 +230,23 @@ int main(int argc, char ** argv) { int i_dft = 0; int s_keep = 0; + if (!tgt_cgraphs.empty()) { + struct seq_async_run run = tgt_cgraphs.back(); +// llama_set_state_data(ctx_tgt, run.state); +// drafts = run.drafts; + struct ggml_cgraph * cgraph = run.cgraph; +// batch_tgt = run.batch; + n_past_tgt = run.n_past_tgt; +// n_past_dft = run.n_past_dft; +// s_keep = run.s_keep; + llama_finish_async_decode(*ctx_tgt, batch_tgt, cgraph); + tgt_cgraphs.pop_back(); + } + while (true) { LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - if (!tgt_cgraphs.empty()) { - llama_finish_async_decode(*ctx_tgt, batch_tgt, tgt_cgraphs.back()); - tgt_cgraphs.pop_back(); - } + // sample from the target model llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); @@ -308,6 +325,10 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_keep(ctx_tgt, 0); } + llama_batch_clear(batch_tgt); + llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); + + for (int s = 0; s < n_seq_dft; ++s) { drafts[s].active = false; drafts[s].tokens.clear(); @@ -319,8 +340,11 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_dft); llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + // batch_dft.n_tokens == 1 now // Pipeline sync on draft pipeline + + // 
Remove all tokens from all sequences after n_past_dft llama_kv_cache_seq_rm(ctx_dft, -1, n_past_dft, -1); // Kick off drafting pipeline but don't need it just yet @@ -350,8 +374,7 @@ int main(int argc, char ** argv) { drafts[0].drafting = true; drafts[0].i_batch_dft = 0; - llama_batch_clear(batch_tgt); - llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + // We need the draft now, so wait for it if (!dft_cgraphs.empty()) { @@ -486,12 +509,41 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); } - // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); - tgt_cgraphs.push_front(llama_start_async_decode(*ctx_tgt, batch_tgt)); ++n_past_tgt; + + + // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); + struct seq_async_run run; + run.state = (uint8_t *)malloc(llama_get_state_size(ctx_tgt)); + llama_copy_state_data(ctx_tgt, run.state); + run.s_keep = s_keep; + run.drafts = drafts; + run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); + run.n_past_tgt = n_past_tgt; + run.n_past_dft = n_past_dft; + run.batch = batch_tgt; + tgt_cgraphs.push_front(run); + } - // the first token is always proposed by the target model before the speculation loop so we erase it here +// llama_kv_cache_seq_keep(ctx_tgt, 0); +// for (int s = 1; s < n_seq_dft; ++s) { +// llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); +// } + + // We can start the target pipeline now without needing to wait for speculation +// struct seq_async_run run; +// run.state = (uint8_t *)malloc(llama_get_state_size(ctx_tgt)); +// llama_copy_state_data(ctx_tgt, run.state); +// run.s_keep = s_keep; +// run.drafts = drafts; +// run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); +// run.n_past_tgt = n_past_tgt; +// run.n_past_dft = n_past_dft; +// run.batch = batch_tgt; +// +// tgt_cgraphs.push_front(run); + for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { continue; From 4aa9b6c817d1745a7c80f8e3e4a92b06db0e1e23 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 12:09:21 -0600 Subject: [PATCH 29/63] Only sync required token data --- examples/speculative/speculative.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index fc4c88aa7d2..f21261a526d 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -327,6 +327,7 @@ int main(int argc, char ** argv) { llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); + // batch_tgt.n_tokens = 1 for (int s = 0; s < n_seq_dft; ++s) { @@ -405,14 +406,16 @@ int main(int argc, char ** argv) { auto & cur_p = drafts[s].ctx_sampling->cur; + llama_sync_token_data(ctx_dft, &(cur_p[0]), 1); + for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { - llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); } - // Back to draft pipeline only - llama_swap_comm(ctx_dft); + + + if (cur_p[0].p < p_accept) { @@ -421,6 +424,14 @@ int main(int argc, char ** argv) { continue; } + // TODO investigate potential bottleneck + for (int k = 1; k < 8; ++k) { + llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); + } + + // Back to draft pipeline only + llama_swap_comm(ctx_dft); + std::vector sa(1, s); // attempt to split the branch if the 
probability is high enough From 9b67f73bac0eba756bcc656b2c7c715c9daa2dde Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 15:23:55 -0600 Subject: [PATCH 30/63] Working additional run w/ reset --- examples/speculative/speculative.cpp | 49 +++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index f21261a526d..1daaaeaced7 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -204,7 +204,7 @@ int main(int argc, char ** argv) { } llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); + llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft + 1); std::deque dft_cgraphs; std::deque tgt_cgraphs; @@ -237,7 +237,7 @@ int main(int argc, char ** argv) { struct ggml_cgraph * cgraph = run.cgraph; // batch_tgt = run.batch; n_past_tgt = run.n_past_tgt; -// n_past_dft = run.n_past_dft; + n_past_dft = run.n_past_dft; // s_keep = run.s_keep; llama_finish_async_decode(*ctx_tgt, batch_tgt, cgraph); tgt_cgraphs.pop_back(); @@ -253,6 +253,8 @@ int main(int argc, char ** argv) { // Swap to pipeline roots llama_swap_comm(ctx_tgt); + LOG("Swapped comm to pipeline roots, id %d\n", llama_node_id(ctx_tgt)); + llama_sync_token(ctx_tgt, &id, 0); @@ -269,6 +271,8 @@ int main(int argc, char ** argv) { // Switch back to target pipeline only llama_swap_comm(ctx_tgt); + LOG("Swapped comm to target only, id %d\n", llama_node_id(ctx_tgt)); + // We can start the target pipeline now without needing to wait for speculation // tgt_cgraphs.push_front(llama_start_async_decode(*ctx_tgt, batch_tgt)); @@ -326,9 +330,31 @@ int main(int argc, char ** argv) { } llama_batch_clear(batch_tgt); - llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); + llama_batch_add (batch_tgt, id, n_past_tgt, { n_seq_dft+1 }, true); // batch_tgt.n_tokens = 1 +// llama_kv_cache_seq_rm (ctx_tgt, n_seq_dft+1, -1, -1); + llama_kv_cache_seq_cp (ctx_tgt, 0, n_seq_dft+1, -1, -1); + llama_kv_cache_seq_keep(ctx_tgt, n_seq_dft+1); // NEEDED for some reason + + + struct seq_async_run run; + run.s_keep = s_keep; + run.drafts = drafts; + run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); + llama_finish_async_decode(*ctx_tgt, batch_tgt, run.cgraph); + run.n_past_tgt = n_past_tgt; + run.n_past_dft = n_past_dft; + run.batch = batch_tgt; +// tgt_cgraphs.push_front(run); + + llama_kv_cache_seq_rm (ctx_tgt, n_seq_dft+1, n_past_tgt, -1); + + llama_kv_cache_seq_cp (ctx_tgt, n_seq_dft+1, 0, -1, -1); + llama_kv_cache_seq_keep(ctx_tgt, 0); + + llama_batch_clear(batch_tgt); + llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); for (int s = 0; s < n_seq_dft; ++s) { drafts[s].active = false; @@ -402,11 +428,21 @@ int main(int argc, char ** argv) { // Swap back to pipeline roots llama_swap_comm(ctx_dft); + LOG("Swapped comm to pipeline roots, id %d\n", llama_node_id(ctx_dft)); + llama_sync_token(ctx_dft, &(drafts[s].i_batch_dft), 1); auto & cur_p = drafts[s].ctx_sampling->cur; llama_sync_token_data(ctx_dft, &(cur_p[0]), 1); + // TODO investigate potential bottleneck + for (int k = 1; k < 8; ++k) { + llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); + } + + // Back to draft pipeline only + llama_swap_comm(ctx_dft); + LOG("Swapped comm to draft only, id %d\n", llama_node_id(ctx_dft)); for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { LOG(" - draft candidate %3d for seq %3d, pos %3d: 
%6d (%8.3f) '%s'\n", @@ -424,13 +460,8 @@ int main(int argc, char ** argv) { continue; } - // TODO investigate potential bottleneck - for (int k = 1; k < 8; ++k) { - llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); - } - // Back to draft pipeline only - llama_swap_comm(ctx_dft); + std::vector sa(1, s); From 802ab554545f48abacfe7ba86af7bd952a94b9aa Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 19:04:56 -0600 Subject: [PATCH 31/63] Fix hang due to early return --- llama.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index ecb1d8598c0..cdf4a6f8a11 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9054,9 +9054,6 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam } void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } #ifdef GGML_USE_MPI int32_t vals[4] = {seq_id_src, seq_id_dst, p0, p1}; ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 4, 3); @@ -9065,6 +9062,10 @@ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, p0 = vals[2]; p1 = vals[3]; #endif + if (seq_id_src == seq_id_dst) { + return; + } + llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); } From b2b4033d5fac9febef8e1dcacdcaac3479b45a29 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 19:32:45 -0600 Subject: [PATCH 32/63] Run pipeline in parallel --- examples/speculative/speculative.cpp | 47 +++++++++++++++------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 1daaaeaced7..643518f9e9e 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -37,6 +37,7 @@ struct seq_async_run { }; int main(int argc, char ** argv) { + bool should_run_async = true; gpt_params params; if (gpt_params_parse(argc, argv, params) == false) { @@ -236,7 +237,7 @@ int main(int argc, char ** argv) { // drafts = run.drafts; struct ggml_cgraph * cgraph = run.cgraph; // batch_tgt = run.batch; - n_past_tgt = run.n_past_tgt; +// n_past_tgt = run.n_past_tgt; n_past_dft = run.n_past_dft; // s_keep = run.s_keep; llama_finish_async_decode(*ctx_tgt, batch_tgt, cgraph); @@ -329,29 +330,33 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_keep(ctx_tgt, 0); } - llama_batch_clear(batch_tgt); - llama_batch_add (batch_tgt, id, n_past_tgt, { n_seq_dft+1 }, true); - // batch_tgt.n_tokens = 1 + if (should_run_async) { + llama_batch_clear(batch_tgt); + llama_batch_add(batch_tgt, id, n_past_tgt, {0}, true); + // batch_tgt.n_tokens = 1 // llama_kv_cache_seq_rm (ctx_tgt, n_seq_dft+1, -1, -1); - llama_kv_cache_seq_cp (ctx_tgt, 0, n_seq_dft+1, -1, -1); - llama_kv_cache_seq_keep(ctx_tgt, n_seq_dft+1); // NEEDED for some reason - - - struct seq_async_run run; - run.s_keep = s_keep; - run.drafts = drafts; - run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); - llama_finish_async_decode(*ctx_tgt, batch_tgt, run.cgraph); - run.n_past_tgt = n_past_tgt; - run.n_past_dft = n_past_dft; - run.batch = batch_tgt; -// tgt_cgraphs.push_front(run); - - llama_kv_cache_seq_rm (ctx_tgt, n_seq_dft+1, n_past_tgt, -1); +// llama_kv_cache_seq_cp (ctx_tgt, 0, n_seq_dft+1, -1, -1); +// llama_kv_cache_seq_keep(ctx_tgt, n_seq_dft+1); // NEEDED for some reason + + n_past_tgt++; + struct seq_async_run run; + run.s_keep = s_keep; + run.drafts = drafts; + run.cgraph = 
llama_start_async_decode(*ctx_tgt, batch_tgt); +// llama_finish_async_decode(*ctx_tgt, batch_tgt, run.cgraph); + run.n_past_tgt = n_past_tgt; + run.n_past_dft = n_past_dft; + run.batch = batch_tgt; + tgt_cgraphs.push_front(run); + n_past_tgt--; + llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_past_tgt + 1); + +// llama_kv_cache_seq_cp (ctx_tgt, n_seq_dft+1, 0, -1, -1); +// llama_kv_cache_seq_keep(ctx_tgt, 0); + } - llama_kv_cache_seq_cp (ctx_tgt, n_seq_dft+1, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_tgt, 0); + should_run_async = !should_run_async; llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); From a1a9f05463f5715cd8db9fe076bd94fa2201b6a1 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 20:17:53 -0600 Subject: [PATCH 33/63] Fix memory leak, remove unneeded fields --- examples/speculative/speculative.cpp | 60 +--------------------------- 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 643518f9e9e..95182a201f5 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -27,13 +27,6 @@ struct seq_draft { struct seq_async_run { struct ggml_cgraph * cgraph; - struct llama_batch batch; - int n_past_tgt; - int n_past_dft; - int s_keep; - std::vector drafts; - llama_sampling_context * ctx_sampling; - uint8_t * state; }; int main(int argc, char ** argv) { @@ -233,13 +226,7 @@ int main(int argc, char ** argv) { if (!tgt_cgraphs.empty()) { struct seq_async_run run = tgt_cgraphs.back(); -// llama_set_state_data(ctx_tgt, run.state); -// drafts = run.drafts; struct ggml_cgraph * cgraph = run.cgraph; -// batch_tgt = run.batch; -// n_past_tgt = run.n_past_tgt; - n_past_dft = run.n_past_dft; -// s_keep = run.s_keep; llama_finish_async_decode(*ctx_tgt, batch_tgt, cgraph); tgt_cgraphs.pop_back(); } @@ -275,9 +262,6 @@ int main(int argc, char ** argv) { LOG("Swapped comm to target only, id %d\n", llama_node_id(ctx_tgt)); - // We can start the target pipeline now without needing to wait for speculation -// tgt_cgraphs.push_front(llama_start_async_decode(*ctx_tgt, batch_tgt)); - if (id == llama_token_eos(model_tgt)) { has_eos = true; } @@ -335,25 +319,12 @@ int main(int argc, char ** argv) { llama_batch_add(batch_tgt, id, n_past_tgt, {0}, true); // batch_tgt.n_tokens = 1 -// llama_kv_cache_seq_rm (ctx_tgt, n_seq_dft+1, -1, -1); -// llama_kv_cache_seq_cp (ctx_tgt, 0, n_seq_dft+1, -1, -1); -// llama_kv_cache_seq_keep(ctx_tgt, n_seq_dft+1); // NEEDED for some reason - n_past_tgt++; struct seq_async_run run; - run.s_keep = s_keep; - run.drafts = drafts; run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); -// llama_finish_async_decode(*ctx_tgt, batch_tgt, run.cgraph); - run.n_past_tgt = n_past_tgt; - run.n_past_dft = n_past_dft; - run.batch = batch_tgt; tgt_cgraphs.push_front(run); - n_past_tgt--; llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_past_tgt + 1); -// llama_kv_cache_seq_cp (ctx_tgt, n_seq_dft+1, 0, -1, -1); -// llama_kv_cache_seq_keep(ctx_tgt, 0); } should_run_async = !should_run_async; @@ -455,10 +426,6 @@ int main(int argc, char ** argv) { } - - - - if (cur_p[0].p < p_accept) { LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); drafts[s].drafting = false; @@ -467,7 +434,6 @@ int main(int argc, char ** argv) { - std::vector sa(1, s); // attempt to split the branch if the probability is high enough @@ -561,35 +527,11 @@ int main(int argc, char ** argv) { // 
LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); struct seq_async_run run; - run.state = (uint8_t *)malloc(llama_get_state_size(ctx_tgt)); - llama_copy_state_data(ctx_tgt, run.state); - run.s_keep = s_keep; - run.drafts = drafts; run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); - run.n_past_tgt = n_past_tgt; - run.n_past_dft = n_past_dft; - run.batch = batch_tgt; tgt_cgraphs.push_front(run); } - -// llama_kv_cache_seq_keep(ctx_tgt, 0); -// for (int s = 1; s < n_seq_dft; ++s) { -// llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); -// } - - // We can start the target pipeline now without needing to wait for speculation -// struct seq_async_run run; -// run.state = (uint8_t *)malloc(llama_get_state_size(ctx_tgt)); -// llama_copy_state_data(ctx_tgt, run.state); -// run.s_keep = s_keep; -// run.drafts = drafts; -// run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); -// run.n_past_tgt = n_past_tgt; -// run.n_past_dft = n_past_dft; -// run.batch = batch_tgt; -// -// tgt_cgraphs.push_front(run); + for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { From 86a932d842c8098ac0c6941ad2d33e3217a172f4 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 21:02:23 -0600 Subject: [PATCH 34/63] Clean up output a bit --- examples/speculative/speculative.cpp | 29 +++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 95182a201f5..7d93226f4a3 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -151,10 +151,14 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); - for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str()); + if (llama_node_id(ctx_tgt) == 0) { + for (auto id : inp) { + fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str()); + } } + + fflush(stderr); const int n_input = inp.size(); @@ -250,9 +254,10 @@ int main(int argc, char ** argv) { //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); - const std::string token_str = llama_token_to_piece(ctx_tgt, id); // Root of WORLD + std::string token_str; if (llama_node_id(ctx_tgt) == 0) { + std::string token_str = llama_token_to_piece(ctx_tgt, id); printf("%s", token_str.c_str()); fflush(stdout); } @@ -297,7 +302,12 @@ int main(int argc, char ** argv) { } } - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + + if (llama_node_id(ctx_tgt) < 0) { + LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + + } + // TODO: simplify { @@ -420,12 +430,17 @@ int main(int argc, char ** argv) { llama_swap_comm(ctx_dft); LOG("Swapped comm to draft only, id %d\n", llama_node_id(ctx_dft)); - for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", + + if (llama_node_id(ctx_dft) >= 0) { + for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { + LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + } } + + if (cur_p[0].p < p_accept) { LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); drafts[s].drafting = false; @@ -531,7 +546,7 @@ int main(int argc, char ** argv) { 
tgt_cgraphs.push_front(run); } - + for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { From 76c8dabdfdcee32d932fab0741b710f7fac87e19 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 22:18:21 -0600 Subject: [PATCH 35/63] Switch tensor send and pipeline sync to async --- ggml-mpi.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index 964100d53a9..bb5adfd42f2 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -139,7 +139,8 @@ void ggml_mpi_sync_pipelined( MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } if(ctx_mpi->rank < ctx_mpi->size - 1) { - MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); + MPI_Isend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm, &(ctx_mpi->asyncSendRequest)); +// MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); } } @@ -286,10 +287,10 @@ static void ggml_mpi_tensor_send(struct ggml_mpi_context * ctx_mpi, struct ggml_ } if (ctx_mpi->asyncSendWaiting) { - MPI_Wait(&(ctx_mpi->asyncSendRequest), MPI_STATUS_IGNORE); - ctx_mpi->asyncSendWaiting = false; - free(ctx_mpi->duped_send_tensor->data); - free(ctx_mpi->duped_send_tensor); +// MPI_Wait(&(ctx_mpi->asyncSendRequest), MPI_STATUS_IGNORE); +// ctx_mpi->asyncSendWaiting = false; +// free(ctx_mpi->duped_send_tensor->data); +// free(ctx_mpi->duped_send_tensor); } ctx_mpi->duped_send_tensor = ggml_mpi_dup_tensor(t); ctx_mpi->asyncSendWaiting = true; @@ -340,7 +341,7 @@ static void ggml_mpi_async_tensor_recv(struct ggml_mpi_context * ctx_mpi, struct default: GGML_ASSERT(false && "not implemented"); } - ggml_mpi_wait_recv(ctx_mpi); +// ggml_mpi_wait_recv(ctx_mpi); // ctx_mpi->duped_recv_tensor = t; ctx_mpi->asyncRecvWaiting = true; const int retval = MPI_Irecv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, 0, ctx_mpi->comm, &(ctx_mpi->asyncRecvRequest)); From e47fd5c56f7ec68e5a9e07ff57357d455c0fcf57 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 22:23:08 -0600 Subject: [PATCH 36/63] Re-enable wait recv --- ggml-mpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index bb5adfd42f2..49e519dfe1b 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -341,7 +341,7 @@ static void ggml_mpi_async_tensor_recv(struct ggml_mpi_context * ctx_mpi, struct default: GGML_ASSERT(false && "not implemented"); } -// ggml_mpi_wait_recv(ctx_mpi); + ggml_mpi_wait_recv(ctx_mpi); // ctx_mpi->duped_recv_tensor = t; ctx_mpi->asyncRecvWaiting = true; const int retval = MPI_Irecv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, 0, ctx_mpi->comm, &(ctx_mpi->asyncRecvRequest)); From 3a58fefb363670a500b72da3b50def9306c57e53 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 22:39:00 -0600 Subject: [PATCH 37/63] Don't store send requests, immediately free them --- ggml-mpi.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index 49e519dfe1b..42d530b6269 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -139,7 +139,9 @@ void ggml_mpi_sync_pipelined( MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } if(ctx_mpi->rank < ctx_mpi->size - 1) { - MPI_Isend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm, &(ctx_mpi->asyncSendRequest)); + MPI_Request req; + MPI_Isend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm, &(req)); + MPI_Request_free(&req); // 
MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); } } @@ -295,7 +297,9 @@ static void ggml_mpi_tensor_send(struct ggml_mpi_context * ctx_mpi, struct ggml_ ctx_mpi->duped_send_tensor = ggml_mpi_dup_tensor(t); ctx_mpi->asyncSendWaiting = true; - const int retval = MPI_Isend(ctx_mpi->duped_send_tensor->data, ggml_nelements(ctx_mpi->duped_send_tensor), mpi_type, mpi_rank_dst, 0, ctx_mpi->comm, &(ctx_mpi->asyncSendRequest)); + MPI_Request req; + const int retval = MPI_Isend(ctx_mpi->duped_send_tensor->data, ggml_nelements(ctx_mpi->duped_send_tensor), mpi_type, mpi_rank_dst, 0, ctx_mpi->comm, &req); + MPI_Request_free(&req); GGML_ASSERT(retval == MPI_SUCCESS); } From 8c44ee670210e6bde2a66a935914574a7f8d4e02 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 22:42:44 -0600 Subject: [PATCH 38/63] Switch pipeline sync back to synced send --- ggml-mpi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index 42d530b6269..8346d6c2ba1 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -139,10 +139,10 @@ void ggml_mpi_sync_pipelined( MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } if(ctx_mpi->rank < ctx_mpi->size - 1) { - MPI_Request req; - MPI_Isend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm, &(req)); - MPI_Request_free(&req); -// MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); +// MPI_Request req; +// MPI_Isend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm, &(req)); +// MPI_Request_free(&req); + MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); } } From c4b836211501c49f2f6e2b5ffbd73dda4f780829 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 14 Nov 2023 23:07:52 -0600 Subject: [PATCH 39/63] Move tensor transmissions to tag 7 and re-enable async pipeline sync --- ggml-mpi.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index 8346d6c2ba1..8aa79a755e1 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -139,10 +139,10 @@ void ggml_mpi_sync_pipelined( MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } if(ctx_mpi->rank < ctx_mpi->size - 1) { -// MPI_Request req; -// MPI_Isend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm, &(req)); -// MPI_Request_free(&req); - MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); + MPI_Request req; + MPI_Isend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm, &(req)); + MPI_Request_free(&req); +// MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); } } @@ -298,7 +298,7 @@ static void ggml_mpi_tensor_send(struct ggml_mpi_context * ctx_mpi, struct ggml_ ctx_mpi->asyncSendWaiting = true; MPI_Request req; - const int retval = MPI_Isend(ctx_mpi->duped_send_tensor->data, ggml_nelements(ctx_mpi->duped_send_tensor), mpi_type, mpi_rank_dst, 0, ctx_mpi->comm, &req); + const int retval = MPI_Isend(ctx_mpi->duped_send_tensor->data, ggml_nelements(ctx_mpi->duped_send_tensor), mpi_type, mpi_rank_dst, 7, ctx_mpi->comm, &req); MPI_Request_free(&req); GGML_ASSERT(retval == MPI_SUCCESS); } @@ -314,7 +314,7 @@ static void ggml_mpi_tensor_recv(struct ggml_mpi_context * ctx_mpi, struct ggml_ case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break; default: GGML_ASSERT(false && "not implemented"); } - const int retval = MPI_Recv(t->data, ggml_nelements(t), 
mpi_type, mpi_rank_src, 0, ctx_mpi->comm, MPI_STATUS_IGNORE); + const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, 7, ctx_mpi->comm, MPI_STATUS_IGNORE); GGML_ASSERT(retval == MPI_SUCCESS); } @@ -348,7 +348,7 @@ static void ggml_mpi_async_tensor_recv(struct ggml_mpi_context * ctx_mpi, struct ggml_mpi_wait_recv(ctx_mpi); // ctx_mpi->duped_recv_tensor = t; ctx_mpi->asyncRecvWaiting = true; - const int retval = MPI_Irecv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, 0, ctx_mpi->comm, &(ctx_mpi->asyncRecvRequest)); + const int retval = MPI_Irecv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, 7, ctx_mpi->comm, &(ctx_mpi->asyncRecvRequest)); GGML_ASSERT(retval == MPI_SUCCESS); } From d5b7512c38fc3529e3e2aa49a8207bf90dd333b6 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 15 Nov 2023 21:20:33 -0600 Subject: [PATCH 40/63] Switch isend to buffered send --- ggml-mpi.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index 8aa79a755e1..e8ab07e7802 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -30,6 +30,7 @@ struct ggml_mpi_context { bool running_decode; bool res; bool embed; + void* send_buffer; }; void ggml_mpi_sync_pipelined( @@ -59,6 +60,8 @@ struct ggml_mpi_context * ggml_mpi_init(void) { ctx->asyncRecvWaiting = false; ctx->running_decode = false; ctx->async = false; + ctx->send_buffer = calloc(1, 4096*1024); // 4MB buffer + MPI_Buffer_attach(ctx->send_buffer, 4096*1024); // ctx->status = *MPI_STATUS_IGNORE; return ctx; @@ -139,9 +142,9 @@ void ggml_mpi_sync_pipelined( MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } if(ctx_mpi->rank < ctx_mpi->size - 1) { - MPI_Request req; - MPI_Isend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm, &(req)); - MPI_Request_free(&req); +// MPI_Request req; + MPI_Bsend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); +// MPI_Request_free(&req); // MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); } } @@ -297,9 +300,9 @@ static void ggml_mpi_tensor_send(struct ggml_mpi_context * ctx_mpi, struct ggml_ ctx_mpi->duped_send_tensor = ggml_mpi_dup_tensor(t); ctx_mpi->asyncSendWaiting = true; - MPI_Request req; - const int retval = MPI_Isend(ctx_mpi->duped_send_tensor->data, ggml_nelements(ctx_mpi->duped_send_tensor), mpi_type, mpi_rank_dst, 7, ctx_mpi->comm, &req); - MPI_Request_free(&req); +// MPI_Request req; + const int retval = MPI_Bsend(ctx_mpi->duped_send_tensor->data, ggml_nelements(ctx_mpi->duped_send_tensor), mpi_type, mpi_rank_dst, 7, ctx_mpi->comm); +// MPI_Request_free(&req); GGML_ASSERT(retval == MPI_SUCCESS); } From a0272a1a3d3d8384e6acc2ebdf27572d6f325f25 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 15 Nov 2023 21:39:15 -0600 Subject: [PATCH 41/63] Add assertions to prevent buffer overflow --- ggml-mpi.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index e8ab07e7802..e60e0d4f500 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -142,10 +142,9 @@ void ggml_mpi_sync_pipelined( MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } if(ctx_mpi->rank < ctx_mpi->size - 1) { -// MPI_Request req; - MPI_Bsend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); -// MPI_Request_free(&req); -// MPI_Send(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); + const int retval = MPI_Bsend(val, count, datatype, 
ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); + GGML_ASSERT(retval == MPI_SUCCESS); + } } @@ -300,9 +299,7 @@ static void ggml_mpi_tensor_send(struct ggml_mpi_context * ctx_mpi, struct ggml_ ctx_mpi->duped_send_tensor = ggml_mpi_dup_tensor(t); ctx_mpi->asyncSendWaiting = true; -// MPI_Request req; const int retval = MPI_Bsend(ctx_mpi->duped_send_tensor->data, ggml_nelements(ctx_mpi->duped_send_tensor), mpi_type, mpi_rank_dst, 7, ctx_mpi->comm); -// MPI_Request_free(&req); GGML_ASSERT(retval == MPI_SUCCESS); } From 7fb2630bd5b72c0a25febc51a9aa8b0cea2b1292 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 15 Nov 2023 21:49:06 -0600 Subject: [PATCH 42/63] Add additional logging --- examples/speculative/speculative.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 7d93226f4a3..5cb32453fba 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -229,6 +229,7 @@ int main(int argc, char ** argv) { int s_keep = 0; if (!tgt_cgraphs.empty()) { + LOG("Finishing async decode\n"); struct seq_async_run run = tgt_cgraphs.back(); struct ggml_cgraph * cgraph = run.cgraph; llama_finish_async_decode(*ctx_tgt, batch_tgt, cgraph); @@ -325,6 +326,7 @@ int main(int argc, char ** argv) { } if (should_run_async) { + LOG("Beginning async decode\n"); llama_batch_clear(batch_tgt); llama_batch_add(batch_tgt, id, n_past_tgt, {0}, true); // batch_tgt.n_tokens = 1 @@ -361,6 +363,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm(ctx_dft, -1, n_past_dft, -1); // Kick off drafting pipeline but don't need it just yet + LOG("Beginning async draft\n"); dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); //llama_decode(ctx_dft, batch_dft); // DON'T FORGET THE MATCHING DECODE WHEN NEEDED @@ -391,6 +394,7 @@ int main(int argc, char ** argv) { // We need the draft now, so wait for it if (!dft_cgraphs.empty()) { + LOG("Finishing async decode of draft\n"); llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); dft_cgraphs.pop_back(); } @@ -521,6 +525,7 @@ int main(int argc, char ** argv) { } // evaluate the drafted tokens on the draft model + LOG("Running synchronous draft decode\n"); llama_decode(ctx_dft, batch_dft); ++n_past_cur; ++n_drafted; @@ -540,7 +545,7 @@ int main(int argc, char ** argv) { ++n_past_tgt; - // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); + LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); struct seq_async_run run; run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); tgt_cgraphs.push_front(run); From cd10f8965cea728002390080e3211df88f7dee5e Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 15 Nov 2023 23:27:57 -0600 Subject: [PATCH 43/63] Correct async tgt, but break drafts --- examples/speculative/speculative.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 5cb32453fba..02cc321813c 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -27,6 +27,7 @@ struct seq_draft { struct seq_async_run { struct ggml_cgraph * cgraph; + llama_batch batch; }; int main(int argc, char ** argv) { @@ -229,10 +230,10 @@ int main(int argc, char ** argv) { int s_keep = 0; if (!tgt_cgraphs.empty()) { - LOG("Finishing async decode\n"); + LOG("Finishing async decode, should_run_async = %d\n", should_run_async); 
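Note: these patches all revolve around the same two-phase decode API: `llama_start_async_decode()` builds and launches the compute graph for a batch, and `llama_finish_async_decode()` later blocks on that graph to collect the logits. The sketch below shows the call pattern that speculative.cpp uses with its deque of in-flight runs; it is a minimal illustration based on the declarations added to llama.h in this series, and `pending_run`, `submit`, and `retire` are names invented for the example rather than symbols from the patches.

```cpp
// Minimal sketch of the start/finish decode pattern, assuming the
// llama_start_async_decode / llama_finish_async_decode API added by this
// patch series. pending_run, submit and retire are illustrative names only.
#include <deque>
#include "llama.h"

struct pending_run {
    struct ggml_cgraph * cgraph; // graph handle returned by the start phase
    llama_batch          batch;  // batch that must be handed back when finishing
};

static std::deque<pending_run> in_flight;

// Phase 1: build the graph and kick off evaluation, then keep doing other work.
static void submit(llama_context & ctx, llama_batch & batch) {
    pending_run run;
    run.cgraph = llama_start_async_decode(ctx, batch);
    run.batch  = batch;
    in_flight.push_front(run); // newest run at the front
}

// Phase 2: retire the oldest run once its logits are actually needed.
static void retire(llama_context & ctx) {
    if (in_flight.empty()) {
        return;
    }
    pending_run run = in_flight.back(); // oldest run at the back
    llama_finish_async_decode(ctx, run.batch, run.cgraph);
    in_flight.pop_back();
}
```

In speculative.cpp the draft and target contexts keep separate deques (`dft_cgraphs`, `tgt_cgraphs`), but the ordering discipline is the same: `push_front` when a run is started, `back()`/`pop_back()` when it is finished.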
struct seq_async_run run = tgt_cgraphs.back(); struct ggml_cgraph * cgraph = run.cgraph; - llama_finish_async_decode(*ctx_tgt, batch_tgt, cgraph); + llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); tgt_cgraphs.pop_back(); } @@ -334,6 +335,7 @@ int main(int argc, char ** argv) { struct seq_async_run run; run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); + run.batch = batch_tgt; tgt_cgraphs.push_front(run); llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_past_tgt + 1); @@ -547,6 +549,7 @@ int main(int argc, char ** argv) { LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); struct seq_async_run run; + run.batch = batch_tgt; run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); tgt_cgraphs.push_front(run); From cbe6e2ce3bf84e803cc40326bd45f27b2c0d558c Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 16 Nov 2023 22:26:41 -0600 Subject: [PATCH 44/63] Mostly working async --- examples/speculative/speculative.cpp | 179 ++++++++++++++++++--------- ggml-mpi.c | 52 ++++++-- ggml-mpi.h | 4 +- llama.cpp | 13 +- 4 files changed, 177 insertions(+), 71 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 02cc321813c..fd1fede4917 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -28,6 +28,9 @@ struct seq_draft { struct seq_async_run { struct ggml_cgraph * cgraph; llama_batch batch; + std::vector drafts; + int run_id; + int n_past_tgt; }; int main(int argc, char ** argv) { @@ -183,6 +186,7 @@ int main(int argc, char ** argv) { int n_drafted = 0; int n_accept = 0; + const int ASYNC_RUN_ID = n_seq_dft+1; int n_past_tgt = inp.size(); int n_past_dft = inp.size(); @@ -214,6 +218,8 @@ int main(int argc, char ** argv) { drafts[0].i_batch_tgt.resize(1); drafts[0].i_batch_tgt[0] = 0; + int run_id = 0; + while (true) { // print current draft sequences for (int s = 0; s < n_seq_dft; ++s) { @@ -235,31 +241,47 @@ int main(int argc, char ** argv) { struct ggml_cgraph * cgraph = run.cgraph; llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); tgt_cgraphs.pop_back(); + run_id = run.run_id; + if (run_id == ASYNC_RUN_ID) { + llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); + + } } + llama_token id; + std::string token_str; while (true) { LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); // sample from the target model - llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); - + id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); // Swap to pipeline roots llama_swap_comm(ctx_tgt); LOG("Swapped comm to pipeline roots, id %d\n", llama_node_id(ctx_tgt)); llama_sync_token(ctx_tgt, &id, 0); - + LOG("Is async: %d\n", !should_run_async); + LOG("Sampling index: %d\n", drafts[s_keep].i_batch_tgt[i_dft]); llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); + + const int n_vocab = llama_n_vocab(llama_get_model(ctx_tgt)); + float * logits = llama_get_logits_ith(ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); + + LOG("logits:\n"); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + LOG("\t%d: %.4f\n", token_id, logits[token_id]); + } // Root of WORLD - std::string token_str; + if (llama_node_id(ctx_tgt) == 0) { - std::string 
token_str = llama_token_to_piece(ctx_tgt, id); + token_str = llama_token_to_piece(ctx_tgt, id); + LOG("Sampled token: %d ('%s'), n_past_tgt: %d\n", id, token_str.c_str(), n_past_tgt); printf("%s", token_str.c_str()); fflush(stdout); } @@ -276,7 +298,7 @@ int main(int argc, char ** argv) { ++n_predict; // check if the target token matches any of the drafts - { + if(should_run_async){ // Only running this when should_run_async starts out okay but still goes off the rails eventually bool matches = false; for (int s = 0; s < n_seq_dft; ++s) { @@ -305,76 +327,116 @@ int main(int argc, char ** argv) { } - if (llama_node_id(ctx_tgt) < 0) { - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); - } + break; + } - // TODO: simplify - { - LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + if (llama_node_id(ctx_tgt) < 0) { + LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); - // Pipeline syncing cache ops - llama_kv_cache_seq_keep(ctx_dft, s_keep); - llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_dft, 0); + } - llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_tgt, 0); - } - if (should_run_async) { - LOG("Beginning async decode\n"); - llama_batch_clear(batch_tgt); - llama_batch_add(batch_tgt, id, n_past_tgt, {0}, true); - // batch_tgt.n_tokens = 1 + // TODO: simplify + { + LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + + // Pipeline syncing cache ops + llama_kv_cache_seq_keep(ctx_dft, s_keep); + llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(ctx_dft, 0); + for (int i = 0; i < n_seq_dft; i++) { + if (run_id == ASYNC_RUN_ID) { + llama_kv_cache_seq_rm(ctx_tgt, i + 0, n_past_tgt, -1); + } else { +// llama_kv_cache_seq_rm(ctx_tgt, i + ASYNC_RUN_ID, n_past_tgt, -1); - struct seq_async_run run; - run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); - run.batch = batch_tgt; - tgt_cgraphs.push_front(run); - llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_past_tgt + 1); + } +// llama_kv_cache_seq_rm (ctx_tgt, i+run_id, n_past_tgt, -1); } +// llama_kv_cache_seq_keep(ctx_tgt, s_keep); + llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, run_id, -1, n_past_tgt); +// llama_kv_cache_seq_keep(ctx_tgt, 0); + for (int i = 1; i < n_seq_dft; i++) { +// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); + llama_kv_cache_seq_rm (ctx_tgt, i+run_id, -1, n_past_tgt); + + } + llama_kv_cache_seq_rm (ctx_tgt, run_id, n_past_tgt, n_past_tgt+2); +// llama_kv_cache_seq_rm (ctx_tgt, 0, n_past_tgt, n_past_tgt+2); + - should_run_async = !should_run_async; + llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); + } + + if (should_run_async) { +// LOG("Beginning async decode\n"); llama_batch_clear(batch_tgt); - llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); + llama_batch_add(batch_tgt, id, n_past_tgt, {ASYNC_RUN_ID}, true); + // batch_tgt.n_tokens = 1 - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].active = false; - drafts[s].tokens.clear(); - drafts[s].i_batch_tgt.clear(); + + for (int i = 0; i < n_seq_dft; i++) { + llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, n_past_tgt, -1); } - // note: will be erased after the speculation phase - drafts[0].tokens.push_back(id); - 
drafts[0].i_batch_tgt.push_back(0); - llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); - // batch_dft.n_tokens == 1 now + llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); - // Pipeline sync on draft pipeline +// llama_kv_cache_seq_keep(ctx_tgt, s_keep); +// llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, ASYNC_RUN_ID, -1, n_past_tgt); +// llama_kv_cache_seq_keep(ctx_tgt, 0); + for (int i = 1; i < n_seq_dft; i++) { +// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); + } +// llama_kv_cache_seq_rm (ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt+2); - // Remove all tokens from all sequences after n_past_dft - llama_kv_cache_seq_rm(ctx_dft, -1, n_past_dft, -1); - // Kick off drafting pipeline but don't need it just yet - LOG("Beginning async draft\n"); - dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); - //llama_decode(ctx_dft, batch_dft); - // DON'T FORGET THE MATCHING DECODE WHEN NEEDED - ++n_past_dft; + struct seq_async_run run; + run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); + run.batch = batch_tgt; + run.run_id = ASYNC_RUN_ID; + run.n_past_tgt = n_past_tgt; + tgt_cgraphs.push_front(run); +// llama_kv_cache_seq_rm(ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt + 2); - break; } + should_run_async = !should_run_async; + + llama_batch_clear(batch_tgt); + llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); + + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].active = false; + drafts[s].tokens.clear(); + drafts[s].i_batch_tgt.clear(); + } + // note: will be erased after the speculation phase + drafts[0].tokens.push_back(id); + drafts[0].i_batch_tgt.push_back(0); + + llama_batch_clear(batch_dft); + llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + // batch_dft.n_tokens == 1 now + + // Pipeline sync on draft pipeline + + // Remove all tokens from all sequences after n_past_dft + llama_kv_cache_seq_rm(ctx_dft, -1, n_past_dft, -1); + + // Kick off drafting pipeline but don't need it just yet + LOG("Beginning async draft\n"); + dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); + //llama_decode(ctx_dft, batch_dft); + // DON'T FORGET THE MATCHING DECODE WHEN NEEDED + + ++n_past_dft; + if (n_predict > params.n_predict || has_eos) { break; } @@ -496,6 +558,7 @@ int main(int argc, char ** argv) { } // add drafted token for each sequence + // TODO commenting this out fixes async for (int is = 0; is < (int) sa.size(); ++is) { const llama_token id = cur_p[is].id; @@ -508,7 +571,7 @@ int main(int argc, char ** argv) { // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); + llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s,s+ASYNC_RUN_ID }, true); // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; @@ -539,9 +602,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(ctx_tgt, 0); +// llama_kv_cache_seq_keep(ctx_tgt, 0); // Needed to get to "Here's the code:" for (int s = 1; s < n_seq_dft; ++s) { - llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); + llama_kv_cache_seq_cp(ctx_tgt, run_id, s+run_id, -1, n_past_tgt); } ++n_past_tgt; @@ -550,6 +613,8 @@ int main(int argc, char ** argv) { LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); struct seq_async_run run; run.batch = batch_tgt; + run.run_id = 
0; + run.n_past_tgt = n_past_tgt; run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); tgt_cgraphs.push_front(run); diff --git a/ggml-mpi.c b/ggml-mpi.c index e60e0d4f500..0820f2c64b5 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -128,6 +128,21 @@ int ggml_mpi_next_node(struct ggml_mpi_context * ctx_mpi) { return (ctx_mpi->rank + 1) % ctx_mpi->size; } +void ggml_mpi_sync_pipelined_recv( + struct ggml_mpi_context * ctx_mpi, + void * val, + int count, + MPI_Datatype datatype, + int tag +) { + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } + MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); + +} + + void ggml_mpi_sync_pipelined( struct ggml_mpi_context * ctx_mpi, void * val, @@ -141,7 +156,7 @@ void ggml_mpi_sync_pipelined( if (ctx_mpi->rank != 0) { MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } - if(ctx_mpi->rank < ctx_mpi->size - 1) { + if(ctx_mpi->rank < ctx_mpi->size) { const int retval = MPI_Bsend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); GGML_ASSERT(retval == MPI_SUCCESS); @@ -151,16 +166,23 @@ void ggml_mpi_sync_pipelined( bool ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int32_t * n_tokens, + int32_t ** tokens, int32_t ** pos, int32_t ** n_seq_ids, int32_t *** seq_id, - int8_t ** logits) { + int8_t ** logits, + bool receive_only) { if(ctx_mpi->comm == MPI_COMM_NULL) { return false; } int32_t old_n_tokens = *n_tokens; - ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, 0); + if (receive_only) { + ggml_mpi_sync_pipelined_recv(ctx_mpi, n_tokens, 1, MPI_INT, 0); + + } else { + ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, 0); + } // If what was passed in differs from what was broadcast, // we can't guarantee the allocated sizes are correct @@ -169,14 +191,22 @@ bool ggml_mpi_eval_init( if (old_n_tokens != *n_tokens) { *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); - *logits = realloc(*logits, *n_tokens * sizeof(int32_t)); + *tokens = realloc(*tokens, *n_tokens * sizeof(int32_t )); } + if (receive_only) { + ggml_mpi_sync_pipelined_recv(ctx_mpi, *tokens, *n_tokens, MPI_INT32_T, 0); + } else { + ggml_mpi_sync_pipelined(ctx_mpi, *tokens, *n_tokens, MPI_INT32_T, 0); + } -// MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm); - ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, 0); + if (receive_only) { + ggml_mpi_sync_pipelined_recv(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, 0); + } else { + ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, 0); + } // We need to know the total number of sequence // ids, so we count them all up int32_t total_n_seq_ids = 0; @@ -201,9 +231,13 @@ bool ggml_mpi_eval_init( } - ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, 0); - ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0); - //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm); + if (receive_only) { + ggml_mpi_sync_pipelined_recv(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, 0); + ggml_mpi_sync_pipelined_recv(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0); + } else { + ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, 0); + ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0); + } int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*)); current_index = 0; for (int32_t i = 0; i < *n_tokens; i++) { diff --git a/ggml-mpi.h b/ggml-mpi.h 
index df9c10366b8..110ef4c52df 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -140,10 +140,12 @@ size_t ggml_mpi_size(struct ggml_mpi_context * ctx); bool ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int32_t * n_tokens, + int32_t ** tokens, int32_t ** pos, int32_t ** n_seq_ids, int32_t *** seq_id, - int8_t ** logits); + int8_t ** logits, + bool receive_only); void ggml_mpi_synch_int( struct ggml_mpi_context * ctx_mpi, diff --git a/llama.cpp b/llama.cpp index cdf4a6f8a11..a2c48a49956 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1655,13 +1655,13 @@ static void llama_kv_cache_seq_rm( if (p1 < 0) p1 = std::numeric_limits::max(); for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + if ((cache.cells[i].pos >= p0 || cache.cells[i].pos < 0) && cache.cells[i].pos < p1) { if (seq_id < 0) { cache.cells[i].seq_id.clear(); } else if (cache.cells[i].has_seq_id(seq_id)) { cache.cells[i].seq_id.erase(seq_id); } else { - continue; +// continue; } if (cache.cells[i].seq_id.empty()) { // keep count of the number of used cells @@ -5497,7 +5497,7 @@ static struct ggml_cgraph * llama_decode_internal_phased( #ifdef GGML_USE_MPI // TODO: needs fix after #3228 - if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits))) { + if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.token), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits), false)) { return nullptr; } n_tokens = batch.n_tokens; @@ -5562,6 +5562,11 @@ static struct ggml_cgraph * llama_decode_internal_phased( struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 2]; #ifdef GGML_USE_MPI + if (ggml_mpi_rank(lctx.ctx_mpi) == 0 && ggml_mpi_size(lctx.ctx_mpi) > 1) { + if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.token), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits), true)) { + return nullptr; + } + } if (!ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf)) { return nullptr; } @@ -9636,7 +9641,7 @@ int llama_process_mpi_worker( llama_kv_cache_clear(ctx); break; case 2: - llama_kv_cache_seq_rm(ctx, 0, 0, 0); + llama_kv_cache_seq_rm(ctx, 1, -1, -1); break; case 3: llama_kv_cache_seq_cp(ctx, 0, 0, 0, 0); From 6933af643a2f046f3c22cf39583fdf50331bd18c Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 16 Nov 2023 23:07:56 -0600 Subject: [PATCH 45/63] Another partially working version --- examples/speculative/speculative.cpp | 35 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index fd1fede4917..f5b6bd71f63 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -245,6 +245,9 @@ int main(int argc, char ** argv) { if (run_id == ASYNC_RUN_ID) { llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); + } else { + llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); + } } @@ -347,29 +350,35 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); llama_kv_cache_seq_keep(ctx_dft, 0); - for (int i = 0; i < n_seq_dft; i++) { - if (run_id == ASYNC_RUN_ID) { - llama_kv_cache_seq_rm(ctx_tgt, i + 0, n_past_tgt, -1); - } else { -// llama_kv_cache_seq_rm(ctx_tgt, i + ASYNC_RUN_ID, n_past_tgt, -1); +// llama_kv_cache_seq_rm (ctx_tgt, s_keep+run_id, n_past_tgt, -1); + llama_kv_cache_seq_rm (ctx_tgt, 0, n_past_tgt, -1); // Forces "create" and nothing else effects it + + for (int i 
= 0; i < n_seq_dft; i++) { +// if (run_id == ASYNC_RUN_ID) { +// llama_kv_cache_seq_rm(ctx_tgt, i + 0, n_past_tgt, -1); +// } else { +//// llama_kv_cache_seq_rm(ctx_tgt, i + ASYNC_RUN_ID, n_past_tgt, -1); +// +// } + if (i != s_keep) { +// llama_kv_cache_seq_rm(ctx_tgt, i + 0, -1, n_past_tgt); } -// llama_kv_cache_seq_rm (ctx_tgt, i+run_id, n_past_tgt, -1); } // llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, run_id, -1, n_past_tgt); +// llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, run_id, -1, n_past_tgt); // llama_kv_cache_seq_keep(ctx_tgt, 0); for (int i = 1; i < n_seq_dft; i++) { // llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); - llama_kv_cache_seq_rm (ctx_tgt, i+run_id, -1, n_past_tgt); +// llama_kv_cache_seq_rm (ctx_tgt, i+run_id, -1, n_past_tgt); } - llama_kv_cache_seq_rm (ctx_tgt, run_id, n_past_tgt, n_past_tgt+2); +// llama_kv_cache_seq_rm (ctx_tgt, run_id, n_past_tgt, n_past_tgt+2); // llama_kv_cache_seq_rm (ctx_tgt, 0, n_past_tgt, n_past_tgt+2); - llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); +// llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); } @@ -381,10 +390,10 @@ int main(int argc, char ** argv) { for (int i = 0; i < n_seq_dft; i++) { - llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, n_past_tgt, -1); +// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, n_past_tgt, -1); } - llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); +// llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); // llama_kv_cache_seq_keep(ctx_tgt, s_keep); // llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, ASYNC_RUN_ID, -1, n_past_tgt); @@ -571,7 +580,7 @@ int main(int argc, char ** argv) { // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s,s+ASYNC_RUN_ID }, true); + llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; From b005ee1003226475b48e7c45ddf7c97b4b89d90f Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 19 Nov 2023 14:38:34 -0600 Subject: [PATCH 46/63] Non-async working --- examples/speculative/speculative.cpp | 87 ++++++++++++++-------------- ggml-mpi.c | 2 +- llama.cpp | 10 ++-- 3 files changed, 50 insertions(+), 49 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index f5b6bd71f63..5ae6c8872ce 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -243,10 +243,10 @@ int main(int argc, char ** argv) { tgt_cgraphs.pop_back(); run_id = run.run_id; if (run_id == ASYNC_RUN_ID) { - llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); +// llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); } else { - llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); +// llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); } } @@ -301,7 +301,7 @@ int main(int argc, char ** argv) { ++n_predict; // check if the target token matches any of the drafts - if(should_run_async){ // Only running this when should_run_async starts out okay but still goes off the rails eventually + { // Only running this when should_run_async starts out okay but still goes off the rails eventually bool matches = false; for (int s = 0; s < n_seq_dft; ++s) { @@ -324,8 +324,9 @@ int main(int argc, char ** argv) { ++n_past_tgt; ++n_past_dft; 
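Note: with the commented-out experiments stripped away, the acceptance test these hunks keep reworking reduces to the check sketched below. This is a restatement for readability, not code from the patches; `seq_draft_view` and `find_matching_draft` are invented names, and the counter updates mentioned in the comments are the ones visible in the surrounding context lines (`n_accept`, `n_past_tgt`, `n_past_dft`, `i_dft`).

```cpp
// Sketch of the draft-acceptance check in speculative.cpp: the token the
// target model just sampled is compared against the i_dft-th token of every
// active draft sequence. seq_draft_view / find_matching_draft are made-up
// names for illustration.
#include <vector>
#include <cstdint>

using llama_token = int32_t; // same underlying type as llama.h's typedef

struct seq_draft_view {              // only the fields the check needs
    bool                     active;
    std::vector<llama_token> tokens;
};

// Returns the index of a draft sequence whose i_dft-th token equals the token
// sampled by the target model, or -1 if no draft matched.
static int find_matching_draft(const std::vector<seq_draft_view> & drafts,
                               int i_dft, llama_token id) {
    for (int s = 0; s < (int) drafts.size(); ++s) {
        if (drafts[s].active &&
            i_dft < (int) drafts[s].tokens.size() &&
            drafts[s].tokens[i_dft] == id) {
            return s; // on a match: ++n_accept, ++n_past_tgt, ++n_past_dft, ++i_dft
        }
    }
    return -1; // no match: fall back to drafting a fresh batch from this token
}
```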
++i_dft; - - continue; + if (should_run_async) { + continue; + } } } @@ -342,7 +343,7 @@ int main(int argc, char ** argv) { // TODO: simplify - { + if (should_run_async) { LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); // Pipeline syncing cache ops @@ -362,16 +363,16 @@ int main(int argc, char ** argv) { // // } if (i != s_keep) { -// llama_kv_cache_seq_rm(ctx_tgt, i + 0, -1, n_past_tgt); + llama_kv_cache_seq_rm(ctx_tgt, i + 0, -1, -1); } } // llama_kv_cache_seq_keep(ctx_tgt, s_keep); -// llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, run_id, -1, n_past_tgt); + llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, run_id, -1, n_past_tgt); // llama_kv_cache_seq_keep(ctx_tgt, 0); for (int i = 1; i < n_seq_dft; i++) { // llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); -// llama_kv_cache_seq_rm (ctx_tgt, i+run_id, -1, n_past_tgt); + llama_kv_cache_seq_rm (ctx_tgt, i+run_id, -1, n_past_tgt); } // llama_kv_cache_seq_rm (ctx_tgt, run_id, n_past_tgt, n_past_tgt+2); @@ -382,40 +383,40 @@ int main(int argc, char ** argv) { } - if (should_run_async) { -// LOG("Beginning async decode\n"); - llama_batch_clear(batch_tgt); - llama_batch_add(batch_tgt, id, n_past_tgt, {ASYNC_RUN_ID}, true); - // batch_tgt.n_tokens = 1 - - - for (int i = 0; i < n_seq_dft; i++) { -// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, n_past_tgt, -1); - } - -// llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); - -// llama_kv_cache_seq_keep(ctx_tgt, s_keep); -// llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, ASYNC_RUN_ID, -1, n_past_tgt); -// llama_kv_cache_seq_keep(ctx_tgt, 0); - for (int i = 1; i < n_seq_dft; i++) { -// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); - } -// llama_kv_cache_seq_rm (ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt+2); - - - - struct seq_async_run run; - run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); - run.batch = batch_tgt; - run.run_id = ASYNC_RUN_ID; - run.n_past_tgt = n_past_tgt; - tgt_cgraphs.push_front(run); -// llama_kv_cache_seq_rm(ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt + 2); - - } - - should_run_async = !should_run_async; +// if (should_run_async) { +//// LOG("Beginning async decode\n"); +// llama_batch_clear(batch_tgt); +// llama_batch_add(batch_tgt, id, n_past_tgt, {ASYNC_RUN_ID}, true); +// // batch_tgt.n_tokens = 1 +// +// +// for (int i = 0; i < n_seq_dft; i++) { +//// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, n_past_tgt, -1); +// } +// +//// llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); +// +//// llama_kv_cache_seq_keep(ctx_tgt, s_keep); +//// llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, ASYNC_RUN_ID, -1, n_past_tgt); +//// llama_kv_cache_seq_keep(ctx_tgt, 0); +// for (int i = 1; i < n_seq_dft; i++) { +//// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); +// } +//// llama_kv_cache_seq_rm (ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt+2); +// +// +// +// struct seq_async_run run; +// run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); +// run.batch = batch_tgt; +// run.run_id = 0; +// run.n_past_tgt = n_past_tgt; +// tgt_cgraphs.push_front(run); +//// llama_kv_cache_seq_rm(ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt + 2); +// +// } +// +// should_run_async = !should_run_async; llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); diff --git a/ggml-mpi.c b/ggml-mpi.c index 0820f2c64b5..05d53828c9d 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -156,7 +156,7 @@ void 
ggml_mpi_sync_pipelined( if (ctx_mpi->rank != 0) { MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } - if(ctx_mpi->rank < ctx_mpi->size) { + if(ctx_mpi->rank < ctx_mpi->size - 1) { const int retval = MPI_Bsend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); GGML_ASSERT(retval == MPI_SUCCESS); diff --git a/llama.cpp b/llama.cpp index a2c48a49956..ed0e0e7d04b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5562,11 +5562,11 @@ static struct ggml_cgraph * llama_decode_internal_phased( struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 2]; #ifdef GGML_USE_MPI - if (ggml_mpi_rank(lctx.ctx_mpi) == 0 && ggml_mpi_size(lctx.ctx_mpi) > 1) { - if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.token), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits), true)) { - return nullptr; - } - } +// if (ggml_mpi_rank(lctx.ctx_mpi) == 0 && ggml_mpi_size(lctx.ctx_mpi) > 1) { +// if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.token), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits), true)) { +// return nullptr; +// } +// } if (!ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf)) { return nullptr; } From 1ac4484613bc5b4bd97f754d05d0fd37431428ae Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 19 Nov 2023 18:15:35 -0600 Subject: [PATCH 47/63] Rearchitect MPI so head is first --- ggml-mpi.c | 133 ++++++++++++++++++++++++++--------------------------- ggml-mpi.h | 36 +++++++++++++++ llama.cpp | 90 ++++++++++++++++++++++-------------- llama.h | 4 +- 4 files changed, 160 insertions(+), 103 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index 05d53828c9d..e9f4bb8c558 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -60,8 +60,8 @@ struct ggml_mpi_context * ggml_mpi_init(void) { ctx->asyncRecvWaiting = false; ctx->running_decode = false; ctx->async = false; - ctx->send_buffer = calloc(1, 4096*1024); // 4MB buffer - MPI_Buffer_attach(ctx->send_buffer, 4096*1024); + ctx->send_buffer = calloc(1, 4096*1024*32); // 128MB buffer + MPI_Buffer_attach(ctx->send_buffer, 4096*1024*32); // ctx->status = *MPI_STATUS_IGNORE; return ctx; @@ -128,6 +128,11 @@ int ggml_mpi_next_node(struct ggml_mpi_context * ctx_mpi) { return (ctx_mpi->rank + 1) % ctx_mpi->size; } +int ggml_mpi_prev_node(struct ggml_mpi_context * ctx_mpi) { + int temp = (ctx_mpi->rank - 1); + return (temp >= 0) ? 
temp : ctx_mpi->size - 1; +} + void ggml_mpi_sync_pipelined_recv( struct ggml_mpi_context * ctx_mpi, void * val, @@ -138,7 +143,7 @@ void ggml_mpi_sync_pipelined_recv( if(ctx_mpi->comm == MPI_COMM_NULL) { return; } - MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); + MPI_Recv(val, count, datatype, ggml_mpi_prev_node(ctx_mpi), tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } @@ -153,8 +158,12 @@ void ggml_mpi_sync_pipelined( if(ctx_mpi->comm == MPI_COMM_NULL) { return; } + +// printf("Rank %d sync pipelined\n", ctx_mpi->rank); + + if (ctx_mpi->rank != 0) { - MPI_Recv(val, count, datatype, ctx_mpi->rank - 1, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); + MPI_Recv(val, count, datatype, ggml_mpi_prev_node(ctx_mpi), tag, ctx_mpi->comm, MPI_STATUS_IGNORE); } if(ctx_mpi->rank < ctx_mpi->size - 1) { const int retval = MPI_Bsend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); @@ -177,12 +186,9 @@ bool ggml_mpi_eval_init( } int32_t old_n_tokens = *n_tokens; - if (receive_only) { - ggml_mpi_sync_pipelined_recv(ctx_mpi, n_tokens, 1, MPI_INT, 0); - } else { - ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, 0); - } + ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, 0); + // If what was passed in differs from what was broadcast, // we can't guarantee the allocated sizes are correct @@ -194,19 +200,11 @@ bool ggml_mpi_eval_init( *tokens = realloc(*tokens, *n_tokens * sizeof(int32_t )); } - if (receive_only) { - ggml_mpi_sync_pipelined_recv(ctx_mpi, *tokens, *n_tokens, MPI_INT32_T, 0); + ggml_mpi_sync_pipelined(ctx_mpi, *tokens, *n_tokens, MPI_INT32_T, 0); - } else { - ggml_mpi_sync_pipelined(ctx_mpi, *tokens, *n_tokens, MPI_INT32_T, 0); - } + ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, 0); - if (receive_only) { - ggml_mpi_sync_pipelined_recv(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, 0); - } else { - ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, 0); - } // We need to know the total number of sequence // ids, so we count them all up int32_t total_n_seq_ids = 0; @@ -231,13 +229,10 @@ bool ggml_mpi_eval_init( } - if (receive_only) { - ggml_mpi_sync_pipelined_recv(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, 0); - ggml_mpi_sync_pipelined_recv(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0); - } else { - ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, 0); - ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0); - } + + ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, 0); + ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0); + int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*)); current_index = 0; for (int32_t i = 0; i < *n_tokens; i++) { @@ -270,6 +265,7 @@ void ggml_mpi_synch_int( if(ctx_mpi->comm == MPI_COMM_NULL) { return; } +// printf("Rank %d sync int\n", ctx_mpi->rank); MPI_Bcast(val, 1, MPI_INT32_T, root, ctx_mpi->comm); } @@ -281,9 +277,34 @@ void ggml_mpi_synch_float( if(ctx_mpi->comm == MPI_COMM_NULL) { return; } +// printf("Rank %d sync float\n", ctx_mpi->rank); MPI_Bcast(val, 1, MPI_FLOAT, root, ctx_mpi->comm); } +void ggml_mpi_recv_float_array( + struct ggml_mpi_context * ctx_mpi, + float * val, + int arr_size, + int src, + int tag +) { +// printf("Rank %d recv float array, count=%d\n", ctx_mpi->rank, arr_size); + int ret = MPI_Recv(val, arr_size, MPI_FLOAT, src, tag, ctx_mpi->comm, MPI_STATUS_IGNORE); + GGML_ASSERT(ret == MPI_SUCCESS); +} + +void 
ggml_mpi_send_float_array_async( + struct ggml_mpi_context * ctx_mpi, + float * val, + int arr_size, + int dest, + int tag +) { +// printf("Rank %d send float array async, count=%d\n", ctx_mpi->rank, arr_size); + int ret = MPI_Bsend(val, arr_size, MPI_FLOAT, dest, tag, ctx_mpi->comm); + GGML_ASSERT(ret == MPI_SUCCESS); +} + static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { struct ggml_tensor * t = ggml_graph_get_tensor(gf, name); if (t == NULL) { @@ -316,6 +337,7 @@ static void ggml_mpi_tensor_send(struct ggml_mpi_context * ctx_mpi, struct ggml_ if(ctx_mpi->comm == MPI_COMM_NULL) { return; } +// printf("Rank %d tensor send\n", ctx_mpi->rank); MPI_Datatype mpi_type; switch (t->type) { @@ -324,16 +346,7 @@ static void ggml_mpi_tensor_send(struct ggml_mpi_context * ctx_mpi, struct ggml_ default: GGML_ASSERT(false && "not implemented"); } - if (ctx_mpi->asyncSendWaiting) { -// MPI_Wait(&(ctx_mpi->asyncSendRequest), MPI_STATUS_IGNORE); -// ctx_mpi->asyncSendWaiting = false; -// free(ctx_mpi->duped_send_tensor->data); -// free(ctx_mpi->duped_send_tensor); - } - ctx_mpi->duped_send_tensor = ggml_mpi_dup_tensor(t); - ctx_mpi->asyncSendWaiting = true; - - const int retval = MPI_Bsend(ctx_mpi->duped_send_tensor->data, ggml_nelements(ctx_mpi->duped_send_tensor), mpi_type, mpi_rank_dst, 7, ctx_mpi->comm); + const int retval = MPI_Bsend(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, GGML_MPI_TRANSFER_TENSORS, ctx_mpi->comm); GGML_ASSERT(retval == MPI_SUCCESS); } @@ -348,7 +361,7 @@ static void ggml_mpi_tensor_recv(struct ggml_mpi_context * ctx_mpi, struct ggml_ case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break; default: GGML_ASSERT(false && "not implemented"); } - const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, 7, ctx_mpi->comm, MPI_STATUS_IGNORE); + const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, GGML_MPI_TRANSFER_TENSORS, ctx_mpi->comm, MPI_STATUS_IGNORE); GGML_ASSERT(retval == MPI_SUCCESS); } @@ -380,9 +393,8 @@ static void ggml_mpi_async_tensor_recv(struct ggml_mpi_context * ctx_mpi, struct } ggml_mpi_wait_recv(ctx_mpi); -// ctx_mpi->duped_recv_tensor = t; ctx_mpi->asyncRecvWaiting = true; - const int retval = MPI_Irecv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, 7, ctx_mpi->comm, &(ctx_mpi->asyncRecvRequest)); + const int retval = MPI_Irecv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, GGML_MPI_TRANSFER_TENSORS, ctx_mpi->comm, &(ctx_mpi->asyncRecvRequest)); GGML_ASSERT(retval == MPI_SUCCESS); } @@ -411,14 +423,14 @@ uint16_t** ggml_mpi_split_range( ranges[i] = (uint16_t*) malloc(sizeof(uint16_t) * 2); } uint16_t next_layer = 0; - for (int i=1; i < ctx_mpi->size; i++) { + for (int i=0; i < ctx_mpi->size-1; i++) { ranges[i][0] = next_layer; ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start); next_layer = ranges[i][1]; } - ranges[0][0] = next_layer; - ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start); + ranges[ctx_mpi->size-1][0] = next_layer; + ranges[ctx_mpi->size-1][1] = MIN(end, next_layer + (node_weights[ctx_mpi->size-1] * range_length) + start); return ranges; } @@ -474,6 +486,8 @@ void ggml_mpi_graph_creation_post( GGML_ASSERT(inp0 == gf->nodes[0]); +// printf("Rank %d creation post\n", mpi_rank); + // distribute the compute graph into slices across the MPI nodes // // the main node (0) processes the last layers + the remainder of the compute graph @@ -489,24 +503,15 @@ void ggml_mpi_graph_creation_post( if (mpi_rank > 0) { - 
if (mpi_rank == 1) { - // the first node (1) receives the input tokens from the main node (0) - ggml_mpi_tensor_recv(ctx_mpi, inp_tokens, 0); - } else { - // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) - ggml_mpi_tensor_recv(ctx_mpi, inp0, mpi_rank - 1); - } + // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) + ggml_mpi_tensor_recv(ctx_mpi, inp0, mpi_rank - 1); + } else if (mpi_size > 1) { - // node 0 sends the input tokens to node 1 - // recv the output data from the last node - ggml_mpi_tensor_send(ctx_mpi, inp_tokens, 1); - ggml_mpi_async_tensor_recv(ctx_mpi, inp0, mpi_size - 1); + // node 0 processes the inputs and then sends to node 1 } //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; - const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1; - //const int il0 = (mpi_idx + 0) * n_per_node; //const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); int il0 = ctx_mpi->layer_start; @@ -519,10 +524,7 @@ void ggml_mpi_graph_creation_post( snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1); const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0); - const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes; - - struct ggml_tensor *res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 2]; + const int idx_l1 = mpi_rank == mpi_size - 1 ? gf->n_nodes : ggml_graph_get_node_idx(gf, name_l1) + 1; if (idx_l0 < 0 || idx_l1 < 0) { fprintf(stderr, "%s: layer input nodes not found\n", __func__); @@ -547,7 +549,7 @@ void ggml_mpi_graph_creation_post( } // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node - if (mpi_idx != 0) { + if (mpi_rank != 0 && mpi_size > 1) { gf->nodes[0]->op = GGML_OP_NONE; } @@ -561,6 +563,8 @@ bool ggml_mpi_graph_compute_pre(struct ggml_mpi_context * ctx_mpi, struct ggml_c return false; } +// printf("Rank %d compute pre\n", ctx_mpi->rank); + const int mpi_rank = ctx_mpi->rank; const int mpi_size = ctx_mpi->size; @@ -577,13 +581,7 @@ bool ggml_mpi_graph_compute_pre(struct ggml_mpi_context * ctx_mpi, struct ggml_c } GGML_ASSERT(inp0 == gf->nodes[0]); - { - if (mpi_rank == 0 && mpi_size > 1) { - ggml_mpi_wait_recv(ctx_mpi); - } - - } return true; } @@ -593,8 +591,9 @@ void ggml_mpi_graph_compute_post( const int mpi_rank = ctx_mpi->rank; +// printf("Rank %d compute post\n", mpi_rank); // send the output data to the next node - if (mpi_rank > 0) { + if (mpi_rank < ctx_mpi->size - 1) { ggml_mpi_tensor_send(ctx_mpi, gf->nodes[gf->n_nodes - 1], ggml_mpi_next_node(ctx_mpi)); } } diff --git a/ggml-mpi.h b/ggml-mpi.h index 110ef4c52df..ec3ee2c90cc 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -11,6 +11,24 @@ struct ggml_cgraph; extern "C" { #endif +#define GGML_MPI_DECODE 0 + +#define GGML_MPI_KV_CLEAR 1 + +#define GGML_MPI_KV_SEQ_RM 2 + +#define GGML_MPI_KV_SEQ_CP 3 + +#define GGML_MPI_KV_SEQ_KEEP 4 + +#define GGML_MPI_KV_SEQ_SHIFT 5 + +#define GGML_MPI_SHUTDOWN 6 + +#define GGML_MPI_TRANSFER_TENSORS 7 + +#define GGML_MPI_SYNC_LOGITS 8 + /** * The context used for MPI operations, * a program may make use of more than one @@ -86,6 +104,8 @@ void ggml_mpi_barrier(struct ggml_mpi_context * ctx); int ggml_mpi_next_node(struct ggml_mpi_context * ctx_mpi); +int ggml_mpi_prev_node(struct ggml_mpi_context * ctx_mpi); + void ggml_mpi_sync_ints_pipelined( struct ggml_mpi_context * ctx_mpi, int32_t * vals, @@ -159,6 +179,22 @@ void 
ggml_mpi_synch_float( int root ); +void ggml_mpi_recv_float_array( + struct ggml_mpi_context * ctx_mpi, + float * val, + int arr_size, + int src, + int tag +); + +void ggml_mpi_send_float_array_async( + struct ggml_mpi_context * ctx_mpi, + float * val, + int arr_size, + int dest, + int tag +); + /** * Split a range across all nodes within the given * context, weighting the allocations by the given weights. diff --git a/llama.cpp b/llama.cpp index ed0e0e7d04b..e8fbd1544ef 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5424,7 +5424,7 @@ static struct ggml_cgraph * llama_build_graph( static struct ggml_cgraph * llama_decode_internal_phased( llama_context & lctx, - llama_batch batch, + llama_batch & batch, uint8_t phase, ggml_cgraph * cgraph) { uint32_t n_tokens = batch.n_tokens; @@ -5497,7 +5497,8 @@ static struct ggml_cgraph * llama_decode_internal_phased( #ifdef GGML_USE_MPI // TODO: needs fix after #3228 - if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.token), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits), false)) { + if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.token), &(batch.pos), &(batch.n_seq_id), + &(batch.seq_id), &(batch.logits), false)) { return nullptr; } n_tokens = batch.n_tokens; @@ -5553,20 +5554,8 @@ static struct ggml_cgraph * llama_decode_internal_phased( } res->backend = GGML_BACKEND_CPU; #endif - return gf; - } else if (phase == 1) { - - - ggml_cgraph * gf = cgraph; - struct ggml_tensor *res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 2]; #ifdef GGML_USE_MPI -// if (ggml_mpi_rank(lctx.ctx_mpi) == 0 && ggml_mpi_size(lctx.ctx_mpi) > 1) { -// if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.token), &(batch.pos), &(batch.n_seq_id), &(batch.seq_id), &(batch.logits), true)) { -// return nullptr; -// } -// } if (!ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf)) { return nullptr; } @@ -5636,6 +5625,37 @@ static struct ggml_cgraph * llama_decode_internal_phased( ggml_graph_print(gf); #endif + return gf; + + } else if (phase == 1) { + ggml_cgraph * gf = cgraph; + struct ggml_tensor *res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 2]; + + // Resize logits + auto & logits_out = lctx.logits; + { + + + if (batch.logits || lctx.logits_all) { + logits_out.resize(n_vocab * n_tokens); + } else { + logits_out.resize(n_vocab); + } + } + +#ifdef GGML_USE_MPI + if (ggml_mpi_size(lctx.ctx_mpi) > 1 && ggml_mpi_rank(lctx.ctx_mpi) == 0) { + ggml_mpi_recv_float_array(lctx.ctx_mpi, logits_out.data(), n_vocab * n_tokens, ggml_mpi_size(lctx.ctx_mpi) - 1, GGML_MPI_SYNC_LOGITS); + } + + if (ggml_mpi_rank(lctx.ctx_mpi) == ggml_mpi_size(lctx.ctx_mpi) - 1) { + +#endif + + auto * net_output = (float *) ggml_get_data(res); + + // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); @@ -5645,22 +5665,17 @@ static struct ggml_cgraph * llama_decode_internal_phased( // TODO: do not compute and extract logits if only embeddings are needed // need to update the graphs to skip "result_output" { - auto & logits_out = lctx.logits; - if (batch.logits) { - logits_out.resize(n_vocab * n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { if (batch.logits[i] == 0) { continue; } - memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); + memcpy(logits_out.data() + (n_vocab*i), net_output + (n_vocab*i), sizeof(float)*n_vocab); } } else 
if (lctx.logits_all) { - logits_out.resize(n_vocab * n_tokens); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); + memcpy(logits_out.data(), net_output, sizeof(float)*n_vocab*n_tokens); } else { - logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), net_output + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); } } @@ -5672,6 +5687,13 @@ static struct ggml_cgraph * llama_decode_internal_phased( memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); } +#ifdef GGML_USE_MPI + } + if (ggml_mpi_size(lctx.ctx_mpi) > 1 && ggml_mpi_rank(lctx.ctx_mpi) == ggml_mpi_size(lctx.ctx_mpi) - 1) { + ggml_mpi_send_float_array_async(lctx.ctx_mpi, logits_out.data(), n_vocab * n_tokens, 0, GGML_MPI_SYNC_LOGITS); + } +#endif + // measure the performance only for the single-token evals if (n_tokens == 1) { lctx.t_eval_us += ggml_time_us() - t_start_us; @@ -5688,7 +5710,7 @@ static struct ggml_cgraph * llama_decode_internal_phased( lctx.t_load_us = ggml_time_us() - lctx.t_start_us; lctx.has_evaluated_once = true; } - return gf; + return cgraph; } return nullptr; @@ -5710,21 +5732,21 @@ static int llama_decode_internal( if (gf != nullptr) { return llama_decode_internal_phased(lctx, batch, 1, gf) != nullptr; } else { - printf("Graph is null\n"); - return -1; +// printf("Graph is null\n"); +// return -1; } } struct ggml_cgraph * llama_start_async_decode( llama_context & lctx, - llama_batch batch) { + llama_batch & batch) { return llama_decode_internal_phased(lctx, batch, 0, nullptr); } int llama_finish_async_decode( struct llama_context & lctx, - struct llama_batch batch, + struct llama_batch & batch, struct ggml_cgraph * cgraph) { int ret; @@ -9634,25 +9656,25 @@ int llama_process_mpi_worker( ggml_mpi_probe(ctx->ctx_mpi, -1, -1); int tag = ggml_mpi_status_tag(ctx->ctx_mpi); switch (tag) { - case 0: + case GGML_MPI_DECODE: return llama_decode_internal(*ctx, batch); break; - case 1: + case GGML_MPI_KV_CLEAR: llama_kv_cache_clear(ctx); break; - case 2: + case GGML_MPI_KV_SEQ_RM: llama_kv_cache_seq_rm(ctx, 1, -1, -1); break; - case 3: + case GGML_MPI_KV_SEQ_CP: llama_kv_cache_seq_cp(ctx, 0, 0, 0, 0); break; - case 4: + case GGML_MPI_KV_SEQ_KEEP: llama_kv_cache_seq_keep(ctx, 0); break; - case 5: + case GGML_MPI_KV_SEQ_SHIFT: llama_kv_cache_seq_shift(ctx, 0, 0, 0, 0); break; - case 6: + case GGML_MPI_SHUTDOWN: llama_free(ctx); llama_backend_free(); exit(0); diff --git a/llama.h b/llama.h index 9f5c81615d7..fa0e7ace81d 100644 --- a/llama.h +++ b/llama.h @@ -279,10 +279,10 @@ extern "C" { LLAMA_API void llama_sync_token(struct llama_context * ctx, llama_token * token, int root); LLAMA_API struct ggml_cgraph * llama_start_async_decode(struct llama_context & lctx, - struct llama_batch batch); + struct llama_batch & batch); LLAMA_API int llama_finish_async_decode(struct llama_context & lctx, - struct llama_batch batch, + struct llama_batch & batch, struct ggml_cgraph * cgraph); LLAMA_API void llama_sync_token_data(struct llama_context * ctx, llama_token_data * data, int root); From a9685bbc9e605fb4b4cf2a1e5f74bb243e92676e Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 23 Nov 2023 18:00:11 -0600 Subject: [PATCH 48/63] Fix segfault and working async w/ no np --- examples/speculative/speculative.cpp | 304 +++++++++++++++++++-------- ggml-mpi.c | 23 +- 2 files changed, 225 insertions(+), 102 deletions(-) diff 
--git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 5ae6c8872ce..42682de4c6e 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -31,6 +31,9 @@ struct seq_async_run { std::vector drafts; int run_id; int n_past_tgt; + int i_dft; + int s_keep; + llama_sampling_context *ctx_sampling; }; int main(int argc, char ** argv) { @@ -208,6 +211,8 @@ int main(int argc, char ** argv) { llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft + 1); + llama_batch batch_tgt_async = llama_batch_init(params.n_ctx, 0, n_seq_dft + 1); + std::deque dft_cgraphs; std::deque tgt_cgraphs; @@ -219,44 +224,67 @@ int main(int argc, char ** argv) { drafts[0].i_batch_tgt[0] = 0; int run_id = 0; + int offset = 1; + bool has_asynced = false; + int run_n_past_tgt = n_past_tgt; while (true) { - // print current draft sequences - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - - const auto & tokens = drafts[s].tokens; - LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); - } int i_dft = 0; int s_keep = 0; if (!tgt_cgraphs.empty()) { - LOG("Finishing async decode, should_run_async = %d\n", should_run_async); + has_asynced = true; struct seq_async_run run = tgt_cgraphs.back(); + LOG("Finishing async decode, is async = %d\n", run.run_id == ASYNC_RUN_ID); struct ggml_cgraph * cgraph = run.cgraph; llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); tgt_cgraphs.pop_back(); run_id = run.run_id; + drafts = run.drafts; +// ctx_sampling = run.ctx_sampling; + run_n_past_tgt = run.n_past_tgt; +// s_keep = run.s_keep; if (run_id == ASYNC_RUN_ID) { // llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); } else { -// llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); +// offset++; +// i_dft = 0; + } + } else { + run_n_past_tgt = n_past_tgt; + } + // print current draft sequences + bool any_active = false; + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; } + + any_active = true; + const auto & tokens = drafts[s].tokens; + + LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); } + LOG("Any active drafts: %d\n", any_active); + + bool any_match = false; llama_token id; std::string token_str; while (true) { - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); +// if (i_dft >= drafts[s_keep].i_batch_tgt.size()) { +// break; +// } + LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d, run_n_past_tgt = %3d, n_past_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft], run_n_past_tgt, n_past_tgt); +// if (has_asynced && run_id != ASYNC_RUN_ID) { +// break; +// } // sample from the target model id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); @@ -266,27 +294,33 @@ int main(int argc, char ** argv) { llama_sync_token(ctx_tgt, &id, 0); - LOG("Is async: %d\n", !should_run_async); + LOG("Should run async: %d\n", should_run_async); LOG("Sampling index: %d\n", drafts[s_keep].i_batch_tgt[i_dft]); - llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); const int n_vocab = llama_n_vocab(llama_get_model(ctx_tgt)); float * logits = llama_get_logits_ith(ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); - LOG("logits:\n"); - for (llama_token token_id = 0; 
token_id < n_vocab; token_id++) { - LOG("\t%d: %.4f\n", token_id, logits[token_id]); - } +// LOG("logits:\n"); +// for (llama_token token_id = 0; token_id < n_vocab; token_id++) { +// LOG("\t%d: %.4f\n", token_id, logits[token_id]); +// } + + if (run_n_past_tgt + drafts[s_keep].i_batch_tgt[i_dft] == n_past_tgt) { + any_match = true; + ++n_predict; + llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); - // Root of WORLD + // Root of WORLD - if (llama_node_id(ctx_tgt) == 0) { - token_str = llama_token_to_piece(ctx_tgt, id); - LOG("Sampled token: %d ('%s'), n_past_tgt: %d\n", id, token_str.c_str(), n_past_tgt); - printf("%s", token_str.c_str()); - fflush(stdout); + if (llama_node_id(ctx_tgt) == 0) { + token_str = llama_token_to_piece(ctx_tgt, id); + LOG("Sampled token: %d ('%s'), n_past_tgt: %d\n", id, token_str.c_str(), n_past_tgt); + printf("%s", token_str.c_str()); + fflush(stdout); + } } // Switch back to target pipeline only @@ -294,11 +328,19 @@ int main(int argc, char ** argv) { LOG("Swapped comm to target only, id %d\n", llama_node_id(ctx_tgt)); + if (id == llama_token_eos(model_tgt)) { has_eos = true; } - ++n_predict; + + + + + if (run_id == ASYNC_RUN_ID) { + break; + } + // check if the target token matches any of the drafts { // Only running this when should_run_async starts out okay but still goes off the rails eventually @@ -320,11 +362,13 @@ int main(int argc, char ** argv) { } if (matches) { - ++n_accept; - ++n_past_tgt; - ++n_past_dft; + if (run_n_past_tgt + drafts[s_keep].i_batch_tgt[i_dft] == n_past_tgt) { + ++n_accept; + ++n_past_tgt; + ++n_past_dft; + } ++i_dft; - if (should_run_async) { + if (run_id != ASYNC_RUN_ID && run_n_past_tgt + drafts[s_keep].i_batch_tgt[i_dft] <= n_past_tgt) { continue; } } @@ -336,20 +380,27 @@ int main(int argc, char ** argv) { break; } + + if (llama_node_id(ctx_tgt) < 0) { LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); } + if (!any_match) { + should_run_async = !should_run_async; + continue; + } // TODO: simplify - if (should_run_async) { + { LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); // Pipeline syncing cache ops llama_kv_cache_seq_keep(ctx_dft, s_keep); llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); llama_kv_cache_seq_keep(ctx_dft, 0); + llama_kv_cache_seq_rm (ctx_dft, 0, n_past_dft, -1); // llama_kv_cache_seq_rm (ctx_tgt, s_keep+run_id, n_past_tgt, -1); llama_kv_cache_seq_rm (ctx_tgt, 0, n_past_tgt, -1); // Forces "create" and nothing else effects it @@ -363,16 +414,16 @@ int main(int argc, char ** argv) { // // } if (i != s_keep) { - llama_kv_cache_seq_rm(ctx_tgt, i + 0, -1, -1); + llama_kv_cache_seq_rm(ctx_tgt, i + 0, -1, n_past_tgt); } } // llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, run_id, -1, n_past_tgt); + llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, n_past_tgt); // llama_kv_cache_seq_keep(ctx_tgt, 0); for (int i = 1; i < n_seq_dft; i++) { // llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); - llama_kv_cache_seq_rm (ctx_tgt, i+run_id, -1, n_past_tgt); + llama_kv_cache_seq_rm (ctx_tgt, i, -1, n_past_tgt); } // llama_kv_cache_seq_rm (ctx_tgt, run_id, n_past_tgt, n_past_tgt+2); @@ -383,40 +434,79 @@ int main(int argc, char ** argv) { } -// if (should_run_async) { -//// LOG("Beginning async decode\n"); -// llama_batch_clear(batch_tgt); -// llama_batch_add(batch_tgt, id, n_past_tgt, {ASYNC_RUN_ID}, true); -// // batch_tgt.n_tokens = 1 -// -// -// 
for (int i = 0; i < n_seq_dft; i++) { -//// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, n_past_tgt, -1); -// } -// -//// llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); -// -//// llama_kv_cache_seq_keep(ctx_tgt, s_keep); -//// llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, ASYNC_RUN_ID, -1, n_past_tgt); -//// llama_kv_cache_seq_keep(ctx_tgt, 0); -// for (int i = 1; i < n_seq_dft; i++) { -//// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); -// } -//// llama_kv_cache_seq_rm (ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt+2); -// -// -// -// struct seq_async_run run; -// run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); -// run.batch = batch_tgt; -// run.run_id = 0; -// run.n_past_tgt = n_past_tgt; -// tgt_cgraphs.push_front(run); -//// llama_kv_cache_seq_rm(ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt + 2); -// -// } -// -// should_run_async = !should_run_async; + for (int s = 1; s < n_seq_dft; ++s) { + llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, n_past_tgt); + } + + + + if (should_run_async) { + LOG("Beginning async decode\n"); + llama_batch_clear(batch_tgt_async); + std::vector seq_ids; + for (int i = 0; i < n_seq_dft; i++) { + seq_ids.emplace_back(i); + } + llama_batch_add(batch_tgt_async, id, n_past_tgt, seq_ids, true); + // batch_tgt.n_tokens = 1 + + + for (int i = 0; i < n_seq_dft; i++) { +// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, n_past_tgt, -1); + } + +// llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); + +// llama_kv_cache_seq_keep(ctx_tgt, s_keep); +// llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, ASYNC_RUN_ID, -1, n_past_tgt); +// llama_kv_cache_seq_keep(ctx_tgt, 0); + for (int i = 1; i < n_seq_dft; i++) { +// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); + } +// llama_kv_cache_seq_rm (ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt+2); + + + + struct seq_async_run run; + run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt_async); +// llama_decode(ctx_tgt, batch_tgt_async); + run.batch = llama_batch_init(params.n_ctx, 0, n_seq_dft + 1); + run.batch.n_tokens = batch_tgt_async.n_tokens; + for (int i = 0; i < batch_tgt_async.n_tokens; i++) { + run.batch.n_seq_id[i] = batch_tgt_async.n_seq_id[i]; + for (int j = 0; j < run.batch.n_seq_id[i]; j++) { + run.batch.seq_id[i][j] = batch_tgt_async.seq_id[i][j]; + } + run.batch.token[i] = batch_tgt_async.token[i]; + run.batch.pos[i] = batch_tgt_async.pos[i]; + run.batch.logits[i] = batch_tgt_async.logits[i]; + } + run.ctx_sampling = llama_sampling_init(params.sparams); + llama_sampling_cp(ctx_sampling, run.ctx_sampling); + run.drafts = std::vector(n_seq_dft); + for (int s = 0; s < n_seq_dft; ++s) { + run.drafts[s].ctx_sampling = llama_sampling_init(params.sparams); + llama_sampling_cp(drafts[s].ctx_sampling, run.drafts[s].ctx_sampling); + run.drafts[s].i_batch_tgt = std::vector(1,0); + run.drafts[s].i_batch_dft = drafts[s].i_batch_dft; + run.drafts[s].tokens = std::vector(1, id); + run.drafts[s].active = drafts[s].active; + run.drafts[s].drafting = drafts[s].drafting; + run.drafts[s].skip = drafts[s].skip; + } + run.i_dft = offset - 1; + run.s_keep = s_keep; + run.run_id = ASYNC_RUN_ID; + run.n_past_tgt = n_past_tgt + 1; + tgt_cgraphs.push_front(run); + llama_kv_cache_seq_rm(ctx_tgt, -1, n_past_tgt, -1); + + } + + should_run_async = !should_run_async; + + + llama_batch_clear(batch_tgt); llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); @@ -439,12 +529,22 @@ int main(int argc, char ** argv) { // Remove all tokens from all sequences 
after n_past_dft llama_kv_cache_seq_rm(ctx_dft, -1, n_past_dft, -1); + + // Kick off drafting pipeline but don't need it just yet LOG("Beginning async draft\n"); dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); //llama_decode(ctx_dft, batch_dft); // DON'T FORGET THE MATCHING DECODE WHEN NEEDED + // We need the draft now, so wait for it + if (!dft_cgraphs.empty()) { + LOG("Finishing async decode of draft\n"); + llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); + dft_cgraphs.pop_back(); + } + LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + ++n_past_dft; if (n_predict > params.n_predict || has_eos) { @@ -460,20 +560,13 @@ int main(int argc, char ** argv) { drafts[s].active = false; drafts[s].drafting = false; } - drafts[0].active = true; - drafts[0].drafting = true; - drafts[0].i_batch_dft = 0; - // We need the draft now, so wait for it - if (!dft_cgraphs.empty()) { - LOG("Finishing async decode of draft\n"); - llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); - dft_cgraphs.pop_back(); - } - LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + drafts[0].active = true; + drafts[0].drafting = true; + drafts[0].i_batch_dft = 0; // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { @@ -488,7 +581,7 @@ int main(int argc, char ** argv) { continue; } - llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + // Swap back to pipeline roots llama_swap_comm(ctx_dft); @@ -496,9 +589,11 @@ int main(int argc, char ** argv) { llama_sync_token(ctx_dft, &(drafts[s].i_batch_dft), 1); + llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + auto & cur_p = drafts[s].ctx_sampling->cur; - llama_sync_token_data(ctx_dft, &(cur_p[0]), 1); + llama_sync_token_data(ctx_dft, cur_p.data(), 1); // TODO investigate potential bottleneck for (int k = 1; k < 8; ++k) { llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); @@ -581,11 +676,14 @@ int main(int argc, char ** argv) { // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); + LOG("Adding drafted token %d to tgt\n", id); llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; + LOG("Adding drafted token %d to dft\n", id); + llama_batch_add(batch_dft, id, n_past_cur, { s }, true); if (batch_tgt.n_tokens > n_draft) { @@ -610,34 +708,60 @@ int main(int argc, char ** argv) { } } + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } + + drafts[s].tokens.erase(drafts[s].tokens.begin()); + } + + + // evaluate the target model on the drafted tokens { // llama_kv_cache_seq_keep(ctx_tgt, 0); // Needed to get to "Here's the code:" - for (int s = 1; s < n_seq_dft; ++s) { - llama_kv_cache_seq_cp(ctx_tgt, run_id, s+run_id, -1, n_past_tgt); - } ++n_past_tgt; LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); struct seq_async_run run; - run.batch = batch_tgt; + run.ctx_sampling = llama_sampling_init(params.sparams); + llama_sampling_cp(ctx_sampling, run.ctx_sampling); + run.drafts = std::vector(n_seq_dft); + for (int s = 0; s < n_seq_dft; ++s) { + run.drafts[s].ctx_sampling = llama_sampling_init(params.sparams); + llama_sampling_cp(drafts[s].ctx_sampling, run.drafts[s].ctx_sampling); + run.drafts[s].i_batch_tgt = drafts[s].i_batch_tgt; + 
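The fields copied in this hunk give each queued run its own snapshot of the sampling state and draft sequences, so a finished or cancelled run can be examined without touching the live `drafts` array. A minimal sketch of that copy as a standalone helper — `snapshot_drafts` is hypothetical, while `seq_draft`, `llama_sampling_init`, and `llama_sampling_cp` are the types and calls already used in this example:

```cpp
// Sketch only: deep-copy per-sequence draft state for an in-flight run.
// Assumes the seq_draft struct and llama_sampling_* helpers from this example.
static std::vector<seq_draft> snapshot_drafts(const std::vector<seq_draft> & src,
                                              const llama_sampling_params & sparams) {
    std::vector<seq_draft> dst(src.size());
    for (size_t s = 0; s < src.size(); ++s) {
        // sampling contexts are owned pointers, so each copy gets a fresh one
        dst[s].ctx_sampling = llama_sampling_init(sparams);
        llama_sampling_cp(src[s].ctx_sampling, dst[s].ctx_sampling);
        // plain members and vectors can be copied by value
        dst[s].tokens      = src[s].tokens;
        dst[s].i_batch_tgt = src[s].i_batch_tgt;
        dst[s].i_batch_dft = src[s].i_batch_dft;
        dst[s].active      = src[s].active;
        dst[s].drafting    = src[s].drafting;
        dst[s].skip        = src[s].skip;
    }
    return dst;
}
```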
run.drafts[s].tokens = drafts[s].tokens; + run.drafts[s].active = drafts[s].active; + run.drafts[s].drafting = drafts[s].drafting; + run.drafts[s].skip = drafts[s].skip; + run.drafts[s].i_batch_dft = drafts[s].i_batch_dft; + } + run.i_dft = offset; + run.s_keep = s_keep; + run.batch = llama_batch_init(params.n_ctx, 0, n_seq_dft + 1); + run.batch.n_tokens = batch_tgt.n_tokens; + for (int i = 0; i < batch_tgt.n_tokens; i++) { + run.batch.n_seq_id[i] = batch_tgt.n_seq_id[i]; + for (int j = 0; j < run.batch.n_seq_id[i]; j++) { + run.batch.seq_id[i][j] = batch_tgt.seq_id[i][j]; + } + run.batch.token[i] = batch_tgt.token[i]; + run.batch.pos[i] = batch_tgt.pos[i]; + run.batch.logits[i] = batch_tgt.logits[i]; + } run.run_id = 0; run.n_past_tgt = n_past_tgt; - run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt); + run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); tgt_cgraphs.push_front(run); } - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - drafts[s].tokens.erase(drafts[s].tokens.begin()); - } } auto t_dec_end = ggml_time_us(); diff --git a/ggml-mpi.c b/ggml-mpi.c index e9f4bb8c558..f5195777344 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -194,11 +194,11 @@ bool ggml_mpi_eval_init( // we can't guarantee the allocated sizes are correct // TODO check how often this is done and if it's a problem, // try to allocate ahead of time - if (old_n_tokens != *n_tokens) { - *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); - *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); - *tokens = realloc(*tokens, *n_tokens * sizeof(int32_t )); - } +// if (old_n_tokens != *n_tokens) { +// *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); +// *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); +// *tokens = realloc(*tokens, *n_tokens * sizeof(int32_t )); +// } ggml_mpi_sync_pipelined(ctx_mpi, *tokens, *n_tokens, MPI_INT32_T, 0); @@ -233,18 +233,17 @@ bool ggml_mpi_eval_init( ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, 0); ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0); - int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*)); current_index = 0; for (int32_t i = 0; i < *n_tokens; i++) { - new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t)); - for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { - new_seq_id[i][j] = flattened_seq_ids[current_index]; - current_index++; + if (i < *n_tokens) { + for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { + (*seq_id)[i][j] = flattened_seq_ids[current_index]; + current_index++; + } } } free(flattened_seq_ids); - //free(*seq_id); // <- something is still holding onto this, need to investigate - *seq_id = new_seq_id; + return true; } From 7081a7aabb198662eb2ab93b2123caf446dcfe5e Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 28 Nov 2023 12:26:14 -0600 Subject: [PATCH 49/63] Fix np >= 2 by using sequence offsets --- examples/speculative/speculative.cpp | 267 +++++++++++++++++---------- ggml-mpi.c | 31 ++-- llama.cpp | 4 +- 3 files changed, 180 insertions(+), 122 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 42682de4c6e..f4329b89c6b 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -33,7 +33,10 @@ struct seq_async_run { int n_past_tgt; int i_dft; int s_keep; + int seq_offset; + int n_past_max; llama_sampling_context *ctx_sampling; + bool speculative; }; int main(int argc, char ** argv) { @@ -172,10 +175,27 @@ int main(int argc, char ** 
argv) { const auto t_enc_start = ggml_time_us(); + llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); + llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); + llama_batch batch_tgt_async = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); + + std::vector seq_ids; + for (int i = 0; i < 2*n_seq_dft+1; i++) { + seq_ids.emplace_back(i); + } + + for (size_t i = 0; i < inp.size()-1; i++) { + llama_batch_add(batch_dft, inp[i], i, seq_ids, true); + llama_batch_add(batch_tgt, inp[i], i, seq_ids, true); + } + llama_decode(ctx_tgt, batch_tgt); + llama_batch_clear(batch_tgt); + llama_batch_add(batch_dft, inp.back(), n_input-1, seq_ids, true); + llama_batch_add(batch_tgt, inp.back(), n_input-1, seq_ids, true); + // eval the prompt with both models - llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); - llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); - llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); + llama_decode(ctx_tgt, batch_tgt); + llama_decode(ctx_dft, batch_dft); const auto t_enc_end = ggml_time_us(); @@ -209,9 +229,7 @@ int main(int argc, char ** argv) { drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft + 1); - llama_batch batch_tgt_async = llama_batch_init(params.n_ctx, 0, n_seq_dft + 1); + std::deque dft_cgraphs; @@ -227,7 +245,10 @@ int main(int argc, char ** argv) { int offset = 1; bool has_asynced = false; int run_n_past_tgt = n_past_tgt; - + int seq_offset = 1; + struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx_tgt, 2*n_seq_dft+1); + std::vector generated = inp; + bool run_speculative = false; while (true) { @@ -237,14 +258,22 @@ int main(int argc, char ** argv) { if (!tgt_cgraphs.empty()) { has_asynced = true; struct seq_async_run run = tgt_cgraphs.back(); - LOG("Finishing async decode, is async = %d\n", run.run_id == ASYNC_RUN_ID); + LOG("Finishing async decode, is async = %d, old seq_offset = %d\n", run.run_id == ASYNC_RUN_ID, seq_offset); struct ggml_cgraph * cgraph = run.cgraph; llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); + llama_swap_comm(ctx_tgt); + if (llama_node_id(ctx_tgt) == 0) { + llama_kv_cache_view_update(ctx_tgt, &kvc_view); + //dump_kv_cache_view_seqs(kvc_view, 20); + } + llama_swap_comm(ctx_tgt); tgt_cgraphs.pop_back(); run_id = run.run_id; drafts = run.drafts; + run_speculative = run.speculative; // ctx_sampling = run.ctx_sampling; run_n_past_tgt = run.n_past_tgt; + seq_offset = run.seq_offset; // s_keep = run.s_keep; if (run_id == ASYNC_RUN_ID) { // llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); @@ -254,7 +283,7 @@ int main(int argc, char ** argv) { // i_dft = 0; } } else { - run_n_past_tgt = n_past_tgt; +// run_n_past_tgt = n_past_tgt; } // print current draft sequences @@ -275,19 +304,35 @@ int main(int argc, char ** argv) { bool any_match = false; llama_token id; std::string token_str; - while (true) { -// if (i_dft >= drafts[s_keep].i_batch_tgt.size()) { -// break; -// } - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d, run_n_past_tgt = %3d, n_past_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft], run_n_past_tgt, n_past_tgt); -// if (has_asynced && run_id != ASYNC_RUN_ID) { -// break; -// } + + if (run_speculative) { + LOG("Speculative run, last generated: %d, first draft: %d\n", generated.back(), 
drafts[s_keep].tokens[0]); + if(generated.back() == drafts[s_keep].tokens[0]) { + //drafts[s_keep].tokens.erase(drafts[s_keep].tokens.begin()); + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } + + drafts[s].tokens.erase(drafts[s].tokens.begin()); + } + + } else { + continue; + } + + } + std::vector keeps = seq_ids; + while (!keeps.empty()) { + + LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d, run_n_past_tgt = %3d, n_past_tgt = %3d, seq_offset = %d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft], run_n_past_tgt, n_past_tgt, seq_offset); + // sample from the target model - id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[keeps[0]].i_batch_tgt[i_dft]); + token_str = llama_token_to_piece(ctx_tgt, id); // Swap to pipeline roots llama_swap_comm(ctx_tgt); LOG("Swapped comm to pipeline roots, id %d\n", llama_node_id(ctx_tgt)); @@ -300,24 +345,19 @@ int main(int argc, char ** argv) { LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); - const int n_vocab = llama_n_vocab(llama_get_model(ctx_tgt)); - float * logits = llama_get_logits_ith(ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); -// LOG("logits:\n"); -// for (llama_token token_id = 0; token_id < n_vocab; token_id++) { -// LOG("\t%d: %.4f\n", token_id, logits[token_id]); -// } + LOG("Sampled token: %d ('%s'), n_past_tgt: %d\n", id, token_str.c_str(), n_past_tgt); + - if (run_n_past_tgt + drafts[s_keep].i_batch_tgt[i_dft] == n_past_tgt) { + if (run_n_past_tgt + i_dft == n_past_tgt) { any_match = true; ++n_predict; llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); // Root of WORLD - + LOG("Accepting token %d ('%s'), n_past_tgt: %d\n", id, token_str.c_str(), n_past_tgt); + generated.push_back(id); if (llama_node_id(ctx_tgt) == 0) { - token_str = llama_token_to_piece(ctx_tgt, id); - LOG("Sampled token: %d ('%s'), n_past_tgt: %d\n", id, token_str.c_str(), n_past_tgt); printf("%s", token_str.c_str()); fflush(stdout); } @@ -345,7 +385,7 @@ int main(int argc, char ** argv) { // check if the target token matches any of the drafts { // Only running this when should_run_async starts out okay but still goes off the rails eventually bool matches = false; - + keeps.clear(); for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { continue; @@ -356,19 +396,20 @@ int main(int argc, char ** argv) { s_keep = s; matches = true; + keeps.push_back(s); } else { drafts[s].active = false; } } if (matches) { - if (run_n_past_tgt + drafts[s_keep].i_batch_tgt[i_dft] == n_past_tgt) { + if (run_n_past_tgt + i_dft == n_past_tgt) { ++n_accept; ++n_past_tgt; ++n_past_dft; } ++i_dft; - if (run_id != ASYNC_RUN_ID && run_n_past_tgt + drafts[s_keep].i_batch_tgt[i_dft] <= n_past_tgt) { + if (run_id != ASYNC_RUN_ID && run_n_past_tgt + i_dft <= n_past_tgt) { continue; } } @@ -377,7 +418,7 @@ int main(int argc, char ** argv) { - break; + //break; } @@ -392,85 +433,75 @@ int main(int argc, char ** argv) { continue; } + // Pipeline syncing cache ops + llama_kv_cache_seq_keep(ctx_dft, s_keep); + llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(ctx_dft, 0); + llama_kv_cache_seq_rm (ctx_dft, 0, n_past_dft, -1); + // TODO: simplify - { + if (run_id != ASYNC_RUN_ID){ LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - // Pipeline syncing cache ops - llama_kv_cache_seq_keep(ctx_dft, s_keep); - llama_kv_cache_seq_cp 
(ctx_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_dft, 0); - llama_kv_cache_seq_rm (ctx_dft, 0, n_past_dft, -1); -// llama_kv_cache_seq_rm (ctx_tgt, s_keep+run_id, n_past_tgt, -1); - llama_kv_cache_seq_rm (ctx_tgt, 0, n_past_tgt, -1); // Forces "create" and nothing else effects it +// llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1); for (int i = 0; i < n_seq_dft; i++) { -// if (run_id == ASYNC_RUN_ID) { -// llama_kv_cache_seq_rm(ctx_tgt, i + 0, n_past_tgt, -1); -// } else { -//// llama_kv_cache_seq_rm(ctx_tgt, i + ASYNC_RUN_ID, n_past_tgt, -1); -// -// } - if (i != s_keep) { - llama_kv_cache_seq_rm(ctx_tgt, i + 0, -1, n_past_tgt); - } + } -// llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, n_past_tgt); -// llama_kv_cache_seq_keep(ctx_tgt, 0); - for (int i = 1; i < n_seq_dft; i++) { -// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); - llama_kv_cache_seq_rm (ctx_tgt, i, -1, n_past_tgt); + llama_kv_cache_seq_cp (ctx_tgt, s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); + for (int i = 0; i < n_seq_dft; i++) { + llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, -1); } -// llama_kv_cache_seq_rm (ctx_tgt, run_id, n_past_tgt, n_past_tgt+2); -// llama_kv_cache_seq_rm (ctx_tgt, 0, n_past_tgt, n_past_tgt+2); -// llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); + for (int i = 0; i < 2*n_seq_dft+1; i++) { + llama_kv_cache_seq_cp(ctx_tgt, 0, i, -1, n_past_tgt); + } - } +// for (int i = 0; i < n_seq_dft; i++) { +// llama_kv_cache_seq_cp(ctx_tgt, 0, i+seq_offset, -1, n_past_tgt); +// } - for (int s = 1; s < n_seq_dft; ++s) { - llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, n_past_tgt); - } + } else { +// llama_kv_cache_seq_cp (ctx_tgt, s_keep+seq_offset, 0, -1, n_past_tgt); +// for (int i = 1; i < n_seq_dft; i++) { +// llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, n_past_tgt); +// +// } + +// for (int i = 0; i < n_seq_dft; i++) { +// llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, n_past_tgt); +// llama_kv_cache_seq_cp(ctx_tgt, 0, i+seq_offset, -1, n_past_tgt); +// } + } - if (should_run_async) { - LOG("Beginning async decode\n"); - llama_batch_clear(batch_tgt_async); - std::vector seq_ids; - for (int i = 0; i < n_seq_dft; i++) { - seq_ids.emplace_back(i); - } - llama_batch_add(batch_tgt_async, id, n_past_tgt, seq_ids, true); - // batch_tgt.n_tokens = 1 - for (int i = 0; i < n_seq_dft; i++) { -// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, n_past_tgt, -1); - } -// llama_kv_cache_seq_cp (ctx_tgt, run_id, ASYNC_RUN_ID, -1, n_past_tgt); -// llama_kv_cache_seq_keep(ctx_tgt, s_keep); -// llama_kv_cache_seq_cp (ctx_tgt, s_keep+run_id, ASYNC_RUN_ID, -1, n_past_tgt); -// llama_kv_cache_seq_keep(ctx_tgt, 0); - for (int i = 1; i < n_seq_dft; i++) { -// llama_kv_cache_seq_rm (ctx_tgt, i+ASYNC_RUN_ID, -1, n_past_tgt); - } -// llama_kv_cache_seq_rm (ctx_tgt, ASYNC_RUN_ID, n_past_tgt, n_past_tgt+2); + { + LOG("Beginning async decode\n"); + llama_batch_clear(batch_tgt_async); + llama_batch_add(batch_tgt_async, id, n_past_tgt, {0}, true); + // batch_tgt.n_tokens = 1 + ++n_past_tgt; struct seq_async_run run; - run.cgraph = llama_start_async_decode(*ctx_tgt, batch_tgt_async); -// llama_decode(ctx_tgt, batch_tgt_async); - run.batch = llama_batch_init(params.n_ctx, 0, n_seq_dft + 1); + + if (seq_offset == 1) { + run.seq_offset = n_seq_dft + 1; + } else { + run.seq_offset = 1; + } + run.batch = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); run.batch.n_tokens = batch_tgt_async.n_tokens; for (int i = 0; i < batch_tgt_async.n_tokens; 
i++) { run.batch.n_seq_id[i] = batch_tgt_async.n_seq_id[i]; @@ -497,19 +528,39 @@ int main(int argc, char ** argv) { run.i_dft = offset - 1; run.s_keep = s_keep; run.run_id = ASYNC_RUN_ID; - run.n_past_tgt = n_past_tgt + 1; + run.n_past_tgt = n_past_tgt; + run.speculative = false; + run.n_past_max = n_past_tgt; + run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); tgt_cgraphs.push_front(run); - llama_kv_cache_seq_rm(ctx_tgt, -1, n_past_tgt, -1); - + //llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_past_tgt+1); + for (int i = 0; i < 2*n_seq_dft+1; i++) { + llama_kv_cache_seq_cp(ctx_tgt, 0, i, n_past_tgt-1, n_past_tgt); + } } should_run_async = !should_run_async; + bool should_run_spec = true; + for (const auto& r : tgt_cgraphs) { + if (r.seq_offset == seq_offset && r.run_id != ASYNC_RUN_ID) { + should_run_spec = false; + break; + } + } + +// if (!should_run_spec) { +// if (!should_run_async) { +// n_past_tgt++; +// n_past_dft++; +// } +// continue; +// } llama_batch_clear(batch_tgt); - llama_batch_add (batch_tgt, id, n_past_tgt, { 0 }, true); + //llama_batch_add (batch_tgt, id, n_past_tgt, { seq_offset }, true); for (int s = 0; s < n_seq_dft; ++s) { drafts[s].active = false; @@ -518,7 +569,7 @@ int main(int argc, char ** argv) { } // note: will be erased after the speculation phase drafts[0].tokens.push_back(id); - drafts[0].i_batch_tgt.push_back(0); + //drafts[0].i_batch_tgt.push_back(0); llama_batch_clear(batch_dft); llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); @@ -635,8 +686,8 @@ int main(int argc, char ** argv) { // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) { - if (batch_tgt.seq_id[t][p] == s) { - batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur; + if (batch_tgt.seq_id[t][p] == s+seq_offset) { + batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur+seq_offset; batch_tgt.n_seq_id[t]++; break; } @@ -674,10 +725,11 @@ int main(int argc, char ** argv) { drafts[s].tokens.push_back(id); // add unique drafted tokens to the target batch + drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); LOG("Adding drafted token %d to tgt\n", id); - llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); + llama_batch_add(batch_tgt, id, n_past_tgt + i, { s+seq_offset }, true); // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; @@ -708,13 +760,7 @@ int main(int argc, char ** argv) { } } - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - drafts[s].tokens.erase(drafts[s].tokens.begin()); - } @@ -722,11 +768,27 @@ int main(int argc, char ** argv) { { // llama_kv_cache_seq_keep(ctx_tgt, 0); // Needed to get to "Here's the code:" - ++n_past_tgt; + + + if (batch_tgt.n_tokens == 0) { + continue; + } + + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } + + drafts[s].tokens.erase(drafts[s].tokens.begin()); + //drafts[s].tokens.erase(drafts[s].tokens.begin()); + } + LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); struct seq_async_run run; + run.speculative = true; + run.seq_offset = seq_offset; run.ctx_sampling = llama_sampling_init(params.sparams); llama_sampling_cp(ctx_sampling, run.ctx_sampling); run.drafts = std::vector(n_seq_dft); @@ -742,10 +804,11 @@ int main(int argc, char ** argv) { } run.i_dft = offset; run.s_keep = s_keep; - run.batch = llama_batch_init(params.n_ctx, 0, n_seq_dft + 1); + 
run.batch = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); run.batch.n_tokens = batch_tgt.n_tokens; for (int i = 0; i < batch_tgt.n_tokens; i++) { run.batch.n_seq_id[i] = batch_tgt.n_seq_id[i]; + int cur_n_seqs = 0; for (int j = 0; j < run.batch.n_seq_id[i]; j++) { run.batch.seq_id[i][j] = batch_tgt.seq_id[i][j]; } @@ -754,7 +817,7 @@ int main(int argc, char ** argv) { run.batch.logits[i] = batch_tgt.logits[i]; } run.run_id = 0; - run.n_past_tgt = n_past_tgt; + run.n_past_tgt = n_past_tgt+1; run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); tgt_cgraphs.push_front(run); diff --git a/ggml-mpi.c b/ggml-mpi.c index f5195777344..90be4831cd4 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -60,9 +60,8 @@ struct ggml_mpi_context * ggml_mpi_init(void) { ctx->asyncRecvWaiting = false; ctx->running_decode = false; ctx->async = false; - ctx->send_buffer = calloc(1, 4096*1024*32); // 128MB buffer + ctx->send_buffer = calloc(1, 128*1024*1024); // 128MB buffer MPI_Buffer_attach(ctx->send_buffer, 4096*1024*32); -// ctx->status = *MPI_STATUS_IGNORE; return ctx; } @@ -159,7 +158,7 @@ void ggml_mpi_sync_pipelined( return; } -// printf("Rank %d sync pipelined\n", ctx_mpi->rank); + //printf("Rank %d sync pipelined\n", ctx_mpi->rank); if (ctx_mpi->rank != 0) { @@ -190,15 +189,13 @@ bool ggml_mpi_eval_init( ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, 0); - // If what was passed in differs from what was broadcast, - // we can't guarantee the allocated sizes are correct - // TODO check how often this is done and if it's a problem, - // try to allocate ahead of time -// if (old_n_tokens != *n_tokens) { -// *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); -// *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); -// *tokens = realloc(*tokens, *n_tokens * sizeof(int32_t )); -// } + // For now, we assume that the pos, seq_ids, tokens, etc have been + // pre-allocated for the largest possible sizes, even on worker nodes. 
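The comment above leans on every rank allocating its batch buffers for the worst case up front, which is what lets the reallocation below stay commented out. A minimal sketch of that assumption — `n_ctx` and `max_seq` stand in for whatever maximums the caller uses; the actual allocation happens through `llama_batch_init` on each node:

```cpp
// Sketch only: size the batch for the largest broadcast the head node can send,
// so the pipelined token/pos/seq_id syncs never have to grow these arrays.
llama_batch batch = llama_batch_init(/*n_tokens*/ n_ctx,
                                     /*embd    */ 0,
                                     /*n_seq_max*/ max_seq);
// ... reuse the same batch for every decode on this rank ...
llama_batch_free(batch);
```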
+ //if (old_n_tokens != *n_tokens) { + // *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); + // *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); + // *tokens = realloc(*tokens, *n_tokens * sizeof(int32_t )); + //} ggml_mpi_sync_pipelined(ctx_mpi, *tokens, *n_tokens, MPI_INT32_T, 0); @@ -235,12 +232,11 @@ bool ggml_mpi_eval_init( current_index = 0; for (int32_t i = 0; i < *n_tokens; i++) { - if (i < *n_tokens) { - for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { - (*seq_id)[i][j] = flattened_seq_ids[current_index]; - current_index++; - } + for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { + (*seq_id)[i][j] = flattened_seq_ids[current_index]; + current_index++; } + } free(flattened_seq_ids); @@ -544,7 +540,6 @@ void ggml_mpi_graph_creation_post( // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph for (int i = 1; i < idx_l1 - idx_l0; i++) { gf->nodes[i] = gf->nodes[idx_l0 + i]; - gf->grads[i] = gf->grads[idx_l0 + i]; } // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node diff --git a/llama.cpp b/llama.cpp index e8fbd1544ef..7b295267851 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5732,8 +5732,8 @@ static int llama_decode_internal( if (gf != nullptr) { return llama_decode_internal_phased(lctx, batch, 1, gf) != nullptr; } else { -// printf("Graph is null\n"); -// return -1; + //printf("Graph is null\n"); + return -1; } } From 67838da973d0c16555464e196fddc26c11853f8f Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Fri, 8 Dec 2023 16:57:24 -0600 Subject: [PATCH 50/63] Fix draft model KV cache synchronization w/ double buffering --- examples/speculative/speculative.cpp | 118 ++++++++++++++++----------- 1 file changed, 69 insertions(+), 49 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index f4329b89c6b..b4c8553f399 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -31,6 +31,7 @@ struct seq_async_run { std::vector drafts; int run_id; int n_past_tgt; + int n_past_dft; int i_dft; int s_keep; int seq_offset; @@ -245,8 +246,10 @@ int main(int argc, char ** argv) { int offset = 1; bool has_asynced = false; int run_n_past_tgt = n_past_tgt; + int run_n_past_dft = n_past_dft; int seq_offset = 1; struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx_tgt, 2*n_seq_dft+1); + struct llama_kv_cache_view kvc_view_dft = llama_kv_cache_view_init(ctx_dft, 2*n_seq_dft+1); std::vector generated = inp; bool run_speculative = false; while (true) { @@ -261,18 +264,13 @@ int main(int argc, char ** argv) { LOG("Finishing async decode, is async = %d, old seq_offset = %d\n", run.run_id == ASYNC_RUN_ID, seq_offset); struct ggml_cgraph * cgraph = run.cgraph; llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); - llama_swap_comm(ctx_tgt); - if (llama_node_id(ctx_tgt) == 0) { - llama_kv_cache_view_update(ctx_tgt, &kvc_view); - //dump_kv_cache_view_seqs(kvc_view, 20); - } - llama_swap_comm(ctx_tgt); tgt_cgraphs.pop_back(); run_id = run.run_id; drafts = run.drafts; run_speculative = run.speculative; // ctx_sampling = run.ctx_sampling; run_n_past_tgt = run.n_past_tgt; + run_n_past_dft = run.n_past_dft; seq_offset = run.seq_offset; // s_keep = run.s_keep; if (run_id == ASYNC_RUN_ID) { @@ -285,7 +283,10 @@ int main(int argc, char ** argv) { } else { // run_n_past_tgt = n_past_tgt; } - + if (llama_node_id(ctx_tgt) == 0) { + llama_kv_cache_view_update(ctx_tgt, &kvc_view); + 
dump_kv_cache_view_seqs(kvc_view, 20); + } // print current draft sequences bool any_active = false; for (int s = 0; s < n_seq_dft; ++s) { @@ -309,8 +310,8 @@ int main(int argc, char ** argv) { if (run_speculative) { LOG("Speculative run, last generated: %d, first draft: %d\n", generated.back(), drafts[s_keep].tokens[0]); - if(generated.back() == drafts[s_keep].tokens[0]) { - //drafts[s_keep].tokens.erase(drafts[s_keep].tokens.begin()); + if(generated.back()-(n_past_tgt-run_n_past_tgt) == drafts[s_keep].tokens[0]) { + //drafts[0].tokens.erase(drafts[0].tokens.begin()); for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { continue; @@ -327,7 +328,7 @@ int main(int argc, char ** argv) { std::vector keeps = seq_ids; while (!keeps.empty()) { - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d, run_n_past_tgt = %3d, n_past_tgt = %3d, seq_offset = %d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft], run_n_past_tgt, n_past_tgt, seq_offset); + LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d, run_n_past_tgt = %3d, n_past_tgt = %3d, seq_offset = %d, keeps[0] = %d\n", s_keep, i_dft, drafts[keeps[0]].i_batch_tgt[i_dft], run_n_past_tgt, n_past_tgt, seq_offset, keeps[0]); // sample from the target model @@ -340,7 +341,7 @@ int main(int argc, char ** argv) { llama_sync_token(ctx_tgt, &id, 0); LOG("Should run async: %d\n", should_run_async); - LOG("Sampling index: %d\n", drafts[s_keep].i_batch_tgt[i_dft]); + LOG("Sampling index: %d\n", drafts[keeps[0]].i_batch_tgt[i_dft]); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); @@ -393,10 +394,9 @@ int main(int argc, char ** argv) { if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); - - s_keep = s; matches = true; keeps.push_back(s); + s_keep = keeps[0]; } else { drafts[s].active = false; } @@ -415,10 +415,6 @@ int main(int argc, char ** argv) { } } - - - - //break; } @@ -434,10 +430,10 @@ int main(int argc, char ** argv) { } // Pipeline syncing cache ops - llama_kv_cache_seq_keep(ctx_dft, s_keep); - llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_dft, 0); - llama_kv_cache_seq_rm (ctx_dft, 0, n_past_dft, -1); +// llama_kv_cache_seq_keep(ctx_dft, s_keep); +// llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); +// llama_kv_cache_seq_keep(ctx_dft, 0); +// llama_kv_cache_seq_rm (ctx_dft, 0, n_past_dft, -1); // TODO: simplify if (run_id != ASYNC_RUN_ID){ @@ -452,14 +448,16 @@ int main(int argc, char ** argv) { } llama_kv_cache_seq_cp (ctx_tgt, s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); + llama_kv_cache_seq_cp (ctx_dft, s_keep+seq_offset, 0, run_n_past_dft, n_past_dft); for (int i = 0; i < n_seq_dft; i++) { llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, -1); - + llama_kv_cache_seq_rm (ctx_dft, i+seq_offset, -1, -1); } for (int i = 0; i < 2*n_seq_dft+1; i++) { llama_kv_cache_seq_cp(ctx_tgt, 0, i, -1, n_past_tgt); + llama_kv_cache_seq_cp(ctx_dft, 0, i, -1, n_past_dft); } // for (int i = 0; i < n_seq_dft; i++) { @@ -529,6 +527,7 @@ int main(int argc, char ** argv) { run.s_keep = s_keep; run.run_id = ASYNC_RUN_ID; run.n_past_tgt = n_past_tgt; + run.n_past_dft = n_past_dft; run.speculative = false; run.n_past_max = n_past_tgt; run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); @@ -550,13 +549,25 @@ int main(int argc, char ** argv) { } } -// if (!should_run_spec) { -// if 
(!should_run_async) { -// n_past_tgt++; -// n_past_dft++; -// } -// continue; -// } + if (!should_run_spec) { + //if (!should_run_async) { + // n_past_tgt++; + // n_past_dft++; + //} + continue; + } + + for (int i = 0; i < n_seq_dft; i++) { + llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, -1); + llama_kv_cache_seq_cp(ctx_tgt, 0, i+seq_offset, -1, n_past_tgt); + llama_kv_cache_seq_rm (ctx_dft, i+seq_offset, n_past_dft, -1); + } + + for (int i = 0; i < 2*n_seq_dft+1; i++) { + + } + + llama_batch_clear(batch_tgt); @@ -571,15 +582,12 @@ int main(int argc, char ** argv) { drafts[0].tokens.push_back(id); //drafts[0].i_batch_tgt.push_back(0); + llama_kv_cache_seq_cp(ctx_dft, 0, seq_offset, -1, n_past_dft); + llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + llama_batch_add (batch_dft, id, n_past_dft, { seq_offset }, true); // batch_dft.n_tokens == 1 now - // Pipeline sync on draft pipeline - - // Remove all tokens from all sequences after n_past_dft - llama_kv_cache_seq_rm(ctx_dft, -1, n_past_dft, -1); - // Kick off drafting pipeline but don't need it just yet @@ -596,6 +604,15 @@ int main(int argc, char ** argv) { } LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + for (int i = 0; i < 2*n_seq_dft+1; i++) { + llama_kv_cache_seq_cp(ctx_dft, seq_offset, i, n_past_dft, n_past_dft+1); + } + + if (llama_node_id(ctx_dft) == 0) { +// llama_kv_cache_view_update(ctx_dft, &kvc_view_dft); +// dump_kv_cache_view_seqs(kvc_view_dft, 20); + } + ++n_past_dft; if (n_predict > params.n_predict || has_eos) { @@ -604,10 +621,12 @@ int main(int argc, char ** argv) { llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); - int n_seq_cur = 1; + int n_seq_cur = 0; + int max_ran_seq = 0; int n_past_cur = n_past_dft; for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].skip = true; drafts[s].active = false; drafts[s].drafting = false; } @@ -616,6 +635,7 @@ int main(int argc, char ** argv) { drafts[0].active = true; drafts[0].drafting = true; + drafts[0].skip = false; drafts[0].i_batch_dft = 0; @@ -623,11 +643,7 @@ int main(int argc, char ** argv) { for (int i = 0; i < n_draft; ++i) { batch_dft.n_tokens = 0; - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].skip = false; - } - - for (int s = 0; s < n_seq_dft; ++s) { + for (int s = 0; s <= max_ran_seq; ++s) { if (!drafts[s].drafting || drafts[s].skip) { continue; } @@ -677,11 +693,12 @@ int main(int argc, char ** argv) { // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { + if (n_seq_cur < n_seq_dft-1 && cur_p[f].p > p_split) { + n_seq_cur++; LOG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + llama_kv_cache_seq_rm(ctx_dft, n_seq_cur+seq_offset, -1, -1); + llama_kv_cache_seq_cp(ctx_dft, s+seq_offset, n_seq_cur+seq_offset, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -694,20 +711,20 @@ int main(int argc, char ** argv) { } } + // copy the draft state drafts[n_seq_cur].active = true; drafts[n_seq_cur].drafting = true; - drafts[n_seq_cur].skip = true; + drafts[n_seq_cur].skip = false; drafts[n_seq_cur].tokens = drafts[s].tokens; - drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling); 
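The surrounding branch decides when a draft sequence forks: any candidate after the most likely one whose draft probability clears `p_split` gets its own sequence, as long as free draft sequences remain. The same rule in isolation, as a hypothetical `pick_splits` helper (it assumes the candidate list is sorted by probability, matching the `else { break; }` in this loop):

```cpp
// Sketch only: choose which runner-up candidates should branch into new
// draft sequences. cur_p mirrors drafts[s].ctx_sampling->cur.
static std::vector<int> pick_splits(const std::vector<llama_token_data> & cur_p,
                                    float p_split, int n_free_seqs) {
    std::vector<int> splits;
    for (int f = 1; f < (int) cur_p.size(); ++f) {
        if ((int) splits.size() >= n_free_seqs || cur_p[f].p <= p_split) {
            break; // sorted candidates: once one fails the threshold, the rest will too
        }
        splits.push_back(f);
    }
    return splits;
}
```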
sa.push_back(n_seq_cur); - n_seq_cur++; + } else { break; } @@ -728,7 +745,7 @@ int main(int argc, char ** argv) { drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - LOG("Adding drafted token %d to tgt\n", id); + LOG("Adding drafted token %d to tgt, sequence %d, position %d, i_batch_tgt %d\n", id, s+seq_offset, n_past_tgt+i, batch_tgt.n_tokens); llama_batch_add(batch_tgt, id, n_past_tgt + i, { s+seq_offset }, true); // add the token to the batch for batched decoding with the draft model @@ -736,7 +753,7 @@ int main(int argc, char ** argv) { LOG("Adding drafted token %d to dft\n", id); - llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + llama_batch_add(batch_dft, id, n_past_cur, { s+seq_offset }, true); if (batch_tgt.n_tokens > n_draft) { drafts[s].drafting = false; @@ -755,6 +772,8 @@ int main(int argc, char ** argv) { ++n_past_cur; ++n_drafted; + max_ran_seq = n_seq_cur; + if (batch_tgt.n_tokens > n_draft) { break; } @@ -818,6 +837,7 @@ int main(int argc, char ** argv) { } run.run_id = 0; run.n_past_tgt = n_past_tgt+1; + run.n_past_dft = n_past_dft; run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); tgt_cgraphs.push_front(run); From 615e6665700b888fdf21b0dc2cea5705f484560d Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 4 Jan 2024 11:38:27 -0600 Subject: [PATCH 51/63] Add cancellation and multiple simultaneous speculative seqs --- examples/speculative/speculative.cpp | 678 +++++++++++++++++---------- ggml-mpi.c | 59 ++- ggml-mpi.h | 16 + llama.cpp | 98 +++- llama.h | 11 + 5 files changed, 584 insertions(+), 278 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b4c8553f399..a0bac66fd34 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -38,10 +38,10 @@ struct seq_async_run { int n_past_max; llama_sampling_context *ctx_sampling; bool speculative; + bool canceled; }; int main(int argc, char ** argv) { - bool should_run_async = true; gpt_params params; if (gpt_params_parse(argc, argv, params) == false) { @@ -85,7 +85,7 @@ int main(int argc, char ** argv) { llama_swap_comm(ctx_tgt); llama_split_comm(ctx_tgt, (llama_node_id(ctx_tgt) < params.mpi_layer_split[0].size()) ? 0 : -1); - printf("Size of first split: %lu, element: %f\n", params.mpi_layer_split[0].size(), params.mpi_layer_split[0][0]); +// printf("Size of first split: %lu, element: %f\n", params.mpi_layer_split[0].size(), params.mpi_layer_split[0][0]); // load the draft model params.model = params.model_draft; @@ -97,12 +97,19 @@ int main(int argc, char ** argv) { llama_split_comm(ctx_dft, (llama_node_id(ctx_dft) >= params.mpi_layer_split[0].size()) ? 
0 : -1); - printf("Size of second split: %lu, element: %f\n", params.mpi_layer_split[1].size(), params.mpi_layer_split[1][0]); +// printf("Size of second split: %lu, element: %f\n", params.mpi_layer_split[1].size(), params.mpi_layer_split[1][0]); llama_split_layers_weighted(ctx_tgt, params.mpi_layer_split[0].data(), params.mpi_layer_split[0].size()); llama_split_layers_weighted(ctx_dft, params.mpi_layer_split[1].data(), params.mpi_layer_split[1].size()); + std::deque free_sequence_offsets; + const int n_simul_seqs = 20; + const int max_seq = n_simul_seqs * n_seq_dft + 1; + for (int i = 0; i < n_simul_seqs; i++) { + free_sequence_offsets.push_back(i*n_seq_dft + 1); + } + { LOG_TEE("\n"); LOG_TEE("%s\n", get_system_info(params).c_str()); @@ -176,12 +183,18 @@ int main(int argc, char ** argv) { const auto t_enc_start = ggml_time_us(); - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); - llama_batch batch_tgt_async = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); + int32_t batch_id = 0; + + llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, max_seq); + llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, max_seq); + llama_batch batch_tgt_async = llama_batch_init(params.n_ctx, 0, max_seq); + + batch_dft.batch_id = batch_id; + batch_tgt.batch_id = batch_id; + batch_tgt_async.batch_id = batch_id; std::vector seq_ids; - for (int i = 0; i < 2*n_seq_dft+1; i++) { + for (int i = 0; i < max_seq; i++) { seq_ids.emplace_back(i); } @@ -244,12 +257,12 @@ int main(int argc, char ** argv) { int run_id = 0; int offset = 1; - bool has_asynced = false; int run_n_past_tgt = n_past_tgt; int run_n_past_dft = n_past_dft; - int seq_offset = 1; - struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx_tgt, 2*n_seq_dft+1); - struct llama_kv_cache_view kvc_view_dft = llama_kv_cache_view_init(ctx_dft, 2*n_seq_dft+1); + int seq_offset = free_sequence_offsets.front(); + free_sequence_offsets.pop_front(); + struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx_tgt, max_seq); + struct llama_kv_cache_view kvc_view_dft = llama_kv_cache_view_init(ctx_dft, max_seq); std::vector generated = inp; bool run_speculative = false; while (true) { @@ -258,34 +271,68 @@ int main(int argc, char ** argv) { int i_dft = 0; int s_keep = 0; + for (auto run : tgt_cgraphs) { + if(!run.canceled && (run.n_past_max < n_past_tgt || (run.speculative && generated.back()-(n_past_tgt-run.n_past_tgt) != run.drafts[s_keep].tokens[0]))) { +// LOG("Cancelling run with ID %d, batch ID %d, run_npast_max %d, n_past_tgt %d, run_speculative %d, tokens[0] %d\n", run.run_id, run.batch.batch_id, run.n_past_max, n_past_tgt, run_speculative, drafts[s_keep].tokens[0]); +// llama_cancel_run(ctx_tgt, &run.batch.batch_id, 1); + run.canceled = true; +//// } +// +// if (run_speculative) { +// free_sequence_offsets.push_back(seq_offset); +// } + } + } + if (!tgt_cgraphs.empty()) { - has_asynced = true; struct seq_async_run run = tgt_cgraphs.back(); - LOG("Finishing async decode, is async = %d, old seq_offset = %d\n", run.run_id == ASYNC_RUN_ID, seq_offset); + LOG("Finishing async decode, is async = %d, old seq_offset = %d, new seq offset = %d, batch id = %d\n", run.run_id == ASYNC_RUN_ID, seq_offset, run.seq_offset, run.batch.batch_id); struct ggml_cgraph * cgraph = run.cgraph; - llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); - tgt_cgraphs.pop_back(); + run_id = run.run_id; drafts = run.drafts; run_speculative = 
run.speculative; // ctx_sampling = run.ctx_sampling; run_n_past_tgt = run.n_past_tgt; run_n_past_dft = run.n_past_dft; +// n_past_dft = run.n_past_dft; seq_offset = run.seq_offset; -// s_keep = run.s_keep; - if (run_id == ASYNC_RUN_ID) { -// llama_kv_cache_seq_cp (ctx_tgt, run_id, 0, -1, n_past_tgt); + + +// LOG("Speculative run, last generated: %d, first draft: %d\n", generated.back(), drafts[s_keep].tokens[0]); + if(run.n_past_max >= n_past_tgt && (!run_speculative || generated.back()-(n_past_tgt-run_n_past_tgt) == drafts[s_keep].tokens[0])) { + //drafts[0].tokens.erase(drafts[0].tokens.begin()); + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } + + drafts[s].tokens.erase(drafts[s].tokens.begin()); + } } else { -// offset++; -// i_dft = 0; +// if (run_id != ASYNC_RUN_ID) { + LOG("Cancelling run with ID %d, batch ID %d, run_npast_max %d, n_past_tgt %d, run_speculative %d, tokens[0] %d\n", run.run_id, run.batch.batch_id, run.n_past_max, n_past_tgt, run_speculative, drafts[s_keep].tokens[0]); + llama_cancel_run(ctx_tgt, &run.batch.batch_id, 1); +// } + llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); + tgt_cgraphs.pop_back(); + if (run_speculative) { + free_sequence_offsets.push_back(seq_offset); + } +// fprintf(stderr, "Incorrect starting token\n"); + continue; } - } else { -// run_n_past_tgt = n_past_tgt; + + + + llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); + tgt_cgraphs.pop_back(); + } if (llama_node_id(ctx_tgt) == 0) { - llama_kv_cache_view_update(ctx_tgt, &kvc_view); - dump_kv_cache_view_seqs(kvc_view, 20); +// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +// dump_kv_cache_view_seqs(kvc_view, 20); } // print current draft sequences bool any_active = false; @@ -308,23 +355,7 @@ int main(int argc, char ** argv) { - if (run_speculative) { - LOG("Speculative run, last generated: %d, first draft: %d\n", generated.back(), drafts[s_keep].tokens[0]); - if(generated.back()-(n_past_tgt-run_n_past_tgt) == drafts[s_keep].tokens[0]) { - //drafts[0].tokens.erase(drafts[0].tokens.begin()); - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - - drafts[s].tokens.erase(drafts[s].tokens.begin()); - } - - } else { - continue; - } - } std::vector keeps = seq_ids; while (!keeps.empty()) { @@ -340,7 +371,6 @@ int main(int argc, char ** argv) { llama_sync_token(ctx_tgt, &id, 0); - LOG("Should run async: %d\n", should_run_async); LOG("Sampling index: %d\n", drafts[keeps[0]].i_batch_tgt[i_dft]); @@ -384,7 +414,7 @@ int main(int argc, char ** argv) { // check if the target token matches any of the drafts - { // Only running this when should_run_async starts out okay but still goes off the rails eventually + { bool matches = false; keeps.clear(); for (int s = 0; s < n_seq_dft; ++s) { @@ -425,7 +455,10 @@ int main(int argc, char ** argv) { } if (!any_match) { - should_run_async = !should_run_async; + if (run_id != ASYNC_RUN_ID) { + free_sequence_offsets.push_back(seq_offset); + } +// fprintf(stderr, "No match\n"); continue; } @@ -437,7 +470,7 @@ int main(int argc, char ** argv) { // TODO: simplify if (run_id != ASYNC_RUN_ID){ - LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep+seq_offset, n_past_tgt, n_past_dft); @@ -447,15 +480,26 @@ int main(int argc, char ** argv) { } +// LOG("Copying tgt sequence %d to %d from positions %d to %d\n", s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); +// 
llama_kv_cache_seq_cp_back (ctx_tgt, s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); llama_kv_cache_seq_cp (ctx_tgt, s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); +// LOG("Copying dft sequence %d to %d from positions %d to %d\n", s_keep+seq_offset, 0, run_n_past_dft, n_past_dft); + llama_kv_cache_seq_cp (ctx_dft, s_keep+seq_offset, 0, run_n_past_dft, n_past_dft); for (int i = 0; i < n_seq_dft; i++) { +// LOG("Removing tgt sequence %d from positions %d to %d\n", i+seq_offset, -1, -1); + llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, -1); +// LOG("Removing dft sequence %d from positions %d to %d\n", i+seq_offset, -1, -1); + llama_kv_cache_seq_rm (ctx_dft, i+seq_offset, -1, -1); } - for (int i = 0; i < 2*n_seq_dft+1; i++) { + for (int i = 0; i < max_seq; i++) { +// LOG("Copying tgt sequence %d to %d from positions %d to %d\n", 0, i, -1, n_past_tgt); +// LOG("Copying dft sequence %d to %d from positions %d to %d\n", 0, i, -1, n_past_dft); + llama_kv_cache_seq_cp(ctx_tgt, 0, i, -1, n_past_tgt); llama_kv_cache_seq_cp(ctx_dft, 0, i, -1, n_past_dft); } @@ -485,7 +529,7 @@ int main(int argc, char ** argv) { { - LOG("Beginning async decode\n"); + LOG("Beginning async decode, batch id = %d\n", batch_id + 1); llama_batch_clear(batch_tgt_async); llama_batch_add(batch_tgt_async, id, n_past_tgt, {0}, true); @@ -493,13 +537,20 @@ int main(int argc, char ** argv) { ++n_past_tgt; struct seq_async_run run; - - if (seq_offset == 1) { - run.seq_offset = n_seq_dft + 1; - } else { - run.seq_offset = 1; - } - run.batch = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); + run.canceled = false; +// if (!free_sequence_offsets.empty()) { +// run.seq_offset = free_sequence_offsets.front(); +// printf("Popping %d from seq offsets\n", run.seq_offset); +// free_sequence_offsets.pop_front(); +// } else if(!tgt_cgraphs.empty()){ +// printf("Getting offset from head of tgt cgraphs\n"); +// run.seq_offset = tgt_cgraphs.front().seq_offset; +// } else { +// printf("NO FREE OFFSETS AND NO TGT CGRAPHS\n"); +// } + run.batch = llama_batch_init(params.n_ctx, 0, max_seq); + batch_id++; + run.batch.batch_id = batch_id; run.batch.n_tokens = batch_tgt_async.n_tokens; for (int i = 0; i < batch_tgt_async.n_tokens; i++) { run.batch.n_seq_id[i] = batch_tgt_async.n_seq_id[i]; @@ -527,322 +578,433 @@ int main(int argc, char ** argv) { run.s_keep = s_keep; run.run_id = ASYNC_RUN_ID; run.n_past_tgt = n_past_tgt; + run.n_past_max = n_past_tgt + 1; run.n_past_dft = n_past_dft; run.speculative = false; - run.n_past_max = n_past_tgt; run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); tgt_cgraphs.push_front(run); //llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_past_tgt+1); - for (int i = 0; i < 2*n_seq_dft+1; i++) { + for (int i = 0; i < max_seq; i++) { +// LOG("Copying tgt sequence %d to %d from positions %d to %d\n", 0, i, n_past_tgt-1, n_past_tgt); + llama_kv_cache_seq_cp(ctx_tgt, 0, i, n_past_tgt-1, n_past_tgt); } } - should_run_async = !should_run_async; - - bool should_run_spec = true; - for (const auto& r : tgt_cgraphs) { - if (r.seq_offset == seq_offset && r.run_id != ASYNC_RUN_ID) { - should_run_spec = false; - break; + if (run_id == ASYNC_RUN_ID) { + if (free_sequence_offsets.empty()) { + continue; } + seq_offset = free_sequence_offsets.front(); +// printf("Popping %d from seq offsets for spec run\n", seq_offset); + free_sequence_offsets.pop_front(); } - if (!should_run_spec) { - //if (!should_run_async) { - // n_past_tgt++; - // n_past_dft++; - //} - continue; - } + int spec_past_tgt = n_past_tgt; + int 
spec_past_dft = n_past_dft; - for (int i = 0; i < n_seq_dft; i++) { - llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, -1); - llama_kv_cache_seq_cp(ctx_tgt, 0, i+seq_offset, -1, n_past_tgt); - llama_kv_cache_seq_rm (ctx_dft, i+seq_offset, n_past_dft, -1); - } + int first_run = true; - for (int i = 0; i < 2*n_seq_dft+1; i++) { + bool is_waiting = false; +// llama_swap_comm(ctx_tgt); +// llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); +// llama_swap_comm(ctx_tgt); + +// llama_batch_clear(batch_dft); +// llama_batch_add(batch_dft, id, spec_past_dft, {0}, true); +// // batch_dft.n_tokens == 1 now +// +// +// +// // Kick off drafting pipeline but don't need it just yet +// LOG("Beginning async draft with sequence 0\n"); +// dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); +// // DON'T FORGET THE MATCHING DECODE WHEN NEEDED +// +// // We need the draft now, so wait for it +// if (!dft_cgraphs.empty()) { +//// LOG("Finishing async decode of draft\n"); +// llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); +// dft_cgraphs.pop_back(); +// } +// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + + + if (is_waiting) { + free_sequence_offsets.push_back(seq_offset); } + int iter = 0; + while(!is_waiting) { - llama_batch_clear(batch_tgt); - //llama_batch_add (batch_tgt, id, n_past_tgt, { seq_offset }, true); - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].active = false; - drafts[s].tokens.clear(); - drafts[s].i_batch_tgt.clear(); - } - // note: will be erased after the speculation phase - drafts[0].tokens.push_back(id); - //drafts[0].i_batch_tgt.push_back(0); + int orig_offset = seq_offset; + bool should_run_spec = true; + std::deque checked_offsets; + do { + should_run_spec = true; + for (const auto &r: tgt_cgraphs) { + if (r.seq_offset == seq_offset && r.run_id != ASYNC_RUN_ID) { + checked_offsets.push_back(seq_offset); - llama_kv_cache_seq_cp(ctx_dft, 0, seq_offset, -1, n_past_dft); + should_run_spec = false; + if (!free_sequence_offsets.empty()) { + seq_offset = free_sequence_offsets.front(); + free_sequence_offsets.pop_front(); - llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, id, n_past_dft, { seq_offset }, true); - // batch_dft.n_tokens == 1 now + } + break; + } + } + } while (!should_run_spec && !free_sequence_offsets.empty()); + free_sequence_offsets.insert(free_sequence_offsets.end(), checked_offsets.begin(), + checked_offsets.end()); + if (!should_run_spec) { - // Kick off drafting pipeline but don't need it just yet - LOG("Beginning async draft\n"); - dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); - //llama_decode(ctx_dft, batch_dft); - // DON'T FORGET THE MATCHING DECODE WHEN NEEDED + break; + } - // We need the draft now, so wait for it - if (!dft_cgraphs.empty()) { - LOG("Finishing async decode of draft\n"); - llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); - dft_cgraphs.pop_back(); - } - LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); +// LOG("Doing speculative run, seq_offset = %d, spec_past_tgt = %d, spec_past_dft = %d, n_past_tgt = %d, n_past_dft = %d", seq_offset, spec_past_tgt, spec_past_dft, n_past_tgt, n_past_dft); - for (int i = 0; i < 2*n_seq_dft+1; i++) { - llama_kv_cache_seq_cp(ctx_dft, seq_offset, i, n_past_dft, n_past_dft+1); - } + for (int i = 0; i < n_seq_dft; i++) { +// LOG("Removing tgt sequence %d from positions %d to %d\n", i + seq_offset, -1, -1); + + + llama_kv_cache_seq_rm(ctx_tgt, i + seq_offset, 
-1, -1); + +// LOG("Copying tgt sequence %d to %d from positions %d to %d\n", (first_run) ? 0 : orig_offset, i + seq_offset, -1, spec_past_tgt); + + llama_kv_cache_seq_cp(ctx_tgt, (first_run) ? 0 : orig_offset, i + seq_offset, -1, spec_past_tgt); + +// LOG("Removing dft sequence %d from positions %d to %d\n", i + seq_offset, spec_past_dft, -1); + + llama_kv_cache_seq_rm(ctx_dft, i + seq_offset, spec_past_dft, -1); + } + + + llama_batch_clear(batch_tgt); + + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].active = false; + drafts[s].tokens.clear(); + drafts[s].i_batch_tgt.clear(); + } + // note: will be erased after the speculation phase + drafts[0].tokens.push_back(id); + + +// LOG("Copying dft sequence %d to %d from positions %d to %d\n", (first_run) ? 0 : orig_offset, seq_offset, -1, spec_past_dft); + + llama_kv_cache_seq_cp(ctx_dft, (first_run) ? 0 : orig_offset, seq_offset, -1, spec_past_dft); + + llama_batch_clear(batch_dft); + llama_batch_add(batch_dft, id, spec_past_dft, {seq_offset}, true); + // batch_dft.n_tokens == 1 now - if (llama_node_id(ctx_dft) == 0) { + + + // Kick off drafting pipeline but don't need it just yet + LOG("Beginning async draft\n"); + dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); + // DON'T FORGET THE MATCHING DECODE WHEN NEEDED + + // We need the draft now, so wait for it + if (!dft_cgraphs.empty()) { + LOG("Finishing async decode of draft\n"); + llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); + dft_cgraphs.pop_back(); + } + LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + + if (first_run) { + for (int i = 0; i < max_seq; i++) { +// LOG("Copying dft sequence %d to %d from positions %d to %d\n", seq_offset, i, spec_past_dft, spec_past_dft + 1); + + llama_kv_cache_seq_cp(ctx_dft, seq_offset, i, spec_past_dft, spec_past_dft + 1); + } + ++n_past_dft; + } else { + for (int i = 0; i < n_seq_dft; i++) { +// LOG("Copying dft sequence %d to %d from positions %d to %d\n", seq_offset, i+seq_offset, spec_past_dft, spec_past_dft + 1); + + llama_kv_cache_seq_cp(ctx_dft, seq_offset, i+seq_offset, spec_past_dft, spec_past_dft + 1); + } + } + + if (llama_node_id(ctx_dft) == 0) { // llama_kv_cache_view_update(ctx_dft, &kvc_view_dft); // dump_kv_cache_view_seqs(kvc_view_dft, 20); - } + } - ++n_past_dft; - if (n_predict > params.n_predict || has_eos) { - break; - } - llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); + if (n_predict > params.n_predict || has_eos) { + break; + } - int n_seq_cur = 0; - int max_ran_seq = 0; - int n_past_cur = n_past_dft; + llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].skip = true; - drafts[s].active = false; - drafts[s].drafting = false; - } + int n_seq_cur = 0; + int max_ran_seq = 0; + int n_past_cur = spec_past_dft+1; + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].skip = true; + drafts[s].active = false; + drafts[s].drafting = false; + } - drafts[0].active = true; - drafts[0].drafting = true; - drafts[0].skip = false; + drafts[0].active = true; + drafts[0].drafting = true; + drafts[0].skip = false; - drafts[0].i_batch_dft = 0; + drafts[0].i_batch_dft = 0; - // sample n_draft tokens from the draft model using tree-based sampling - for (int i = 0; i < n_draft; ++i) { - batch_dft.n_tokens = 0; - for (int s = 0; s <= max_ran_seq; ++s) { - if (!drafts[s].drafting || drafts[s].skip) { - continue; - } + // sample n_draft tokens from the draft model using tree-based sampling + for (int i = 0; i < n_draft; 
++i) { + batch_dft.n_tokens = 0; + for (int s = 0; s <= max_ran_seq; ++s) { + if (!drafts[s].drafting || drafts[s].skip) { + continue; + } - // Swap back to pipeline roots - llama_swap_comm(ctx_dft); - LOG("Swapped comm to pipeline roots, id %d\n", llama_node_id(ctx_dft)); - llama_sync_token(ctx_dft, &(drafts[s].i_batch_dft), 1); + // Swap back to pipeline roots + llama_swap_comm(ctx_dft); + LOG("Swapped comm to pipeline roots, id %d\n", llama_node_id(ctx_dft)); - llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + llama_sync_token(ctx_dft, &(drafts[s].i_batch_dft), 1); - auto & cur_p = drafts[s].ctx_sampling->cur; + llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); - llama_sync_token_data(ctx_dft, cur_p.data(), 1); - // TODO investigate potential bottleneck - for (int k = 1; k < 8; ++k) { - llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); - } + auto &cur_p = drafts[s].ctx_sampling->cur; - // Back to draft pipeline only - llama_swap_comm(ctx_dft); - LOG("Swapped comm to draft only, id %d\n", llama_node_id(ctx_dft)); + llama_sync_token_data(ctx_dft, cur_p.data(), 1); + // TODO investigate potential bottleneck + for (int k = 1; k < 8; ++k) { + llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); + } + + // Back to draft pipeline only + llama_swap_comm(ctx_dft); + LOG("Swapped comm to draft only, id %d\n", llama_node_id(ctx_dft)); - if (llama_node_id(ctx_dft) >= 0) { - for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + if (llama_node_id(ctx_dft) >= 0) { + for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { + LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", + k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + } } - } + if (cur_p[0].p < p_accept) { + LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, + p_accept); + drafts[s].drafting = false; + continue; + } + + std::vector sa(1, s); - if (cur_p[0].p < p_accept) { - LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); - drafts[s].drafting = false; - continue; - } + // attempt to split the branch if the probability is high enough + for (int f = 1; f < 8; ++f) { + if (n_seq_cur < n_seq_dft - 1 && cur_p[f].p > p_split) { + n_seq_cur++; + LOG("splitting seq %3d into %3d\n", s, n_seq_cur); +// LOG("Removing dft sequence %d from positions %d to %d\n", n_seq_cur + seq_offset, -1, -1); - std::vector sa(1, s); + llama_kv_cache_seq_rm(ctx_dft, n_seq_cur + seq_offset, -1, -1); - // attempt to split the branch if the probability is high enough - for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft-1 && cur_p[f].p > p_split) { - n_seq_cur++; - LOG("splitting seq %3d into %3d\n", s, n_seq_cur); +// LOG("Copying dft sequence %d to %d from positions %d to %d\n", s + seq_offset, n_seq_cur + seq_offset, -1, -1); - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur+seq_offset, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s+seq_offset, n_seq_cur+seq_offset, -1, -1); + llama_kv_cache_seq_cp(ctx_dft, s + seq_offset, n_seq_cur + seq_offset, -1, -1); - // all previous tokens from this branch are now also part of the new branch - for (int t = 0; t < batch_tgt.n_tokens; ++t) { - for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) { - if (batch_tgt.seq_id[t][p] == s+seq_offset) 
{ - batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur+seq_offset; - batch_tgt.n_seq_id[t]++; - break; + // all previous tokens from this branch are now also part of the new branch + for (int t = 0; t < batch_tgt.n_tokens; ++t) { + for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) { + if (batch_tgt.seq_id[t][p] == s + seq_offset) { + batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur + seq_offset; + batch_tgt.n_seq_id[t]++; + break; + } } } - } - // copy the draft state - drafts[n_seq_cur].active = true; - drafts[n_seq_cur].drafting = true; - drafts[n_seq_cur].skip = false; + // copy the draft state + drafts[n_seq_cur].active = true; + drafts[n_seq_cur].drafting = true; + drafts[n_seq_cur].skip = false; - drafts[n_seq_cur].tokens = drafts[s].tokens; - drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; + drafts[n_seq_cur].tokens = drafts[s].tokens; + drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; - llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling); + llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling); - sa.push_back(n_seq_cur); + sa.push_back(n_seq_cur); - } else { - break; + } else { + break; + } } - } - // add drafted token for each sequence - // TODO commenting this out fixes async - for (int is = 0; is < (int) sa.size(); ++is) { - const llama_token id = cur_p[is].id; + // add drafted token for each sequence + // TODO commenting this out fixes async + for (int is = 0; is < (int) sa.size(); ++is) { + const llama_token id = cur_p[is].id; - const int s = sa[is]; + const int s = sa[is]; - llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); + llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); - drafts[s].tokens.push_back(id); + drafts[s].tokens.push_back(id); - // add unique drafted tokens to the target batch + // add unique drafted tokens to the target batch - drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); + drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - LOG("Adding drafted token %d to tgt, sequence %d, position %d, i_batch_tgt %d\n", id, s+seq_offset, n_past_tgt+i, batch_tgt.n_tokens); - llama_batch_add(batch_tgt, id, n_past_tgt + i, { s+seq_offset }, true); + LOG("Adding drafted token %d to tgt, sequence %d, position %d, i_batch_tgt %d\n", id, + s + seq_offset, spec_past_tgt + i, batch_tgt.n_tokens); + llama_batch_add(batch_tgt, id, spec_past_tgt + i, {s + seq_offset}, true); - // add the token to the batch for batched decoding with the draft model - drafts[s].i_batch_dft = batch_dft.n_tokens; + // add the token to the batch for batched decoding with the draft model + drafts[s].i_batch_dft = batch_dft.n_tokens; - LOG("Adding drafted token %d to dft\n", id); + LOG("Adding drafted token %d to dft\n", id); - llama_batch_add(batch_dft, id, n_past_cur, { s+seq_offset }, true); + llama_batch_add(batch_dft, id, n_past_cur, {s + seq_offset}, true); - if (batch_tgt.n_tokens > n_draft) { - drafts[s].drafting = false; + if (batch_tgt.n_tokens > n_draft) { + drafts[s].drafting = false; + } } } - } - // no sequence is drafting anymore - if (batch_dft.n_tokens == 0) { - break; - } + // no sequence is drafting anymore + if (batch_dft.n_tokens == 0) { + break; + } - // evaluate the drafted tokens on the draft model - LOG("Running synchronous draft decode\n"); - llama_decode(ctx_dft, batch_dft); - ++n_past_cur; - ++n_drafted; + // evaluate the drafted tokens on the draft model + LOG("Running synchronous draft decode\n"); + llama_decode(ctx_dft, batch_dft); + ++n_past_cur; + ++n_drafted; - max_ran_seq 
= n_seq_cur; + max_ran_seq = n_seq_cur; - if (batch_tgt.n_tokens > n_draft) { - break; + if (batch_tgt.n_tokens > n_draft) { + break; + } } - } - // evaluate the target model on the drafted tokens - { + // evaluate the target model on the drafted tokens + { // llama_kv_cache_seq_keep(ctx_tgt, 0); // Needed to get to "Here's the code:" - if (batch_tgt.n_tokens == 0) { - continue; - } - - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; + if (batch_tgt.n_tokens == 0) { + free_sequence_offsets.push_back(seq_offset); + break; } - drafts[s].tokens.erase(drafts[s].tokens.begin()); - //drafts[s].tokens.erase(drafts[s].tokens.begin()); - } + size_t max_draft_tokens = 0; - LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); - struct seq_async_run run; - run.speculative = true; - run.seq_offset = seq_offset; - run.ctx_sampling = llama_sampling_init(params.sparams); - llama_sampling_cp(ctx_sampling, run.ctx_sampling); - run.drafts = std::vector(n_seq_dft); - for (int s = 0; s < n_seq_dft; ++s) { - run.drafts[s].ctx_sampling = llama_sampling_init(params.sparams); - llama_sampling_cp(drafts[s].ctx_sampling, run.drafts[s].ctx_sampling); - run.drafts[s].i_batch_tgt = drafts[s].i_batch_tgt; - run.drafts[s].tokens = drafts[s].tokens; - run.drafts[s].active = drafts[s].active; - run.drafts[s].drafting = drafts[s].drafting; - run.drafts[s].skip = drafts[s].skip; - run.drafts[s].i_batch_dft = drafts[s].i_batch_dft; - } - run.i_dft = offset; - run.s_keep = s_keep; - run.batch = llama_batch_init(params.n_ctx, 0, 2*n_seq_dft + 1); - run.batch.n_tokens = batch_tgt.n_tokens; - for (int i = 0; i < batch_tgt.n_tokens; i++) { - run.batch.n_seq_id[i] = batch_tgt.n_seq_id[i]; - int cur_n_seqs = 0; - for (int j = 0; j < run.batch.n_seq_id[i]; j++) { - run.batch.seq_id[i][j] = batch_tgt.seq_id[i][j]; + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } + + drafts[s].tokens.erase(drafts[s].tokens.begin()); + max_draft_tokens = std::max(max_draft_tokens, drafts[s].tokens.size()); + //drafts[s].tokens.erase(drafts[s].tokens.begin()); + } + + LOG("target batch: %s\n, batch_id = %d\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str(), batch_id + 1); + struct seq_async_run run; + run.canceled = false; + run.speculative = true; + run.seq_offset = seq_offset; + run.ctx_sampling = llama_sampling_init(params.sparams); + llama_sampling_cp(ctx_sampling, run.ctx_sampling); + run.drafts = std::vector(n_seq_dft); + for (int s = 0; s < n_seq_dft; ++s) { + run.drafts[s].ctx_sampling = llama_sampling_init(params.sparams); + llama_sampling_cp(drafts[s].ctx_sampling, run.drafts[s].ctx_sampling); + run.drafts[s].i_batch_tgt = drafts[s].i_batch_tgt; + run.drafts[s].tokens = drafts[s].tokens; + run.drafts[s].active = drafts[s].active; + run.drafts[s].drafting = drafts[s].drafting; + run.drafts[s].skip = drafts[s].skip; + run.drafts[s].i_batch_dft = drafts[s].i_batch_dft; + } + run.i_dft = offset; + run.s_keep = s_keep; + run.batch = llama_batch_init(params.n_ctx, 0, max_seq); + batch_id++; + run.batch.batch_id = batch_id; + run.batch.n_tokens = batch_tgt.n_tokens; + for (int i = 0; i < batch_tgt.n_tokens; i++) { + run.batch.n_seq_id[i] = batch_tgt.n_seq_id[i]; + int cur_n_seqs = 0; + for (int j = 0; j < run.batch.n_seq_id[i]; j++) { + run.batch.seq_id[i][j] = batch_tgt.seq_id[i][j]; + } + run.batch.token[i] = batch_tgt.token[i]; + run.batch.pos[i] = batch_tgt.pos[i]; + run.batch.logits[i] = batch_tgt.logits[i]; } - run.batch.token[i] = batch_tgt.token[i]; - 
run.batch.pos[i] = batch_tgt.pos[i]; - run.batch.logits[i] = batch_tgt.logits[i]; + run.run_id = 0; + run.n_past_tgt = spec_past_tgt + 1; + run.n_past_dft = n_past_dft; + run.n_past_max = spec_past_tgt + 1 + max_draft_tokens; + run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); + tgt_cgraphs.push_front(run); + + spec_past_tgt += run.drafts[0].tokens.size(); + spec_past_dft += run.drafts[0].tokens.size(); + id = run.drafts[0].tokens.back(); + first_run = false; + } - run.run_id = 0; - run.n_past_tgt = n_past_tgt+1; - run.n_past_dft = n_past_dft; - run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); - tgt_cgraphs.push_front(run); + + is_waiting = llama_mpi_iprobe(ctx_tgt); + llama_swap_comm(ctx_tgt); + llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); + llama_swap_comm(ctx_tgt); + + iter++; +// break; } + if (n_predict > params.n_predict || has_eos) { + break; + } + + } diff --git a/ggml-mpi.c b/ggml-mpi.c index 90be4831cd4..c02a55f3495 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -43,7 +43,7 @@ void ggml_mpi_sync_pipelined( void ggml_mpi_backend_init(void) { int ret; - MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &ret); + MPI_Init_thread(NULL, NULL, MPI_THREAD_FUNNELED, &ret); } void ggml_mpi_backend_free(void) { @@ -119,10 +119,26 @@ void ggml_mpi_probe(struct ggml_mpi_context * ctx_mpi, int src, int tag) { MPI_Probe((src >= 0) ? src : MPI_ANY_SOURCE, (tag >= 0) ? tag : MPI_ANY_TAG, ctx_mpi->comm, &(ctx_mpi->status)); } +int ggml_mpi_iprobe(struct ggml_mpi_context * ctx_mpi, int src, int tag) { + if(ctx_mpi->comm == MPI_COMM_NULL) { + return 0; + } + + int ret; + MPI_Iprobe((src >= 0) ? src : MPI_ANY_SOURCE, (tag >= 0) ? tag : MPI_ANY_TAG, ctx_mpi->comm, &ret, &(ctx_mpi->status)); + return ret; +} + int ggml_mpi_status_tag(struct ggml_mpi_context * ctx_mpi) { return ctx_mpi->status.MPI_TAG; } +int ggml_mpi_status_count_int32(struct ggml_mpi_context * ctx_mpi) { + int32_t count; + MPI_Get_count(&ctx_mpi->status, MPI_INT32_T, &count); + return count; +} + int ggml_mpi_next_node(struct ggml_mpi_context * ctx_mpi) { return (ctx_mpi->rank + 1) % ctx_mpi->size; } @@ -171,6 +187,30 @@ void ggml_mpi_sync_pipelined( } } +void ggml_mpi_sync_pipelined_back( + struct ggml_mpi_context * ctx_mpi, + void * val, + int count, + MPI_Datatype datatype, + int tag +) { + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } + + //printf("Rank %d sync pipelined\n", ctx_mpi->rank); + + + if (ctx_mpi->rank != 0) { + MPI_Recv(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm, MPI_STATUS_IGNORE); + } + if(ctx_mpi->rank != 1) { + const int retval = MPI_Bsend(val, count, datatype, ggml_mpi_prev_node(ctx_mpi), tag, ctx_mpi->comm); + GGML_ASSERT(retval == MPI_SUCCESS); + + } +} + bool ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int32_t * n_tokens, @@ -179,6 +219,7 @@ bool ggml_mpi_eval_init( int32_t ** n_seq_ids, int32_t *** seq_id, int8_t ** logits, + int32_t * batch_id, bool receive_only) { if(ctx_mpi->comm == MPI_COMM_NULL) { return false; @@ -186,6 +227,9 @@ bool ggml_mpi_eval_init( int32_t old_n_tokens = *n_tokens; + ggml_mpi_sync_pipelined(ctx_mpi, batch_id, 1, MPI_INT, 0); + + ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, 0); @@ -252,6 +296,15 @@ void ggml_mpi_sync_ints_pipelined( ggml_mpi_sync_pipelined(ctx_mpi, vals, count, MPI_INT32_T, tag); } +void ggml_mpi_sync_ints_pipelined_back( + struct ggml_mpi_context * ctx_mpi, + int32_t * vals, + int count, + int tag +) { + ggml_mpi_sync_pipelined_back(ctx_mpi, vals, count, MPI_INT32_T, tag); +} + 
void ggml_mpi_synch_int( struct ggml_mpi_context * ctx_mpi, int32_t * val, @@ -425,7 +478,9 @@ uint16_t** ggml_mpi_split_range( } ranges[ctx_mpi->size-1][0] = next_layer; - ranges[ctx_mpi->size-1][1] = MIN(end, next_layer + (node_weights[ctx_mpi->size-1] * range_length) + start); +// ranges[ctx_mpi->size-1][1] = MIN(end, next_layer + (node_weights[ctx_mpi->size-1] * range_length) + start); + ranges[ctx_mpi->size-1][1] = end; + return ranges; } diff --git a/ggml-mpi.h b/ggml-mpi.h index ec3ee2c90cc..2b6069223e9 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -29,6 +29,10 @@ extern "C" { #define GGML_MPI_SYNC_LOGITS 8 +#define GGML_MPI_CANCEL_RUN 9 + +#define GGML_MPI_KV_SEQ_CP_BACK 10 + /** * The context used for MPI operations, * a program may make use of more than one @@ -53,6 +57,8 @@ void ggml_mpi_backend_init(void); bool ggml_mpi_is_decoding(struct ggml_mpi_context * ctx_mpi); +int ggml_mpi_status_count_int32(struct ggml_mpi_context * ctx_mpi); + void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers); void ggml_mpi_wait_recv(struct ggml_mpi_context * ctx_mpi); @@ -112,10 +118,19 @@ void ggml_mpi_sync_ints_pipelined( int count, int tag ); + +void ggml_mpi_sync_ints_pipelined_back( + struct ggml_mpi_context * ctx_mpi, + int32_t * vals, + int count, + int tag +); // clear = 1, rm = 2, cp = 3, keep = 4, seq_shift = 5 void ggml_mpi_probe(struct ggml_mpi_context * ctx_mpi, int src, int tag); int ggml_mpi_status_tag(struct ggml_mpi_context * ctx_mpi); +int ggml_mpi_iprobe(struct ggml_mpi_context * ctx_mpi, int src, int tag); + /** * Frees the given context, including the communicator. No MPI * operations besides ggml_mpi_backend_freee(void) should be executed after @@ -165,6 +180,7 @@ bool ggml_mpi_eval_init( int32_t ** n_seq_ids, int32_t *** seq_id, int8_t ** logits, + int32_t * batch_id, bool receive_only); void ggml_mpi_synch_int( diff --git a/llama.cpp b/llama.cpp index 7b295267851..666e8d79c05 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1494,6 +1494,7 @@ struct llama_context { #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; ggml_mpi_context * ctx_mpi_orig = NULL; + std::unordered_map canceled_batches; #endif }; @@ -5420,7 +5421,9 @@ static struct ggml_cgraph * llama_build_graph( return result; } - +bool llama_mpi_iprobe(struct llama_context * lctx) { + return ggml_mpi_iprobe(lctx->ctx_mpi, ggml_mpi_prev_node(lctx->ctx_mpi), GGML_MPI_SYNC_LOGITS); +} static struct ggml_cgraph * llama_decode_internal_phased( llama_context & lctx, @@ -5498,7 +5501,7 @@ static struct ggml_cgraph * llama_decode_internal_phased( #ifdef GGML_USE_MPI // TODO: needs fix after #3228 if (!ggml_mpi_eval_init(lctx.ctx_mpi, &(batch.n_tokens), &(batch.token), &(batch.pos), &(batch.n_seq_id), - &(batch.seq_id), &(batch.logits), false)) { + &(batch.seq_id), &(batch.logits), &(batch.batch_id), false)) { return nullptr; } n_tokens = batch.n_tokens; @@ -5556,9 +5559,20 @@ static struct ggml_cgraph * llama_decode_internal_phased( #endif #ifdef GGML_USE_MPI + if (ggml_mpi_iprobe(lctx.ctx_mpi, ggml_mpi_next_node(lctx.ctx_mpi), GGML_MPI_CANCEL_RUN)) { + int count = ggml_mpi_status_count_int32(lctx.ctx_mpi); +// printf("Received async cancel run\n"); + { + std::vector canceled(count, -1); + llama_cancel_run(&lctx, canceled.data(), canceled.size()); + + } + } if (!ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf)) { return nullptr; } + auto it = lctx.canceled_batches.find(batch.batch_id); + if (it == lctx.canceled_batches.end() || !lctx.canceled_batches[batch.batch_id]) { 
#endif // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -5597,27 +5611,28 @@ static struct ggml_cgraph * llama_decode_internal_phased( #else ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); #endif +// update the kv ring buffer + { + if (kv_self.has_shift) { + kv_self.has_shift = false; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } + } -#if GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf); -#endif + kv_self.head += n_tokens; - // update the kv ring buffer - { - if (kv_self.has_shift) { - kv_self.has_shift = false; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; } } +#if GGML_USE_MPI + } + ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf); +#endif - kv_self.head += n_tokens; - // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; - } - } #ifdef GGML_PERF // print timing information per ggml operation (for debugging purposes) @@ -9096,6 +9111,22 @@ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); } +void llama_kv_cache_seq_cp_back(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { +#ifdef GGML_USE_MPI + int32_t vals[4] = {seq_id_src, seq_id_dst, p0, p1}; + ggml_mpi_sync_ints_pipelined_back(ctx->ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_CP_BACK); + seq_id_src = vals[0]; + seq_id_dst = vals[1]; + p0 = vals[2]; + p1 = vals[3]; +#endif + if (seq_id_src == seq_id_dst) { + return; + } + + llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { #ifdef GGML_USE_MPI int32_t vals[1] = {seq_id}; @@ -9611,11 +9642,12 @@ struct llama_batch llama_batch_get_one( /*all_pos_0 =*/ pos_0, /*all_pos_1 =*/ 1, /*all_seq_id =*/ seq_id, + 0 }; } struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { - llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; + llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0,}; if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); @@ -9655,6 +9687,7 @@ int llama_process_mpi_worker( struct llama_batch batch) { ggml_mpi_probe(ctx->ctx_mpi, -1, -1); int tag = ggml_mpi_status_tag(ctx->ctx_mpi); + int32_t count; switch (tag) { case GGML_MPI_DECODE: return llama_decode_internal(*ctx, batch); @@ -9668,6 +9701,9 @@ int llama_process_mpi_worker( case GGML_MPI_KV_SEQ_CP: llama_kv_cache_seq_cp(ctx, 0, 0, 0, 0); break; + case GGML_MPI_KV_SEQ_CP_BACK: + llama_kv_cache_seq_cp_back(ctx, 0, 0, 0, 0); + break; case GGML_MPI_KV_SEQ_KEEP: llama_kv_cache_seq_keep(ctx, 0); break; @@ -9679,12 +9715,38 @@ int llama_process_mpi_worker( llama_backend_free(); exit(0); break; + case GGML_MPI_CANCEL_RUN: + count = ggml_mpi_status_count_int32(ctx->ctx_mpi); +// printf("Received cancel run\n"); + { + std::vector canceled(count, -1); + llama_cancel_run(ctx, canceled.data(), canceled.size()); + + } + break; + default: + printf("Unknown operation, exiting\n"); + exit(1); + break; } return 0; } #endif +void llama_cancel_run(struct llama_context * ctx, int32_t * canceled, int count) { + 
ggml_mpi_sync_ints_pipelined_back(ctx->ctx_mpi, canceled, count, GGML_MPI_CANCEL_RUN); + for (int i = 0; i < count; i++) { + int32_t run_id = canceled[i]; + auto it = ctx->canceled_batches.find(run_id); + if (it != ctx->canceled_batches.end()) { + it->second = true; + } else { + ctx->canceled_batches[run_id] = true; + } + } +} + int llama_decode( struct llama_context * ctx, struct llama_batch batch) { diff --git a/llama.h b/llama.h index fa0e7ace81d..2ef3e2d3c77 100644 --- a/llama.h +++ b/llama.h @@ -155,6 +155,7 @@ extern "C" { llama_pos all_pos_0; // used if pos == NULL llama_pos all_pos_1; // used if pos == NULL llama_seq_id all_seq_id; // used if seq_id == NULL + int32_t batch_id; } llama_batch; struct llama_model_params { @@ -304,6 +305,9 @@ extern "C" { // unless running MPI, in which case it is the rank of the node LLAMA_API int llama_node_id(struct llama_context * ctx); + LLAMA_API bool llama_mpi_iprobe(struct llama_context * lctx); + + LLAMA_API void llama_cancel_run(struct llama_context * ctx, int32_t * canceled, int count); LLAMA_API int llama_max_devices (void); LLAMA_API bool llama_mmap_supported (void); @@ -462,6 +466,13 @@ extern "C" { llama_pos p0, llama_pos p1); +LLAMA_API void llama_kv_cache_seq_cp_back( + struct llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); + // Removes all tokens that do not belong to the specified sequence LLAMA_API void llama_kv_cache_seq_keep( struct llama_context * ctx, From 9f65428022a698bc812b9e8079e60d2fbbdc1daf Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 7 Jan 2024 21:18:44 -0600 Subject: [PATCH 52/63] Mostly fix cache sync issues and simul spec runs --- examples/speculative/speculative.cpp | 266 ++++++++++++++++++++++----- ggml-mpi.c | 29 ++- ggml-mpi.h | 8 + ggml.c | 8 +- llama.cpp | 102 ++++++++-- 5 files changed, 343 insertions(+), 70 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index a0bac66fd34..4f580a03a6f 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -21,6 +21,7 @@ struct seq_draft { std::vector i_batch_tgt; std::vector tokens; + std::vector prefix_tokens; struct llama_sampling_context * ctx_sampling; }; @@ -31,6 +32,7 @@ struct seq_async_run { std::vector drafts; int run_id; int n_past_tgt; + int prefix_n_past_tgt; int n_past_dft; int i_dft; int s_keep; @@ -104,7 +106,7 @@ int main(int argc, char ** argv) { llama_split_layers_weighted(ctx_dft, params.mpi_layer_split[1].data(), params.mpi_layer_split[1].size()); std::deque free_sequence_offsets; - const int n_simul_seqs = 20; + const int n_simul_seqs = 100; const int max_seq = n_simul_seqs * n_seq_dft + 1; for (int i = 0; i < n_simul_seqs; i++) { free_sequence_offsets.push_back(i*n_seq_dft + 1); @@ -258,6 +260,7 @@ int main(int argc, char ** argv) { int run_id = 0; int offset = 1; int run_n_past_tgt = n_past_tgt; + int run_max_n_past = n_past_tgt; int run_n_past_dft = n_past_dft; int seq_offset = free_sequence_offsets.front(); free_sequence_offsets.pop_front(); @@ -271,19 +274,66 @@ int main(int argc, char ** argv) { int i_dft = 0; int s_keep = 0; - for (auto run : tgt_cgraphs) { - if(!run.canceled && (run.n_past_max < n_past_tgt || (run.speculative && generated.back()-(n_past_tgt-run.n_past_tgt) != run.drafts[s_keep].tokens[0]))) { -// LOG("Cancelling run with ID %d, batch ID %d, run_npast_max %d, n_past_tgt %d, run_speculative %d, tokens[0] %d\n", run.run_id, run.batch.batch_id, run.n_past_max, n_past_tgt, 
run_speculative, drafts[s_keep].tokens[0]); -// llama_cancel_run(ctx_tgt, &run.batch.batch_id, 1); - run.canceled = true; + std::vector canceled_batches; + for (auto &run : tgt_cgraphs) { + if(!run.canceled) { + bool correct_prefix = true; + + if (run.speculative && n_past_tgt >= run.prefix_n_past_tgt) { + size_t draft_index = 0; + int prev_token = -1; + int prev_gen_token = -1; + std::vector concat_tokens = run.drafts[s_keep].prefix_tokens; + concat_tokens.insert(concat_tokens.end(), run.drafts[s_keep].tokens.begin(), + run.drafts[s_keep].tokens.end()); + + + LOG("Prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, run.drafts[s_keep].prefix_tokens).c_str()); + + LOG("Concat tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, concat_tokens).c_str()); + + + size_t index = (generated.size() - 1) - (n_past_tgt - run.prefix_n_past_tgt) + draft_index; + while (index < generated.size() && draft_index < concat_tokens.size()) { + LOG("Checking draft at index %zu and generated index %zu\n", draft_index, index); + if (generated.at(index) != concat_tokens[draft_index]) { + LOG("Found non-matching prefix at generated index %zu, draft index %zu, gen token %d, draft token %d, prev draft token %d, prev gen token %d\n", index, draft_index, generated.at(index), concat_tokens[draft_index], prev_token, prev_gen_token); + correct_prefix = false; + break; + } + prev_token = concat_tokens[draft_index]; + prev_gen_token = generated[index]; + draft_index++; + index = (generated.size() - 1) - (n_past_tgt - run.prefix_n_past_tgt) + draft_index; + } + } + + + if (run.n_past_max <= n_past_tgt || !correct_prefix) { + LOG("Cancelling run with ID %d, batch ID %d, run.npast_max %d, run.n_past_tgt %d, n_past_tgt %d, run_speculative %d, tokens[0] %d, generated: %d, generated index: %zu\n", + run.run_id, run.batch.batch_id, run.n_past_max, run.n_past_tgt, n_past_tgt, run.speculative, + run.drafts[s_keep].tokens[0], (n_past_tgt < run.n_past_tgt) ? 
-1 : generated.at( + generated.size() - (n_past_tgt - run.n_past_tgt + 1)), + generated.size() - (n_past_tgt - run.n_past_tgt + 1)); + + if (run.speculative) { + // TODO put these in a vector so they are transmitted in a burst + canceled_batches.push_back(run.batch.batch_id); + } + run.canceled = true; //// } // // if (run_speculative) { // free_sequence_offsets.push_back(seq_offset); // } + } } } + if (!canceled_batches.empty()) { + llama_cancel_run(ctx_tgt, canceled_batches.data(), canceled_batches.size()); + } + if (!tgt_cgraphs.empty()) { struct seq_async_run run = tgt_cgraphs.back(); LOG("Finishing async decode, is async = %d, old seq_offset = %d, new seq offset = %d, batch id = %d\n", run.run_id == ASYNC_RUN_ID, seq_offset, run.seq_offset, run.batch.batch_id); @@ -292,6 +342,7 @@ int main(int argc, char ** argv) { run_id = run.run_id; drafts = run.drafts; run_speculative = run.speculative; + run_max_n_past = run.n_past_max; // ctx_sampling = run.ctx_sampling; run_n_past_tgt = run.n_past_tgt; run_n_past_dft = run.n_past_dft; @@ -299,8 +350,10 @@ int main(int argc, char ** argv) { seq_offset = run.seq_offset; -// LOG("Speculative run, last generated: %d, first draft: %d\n", generated.back(), drafts[s_keep].tokens[0]); - if(run.n_past_max >= n_past_tgt && (!run_speculative || generated.back()-(n_past_tgt-run_n_past_tgt) == drafts[s_keep].tokens[0])) { + LOG("Checking run, last generated: %d, first draft: %d\n", generated.back(), drafts[s_keep].tokens[0]); +// if(run.n_past_max >= n_past_tgt && (!run_speculative || (n_past_tgt-run_n_past_tgt >= 0 && generated.at(generated.size() - (n_past_tgt-run_n_past_tgt+1)) == drafts[s_keep].tokens[0]))) { + + if(!run.canceled) { //drafts[0].tokens.erase(drafts[0].tokens.begin()); for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { @@ -308,12 +361,13 @@ int main(int argc, char ** argv) { } drafts[s].tokens.erase(drafts[s].tokens.begin()); + } } else { // if (run_id != ASYNC_RUN_ID) { - LOG("Cancelling run with ID %d, batch ID %d, run_npast_max %d, n_past_tgt %d, run_speculative %d, tokens[0] %d\n", run.run_id, run.batch.batch_id, run.n_past_max, n_past_tgt, run_speculative, drafts[s_keep].tokens[0]); - llama_cancel_run(ctx_tgt, &run.batch.batch_id, 1); +// LOG("Cancelling run with ID %d, batch ID %d, run_npast_max %d, n_past_tgt %d, run_speculative %d, tokens[0] %d\n", run.run_id, run.batch.batch_id, run.n_past_max, n_past_tgt, run_speculative, drafts[s_keep].tokens[0]); +// llama_cancel_run(ctx_tgt, &run.batch.batch_id, 1); // } llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); tgt_cgraphs.pop_back(); @@ -330,10 +384,12 @@ int main(int argc, char ** argv) { tgt_cgraphs.pop_back(); } - if (llama_node_id(ctx_tgt) == 0) { +// if (llama_node_id(ctx_tgt) == 0) { // llama_kv_cache_view_update(ctx_tgt, &kvc_view); // dump_kv_cache_view_seqs(kvc_view, 20); - } +//// dump_kv_cache_view(kvc_view, 20); +// printf("n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past); +// } // print current draft sequences bool any_active = false; for (int s = 0; s < n_seq_dft; ++s) { @@ -354,6 +410,8 @@ int main(int argc, char ** argv) { std::string token_str; + int old_n_past_tgt = n_past_tgt; + int old_n_past_dft = n_past_dft; std::vector keeps = seq_ids; @@ -377,7 +435,7 @@ int main(int argc, char ** argv) { LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); - LOG("Sampled token: %d ('%s'), n_past_tgt: %d\n", id, token_str.c_str(), n_past_tgt); + LOG("Sampled token: %d ('%s'), 
n_past_tgt: %d, run_n_past_tgt + i_dft: %d, drafts[keeps[0]].i_batch_tgt[i_dft]: %d\n", id, token_str.c_str(), n_past_tgt, run_n_past_tgt + i_dft, drafts[keeps[0]].i_batch_tgt[i_dft]); if (run_n_past_tgt + i_dft == n_past_tgt) { @@ -482,28 +540,79 @@ int main(int argc, char ** argv) { } // LOG("Copying tgt sequence %d to %d from positions %d to %d\n", s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); // llama_kv_cache_seq_cp_back (ctx_tgt, s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); - llama_kv_cache_seq_cp (ctx_tgt, s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); + llama_kv_cache_seq_cp (ctx_tgt, s_keep+seq_offset, 0, old_n_past_tgt, n_past_tgt); + +// if (llama_node_id(ctx_tgt) == 0) { +// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +// dump_kv_cache_view_seqs(kvc_view, 20); +//// dump_kv_cache_view(kvc_view, 20); +// printf("Copied to 0, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past); +// } + + for (int i = 0; i < n_seq_dft; i++) { + llama_kv_cache_seq_cp(ctx_tgt, 0, i+seq_offset, -1, n_past_tgt); +// if (llama_node_id(ctx_tgt) == 0) { +// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +// dump_kv_cache_view_seqs(kvc_view, 20); +//// dump_kv_cache_view(kvc_view, 20); +// printf("Copied from 0 to %d, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", i+seq_offset, n_past_tgt, run_n_past_tgt, run_max_n_past); +// } + } + // LOG("Copying dft sequence %d to %d from positions %d to %d\n", s_keep+seq_offset, 0, run_n_past_dft, n_past_dft); - llama_kv_cache_seq_cp (ctx_dft, s_keep+seq_offset, 0, run_n_past_dft, n_past_dft); + llama_kv_cache_seq_cp (ctx_dft, s_keep+seq_offset, 0, old_n_past_dft, n_past_dft); for (int i = 0; i < n_seq_dft; i++) { // LOG("Removing tgt sequence %d from positions %d to %d\n", i+seq_offset, -1, -1); - llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, -1); + llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, old_n_past_tgt, -1); + +// if (llama_node_id(ctx_tgt) == 0) { +// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +// dump_kv_cache_view_seqs(kvc_view, 20); +//// dump_kv_cache_view(kvc_view, 20); +// printf("Removed %d, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d, old_n_past: %d\n", i+seq_offset, n_past_tgt, run_n_past_tgt, run_max_n_past, old_n_past_tgt); +// } + // LOG("Removing dft sequence %d from positions %d to %d\n", i+seq_offset, -1, -1); - llama_kv_cache_seq_rm (ctx_dft, i+seq_offset, -1, -1); + llama_kv_cache_seq_rm (ctx_dft, i+seq_offset, old_n_past_dft, -1); } - for (int i = 0; i < max_seq; i++) { + for (int i = 1; i < max_seq; i++) { // LOG("Copying tgt sequence %d to %d from positions %d to %d\n", 0, i, -1, n_past_tgt); // LOG("Copying dft sequence %d to %d from positions %d to %d\n", 0, i, -1, n_past_dft); - llama_kv_cache_seq_cp(ctx_tgt, 0, i, -1, n_past_tgt); - llama_kv_cache_seq_cp(ctx_dft, 0, i, -1, n_past_dft); + llama_kv_cache_seq_rm(ctx_tgt, i, old_n_past_tgt, n_past_tgt); + llama_kv_cache_seq_rm(ctx_dft, i, old_n_past_dft, n_past_dft); +// +// if (llama_node_id(ctx_tgt) == 0) { +//// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +//// dump_kv_cache_view_seqs(kvc_view, 20); +//// dump_kv_cache_view(kvc_view, 20); +// printf("Removed %d, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d, old_n_past: %d\n", i+seq_offset, n_past_tgt, run_n_past_tgt, run_max_n_past, old_n_past_tgt); +// } + + llama_kv_cache_seq_cp(ctx_tgt, 0, i, old_n_past_tgt, n_past_tgt); + +// if (llama_node_id(ctx_tgt) == 0) { +//// llama_kv_cache_view_update(ctx_tgt, &kvc_view); 
+//// dump_kv_cache_view_seqs(kvc_view, 20); +//// dump_kv_cache_view(kvc_view, 20); +// printf("Copied 0 to %d, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d, old_n_past: %d\n", i, n_past_tgt, run_n_past_tgt, run_max_n_past, old_n_past_tgt); +// } + + llama_kv_cache_seq_cp(ctx_dft, 0, i, old_n_past_dft, n_past_dft); } +// if (llama_node_id(ctx_tgt) == 0) { +// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +// dump_kv_cache_view_seqs(kvc_view, 20); +//// dump_kv_cache_view(kvc_view, 20); +// printf("Kept sequence, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past); +// } + // for (int i = 0; i < n_seq_dft; i++) { // llama_kv_cache_seq_cp(ctx_tgt, 0, i+seq_offset, -1, n_past_tgt); // } @@ -529,10 +638,18 @@ int main(int argc, char ** argv) { { - LOG("Beginning async decode, batch id = %d\n", batch_id + 1); + + batch_id++; + + + LOG("Beginning async decode, batch id = %d\n", batch_id); llama_batch_clear(batch_tgt_async); llama_batch_add(batch_tgt_async, id, n_past_tgt, {0}, true); + + LOG("target async batch: %s\n, batch_id = %d\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt_async).c_str(), batch_id); + + // batch_tgt.n_tokens = 1 ++n_past_tgt; @@ -549,7 +666,6 @@ int main(int argc, char ** argv) { // printf("NO FREE OFFSETS AND NO TGT CGRAPHS\n"); // } run.batch = llama_batch_init(params.n_ctx, 0, max_seq); - batch_id++; run.batch.batch_id = batch_id; run.batch.n_tokens = batch_tgt_async.n_tokens; for (int i = 0; i < batch_tgt_async.n_tokens; i++) { @@ -573,22 +689,31 @@ int main(int argc, char ** argv) { run.drafts[s].active = drafts[s].active; run.drafts[s].drafting = drafts[s].drafting; run.drafts[s].skip = drafts[s].skip; + run.drafts[s].prefix_tokens = std::vector(0); } run.i_dft = offset - 1; run.s_keep = s_keep; run.run_id = ASYNC_RUN_ID; run.n_past_tgt = n_past_tgt; + run.prefix_n_past_tgt = n_past_tgt; run.n_past_max = n_past_tgt + 1; run.n_past_dft = n_past_dft; run.speculative = false; run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); tgt_cgraphs.push_front(run); //llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_past_tgt+1); - for (int i = 0; i < max_seq; i++) { + for (int i = 1; i < max_seq; i++) { // LOG("Copying tgt sequence %d to %d from positions %d to %d\n", 0, i, n_past_tgt-1, n_past_tgt); - + llama_kv_cache_seq_rm(ctx_tgt, i, n_past_tgt-1, n_past_tgt); llama_kv_cache_seq_cp(ctx_tgt, 0, i, n_past_tgt-1, n_past_tgt); } + +// if (llama_node_id(ctx_tgt) == 0) { +// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +// dump_kv_cache_view_seqs(kvc_view, 20); +//// dump_kv_cache_view(kvc_view, 20); +// printf("Copied async, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past); +// } } @@ -670,21 +795,32 @@ int main(int argc, char ** argv) { break; } -// LOG("Doing speculative run, seq_offset = %d, spec_past_tgt = %d, spec_past_dft = %d, n_past_tgt = %d, n_past_dft = %d", seq_offset, spec_past_tgt, spec_past_dft, n_past_tgt, n_past_dft); + LOG("Doing speculative run, seq_offset = %d, spec_past_tgt = %d, spec_past_dft = %d, n_past_tgt = %d, n_past_dft = %d", seq_offset, spec_past_tgt, spec_past_dft, n_past_tgt, n_past_dft); for (int i = 0; i < n_seq_dft; i++) { // LOG("Removing tgt sequence %d from positions %d to %d\n", i + seq_offset, -1, -1); +// if(first_run) { - llama_kv_cache_seq_rm(ctx_tgt, i + seq_offset, -1, -1); + llama_kv_cache_seq_rm(ctx_tgt, i + seq_offset, -1, -1); -// LOG("Copying tgt sequence %d to %d from positions %d to %d\n", (first_run) ? 
0 : orig_offset, i + seq_offset, -1, spec_past_tgt); + LOG("Copying tgt sequence %d to %d from positions %d to %d\n", (first_run) ? 0 : orig_offset, + i + seq_offset, -1, (first_run) ? spec_past_tgt : spec_past_tgt); - llama_kv_cache_seq_cp(ctx_tgt, (first_run) ? 0 : orig_offset, i + seq_offset, -1, spec_past_tgt); + llama_kv_cache_seq_cp(ctx_tgt, (first_run) ? 0 : orig_offset, i + seq_offset, -1, (first_run) ? spec_past_tgt : spec_past_tgt); +// if (llama_node_id(ctx_tgt) == 0) { +// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +// dump_kv_cache_view_seqs(kvc_view, 20); +//// dump_kv_cache_view(kvc_view, 20); +// printf("Copied tgt sequence, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d, spec_past_tgt: %d, first_run: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past, spec_past_tgt, first_run); +// } +// } // LOG("Removing dft sequence %d from positions %d to %d\n", i + seq_offset, spec_past_dft, -1); - llama_kv_cache_seq_rm(ctx_dft, i + seq_offset, spec_past_dft, -1); + + llama_kv_cache_seq_rm(ctx_dft, i + seq_offset, spec_past_dft, -1); + } @@ -692,6 +828,14 @@ int main(int argc, char ** argv) { for (int s = 0; s < n_seq_dft; ++s) { drafts[s].active = false; + if (!first_run) { + if (!drafts[s].tokens.empty()) { + drafts[s].prefix_tokens.insert(drafts[s].prefix_tokens.end(), drafts[s].tokens.begin(), + drafts[s].tokens.end()); + } + } else { + drafts[s].prefix_tokens.clear(); + } drafts[s].tokens.clear(); drafts[s].i_batch_tgt.clear(); } @@ -701,39 +845,43 @@ int main(int argc, char ** argv) { // LOG("Copying dft sequence %d to %d from positions %d to %d\n", (first_run) ? 0 : orig_offset, seq_offset, -1, spec_past_dft); - llama_kv_cache_seq_cp(ctx_dft, (first_run) ? 0 : orig_offset, seq_offset, -1, spec_past_dft); + llama_kv_cache_seq_cp(ctx_dft, (first_run) ? 0 : orig_offset, seq_offset, -1, (first_run) ? 
spec_past_dft : spec_past_dft + 1); llama_batch_clear(batch_dft); - llama_batch_add(batch_dft, id, spec_past_dft, {seq_offset}, true); - // batch_dft.n_tokens == 1 now + if (first_run) { + llama_batch_add(batch_dft, id, spec_past_dft, {seq_offset}, true); + // batch_dft.n_tokens == 1 now - // Kick off drafting pipeline but don't need it just yet - LOG("Beginning async draft\n"); - dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); - // DON'T FORGET THE MATCHING DECODE WHEN NEEDED - // We need the draft now, so wait for it - if (!dft_cgraphs.empty()) { - LOG("Finishing async decode of draft\n"); - llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); - dft_cgraphs.pop_back(); - } - LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); - if (first_run) { + // Kick off drafting pipeline but don't need it just yet + LOG("Beginning async draft\n"); + dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); + // DON'T FORGET THE MATCHING DECODE WHEN NEEDED + + // We need the draft now, so wait for it + if (!dft_cgraphs.empty()) { + LOG("Finishing async decode of draft\n"); + llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); + dft_cgraphs.pop_back(); + } + LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + for (int i = 0; i < max_seq; i++) { // LOG("Copying dft sequence %d to %d from positions %d to %d\n", seq_offset, i, spec_past_dft, spec_past_dft + 1); - llama_kv_cache_seq_cp(ctx_dft, seq_offset, i, spec_past_dft, spec_past_dft + 1); + if (i != seq_offset) { + llama_kv_cache_seq_cp(ctx_dft, seq_offset, i, spec_past_dft, spec_past_dft + 1); + } } ++n_past_dft; } else { - for (int i = 0; i < n_seq_dft; i++) { + for (int i = 1; i < n_seq_dft; i++) { // LOG("Copying dft sequence %d to %d from positions %d to %d\n", seq_offset, i+seq_offset, spec_past_dft, spec_past_dft + 1); - llama_kv_cache_seq_cp(ctx_dft, seq_offset, i+seq_offset, spec_past_dft, spec_past_dft + 1); + llama_kv_cache_seq_cp(ctx_dft, seq_offset, i+seq_offset, -1, spec_past_dft + 1); } } @@ -908,11 +1056,23 @@ int main(int argc, char ** argv) { max_ran_seq = n_seq_cur; + llama_batch_clear(batch_dft); + if (batch_tgt.n_tokens > n_draft) { break; } } + // no sequence is drafting anymore + if (batch_dft.n_tokens != 0) { + // evaluate the drafted tokens on the draft model + LOG("Running synchronous draft decode\n"); + llama_decode(ctx_dft, batch_dft); + + } + + + @@ -942,7 +1102,10 @@ int main(int argc, char ** argv) { //drafts[s].tokens.erase(drafts[s].tokens.begin()); } - LOG("target batch: %s\n, batch_id = %d\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str(), batch_id + 1); + batch_id++; + + + LOG("target batch: %s\n, batch_id = %d\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str(), batch_id); struct seq_async_run run; run.canceled = false; run.speculative = true; @@ -959,11 +1122,11 @@ int main(int argc, char ** argv) { run.drafts[s].drafting = drafts[s].drafting; run.drafts[s].skip = drafts[s].skip; run.drafts[s].i_batch_dft = drafts[s].i_batch_dft; + run.drafts[s].prefix_tokens = drafts[s].prefix_tokens; } run.i_dft = offset; run.s_keep = s_keep; run.batch = llama_batch_init(params.n_ctx, 0, max_seq); - batch_id++; run.batch.batch_id = batch_id; run.batch.n_tokens = batch_tgt.n_tokens; for (int i = 0; i < batch_tgt.n_tokens; i++) { @@ -977,9 +1140,10 @@ int main(int argc, char ** argv) { run.batch.logits[i] = batch_tgt.logits[i]; } run.run_id = 0; - run.n_past_tgt = spec_past_tgt + 1; + run.n_past_tgt = 
spec_past_tgt+1; + run.prefix_n_past_tgt = n_past_tgt+1; run.n_past_dft = n_past_dft; - run.n_past_max = spec_past_tgt + 1 + max_draft_tokens; + run.n_past_max = spec_past_tgt + max_draft_tokens; run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); tgt_cgraphs.push_front(run); diff --git a/ggml-mpi.c b/ggml-mpi.c index c02a55f3495..1c8c4579fb5 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -31,8 +31,22 @@ struct ggml_mpi_context { bool res; bool embed; void* send_buffer; + int trans_id; + int recv_trans_id; }; +int ggml_mpi_recv_trans_id(struct ggml_mpi_context * ctx_mpi) { + return ctx_mpi->recv_trans_id; +} + +int ggml_mpi_trans_id(struct ggml_mpi_context * ctx_mpi) { + return ctx_mpi->trans_id; +} + +void ggml_mpi_inc_trans_id(struct ggml_mpi_context * ctx_mpi) { + ctx_mpi->trans_id++; +} + void ggml_mpi_sync_pipelined( struct ggml_mpi_context * ctx_mpi, void * val, @@ -86,6 +100,11 @@ void ggml_mpi_free(struct ggml_mpi_context * ctx) { if(ctx->comm == MPI_COMM_NULL) { return; } + + if (ctx->comm == NULL) { + return; + } + ggml_mpi_sync_pipelined(ctx, NULL, 0, MPI_INT8_T, 6); MPI_Comm_free(&(ctx->comm)); free(ctx); @@ -294,6 +313,10 @@ void ggml_mpi_sync_ints_pipelined( int tag ) { ggml_mpi_sync_pipelined(ctx_mpi, vals, count, MPI_INT32_T, tag); + int old_trans = ctx_mpi->trans_id; + ggml_mpi_sync_pipelined(ctx_mpi, &ctx_mpi->trans_id, 1, MPI_INT32_T, GGML_MPI_TRANS_ID); + ctx_mpi->recv_trans_id = ctx_mpi->trans_id; + ctx_mpi->trans_id = old_trans; } void ggml_mpi_sync_ints_pipelined_back( @@ -303,6 +326,10 @@ void ggml_mpi_sync_ints_pipelined_back( int tag ) { ggml_mpi_sync_pipelined_back(ctx_mpi, vals, count, MPI_INT32_T, tag); + int old_trans = ctx_mpi->trans_id; + ggml_mpi_sync_pipelined_back(ctx_mpi, &ctx_mpi->trans_id, 1, MPI_INT32_T, GGML_MPI_TRANS_ID); + ctx_mpi->recv_trans_id = ctx_mpi->trans_id; + ctx_mpi->trans_id = old_trans; } void ggml_mpi_synch_int( @@ -348,7 +375,7 @@ void ggml_mpi_send_float_array_async( int dest, int tag ) { -// printf("Rank %d send float array async, count=%d\n", ctx_mpi->rank, arr_size); +// printf("Rank %d send float array async, count=%d, val==null: %d\n", ctx_mpi->rank, arr_size, val == NULL); int ret = MPI_Bsend(val, arr_size, MPI_FLOAT, dest, tag, ctx_mpi->comm); GGML_ASSERT(ret == MPI_SUCCESS); } diff --git a/ggml-mpi.h b/ggml-mpi.h index 2b6069223e9..f2a4cce85cb 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -33,6 +33,8 @@ extern "C" { #define GGML_MPI_KV_SEQ_CP_BACK 10 +#define GGML_MPI_TRANS_ID 11 + /** * The context used for MPI operations, * a program may make use of more than one @@ -47,6 +49,12 @@ extern "C" { struct ggml_mpi_context; +int ggml_mpi_trans_id(struct ggml_mpi_context * ctx_mpi); + +int ggml_mpi_recv_trans_id(struct ggml_mpi_context * ctx_mpi); + +void ggml_mpi_inc_trans_id(struct ggml_mpi_context * ctx_mpi); + /** * Initialize the MPI library and the GGML MPI backend. 
* Calling more than once during the lifetime of the program diff --git a/ggml.c b/ggml.c index f92292b39c6..14a982e877d 100644 --- a/ggml.c +++ b/ggml.c @@ -15718,10 +15718,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { int node_n = -1; while (true) { - if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->node_n += 1; - return (thread_ret_t) GGML_EXIT_ABORTED; - } +// if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { +// state->shared->node_n += 1; +// return (thread_ret_t) GGML_EXIT_ABORTED; +// } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again diff --git a/llama.cpp b/llama.cpp index 666e8d79c05..9711276a25a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -76,6 +76,7 @@ #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -662,9 +663,23 @@ static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) { // ggml helpers // -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { +static std::function abort_callback_function; + +static bool ab_callback(void * data) { + if (abort_callback_function != nullptr) { + return abort_callback_function(data); + } + return false; +} + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads, std::function callback = nullptr, void * abort_data = nullptr) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + abort_callback_function = std::move(callback); + + plan.abort_callback = &ab_callback; + plan.abort_callback_data = abort_data; + if (plan.work_size > 0) { buf.resize(plan.work_size); plan.work_data = buf.data(); @@ -5430,6 +5445,9 @@ static struct ggml_cgraph * llama_decode_internal_phased( llama_batch & batch, uint8_t phase, ggml_cgraph * cgraph) { + if (phase == 0) { + ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.n_tokens, 1, 0); + } uint32_t n_tokens = batch.n_tokens; if (n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); @@ -5609,7 +5627,35 @@ static struct ggml_cgraph * llama_decode_internal_phased( ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); } #else - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + auto abort_callback = [&lctx, &batch](void * data) -> bool { + if (data != nullptr && *((bool*)data)) { + return true; + } + if (ggml_mpi_iprobe(lctx.ctx_mpi, ggml_mpi_next_node(lctx.ctx_mpi), GGML_MPI_CANCEL_RUN)) { + int count = ggml_mpi_status_count_int32(lctx.ctx_mpi); +// printf("Received async cancel run\n"); + { + std::vector canceled(count, -1); + llama_cancel_run(&lctx, canceled.data(), canceled.size()); + + } + auto it = lctx.canceled_batches.find(batch.batch_id); + if (it != lctx.canceled_batches.end() && lctx.canceled_batches[batch.batch_id]) { + if (data != nullptr) { + *((bool *) data) = true; + } + return true; + } + } + return false; + }; + + bool * aborted = static_cast(malloc(sizeof(bool))); + *aborted = false; + + ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads, abort_callback, aborted); + + free(aborted); #endif // update the kv ring buffer { @@ -5661,7 +5707,7 @@ static struct ggml_cgraph * llama_decode_internal_phased( #ifdef GGML_USE_MPI if (ggml_mpi_size(lctx.ctx_mpi) > 1 && ggml_mpi_rank(lctx.ctx_mpi) == 0) { - ggml_mpi_recv_float_array(lctx.ctx_mpi, logits_out.data(), n_vocab * n_tokens, 
ggml_mpi_size(lctx.ctx_mpi) - 1, GGML_MPI_SYNC_LOGITS); + ggml_mpi_recv_float_array(lctx.ctx_mpi, logits_out.data(), (batch.logits || lctx.logits_all) ? n_vocab * n_tokens : n_vocab, ggml_mpi_size(lctx.ctx_mpi) - 1, GGML_MPI_SYNC_LOGITS); } if (ggml_mpi_rank(lctx.ctx_mpi) == ggml_mpi_size(lctx.ctx_mpi) - 1) { @@ -5705,7 +5751,7 @@ static struct ggml_cgraph * llama_decode_internal_phased( #ifdef GGML_USE_MPI } if (ggml_mpi_size(lctx.ctx_mpi) > 1 && ggml_mpi_rank(lctx.ctx_mpi) == ggml_mpi_size(lctx.ctx_mpi) - 1) { - ggml_mpi_send_float_array_async(lctx.ctx_mpi, logits_out.data(), n_vocab * n_tokens, 0, GGML_MPI_SYNC_LOGITS); + ggml_mpi_send_float_array_async(lctx.ctx_mpi, logits_out.data(), (batch.logits || lctx.logits_all) ? n_vocab * n_tokens : n_vocab, 0, GGML_MPI_SYNC_LOGITS); } #endif @@ -5742,10 +5788,10 @@ static struct ggml_cgraph * llama_decode_internal_phased( // static int llama_decode_internal( llama_context & lctx, - llama_batch batch) { + llama_batch & batch) { struct ggml_cgraph * gf = llama_decode_internal_phased(lctx, batch, 0, nullptr); if (gf != nullptr) { - return llama_decode_internal_phased(lctx, batch, 1, gf) != nullptr; + return llama_decode_internal_phased(lctx, batch, 1, gf) == nullptr; } else { //printf("Graph is null\n"); return -1; @@ -9095,10 +9141,33 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); } +void llama_kv_cache_seq_cp_sync_bi(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { +#ifdef GGML_USE_MPI + + int32_t vals[4] = {seq_id_src, seq_id_dst, p0, p1}; + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_CP); + ggml_mpi_sync_ints_pipelined_back(ctx->ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_CP_BACK); + ggml_mpi_inc_trans_id(ctx->ctx_mpi); + seq_id_src = vals[0]; + seq_id_dst = vals[1]; + p0 = vals[2]; + p1 = vals[3]; +#endif + if (seq_id_src == seq_id_dst) { + return; + } + + llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { #ifdef GGML_USE_MPI int32_t vals[4] = {seq_id_src, seq_id_dst, p0, p1}; - ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 4, 3); + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_CP); + if(ggml_mpi_recv_trans_id(ctx->ctx_mpi) < ggml_mpi_trans_id(ctx->ctx_mpi)) { + return; + } + ggml_mpi_inc_trans_id(ctx->ctx_mpi); seq_id_src = vals[0]; seq_id_dst = vals[1]; p0 = vals[2]; @@ -9115,6 +9184,10 @@ void llama_kv_cache_seq_cp_back(struct llama_context * ctx, llama_seq_id seq_id_ #ifdef GGML_USE_MPI int32_t vals[4] = {seq_id_src, seq_id_dst, p0, p1}; ggml_mpi_sync_ints_pipelined_back(ctx->ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_CP_BACK); + if(ggml_mpi_recv_trans_id(ctx->ctx_mpi) < ggml_mpi_trans_id(ctx->ctx_mpi)) { + return; + } + ggml_mpi_inc_trans_id(ctx->ctx_mpi); seq_id_src = vals[0]; seq_id_dst = vals[1]; p0 = vals[2]; @@ -9585,18 +9658,19 @@ int llama_eval( // Enter a blocking eval loop with dummy input, letting rank=0 drive the process const int n_ctx = llama_n_ctx(ctx); std::vector tmp(n_ctx, llama_token_bos(&(ctx->model))); + llama_batch tmp_batch = llama_batch_get_one(tmp.data(), tmp.size(), n_past, 0); do { //ggml_mpi_synch_int(ctx->ctx_mpi, &n_past); llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); - } while (llama_decode_internal(*ctx, llama_batch_get_one(tmp.data(), tmp.size(), n_past, 0)) >= 0); + 
} while (llama_decode_internal(*ctx, tmp_batch) >= 0); llama_backend_free(); exit(1); } #endif - + llama_batch tmp_batch = llama_batch_get_one(tokens, n_tokens, n_past, 0); llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); - const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); + const int ret = llama_decode_internal(*ctx, tmp_batch); if (ret < 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } @@ -9611,7 +9685,7 @@ int llama_eval_embd( int n_past) { llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); - llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, 0,}; const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { @@ -9684,7 +9758,7 @@ void llama_batch_free(struct llama_batch batch) { int llama_process_mpi_worker( struct llama_context * ctx, - struct llama_batch batch) { + struct llama_batch batch) { ggml_mpi_probe(ctx->ctx_mpi, -1, -1); int tag = ggml_mpi_status_tag(ctx->ctx_mpi); int32_t count; @@ -9756,7 +9830,7 @@ int llama_decode( // Enter a blocking eval loop with dummy input, letting rank=0 drive the process const int n_ctx = llama_n_ctx(ctx); std::vector tmp(n_ctx, llama_token_bos(&(ctx->model))); - while (llama_process_mpi_worker(ctx, batch) >= 0){}; + while (llama_process_mpi_worker(ctx, batch) >= 0){} llama_backend_free(); exit(1); } else if (ggml_mpi_rank(ctx->ctx_mpi) < 0) { @@ -9764,7 +9838,7 @@ int llama_decode( } #endif const int ret = llama_decode_internal(*ctx, batch); - if (ret < 0) { + if (ret != 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } From 73b92c796181b466e909c4bd82c7a78f602c5421 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 7 Jan 2024 23:13:00 -0600 Subject: [PATCH 53/63] Cancel after sampling --- examples/speculative/speculative.cpp | 147 ++++++++++++++------------- 1 file changed, 79 insertions(+), 68 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 4f580a03a6f..8fe43a29f94 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -43,6 +43,9 @@ struct seq_async_run { bool canceled; }; +void check_for_cancel(llama_context *ctx_tgt, int n_past_tgt, std::deque &tgt_cgraphs, + std::vector &generated); + int main(int argc, char ** argv) { gpt_params params; @@ -259,7 +262,7 @@ int main(int argc, char ** argv) { int run_id = 0; int offset = 1; - int run_n_past_tgt = n_past_tgt; + int run_n_past_tgt = n_past_tgt-1; int run_max_n_past = n_past_tgt; int run_n_past_dft = n_past_dft; int seq_offset = free_sequence_offsets.front(); @@ -274,65 +277,7 @@ int main(int argc, char ** argv) { int i_dft = 0; int s_keep = 0; - std::vector canceled_batches; - for (auto &run : tgt_cgraphs) { - if(!run.canceled) { - bool correct_prefix = true; - - if (run.speculative && n_past_tgt >= run.prefix_n_past_tgt) { - size_t draft_index = 0; - int prev_token = -1; - int prev_gen_token = -1; - std::vector concat_tokens = run.drafts[s_keep].prefix_tokens; - concat_tokens.insert(concat_tokens.end(), run.drafts[s_keep].tokens.begin(), - run.drafts[s_keep].tokens.end()); - - - LOG("Prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, run.drafts[s_keep].prefix_tokens).c_str()); - - LOG("Concat tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, concat_tokens).c_str()); - - - size_t index = (generated.size() - 1) - (n_past_tgt - 
run.prefix_n_past_tgt) + draft_index; - while (index < generated.size() && draft_index < concat_tokens.size()) { - LOG("Checking draft at index %zu and generated index %zu\n", draft_index, index); - if (generated.at(index) != concat_tokens[draft_index]) { - LOG("Found non-matching prefix at generated index %zu, draft index %zu, gen token %d, draft token %d, prev draft token %d, prev gen token %d\n", index, draft_index, generated.at(index), concat_tokens[draft_index], prev_token, prev_gen_token); - correct_prefix = false; - break; - } - prev_token = concat_tokens[draft_index]; - prev_gen_token = generated[index]; - draft_index++; - index = (generated.size() - 1) - (n_past_tgt - run.prefix_n_past_tgt) + draft_index; - } - } - - - if (run.n_past_max <= n_past_tgt || !correct_prefix) { - LOG("Cancelling run with ID %d, batch ID %d, run.npast_max %d, run.n_past_tgt %d, n_past_tgt %d, run_speculative %d, tokens[0] %d, generated: %d, generated index: %zu\n", - run.run_id, run.batch.batch_id, run.n_past_max, run.n_past_tgt, n_past_tgt, run.speculative, - run.drafts[s_keep].tokens[0], (n_past_tgt < run.n_past_tgt) ? -1 : generated.at( - generated.size() - (n_past_tgt - run.n_past_tgt + 1)), - generated.size() - (n_past_tgt - run.n_past_tgt + 1)); - - if (run.speculative) { - // TODO put these in a vector so they are transmitted in a burst - canceled_batches.push_back(run.batch.batch_id); - } - run.canceled = true; -//// } -// -// if (run_speculative) { -// free_sequence_offsets.push_back(seq_offset); -// } - } - } - } - - if (!canceled_batches.empty()) { - llama_cancel_run(ctx_tgt, canceled_batches.data(), canceled_batches.size()); - } + check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated); if (!tgt_cgraphs.empty()) { struct seq_async_run run = tgt_cgraphs.back(); @@ -438,7 +383,7 @@ int main(int argc, char ** argv) { LOG("Sampled token: %d ('%s'), n_past_tgt: %d, run_n_past_tgt + i_dft: %d, drafts[keeps[0]].i_batch_tgt[i_dft]: %d\n", id, token_str.c_str(), n_past_tgt, run_n_past_tgt + i_dft, drafts[keeps[0]].i_batch_tgt[i_dft]); - if (run_n_past_tgt + i_dft == n_past_tgt) { + if (run_n_past_tgt + i_dft == n_past_tgt-1) { any_match = true; ++n_predict; llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); @@ -491,13 +436,13 @@ int main(int argc, char ** argv) { } if (matches) { - if (run_n_past_tgt + i_dft == n_past_tgt) { + if (run_n_past_tgt + i_dft == n_past_tgt-1) { ++n_accept; ++n_past_tgt; ++n_past_dft; } ++i_dft; - if (run_id != ASYNC_RUN_ID && run_n_past_tgt + i_dft <= n_past_tgt) { + if (run_id != ASYNC_RUN_ID && run_n_past_tgt + i_dft < n_past_tgt) { continue; } } @@ -520,6 +465,8 @@ int main(int argc, char ** argv) { continue; } + check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated); + // Pipeline syncing cache ops // llama_kv_cache_seq_keep(ctx_dft, s_keep); // llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); @@ -694,9 +641,9 @@ int main(int argc, char ** argv) { run.i_dft = offset - 1; run.s_keep = s_keep; run.run_id = ASYNC_RUN_ID; - run.n_past_tgt = n_past_tgt; - run.prefix_n_past_tgt = n_past_tgt; - run.n_past_max = n_past_tgt + 1; + run.n_past_tgt = n_past_tgt-1; + run.prefix_n_past_tgt = n_past_tgt-1; + run.n_past_max = n_past_tgt; run.n_past_dft = n_past_dft; run.speculative = false; run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); @@ -1140,8 +1087,8 @@ int main(int argc, char ** argv) { run.batch.logits[i] = batch_tgt.logits[i]; } run.run_id = 0; - run.n_past_tgt = spec_past_tgt+1; - run.prefix_n_past_tgt = n_past_tgt+1; + run.n_past_tgt = 
spec_past_tgt; + run.prefix_n_past_tgt = n_past_tgt; run.n_past_dft = n_past_dft; run.n_past_max = spec_past_tgt + max_draft_tokens; run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); @@ -1212,3 +1159,67 @@ int main(int argc, char ** argv) { return 0; } + +void check_for_cancel(llama_context *ctx_tgt, int n_past_tgt, std::deque &tgt_cgraphs, + std::vector &generated) { + std::vector canceled_batches; + for (auto &run : tgt_cgraphs) { + if(!run.canceled) { + bool correct_prefix = true; + + if (run.speculative && n_past_tgt >= run.prefix_n_past_tgt) { + size_t draft_index = 0; + int prev_token = -1; + int prev_gen_token = -1; + std::vector concat_tokens = run.drafts[0].prefix_tokens; + concat_tokens.insert(concat_tokens.end(), run.drafts[0].tokens.begin(), + run.drafts[0].tokens.end()); + + + LOG("Prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, run.drafts[0].prefix_tokens).c_str()); + + LOG("Concat tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, concat_tokens).c_str()); + + + size_t index = run.prefix_n_past_tgt + draft_index; + LOG("Looping over run starting at gen index %zu, draft index %zu, prefix_n_past_tgt %d, n_past_tgt %d, generated size %zu\n", index, draft_index, run.prefix_n_past_tgt, n_past_tgt, generated.size()); + while (index < generated.size() && draft_index < concat_tokens.size() && generated.size() > (size_t)run.prefix_n_past_tgt) { + LOG("Checking draft at index %zu and generated index %zu\n", draft_index, index); + if (generated.at(index) != concat_tokens[draft_index]) { + LOG("Found non-matching prefix at generated index %zu, draft index %zu, gen token %d, draft token %d, prev draft token %d, prev gen token %d\n", index, draft_index, generated.at(index), concat_tokens[draft_index], prev_token, prev_gen_token); + correct_prefix = false; + break; + } + prev_token = concat_tokens[draft_index]; + prev_gen_token = generated[index]; + draft_index++; + index = run.prefix_n_past_tgt + draft_index; + } + } + + + if (run.n_past_max < n_past_tgt || !correct_prefix) { + LOG("Cancelling run with ID %d, batch ID %d, run.npast_max %d, run.n_past_tgt %d, n_past_tgt %d, run_speculative %d, tokens[0] %d, generated: %d, generated index: %zu\n", + run.run_id, run.batch.batch_id, run.n_past_max, run.n_past_tgt, n_past_tgt, run.speculative, + run.drafts[0].tokens[0], (n_past_tgt < run.n_past_tgt) ? 
-1 : generated.at( + generated.size() - (n_past_tgt - run.n_past_tgt + 1)), + generated.size() - (n_past_tgt - run.n_past_tgt + 1)); + + if (run.speculative) { + // TODO put these in a vector so they are transmitted in a burst + canceled_batches.push_back(run.batch.batch_id); + } + run.canceled = true; +//// } +// +// if (run_speculative) { +// free_sequence_offsets.push_back(seq_offset); +// } + } + } + } + + if (!canceled_batches.empty()) { + llama_cancel_run(ctx_tgt, canceled_batches.data(), canceled_batches.size()); + } +} From e797f1a9a995a298adf878b1d669098407a929d0 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 8 Jan 2024 11:19:29 -0600 Subject: [PATCH 54/63] Force at least 2 spec runs --- examples/speculative/speculative.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 8fe43a29f94..daf9f47d833 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -678,6 +678,7 @@ int main(int argc, char ** argv) { int first_run = true; +// bool is_waiting = llama_mpi_iprobe(ctx_tgt); bool is_waiting = false; // llama_swap_comm(ctx_tgt); // llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); @@ -708,7 +709,7 @@ int main(int argc, char ** argv) { free_sequence_offsets.push_back(seq_offset); } int iter = 0; - while(!is_waiting) { + while(iter < 2 || !is_waiting) { From 7674bde51f13f7c8b997241c5b69925019da9876 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 8 Jan 2024 13:58:17 -0600 Subject: [PATCH 55/63] Take secondary draft sequences into account for cancellation --- examples/speculative/speculative.cpp | 67 ++++++++++++++++------------ 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index daf9f47d833..91d5a025b1e 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -44,7 +44,7 @@ struct seq_async_run { }; void check_for_cancel(llama_context *ctx_tgt, int n_past_tgt, std::deque &tgt_cgraphs, - std::vector &generated); + std::vector &generated, const int n_seq_dft); int main(int argc, char ** argv) { gpt_params params; @@ -277,7 +277,7 @@ int main(int argc, char ** argv) { int i_dft = 0; int s_keep = 0; - check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated); + check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated, n_seq_dft); if (!tgt_cgraphs.empty()) { struct seq_async_run run = tgt_cgraphs.back(); @@ -465,7 +465,7 @@ int main(int argc, char ** argv) { continue; } - check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated); + check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated, n_seq_dft); // Pipeline syncing cache ops // llama_kv_cache_seq_keep(ctx_dft, s_keep); @@ -1162,39 +1162,48 @@ int main(int argc, char ** argv) { } void check_for_cancel(llama_context *ctx_tgt, int n_past_tgt, std::deque &tgt_cgraphs, - std::vector &generated) { + std::vector &generated, const int n_seq_dft) { std::vector canceled_batches; for (auto &run : tgt_cgraphs) { if(!run.canceled) { bool correct_prefix = true; if (run.speculative && n_past_tgt >= run.prefix_n_past_tgt) { - size_t draft_index = 0; - int prev_token = -1; - int prev_gen_token = -1; - std::vector concat_tokens = run.drafts[0].prefix_tokens; - concat_tokens.insert(concat_tokens.end(), run.drafts[0].tokens.begin(), - run.drafts[0].tokens.end()); - - - LOG("Prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, 
run.drafts[0].prefix_tokens).c_str()); - - LOG("Concat tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, concat_tokens).c_str()); - - - size_t index = run.prefix_n_past_tgt + draft_index; - LOG("Looping over run starting at gen index %zu, draft index %zu, prefix_n_past_tgt %d, n_past_tgt %d, generated size %zu\n", index, draft_index, run.prefix_n_past_tgt, n_past_tgt, generated.size()); - while (index < generated.size() && draft_index < concat_tokens.size() && generated.size() > (size_t)run.prefix_n_past_tgt) { - LOG("Checking draft at index %zu and generated index %zu\n", draft_index, index); - if (generated.at(index) != concat_tokens[draft_index]) { - LOG("Found non-matching prefix at generated index %zu, draft index %zu, gen token %d, draft token %d, prev draft token %d, prev gen token %d\n", index, draft_index, generated.at(index), concat_tokens[draft_index], prev_token, prev_gen_token); - correct_prefix = false; - break; + for (int draft_id = 0; draft_id < n_seq_dft; draft_id++) { + if (!run.drafts[draft_id].tokens.empty()) { + correct_prefix = true; + } + size_t draft_index = 0; + int prev_token = -1; + int prev_gen_token = -1; + std::vector concat_tokens = run.drafts[draft_id].prefix_tokens; + concat_tokens.insert(concat_tokens.end(), run.drafts[draft_id].tokens.begin(), + run.drafts[draft_id].tokens.end()); + + + LOG("Prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, run.drafts[draft_id].prefix_tokens).c_str()); + + LOG("Concat tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, concat_tokens).c_str()); + + + size_t index = run.prefix_n_past_tgt + draft_index; + LOG("Looping over run starting at gen index %zu, draft index %zu, prefix_n_past_tgt %d, n_past_tgt %d, generated size %zu\n", + index, draft_index, run.prefix_n_past_tgt, n_past_tgt, generated.size()); + while (index < generated.size() && draft_index < concat_tokens.size() && + generated.size() > (size_t) run.prefix_n_past_tgt) { + LOG("Checking draft at index %zu and generated index %zu\n", draft_index, index); + if (generated.at(index) != concat_tokens[draft_index]) { + LOG("Found non-matching prefix at generated index %zu, draft index %zu, gen token %d, draft token %d, prev draft token %d, prev gen token %d\n", + index, draft_index, generated.at(index), concat_tokens[draft_index], prev_token, + prev_gen_token); + correct_prefix = false; + break; + } + prev_token = concat_tokens[draft_index]; + prev_gen_token = generated[index]; + draft_index++; + index = run.prefix_n_past_tgt + draft_index; } - prev_token = concat_tokens[draft_index]; - prev_gen_token = generated[index]; - draft_index++; - index = run.prefix_n_past_tgt + draft_index; } } From 9606d3838d23369ff6012cf3908ffb78aa82fda2 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 15 Jan 2024 12:33:34 -0600 Subject: [PATCH 56/63] Add p_recovery and move dump_kv_cache_view_seqs out of common --- common/common.cpp | 52 +++++++---------------------------------------- common/common.h | 4 ++-- 2 files changed, 9 insertions(+), 47 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6d1ee6a0c83..055d8178ca8 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -448,6 +448,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.p_split = std::stof(argv[i]); + }else if (arg == "--p-recovery" || arg == "-pr") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.p_recovery = std::stof(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -854,6 +860,7 @@ 
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept); printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); + printf(" -pr N, --p-recovery N PipeInfer recovery probability (default: %.1f)\n", (double)params.p_recovery); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n"); @@ -1483,49 +1490,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } -void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { - static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - - printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); - - std::unordered_map seqs; - llama_kv_cache_view_cell * c_curr = view.cells; - llama_seq_id * cs_curr = view.cells_sequences; - - for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { - for (int j = 0; j < view.n_max_seq; j++) { - if (cs_curr[j] < 0) { continue; } - if (seqs.find(cs_curr[j]) == seqs.end()) { - if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } - seqs[cs_curr[j]] = seqs.size(); - } - } - if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } - } - printf("=== Sequence legend: "); - for (const auto & it : seqs) { - printf("%zu=%d, ", it.second, it.first); - } - printf("'+'=other sequence ids"); - - c_curr = view.cells; - cs_curr = view.cells_sequences; - for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { - if (i % row_size == 0) { - printf("\n%5d: ", i); - } - for (int j = 0; j < view.n_max_seq; j++) { - if (cs_curr[j] >= 0) { - const auto & it = seqs.find(cs_curr[j]); - putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+'); - } else { - putchar('.'); - } - } - putchar(' '); - } - - printf("\n=== Done dumping\n"); -} diff --git a/common/common.h b/common/common.h index 20572f5c70c..635a5e2269a 100644 --- a/common/common.h +++ b/common/common.h @@ -57,6 +57,7 @@ struct gpt_params { int32_t n_sequences = 1; // number of sequences to decode float p_accept = 0.5f; // speculative decoding accept probability float p_split = 0.1f; // speculative decoding split probability + float p_recovery = 0.0f; // Cumulative probability that p_accept and p_split are increased by per-iteration. int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors @@ -228,5 +229,4 @@ void dump_non_result_info_yaml( // Dump the KV cache view with the number of sequences per cell. void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); -// Dump the KV cache view showing individual sequences in each cell (long output). 
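// Sketch for illustration only (not part of this patch): one plausible way a
// recovery rate like p_recovery could be consumed, accumulating onto the
// speculative thresholds once per drafting iteration as the field's comment
// describes. Everything here except p_accept / p_split / p_recovery is an
// assumed, hypothetical name.
static inline float p_recover(float p_cur, float p_recovery) {
    const float p_next = p_cur + p_recovery;  // add the recovery amount each iteration
    return p_next > 1.0f ? 1.0f : p_next;     // clamp to a valid probability
}
// hypothetical per-iteration use inside the speculation loop:
//   p_accept_cur = p_recover(p_accept_cur, params.p_recovery);
//   p_split_cur  = p_recover(p_split_cur,  params.p_recovery);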
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); + From c6ac68020675123ce7dc38089362c9d26e1275fb Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 15 Jan 2024 12:35:41 -0600 Subject: [PATCH 57/63] Enforce message ordering with transactions, abort GGML compute if canceled --- ggml.c | 20 ++++- ggml.h | 2 +- llama.cpp | 253 +++++++++++++++++++++++++++++++++++++++++++----------- llama.h | 18 +++- 4 files changed, 234 insertions(+), 59 deletions(-) diff --git a/ggml.c b/ggml.c index 14a982e877d..ce495a7e2e8 100644 --- a/ggml.c +++ b/ggml.c @@ -15745,6 +15745,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { + if (cplan->abort_callback && cplan->abort_callback(state->ith, cplan->abort_callback_data)) { + break; + } GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; @@ -15777,9 +15780,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { break; } - if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - break; - } } atomic_store(&state->shared->n_active, n_threads); @@ -15798,16 +15798,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { node_n = atomic_load(&state->shared->node_n); if (node_n != last) break; + if (cplan->abort_callback && cplan->abort_callback(state->ith, cplan->abort_callback_data)) { + break; + } }; } // check if we should stop if (node_n >= cgraph->n_nodes) break; + if (cplan->abort_callback && cplan->abort_callback(state->ith, cplan->abort_callback_data)) { + break; + } /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = ggml_get_n_tasks(node, n_threads); + struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, @@ -15819,6 +15826,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (state->ith < n_tasks) { ggml_compute_forward(¶ms, node); } + + + } return GGML_EXIT_SUCCESS; @@ -16036,6 +16046,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + if (cplan->abort_callback && cplan->abort_callback(0, cplan->abort_callback_data)) { + return GGML_EXIT_SUCCESS; + } + // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { diff --git a/ggml.h b/ggml.h index f2fce0f22d3..e39cdd8a859 100644 --- a/ggml.h +++ b/ggml.h @@ -531,7 +531,7 @@ extern "C" { int n_threads; // abort ggml_graph_compute when true - bool (*abort_callback)(void * data); + bool (*abort_callback)(int ith, void * data); void * abort_callback_data; }; diff --git a/llama.cpp b/llama.cpp index 9711276a25a..bfac601f5f7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -18,6 +18,8 @@ #endif #ifdef GGML_USE_MPI # include "ggml-mpi.h" +#include "common/log.h" + #endif #ifndef QK_K # ifdef GGML_QKK_64 @@ -77,6 +79,7 @@ #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -663,16 +666,16 @@ static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) { // ggml helpers // -static std::function abort_callback_function; +static std::function abort_callback_function; -static bool ab_callback(void * data) { +static bool ab_callback(int ithread, void * data) { if (abort_callback_function != nullptr) { - return abort_callback_function(data); + return 
abort_callback_function(ithread, data); } return false; } -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads, std::function callback = nullptr, void * abort_data = nullptr) { +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads, std::function callback = nullptr, void * abort_data = nullptr) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); abort_callback_function = std::move(callback); @@ -5207,10 +5210,25 @@ static struct ggml_cgraph * llama_build_graph( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; + for (int i = 0; i < n_kv; ++i) { - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + + const int n_seq_id = batch.n_seq_id[j]; + + bool has_seq_id = false; + + for (int seq_index = 0; seq_index < n_seq_id; seq_index++) { + llama_seq_id seq_id = batch.seq_id[j][seq_index]; +// printf("Seq id %d in index %d, n_seq_id %d\n", seq_id, seq_index, n_seq_id); + + has_seq_id = lctx.kv_self.cells[i].has_seq_id(seq_id); + if (has_seq_id) { + break; + } + } + + if (!has_seq_id || lctx.kv_self.cells[i].pos > pos) { data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; } } @@ -5446,7 +5464,13 @@ static struct ggml_cgraph * llama_decode_internal_phased( uint8_t phase, ggml_cgraph * cgraph) { if (phase == 0) { - ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.n_tokens, 1, 0); + if (ggml_mpi_rank(lctx.ctx_mpi) == 0 && ggml_mpi_size(lctx.ctx_mpi) > 1) { + int transaction_type = GGML_MPI_DECODE; + ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } + ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.batch_id, 1, GGML_MPI_BATCH_ID); + + ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.n_tokens, 1, GGML_MPI_N_TOKENS); } uint32_t n_tokens = batch.n_tokens; if (n_tokens == 0) { @@ -5627,13 +5651,14 @@ static struct ggml_cgraph * llama_decode_internal_phased( ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); } #else - auto abort_callback = [&lctx, &batch](void * data) -> bool { - if (data != nullptr && *((bool*)data)) { + auto abort_callback = [&lctx, &batch](int ithread, void * data) -> bool { + if (data != nullptr && *((std::atomic_bool*)data)) { +// printf("\nAborting because already have aborted\n"); return true; } - if (ggml_mpi_iprobe(lctx.ctx_mpi, ggml_mpi_next_node(lctx.ctx_mpi), GGML_MPI_CANCEL_RUN)) { + if (ithread == 0 && ggml_mpi_iprobe(lctx.ctx_mpi, ggml_mpi_next_node(lctx.ctx_mpi), GGML_MPI_CANCEL_RUN)) { int count = ggml_mpi_status_count_int32(lctx.ctx_mpi); -// printf("Received async cancel run\n"); +// printf("\nReceived async cancel run, count of %d\n", count); { std::vector canceled(count, -1); llama_cancel_run(&lctx, canceled.data(), canceled.size()); @@ -5642,7 +5667,7 @@ static struct ggml_cgraph * llama_decode_internal_phased( auto it = lctx.canceled_batches.find(batch.batch_id); if (it != lctx.canceled_batches.end() && lctx.canceled_batches[batch.batch_id]) { if (data != nullptr) { - *((bool *) data) = true; + *((std::atomic_bool *) data) = true; } return true; } @@ -5650,30 +5675,31 @@ static struct ggml_cgraph * llama_decode_internal_phased( return false; }; - bool * aborted = static_cast(malloc(sizeof(bool))); - *aborted = false; + auto* aborted = new std::atomic_bool(false); ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads, abort_callback, aborted); - free(aborted); + delete aborted; #endif 
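// Standalone sketch (not part of the patch) of the thread-aware abort
// callback contract introduced above: the callback now receives the compute
// thread index, so only thread 0 performs the potentially expensive check
// (an MPI probe for GGML_MPI_CANCEL_RUN in the real code, a plain flag here),
// and the verdict is published through an atomic that the other threads see
// on their next poll. g_cancel_requested is a stand-in for the real trigger.
#include <atomic>
static std::atomic_bool g_cancel_requested{false};
static bool example_abort_cb(int ith, void * data) {
    auto * aborted = static_cast<std::atomic_bool *>(data);
    if (aborted->load()) {
        return true;                  // another thread already observed the abort
    }
    if (ith == 0 && g_cancel_requested.load()) {
        aborted->store(true);         // publish the abort to the remaining threads
        return true;
    }
    return false;
}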
// update the kv ring buffer - { - if (kv_self.has_shift) { - kv_self.has_shift = false; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; - } - } - kv_self.head += n_tokens; +#if GGML_USE_MPI + } - // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; + { + if (kv_self.has_shift) { + kv_self.has_shift = false; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; } } -#if GGML_USE_MPI + + kv_self.head += n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } } ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf); #endif @@ -5707,7 +5733,14 @@ static struct ggml_cgraph * llama_decode_internal_phased( #ifdef GGML_USE_MPI if (ggml_mpi_size(lctx.ctx_mpi) > 1 && ggml_mpi_rank(lctx.ctx_mpi) == 0) { - ggml_mpi_recv_float_array(lctx.ctx_mpi, logits_out.data(), (batch.logits || lctx.logits_all) ? n_vocab * n_tokens : n_vocab, ggml_mpi_size(lctx.ctx_mpi) - 1, GGML_MPI_SYNC_LOGITS); + // TODO print logits array for comparison + ggml_mpi_recv_float_array(lctx.ctx_mpi, logits_out.data(), n_vocab * n_tokens, ggml_mpi_size(lctx.ctx_mpi) - 1, GGML_MPI_SYNC_LOGITS); +// printf("\nReceived %zu logits, logits_out.size = %zu\n", n_vocab * n_tokens, logits_out.size()); +// printf("batch: %s\n", LOG_BATCH_TOSTR_PRETTY(&lctx, batch).c_str()); +// for (auto logit : logits_out) { +// printf("%f, ", logit); +// } +// printf("]\n"); } if (ggml_mpi_rank(lctx.ctx_mpi) == ggml_mpi_size(lctx.ctx_mpi) - 1) { @@ -5726,18 +5759,20 @@ static struct ggml_cgraph * llama_decode_internal_phased( // TODO: do not compute and extract logits if only embeddings are needed // need to update the graphs to skip "result_output" { - if (batch.logits) { - for (uint32_t i = 0; i < n_tokens; i++) { - if (batch.logits[i] == 0) { - continue; - } - memcpy(logits_out.data() + (n_vocab*i), net_output + (n_vocab*i), sizeof(float)*n_vocab); - } - } else if (lctx.logits_all) { - memcpy(logits_out.data(), net_output, sizeof(float)*n_vocab*n_tokens); - } else { - memcpy(logits_out.data(), net_output + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); - } + memcpy(logits_out.data(), net_output, sizeof(float)*n_vocab*n_tokens); + +// if (batch.logits) { +// for (uint32_t i = 0; i < n_tokens; i++) { +// if (batch.logits[i] == 0) { +// continue; +// } +// memcpy(logits_out.data() + (n_vocab*i), net_output + (n_vocab*i), sizeof(float)*n_vocab); +// } +// } else if (lctx.logits_all) { +// memcpy(logits_out.data(), net_output, sizeof(float)*n_vocab*n_tokens); +// } else { +// memcpy(logits_out.data(), net_output + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); +// } } // extract embeddings @@ -5751,7 +5786,11 @@ static struct ggml_cgraph * llama_decode_internal_phased( #ifdef GGML_USE_MPI } if (ggml_mpi_size(lctx.ctx_mpi) > 1 && ggml_mpi_rank(lctx.ctx_mpi) == ggml_mpi_size(lctx.ctx_mpi) - 1) { - ggml_mpi_send_float_array_async(lctx.ctx_mpi, logits_out.data(), (batch.logits || lctx.logits_all) ? 
n_vocab * n_tokens : n_vocab, 0, GGML_MPI_SYNC_LOGITS); +// printf("\nSent %zu logits, logits_out.size = %zu\nbatch: %s\n", n_vocab * n_tokens, logits_out.size(), LOG_BATCH_TOSTR_PRETTY(&lctx, batch).c_str()); + ggml_mpi_send_float_array_async(lctx.ctx_mpi, logits_out.data(), n_vocab * n_tokens, 0, GGML_MPI_SYNC_LOGITS); +// llama_kv_cache_view view = llama_kv_cache_view_init(&lctx, 21); +// llama_kv_cache_view_update(&lctx, &view); +// printf("Cache view:\n%s\n", dump_kv_cache_view_seqs(view, 1).c_str()); } #endif @@ -5800,14 +5839,14 @@ static int llama_decode_internal( struct ggml_cgraph * llama_start_async_decode( llama_context & lctx, - llama_batch & batch) { + llama_batch batch) { return llama_decode_internal_phased(lctx, batch, 0, nullptr); } int llama_finish_async_decode( struct llama_context & lctx, - struct llama_batch & batch, + struct llama_batch batch, struct ggml_cgraph * cgraph) { int ret; @@ -8891,6 +8930,7 @@ void llama_split_layers_weighted(struct llama_context * ctx, float device_weight void llama_free(struct llama_context * ctx) { #ifdef GGML_USE_MPI ggml_mpi_free(ctx->ctx_mpi); + ggml_mpi_free(ctx->ctx_mpi_orig); #endif delete ctx; } @@ -9045,6 +9085,68 @@ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { } } +std::string dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { + static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + std::stringstream dumped; + + dumped << "=== Dumping KV cache. total cells " + << view.n_cells + << ", max sequences per cell " + << view.n_max_seq + <<", populated cells " + << view.used_cells + << ", total tokens in cache " + << view.token_count + << ", largest empty slot=" + << view.max_contiguous + << "@ " + << view.max_contiguous_idx + << '\n'; + + std::unordered_map seqs; + llama_kv_cache_view_cell * c_curr = view.cells; + llama_seq_id * cs_curr = view.cells_sequences; + + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j] < 0) { continue; } + if (seqs.find(cs_curr[j]) == seqs.end()) { + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } + seqs[cs_curr[j]] = cs_curr[j]; + } + } + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } + } + + dumped << "=== Sequence legend: "; + for (const auto & it : seqs) { + dumped << slot_chars[it.second] << "=" << it.first << ", "; + } + dumped << "'+'=other sequence ids"; + + c_curr = view.cells; + cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + if (i % row_size == 0) { + dumped << "\n" << i << ": "; + } + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j] >= 0) { + const auto & it = seqs.find(cs_curr[j]); + dumped << ((it != seqs.end()) ? 
slot_chars[it->second] : '+'); + } else { + dumped << '.'; + } + } + dumped << " (" << c_curr->pos << ") "; + //putchar(' '); + } + + dumped << "\n=== Done dumping\n"; + return dumped.str(); +} + void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) { if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) { view->n_cells = int32_t(ctx->kv_self.size); @@ -9132,11 +9234,18 @@ void llama_kv_cache_clear(struct llama_context * ctx) { void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { #ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) > 1) { + int transaction_type = GGML_MPI_KV_SEQ_RM; + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } int32_t vals[3] = {seq_id, p0, p1}; - ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 3, 2); + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 3, GGML_MPI_KV_SEQ_RM); seq_id = vals[0]; p0 = vals[1]; p1 = vals[2]; +// if (ggml_mpi_rank(ctx->ctx_mpi) == ggml_mpi_size(ctx->ctx_mpi) - 1 && ggml_mpi_size(ctx->ctx_mpi) > 1) { +// printf("\nRemoving sequence %d from %d to %d\n", seq_id, p0, p1); +// } #endif llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); } @@ -9162,16 +9271,24 @@ void llama_kv_cache_seq_cp_sync_bi(struct llama_context * ctx, llama_seq_id seq_ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { #ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) > 1) { + int transaction_type = GGML_MPI_KV_SEQ_CP; + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } + int32_t vals[4] = {seq_id_src, seq_id_dst, p0, p1}; ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_CP); if(ggml_mpi_recv_trans_id(ctx->ctx_mpi) < ggml_mpi_trans_id(ctx->ctx_mpi)) { - return; +// return; } ggml_mpi_inc_trans_id(ctx->ctx_mpi); seq_id_src = vals[0]; seq_id_dst = vals[1]; p0 = vals[2]; p1 = vals[3]; +// if (ggml_mpi_rank(ctx->ctx_mpi) == ggml_mpi_size(ctx->ctx_mpi) - 1 && ggml_mpi_size(ctx->ctx_mpi) > 1) { +// printf("\nCopying sequence %d to sequence %d from %d to %d\n", seq_id_src, seq_id_dst, p0, p1); +// } #endif if (seq_id_src == seq_id_dst) { return; @@ -9185,13 +9302,18 @@ void llama_kv_cache_seq_cp_back(struct llama_context * ctx, llama_seq_id seq_id_ int32_t vals[4] = {seq_id_src, seq_id_dst, p0, p1}; ggml_mpi_sync_ints_pipelined_back(ctx->ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_CP_BACK); if(ggml_mpi_recv_trans_id(ctx->ctx_mpi) < ggml_mpi_trans_id(ctx->ctx_mpi)) { - return; +// return; } ggml_mpi_inc_trans_id(ctx->ctx_mpi); seq_id_src = vals[0]; seq_id_dst = vals[1]; p0 = vals[2]; p1 = vals[3]; + +// if (ggml_mpi_rank(ctx->ctx_mpi) == ggml_mpi_size(ctx->ctx_mpi) - 1 && ggml_mpi_size(ctx->ctx_mpi) > 1) { +// printf("\nCopying sequence %d to sequence %d from %d to %d\n", seq_id_src, seq_id_dst, p0, p1); +// } + #endif if (seq_id_src == seq_id_dst) { return; @@ -9756,12 +9878,14 @@ void llama_batch_free(struct llama_batch batch) { #ifdef GGML_USE_MPI -int llama_process_mpi_worker( +int llama_process_mpi_transaction( struct llama_context * ctx, - struct llama_batch batch) { - ggml_mpi_probe(ctx->ctx_mpi, -1, -1); - int tag = ggml_mpi_status_tag(ctx->ctx_mpi); - int32_t count; + struct llama_batch & batch, + int tag) { +// if (ggml_mpi_rank(ctx->ctx_mpi) == ggml_mpi_size(ctx->ctx_mpi) - 1) { +// 
printf("\nBeginning transaction type %d\n", tag); +// } + switch (tag) { case GGML_MPI_DECODE: return llama_decode_internal(*ctx, batch); @@ -9784,6 +9908,30 @@ int llama_process_mpi_worker( case GGML_MPI_KV_SEQ_SHIFT: llama_kv_cache_seq_shift(ctx, 0, 0, 0, 0); break; + default: + printf("Unknown operation, exiting\n"); + exit(1); + break; + } + return 0; +} + +int llama_process_mpi_worker( + struct llama_context * ctx, + struct llama_batch & batch) { + ggml_mpi_probe(ctx->ctx_mpi, -1, -1); + int tag = ggml_mpi_status_tag(ctx->ctx_mpi); + int32_t count; + int32_t trans_type; +// if (ggml_mpi_rank(ctx->ctx_mpi) == ggml_mpi_size(ctx->ctx_mpi) - 1) { +// printf("\nReceived command %d\n", tag); +// } + switch (tag) { + case GGML_MPI_BEGIN_TRANSACTION: + + ggml_mpi_sync_ints_pipelined(ctx->ctx_mpi, &trans_type, 1, GGML_MPI_BEGIN_TRANSACTION); + return llama_process_mpi_transaction(ctx, batch, trans_type); + break; case GGML_MPI_SHUTDOWN: llama_free(ctx); llama_backend_free(); @@ -9816,7 +9964,8 @@ void llama_cancel_run(struct llama_context * ctx, int32_t * canceled, int count) if (it != ctx->canceled_batches.end()) { it->second = true; } else { - ctx->canceled_batches[run_id] = true; + ctx->canceled_batches.insert({run_id, true}); +// ctx->canceled_batches[run_id] = true; } } } diff --git a/llama.h b/llama.h index 2ef3e2d3c77..77112853113 100644 --- a/llama.h +++ b/llama.h @@ -12,6 +12,8 @@ #include #include #include +#include + #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef LLAMA_BUILD @@ -280,10 +282,10 @@ extern "C" { LLAMA_API void llama_sync_token(struct llama_context * ctx, llama_token * token, int root); LLAMA_API struct ggml_cgraph * llama_start_async_decode(struct llama_context & lctx, - struct llama_batch & batch); + struct llama_batch batch); LLAMA_API int llama_finish_async_decode(struct llama_context & lctx, - struct llama_batch & batch, + struct llama_batch batch, struct ggml_cgraph * cgraph); LLAMA_API void llama_sync_token_data(struct llama_context * ctx, llama_token_data * data, int root); @@ -473,6 +475,13 @@ LLAMA_API void llama_kv_cache_seq_cp_back( llama_pos p0, llama_pos p1); +LLAMA_API void llama_kv_cache_seq_cp_sync_bi( + struct llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); + // Removes all tokens that do not belong to the specified sequence LLAMA_API void llama_kv_cache_seq_keep( struct llama_context * ctx, @@ -657,7 +666,10 @@ LLAMA_API void llama_kv_cache_seq_cp_back( llama_token token, char * buf, int length); - +extern "C++" { +// Dump the KV cache view showing individual sequences in each cell (long output). 
+std::string dump_kv_cache_view_seqs(const llama_kv_cache_view &view, int row_size = 40); +} // // Grammar // From a76859e68bf533f75cf69b3291f458c37ace731e Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 15 Jan 2024 12:37:28 -0600 Subject: [PATCH 58/63] Kinda fix no shutdown issue, add more tag definitions --- ggml-mpi.c | 35 ++++++++++++++++++++++------------- ggml-mpi.h | 14 ++++++++++++++ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/ggml-mpi.c b/ggml-mpi.c index 1c8c4579fb5..11178a93dc7 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -74,8 +74,9 @@ struct ggml_mpi_context * ggml_mpi_init(void) { ctx->asyncRecvWaiting = false; ctx->running_decode = false; ctx->async = false; - ctx->send_buffer = calloc(1, 128*1024*1024); // 128MB buffer - MPI_Buffer_attach(ctx->send_buffer, 4096*1024*32); + const int buffer_size = 128*1024*1024; + ctx->send_buffer = calloc(1, buffer_size); // 128MB buffer + MPI_Buffer_attach(ctx->send_buffer, buffer_size); return ctx; } @@ -105,7 +106,9 @@ void ggml_mpi_free(struct ggml_mpi_context * ctx) { return; } - ggml_mpi_sync_pipelined(ctx, NULL, 0, MPI_INT8_T, 6); + ggml_mpi_sync_pipelined(ctx, NULL, 0, MPI_INT8_T, GGML_MPI_SHUTDOWN); + int buffer_size = 128*1024*1024; + MPI_Buffer_detach(ctx->send_buffer, &buffer_size); MPI_Comm_free(&(ctx->comm)); free(ctx); } @@ -246,10 +249,10 @@ bool ggml_mpi_eval_init( int32_t old_n_tokens = *n_tokens; - ggml_mpi_sync_pipelined(ctx_mpi, batch_id, 1, MPI_INT, 0); + ggml_mpi_sync_pipelined(ctx_mpi, batch_id, 1, MPI_INT, GGML_MPI_BATCH_ID); - ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, 0); + ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, GGML_MPI_N_TOKENS); // For now, we assume that the pos, seq_ids, tokens, etc have been @@ -260,10 +263,10 @@ bool ggml_mpi_eval_init( // *tokens = realloc(*tokens, *n_tokens * sizeof(int32_t )); //} - ggml_mpi_sync_pipelined(ctx_mpi, *tokens, *n_tokens, MPI_INT32_T, 0); + ggml_mpi_sync_pipelined(ctx_mpi, *tokens, *n_tokens, MPI_INT32_T, GGML_MPI_TOKENS); - ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, 0); + ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, GGML_MPI_N_SEQ_IDS); // We need to know the total number of sequence // ids, so we count them all up @@ -290,8 +293,8 @@ bool ggml_mpi_eval_init( - ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, 0); - ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0); + ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, GGML_MPI_POS); + ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, GGML_MPI_SEQ_IDS); current_index = 0; for (int32_t i = 0; i < *n_tokens; i++) { @@ -326,10 +329,10 @@ void ggml_mpi_sync_ints_pipelined_back( int tag ) { ggml_mpi_sync_pipelined_back(ctx_mpi, vals, count, MPI_INT32_T, tag); - int old_trans = ctx_mpi->trans_id; - ggml_mpi_sync_pipelined_back(ctx_mpi, &ctx_mpi->trans_id, 1, MPI_INT32_T, GGML_MPI_TRANS_ID); - ctx_mpi->recv_trans_id = ctx_mpi->trans_id; - ctx_mpi->trans_id = old_trans; +// int old_trans = ctx_mpi->trans_id; +// ggml_mpi_sync_pipelined_back(ctx_mpi, &ctx_mpi->trans_id, 1, MPI_INT32_T, GGML_MPI_TRANS_ID); +// ctx_mpi->recv_trans_id = ctx_mpi->trans_id; +// ctx_mpi->trans_id = old_trans; } void ggml_mpi_synch_int( @@ -412,6 +415,8 @@ static void ggml_mpi_tensor_send(struct ggml_mpi_context * ctx_mpi, struct ggml_ if(ctx_mpi->comm == MPI_COMM_NULL) { return; } + +// printf("\nSending tensor of size %zu from node %d to node %d", 
ggml_nelements(t), ctx_mpi->rank, mpi_rank_dst); // printf("Rank %d tensor send\n", ctx_mpi->rank); MPI_Datatype mpi_type; @@ -436,6 +441,10 @@ static void ggml_mpi_tensor_recv(struct ggml_mpi_context * ctx_mpi, struct ggml_ case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break; default: GGML_ASSERT(false && "not implemented"); } + +// printf("\nReceiving tensor of size %zu, at node %d, from node %d", ggml_nelements(t), ctx_mpi->rank, mpi_rank_src); + + const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, GGML_MPI_TRANSFER_TENSORS, ctx_mpi->comm, MPI_STATUS_IGNORE); GGML_ASSERT(retval == MPI_SUCCESS); } diff --git a/ggml-mpi.h b/ggml-mpi.h index f2a4cce85cb..1b8c3d091ee 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -35,6 +35,20 @@ extern "C" { #define GGML_MPI_TRANS_ID 11 +#define GGML_MPI_BATCH_ID 12 + +#define GGML_MPI_N_TOKENS 13 + +#define GGML_MPI_TOKENS 14 + +#define GGML_MPI_N_SEQ_IDS 15 + +#define GGML_MPI_SEQ_IDS 16 + +#define GGML_MPI_POS 17 + +#define GGML_MPI_BEGIN_TRANSACTION 18 + /** * The context used for MPI operations, * a program may make use of more than one From 68303130d2291b41b7b4a3904fd43b5226a7a1a8 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 15 Jan 2024 12:39:11 -0600 Subject: [PATCH 59/63] Refactor speculation for better readability, check for cancellations and run continuous speculation if not waiting --- examples/speculative/speculative.cpp | 1168 +++++++++++++------------- 1 file changed, 606 insertions(+), 562 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 91d5a025b1e..4f7a3162a33 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -30,7 +30,6 @@ struct seq_async_run { struct ggml_cgraph * cgraph; llama_batch batch; std::vector drafts; - int run_id; int n_past_tgt; int prefix_n_past_tgt; int n_past_dft; @@ -38,18 +37,45 @@ struct seq_async_run { int s_keep; int seq_offset; int n_past_max; - llama_sampling_context *ctx_sampling; bool speculative; bool canceled; }; + void check_for_cancel(llama_context *ctx_tgt, int n_past_tgt, std::deque &tgt_cgraphs, - std::vector &generated, const int n_seq_dft); + std::vector &generated, int n_seq_dft); + +void begin_async_run(const llama_sampling_params& sparams, int n_seq_dft, + llama_context *ctx_tgt, int max_seq, + int n_past_dft, const std::vector &drafts, + std::deque &tgt_cgraphs, + int32_t &batch_id, int &n_past, llama_kv_cache_view &kvc_view, + bool is_spec, llama_batch batch, int n_past_max, int prefix_n_past, int seq_offset); + +bool start_async_spec_run(const gpt_params ¶ms, llama_context *ctx_tgt, llama_context *ctx_dft, + std::deque &free_sequence_offsets, int max_seq, llama_batch &batch_tgt, int n_predict, + int prefix_n_past, int n_past_dft, bool has_eos, llama_sampling_context *ctx_sampling, + std::deque &tgt_cgraphs, const seq_async_run ¤t_run, + int &spec_past_tgt, int &spec_past_dft, int first_run, int orig_offset, int32_t &batch_id, + llama_batch &batch_dft, int &n_drafted, std::vector &drafts, llama_token &id, + llama_kv_cache_view &kvc, int iter); + +void begin_non_spec_run(const gpt_params ¶ms, int n_seq_dft, llama_context *ctx, int max_seq, + const std::vector &drafts, llama_token id, int32_t &batch_id, int &n_past, int n_past_dft, + std::deque &dft_cgraphs, llama_kv_cache_view &kvc_view); + +void +run_speculation_loop(const gpt_params ¶ms, const float p_accept, llama_context *ctx_tgt, llama_context *ctx_dft, + int max_seq, llama_batch &batch_tgt, int n_predict, int 
n_past_tgt, int n_past_dft, + bool has_eos, llama_sampling_context *ctx_sampling, int & spec_past_tgt, int & spec_past_dft, + bool & first_run, std::deque &free_sequence_offsets, int32_t &batch_id, llama_batch &batch_dft, + int &n_drafted, std::vector &drafts, std::deque &tgt_cgraphs, + seq_async_run ¤t_run, llama_kv_cache_view &kvc_view_dft, llama_token &id); int main(int argc, char ** argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { return 1; } @@ -109,7 +135,7 @@ int main(int argc, char ** argv) { llama_split_layers_weighted(ctx_dft, params.mpi_layer_split[1].data(), params.mpi_layer_split[1].size()); std::deque free_sequence_offsets; - const int n_simul_seqs = 100; + const int n_simul_seqs = 1000; const int max_seq = n_simul_seqs * n_seq_dft + 1; for (int i = 0; i < n_simul_seqs; i++) { free_sequence_offsets.push_back(i*n_seq_dft + 1); @@ -190,20 +216,20 @@ int main(int argc, char ** argv) { int32_t batch_id = 0; - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, max_seq); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, max_seq); - llama_batch batch_tgt_async = llama_batch_init(params.n_ctx, 0, max_seq); + llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, max_seq+1); + llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, max_seq+1); + llama_batch batch_tgt_async = llama_batch_init(params.n_ctx, 0, max_seq+1); batch_dft.batch_id = batch_id; batch_tgt.batch_id = batch_id; batch_tgt_async.batch_id = batch_id; std::vector seq_ids; - for (int i = 0; i < max_seq; i++) { + for (int i = 0; i <= max_seq; i++) { seq_ids.emplace_back(i); } - for (size_t i = 0; i < inp.size()-1; i++) { + for (int i = 0; i < n_input-1; i++) { llama_batch_add(batch_dft, inp[i], i, seq_ids, true); llama_batch_add(batch_tgt, inp[i], i, seq_ids, true); } @@ -228,9 +254,9 @@ int main(int argc, char ** argv) { int n_drafted = 0; int n_accept = 0; - const int ASYNC_RUN_ID = n_seq_dft+1; - int n_past_tgt = inp.size(); - int n_past_dft = inp.size(); + + int n_past_tgt = n_input; + int n_past_dft = n_input; // used to determine end of generation bool has_eos = false; @@ -251,7 +277,7 @@ int main(int argc, char ** argv) { - std::deque dft_cgraphs; + std::deque dft_cgraphs; std::deque tgt_cgraphs; const auto t_dec_start = ggml_time_us(); @@ -260,45 +286,55 @@ int main(int argc, char ** argv) { drafts[0].i_batch_tgt.resize(1); drafts[0].i_batch_tgt[0] = 0; - int run_id = 0; - int offset = 1; - int run_n_past_tgt = n_past_tgt-1; - int run_max_n_past = n_past_tgt; - int run_n_past_dft = n_past_dft; - int seq_offset = free_sequence_offsets.front(); - free_sequence_offsets.pop_front(); - struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx_tgt, max_seq); - struct llama_kv_cache_view kvc_view_dft = llama_kv_cache_view_init(ctx_dft, max_seq); + seq_async_run current_run; + + current_run.n_past_tgt = n_past_tgt - 1; + current_run.n_past_max = n_past_tgt; + current_run.n_past_dft = n_past_dft - 1; + current_run.seq_offset = free_sequence_offsets.front(); + struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx_tgt, max_seq+1); + struct llama_kv_cache_view kvc_view_dft = llama_kv_cache_view_init(ctx_dft, max_seq+1); std::vector generated = inp; - bool run_speculative = false; + + int spec_past_tgt = n_past_tgt; + int spec_past_dft = n_past_dft; + + bool first_run = true; + llama_token id; while (true) { int i_dft = 0; int s_keep = 0; - check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated, 
n_seq_dft); - if (!tgt_cgraphs.empty()) { + bool is_waiting = llama_mpi_iprobe(ctx_tgt); + llama_swap_comm(ctx_tgt); + llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); + llama_swap_comm(ctx_tgt); + + if (!tgt_cgraphs.empty() && is_waiting) { + check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated, n_seq_dft); + struct seq_async_run run = tgt_cgraphs.back(); - LOG("Finishing async decode, is async = %d, old seq_offset = %d, new seq offset = %d, batch id = %d\n", run.run_id == ASYNC_RUN_ID, seq_offset, run.seq_offset, run.batch.batch_id); + LOG("Finishing async decode, is spec = %d, old seq_offset = %d, new seq offset = %d, batch id = %d\n", run.speculative, current_run.seq_offset, run.seq_offset, run.batch.batch_id); struct ggml_cgraph * cgraph = run.cgraph; - run_id = run.run_id; - drafts = run.drafts; - run_speculative = run.speculative; - run_max_n_past = run.n_past_max; -// ctx_sampling = run.ctx_sampling; - run_n_past_tgt = run.n_past_tgt; - run_n_past_dft = run.n_past_dft; -// n_past_dft = run.n_past_dft; - seq_offset = run.seq_offset; - LOG("Checking run, last generated: %d, first draft: %d\n", generated.back(), drafts[s_keep].tokens[0]); -// if(run.n_past_max >= n_past_tgt && (!run_speculative || (n_past_tgt-run_n_past_tgt >= 0 && generated.at(generated.size() - (n_past_tgt-run_n_past_tgt+1)) == drafts[s_keep].tokens[0]))) { + LOG("Checking run, last generated: %d, first draft: %d\n", generated.back(), run.drafts[run.s_keep].tokens[0]); +// if(run.n_past_max >= n_past_tgt && (!run_speculative || (n_past_tgt-current_run.n_past_tgt >= 0 && generated.at(generated.size() - (n_past_tgt-current_run.n_past_tgt+1)) == drafts[s_keep].tokens[0]))) { if(!run.canceled) { + + drafts = run.drafts; + current_run.speculative = run.speculative; + current_run.n_past_max = run.n_past_max; + current_run.n_past_tgt = run.n_past_tgt; + current_run.n_past_dft = run.n_past_dft; + current_run.seq_offset = run.seq_offset; + s_keep = run.s_keep; + //drafts[0].tokens.erase(drafts[0].tokens.begin()); for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { @@ -310,31 +346,60 @@ int main(int argc, char ** argv) { } } else { -// if (run_id != ASYNC_RUN_ID) { -// LOG("Cancelling run with ID %d, batch ID %d, run_npast_max %d, n_past_tgt %d, run_speculative %d, tokens[0] %d\n", run.run_id, run.batch.batch_id, run.n_past_max, n_past_tgt, run_speculative, drafts[s_keep].tokens[0]); -// llama_cancel_run(ctx_tgt, &run.batch.batch_id, 1); +// if (llama_node_id(ctx_tgt) == 0) { +// printf("\nFinishing canceled async run, spec: %d, batch id: %d, batch: %s\n", run.speculative, run.batch.batch_id, LOG_BATCH_TOSTR_PRETTY(ctx_tgt, run.batch).c_str()); // } +// FIXME Main bottleneck because when finishing a canceled run, we're forced to wait until a correct run +// is finished instead of jumping back to speculation llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); tgt_cgraphs.pop_back(); - if (run_speculative) { - free_sequence_offsets.push_back(seq_offset); + if (run.speculative) { +// if(llama_node_id(ctx_tgt) == 0) { +// fprintf(stderr, "\nRun was canceled, pushing seq offset %d to free seq offsets\n", +// run.seq_offset); +// fflush(stderr); +// } + free_sequence_offsets.push_back(run.seq_offset); +// if(llama_node_id(ctx_tgt) == 0) { +// +// fprintf(stderr, "\nDone pushing seq offset %d to free seq offsets\n", run.seq_offset); +// fflush(stderr); +// } } // fprintf(stderr, "Incorrect starting token\n"); continue; } - +// if (llama_node_id(ctx_tgt) == 0) { +// printf("\nFinishing async run, 
spec: %d, batch id: %d, batch: %s\n", run.speculative, run.batch.batch_id, LOG_BATCH_TOSTR_PRETTY(ctx_tgt, run.batch).c_str()); +// } llama_finish_async_decode(*ctx_tgt, run.batch, cgraph); tgt_cgraphs.pop_back(); + spec_past_tgt = n_past_tgt; + spec_past_dft = n_past_dft; + + first_run = true; + + } else if (!tgt_cgraphs.empty()) { + run_speculation_loop(params, p_accept, ctx_tgt, ctx_dft, max_seq, batch_tgt, n_predict, n_past_tgt, n_past_dft, + has_eos, ctx_sampling, + spec_past_tgt, spec_past_dft, first_run, free_sequence_offsets, batch_id, batch_dft, + n_drafted, drafts, tgt_cgraphs, current_run, kvc_view_dft, id); + continue; } -// if (llama_node_id(ctx_tgt) == 0) { + + + if (llama_node_id(ctx_tgt) == 0) { // llama_kv_cache_view_update(ctx_tgt, &kvc_view); -// dump_kv_cache_view_seqs(kvc_view, 20); -//// dump_kv_cache_view(kvc_view, 20); -// printf("n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past); -// } +// LOG("Beginning sampling, tgt cache layout:\n%s", dump_kv_cache_view_seqs(kvc_view, 1).c_str()); + LOG("n_past_tgt: %d, current_run.n_past_tgt: %d, current_run.n_past_max: %d\n", n_past_tgt, current_run.n_past_tgt, current_run.n_past_max); + } else { +// llama_kv_cache_view_update(ctx_dft, &kvc_view_dft); +// LOG("Beginning sampling, dft cache layout:\n%s", dump_kv_cache_view_seqs(kvc_view_dft, 1).c_str()); + LOG("n_past_dft: %d, current_run.n_past_dft: %d, current_run.n_past_max: %d\n", n_past_dft, current_run.n_past_dft, current_run.n_past_max); + } // print current draft sequences bool any_active = false; for (int s = 0; s < n_seq_dft; ++s) { @@ -351,7 +416,7 @@ int main(int argc, char ** argv) { bool any_match = false; - llama_token id; + std::string token_str; @@ -359,14 +424,16 @@ int main(int argc, char ** argv) { int old_n_past_dft = n_past_dft; - std::vector keeps = seq_ids; + std::deque keeps(seq_ids.begin(), seq_ids.end()); + keeps.erase(std::find(keeps.begin(), keeps.end(),s_keep)); + keeps.push_front(s_keep); while (!keeps.empty()) { - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d, run_n_past_tgt = %3d, n_past_tgt = %3d, seq_offset = %d, keeps[0] = %d\n", s_keep, i_dft, drafts[keeps[0]].i_batch_tgt[i_dft], run_n_past_tgt, n_past_tgt, seq_offset, keeps[0]); + LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d, current_run.n_past_tgt = %3d, n_past_tgt = %3d, seq_offset = %d, keeps[0] = %d\n", s_keep, i_dft, drafts[keeps[0]].i_batch_tgt[i_dft], current_run.n_past_tgt, n_past_tgt, current_run.seq_offset, keeps[0]); // sample from the target model - id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[keeps[0]].i_batch_tgt[i_dft]); + id = llama_sampling_sample(ctx_sampling, ctx_tgt, nullptr, drafts[keeps[0]].i_batch_tgt[i_dft]); token_str = llama_token_to_piece(ctx_tgt, id); // Swap to pipeline roots llama_swap_comm(ctx_tgt); @@ -380,20 +447,28 @@ int main(int argc, char ** argv) { LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); - LOG("Sampled token: %d ('%s'), n_past_tgt: %d, run_n_past_tgt + i_dft: %d, drafts[keeps[0]].i_batch_tgt[i_dft]: %d\n", id, token_str.c_str(), n_past_tgt, run_n_past_tgt + i_dft, drafts[keeps[0]].i_batch_tgt[i_dft]); + LOG("Sampled token: %d ('%s'), n_past_tgt: %d, current_run.n_past_tgt + i_dft: %d, drafts[keeps[0]].i_batch_tgt[i_dft]: %d\n", id, token_str.c_str(), n_past_tgt, current_run.n_past_tgt + i_dft, drafts[keeps[0]].i_batch_tgt[i_dft]); - if (run_n_past_tgt + i_dft == n_past_tgt-1) { + if 
(current_run.n_past_tgt + i_dft == n_past_tgt-1) { any_match = true; ++n_predict; + if (current_run.speculative) { + n_accept++; + } llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); // Root of WORLD LOG("Accepting token %d ('%s'), n_past_tgt: %d\n", id, token_str.c_str(), n_past_tgt); generated.push_back(id); if (llama_node_id(ctx_tgt) == 0) { - printf("%s", token_str.c_str()); - fflush(stdout); + if (!params.use_color) { + printf("%s", token_str.c_str()); +// fprintf(stderr, "%s", token_str.c_str()); + fflush(stdout); +// fflush(stderr); + } + } } @@ -411,7 +486,7 @@ int main(int argc, char ** argv) { - if (run_id == ASYNC_RUN_ID) { + if (!current_run.speculative) { break; } @@ -436,18 +511,28 @@ int main(int argc, char ** argv) { } if (matches) { - if (run_n_past_tgt + i_dft == n_past_tgt-1) { + if (current_run.n_past_tgt + i_dft == n_past_tgt-1) { ++n_accept; ++n_past_tgt; ++n_past_dft; } ++i_dft; - if (run_id != ASYNC_RUN_ID && run_n_past_tgt + i_dft < n_past_tgt) { + if (params.use_color) { + // Color token according to its origin sequence + printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); + fflush(stdout); + } + if (current_run.speculative && current_run.n_past_tgt + i_dft < n_past_tgt) { continue; } } } + if (params.use_color) { + printf("%s", token_str.c_str()); + } + fflush(stdout); + } @@ -458,8 +543,10 @@ int main(int argc, char ** argv) { } if (!any_match) { - if (run_id != ASYNC_RUN_ID) { - free_sequence_offsets.push_back(seq_offset); + if (current_run.speculative) { +// fprintf(stderr, "\nNo match, pushing seq offset %d to free seq offsets\n", current_run.seq_offset); +// fflush(stderr); + free_sequence_offsets.push_back(current_run.seq_offset); } // fprintf(stderr, "No match\n"); continue; @@ -467,557 +554,486 @@ int main(int argc, char ** argv) { check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated, n_seq_dft); - // Pipeline syncing cache ops -// llama_kv_cache_seq_keep(ctx_dft, s_keep); -// llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); -// llama_kv_cache_seq_keep(ctx_dft, 0); -// llama_kv_cache_seq_rm (ctx_dft, 0, n_past_dft, -1); // TODO: simplify - if (run_id != ASYNC_RUN_ID){ - LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep+seq_offset, n_past_tgt, n_past_dft); + if (current_run.speculative){ + LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d, current_run.n_past_tgt = %d, current_run.n_past_dft = %d\n", s_keep+current_run.seq_offset, n_past_tgt, n_past_dft, current_run.n_past_tgt, current_run.n_past_dft); + for (int i = 0; i < n_seq_dft; i++) { + llama_kv_cache_seq_rm (ctx_tgt, i+current_run.seq_offset, n_past_tgt, -1); + llama_kv_cache_seq_rm (ctx_dft, i+current_run.seq_offset, n_past_dft, -1); -// llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1); - for (int i = 0; i < n_seq_dft; i++) { + } + llama_kv_cache_seq_rm (ctx_tgt, 0, current_run.n_past_tgt+1, n_past_tgt); + llama_kv_cache_seq_rm (ctx_dft, 0, current_run.n_past_dft+1, n_past_dft); - } -// LOG("Copying tgt sequence %d to %d from positions %d to %d\n", s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); -// llama_kv_cache_seq_cp_back (ctx_tgt, s_keep+seq_offset, 0, run_n_past_tgt, n_past_tgt); - llama_kv_cache_seq_cp (ctx_tgt, s_keep+seq_offset, 0, old_n_past_tgt, n_past_tgt); + llama_kv_cache_seq_cp (ctx_tgt, s_keep+current_run.seq_offset, 0, current_run.n_past_tgt+1, n_past_tgt); + llama_kv_cache_seq_cp (ctx_dft, s_keep+current_run.seq_offset, 0, current_run.n_past_dft+1, n_past_dft); -// if (llama_node_id(ctx_tgt) == 0) { -// 
llama_kv_cache_view_update(ctx_tgt, &kvc_view); -// dump_kv_cache_view_seqs(kvc_view, 20); -//// dump_kv_cache_view(kvc_view, 20); -// printf("Copied to 0, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past); -// } - for (int i = 0; i < n_seq_dft; i++) { - llama_kv_cache_seq_cp(ctx_tgt, 0, i+seq_offset, -1, n_past_tgt); -// if (llama_node_id(ctx_tgt) == 0) { -// llama_kv_cache_view_update(ctx_tgt, &kvc_view); -// dump_kv_cache_view_seqs(kvc_view, 20); -//// dump_kv_cache_view(kvc_view, 20); -// printf("Copied from 0 to %d, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", i+seq_offset, n_past_tgt, run_n_past_tgt, run_max_n_past); -// } - } + for (int i = 1; i <= max_seq; i++) { -// LOG("Copying dft sequence %d to %d from positions %d to %d\n", s_keep+seq_offset, 0, run_n_past_dft, n_past_dft); + llama_kv_cache_seq_rm(ctx_tgt, i, current_run.n_past_tgt+1, n_past_tgt); + llama_kv_cache_seq_rm(ctx_dft, i, current_run.n_past_dft+1, n_past_dft); - llama_kv_cache_seq_cp (ctx_dft, s_keep+seq_offset, 0, old_n_past_dft, n_past_dft); - for (int i = 0; i < n_seq_dft; i++) { -// LOG("Removing tgt sequence %d from positions %d to %d\n", i+seq_offset, -1, -1); + llama_kv_cache_seq_cp(ctx_tgt, 0, i, current_run.n_past_tgt+1, n_past_tgt); + llama_kv_cache_seq_cp(ctx_dft, 0, i, current_run.n_past_dft+1, n_past_dft); - llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, old_n_past_tgt, -1); -// if (llama_node_id(ctx_tgt) == 0) { -// llama_kv_cache_view_update(ctx_tgt, &kvc_view); -// dump_kv_cache_view_seqs(kvc_view, 20); -//// dump_kv_cache_view(kvc_view, 20); -// printf("Removed %d, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d, old_n_past: %d\n", i+seq_offset, n_past_tgt, run_n_past_tgt, run_max_n_past, old_n_past_tgt); -// } + } + -// LOG("Removing dft sequence %d from positions %d to %d\n", i+seq_offset, -1, -1); - llama_kv_cache_seq_rm (ctx_dft, i+seq_offset, old_n_past_dft, -1); + + if (llama_node_id(ctx_tgt) == 0) { +// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +// LOG("Done keeping sequence, new tgt cache layout:\n%s", dump_kv_cache_view_seqs(kvc_view, 1).c_str()); + } else { +// llama_kv_cache_view_update(ctx_dft, &kvc_view_dft); +// LOG("Done keeping sequence, new dft cache layout:\n%s", dump_kv_cache_view_seqs(kvc_view_dft, 1).c_str()); } - for (int i = 1; i < max_seq; i++) { -// LOG("Copying tgt sequence %d to %d from positions %d to %d\n", 0, i, -1, n_past_tgt); -// LOG("Copying dft sequence %d to %d from positions %d to %d\n", 0, i, -1, n_past_dft); - llama_kv_cache_seq_rm(ctx_tgt, i, old_n_past_tgt, n_past_tgt); - llama_kv_cache_seq_rm(ctx_dft, i, old_n_past_dft, n_past_dft); -// -// if (llama_node_id(ctx_tgt) == 0) { -//// llama_kv_cache_view_update(ctx_tgt, &kvc_view); -//// dump_kv_cache_view_seqs(kvc_view, 20); -//// dump_kv_cache_view(kvc_view, 20); -// printf("Removed %d, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d, old_n_past: %d\n", i+seq_offset, n_past_tgt, run_n_past_tgt, run_max_n_past, old_n_past_tgt); -// } - llama_kv_cache_seq_cp(ctx_tgt, 0, i, old_n_past_tgt, n_past_tgt); + } -// if (llama_node_id(ctx_tgt) == 0) { -//// llama_kv_cache_view_update(ctx_tgt, &kvc_view); -//// dump_kv_cache_view_seqs(kvc_view, 20); -//// dump_kv_cache_view(kvc_view, 20); -// printf("Copied 0 to %d, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d, old_n_past: %d\n", i, n_past_tgt, run_n_past_tgt, run_max_n_past, old_n_past_tgt); -// } + begin_non_spec_run(params, n_seq_dft, ctx_tgt, max_seq, drafts, id, batch_id, 
n_past_tgt, n_past_dft, tgt_cgraphs, + kvc_view); - llama_kv_cache_seq_cp(ctx_dft, 0, i, old_n_past_dft, n_past_dft); - } + begin_non_spec_run(params, n_seq_dft, ctx_dft, max_seq, drafts, id, batch_id, n_past_dft, n_past_dft, dft_cgraphs, + kvc_view_dft); -// if (llama_node_id(ctx_tgt) == 0) { -// llama_kv_cache_view_update(ctx_tgt, &kvc_view); -// dump_kv_cache_view_seqs(kvc_view, 20); -//// dump_kv_cache_view(kvc_view, 20); -// printf("Kept sequence, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past); -// } + seq_async_run dft_run = dft_cgraphs.back(); + dft_cgraphs.pop_back(); + llama_finish_async_decode(*ctx_dft, dft_run.batch, dft_run.cgraph); -// for (int i = 0; i < n_seq_dft; i++) { -// llama_kv_cache_seq_cp(ctx_tgt, 0, i+seq_offset, -1, n_past_tgt); -// } + spec_past_tgt = n_past_tgt; + spec_past_dft = n_past_dft; - } else { -// llama_kv_cache_seq_cp (ctx_tgt, s_keep+seq_offset, 0, -1, n_past_tgt); -// for (int i = 1; i < n_seq_dft; i++) { -// llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, n_past_tgt); -// -// } -// for (int i = 0; i < n_seq_dft; i++) { -// llama_kv_cache_seq_rm (ctx_tgt, i+seq_offset, -1, n_past_tgt); -// llama_kv_cache_seq_cp(ctx_tgt, 0, i+seq_offset, -1, n_past_tgt); + if (!current_run.speculative) { + if (free_sequence_offsets.empty()) { + continue; + } + current_run.seq_offset = free_sequence_offsets.front(); +// if (llama_node_id(ctx_tgt) == 0) { +// fprintf(stderr, "Popping %d from seq offsets for spec run\n", current_run.seq_offset); +// fflush(stderr); // } + free_sequence_offsets.pop_front(); } +// bool is_waiting = false; + run_speculation_loop(params, p_accept, ctx_tgt, ctx_dft, max_seq, batch_tgt, n_predict, n_past_tgt, n_past_dft, + has_eos, ctx_sampling, + spec_past_tgt, spec_past_dft, first_run, free_sequence_offsets, batch_id, batch_dft, + n_drafted, drafts, tgt_cgraphs, current_run, kvc_view_dft, id); + if (n_predict > params.n_predict || has_eos) { + break; + } - { - batch_id++; + } - LOG("Beginning async decode, batch id = %d\n", batch_id); - llama_batch_clear(batch_tgt_async); + auto t_dec_end = ggml_time_us(); - llama_batch_add(batch_tgt_async, id, n_past_tgt, {0}, true); + LOG_TEE("\n\n"); - LOG("target async batch: %s\n, batch_id = %d\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt_async).c_str(), batch_id); + LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_TEE("\n"); + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("n_accept = %d\n", n_accept); + LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - // batch_tgt.n_tokens = 1 + LOG_TEE("\ndraft:\n"); + llama_print_timings(ctx_dft); - ++n_past_tgt; - struct seq_async_run run; - run.canceled = false; -// if (!free_sequence_offsets.empty()) { -// run.seq_offset = free_sequence_offsets.front(); -// printf("Popping %d from seq offsets\n", run.seq_offset); -// free_sequence_offsets.pop_front(); -// } else if(!tgt_cgraphs.empty()){ -// printf("Getting offset from head of tgt cgraphs\n"); -// run.seq_offset = tgt_cgraphs.front().seq_offset; -// } else { -// printf("NO FREE OFFSETS AND NO TGT CGRAPHS\n"); -// } - run.batch = llama_batch_init(params.n_ctx, 0, max_seq); - 
run.batch.batch_id = batch_id; - run.batch.n_tokens = batch_tgt_async.n_tokens; - for (int i = 0; i < batch_tgt_async.n_tokens; i++) { - run.batch.n_seq_id[i] = batch_tgt_async.n_seq_id[i]; - for (int j = 0; j < run.batch.n_seq_id[i]; j++) { - run.batch.seq_id[i][j] = batch_tgt_async.seq_id[i][j]; - } - run.batch.token[i] = batch_tgt_async.token[i]; - run.batch.pos[i] = batch_tgt_async.pos[i]; - run.batch.logits[i] = batch_tgt_async.logits[i]; - } - run.ctx_sampling = llama_sampling_init(params.sparams); - llama_sampling_cp(ctx_sampling, run.ctx_sampling); - run.drafts = std::vector(n_seq_dft); - for (int s = 0; s < n_seq_dft; ++s) { - run.drafts[s].ctx_sampling = llama_sampling_init(params.sparams); - llama_sampling_cp(drafts[s].ctx_sampling, run.drafts[s].ctx_sampling); - run.drafts[s].i_batch_tgt = std::vector(1,0); - run.drafts[s].i_batch_dft = drafts[s].i_batch_dft; - run.drafts[s].tokens = std::vector(1, id); - run.drafts[s].active = drafts[s].active; - run.drafts[s].drafting = drafts[s].drafting; - run.drafts[s].skip = drafts[s].skip; - run.drafts[s].prefix_tokens = std::vector(0); - } - run.i_dft = offset - 1; - run.s_keep = s_keep; - run.run_id = ASYNC_RUN_ID; - run.n_past_tgt = n_past_tgt-1; - run.prefix_n_past_tgt = n_past_tgt-1; - run.n_past_max = n_past_tgt; - run.n_past_dft = n_past_dft; - run.speculative = false; - run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); - tgt_cgraphs.push_front(run); - //llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_past_tgt+1); - for (int i = 1; i < max_seq; i++) { -// LOG("Copying tgt sequence %d to %d from positions %d to %d\n", 0, i, n_past_tgt-1, n_past_tgt); - llama_kv_cache_seq_rm(ctx_tgt, i, n_past_tgt-1, n_past_tgt); - llama_kv_cache_seq_cp(ctx_tgt, 0, i, n_past_tgt-1, n_past_tgt); - } + LOG_TEE("\ntarget:\n"); + llama_print_timings(ctx_tgt); -// if (llama_node_id(ctx_tgt) == 0) { -// llama_kv_cache_view_update(ctx_tgt, &kvc_view); -// dump_kv_cache_view_seqs(kvc_view, 20); -//// dump_kv_cache_view(kvc_view, 20); -// printf("Copied async, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past); -// } - } + llama_sampling_free(ctx_sampling); + for (int s = 0; s < n_seq_dft; ++s) { + llama_sampling_free(drafts[s].ctx_sampling); + } + llama_batch_free(batch_dft); - if (run_id == ASYNC_RUN_ID) { - if (free_sequence_offsets.empty()) { - continue; - } - seq_offset = free_sequence_offsets.front(); -// printf("Popping %d from seq offsets for spec run\n", seq_offset); - free_sequence_offsets.pop_front(); - } + llama_free(ctx_tgt); + llama_free_model(model_tgt); - int spec_past_tgt = n_past_tgt; - int spec_past_dft = n_past_dft; + llama_free(ctx_dft); + llama_free_model(model_dft); - int first_run = true; + llama_backend_free(); -// bool is_waiting = llama_mpi_iprobe(ctx_tgt); - bool is_waiting = false; -// llama_swap_comm(ctx_tgt); -// llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); -// llama_swap_comm(ctx_tgt); + fprintf(stderr, "\n\n"); + return 0; +} -// llama_batch_clear(batch_dft); -// llama_batch_add(batch_dft, id, spec_past_dft, {0}, true); -// // batch_dft.n_tokens == 1 now -// -// -// -// // Kick off drafting pipeline but don't need it just yet -// LOG("Beginning async draft with sequence 0\n"); -// dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); -// // DON'T FORGET THE MATCHING DECODE WHEN NEEDED -// -// // We need the draft now, so wait for it -// if (!dft_cgraphs.empty()) { -//// LOG("Finishing async decode of draft\n"); -// 
llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); -// dft_cgraphs.pop_back(); -// } -// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); +void +run_speculation_loop(const gpt_params ¶ms, const float p_accept, llama_context *ctx_tgt, llama_context *ctx_dft, + const int max_seq, llama_batch &batch_tgt, int n_predict, int n_past_tgt, int n_past_dft, + bool has_eos, llama_sampling_context *ctx_sampling, int &spec_past_tgt, int &spec_past_dft, + bool & first_run, std::deque &free_sequence_offsets, int32_t &batch_id, llama_batch &batch_dft, + int &n_drafted, std::vector &drafts, std::deque &tgt_cgraphs, + seq_async_run ¤t_run, llama_kv_cache_view &kvc_view_dft, llama_token &id) { + bool is_waiting = llama_mpi_iprobe(ctx_tgt); + llama_swap_comm(ctx_tgt); + llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); + llama_swap_comm(ctx_tgt); - if (is_waiting) { - free_sequence_offsets.push_back(seq_offset); - } - int iter = 0; - while(iter < 2 || !is_waiting) { + if (is_waiting) { +// fprintf(stderr, "\nIs waiting, pushing seq offset %d to free seq offsets\n", current_run.seq_offset); +// fflush(stderr); + free_sequence_offsets.push_back(current_run.seq_offset); + } + int iter = 0; + while((!is_waiting && (p_accept + iter * params.p_recovery) < 1.0)) { - int orig_offset = seq_offset; - bool should_run_spec = true; - std::deque checked_offsets; - do { - should_run_spec = true; - for (const auto &r: tgt_cgraphs) { - if (r.seq_offset == seq_offset && r.run_id != ASYNC_RUN_ID) { - checked_offsets.push_back(seq_offset); + int orig_offset = current_run.seq_offset; + bool should_run_spec = true; + std::deque checked_offsets; + do { + should_run_spec = true; + for (const auto &r: tgt_cgraphs) { + if (r.seq_offset == current_run.seq_offset && r.speculative) { + checked_offsets.push_back(current_run.seq_offset); - should_run_spec = false; - if (!free_sequence_offsets.empty()) { - seq_offset = free_sequence_offsets.front(); - free_sequence_offsets.pop_front(); + should_run_spec = false; + if (!free_sequence_offsets.empty()) { + current_run.seq_offset = free_sequence_offsets.front(); + free_sequence_offsets.pop_front(); - } - break; } + break; } - } while (!should_run_spec && !free_sequence_offsets.empty()); - - free_sequence_offsets.insert(free_sequence_offsets.end(), checked_offsets.begin(), - checked_offsets.end()); + } + } while (!should_run_spec && !free_sequence_offsets.empty()); - if (!should_run_spec) { + if (!should_run_spec) { + LOG("Ending spec because no available offsets\n"); + break; + } +// if (llama_node_id(ctx_tgt) == 0) { +// fprintf(stderr, "\nErasing seq offset %d from free seq offsets\n", current_run.seq_offset); +// fflush(stderr); +// } + auto it = std::find(free_sequence_offsets.begin(), free_sequence_offsets.end(), current_run.seq_offset); + if (it != free_sequence_offsets.end()) { + free_sequence_offsets.erase(it); + } - break; - } - LOG("Doing speculative run, seq_offset = %d, spec_past_tgt = %d, spec_past_dft = %d, n_past_tgt = %d, n_past_dft = %d", seq_offset, spec_past_tgt, spec_past_dft, n_past_tgt, n_past_dft); + if (start_async_spec_run(params, ctx_tgt, ctx_dft, free_sequence_offsets, max_seq, + batch_tgt, n_predict, n_past_tgt, n_past_dft, has_eos, ctx_sampling, + tgt_cgraphs, + current_run, spec_past_tgt, spec_past_dft, first_run, orig_offset, + batch_id, batch_dft, n_drafted, drafts, id, kvc_view_dft, iter)) { + LOG("Ending spec run because returned true\n"); + break; + } - for (int i = 0; i < n_seq_dft; i++) { -// 
LOG("Removing tgt sequence %d from positions %d to %d\n", i + seq_offset, -1, -1); + is_waiting = llama_mpi_iprobe(ctx_tgt); + llama_swap_comm(ctx_tgt); + llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); + llama_swap_comm(ctx_tgt); + first_run = false; -// if(first_run) { + iter++; +// break; - llama_kv_cache_seq_rm(ctx_tgt, i + seq_offset, -1, -1); + } +} - LOG("Copying tgt sequence %d to %d from positions %d to %d\n", (first_run) ? 0 : orig_offset, - i + seq_offset, -1, (first_run) ? spec_past_tgt : spec_past_tgt); +void begin_non_spec_run(const gpt_params ¶ms, const int n_seq_dft, llama_context *ctx, const int max_seq, + const std::vector &drafts, llama_token id, int32_t &batch_id, int &n_past, + int n_past_dft, + std::deque &dft_cgraphs, llama_kv_cache_view &kvc_view) { - llama_kv_cache_seq_cp(ctx_tgt, (first_run) ? 0 : orig_offset, i + seq_offset, -1, (first_run) ? spec_past_tgt : spec_past_tgt); -// if (llama_node_id(ctx_tgt) == 0) { -// llama_kv_cache_view_update(ctx_tgt, &kvc_view); -// dump_kv_cache_view_seqs(kvc_view, 20); -//// dump_kv_cache_view(kvc_view, 20); -// printf("Copied tgt sequence, n_past_tgt: %d, run_n_past_tgt: %d, run_max_n_past: %d, spec_past_tgt: %d, first_run: %d\n", n_past_tgt, run_n_past_tgt, run_max_n_past, spec_past_tgt, first_run); -// } -// } + std::vector non_spec_drafts = std::vector(n_seq_dft); + for (int s = 0; s < n_seq_dft; ++s) { + non_spec_drafts[s].ctx_sampling = llama_sampling_init(params.sparams); + llama_sampling_cp(drafts[s].ctx_sampling, drafts[s].ctx_sampling); + non_spec_drafts[s].i_batch_tgt = std::vector(1,0); + non_spec_drafts[s].i_batch_dft = drafts[s].i_batch_dft; + non_spec_drafts[s].tokens = std::vector(1, id); + non_spec_drafts[s].active = drafts[s].active; + non_spec_drafts[s].drafting = drafts[s].drafting; + non_spec_drafts[s].skip = drafts[s].skip; + non_spec_drafts[s].prefix_tokens = std::vector(0); + } -// LOG("Removing dft sequence %d from positions %d to %d\n", i + seq_offset, spec_past_dft, -1); + llama_batch async_batch = llama_batch_init(params.n_ctx, 0, max_seq + 1); + llama_batch_clear(async_batch); - llama_kv_cache_seq_rm(ctx_dft, i + seq_offset, spec_past_dft, -1); + llama_batch_add(async_batch, id, n_past, {0}, true); - } + begin_async_run(params.sparams, n_seq_dft, ctx, max_seq, n_past_dft, + non_spec_drafts, dft_cgraphs, batch_id, n_past, kvc_view, false, async_batch, n_past+1, n_past, 0); + n_past++; - llama_batch_clear(batch_tgt); +} - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].active = false; - if (!first_run) { - if (!drafts[s].tokens.empty()) { - drafts[s].prefix_tokens.insert(drafts[s].prefix_tokens.end(), drafts[s].tokens.begin(), - drafts[s].tokens.end()); - } - } else { - drafts[s].prefix_tokens.clear(); - } - drafts[s].tokens.clear(); - drafts[s].i_batch_tgt.clear(); - } - // note: will be erased after the speculation phase - drafts[0].tokens.push_back(id); +bool start_async_spec_run(const gpt_params ¶ms, llama_context *ctx_tgt, llama_context *ctx_dft, + std::deque &free_sequence_offsets, int max_seq, llama_batch &batch_tgt, int n_predict, + int prefix_n_past, int n_past_dft, bool has_eos, llama_sampling_context *ctx_sampling, + std::deque &tgt_cgraphs, const seq_async_run ¤t_run, + int &spec_past_tgt, int &spec_past_dft, int first_run, int orig_offset, int32_t &batch_id, + llama_batch &batch_dft, int &n_drafted, std::vector &drafts, llama_token &id, + llama_kv_cache_view &kvc, const int iter) { + LOG("Doing speculative run, seq_offset = %d, spec_past_tgt = %d, spec_past_dft = %d, 
prefix_n_past = %d, n_past_dft = %d\n", + current_run.seq_offset, spec_past_tgt, spec_past_dft, prefix_n_past, n_past_dft); + for (int i = 0; i < params.n_parallel; i++) { -// LOG("Copying dft sequence %d to %d from positions %d to %d\n", (first_run) ? 0 : orig_offset, seq_offset, -1, spec_past_dft); + llama_kv_cache_seq_rm(ctx_tgt, i + current_run.seq_offset, (first_run) ? prefix_n_past : prefix_n_past - 1, -1); + llama_kv_cache_seq_rm(ctx_dft, i + current_run.seq_offset, (first_run) ? n_past_dft : n_past_dft - 1, -1); - llama_kv_cache_seq_cp(ctx_dft, (first_run) ? 0 : orig_offset, seq_offset, -1, (first_run) ? spec_past_dft : spec_past_dft + 1); + LOG("Copying tgt sequence %d to %d from positions %d to %d\n", (first_run) ? 0 : orig_offset, + i + current_run.seq_offset, prefix_n_past, spec_past_tgt); - llama_batch_clear(batch_dft); + llama_kv_cache_seq_cp(ctx_tgt, (first_run) ? 0 : orig_offset, i + current_run.seq_offset, (first_run) ? prefix_n_past : prefix_n_past - 1, + spec_past_tgt+1); + llama_kv_cache_seq_cp(ctx_dft, (first_run) ? 0 : orig_offset, i + current_run.seq_offset, (first_run) ? n_past_dft : n_past_dft - 1, + spec_past_dft+1); - if (first_run) { - llama_batch_add(batch_dft, id, spec_past_dft, {seq_offset}, true); - // batch_dft.n_tokens == 1 now + } + llama_batch_clear(batch_tgt); - // Kick off drafting pipeline but don't need it just yet - LOG("Beginning async draft\n"); - dft_cgraphs.push_front(llama_start_async_decode(*ctx_dft, batch_dft)); - // DON'T FORGET THE MATCHING DECODE WHEN NEEDED + for (int s = 0; s < params.n_parallel; ++s) { + drafts[s].active = false; + if (!first_run) { + if (!drafts[s].tokens.empty()) { + drafts[s].prefix_tokens.insert(drafts[s].prefix_tokens.end(), drafts[s].tokens.begin(), + drafts[s].tokens.end()); + } + } else { + drafts[s].prefix_tokens.clear(); + } + drafts[s].tokens.clear(); + drafts[s].i_batch_tgt.clear(); + } + // note: will be erased after the speculation phase + drafts[0].tokens.push_back(id); - // We need the draft now, so wait for it - if (!dft_cgraphs.empty()) { - LOG("Finishing async decode of draft\n"); - llama_finish_async_decode(*ctx_dft, batch_dft, dft_cgraphs.back()); - dft_cgraphs.pop_back(); - } - LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); - for (int i = 0; i < max_seq; i++) { -// LOG("Copying dft sequence %d to %d from positions %d to %d\n", seq_offset, i, spec_past_dft, spec_past_dft + 1); + llama_batch_clear(batch_dft); - if (i != seq_offset) { - llama_kv_cache_seq_cp(ctx_dft, seq_offset, i, spec_past_dft, spec_past_dft + 1); - } - } - ++n_past_dft; - } else { - for (int i = 1; i < n_seq_dft; i++) { -// LOG("Copying dft sequence %d to %d from positions %d to %d\n", seq_offset, i+seq_offset, spec_past_dft, spec_past_dft + 1); - llama_kv_cache_seq_cp(ctx_dft, seq_offset, i+seq_offset, -1, spec_past_dft + 1); - } - } + if (llama_node_id(ctx_dft) == 0) { +// llama_kv_cache_view_update(ctx_dft, &kvc); +// LOG("Draft KV cache view:\n%s\n", dump_kv_cache_view_seqs(kvc, 1).c_str()); + } - if (llama_node_id(ctx_dft) == 0) { -// llama_kv_cache_view_update(ctx_dft, &kvc_view_dft); -// dump_kv_cache_view_seqs(kvc_view_dft, 20); - } + if (n_predict > params.n_predict || has_eos) { + return true; + } + llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); - if (n_predict > params.n_predict || has_eos) { - break; - } + int n_seq_cur = 0; + int max_ran_seq = 0; + int n_past_cur = spec_past_dft; - llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); + for (int s = 0; s < params.n_parallel; 
++s) { + drafts[s].skip = true; + drafts[s].active = false; + drafts[s].drafting = false; + } - int n_seq_cur = 0; - int max_ran_seq = 0; - int n_past_cur = spec_past_dft+1; - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].skip = true; - drafts[s].active = false; - drafts[s].drafting = false; - } + drafts[0].active = true; + drafts[0].drafting = true; + drafts[0].skip = false; + drafts[0].i_batch_dft = 0; - drafts[0].active = true; - drafts[0].drafting = true; - drafts[0].skip = false; - drafts[0].i_batch_dft = 0; + // sample n_draft tokens from the draft model using tree-based sampling + for (int i = 0; i < params.n_draft; ++i) { + batch_dft.n_tokens = 0; - // sample n_draft tokens from the draft model using tree-based sampling - for (int i = 0; i < n_draft; ++i) { - batch_dft.n_tokens = 0; - for (int s = 0; s <= max_ran_seq; ++s) { - if (!drafts[s].drafting || drafts[s].skip) { - continue; - } + for (int s = 0; s <= max_ran_seq; ++s) { + if (!drafts[s].drafting || drafts[s].skip) { + continue; + } - // Swap back to pipeline roots - llama_swap_comm(ctx_dft); - LOG("Swapped comm to pipeline roots, id %d\n", llama_node_id(ctx_dft)); + // Swap back to pipeline roots + llama_swap_comm(ctx_dft); + LOG("Swapped comm to pipeline roots, id %d\n", llama_node_id(ctx_dft)); - llama_sync_token(ctx_dft, &(drafts[s].i_batch_dft), 1); + llama_sync_token(ctx_dft, &(drafts[s].i_batch_dft), 1); - llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, nullptr, drafts[s].i_batch_dft); - auto &cur_p = drafts[s].ctx_sampling->cur; + auto &cur_p = drafts[s].ctx_sampling->cur; - llama_sync_token_data(ctx_dft, cur_p.data(), 1); - // TODO investigate potential bottleneck - for (int k = 1; k < 8; ++k) { - llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); - } + llama_sync_token_data(ctx_dft, cur_p.data(), 1); + // TODO investigate potential bottleneck + for (int k = 1; k < 8; ++k) { + llama_sync_token_data(ctx_dft, &(cur_p[k]), 1); + } - // Back to draft pipeline only - llama_swap_comm(ctx_dft); - LOG("Swapped comm to draft only, id %d\n", llama_node_id(ctx_dft)); + // Back to draft pipeline only + llama_swap_comm(ctx_dft); + LOG("Swapped comm to draft only, id %d\n", llama_node_id(ctx_dft)); - if (llama_node_id(ctx_dft) >= 0) { - for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); - } - } + if (llama_node_id(ctx_dft) >= 0) { + for (int k = 0; k < std::min(params.n_parallel, (int) cur_p.size()); ++k) { + LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", + k, s+current_run.seq_offset, i+spec_past_dft, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + } + } - if (cur_p[0].p < p_accept) { - LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, - p_accept); - drafts[s].drafting = false; - continue; - } + if (cur_p[0].p < params.p_accept + params.p_recovery * iter) { + LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, + params.p_accept); + drafts[s].drafting = false; + continue; + } - std::vector sa(1, s); + std::vector sa(1, s); - // attempt to split the branch if the probability is high enough - for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft - 1 && cur_p[f].p > p_split) { - n_seq_cur++; - LOG("splitting seq %3d into 
%3d\n", s, n_seq_cur); + // attempt to split the branch if the probability is high enough + for (int f = 1; f < 8; ++f) { + if (n_seq_cur < params.n_parallel - 1 && cur_p[f].p > params.p_split + params.p_recovery * iter) { + n_seq_cur++; + LOG("splitting seq %3d into %3d\n", s, n_seq_cur); -// LOG("Removing dft sequence %d from positions %d to %d\n", n_seq_cur + seq_offset, -1, -1); + LOG("Removing dft sequence %d from positions %d to %d\n", n_seq_cur + current_run.seq_offset, n_past_dft, n_past_cur); - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur + seq_offset, -1, -1); + llama_kv_cache_seq_rm(ctx_dft, n_seq_cur + current_run.seq_offset, n_past_dft, n_past_cur); -// LOG("Copying dft sequence %d to %d from positions %d to %d\n", s + seq_offset, n_seq_cur + seq_offset, -1, -1); + LOG("Copying dft sequence %d to %d from positions %d to %d\n", s + current_run.seq_offset, n_seq_cur + current_run.seq_offset, n_past_dft, n_past_cur); - llama_kv_cache_seq_cp(ctx_dft, s + seq_offset, n_seq_cur + seq_offset, -1, -1); + llama_kv_cache_seq_cp(ctx_dft, s + current_run.seq_offset, n_seq_cur + current_run.seq_offset, n_past_dft, n_past_cur); - // all previous tokens from this branch are now also part of the new branch - for (int t = 0; t < batch_tgt.n_tokens; ++t) { - for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) { - if (batch_tgt.seq_id[t][p] == s + seq_offset) { - batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur + seq_offset; - batch_tgt.n_seq_id[t]++; - break; - } - } + // all previous tokens from this branch are now also part of the new branch + for (int t = 0; t < batch_tgt.n_tokens; ++t) { + for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) { + if (batch_tgt.seq_id[t][p] == s + current_run.seq_offset) { + batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur + current_run.seq_offset; + batch_tgt.n_seq_id[t]++; + break; } + } + } - // copy the draft state - drafts[n_seq_cur].active = true; - drafts[n_seq_cur].drafting = true; - drafts[n_seq_cur].skip = false; + // copy the draft state + drafts[n_seq_cur].active = true; + drafts[n_seq_cur].drafting = true; + drafts[n_seq_cur].skip = false; - drafts[n_seq_cur].tokens = drafts[s].tokens; - drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; + drafts[n_seq_cur].tokens = drafts[s].tokens; + drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; - llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling); + llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling); - sa.push_back(n_seq_cur); + sa.push_back(n_seq_cur); - } else { - break; - } - } + } else { + break; + } + } - // add drafted token for each sequence - // TODO commenting this out fixes async - for (int is = 0; is < (int) sa.size(); ++is) { - const llama_token id = cur_p[is].id; + // add drafted token for each sequence + // TODO commenting this out fixes async + for (int is = 0; is < (int) sa.size(); ++is) { + const llama_token id = cur_p[is].id; - const int s = sa[is]; + const int s = sa[is]; - llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); + llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); - drafts[s].tokens.push_back(id); + drafts[s].tokens.push_back(id); - // add unique drafted tokens to the target batch + // add unique drafted tokens to the target batch - drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); + drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - LOG("Adding drafted token %d to tgt, sequence %d, position %d, i_batch_tgt %d\n", id, - s + seq_offset, spec_past_tgt + i, batch_tgt.n_tokens); - 
llama_batch_add(batch_tgt, id, spec_past_tgt + i, {s + seq_offset}, true); + LOG("Adding drafted token %d to tgt, sequence %d, position %d, i_batch_tgt %d\n", id, + s + current_run.seq_offset, spec_past_tgt + i, batch_tgt.n_tokens); + llama_batch_add(batch_tgt, id, spec_past_tgt + i, {s + current_run.seq_offset}, true); - // add the token to the batch for batched decoding with the draft model - drafts[s].i_batch_dft = batch_dft.n_tokens; + // add the token to the batch for batched decoding with the draft model + drafts[s].i_batch_dft = batch_dft.n_tokens; - LOG("Adding drafted token %d to dft\n", id); + LOG("Adding drafted token %d to dft, sequence %d, position %d\n", id, s + current_run.seq_offset, n_past_cur); - llama_batch_add(batch_dft, id, n_past_cur, {s + seq_offset}, true); + llama_batch_add(batch_dft, id, n_past_cur, {s + current_run.seq_offset}, true); - if (batch_tgt.n_tokens > n_draft) { - drafts[s].drafting = false; - } - } + if (batch_tgt.n_tokens > params.n_draft) { + drafts[s].drafting = false; } + } + } - // no sequence is drafting anymore - if (batch_dft.n_tokens == 0) { - break; - } + // no sequence is drafting anymore + if (batch_dft.n_tokens == 0) { + break; + } - // evaluate the drafted tokens on the draft model - LOG("Running synchronous draft decode\n"); - llama_decode(ctx_dft, batch_dft); - ++n_past_cur; - ++n_drafted; + // evaluate the drafted tokens on the draft model + LOG("Running synchronous draft decode while still drafting\n"); + LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + llama_decode(ctx_dft, batch_dft); + ++n_past_cur; + ++n_drafted; - max_ran_seq = n_seq_cur; + max_ran_seq = n_seq_cur; - llama_batch_clear(batch_dft); + llama_batch_clear(batch_dft); - if (batch_tgt.n_tokens > n_draft) { - break; - } - } + if (batch_tgt.n_tokens > params.n_draft) { + break; + } + } - // no sequence is drafting anymore - if (batch_dft.n_tokens != 0) { - // evaluate the drafted tokens on the draft model - LOG("Running synchronous draft decode\n"); - llama_decode(ctx_dft, batch_dft); + // no sequence is drafting anymore + if (batch_dft.n_tokens != 0) { + // evaluate the drafted tokens on the draft model + LOG("Running synchronous draft decode when no seqs drafting\n"); + LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + llama_decode(ctx_dft, batch_dft); - } + } @@ -1025,140 +1041,157 @@ int main(int argc, char ** argv) { - // evaluate the target model on the drafted tokens - { + // evaluate the target model on the drafted tokens + { // llama_kv_cache_seq_keep(ctx_tgt, 0); // Needed to get to "Here's the code:" - if (batch_tgt.n_tokens == 0) { - free_sequence_offsets.push_back(seq_offset); - break; - } + if (batch_tgt.n_tokens == 0) { +// fprintf(stderr, "\nNo tgt tokens, pushing seq offset %d to free seq offsets\n", current_run.seq_offset); +// fflush(stderr); + free_sequence_offsets.push_back(current_run.seq_offset); + return true; + } - size_t max_draft_tokens = 0; +// bool is_waiting = llama_mpi_iprobe(ctx_tgt); +// llama_swap_comm(ctx_tgt); +// llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); +// llama_swap_comm(ctx_tgt); +// +// if (is_waiting) { +// free_sequence_offsets.push_back(current_run.seq_offset); +// return true; +// } - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } + size_t max_draft_tokens = 0; - drafts[s].tokens.erase(drafts[s].tokens.begin()); - max_draft_tokens = std::max(max_draft_tokens, drafts[s].tokens.size()); - 
//drafts[s].tokens.erase(drafts[s].tokens.begin()); - } + for (int s = 0; s < params.n_parallel; ++s) { + if (!drafts[s].active) { + continue; + } - batch_id++; + drafts[s].tokens.erase(drafts[s].tokens.begin()); + max_draft_tokens = std::max(max_draft_tokens, drafts[s].tokens.size()); + //drafts[s].tokens.erase(drafts[s].tokens.begin()); + } - LOG("target batch: %s\n, batch_id = %d\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str(), batch_id); - struct seq_async_run run; - run.canceled = false; - run.speculative = true; - run.seq_offset = seq_offset; - run.ctx_sampling = llama_sampling_init(params.sparams); - llama_sampling_cp(ctx_sampling, run.ctx_sampling); - run.drafts = std::vector(n_seq_dft); - for (int s = 0; s < n_seq_dft; ++s) { - run.drafts[s].ctx_sampling = llama_sampling_init(params.sparams); - llama_sampling_cp(drafts[s].ctx_sampling, run.drafts[s].ctx_sampling); - run.drafts[s].i_batch_tgt = drafts[s].i_batch_tgt; - run.drafts[s].tokens = drafts[s].tokens; - run.drafts[s].active = drafts[s].active; - run.drafts[s].drafting = drafts[s].drafting; - run.drafts[s].skip = drafts[s].skip; - run.drafts[s].i_batch_dft = drafts[s].i_batch_dft; - run.drafts[s].prefix_tokens = drafts[s].prefix_tokens; - } - run.i_dft = offset; - run.s_keep = s_keep; - run.batch = llama_batch_init(params.n_ctx, 0, max_seq); - run.batch.batch_id = batch_id; - run.batch.n_tokens = batch_tgt.n_tokens; - for (int i = 0; i < batch_tgt.n_tokens; i++) { - run.batch.n_seq_id[i] = batch_tgt.n_seq_id[i]; - int cur_n_seqs = 0; - for (int j = 0; j < run.batch.n_seq_id[i]; j++) { - run.batch.seq_id[i][j] = batch_tgt.seq_id[i][j]; - } - run.batch.token[i] = batch_tgt.token[i]; - run.batch.pos[i] = batch_tgt.pos[i]; - run.batch.logits[i] = batch_tgt.logits[i]; - } - run.run_id = 0; - run.n_past_tgt = spec_past_tgt; - run.prefix_n_past_tgt = n_past_tgt; - run.n_past_dft = n_past_dft; - run.n_past_max = spec_past_tgt + max_draft_tokens; - run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); - tgt_cgraphs.push_front(run); - - spec_past_tgt += run.drafts[0].tokens.size(); - spec_past_dft += run.drafts[0].tokens.size(); - id = run.drafts[0].tokens.back(); - first_run = false; - } + begin_async_run(params.sparams, params.n_parallel, ctx_tgt, max_seq, n_past_dft, drafts, tgt_cgraphs, + batch_id, spec_past_tgt, kvc, true, batch_tgt, spec_past_tgt + drafts[0].tokens.size(), prefix_n_past, current_run.seq_offset); - is_waiting = llama_mpi_iprobe(ctx_tgt); - llama_swap_comm(ctx_tgt); - llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); - llama_swap_comm(ctx_tgt); + spec_past_tgt += drafts[0].tokens.size(); + spec_past_dft += drafts[0].tokens.size(); + id = drafts[0].tokens.back(); + first_run = false; - iter++; -// break; +// LOG("Beginning tgt spec run, run.prefix_n_past=%d, run.prefix_n_past_tgt=%d, run.n_past_dft=%d, run.n_past_max=%d, new spec_past_tgt=%d, new spec_past_dft=%d, new id=%d\n", +// run.prefix_n_past, run.prefix_n_past_tgt, run.n_past_dft, run.n_past_max, spec_past_tgt, spec_past_dft, id +// ); - } + } - if (n_predict > params.n_predict || has_eos) { - break; - } + return false; +} +void begin_async_run(const llama_sampling_params& sparams, const int n_seq_dft, + llama_context *ctx_tgt, const int max_seq, + int n_past_dft, const std::vector &drafts, + std::deque &tgt_cgraphs, + int32_t &batch_id, int &n_past, llama_kv_cache_view &kvc_view, + const bool is_spec, llama_batch batch, const int n_past_max, const int prefix_n_past, const int seq_offset) { + batch_id++; - } - auto t_dec_end = 
ggml_time_us(); + LOG("Beginning async decode, batch id = %d\n", batch_id); - LOG_TEE("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ndraft:\n"); - llama_print_timings(ctx_dft); - LOG_TEE("\ntarget:\n"); - llama_print_timings(ctx_tgt); + // batch_tgt.n_tokens = 1 - llama_sampling_free(ctx_sampling); - for (int s = 0; s < n_seq_dft; ++s) { - llama_sampling_free(drafts[s].ctx_sampling); + + struct seq_async_run run; + run.seq_offset = seq_offset; + run.batch = llama_batch_init(1028, 0, max_seq); + run.batch.batch_id = batch_id; + run.batch.n_tokens = batch.n_tokens; + for (int i = 0; i < batch.n_tokens; i++) { + run.batch.n_seq_id[i] = batch.n_seq_id[i]; + int cur_n_seqs = 0; + for (int j = 0; j < run.batch.n_seq_id[i]; j++) { + run.batch.seq_id[i][j] = batch.seq_id[i][j]; + } + run.batch.token[i] = batch.token[i]; + run.batch.pos[i] = batch.pos[i]; + run.batch.logits[i] = batch.logits[i]; } + run.batch.batch_id = batch_id; + run.canceled = false; + run.s_keep = 0; +// if (!free_sequence_offsets.empty()) { +// run.seq_offset = free_sequence_offsets.front(); +// printf("Popping %d from seq offsets\n", run.seq_offset); +// free_sequence_offsets.pop_front(); +// } else if(!tgt_cgraphs.empty()){ +// printf("Getting offset from head of tgt cgraphs\n"); +// run.seq_offset = tgt_cgraphs.front().seq_offset; +// } else { +// printf("NO FREE OFFSETS AND NO TGT CGRAPHS\n"); +// } - llama_batch_free(batch_dft); - llama_free(ctx_tgt); - llama_free_model(model_tgt); - llama_free(ctx_dft); - llama_free_model(model_dft); + LOG("target async batch: %s\n, batch_id = %d\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, run.batch).c_str(), + batch_id); - llama_backend_free(); + run.drafts = std::vector(n_seq_dft); + for (int s = 0; s < n_seq_dft; ++s) { + run.drafts[s].ctx_sampling = llama_sampling_init(sparams); + llama_sampling_cp(drafts[s].ctx_sampling, run.drafts[s].ctx_sampling); + run.drafts[s].i_batch_tgt = drafts[s].i_batch_tgt; + run.drafts[s].i_batch_dft = drafts[s].i_batch_dft; + run.drafts[s].tokens = drafts[s].tokens; + run.drafts[s].active = drafts[s].active; + run.drafts[s].drafting = drafts[s].drafting; + run.drafts[s].skip = drafts[s].skip; + run.drafts[s].prefix_tokens = drafts[s].prefix_tokens; + } + run.n_past_tgt = n_past; + run.prefix_n_past_tgt = prefix_n_past; + run.n_past_max = n_past_max; + run.n_past_dft = n_past_dft; + run.speculative = is_spec; + + if (!is_spec) { + for (int i = 0; i <= max_seq; i++) { + llama_kv_cache_seq_rm(ctx_tgt, i, n_past, n_past + 1); + } + } else { + for (int i = 0; i < n_seq_dft; i++) { + llama_kv_cache_seq_rm(ctx_tgt, i+seq_offset, n_past, n_past + 1); + } + } + run.cgraph = llama_start_async_decode(*ctx_tgt, run.batch); + tgt_cgraphs.push_front(run); - fprintf(stderr, "\n\n"); + if (!is_spec) { + for (int i = 1; i <= max_seq; i++) { + llama_kv_cache_seq_cp(ctx_tgt, 0, i, n_past, n_past + 1); + } + } - return 0; + if (llama_node_id(ctx_tgt) == 0) { +// llama_kv_cache_view_update(ctx_tgt, &kvc_view); +// LOG("Done running non-spec, cache 
view:\n%s", dump_kv_cache_view_seqs(kvc_view, 1).c_str()); +// printf("\nBeginning async run, batch id: %d, batch: %s\n", run.batch.batch_id, LOG_BATCH_TOSTR_PRETTY(ctx_tgt, run.batch).c_str()); + } } void check_for_cancel(llama_context *ctx_tgt, int n_past_tgt, std::deque &tgt_cgraphs, @@ -1169,9 +1202,11 @@ void check_for_cancel(llama_context *ctx_tgt, int n_past_tgt, std::deque= run.prefix_n_past_tgt) { - for (int draft_id = 0; draft_id < n_seq_dft; draft_id++) { + for (int draft_id = n_seq_dft - 1; draft_id >= 0; draft_id--) { if (!run.drafts[draft_id].tokens.empty()) { correct_prefix = true; + } else { + continue; } size_t draft_index = 0; int prev_token = -1; @@ -1204,13 +1239,16 @@ void check_for_cancel(llama_context *ctx_tgt, int n_past_tgt, std::deque Date: Mon, 15 Jan 2024 13:36:53 -0600 Subject: [PATCH 60/63] Fix main, kinda --- examples/main/main.cpp | 13 ++++++++++++- llama.cpp | 31 +++++++++++++++---------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 939bea23264..5de7d144513 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -242,6 +242,13 @@ int main(int argc, char ** argv) { params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>"; } embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + llama_batch batch = llama_batch_init(n_ctx, 0, 1); + + for (int i = 0; i < embd_inp.size()-1; i++) { + llama_batch_add(batch, embd_inp[i], i, {0}, true); + + } + llama_decode(ctx, batch); } else { LOG("use session tokens\n"); embd_inp = session_tokens; @@ -597,7 +604,11 @@ int main(int argc, char ** argv) { LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { + llama_batch batch = llama_batch_init(n_eval, 0, 1); + for (int j = 0; j < n_eval; j++) { + llama_batch_add(batch, embd[i+j], n_past+j, {0}, true); + } + if (llama_decode(ctx, batch)) { LOG_TEE("%s : failed to eval\n", __func__); return 1; } diff --git a/llama.cpp b/llama.cpp index bfac601f5f7..e3bd56db936 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5734,7 +5734,7 @@ static struct ggml_cgraph * llama_decode_internal_phased( #ifdef GGML_USE_MPI if (ggml_mpi_size(lctx.ctx_mpi) > 1 && ggml_mpi_rank(lctx.ctx_mpi) == 0) { // TODO print logits array for comparison - ggml_mpi_recv_float_array(lctx.ctx_mpi, logits_out.data(), n_vocab * n_tokens, ggml_mpi_size(lctx.ctx_mpi) - 1, GGML_MPI_SYNC_LOGITS); + ggml_mpi_recv_float_array(lctx.ctx_mpi, logits_out.data(), (batch.logits || lctx.logits_all) ? 
n_vocab * n_tokens : n_vocab, ggml_mpi_size(lctx.ctx_mpi) - 1, GGML_MPI_SYNC_LOGITS); // printf("\nReceived %zu logits, logits_out.size = %zu\n", n_vocab * n_tokens, logits_out.size()); // printf("batch: %s\n", LOG_BATCH_TOSTR_PRETTY(&lctx, batch).c_str()); // for (auto logit : logits_out) { @@ -5759,20 +5759,19 @@ static struct ggml_cgraph * llama_decode_internal_phased( // TODO: do not compute and extract logits if only embeddings are needed // need to update the graphs to skip "result_output" { - memcpy(logits_out.data(), net_output, sizeof(float)*n_vocab*n_tokens); - -// if (batch.logits) { -// for (uint32_t i = 0; i < n_tokens; i++) { -// if (batch.logits[i] == 0) { -// continue; -// } -// memcpy(logits_out.data() + (n_vocab*i), net_output + (n_vocab*i), sizeof(float)*n_vocab); -// } -// } else if (lctx.logits_all) { -// memcpy(logits_out.data(), net_output, sizeof(float)*n_vocab*n_tokens); -// } else { -// memcpy(logits_out.data(), net_output + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); -// } + + if (batch.logits) { + for (uint32_t i = 0; i < n_tokens; i++) { + if (batch.logits[i] == 0) { + continue; + } + memcpy(logits_out.data() + (n_vocab*i), net_output + (n_vocab*i), sizeof(float)*n_vocab); + } + } else if (lctx.logits_all) { + memcpy(logits_out.data(), net_output, sizeof(float)*n_vocab*n_tokens); + } else { + memcpy(logits_out.data(), net_output + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); + } } // extract embeddings @@ -5787,7 +5786,7 @@ static struct ggml_cgraph * llama_decode_internal_phased( } if (ggml_mpi_size(lctx.ctx_mpi) > 1 && ggml_mpi_rank(lctx.ctx_mpi) == ggml_mpi_size(lctx.ctx_mpi) - 1) { // printf("\nSent %zu logits, logits_out.size = %zu\nbatch: %s\n", n_vocab * n_tokens, logits_out.size(), LOG_BATCH_TOSTR_PRETTY(&lctx, batch).c_str()); - ggml_mpi_send_float_array_async(lctx.ctx_mpi, logits_out.data(), n_vocab * n_tokens, 0, GGML_MPI_SYNC_LOGITS); + ggml_mpi_send_float_array_async(lctx.ctx_mpi, logits_out.data(), (batch.logits || lctx.logits_all) ? 
n_vocab * n_tokens : n_vocab, 0, GGML_MPI_SYNC_LOGITS); // llama_kv_cache_view view = llama_kv_cache_view_init(&lctx, 21); // llama_kv_cache_view_update(&lctx, &view); // printf("Cache view:\n%s\n", dump_kv_cache_view_seqs(view, 1).c_str()); From d3baaf7687a91f021ff4b44fd363f9d5a9c148d1 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Fri, 19 Jan 2024 17:53:39 -0600 Subject: [PATCH 61/63] Both main and speculative mostly working, add latencies to speculative, change timing to thread cycles --- examples/main/main.cpp | 8 +++- examples/speculative/speculative.cpp | 56 ++++++++++++++++++++++++---- ggml-mpi.c | 39 ++++++++++++++++++- ggml-mpi.h | 4 ++ ggml.c | 4 +- llama.cpp | 26 ++++++++++--- llama.h | 3 +- 7 files changed, 122 insertions(+), 18 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 5de7d144513..2a9965cb4d1 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -235,7 +235,7 @@ int main(int argc, char ** argv) { LOG("add_bos: %d\n", add_bos); std::vector embd_inp; - + int n_past = 0; if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); if (params.chatml) { @@ -249,6 +249,10 @@ int main(int argc, char ** argv) { } llama_decode(ctx, batch); + llama_token last = embd_inp.back(); + n_past = embd_inp.size()-2; + embd_inp.clear(); + embd_inp.push_back(last); } else { LOG("use session tokens\n"); embd_inp = session_tokens; @@ -465,7 +469,7 @@ int main(int argc, char ** argv) { bool input_echo = true; bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); - int n_past = 0; + int n_remain = params.n_predict; int n_consumed = 0; int n_session_consumed = 0; diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 4f7a3162a33..50b3d4ffc21 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -299,6 +299,12 @@ int main(int argc, char ** argv) { int spec_past_tgt = n_past_tgt; int spec_past_dft = n_past_dft; + long ttft = ggml_time_us(); + std::vector inter_token_times; + int64_t itt_start; + bool first_token = false; + bool has_run_first_token = false; + bool first_run = true; llama_token id; while (true) { @@ -456,6 +462,18 @@ int main(int argc, char ** argv) { if (current_run.speculative) { n_accept++; } + + if (has_run_first_token) { + if (first_token) { + ttft = ggml_time_us() - ttft; + LOG("\nTTFT: %ld\n", ttft); + first_token = false; + } else { + inter_token_times.push_back(ggml_time_us() - itt_start); + } + + itt_start = ggml_time_us(); + } llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); // Root of WORLD @@ -607,6 +625,12 @@ int main(int argc, char ** argv) { begin_non_spec_run(params, n_seq_dft, ctx_dft, max_seq, drafts, id, batch_id, n_past_dft, n_past_dft, dft_cgraphs, kvc_view_dft); + if (!has_run_first_token) { + + has_run_first_token = true; + first_token = true; + } + seq_async_run dft_run = dft_cgraphs.back(); dft_cgraphs.pop_back(); llama_finish_async_decode(*ctx_dft, dft_run.batch, dft_run.cgraph); @@ -648,11 +672,21 @@ int main(int argc, char ** argv) { auto t_dec_end = ggml_time_us(); + uint64_t avg_itt = 0; + for (auto latency : inter_token_times) { + avg_itt += latency; + } + + avg_itt = avg_itt / inter_token_times.size(); + LOG_TEE("\n\n"); LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); 
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - + LOG_TEE("Average inter-token latency: %f seconds\n", avg_itt / 1e6f); + LOG_TEE("Time-to-first-token: %f seconds\n", ttft / 1e6f); + + LOG_TEE("\n"); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); @@ -671,7 +705,17 @@ int main(int argc, char ** argv) { llama_sampling_free(drafts[s].ctx_sampling); } - llama_batch_free(batch_dft); + if (llama_node_id(ctx_tgt) == 0) { + for (size_t i = tgt_cgraphs.size() - 1; i >= 0; i--) { + const auto &run = tgt_cgraphs[i]; + llama_finish_async_decode(*ctx_tgt, run.batch, run.cgraph); + } + } + + for (size_t i = dft_cgraphs.size()-1; i >= 0; i--) { + const auto& run = dft_cgraphs[i]; + llama_finish_async_decode(*ctx_dft, run.batch, run.cgraph); + } llama_free(ctx_tgt); llama_free_model(model_tgt); @@ -852,10 +896,6 @@ bool start_async_spec_run(const gpt_params ¶ms, llama_context *ctx_tgt, llam } - if (n_predict > params.n_predict || has_eos) { - return true; - } - llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); int n_seq_cur = 0; @@ -1078,7 +1118,9 @@ bool start_async_spec_run(const gpt_params ¶ms, llama_context *ctx_tgt, llam //drafts[s].tokens.erase(drafts[s].tokens.begin()); } - + if (first_run) { + ++n_drafted; + } begin_async_run(params.sparams, params.n_parallel, ctx_tgt, max_seq, n_past_dft, drafts, tgt_cgraphs, batch_id, spec_past_tgt, kvc, true, batch_tgt, spec_past_tgt + drafts[0].tokens.size(), prefix_n_past, current_run.seq_offset); diff --git a/ggml-mpi.c b/ggml-mpi.c index 11178a93dc7..b1e9b9b047c 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -108,7 +108,7 @@ void ggml_mpi_free(struct ggml_mpi_context * ctx) { ggml_mpi_sync_pipelined(ctx, NULL, 0, MPI_INT8_T, GGML_MPI_SHUTDOWN); int buffer_size = 128*1024*1024; - MPI_Buffer_detach(ctx->send_buffer, &buffer_size); + MPI_Buffer_detach(&ctx->send_buffer, &buffer_size); MPI_Comm_free(&(ctx->comm)); free(ctx); } @@ -253,7 +253,44 @@ bool ggml_mpi_eval_init( ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, GGML_MPI_N_TOKENS); + int8_t* temp_logits = (int8_t*) calloc(*n_tokens, sizeof(int8_t)); + if (ctx_mpi->rank == 0 && *logits != NULL) { + ggml_mpi_sync_pipelined(ctx_mpi, *logits, *n_tokens, MPI_INT8_T, GGML_MPI_BATCH_LOGITS); + } else { + ggml_mpi_sync_pipelined(ctx_mpi, temp_logits, *n_tokens, MPI_INT8_T, GGML_MPI_BATCH_LOGITS); + } + + + + + + + + if (ctx_mpi->rank != 0) { + bool should_set_batch_logits = false; + for (int i = 0; i < *n_tokens; i++) { + if (temp_logits[i]) { + should_set_batch_logits = true; + break; + } + } + if (should_set_batch_logits) { + if (*logits != NULL) { + free(*logits); + *logits = NULL; + } + *logits = temp_logits; + } else { + if (*logits != NULL) { + free(*logits); + *logits = NULL; + } + free(temp_logits); + } + } else { + free(temp_logits); + } // For now, we assume that the pos, seq_ids, tokens, etc have been // pre-allocated for the largest possible sizes, even on worker nodes. 
diff --git a/ggml-mpi.h b/ggml-mpi.h
index 1b8c3d091ee..ca2365862c1 100644
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -49,6 +49,10 @@ extern "C" {
 
 #define GGML_MPI_BEGIN_TRANSACTION 18
 
+#define GGML_MPI_MAX_N_SEQ 19
+
+#define GGML_MPI_BATCH_LOGITS 20
+
 /**
  * The context used for MPI operations,
  * a program may make use of more than one
diff --git a/ggml.c b/ggml.c
index ce495a7e2e8..6b1275cf3ee 100644
--- a/ggml.c
+++ b/ggml.c
@@ -362,13 +362,13 @@ int64_t ggml_time_us(void) {
 void ggml_time_init(void) {}
 int64_t ggml_time_ms(void) {
     struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
+    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
     return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
 }
 
 int64_t ggml_time_us(void) {
     struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
+    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
     return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
 }
 #endif
diff --git a/llama.cpp b/llama.cpp
index e3bd56db936..47d897fb15f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5463,6 +5463,10 @@ static struct ggml_cgraph * llama_decode_internal_phased(
         llama_batch & batch,
         uint8_t phase,
         ggml_cgraph * cgraph) {
+    if (ggml_mpi_rank(lctx.ctx_mpi) < 0) {
+        return nullptr;
+    }
+
     if (phase == 0) {
         if (ggml_mpi_rank(lctx.ctx_mpi) == 0 && ggml_mpi_size(lctx.ctx_mpi) > 1) {
             int transaction_type = GGML_MPI_DECODE;
@@ -5471,6 +5475,8 @@ static struct ggml_cgraph * llama_decode_internal_phased(
 
             ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.batch_id, 1, GGML_MPI_BATCH_ID);
             ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.n_tokens, 1, GGML_MPI_N_TOKENS);
+
+            ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.max_n_seq, 1, GGML_MPI_MAX_N_SEQ);
         }
         uint32_t n_tokens = batch.n_tokens;
         if (n_tokens == 0) {
@@ -5487,7 +5493,7 @@ static struct ggml_cgraph * llama_decode_internal_phased(
     GGML_ASSERT(n_tokens <= n_batch);
 
     int n_threads = n_tokens == 1 ?
cparams.n_threads : cparams.n_threads_batch; - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + const int64_t t_start_us = ggml_time_us(); @@ -5523,7 +5529,7 @@ static struct ggml_cgraph * llama_decode_internal_phased( seq_id_arr.resize(n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { n_seq_id[i] = 1; - seq_id[i].resize(1); + seq_id[i].resize(batch.max_n_seq); seq_id[i][0] = batch.all_seq_id; seq_id_arr[i] = seq_id[i].data(); } @@ -5548,6 +5554,8 @@ static struct ggml_cgraph * llama_decode_internal_phased( } n_tokens = batch.n_tokens; #endif + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT if (!llama_kv_cache_find_slot(kv_self, batch)) { printf("Cannot find cache slot\n"); return nullptr; @@ -9837,12 +9845,12 @@ struct llama_batch llama_batch_get_one( /*all_pos_0 =*/ pos_0, /*all_pos_1 =*/ 1, /*all_seq_id =*/ seq_id, - 0 + 0, 1 }; } struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { - llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0,}; + llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, n_seq_max}; if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); @@ -9861,7 +9869,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_se return batch; } -void llama_batch_free(struct llama_batch batch) { +void llama_batch_free(struct llama_batch & batch) { if (batch.token) free(batch.token); if (batch.embd) free(batch.embd); if (batch.pos) free(batch.pos); @@ -9873,6 +9881,13 @@ void llama_batch_free(struct llama_batch batch) { free(batch.seq_id); } if (batch.logits) free(batch.logits); + + batch.token = nullptr; + batch.embd = nullptr; + batch.pos = nullptr; + batch.n_seq_id = nullptr; + batch.seq_id = nullptr; + batch.logits = nullptr; } #ifdef GGML_USE_MPI @@ -9887,6 +9902,7 @@ int llama_process_mpi_transaction( switch (tag) { case GGML_MPI_DECODE: +// llama_batch_free(batch); return llama_decode_internal(*ctx, batch); break; case GGML_MPI_KV_CLEAR: diff --git a/llama.h b/llama.h index 77112853113..7ccbdf03064 100644 --- a/llama.h +++ b/llama.h @@ -158,6 +158,7 @@ extern "C" { llama_pos all_pos_1; // used if pos == NULL llama_seq_id all_seq_id; // used if seq_id == NULL int32_t batch_id; + int32_t max_n_seq; } llama_batch; struct llama_model_params { @@ -581,7 +582,7 @@ LLAMA_API void llama_kv_cache_seq_cp_sync_bi( int32_t n_seq_max); // Frees a batch of tokens allocated with llama_batch_init() - LLAMA_API void llama_batch_free(struct llama_batch batch); + LLAMA_API void llama_batch_free(struct llama_batch & batch); // Positive return values does not mean a fatal error, but rather a warning. 
// 0 - success

From d23b996b369bd800598e3deeda435e46f4c62e26 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Sun, 21 Jan 2024 15:02:43 -0600
Subject: [PATCH 62/63] Add latencies to main

---
 common/common.cpp      |  4 +++-
 examples/main/main.cpp | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 055d8178ca8..a7ffe01677c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -997,7 +997,9 @@ void llama_batch_add(
     for (size_t i = 0; i < seq_ids.size(); ++i) {
         batch.seq_id[batch.n_tokens][i] = seq_ids[i];
     }
-    batch.logits [batch.n_tokens] = logits;
+    if (batch.logits) {
+        batch.logits[batch.n_tokens] = logits;
+    }
 
     batch.n_tokens++;
 }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 2a9965cb4d1..842a5e2a465 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -487,6 +487,12 @@ int main(int argc, char ** argv) {
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
 
+    long ttft = ggml_time_ms();
+    std::vector<int64_t> inter_token_times;
+    int64_t itt_start;
+    bool first_token = false;
+    bool has_run_first_token = false;
+
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (!embd.empty()) {
@@ -640,6 +646,18 @@ int main(int argc, char ** argv) {
                 LOG("saved session to %s\n", path_session.c_str());
             }
 
+            if (has_run_first_token) {
+                if (first_token) {
+                    ttft = ggml_time_ms() - ttft;
+                    LOG("\nTTFT: %ld\n", ttft);
+                    first_token = false;
+                } else {
+                    inter_token_times.push_back(ggml_time_ms() - itt_start);
+                }
+
+                itt_start = ggml_time_ms();
+            }
+
             const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
 
             llama_sampling_accept(ctx_sampling, ctx, id, true);
@@ -655,6 +673,13 @@ int main(int argc, char ** argv) {
             --n_remain;
 
             LOG("n_remain: %d\n", n_remain);
+
+            if (!has_run_first_token && (int) embd_inp.size() <= n_consumed) {
+
+                has_run_first_token = true;
+                first_token = true;
+                ttft = ggml_time_ms();
+            }
         } else {
             // some user input remains from prompt or interaction, forward it to processing
             LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
@@ -867,6 +892,16 @@ int main(int argc, char ** argv) {
     llama_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
+    uint64_t avg_itt = 0;
+    for (auto latency : inter_token_times) {
+        avg_itt += latency;
+    }
+
+    avg_itt = inter_token_times.empty() ? 0 : avg_itt / inter_token_times.size();
+
+    LOG_TEE("Average inter-token latency: %ld milliseconds\n", avg_itt);
+    LOG_TEE("Time-to-first-token: %ld milliseconds\n", ttft);
+
     if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);
     llama_free_model(model);

From d6a70a91710bab069e1cb98a7591f1951edf7413 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Sat, 3 Feb 2024 13:04:22 -0600
Subject: [PATCH 63/63] Add p_decay

---
 common/common.cpp                    | 11 +++++-
 common/common.h                      |  1 +
 examples/speculative/speculative.cpp | 59 +++++++++++++++++-----------
 3 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index a7ffe01677c..99fa55fefde 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -448,12 +448,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            break;
        }
        params.p_split = std::stof(argv[i]);
-    }else if (arg == "--p-recovery" || arg == "-pr") {
+    } else if (arg == "--p-recovery" || arg == "-pr") {
        if (++i >= argc) {
            invalid_param = true;
            break;
        }
        params.p_recovery = std::stof(argv[i]);
+    } else if (arg == "--p-decay" || arg == "-pd") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.p_decay = std::stof(argv[i]);
     } else if (arg == "-m" || arg == "--model") {
        if (++i >= argc) {
            invalid_param = true;
@@ -860,7 +866,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
     printf("  -pa N, --p-accept N   speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
     printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
-    printf("  -pr N, --p-recovery N PipeInfer recovery probability (default: %.1f)\n", (double)params.p_recovery);
+    printf("  -pr N, --p-recovery N PipeInfer probability recovery (default: %.1f)\n", (double)params.p_recovery);
+    printf("  -pd N, --p-decay N    PipeInfer probability decay (default: %.1f)\n", (double)params.p_decay);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
     printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
diff --git a/common/common.h b/common/common.h
index 635a5e2269a..6053bb0d2ea 100644
--- a/common/common.h
+++ b/common/common.h
@@ -58,6 +58,7 @@ struct gpt_params {
     float   p_accept   = 0.5f;  // speculative decoding accept probability
     float   p_split    = 0.1f;  // speculative decoding split probability
     float   p_recovery = 0.0f;  // Cumulative probability that p_accept and p_split are increased by per-iteration.
+    float   p_decay    = 0.0f;  // Cumulative amount by which p_accept and p_split are decreased per iteration when drafting stops due to p_accept.
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 50b3d4ffc21..591b3f21fdf 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -54,11 +54,11 @@ void begin_async_run(const llama_sampling_params& sparams, int n_seq_dft, bool start_async_spec_run(const gpt_params ¶ms, llama_context *ctx_tgt, llama_context *ctx_dft, std::deque &free_sequence_offsets, int max_seq, llama_batch &batch_tgt, int n_predict, - int prefix_n_past, int n_past_dft, bool has_eos, llama_sampling_context *ctx_sampling, + int prefix_n_past, int n_past_dft, llama_sampling_context *ctx_sampling, std::deque &tgt_cgraphs, const seq_async_run ¤t_run, int &spec_past_tgt, int &spec_past_dft, int first_run, int orig_offset, int32_t &batch_id, llama_batch &batch_dft, int &n_drafted, std::vector &drafts, llama_token &id, - llama_kv_cache_view &kvc, int iter); + llama_kv_cache_view &kvc, float p_adjust, int &n_reject); void begin_non_spec_run(const gpt_params ¶ms, int n_seq_dft, llama_context *ctx, int max_seq, const std::vector &drafts, llama_token id, int32_t &batch_id, int &n_past, int n_past_dft, @@ -66,11 +66,13 @@ void begin_non_spec_run(const gpt_params ¶ms, int n_seq_dft, llama_context * void run_speculation_loop(const gpt_params ¶ms, const float p_accept, llama_context *ctx_tgt, llama_context *ctx_dft, - int max_seq, llama_batch &batch_tgt, int n_predict, int n_past_tgt, int n_past_dft, - bool has_eos, llama_sampling_context *ctx_sampling, int & spec_past_tgt, int & spec_past_dft, - bool & first_run, std::deque &free_sequence_offsets, int32_t &batch_id, llama_batch &batch_dft, - int &n_drafted, std::vector &drafts, std::deque &tgt_cgraphs, - seq_async_run ¤t_run, llama_kv_cache_view &kvc_view_dft, llama_token &id); + const int max_seq, llama_batch &batch_tgt, int n_predict, int n_past_tgt, int n_past_dft, + llama_sampling_context *ctx_sampling, int &spec_past_tgt, int &spec_past_dft, bool &first_run, + std::deque &free_sequence_offsets, int32_t &batch_id, llama_batch &batch_dft, int &n_drafted, + std::vector &drafts, std::deque &tgt_cgraphs, + seq_async_run ¤t_run, llama_kv_cache_view &kvc_view_dft, llama_token &id, int &n_rejected); + +float calc_p_adjust(const gpt_params ¶ms, int iter, int n_reject); int main(int argc, char ** argv) { gpt_params params; @@ -307,6 +309,9 @@ int main(int argc, char ** argv) { bool first_run = true; llama_token id; + + int n_rejected = 0; + while (true) { @@ -389,10 +394,10 @@ int main(int argc, char ** argv) { first_run = true; } else if (!tgt_cgraphs.empty()) { - run_speculation_loop(params, p_accept, ctx_tgt, ctx_dft, max_seq, batch_tgt, n_predict, n_past_tgt, n_past_dft, - has_eos, ctx_sampling, + run_speculation_loop(params, p_accept, ctx_tgt, ctx_dft, max_seq, batch_tgt, n_predict, n_past_tgt, + n_past_dft, ctx_sampling, spec_past_tgt, spec_past_dft, first_run, free_sequence_offsets, batch_id, batch_dft, - n_drafted, drafts, tgt_cgraphs, current_run, kvc_view_dft, id); + n_drafted, drafts, tgt_cgraphs, current_run, kvc_view_dft, id, n_rejected); continue; } @@ -570,6 +575,8 @@ int main(int argc, char ** argv) { continue; } + n_rejected = 0; + check_for_cancel(ctx_tgt, n_past_tgt, tgt_cgraphs, generated, n_seq_dft); @@ -656,9 +663,9 @@ int 
main(int argc, char ** argv) { // bool is_waiting = false; run_speculation_loop(params, p_accept, ctx_tgt, ctx_dft, max_seq, batch_tgt, n_predict, n_past_tgt, n_past_dft, - has_eos, ctx_sampling, + ctx_sampling, spec_past_tgt, spec_past_dft, first_run, free_sequence_offsets, batch_id, batch_dft, - n_drafted, drafts, tgt_cgraphs, current_run, kvc_view_dft, id); + n_drafted, drafts, tgt_cgraphs, current_run, kvc_view_dft, id, n_rejected); if (n_predict > params.n_predict || has_eos) { @@ -733,10 +740,10 @@ int main(int argc, char ** argv) { void run_speculation_loop(const gpt_params ¶ms, const float p_accept, llama_context *ctx_tgt, llama_context *ctx_dft, const int max_seq, llama_batch &batch_tgt, int n_predict, int n_past_tgt, int n_past_dft, - bool has_eos, llama_sampling_context *ctx_sampling, int &spec_past_tgt, int &spec_past_dft, - bool & first_run, std::deque &free_sequence_offsets, int32_t &batch_id, llama_batch &batch_dft, - int &n_drafted, std::vector &drafts, std::deque &tgt_cgraphs, - seq_async_run ¤t_run, llama_kv_cache_view &kvc_view_dft, llama_token &id) { + llama_sampling_context *ctx_sampling, int &spec_past_tgt, int &spec_past_dft, bool &first_run, + std::deque &free_sequence_offsets, int32_t &batch_id, llama_batch &batch_dft, int &n_drafted, + std::vector &drafts, std::deque &tgt_cgraphs, + seq_async_run ¤t_run, llama_kv_cache_view &kvc_view_dft, llama_token &id, int &n_rejected) { bool is_waiting = llama_mpi_iprobe(ctx_tgt); llama_swap_comm(ctx_tgt); llama_sync_token(ctx_tgt, reinterpret_cast(&is_waiting), 0); @@ -749,7 +756,8 @@ run_speculation_loop(const gpt_params ¶ms, const float p_accept, llama_conte free_sequence_offsets.push_back(current_run.seq_offset); } int iter = 0; - while((!is_waiting && (p_accept + iter * params.p_recovery) < 1.0)) { + float p_adjust = calc_p_adjust(params, iter, n_rejected); + while((!is_waiting && p_accept + (p_adjust = calc_p_adjust(params, iter, n_rejected)) < 1.0)) { @@ -790,10 +798,10 @@ run_speculation_loop(const gpt_params ¶ms, const float p_accept, llama_conte if (start_async_spec_run(params, ctx_tgt, ctx_dft, free_sequence_offsets, max_seq, - batch_tgt, n_predict, n_past_tgt, n_past_dft, has_eos, ctx_sampling, + batch_tgt, n_predict, n_past_tgt, n_past_dft, ctx_sampling, tgt_cgraphs, current_run, spec_past_tgt, spec_past_dft, first_run, orig_offset, - batch_id, batch_dft, n_drafted, drafts, id, kvc_view_dft, iter)) { + batch_id, batch_dft, n_drafted, drafts, id, kvc_view_dft, p_adjust, n_rejected)) { LOG("Ending spec run because returned true\n"); break; } @@ -810,6 +818,10 @@ run_speculation_loop(const gpt_params ¶ms, const float p_accept, llama_conte } } +float calc_p_adjust(const gpt_params ¶ms, int iter, int n_reject) { + return iter * params.p_recovery - std::max(n_reject * params.p_decay, 0.0f); +} + void begin_non_spec_run(const gpt_params ¶ms, const int n_seq_dft, llama_context *ctx, const int max_seq, const std::vector &drafts, llama_token id, int32_t &batch_id, int &n_past, int n_past_dft, @@ -843,11 +855,11 @@ void begin_non_spec_run(const gpt_params ¶ms, const int n_seq_dft, llama_con bool start_async_spec_run(const gpt_params ¶ms, llama_context *ctx_tgt, llama_context *ctx_dft, std::deque &free_sequence_offsets, int max_seq, llama_batch &batch_tgt, int n_predict, - int prefix_n_past, int n_past_dft, bool has_eos, llama_sampling_context *ctx_sampling, + int prefix_n_past, int n_past_dft, llama_sampling_context *ctx_sampling, std::deque &tgt_cgraphs, const seq_async_run ¤t_run, int &spec_past_tgt, int &spec_past_dft, 
int first_run, int orig_offset, int32_t &batch_id, llama_batch &batch_dft, int &n_drafted, std::vector &drafts, llama_token &id, - llama_kv_cache_view &kvc, const int iter) { + llama_kv_cache_view &kvc, float p_adjust, int &n_reject) { LOG("Doing speculative run, seq_offset = %d, spec_past_tgt = %d, spec_past_dft = %d, prefix_n_past = %d, n_past_dft = %d\n", current_run.seq_offset, spec_past_tgt, spec_past_dft, prefix_n_past, n_past_dft); @@ -958,7 +970,7 @@ bool start_async_spec_run(const gpt_params ¶ms, llama_context *ctx_tgt, llam } - if (cur_p[0].p < params.p_accept + params.p_recovery * iter) { + if (cur_p[0].p < params.p_accept + p_adjust) { LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, params.p_accept); drafts[s].drafting = false; @@ -970,7 +982,7 @@ bool start_async_spec_run(const gpt_params ¶ms, llama_context *ctx_tgt, llam // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { - if (n_seq_cur < params.n_parallel - 1 && cur_p[f].p > params.p_split + params.p_recovery * iter) { + if (n_seq_cur < params.n_parallel - 1 && cur_p[f].p > params.p_split + p_adjust) { n_seq_cur++; LOG("splitting seq %3d into %3d\n", s, n_seq_cur); @@ -1093,6 +1105,7 @@ bool start_async_spec_run(const gpt_params ¶ms, llama_context *ctx_tgt, llam // fprintf(stderr, "\nNo tgt tokens, pushing seq offset %d to free seq offsets\n", current_run.seq_offset); // fflush(stderr); free_sequence_offsets.push_back(current_run.seq_offset); + n_reject++; return true; }
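
A note on how `--p-recovery` and `--p-decay` interact: the standalone sketch below reproduces the `calc_p_adjust` formula added in this patch so the threshold schedule can be inspected in isolation. The `params_sketch` struct and the sample values (0.5 / 0.05 / 0.02) are illustrative stand-ins for the corresponding `gpt_params` fields, not part of the change.

```cpp
// Sketch of the drafting-threshold schedule; mirrors calc_p_adjust() above.
// params_sketch and its sample values are placeholders for illustration only.
#include <algorithm>
#include <cstdio>

struct params_sketch {
    float p_accept   = 0.5f;  // base threshold a draft token must stay above
    float p_recovery = 0.05f; // added to the threshold per speculation iteration
    float p_decay    = 0.02f; // subtracted in proportion to rejected runs
};

// Recovery raises the threshold as iterations accumulate; decay lowers it
// after rejected runs (std::max keeps the decay term non-negative).
static float calc_p_adjust(const params_sketch & p, int iter, int n_reject) {
    return iter * p.p_recovery - std::max(n_reject * p.p_decay, 0.0f);
}

int main() {
    params_sketch p;
    for (int n_reject = 0; n_reject <= 4; n_reject += 2) {
        for (int iter = 0; iter <= 10; ++iter) {
            const float threshold = p.p_accept + calc_p_adjust(p, iter, n_reject);
            // The speculation loop keeps running while the threshold is below 1.0;
            // drafting for a sequence stops once its top candidate falls below it.
            printf("n_reject=%d iter=%2d -> threshold %.3f%s\n",
                   n_reject, iter, threshold, threshold >= 1.0f ? " (loop ends)" : "");
        }
    }
    return 0;
}
```

Under this formula, each additional iteration pushes the effective p_accept/p_split toward 1.0 and eventually terminates the speculation loop, while each rejected run (n_reject is incremented when a speculative run yields no target tokens) pulls the thresholds back down so subsequent drafting is more permissive.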