diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3eda21a562d..ece1fc8a541 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -36,11 +36,19 @@ #include GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, + ModelParams & model_params, + ComputeParams & compute_params, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights, - bool is_static) : + bool is_static, + bool is_prefill, + int prefill_chunk_size) : m_is_static(is_static), + m_is_prefill(is_prefill), + m_prefill_chunk_size(prefill_chunk_size), m_cgraph(cgraph), - m_model_weights(model_weights) { + m_model_weights(model_weights), + m_model_params(model_params), + m_compute_params(compute_params) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { #ifdef _WIN32 _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); @@ -50,7 +58,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, print_tensor_address_map(cgraph); } - set_llm_params(); validate_cgraph(); for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -82,32 +89,60 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<void *, ggml_tensor *> data_addr_map; + std::unordered_set<std::string> output_name_set; + for (const auto & node_info : m_node_info_list) { + for (const auto & it : node_info.node_inputs) { + const auto & src_name = it.first; + const auto & src_node = it.second; + + if (output_name_set.find(src_name) == output_name_set.end() && + m_model_weights.find(src_name) == m_model_weights.end() && + m_model_inputs.find(src_name) == m_model_inputs.end()) { + auto param_node = + std::make_shared<ov::op::v0::Parameter>(get_ov_type(src_node), ov::Shape(get_shape(src_node))); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + } + } + output_name_set.emplace(node_info.node_output_name); + data_addr_map[node_info.data_addr] = node_info.node_output; + } + for (const auto & it : data_addr_map) { + // No need to add view tensors as model outputs + if (it.second->op != GGML_OP_VIEW) { + m_model_outputs[std::string(it.second->name)] = it.second; + } + } } -// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; -// 2. constructing a decoder for a node; -// 3. constructing a decoder for the whole graph naively (op test case) void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { - std::string node_name; NodeInfo current_node_info; + auto node_name = std::string(node->name); + auto node_output_name = node_name; + auto * node_output = node; if (node->op == GGML_OP_SET_ROWS) { // SET_ROWS updates the tensor in place.
For later ov ops that use the // view_src of SET_ROWS, we need to make sure they get the updated tensor // by putting the view_src name in the tensor_map in // /src/frontends/ggml/src/translate_session.cpp - node_name = std::string(node->view_src->name); - } else { - node_name = std::string(node->name); + node_output_name = std::string(node->view_src->name); + node_output = node->view_src; } - m_output_names.push_back(node_name); - m_outputs[node_name] = node; + m_output_names.push_back(node_output_name); + m_outputs[node_output_name] = node_output; current_node_info.node = node; current_node_info.node_name = node_name; - current_node_info.node_outputs[node_name] = node; - current_node_info.node_outputs_names.push_back(node_name); + current_node_info.node_output = node_output; + current_node_info.node_output_name = node_output_name; current_node_info.node_op_case = 0; + current_node_info.data_addr = node->data; for (int i = 0; i < GGML_MAX_SRC; i++) { auto * src = node->src[i]; @@ -120,17 +155,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); - // Add model inputs and weights constants, if called for the whole graph - if (naive) { - if (m_model_weights.find(src_name) == m_model_weights.end()) { - auto param_node = - std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(node, src)); - param_node->set_friendly_name(src_name); - param_node->output(0).get_tensor().set_names({src_name}); - m_model_inputs[src_name] = param_node; - } - - } else if (!src->view_src) { + // Add model inputs + if (!naive && !src->view_src) { ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { @@ -150,24 +176,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } } - // Add model outputs, if called for the whole graph - if (naive) { - m_model_output_names.push_back(node_name); - } else { + // Add model outputs + if (!naive) { // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set<std::string> debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || - node_name.find("output") != std::string::npos || debug_output_names.count(node_name)) { - if (node->op == GGML_OP_SET_ROWS) { - assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0); - if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) { - m_kv_names.push_back(node_name); - } - } - if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), node_name); - it == m_model_output_names.end()) { - m_model_output_names.push_back(node_name); + node_output_name.find("output") != std::string::npos || debug_output_names.count(node_output_name)) { + if (m_model_outputs.find(node_output_name) == m_model_outputs.end()) { + m_model_outputs[node_output_name] = node_output; } } } @@ -175,7 +192,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { m_node_info_list.push_back(current_node_info); } -int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) { +int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { int op_case = 0; switch (node->op) { case GGML_OP_RESHAPE: { @@ -273,9 +290,11 @@ int extract_layer_from_name(const std::string & name) {
return layer; } -void GgmlOvDecoder::set_llm_params() { - for (int i = 0; i < m_cgraph->n_nodes; i++) { - auto * node = m_cgraph->nodes[i]; +std::pair GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) { + ModelParams model_params; + ComputeParams compute_params; + for (int i = 0; i < cgraph->n_nodes; i++) { + auto * node = cgraph->nodes[i]; std::string name = std::string(node->name); if (node->op == GGML_OP_FLASH_ATTN_EXT) { auto * cache_k_perm = node->src[1]; @@ -290,49 +309,50 @@ void GgmlOvDecoder::set_llm_params() { assert(mask_name.find("KQ_mask") == 0); if (std::string(node->src[3]->name).find("swa") != std::string::npos) { - m_swa_layers.push_back(layer); - m_ctx_per_seq_swa = cache_k->ne[1]; + model_params.swa_layers.push_back(layer); + model_params.ctx_per_seq_swa = cache_k->ne[1]; } else { - m_ctx_per_seq = cache_k->ne[1]; - m_n_seq = cache_k->ne[2]; + model_params.ctx_per_seq = cache_k->ne[1]; + model_params.n_seq = cache_k->ne[2]; } - m_n_seq_active = mask->ne[3]; + compute_params.n_seq_active = mask->ne[3]; auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type); size_t offset; memcpy(&offset, cache_k_view->op_params, sizeof(size_t)); - m_seq_active_start = offset / seq_size; - m_token_len_per_seq = node->ne[2]; + compute_params.seq_active_start = offset / seq_size; + compute_params.token_len_per_seq = node->ne[2]; if (mask_name.find("swa") != std::string::npos) { - m_attention_size_swa = mask->ne[0]; + compute_params.attention_size_swa = mask->ne[0]; } else { - m_attention_size = mask->ne[0]; + compute_params.attention_size = mask->ne[0]; } - if (m_is_static) { - m_attention_size = m_ctx_per_seq; - m_attention_size_swa = m_ctx_per_seq_swa; - m_token_len_per_seq = 1; + if (is_static) { + compute_params.attention_size = model_params.ctx_per_seq; + compute_params.attention_size_swa = model_params.ctx_per_seq_swa; + compute_params.token_len_per_seq = 1; } } else if (node->op == GGML_OP_ROPE) { if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) { - m_head_size = node->ne[0]; - m_n_heads = node->ne[1]; - m_rope_params = node->op_params; + model_params.head_size = node->ne[0]; + model_params.n_heads = node->ne[1]; + model_params.rope_params = node->op_params; auto * inp_pos = node->src[1]; - m_input_len = inp_pos->ne[0]; + compute_params.input_len = inp_pos->ne[0]; } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) { - m_n_heads_kv = node->ne[1]; + model_params.n_heads_kv = node->ne[1]; } } } - m_ctx = m_ctx_per_seq * m_n_seq; - m_ctx_swa = m_ctx_per_seq_swa * m_n_seq; + model_params.ctx = model_params.ctx_per_seq * model_params.n_seq; + model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq; + return {model_params, compute_params}; } void GgmlOvDecoder::validate_cgraph() const { - if (m_n_seq > 1 && m_is_static == true) { + if (m_model_params.n_seq > 1 && m_is_static == true) { throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1."); } } @@ -341,12 +361,16 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co auto name = std::string(input->name); ov::PartialShape input_shape; - if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") { + if (name == "inp_tokens" || name == "inp_pos") { + int len = m_is_static ? (m_is_prefill ? 
m_prefill_chunk_size : 1) : -1; + input_shape = ov::PartialShape{1, 1, 1, len}; + + } else if (name == "inp_out_ids") { input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; } else if (name.find("KQ_mask") == 0) { if (m_is_static) { - input_shape = ov::PartialShape{1, 1, 1, m_ctx}; + input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx}; } else { input_shape = ov::PartialShape{-1, 1, -1, -1}; } @@ -359,11 +383,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) { - input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; + int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; + input_shape = ov::PartialShape{1, 1, 1, len}; - } else if (input->op == GGML_OP_VIEW) { - // This case is added to make test-backend-ops work - input_shape = ov::PartialShape{get_shape(input->view_src)}; } else { input_shape = ov::PartialShape{get_shape(input)}; } @@ -394,14 +416,14 @@ void GgmlOvDecoder::add_extra_inputs() { } }; - create_1d_input("attention_size", m_attention_size); - if (m_attention_size_swa != -1) { - create_1d_input("attention_size_swa", m_attention_size_swa); + create_1d_input("attention_size", m_compute_params.attention_size); + if (m_compute_params.attention_size_swa != -1) { + create_1d_input("attention_size_swa", m_compute_params.attention_size_swa); } - create_1d_input("n_seq_active", m_n_seq_active); - create_1d_input("seq_active_start", m_seq_active_start); - create_1d_input("seq_active_end", m_seq_active_start + m_n_seq_active); - create_1d_input("token_len_per_seq", m_token_len_per_seq); + create_1d_input("n_seq_active", m_compute_params.n_seq_active); + create_1d_input("seq_active_start", m_compute_params.seq_active_start); + create_1d_input("seq_active_end", m_compute_params.seq_active_start + m_compute_params.n_seq_active); + create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq); // create_1d_input("token_len", m_token_len_per_seq * m_n_seq_active); } @@ -436,15 +458,15 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name return nullptr; } -std::map GgmlOvDecoder::get_kv_param_res_names() const { - std::map kv_param_res_names; - for (const auto & name : m_kv_names) { - if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { - kv_param_res_names[name] = name; - } - } - return kv_param_res_names; -} +// std::map GgmlOvDecoder::get_kv_param_res_names() const { +// std::map kv_param_res_names; +// for (const auto & name : m_model_params.kv_names) { +// if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { +// kv_param_res_names[name] = name; +// } +// } +// return kv_param_res_names; +// } std::map> GgmlOvDecoder::create_weight_nodes( ggml_cgraph * cgraph, @@ -753,17 +775,11 @@ std::vector GgmlOvDecoder::get_output_stride(const std::string & name) c ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const { auto * ggml_tensor = m_outputs.at(name); - if (ggml_tensor->op == GGML_OP_SET_ROWS) { - ggml_tensor = ggml_tensor->view_src; - } return ov::PartialShape(get_shape(ggml_tensor)); } -ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx, const std::string & name) const { - auto * ggml_tensor = m_node_info_list[node_idx].node_outputs.at(name); - if (ggml_tensor->op == GGML_OP_SET_ROWS) { - ggml_tensor = ggml_tensor->view_src; - } +ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const { + auto * 
ggml_tensor = m_node_info_list[node_idx].node_output; return ov::PartialShape(get_shape(ggml_tensor)); } @@ -776,7 +792,7 @@ std::vector GgmlOvDecoder::get_output_names() const { } std::vector GgmlOvDecoder::get_output_names(int node_idx) const { - return m_node_info_list[node_idx].node_outputs_names; + return {m_node_info_list[node_idx].node_output_name}; } const std::string & GgmlOvDecoder::get_op_name() const { @@ -800,8 +816,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const { return m_outputs.at(name)->op_params; } -int32_t * GgmlOvDecoder::get_output_op_params(int node_idx, const std::string & name) const { - return m_node_info_list[node_idx].node_outputs.at(name)->op_params; +int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const { + return m_node_info_list[node_idx].node->op_params; } void GgmlOvDecoder::visit_subgraph(std::function, int node_idx)> node_visitor) const { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 6e2bf0486d8..8e680b5c20c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -11,22 +11,63 @@ #include #include +struct ModelParams { + int ctx = -1; + int ctx_swa = -1; + int ctx_per_seq = -1; + int ctx_per_seq_swa = -1; + int n_seq = -1; + int n_heads = -1; + int n_heads_kv = -1; + int head_size = -1; + int32_t * rope_params = nullptr; + std::vector swa_layers; + + // std::vector kv_names; + + bool can_reuse_dynamically(const ModelParams & other) const { + return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv && + head_size == other.head_size && rope_params == other.rope_params && swa_layers == other.swa_layers; + } + + bool can_reuse_statically(const ModelParams & other) const { + return can_reuse_dynamically(other) && ctx_per_seq == other.ctx_per_seq && + ctx_per_seq_swa == other.ctx_per_seq_swa; + } +}; + +struct ComputeParams { + int n_seq_active = -1; + int seq_active_start = -1; + int attention_size = -1; + int attention_size_swa = -1; + int input_len = -1; + int token_len_per_seq = -1; + int past_kv_len = -1; + int output_len = -1; +}; + class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: struct NodeInfo { ggml_tensor * node; + std::string node_name; + std::string node_op_type; std::map node_inputs; std::vector node_inputs_names; - std::map node_outputs; - std::vector node_outputs_names; + ggml_tensor * node_output; + std::string node_output_name; int node_op_case = 0; - std::string node_op_type; - std::string node_name; + void * data_addr; }; // Graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, + ModelParams & model_params, + ComputeParams & compute_params, std::map> & model_weights, - bool is_static); + bool is_static, + bool is_prefill = false, + int prefill_chunk_size = 256); // Naive graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights); @@ -66,7 +107,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual ov::PartialShape get_output_shape(const std::string & name) const override; - virtual ov::PartialShape get_output_shape(int node_idx, const std::string & name) const override; + virtual ov::PartialShape get_output_shape(int node_idx) const override; virtual std::vector get_output_stride(const std::string & name) const override; @@ -78,7 +119,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual int32_t * get_output_op_params(const std::string & name) const override; - virtual int32_t * 
get_output_op_params(int node_idx, const std::string & name) const override; + virtual int32_t * get_output_op_params(int node_idx) const override; virtual std::vector get_output_names() const override; @@ -116,29 +157,39 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_model_weights; } - virtual const std::vector & get_model_output_names() const override { return m_model_output_names; } + virtual std::vector get_model_output_names() const override { + std::vector output_names; + output_names.reserve(m_model_outputs.size()); + for (const auto & [name, tensor] : m_model_outputs) { + output_names.push_back(name); + } + return output_names; + } - virtual int get_ctx_size() const { return m_ctx; } + const std::map & get_model_outputs() const { return m_model_outputs; } - virtual int get_ctx_swa_size() const { return m_ctx_swa; } + virtual int get_ctx_size() const { return m_model_params.ctx; } - virtual int get_ctx_per_seq() const { return m_ctx_per_seq; } + virtual int get_ctx_swa_size() const { return m_model_params.ctx_swa; } - virtual int get_ctx_per_seq_swa() const { return m_ctx_per_seq_swa; } + virtual int get_ctx_per_seq() const { return m_model_params.ctx_per_seq; } - virtual int get_n_seq() const { return m_n_seq; } + virtual int get_ctx_per_seq_swa() const { return m_model_params.ctx_per_seq_swa; } + + virtual int get_n_seq() const { return m_model_params.n_seq; } virtual int is_swa_layer(int layer) const override { - return std::find(m_swa_layers.begin(), m_swa_layers.end(), layer) != m_swa_layers.end(); + return std::find(m_model_params.swa_layers.begin(), m_model_params.swa_layers.end(), layer) != + m_model_params.swa_layers.end(); } - int get_past_kv_len() const { return m_past_kv_len; } + int get_past_kv_len() const { return m_compute_params.past_kv_len; } - int get_input_len() const { return m_input_len; } + int get_input_len() const { return m_compute_params.input_len; } - virtual int32_t * get_rope_params() const override { return m_rope_params; } + virtual int32_t * get_rope_params() const override { return m_model_params.rope_params; } - virtual std::map get_kv_param_res_names() const override; + // virtual std::map get_kv_param_res_names() const override; virtual bool is_static() const override { return m_is_static; } @@ -159,19 +210,31 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void clear_model_weights() { m_model_weights.clear(); } -private: - void set_input_output(ggml_tensor * node, bool naive = false); - void add_extra_inputs(); + static std::pair compute_llm_params(ggml_cgraph * cgraph, bool is_static); + + ModelParams get_model_params() const { return m_model_params; } + + ComputeParams get_compute_params() const { return m_compute_params; } + + void set_model_params(const ModelParams & model_params) { m_model_params = model_params; } + + void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; } + + bool m_is_static = false; + bool m_is_prefill = false; + int m_prefill_chunk_size = 0; + static std::vector get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); static ov::element::Type get_ov_type(const ggml_tensor * tensor); - int compute_op_case(const ggml_tensor * node); - std::string compute_op_type(const ggml_tensor * node); + static std::string compute_op_type(const ggml_tensor * node); - void set_llm_params(); - void validate_cgraph() const; +private: + void set_input_output(ggml_tensor * node, bool naive = false); + void 
add_extra_inputs(); + int compute_op_case(const ggml_tensor * node) const; - bool m_is_static = false; + void validate_cgraph() const; ggml_cgraph * m_cgraph = nullptr; std::vector m_nodes; @@ -184,30 +247,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> m_model_extra_inputs; std::map> m_model_extra_input_values; std::map> m_model_weights; - std::vector m_model_output_names; + std::map m_model_outputs; std::vector m_node_info_list; - // Fixed for a model - int m_ctx = -1; - int m_ctx_swa = -1; - int m_ctx_per_seq = -1; - int m_ctx_per_seq_swa = -1; - int m_n_seq = -1; - int m_n_heads = -1; - int m_n_heads_kv = -1; - int m_head_size = -1; - std::vector m_swa_layers; - std::vector m_kv_names; - - // Changed per inference - int m_n_seq_active = -1; - int m_seq_active_start = -1; - int m_attention_size = -1; - int m_attention_size_swa = -1; - int m_input_len = -1; - int m_token_len_per_seq = -1; - int m_past_kv_len = -1; - int32_t * m_rope_params = nullptr; + ModelParams m_model_params; + ComputeParams m_compute_params; }; void print_tensor_address_map(const ggml_cgraph * cgraph); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 1d5b7a850f8..2ecc4401df3 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -39,7 +39,7 @@ class GgmlDecoder : public DecoderBase { virtual PartialShape get_output_shape(const std::string& name) const = 0; - virtual PartialShape get_output_shape(int node_idx, const std::string& name) const = 0; + virtual PartialShape get_output_shape(int node_idx) const = 0; virtual std::vector get_output_stride(const std::string& name) const = 0; @@ -51,7 +51,7 @@ class GgmlDecoder : public DecoderBase { virtual int32_t* get_output_op_params(const std::string& name) const = 0; - virtual int32_t* get_output_op_params(int node_idx, const std::string& name) const = 0; + virtual int32_t * get_output_op_params(int node_idx) const = 0; virtual std::vector get_output_names() const = 0; @@ -72,10 +72,10 @@ class GgmlDecoder : public DecoderBase { virtual const std::map>& get_model_inputs() const = 0; virtual const std::map>& get_model_extra_inputs() const = 0; virtual const std::map>& get_model_weights() const = 0; - virtual const std::vector& get_model_output_names() const = 0; + virtual std::vector get_model_output_names() const = 0; virtual int32_t* get_rope_params() const = 0; - virtual std::map get_kv_param_res_names() const = 0; + // virtual std::map get_kv_param_res_names() const = 0; virtual bool is_static() const = 0; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index 64e3d550c58..42d950c3eb4 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -53,17 +53,13 @@ class NodeContext : public frontend::NodeContext { std::string get_output_name() const { return m_output_names[0]; } - PartialShape get_output_shape(size_t index) const { - return m_decoder->get_output_shape(m_node_idx, m_output_names[index]); - } + PartialShape get_output_shape() const { return m_decoder->get_output_shape(m_node_idx); } int32_t* get_input_op_params(size_t index) const { return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]); } - int32_t* get_output_op_params(size_t index) const { - return m_decoder->get_output_op_params(m_node_idx, m_output_names[index]); - } + int32_t * get_output_op_params() const { return 
m_decoder->get_output_op_params(m_node_idx); } ov::element::Type get_output_type(size_t index) const { return m_decoder->get_output_type(m_output_names[index]); diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 618b4efdea4..d4c47d4bf1b 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -22,7 +22,7 @@ OutputVector translate_cont(const NodeContext & context) { FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); auto src_shape = context.get_input_shape(0).to_shape(); - auto dst_shape = context.get_output_shape(0).to_shape(); + auto dst_shape = context.get_output_shape().to_shape(); ov::Output res; if (op_case == 1) { diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index efbdf421c63..342da882aaa 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -26,7 +26,7 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { auto v = context.get_input(2); auto mask = context.get_input(3); - float * params = reinterpret_cast(context.get_output_op_params(0)); + float * params = reinterpret_cast(context.get_output_op_params()); float scale = params[0]; // float max_bias = params[1]; // float logit_softcap = params[2]; diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index 80bfbafd83c..ad5cd3f6ba5 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -32,7 +32,7 @@ OutputVector translate_glu_geglu(const NodeContext & context) { src1 = split->output(1); } - int32_t * params = context.get_output_op_params(0); + int32_t * params = context.get_output_op_params(); const int32_t swapped = params[1]; if (swapped) { std::swap(src0, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 21489312460..2b7f13629f2 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -32,7 +32,7 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { src1 = split->output(1); } - int32_t * params = context.get_output_op_params(0); + int32_t * params = context.get_output_op_params(); const int32_t swapped = params[1]; if (swapped) { std::swap(src0, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index d156e48e3cf..bfe09a2b840 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -32,10 +32,12 @@ OutputVector translate_permute(const NodeContext & context) { if (op_case == 1) { res = std::make_shared(src, perm); } else if (op_case == 4) { - auto output_shape = context.get_output_shape(0).to_shape(); + auto output_shape = context.get_output_shape().to_shape(); auto n_heads = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[1]}); auto head_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); - auto n_seq_active = context.get_input("n_seq_active"); + auto n_seq_active = context.has_input("n_seq_active") ? 
+ context.get_input("n_seq_active") : + ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[0]}); auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); auto new_shape = @@ -49,26 +51,39 @@ OutputVector translate_permute(const NodeContext & context) { res = std::make_shared(reshaped, perm); } else { auto cache_shape = src.get_partial_shape(); - auto output_shape = context.get_output_shape(0).to_shape(); + auto output_shape = context.get_output_shape().to_shape(); int64_t head_size = output_shape[3]; int64_t n_heads = output_shape[1]; int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1; int64_t n_seq = cache_shape[1].get_length(); Output attention_size; - if (op_case == 2) { + if (!context.has_input("attention_size")) { + attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]}); + } else if (op_case == 2) { attention_size = context.get_input("attention_size"); } else { attention_size = context.get_input("attention_size_swa"); } + Output seq_active_start; + Output seq_active_end; + if (context.has_input("seq_active_start")) { + seq_active_start = context.get_input("seq_active_start"); + seq_active_end = context.get_input("seq_active_end"); + } else { + int64_t n_seq_active = output_shape[0]; + size_t offset = *((size_t *) context.get_input_op_params(0)); + int64_t seq_active_start_val = offset / context.get_input_stride(0)[0]; + int64_t seq_active_end_val = seq_active_start_val + n_seq_active; + seq_active_start = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_start_val}); + seq_active_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_end_val}); + } + // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size] // 2. slice out the active sequences // 3. slice out the attention part in each sequence // 4. 
permute - auto seq_active_start = context.get_input("seq_active_start"); - auto seq_active_end = context.get_input("seq_active_end"); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index b34fa626f19..e26a8c778c8 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -20,7 +20,7 @@ namespace op { OutputVector translate_reshape(const NodeContext & context) { num_inputs_check(context, 1, 1); - if (context.get_input_shape(0) == context.get_output_shape(0)) { + if (context.get_input_shape(0) == context.get_output_shape()) { return {context.get_input(0)}; } @@ -29,7 +29,7 @@ OutputVector translate_reshape(const NodeContext & context) { op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4 || op_case == 5 || op_case == 6, "Unsupported RESHAPE case"); - auto output_shape = context.get_output_shape(0).to_shape(); + auto output_shape = context.get_output_shape().to_shape(); std::shared_ptr new_shape_node; if (op_case == 1) { new_shape_node = ov::op::v0::Constant::create( @@ -50,18 +50,18 @@ OutputVector translate_reshape(const NodeContext & context) { return {context.get_input(0).get_node_shared_ptr()->input_value(0)}; } else if (op_case == 5) { - std::vector shape_vec = {1, 1, -1, (int64_t) context.get_output_shape(0).to_shape()[3]}; + std::vector shape_vec = {1, 1, -1, (int64_t) context.get_output_shape().to_shape()[3]}; new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, shape_vec); // // Alternative // auto token_len = context.get_input("token_len"); // auto emb_size = - // ov::op::v0::Constant::create(ov::element::i64, {1}, {(int64_t) context.get_output_shape(0).to_shape()[3]}); + // ov::op::v0::Constant::create(ov::element::i64, {1}, {(int64_t) context.get_output_shape().to_shape()[3]}); // auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); // new_shape_node = std::make_shared(ov::OutputVector{one, one, token_len, emb_size}, 0); } else if (op_case == 6) { - new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, context.get_output_shape(0).to_shape()); + new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, context.get_output_shape().to_shape()); } auto res = std::make_shared(context.get_input(0), new_shape_node, false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 3ac96d0c224..99c97e06aed 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -27,7 +27,7 @@ OutputVector translate_rms_norm(const NodeContext & context) { square, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); float eps; - memcpy(&eps, context.get_output_op_params(0), sizeof(float)); + memcpy(&eps, context.get_output_op_params(), sizeof(float)); auto rms = std::make_shared( std::make_shared(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {eps}))); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 5c83867d18e..96fbb6b795a 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -31,8 +31,8 @@ OutputVector translate_rope(const NodeContext & context) { 
ov::Output res; auto data_node = context.get_input(0).get_node_shared_ptr(); - auto output_shape = context.get_output_shape(0).to_shape(); - int32_t * op_params = context.get_output_op_params(0); + auto output_shape = context.get_output_shape().to_shape(); + int32_t * op_params = context.get_output_op_params(); Output cos_theta_node; Output sin_theta_node; diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index f52381786a3..01e59cedd99 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -15,7 +15,7 @@ OutputVector translate_scale(const NodeContext & context) { num_inputs_check(context, 1, 1); float scale; - memcpy(&scale, context.get_output_op_params(0), sizeof(float)); + memcpy(&scale, context.get_output_op_params(), sizeof(float)); auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); auto res = std::make_shared(context.get_input(0), scale_node); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index a323e5ed38a..eb128f04a36 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -34,7 +34,7 @@ OutputVector translate_set_rows(const NodeContext & context) { data = std::make_shared(data, context.get_output_type(0)); - auto dst_shape = context.get_output_shape(0).to_shape(); + auto dst_shape = context.get_output_shape().to_shape(); auto ind_squeezed = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 1, 2})); diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 591bcb46c43..921475e51ae 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -31,7 +31,7 @@ OutputVector translate_soft_max(const NodeContext & context) { float scale = 1.0f; float max_bias = 0.0f; - auto * op_params = context.get_output_op_params(0); + auto * op_params = context.get_output_op_params(); memcpy(&scale, (float *) op_params + 0, sizeof(float)); memcpy(&max_bias, (float *) op_params + 1, sizeof(float)); auto src0_shape = context.get_input_shape(0).get_shape(); diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 6bf980cab67..f0b8938befc 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -10,7 +10,7 @@ OutputVector translate_view(const NodeContext & context) { num_inputs_check(context, 1, 1); if (context.get_op_case() == 2) { - auto dst_shape = context.get_output_shape(0).to_shape(); + auto dst_shape = context.get_output_shape().to_shape(); return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[2] * dst_shape[3])}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index d03c9358b03..546778a4707 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -203,7 +203,16 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo results.push_back(result); } - resulting_model = std::make_shared(results, params); + ov::ParameterVector used_params; + for (const auto & param : params) { + if (!param->output(0).get_target_inputs().empty()) { + used_params.push_back(param); + } + } + // if (auto 
diff = params.size() - used_params.size()) { + // GGML_LOG_INFO("%zu parameters are not used in the model.", diff); + // } + resulting_model = std::make_shared(results, used_params); apply_transformations(resulting_model); return resulting_model; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 6e1d7393c75..92e8ce80b35 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -48,8 +48,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } return device; }; - static std::string device = get_device(); - bool is_static = device == "NPU" ? true : false; + auto get_prefill_chunk_size = [] { + const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE"); + if (chunk_size_str && atoi(chunk_size_str) > 0) { + return atoi(chunk_size_str); + } + return 256; + }; + + static const auto device = get_device(); + static const auto is_static = device == "NPU" ? true : false; + static const auto prefill_chunk_size = get_prefill_chunk_size(); ov::AnyMap config; @@ -70,12 +79,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } static std::mutex cache_mutex; - static std::unordered_map> infer_request_cache; - static std::unordered_map> ov_input_names_cache; - static std::unordered_map> ov_output_names_cache; + static std::unordered_map, graph_key_hash> decoder_cache; + static std::unordered_map, graph_key_hash> infer_request_cache; + static std::unordered_map, graph_key_hash> infer_request_cache_prefill; + static std::unordered_map, graph_key_hash> ov_input_names_cache; + static std::unordered_map, graph_key_hash> ov_output_names_cache; std::shared_ptr ggml_decoder; std::shared_ptr infer_request; + ModelParams m_params; + ComputeParams c_params; + std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static); + + const auto * inp_pos = get_inp_pos_tensor(cgraph); + const auto is_prefill = get_is_prefill(inp_pos); + const auto key = compute_graph_key(cgraph); int64_t decoder_end_time; int64_t conversion_end_time; @@ -85,38 +103,92 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * { std::lock_guard lock(cache_mutex); - auto it = infer_request_cache.find(cgraph); - if (it != infer_request_cache.end()) { + auto it = decoder_cache.find(key); + + auto cache_hit = it != decoder_cache.end(); + if (cache_hit) { + ggml_decoder = it->second; + cache_hit = is_static ? ggml_decoder->get_model_params().can_reuse_statically(m_params) : + ggml_decoder->get_model_params().can_reuse_dynamically(m_params); + } + + if (cache_hit) { std::map> model_weights; - ggml_decoder = std::make_shared(cgraph, model_weights, is_static); + ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, + is_prefill, prefill_chunk_size); + decoder_cache[key] = ggml_decoder; decoder_end_time = ggml_time_us(); - infer_request = infer_request_cache[cgraph]; + infer_request = is_static && is_prefill ? 
infer_request_cache_prefill[key] : infer_request_cache[key]; conversion_end_time = ggml_time_us(); compile_end_time = conversion_end_time; } else { + infer_request_cache.erase(key); + infer_request_cache_prefill.erase(key); + std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); - ggml_decoder = std::make_shared(cgraph, model_weights, is_static); - decoder_end_time = ggml_time_us(); + if (!is_static) { + ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); + decoder_end_time = ggml_time_us(); - auto input_model = std::make_shared(ggml_decoder); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - ggml_decoder->clear_model_weights(); - conversion_end_time = ggml_time_us(); + auto input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); + conversion_end_time = ggml_time_us(); - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); - } + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } - auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device)); - compile_end_time = ggml_time_us(); - infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); - infer_request = infer_request_cache[cgraph]; + auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device)); + compile_end_time = ggml_time_us(); + infer_request = std::make_shared(compiled_model.create_infer_request()); + infer_request_cache[key] = infer_request; + decoder_cache[key] = ggml_decoder; + } else { + auto ggml_decoder_prefill = std::make_shared(cgraph, m_params, c_params, model_weights, + is_static, true, prefill_chunk_size); + auto ggml_decoder_decode = std::make_shared(cgraph, m_params, c_params, model_weights, + is_static, false, prefill_chunk_size); + decoder_end_time = ggml_time_us(); + + auto input_model_prefill = std::make_shared(ggml_decoder_prefill); + auto input_model_decode = std::make_shared(ggml_decoder_decode); + + auto model_prefill = ov::frontend::ggml::FrontEnd::convert(input_model_prefill); + ggml_decoder_prefill->clear_model_weights(); + auto model_decode = ov::frontend::ggml::FrontEnd::convert(input_model_decode); + ggml_decoder_decode->clear_model_weights(); + conversion_end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); + ov::serialize(model_prefill, timestamped_filename); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_decode_%lld.xml", timestamp); + ov::serialize(model_decode, timestamped_filename); + } + + auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device)); + auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device)); + + infer_request_cache_prefill[key] = + 
std::make_shared(compiled_model_prefill.create_infer_request()); + infer_request_cache[key] = + std::make_shared(compiled_model_decode.create_infer_request()); + compile_end_time = ggml_time_us(); + + model = is_prefill ? model_prefill : model_decode; + ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode; + infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key]; + decoder_cache[key] = ggml_decoder; + } std::vector ov_input_names; std::vector ov_output_names; @@ -126,30 +198,32 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } - ov_input_names_cache[cgraph] = ov_input_names; - ov_output_names_cache[cgraph] = ov_output_names; + ov_input_names_cache[key] = ov_input_names; + ov_output_names_cache[key] = ov_output_names; // Set output tensors (for NPU) and kvcache i/o tensors once and for all // Note: does not seem to improve perf on CPU/GPU, but breaks llama-bench, so disabled it for CPU/GPU - if (is_static) { - for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_name = ov_output_names[i]; - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); - infer_request->set_output_tensor(i, output_tensor); - } - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; - if (param_name.find("cache") == 0) { - auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); - infer_request->set_input_tensor(i, input_tensor); - } - } - } + // if (is_static) { + // for (size_t i = 0; i < ov_input_names.size(); i++) { + // auto param_name = ov_input_names[i]; + // if (param_name.find("cache") == 0) { + // auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); + // infer_request->set_input_tensor(i, input_tensor); + // } + // } + // for (size_t i = 0; i < ov_output_names.size(); i++) { + // auto output_name = ov_output_names[i]; + // if (output_name.find("cache") == 0) { + // auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + // infer_request->set_output_tensor(i, output_tensor); + // } + // } + // } } } - auto ov_input_names = ov_input_names_cache[cgraph]; - auto ov_output_names = ov_output_names_cache[cgraph]; + auto ov_input_names = ov_input_names_cache[key]; + auto ov_output_names = ov_output_names_cache[key]; if (!is_static) { for (size_t i = 0; i < ov_input_names.size(); i++) { @@ -177,11 +251,39 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } } else { - auto input_len = ggml_decoder->get_input_len(); - for (int j = 0; j < input_len; j++) { + if (is_prefill) { + auto inp_len = inp_pos->ne[0]; + for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) { + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index); + infer_request->set_input_tensor(i, input_tensor); + + if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + const auto input_tensor = infer_request->get_input_tensor(i); + print_input_tensor_info(param_name, input_tensor); + } + } + + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + + infer_request->infer(); + + if 
(getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + for (size_t i = 0; i < ov_output_names.size(); i++) { + const auto output_tensor = infer_request->get_output_tensor(i); + print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + } + } + } + infer_end_time = ggml_time_us(); + } else { for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; - auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len); + auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); infer_request->set_input_tensor(i, input_tensor); if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { @@ -190,7 +292,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + infer_request->infer(); + infer_end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { for (size_t i = 0; i < ov_output_names.size(); i++) { @@ -199,7 +307,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } } - infer_end_time = ggml_time_us(); } if (getenv("GGML_OPENVINO_PROFILING")) { @@ -255,7 +362,7 @@ std::map get_types_to_requant(const std::string & dev } bool is_naive(ggml_cgraph * cgraph) { - constexpr int naive_graph_size_threshold = 20; + constexpr int naive_graph_size_threshold = 100; return cgraph->n_nodes < naive_graph_size_threshold; } @@ -305,7 +412,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, ov::Shape input_shape; if (ggml_tensor->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work - input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor, ggml_tensor->view_src).to_shape(); + input_shape = ggml_decoder->get_shape(ggml_tensor->view_src); } else { input_shape = ggml_decoder->get_input_shape(name).to_shape(); } @@ -324,21 +431,84 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons return input_tensor; } -ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decoder, - const std::string & param_name, - int j, - int input_len) { +ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, + const std::string & param_name) { const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); if (param_name == "inp_pos" || param_name == "inp_tokens" || (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { + assert(ggml_tensor->ne[0] == 1); ov::Shape input_shape = {1, 1, 1, 1}; ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); - // copy the j-th value from ggml_tensor + if (ggml_tensor->type == GGML_TYPE_I32) { + *input_tensor.data() = *((int32_t *) ggml_tensor->data); + } else if (ggml_tensor->type == GGML_TYPE_I64) { + *input_tensor.data() = *((int64_t *) ggml_tensor->data); + } else { + throw std::runtime_error("Unexpected tensor type for " + param_name); + } + return input_tensor; + } + + if (param_name == "inp_out_ids") { + ov::Shape input_shape = {1, 1, 1, 1}; + ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + int32_t inp_out_id = *((int32_t *) ggml_tensor->data); + assert(ggml_tensor->ne[0] == 1); + assert(inp_out_id == 0); + *input_tensor.data() = inp_out_id; + return input_tensor; + } + + if (param_name.find("KQ_mask") == 0) { + size_t 
context_size = ggml_decoder->get_ctx_size(); + std::vector<float> padded_data = pad_input<float>(ggml_tensor, 1, context_size, -INFINITY); + ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size}); + auto * data_ptr = input_tensor.data<float>(); + std::copy(padded_data.begin(), padded_data.begin() + context_size, data_ptr); + return input_tensor; + } + + return get_ov_input_tensor(ggml_decoder, param_name); +} + +ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggml_decoder, + const std::string & param_name, + int chunk_index) { + const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); + const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); + + const size_t input_len = ggml_decoder->get_input_len(); + const size_t chunk_size = ggml_decoder->m_prefill_chunk_size; + const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size); + const size_t chunk_pad_size = chunk_size - chunk_valid_size; + + if (param_name == "inp_pos" || param_name == "inp_tokens" || + (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { + ov::Shape input_shape = {1, 1, 1, chunk_size}; + ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + // copy the chunk_index-th chunk from ggml_tensor size_t element_size = ggml_type_size(ggml_tensor->type); - void * input_data = (char *) ggml_tensor->data + j * element_size; - std::memcpy(input_tensor.data(), input_data, element_size); + void * input_data = (char *) ggml_tensor->data + chunk_index * chunk_size * element_size; + std::memcpy(input_tensor.data(), input_data, chunk_valid_size * element_size); + // pad the rest with last_value + 1, so that the KVs of padded positions are written + // to the row right after the valid rows in the kvcache + if (chunk_pad_size > 0) { + if (ggml_tensor->type == GGML_TYPE_I32) { + int32_t last_value = + *((int32_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1)); + int32_t * output_data = input_tensor.data<int32_t>(); + std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1); + } else if (ggml_tensor->type == GGML_TYPE_I64) { + int64_t last_value = + *((int64_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1)); + int64_t * output_data = input_tensor.data<int64_t>(); + std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1); + } else { + throw std::runtime_error("Unexpected tensor type for " + param_name); + } + } return input_tensor; } @@ -348,25 +518,26 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decode if (ggml_tensor->ne[0] == 0) { *input_tensor.data<int32_t>() = 0; } else if (ggml_tensor->ne[0] == 1) { - if (j == input_len - 1) { - *input_tensor.data<int32_t>() = *((int32_t *) ggml_tensor->data); - } else { - *input_tensor.data<int32_t>() = 0; - } + int32_t inp_out_id = *((int32_t *) ggml_tensor->data) % chunk_size; + *input_tensor.data<int32_t>() = inp_out_id; } else { - throw std::runtime_error("Static graph inp_out_ids unexpected ne[0] > 1"); + throw std::runtime_error("NPU does not support outputting logits for multiple tokens at once."); } return input_tensor; } if (param_name.find("KQ_mask") == 0) { + size_t cols = ggml_tensor->ne[0]; + size_t rows = ggml_tensor->ne[1]; + float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols; + size_t chunk_valid_rows = std::min(chunk_size, rows - chunk_index * chunk_size); size_t context_size = ggml_decoder->get_ctx_size(); - const auto * input_tensor_ggml =
ggml_decoder->get_input_ggml_tensor(param_name); - std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, input_len, context_size, -INFINITY); - ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size}); - // copy the j-th row of padded_data + std::vector<float> padded_data = + pad_input<float>(ggml_data, chunk_valid_rows, cols, chunk_size, context_size, -INFINITY); + set_zero_diagonal(padded_data, chunk_size, context_size); + ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, chunk_size, context_size}); auto * data_ptr = input_tensor.data<float>(); - std::copy(padded_data.begin() + j * context_size, padded_data.begin() + (j + 1) * context_size, data_ptr); + std::copy(padded_data.begin(), padded_data.begin() + chunk_size * context_size, data_ptr); return input_tensor; } @@ -374,15 +545,13 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decode } ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name) { - auto * ggml_tensor = ggml_decoder->get_output_ggml_tensor(result_name); - auto output_type = ggml_decoder->get_output_type(result_name); - ov::Shape output_shape; - output_shape = ggml_decoder->get_output_shape(result_name).to_shape(); + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(result_name); + auto output_type = ggml_decoder->get_ov_type(ggml_tensor); + auto output_shape = ggml_decoder->get_shape(ggml_tensor); if (ggml_decoder->is_static() && result_name == "result_output") { output_shape[1] = 1; } - ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); return output_tensor; } @@ -401,9 +570,28 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data<float>()) << std::endl; + case ov::element::f32: { + if (name.find("KQ_mask") == std::string::npos) { + std::cout << *(tensor.data<float>()) << std::endl; + } else { + size_t rows = tensor.get_shape()[2]; + size_t cols = tensor.get_shape()[3]; + auto * data = tensor.data<float>(); + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + float val = data[i * cols + j]; + if (std::isinf(val) && val < 0) { + std::cout << std::setw(5) << "-inf"; + } else { + std::cout << std::setw(5) << val; + } + } + std::cout << std::endl; + } + } + break; + } case ov::element::f16: std::cout << *(tensor.data<ov::float16>()) << std::endl; break; @@ -414,7 +602,10 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor std::cout << std::endl; break; case ov::element::i64: - std::cout << *(tensor.data<int64_t>()) << std::endl; + for (size_t i = 0; i < tensor.get_size(); ++i) { + std::cout << tensor.data<int64_t>()[i] << " "; + } + std::cout << std::endl; break; default: break; } @@ -471,9 +662,10 @@ void print_output_tensor_info(const std::string & name, const ov::Tensor & tenso } } -void set_zero_diagonal(std::vector<float> & matrix, size_t dim) { - for (size_t i = 0; i < dim; ++i) { - matrix[i * dim + i] = 0.0f; +void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols) { + for (size_t i = 0; i < rows; ++i) { + size_t diag_col = std::min(i, cols - 1); + matrix[i * cols + diag_col] = 0.0f; } } @@ -494,8 +686,23 @@ const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) { throw std::runtime_error("get_inp_pos_tensor: inp_pos not found in cgraph"); } -bool get_is_first_token(const ggml_tensor * inp_pos) { - return *(int32_t *)
inp_pos->data == 0; +bool get_is_prefill(const ggml_tensor * inp_pos) { + return inp_pos->ne[0] > 1; +} + +graph_key compute_graph_key(ggml_cgraph * cgraph) { + graph_key key; + key.n_nodes = cgraph->n_nodes; + + if (cgraph->n_nodes > 0) { + key.first_node_name = std::string(cgraph->nodes[0]->name); + key.last_node_name = std::string(cgraph->nodes[cgraph->n_nodes - 1]->name); + } else { + key.first_node_name = ""; + key.last_node_name = ""; + } + + return key; } #pragma GCC diagnostic pop diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 999fc53f322..dca74f8afc2 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -3,8 +3,29 @@ #include "ggml-impl.h" #include +#include #include +struct graph_key { + size_t n_nodes; + std::string first_node_name; + std::string last_node_name; + + bool operator==(const graph_key & other) const { + return n_nodes == other.n_nodes && first_node_name == other.first_node_name && + last_node_name == other.last_node_name; + } +}; + +struct graph_key_hash { + size_t operator()(const graph_key & key) const { + size_t h = std::hash<size_t>{}(key.n_nodes); + h ^= std::hash<std::string>{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash<std::string>{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + return h; + } +}; + enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); size_t checksum(const void * data, size_t size); @@ -14,35 +35,49 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst); template <typename T> -std::vector<T> pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) { - std::vector<T> padded_data(padded_rows * padded_cols, pad_value); - size_t rows = tensor->ne[1]; - size_t cols = tensor->ne[0]; - T * data = static_cast<T *>(tensor->data); +std::vector<T> pad_input(const T * data, + size_t rows, + size_t cols, + size_t padded_rows, + size_t padded_cols, + T pad_value) { + std::vector<T> padded(padded_rows * padded_cols, pad_value); for (size_t i = 0; i < std::min(rows, padded_rows); ++i) { for (size_t j = 0; j < std::min(cols, padded_cols); ++j) { - padded_data[i * padded_cols + j] = data[i * cols + j]; + padded[i * padded_cols + j] = data[i * cols + j]; } } - return padded_data; + + return padded; } -void set_zero_diagonal(std::vector<float> & matrix, size_t dim); +template <typename T> +std::vector<T> pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) { + return pad_input(reinterpret_cast<const T *>(tensor->data), + static_cast<size_t>(tensor->ne[1]), // rows + static_cast<size_t>(tensor->ne[0]), // cols + padded_rows, padded_cols, pad_value); +} + +void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols); const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph); -bool get_is_first_token(const ggml_tensor * inp_pos); +bool get_is_prefill(const ggml_tensor * inp_pos); + +graph_key compute_graph_key(struct ggml_cgraph * cgraph); ov::AnyMap get_ov_compile_config(const std::string & device); std::map get_types_to_requant(const std::string & device); ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name); -ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decoder, - const std::string & param_name, - int j, - int input_len); +ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder, + const std::string & param_name); +ov::Tensor
get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggml_decoder, + const std::string & param_name, + int chunk_index); ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name);
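
A minimal standalone sketch of the prefill chunking math used by get_ov_input_tensor_static_prefill() above, assuming the default chunk size of 256. The driver main(), the example input_len of 300, and the printf trace are hypothetical and only illustrate the arithmetic; as in the diff, the last chunk is padded with last position + 1 so the padded KVs land in the rows right after the valid ones:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const size_t chunk_size = 256;  // default GGML_OPENVINO_PREFILL_CHUNK_SIZE
    const size_t input_len  = 300;  // hypothetical prompt length
    std::vector<int32_t> inp_pos(input_len);
    for (size_t i = 0; i < input_len; i++) {
        inp_pos[i] = (int32_t) i;
    }

    for (size_t chunk_index = 0; chunk_index * chunk_size < input_len; chunk_index++) {
        const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size);
        const size_t chunk_pad_size   = chunk_size - chunk_valid_size;

        std::vector<int32_t> chunk(chunk_size);
        std::copy_n(inp_pos.data() + chunk_index * chunk_size, chunk_valid_size, chunk.data());
        if (chunk_pad_size > 0) {
            // same trick as the backend: padded positions continue past the last valid one
            int32_t last_value = chunk[chunk_valid_size - 1];
            std::fill(chunk.begin() + chunk_valid_size, chunk.end(), last_value + 1);
        }
        std::printf("chunk %zu: valid=%zu pad=%zu first=%d last=%d\n",
                    chunk_index, chunk_valid_size, chunk_pad_size, chunk.front(), chunk.back());
    }
    return 0;
}

For input_len = 300 this prints two chunks: the first fully valid (positions 0..255), the second with 44 valid positions (256..299) and 212 padded entries holding position 300.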
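
The KQ_mask path can be exercised the same way. The sketch below mirrors the pad_input()/set_zero_diagonal() helpers from utils.h (the real pad_input is the same template; the 2x4 example mask and the padded 4x6 shape are made up for the demo). Padded rows start as all -inf, and set_zero_diagonal() then unmasks one position per row, which should keep the softmax over a fully padded row well defined:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

template <typename T>
std::vector<T> pad_input(const T * data, size_t rows, size_t cols,
                         size_t padded_rows, size_t padded_cols, T pad_value) {
    // copy the valid (rows x cols) block into a (padded_rows x padded_cols) buffer
    std::vector<T> padded(padded_rows * padded_cols, pad_value);
    for (size_t i = 0; i < std::min(rows, padded_rows); ++i) {
        for (size_t j = 0; j < std::min(cols, padded_cols); ++j) {
            padded[i * padded_cols + j] = data[i * cols + j];
        }
    }
    return padded;
}

void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols) {
    // unmask one position per row (clamped to the last column for tall matrices)
    for (size_t i = 0; i < rows; ++i) {
        matrix[i * cols + std::min(i, cols - 1)] = 0.0f;
    }
}

int main() {
    // 2 valid causal mask rows of 4 scores, padded to a 4 x 6 chunk
    const float src[2 * 4] = {0, -INFINITY, -INFINITY, -INFINITY,
                              0, 0, -INFINITY, -INFINITY};
    auto padded = pad_input<float>(src, 2, 4, 4, 6, -INFINITY);
    set_zero_diagonal(padded, 4, 6);
    for (size_t i = 0; i < 4; ++i) {
        for (size_t j = 0; j < 6; ++j) {
            std::printf("%5s", std::isinf(padded[i * 6 + j]) ? "-inf" : "0");
        }
        std::printf("\n");
    }
    return 0;
}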
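
Finally, a self-contained sketch of the graph keying introduced in utils.h: graphs are now cached by node count plus first/last node name rather than by cgraph pointer, so a reallocated cgraph with the same topology can hit an existing entry. The structs are copied from the diff; the node names and the int payload in main() are hypothetical stand-ins for the cached decoder/infer-request objects:

#include <cstdio>
#include <string>
#include <unordered_map>

struct graph_key {
    size_t n_nodes;
    std::string first_node_name;
    std::string last_node_name;
    bool operator==(const graph_key & other) const {
        return n_nodes == other.n_nodes && first_node_name == other.first_node_name &&
               last_node_name == other.last_node_name;
    }
};

struct graph_key_hash {
    // boost-style hash combiner, as in utils.h
    size_t operator()(const graph_key & key) const {
        size_t h = std::hash<size_t>{}(key.n_nodes);
        h ^= std::hash<std::string>{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
        h ^= std::hash<std::string>{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
        return h;
    }
};

int main() {
    std::unordered_map<graph_key, int, graph_key_hash> cache;
    graph_key a{1186, "inp_embd", "result_output"};  // hypothetical node names
    graph_key b{1186, "inp_embd", "result_output"};  // same topology, different cgraph
    cache[a] = 42;
    std::printf("hit: %d\n", cache.count(b) ? cache.at(b) : -1);  // prints "hit: 42"
    return 0;
}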