diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 72f6144708a..20cc02e98ba 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -59,13 +59,19 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
     }
     validate_cgraph();
-
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         auto * cur_node = cgraph->nodes[node_n];
         m_nodes.push_back(cur_node);
         set_input_output(cur_node);
     }
+    m_is_full_model = has_inp_tokens && has_output;
+    if (!m_is_full_model) {
+        compute_cgraph_dynamic_dims();
+        add_extra_model_inputs_for_fallback();
+        add_extra_model_outputs_for_fallback();
+    }
+
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
         m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
@@ -150,6 +156,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
         current_node_info.node_inputs[src_name] = src;
         current_node_info.node_inputs_names.push_back(src_name);
 
+        if (src_name == "inp_tokens") {
+            has_inp_tokens = true;
+        }
+
         // Add model inputs
         if (!naive && !src->view_src) {
             ggml_backend_buffer * buffer = src->buffer;
@@ -176,6 +186,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
     if (!naive) {
         // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
         static std::set<std::string> debug_output_names = {};
+        if (node_output_name.find("output") != std::string::npos) {
+            has_output = true;
+        }
         // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
         if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
             node_output_name.find("output") != std::string::npos || debug_output_names.count(node_output_name)) {
@@ -264,6 +277,9 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
                 throw std::runtime_error("Unsupported VIEW case");
             }
             op_case = 2;
+            if (!m_is_full_model && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
+                op_case = 0;
+            }
         }
         break;
     }
@@ -359,7 +375,7 @@ void GgmlOvDecoder::validate_cgraph() const {
     }
 }
 
-ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
+ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const {
     auto name = std::string(input->name);
     ov::PartialShape input_shape;
@@ -391,6 +407,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
     } else {
         input_shape = ov::PartialShape{get_shape(input)};
     }
+    if (dynamic_dim_index != -1) {
+        input_shape[3 - dynamic_dim_index] = -1;
+    }
     return input_shape;
 }
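For reference: ggml stores a tensor's extents in `ne[0..3]` with `ne[0]` as the innermost dimension, while the `ov::PartialShape` produced from `get_shape()` is listed outermost-first, which is what the `3 - dynamic_dim_index` indexing above implies. A minimal sketch of that mapping (illustration only, not part of the patch; the helper name is made up):

```cpp
// Sketch: map a ggml dimension index onto the 4-D ov::PartialShape used above.
// ggml dim d corresponds to PartialShape index 3 - d.
#include <cstdint>
#include <openvino/core/partial_shape.hpp>

static ov::PartialShape shape_with_dynamic_dim(const int64_t ne[4], int dynamic_dim_index) {
    // Reverse ggml's ne[] order into OpenVINO's outermost-first order.
    ov::PartialShape shape{ne[3], ne[2], ne[1], ne[0]};
    if (dynamic_dim_index != -1) {
        // Same effect as assigning -1 in the patch: the dimension becomes dynamic.
        shape[3 - dynamic_dim_index] = ov::Dimension::dynamic();
    }
    return shape;
}
```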
@@ -863,3 +882,201 @@ const std::string & GgmlOvDecoder::get_op_type() const {
     static const std::string unknown_op = "UNKNOWN_GGML_OP";
     return unknown_op;
 }
+
+/**
+ * @brief Computes the dynamic dimensions for the computation graph nodes to support fallback mechanisms.
+ *
+ * This function traverses the computation graph and determines the dynamic dimension
+ * of each node based on its operation type and dependencies. The dynamic dimension
+ * index is stored in the `m_node_dynamic_dims` map, where a value of -1 indicates no
+ * dynamic dimension. Specific operations such as GGML_OP_GET_ROWS, GGML_OP_MUL,
+ * GGML_OP_VIEW, etc. are handled to compute the dynamic dimension index.
+ *
+ * Key behaviors:
+ * - Nodes with operations like GGML_OP_NONE, GGML_OP_GET_ROWS, GGML_OP_MUL, and others
+ *   are analyzed to determine their dynamic dimensions.
+ * - Nodes with specific names (e.g., "inp_tokens", "inp_pos", "inp_out_ids") are
+ *   explicitly assigned a dynamic dimension index of 0.
+ * - For operations like GGML_OP_VIEW and GGML_OP_RESHAPE, the function ensures that
+ *   the dynamic dimension is uniquely determined; otherwise, a warning is printed.
+ * - Unhandled operations print a message with the node name and operation type.
+ *
+ * This function prepares the computation graph for execution by ensuring that dynamic
+ * dimensions are correctly propagated and resolved.
+ */
+void GgmlOvDecoder::compute_cgraph_dynamic_dims() {
+    auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
+        if (!node) {
+            return;
+        }
+
+        if (node->op == GGML_OP_CPY) {
+            m_node_dynamic_dims[node] = -1;
+        }
+
+        if (m_node_dynamic_dims.count(node)) {
+            return;
+        }
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            ggml_tensor * src = node->src[i];
+            if (src) {
+                self(self, src);
+            }
+        }
+        switch (node->op) {
+            case GGML_OP_NONE:
+                m_node_dynamic_dims[node] = -1;
+                if (std::string(node->name) == "inp_tokens" || std::string(node->name) == "inp_pos" ||
+                    std::string(node->name) == "inp_out_ids") {
+                    m_node_dynamic_dims[node] = 0;
+                }
+                break;
+            case GGML_OP_GET_ROWS:
+                m_node_dynamic_dims[node] = -1;
+                if (m_node_dynamic_dims[node->src[1]] != -1) {
+                    m_node_dynamic_dims[node] = 1;
+                }
+                break;
+            case GGML_OP_MUL:
+            case GGML_OP_MUL_MAT:
+                m_node_dynamic_dims[node] = -1;
+                if (m_node_dynamic_dims[node->src[0]] != -1) {
+                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+                }
+                if (m_node_dynamic_dims[node->src[1]] != -1) {
+                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
+                }
+                break;
+            case GGML_OP_VIEW:
+            case GGML_OP_FLASH_ATTN_EXT:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_RESHAPE:
+                m_node_dynamic_dims[node] = -1;
+                if (m_node_dynamic_dims[node->src[0]] != -1) {
+                    auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
+                    auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
+                    int same_dim_count = 0;
+                    for (int i = 0; i < 4; i++) {
+                        if (node->ne[i] == dynamic_dim_value) {
+                            m_node_dynamic_dims[node] = i;
+                            same_dim_count++;
+                        }
+                    }
+                    if (same_dim_count != 1) {
+                        std::cout << "Cannot determine dynamic dim for node: " << node->name << std::endl;
+                    }
+                }
+                break;
+            case GGML_OP_RMS_NORM:
+            case GGML_OP_ADD:
+            case GGML_OP_GLU:
+            case GGML_OP_ROPE:
+            case GGML_OP_SCALE:
+                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+                break;
+            case GGML_OP_CPY:
+            case GGML_OP_SET_ROWS:
+                m_node_dynamic_dims[node] = -1;
+                break;
+            default:
+                std::cout << "Unhandled node: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
+                break;
+        }
+    };
+
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+        visit_node(visit_node, node);
+    }
+}
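A small illustration (not part of the patch) of the self-passing recursive-lambda idiom used by `compute_cgraph_dynamic_dims()` above: a generic lambda cannot refer to itself by name, so it receives itself as the first argument and recurses through `self(self, ...)`, avoiding a `std::function` indirection.

```cpp
// Sketch of the recursive-lambda pattern: the lambda takes itself as "self".
#include <cstdio>

int main() {
    auto factorial = [](auto && self, int n) -> int {
        return n <= 1 ? 1 : n * self(self, n - 1);
    };
    std::printf("5! = %d\n", factorial(factorial, 5));  // prints "5! = 120"
    return 0;
}
```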
+
+/**
+ * @brief Adds extra model outputs to support fallback mechanisms.
+ *
+ * This function ensures that all relevant nodes in the computation graph are included
+ * as model outputs for fallback scenarios. It creates a mapping of tensor data addresses
+ * to their corresponding nodes, excluding nodes with the GGML_OP_VIEW operation.
+ *
+ * Key behaviors:
+ * - Iterates through all nodes in the computation graph and maps their data addresses
+ *   to the corresponding tensor nodes, skipping nodes with GGML_OP_VIEW.
+ * - Adds nodes to the `m_model_outputs` map if they are not already present, using
+ *   the tensor's name as the key.
+ *
+ * This function is essential for ensuring that fallback mechanisms have access to all
+ * necessary model outputs, particularly in scenarios where certain outputs are not
+ * explicitly defined in the original model configuration.
+ */
+void GgmlOvDecoder::add_extra_model_outputs_for_fallback() {
+    std::map<void *, ggml_tensor *> address_map;
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+        if (node->op == GGML_OP_VIEW) {
+            continue;
+        }
+        address_map[node->data] = node;
+    }
+
+    for (const auto & pair : address_map) {
+        const std::string & name = pair.second->name;
+        if (m_model_outputs.find(name) == m_model_outputs.end()) {
+            m_model_outputs[name] = pair.second;
+        }
+    }
+}
+
+/**
+ * @brief Adds extra model inputs to support fallback mechanisms.
+ *
+ * This function ensures that all necessary input nodes in the computation graph are
+ * included as model inputs for fallback scenarios. It iterates through the source nodes
+ * of each computation graph node and adds them to the `m_model_inputs` map if they meet
+ * specific criteria.
+ *
+ * Key behaviors:
+ * - Skips source nodes that are already present in `m_model_weights` or `m_model_inputs`.
+ * - Excludes intermediate nodes that are part of `m_node_info_list`.
+ * - For eligible source nodes, creates OpenVINO parameter nodes with appropriate types
+ *   and shapes, and assigns them friendly names.
+ * - Updates the `m_inputs` and `m_model_inputs` maps with the new parameter nodes.
+ *
+ * This function is critical for ensuring that fallback mechanisms have access to all
+ * required model inputs, particularly in scenarios where certain inputs are not
+ * explicitly defined in the original model configuration.
+ */
+void GgmlOvDecoder::add_extra_model_inputs_for_fallback() {
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            auto * src = node->src[j];
+            if (src == nullptr) {
+                continue;
+            }
+            std::string src_name = std::string(src->name);
+            if (m_model_weights.find(src_name) != m_model_weights.end()) {
+                continue;
+            }
+
+            bool is_intermediate_node = false;
+            for (const auto & node_info : m_node_info_list) {
+                if (node_info.node == src) {
+                    is_intermediate_node = true;
+                    break;
+                }
+            }
+            if (is_intermediate_node) {
+                continue;
+            }
+            if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
+                continue;
+            }
+
+            m_inputs[src_name] = src;
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(
+                get_ov_type(src), get_graph_input_shape(node, src, m_node_dynamic_dims[src]));
+            param_node->set_friendly_name(src_name);
+            param_node->output(0).get_tensor().set_names({src_name});
+            m_model_inputs[src_name] = param_node;
+        }
+    }
+}
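The parameter-creation pattern used above, shown in isolation as a hedged sketch (the element type, shape, and helper name are illustrative and not part of the patch): a named `ov::op::v0::Parameter` with one dynamic dimension, whose friendly name and tensor name match the ggml tensor name.

```cpp
// Sketch: create a named graph input with a dynamic token dimension.
#include <memory>
#include <string>
#include <openvino/op/parameter.hpp>

std::shared_ptr<ov::op::v0::Parameter> make_extra_input(const std::string & name) {
    // Middle (token) dimension left dynamic; f32 chosen only for the example.
    ov::PartialShape shape{1, ov::Dimension::dynamic(), 4096};
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape);
    param->set_friendly_name(name);
    param->output(0).get_tensor().set_names({name});
    return param;
}
```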
+*/ +void GgmlOvDecoder::add_extra_model_inputs_for_fallback() { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + for (int i = 0; i < GGML_MAX_SRC; i++) { + auto * src = node->src[i]; + if (src == nullptr) { + continue; + } + std::string src_name = std::string(src->name); + if (m_model_weights.find(src_name) != m_model_weights.end()) { + continue; + } + + bool is_intermediate_node = false; + for (const auto & node_info : m_node_info_list) { + if (node_info.node == src) { + is_intermediate_node = true; + break; + } + } + if (is_intermediate_node) { + continue; + } + if (m_model_inputs.find(src_name) != m_model_inputs.end()) { + continue; + } + + m_inputs[src_name] = src; + auto param_node = std::make_shared( + get_ov_type(src), get_graph_input_shape(node, src, m_node_dynamic_dims[src])); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + } + } +} diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 111eb7200b8..341bc768501 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -175,7 +175,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool is_static() const override { return m_is_static; } - ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const; + ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index=-1) const; static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); @@ -202,8 +202,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; } + virtual bool is_full_model() const override {return m_is_full_model; } + bool m_is_static = false; bool m_is_prefill = false; + bool m_is_full_model = true; // label the cgraph is splited or not int m_prefill_chunk_size = 0; static std::vector get_shape(const ggml_tensor * tensor); @@ -216,6 +219,13 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void set_input_output(ggml_tensor * node, bool naive = false); int compute_op_case(const ggml_tensor * node) const; + // @brief Computes the dynamic dimensions for the computation graph nodes to support fallback mechanisms. + void compute_cgraph_dynamic_dims(); + // @brief Adds extra model outputs to support fallback mechanisms. + void add_extra_model_outputs_for_fallback(); + // @brief Adds extra model inputs to support fallback mechanisms. 
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 1603c7fd201..71d3c26e9c9 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -53,6 +53,8 @@ class GgmlDecoder : public DecoderBase {
     virtual int get_op_case(int node_idx) const = 0;
 
+    virtual bool is_full_model() const = 0;
+
     virtual const std::map>& get_model_inputs() const = 0;
     virtual const std::map>& get_model_extra_inputs() const = 0;
     virtual const std::map>& get_model_weights() const = 0;
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 546778a4707..45fe19d4918 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -190,7 +190,7 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo
         }
     };
 
-    if (!m_naive) {
+    if (!m_naive && ggml_model_decoder->is_full_model()) {
         preprocess(*tensor_map, *ggml_model_decoder);
     }
     ggml_model_decoder->visit_subgraph(node_visitor);
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 1f94d4bad60..5e0f5cb097d 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -3,9 +3,9 @@
 #include "ggml-impl.h"
 #include "ggml-openvino/ggml-decoder.h"
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "openvino/frontend.hpp"
 #include "openvino/input_model.hpp"
-
 #include
 #include
 #include
@@ -432,7 +432,7 @@ std::map get_types_to_requant(const std::string & dev
 }
 
 bool is_naive(ggml_cgraph * cgraph) {
-    constexpr int naive_graph_size_threshold = 20;
+    constexpr int naive_graph_size_threshold = 0;
     return cgraph->n_nodes < naive_graph_size_threshold;
 }
@@ -480,12 +480,77 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder,
     const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
     auto * input_data = ggml_tensor->data;
     ov::Shape input_shape;
-    if (ggml_tensor->op == GGML_OP_VIEW) {
+    if (0) {
         // This case is added to make test-backend-ops work
         input_shape = ggml_decoder->get_shape(ggml_tensor->view_src);
     } else {
         input_shape = ggml_decoder->get_shape(ggml_tensor);
     }
+
+    // If the tensor is the result of a PERMUTE operation, use ggml_cont to make it contiguous
+    if (ggml_tensor->op == GGML_OP_PERMUTE && !ggml_decoder->is_full_model()) {
+        // Create a temporary context for the ggml_cont operation.
+        // Need space for: tensor overhead, tensor data, graph structure, and work buffer.
+        size_t mem_size = ggml_tensor_overhead() * 4 + ggml_nbytes(ggml_tensor) * 2 + 1024 * 1024;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ mem_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
+        };
+        struct ggml_context * temp_ctx = ggml_init(params);
+        if (temp_ctx == NULL) {
+            throw std::runtime_error("Failed to initialize temporary context for PERMUTE");
+        }
+
+        // Create a contiguous tensor using ggml_cont
+        struct ggml_tensor * cont_tensor = ggml_cont(temp_ctx, const_cast<struct ggml_tensor *>(ggml_tensor));
+
+        // Build a simple graph to compute ggml_cont
+        struct ggml_cgraph * gf = ggml_new_graph(temp_ctx);
+        ggml_build_forward_expand(gf, cont_tensor);
+        ggml_graph_compute_with_ctx(temp_ctx, gf, 1);
+
+        // Create the OpenVINO tensor with the contiguous data
+        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
+        memcpy(input_tensor.data(), cont_tensor->data, ggml_nbytes(cont_tensor));
+
+        // Free the temporary context
+        ggml_free(temp_ctx);
+
+        return input_tensor;
+    }
+
+    // If the tensor is the result of a VIEW operation, copy it into a contiguous buffer
+    if (ggml_tensor->op == GGML_OP_VIEW && !ggml_decoder->is_full_model()) {
+        // If the view has as many elements as its source tensor, the data is already
+        // contiguous and can be wrapped directly without reconstruction.
+        if (ggml_nelements(ggml_tensor) == ggml_nelements(ggml_tensor->view_src)) {
+            auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
+            return input_tensor;
+        }
+
+        // Create the OpenVINO input tensor; the data has to be reconstructed from the
+        // view tensor's shape and strides.
+        // TODO: parallelize the copy and copy a whole innermost row per iteration (perf improvement)
+        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
+        const auto * src_tensor = ggml_tensor->view_src;
+        size_t des_index = 0;
+        for (size_t i0 = 0; i0 < static_cast<size_t>(ggml_tensor->ne[3]); i0++) {
+            for (size_t i1 = 0; i1 < static_cast<size_t>(ggml_tensor->ne[2]); i1++) {
+                for (size_t i2 = 0; i2 < static_cast<size_t>(ggml_tensor->ne[1]); i2++) {
+                    for (size_t i3 = 0; i3 < static_cast<size_t>(ggml_tensor->ne[0]); i3++) {
+                        size_t src_index = ggml_tensor->view_offs + i0 * ggml_tensor->nb[3] + i1 * ggml_tensor->nb[2] +
+                                           i2 * ggml_tensor->nb[1] + i3 * ggml_tensor->nb[0];
+
+                        memcpy(static_cast<char *>(input_tensor.data()) + des_index,
+                               static_cast<const char *>(src_tensor->data) + src_index, ggml_tensor->nb[0]);
+                        des_index += ggml_tensor->nb[0];
+                    }
+                }
+            }
+        }
+        return input_tensor;
+    }
+
     auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
     return input_tensor;
 }
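For reference, a standalone sketch of the temporary-context `ggml_cont` pattern used in the PERMUTE branch above (illustration only, not part of the patch; it only relies on the public `ggml.h`/`ggml-cpu.h` API that the patch already includes). `ggml_permute` just rewrites the `ne`/`nb` metadata, so a tiny graph must be computed before the contiguous data exists:

```cpp
// Sketch: materialize a non-contiguous (permuted) tensor with ggml_cont.
#include <cstdio>
#include "ggml.h"
#include "ggml-cpu.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // A 4x3 f32 tensor filled with values 0..11.
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    for (int i = 0; i < 12; i++) {
        ((float *) a->data)[i] = (float) i;
    }

    // ggml_permute only swaps dimensions in the metadata; the view is not contiguous.
    struct ggml_tensor * at = ggml_permute(ctx, a, 1, 0, 2, 3);
    // ggml_cont declares a new tensor whose data will be laid out contiguously.
    struct ggml_tensor * at_cont = ggml_cont(ctx, at);

    // Evaluate the tiny graph so at_cont->data is actually populated.
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, at_cont);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

    std::printf("cont tensor: %lld x %lld, %zu bytes\n",
                (long long) at_cont->ne[0], (long long) at_cont->ne[1], ggml_nbytes(at_cont));

    ggml_free(ctx);
    return 0;
}
```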