diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c00efaf6aeb..2e45e2081f0 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -173,93 +173,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } } } - - if (m_node) { - switch (node->op) { - case GGML_OP_RESHAPE: { - auto * src = node->src[0]; - if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) { - m_op_case = 4; - } else if (node->ne[0] * node->ne[1] == src->ne[0]) { - m_op_case = 1; - } else if (src->ne[0] * src->ne[1] == node->ne[0]) { - m_op_case = 2; - if (src->ne[2] * src->ne[3] == node->ne[1]) { - m_op_case = 5; - } - } else if (src->ne[0] * src->ne[1] == node->ne[1]) { - m_op_case = 3; - } else if (src->ne[1] * src->ne[2] == node->ne[1]) { - m_op_case = 6; - } - break; - } - case GGML_OP_CONT: { - if (node->src[0]->op == GGML_OP_PERMUTE) { - m_op_case = 1; - } else if (node->src[0]->op == GGML_OP_TRANSPOSE) { - m_op_case = 2; - } else if (node->src[0]->op == GGML_OP_VIEW) { - // The input comes from a VIEW which is subtensor - m_op_case = 3; - } - break; - } - case GGML_OP_PERMUTE: { - if (node->src[0]->op != GGML_OP_VIEW) { - m_op_case = 1; - } else if (ggml_is_contiguous(node->src[0])) { - std::string src_name(node->view_src->name); - if (src_name.find("cache") == std::string::npos) { - // permute Qcur - m_op_case = 4; - } else { - // Permute kv cache (view) - int layer = extract_layer_from_name(src_name); - if (!is_swa_layer(layer)) { - m_op_case = 2; - } else { - m_op_case = 3; - } - } - } - break; - } - case GGML_OP_MUL_MAT: { - if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) { - m_op_case = 2; - } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { - // test-backend-ops case - m_op_case = 3; - } - break; - } - case GGML_OP_GET_ROWS: { - if (node->src[1]->op == GGML_OP_VIEW) { - m_op_case = 2; - } - break; - } - case GGML_OP_ROPE: { - if (node->src[0]->op == GGML_OP_VIEW) { - m_op_case = 2; - } - break; - } - case GGML_OP_VIEW: { - if (node->src[0]->op == GGML_OP_VIEW) { - auto * src = node->src[0]; - if (ggml_nelements(node) != ggml_nelements(src)) { - throw std::runtime_error("Unsupported VIEW case"); - } - // This view is a reshape, slicing happens at src->op - m_op_case = 2; - } - } - default: - break; - } - } } int extract_layer_from_name(const std::string & name) { @@ -320,7 +233,6 @@ void GgmlOvDecoder::set_llm_params() { } else if (node->op == GGML_OP_ROPE) { if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) { m_head_size = node->ne[0]; - m_n_heads = node->ne[1]; m_rope_params = node->op_params; auto * inp_pos = node->src[1]; m_input_len = inp_pos->ne[0]; @@ -775,15 +687,17 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const { return m_outputs.at(name)->op_params; } -void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { +void GgmlOvDecoder::visit_subgraph(std::function node_visitor) const { for (const auto & node : m_nodes) { - auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_ctx, m_ctx_swa, m_n_heads, - m_n_heads_kv, m_head_size, m_swa_layers); - node_visitor(decoder); + node_visitor(node, m_is_static); } } const std::string & GgmlOvDecoder::get_op_type() const { + return get_ggml_op_type(m_node); +} + +std::string GgmlOvDecoder::get_ggml_op_type(ggml_tensor * tensor) { static const std::map ops = { {GGML_OP_NONE, "GGML_OP_NONE" }, {GGML_OP_ACC, "GGML_OP_ACC" }, @@ -831,13 +745,13 @@ const std::string & GgmlOvDecoder::get_op_type() const { {GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" } }; - switch (m_node->op) { + switch (tensor->op) { case GGML_OP_UNARY: - return unary_ops.at(ggml_get_unary_op(m_node)); + return unary_ops.at(ggml_get_unary_op(tensor)); case GGML_OP_GLU: - return glu_ops.at(ggml_get_glu_op(m_node)); + return glu_ops.at(ggml_get_glu_op(tensor)); default: - return ops.at(m_node->op); + return ops.at(tensor->op); } static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index e2efc73f17f..ea15698bae8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -75,16 +75,16 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual const std::string & get_op_type() const override; + static std::string get_ggml_op_type(ggml_tensor * tensor); + virtual const std::string & get_op_name() const override; - virtual void visit_subgraph(std::function)> node_visitor) const override; + virtual void visit_subgraph(std::function node_visitor) const override; ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); } ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); } - virtual int get_op_case() const override { return m_op_case; } - virtual const std::map> & get_model_inputs() const override { return m_model_inputs; } @@ -144,12 +144,14 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void clear_model_weights() { m_model_weights.clear(); } + static ov::element::Type get_ov_type(const ggml_tensor * tensor); + + static std::vector get_shape(const ggml_tensor * tensor); + + static std::vector get_stride(const ggml_tensor * tensor); private: void set_input_output(ggml_tensor * node, bool naive = false); void add_extra_inputs(); - static std::vector get_shape(const ggml_tensor * tensor); - static std::vector get_stride(const ggml_tensor * tensor); - static ov::element::Type get_ov_type(const ggml_tensor * tensor); void set_llm_params(); void validate_cgraph() const; @@ -165,7 +167,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::vector m_output_names; std::string m_op_name; mutable std::string m_name; - int m_op_case = 0; std::vector> m_op_node_name; std::map> m_model_inputs; std::map> m_model_extra_inputs; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 8f86a4de064..4304001190f 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -5,6 +5,7 @@ #include #include #include +#include "ggml.h" namespace ov { namespace frontend { @@ -45,13 +46,9 @@ class GgmlDecoder : public DecoderBase { virtual std::vector get_output_names() const = 0; - virtual const std::string& get_op_type() const = 0; - virtual const std::string& get_op_name() const = 0; - virtual void visit_subgraph(std::function)> node_visitor) const = 0; - - virtual int get_op_case() const = 0; + virtual void visit_subgraph(std::function node_visitor) const = 0; virtual const std::map>& get_model_inputs() const = 0; virtual const std::map>& get_model_extra_inputs() const = 0; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index 0d76dc83e05..b49d2ea51f6 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -1,11 +1,11 @@ #pragma once +#include "ggml-openvino/ggml-decoder.h" + #include #include #include -#include "decoder.hpp" - namespace ov { namespace frontend { namespace ggml { @@ -16,98 +16,193 @@ typedef std::map> TensorMap; class NodeContext : public frontend::NodeContext { public: - NodeContext(const std::shared_ptr& decoder, - std::shared_ptr& tensor_map, - TranslateSession* translate_session = nullptr) - : ov::frontend::NodeContext(decoder->get_op_type()), - m_decoder(decoder), - m_tensor_map(tensor_map), - m_translate_session(translate_session) { - m_input_names = decoder->get_input_names(); - m_output_names = decoder->get_output_names(); - } + NodeContext(ggml_tensor * node, + std::shared_ptr & tensor_map, + bool is_static = false, + std::string op_type = "", + TranslateSession * translate_session = nullptr) : + ov::frontend::NodeContext(op_type), + m_node(node), + m_tensor_map(tensor_map), + m_is_static(is_static), + m_translate_session(translate_session), + m_node_name(std::string(node->name)), + m_op_case(0) { + std::string node_name; + if (node->op == GGML_OP_SET_ROWS) { + node_name = std::string(node->view_src->name); + } else { + node_name = std::string(node->name); + } - TranslateSession* get_translate_session() const { - return m_translate_session; + m_output_names.push_back(node_name); + m_outputs[node_name] = node; + + for (int i = 0; i < GGML_MAX_SRC; i++) { + auto * src = node->src[i]; + if (src == nullptr) { + continue; + } + std::string src_name = std::string(src->name); + m_input_names.push_back(src_name); + m_inputs[src_name] = src; + } + + m_op_case = compute_op_case(node); } - const std::vector& get_input_names() const { return m_input_names; } + TranslateSession * get_translate_session() const { return m_translate_session; } - size_t get_input_size() const override { - return m_decoder->get_input_size(); - } + const std::vector & get_input_names() const { return m_input_names; } + + const std::vector & get_output_names() const { return m_output_names; } + + size_t get_input_size() const override { return m_input_names.size(); } ov::element::Type get_input_type(size_t index) const { - return m_decoder->get_input_type(m_input_names[index]); + return GgmlOvDecoder::get_ov_type(m_inputs.at(m_input_names[index])); } PartialShape get_input_shape(size_t index) const { - return m_decoder->get_input_shape(m_input_names[index]); + return ov::PartialShape(GgmlOvDecoder::get_shape(m_inputs.at(m_input_names[index]))); } std::vector get_input_stride(size_t index) const { - return m_decoder->get_input_stride(m_input_names[index]); + return GgmlOvDecoder::get_stride(m_inputs.at(m_input_names[index])); } std::string get_output_name() const { return m_output_names[0]; } PartialShape get_output_shape(size_t index) const { - return m_decoder->get_output_shape(m_output_names[index]); + return ov::PartialShape(GgmlOvDecoder::get_shape(m_outputs.at(m_output_names[index]))); } - std::vector get_output_stride(size_t index) const { - return m_decoder->get_output_stride(m_output_names[index]); - } + int32_t * get_input_op_params(size_t index) const { return m_inputs.at(m_input_names[index])->op_params; } - int32_t* get_input_op_params(size_t index) const { - return m_decoder->get_input_op_params(m_input_names[index]); - } - - int32_t* get_output_op_params(size_t index) const { - return m_decoder->get_output_op_params(m_output_names[index]); - } + int32_t * get_output_op_params(size_t index) const { return m_outputs.at(m_output_names[index])->op_params; } ov::element::Type get_output_type(size_t index) const { - return m_decoder->get_output_type(m_output_names[index]); + return GgmlOvDecoder::get_ov_type(m_outputs.at(m_output_names[index])); } - Output get_input(int idx) const override { - return m_tensor_map->at(m_decoder->get_input_name(idx)); - } + Output get_input(int idx) const override { return m_tensor_map->at(m_input_names[idx]); } - Output get_input(const std::string& name) const override { + Output get_input(const std::string & name) const override { if (m_tensor_map->find(name) == m_tensor_map->end()) { throw std::runtime_error("'" + name + "' not found in tensor map."); } return m_tensor_map->at(name); } - bool has_input(const std::string& name) const { - return m_tensor_map->find(name) != m_tensor_map->end(); - } + bool has_input(const std::string & name) const { return m_tensor_map->find(name) != m_tensor_map->end(); } - const std::string& get_name() const override { - return m_decoder->get_op_name(); - } + const std::string & get_name() const override { return m_node_name; } - ov::Any get_attribute_as_any(const std::string& name) const override { - return m_decoder->get_attribute(name); + ov::Any get_attribute_as_any(const std::string & name) const override { + return nullptr; + GGML_UNUSED(name); } - int get_op_case() const { - return m_decoder->get_op_case(); - } - bool is_static() const { return m_decoder->is_static(); } + int get_op_case() const { return m_op_case; } + + bool is_static() const { return m_is_static; } private: - std::shared_ptr m_decoder; - std::shared_ptr& m_tensor_map; - TranslateSession* m_translate_session; + ggml_tensor * m_node; + std::shared_ptr & m_tensor_map; + bool m_is_static = false; + TranslateSession * m_translate_session; std::vector m_input_names; std::vector m_output_names; + std::string m_node_name; + std::map m_inputs; + std::map m_outputs; + int m_op_case; + + int compute_op_case(ggml_tensor * node) { + int op_case = 0; + switch (node->op) { + case GGML_OP_RESHAPE: { + if (node->src[0]->op == GGML_OP_RESHAPE && node->src[0]->src[0]->ne[0] == node->ne[0] && + node->src[0]->src[0]->ne[1] == node->ne[1]) { + op_case = 4; + } else if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) { + op_case = 1; + } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) { + op_case = 2; + } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[1]) { + op_case = 3; + } + break; + } + case GGML_OP_CONT: { + if (node->src[0]->op == GGML_OP_PERMUTE) { + op_case = 1; + } else if (node->src[0]->op == GGML_OP_TRANSPOSE) { + op_case = 2; + } else if (node->src[0]->op == GGML_OP_VIEW) { + // The input comes from a VIEW which is subtensor + op_case = 3; + } + break; + } + case GGML_OP_PERMUTE: { + if (node->src[0]->op != GGML_OP_VIEW) { + op_case = 1; + } else if (ggml_is_contiguous(node->src[0])) { + std::string src_name(node->view_src->name); + if (src_name.find("cache") == std::string::npos) { + op_case = 1; + } else { + // Permute kv cache (view) + if (!(std::string(node->name).find("swa") != std::string::npos)) { + op_case = 2; + } else { + op_case = 3; + } + } + } + break; + } + case GGML_OP_MUL_MAT: { + if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) { + op_case = 2; + } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { + // test-backend-ops case + op_case = 3; + } + break; + } + case GGML_OP_GET_ROWS: { + if (node->src[1]->op == GGML_OP_VIEW) { + op_case = 2; + } + break; + } + case GGML_OP_ROPE: { + if (node->src[0]->op == GGML_OP_VIEW) { + op_case = 2; + } + break; + } + case GGML_OP_VIEW: { + if (node->src[0]->op == GGML_OP_VIEW) { + auto * src = node->src[0]; + auto * view_src = src->view_src; + if (view_src->ne[1] != src->ne[2]) { + throw std::runtime_error("Unsupported VIEW case"); + } + op_case = 2; + } + } + default: + break; + } + return op_case; + } }; -using CreatorFunction = std::function; +using CreatorFunction = std::function; } // namespace ggml } // namespace frontend diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index d12701acdc7..60d2d1d55e0 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -5,6 +5,7 @@ #include "input_model.hpp" #include "pass/eliminate_zp.hpp" #include "pass/mark_decompression_convert_constant_folding.hpp" +#include "ggml-openvino/ggml-decoder.h" #include #include @@ -164,8 +165,8 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo (*tensor_map)[it.first] = it.second; } - auto node_visitor = [&](std::shared_ptr node) { - auto operation_type = node->get_op_type(); + auto node_visitor = [&](ggml_tensor * node, bool is_static) { + auto operation_type = GgmlOvDecoder::get_ggml_op_type(node); if (operation_type == "GGML_OP_NONE") { return; } @@ -174,10 +175,10 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo auto it = m_translator_map.find(operation_type); FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), "Translation for operation type ", operation_type, " is not implemented."); - NodeContext node_context(node, tensor_map, this); + NodeContext node_context(node, tensor_map, is_static, operation_type, this); converted_outputs = it->second(node_context); - const auto & node_output_names = node->get_output_names(); + const auto & node_output_names = node_context.get_output_names(); FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), "Number of ", operation_type, " outputs greater than number of converted outputs, which are ", node_output_names.size(), " and ", converted_outputs.size(), " respectively.");