From 08576bbe160ad4372a5603f6085dd023fb4d95cd Mon Sep 17 00:00:00 2001 From: oreomaker Date: Sun, 18 Jan 2026 11:07:04 +0800 Subject: [PATCH 1/5] feat(qnn): add saveContext method and improve AOT tensor management - Add saveContext method to QNNBackend for saving context binary to file - Implement proper output tensor validation and allocation in graphExecute - Remove redundant output reordering logic that was causing issues - Add tensor caching and management improvements in QnnAOTGraph - Enhance QnnAOTEnv to properly track and retrieve tensors - Add sub-graph input/output tensor capture in LLM2QnnLoweringPass - Remove duplicate allocation warning in QNNTensorWrapper::alloc --- mllm/backends/qnn/QNNBackend.cpp | 82 +++++++++---------- mllm/backends/qnn/QNNBackend.hpp | 1 + mllm/backends/qnn/QNNUtils.cpp | 1 - mllm/backends/qnn/aot/QnnWrappersAPI.cpp | 30 +++++-- mllm/backends/qnn/aot/QnnWrappersAPI.hpp | 2 + .../qnn/aot/passes/LLM2QnnLoweringPass.cpp | 11 +++ 6 files changed, 72 insertions(+), 55 deletions(-) diff --git a/mllm/backends/qnn/QNNBackend.cpp b/mllm/backends/qnn/QNNBackend.cpp index abcdb6519..54da97c9d 100644 --- a/mllm/backends/qnn/QNNBackend.cpp +++ b/mllm/backends/qnn/QNNBackend.cpp @@ -437,6 +437,25 @@ bool QNNBackend::loadContext(const std::string& contextPath) { return true; } +void QNNBackend::saveContext(const std::string& contextPath) { + uint64_t binarySize, writtenSize; + + runtime_->qnnInterface.contextGetBinarySize(context_, &binarySize); + + std::unique_ptr binaryBuffer(new uint8_t[binarySize]); + + runtime_->qnnInterface.contextGetBinary(context_, reinterpret_cast(binaryBuffer.get()), binarySize, &writtenSize); + + if (binarySize < writtenSize) { + MLLM_ERROR("QNN context binary size mismatch. 
Written {} bytes, expected {} bytes.", writtenSize, binarySize); + } + std::ofstream file(contextPath, std::ios::binary); + file.write(reinterpret_cast(binaryBuffer.get()), writtenSize); + file.close(); + + MLLM_INFO("QNN context saved to {} written {} bytes.", contextPath, writtenSize); +} + std::shared_ptr QNNBackend::createQnnGraph(const std::string& graphName) { // If the graph already exists, return the existing model if (qnnModelIndexMap_.find(graphName) != qnnModelIndexMap_.end()) { @@ -538,6 +557,7 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector& std::vector qnn_inputs; std::vector qnn_outputs; + // Prepare QNN inputs for (int i = 0; i < model->getGraphInputTensorWrappers().size(); i++) { auto wrapper = model->getGraphInputTensorWrappers()[i]; auto& wrapper_tensor = wrapper->getDataContainer(); @@ -549,6 +569,7 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector& return; } + // Case of executing retrieved graph created by AOT // input wrapper is empty, set wrapper's dataContainer(mllm::Tensor) if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); } @@ -557,59 +578,30 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector& wrapper->alloc(); qnn_inputs.push_back(*(wrapper->getNativeTensor())); } - - // Prepare QNN outputs in QNN order - std::vector qnn_output_tensors; // Temporary storage for QNN outputs + // Prepare QNN outputs for (int j = 0; j < model->getGraphOutputTensorWrappers().size(); j++) { + auto wrapper = model->getGraphOutputTensorWrappers()[j]; + auto& wrapper_tensor = wrapper->getDataContainer(); + const auto& runtime_output = outputs[j]; + + // Validate output tensors + if (runtime_output.isNil()) { + MLLM_ERROR("Output tensor {} is nil for graph '{}'", j, graphName); + return; + } + + // output wrapper is empty, set wrapper's dataContainer(mllm::Tensor) + if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_output); } + // alloc and register qnn 
tensor - model->getGraphOutputTensorWrappers()[j]->alloc(); // QNNAllocator will handle registered memory descriptor - qnn_outputs.push_back(*(model->getGraphOutputTensorWrappers()[j]->getNativeTensor())); - qnn_output_tensors.push_back(model->getGraphOutputTensorWrappers()[j]->getDataContainer()); + wrapper->alloc(); // QNNAllocator will handle registered memory descriptor + qnn_outputs.push_back(*(wrapper->getNativeTensor())); } CALL_QNN(runtime_->qnnInterface.graphExecute(model->getQnnGraph(), qnn_inputs.data(), qnn_inputs.size(), qnn_outputs.data(), qnn_outputs.size(), runtime_->profileHandle, nullptr)); if (ProfilingLevel::OFF != profilingLevel_) { extractBackendProfilingInfo(runtime_->profileHandle); } - - // Reorder outputs according to MLLM expected order - const auto& expectedOrder = model->getExpectedOutputOrder(); - - // Resize outputs to match QNN output count first - outputs.resize(qnn_output_tensors.size()); // Ensure outputs has enough space for all QNN outputs - if (!expectedOrder.empty() && expectedOrder.size() == qnn_output_tensors.size()) { - // Reorder outputs according to expected order - for (size_t i = 0; i < expectedOrder.size(); i++) { - const std::string& expected_name = expectedOrder[i]; - int qnn_index = model->getQnnOutputIndex(expected_name); - if (qnn_index >= 0 && qnn_index < static_cast(qnn_output_tensors.size())) { - outputs[i] = qnn_output_tensors[qnn_index]; - } else { - MLLM_ERROR("QNNBackend::graphExecute: Failed to find QNN output index for tensor '{}' in graph '{}'", expected_name, - graphName); - // If mapping fails, we cannot safely reorder outputs - // This is a critical error as we cannot determine the correct output order - MLLM_ERROR("Cannot reorder outputs: missing QNN output index for tensor '{}'. 
Output order may be incorrect.", - expected_name); - // Note: We still try to copy what we can, but the order may be wrong - if (i < qnn_output_tensors.size()) { - outputs[i] = qnn_output_tensors[i]; - } else { - MLLM_ERROR("Output index {} out of bounds (size: {})", i, qnn_output_tensors.size()); - } - } - } - } else { - // No expected order set or size mismatch, use QNN order as-is - if (expectedOrder.empty()) { - MLLM_WARN("QNNBackend::graphExecute: No expected output order set for graph '{}', using QNN order", graphName); - } else { - MLLM_WARN( - "QNNBackend::graphExecute: Expected output order size ({}) != outputs size ({}) for graph '{}', using QNN order", - expectedOrder.size(), outputs.size(), graphName); - } - for (size_t i = 0; i < qnn_output_tensors.size(); i++) { outputs[i] = qnn_output_tensors[i]; } - } } bool QNNBackend::addTensor(const std::string& graphName, const std::string& tensorName, Qnn_TensorType_t type, diff --git a/mllm/backends/qnn/QNNBackend.hpp b/mllm/backends/qnn/QNNBackend.hpp index 408b45117..49669c7c1 100644 --- a/mllm/backends/qnn/QNNBackend.hpp +++ b/mllm/backends/qnn/QNNBackend.hpp @@ -89,6 +89,7 @@ class QNNBackend final : public Backend { bool loadContext(const std::string& contextPath); bool createContext(); + void saveContext(const std::string& contextPath = "qnn_context.bin"); bool isWeightOnDevice() override { return false; } diff --git a/mllm/backends/qnn/QNNUtils.cpp b/mllm/backends/qnn/QNNUtils.cpp index 271b67200..b8bd4c78f 100644 --- a/mllm/backends/qnn/QNNUtils.cpp +++ b/mllm/backends/qnn/QNNUtils.cpp @@ -483,7 +483,6 @@ std::shared_ptr QNNTensorWrapper::createStaticTensor(const std } void QNNTensorWrapper::alloc() { - if (isAlloc_) { MLLM_WARN("Tensor {} has already been allocated.", name_); } MLLM_RT_ASSERT(dataContainer_.device() == kQNN); // if storage is not allocated, allocate it diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp index 829a47f2d..b2b04fd78 100644 --- 
a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp +++ b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp @@ -257,6 +257,11 @@ QnnAOTGraph::QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle qnn_model_->initialize(contextHandle, graphName.c_str(), false); } +void QnnAOTGraph::addTensor(const QnnAOTNodeTensor::ptr_t& tensor) { + qnn_model_->addTensorWrapper(tensor->getWrapper()); + all_tensors_.insert({tensor->getWrapper()->getName(), tensor}); +} + void QnnAOTGraph::addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op) { std::vector inputNames; for (auto& in : qnn_op->inputs) inputNames.push_back(in->getWrapper()->getName()); @@ -622,23 +627,30 @@ QnnAOTNodeTensor::ptr_t QnnAOTEnv::captureQnnAOTNodeTensor(const std::string& qn __qnn_enable_static_weight = true; } + MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1); + MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1); + auto graph = contexts_[qnn_context_name]->graphs_[graph_name]; + + // If normal weight is cached, we return it directly + if (graph->all_tensors_.count(__qnn_tensor_name)) { return graph->all_tensors_[__qnn_tensor_name]; } + + QnnAOTNodeTensor::ptr_t ret = nullptr; + // If static weight is cached, we return it directly. if (__qnn_enable_static_weight) { - MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1); if (contexts_[qnn_context_name]->static_tensor_.count(__qnn_tensor_name)) { - return contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name]; + ret = contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name]; } } - // If normal weight is cached, we return it directly - MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1); - MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1); - if (contexts_[qnn_context_name]->graphs_[graph_name]->all_tensors_.count(__qnn_tensor_name)) { - return contexts_[qnn_context_name]->graphs_[graph_name]->all_tensors_[__qnn_tensor_name]; + // There has no Tensor in the cache. 
+ if (ret == nullptr) { + ret = QnnAOTNodeTensor::create(v, __qnn_enable_static_weight); + + if (__qnn_enable_static_weight) { contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name] = ret; } } - // There has no Tensor in the cache. - auto ret = QnnAOTNodeTensor::create(v, __qnn_enable_static_weight); + graph->addTensor(ret); return ret; } diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.hpp b/mllm/backends/qnn/aot/QnnWrappersAPI.hpp index ebbf7f3b2..66313d83e 100644 --- a/mllm/backends/qnn/aot/QnnWrappersAPI.hpp +++ b/mllm/backends/qnn/aot/QnnWrappersAPI.hpp @@ -120,6 +120,8 @@ class QnnAOTGraph : public std::enable_shared_from_this { void addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op); + void addTensor(const QnnAOTNodeTensor::ptr_t& tensor); + bool compile(); bool is_compiled_ = false; diff --git a/mllm/backends/qnn/aot/passes/LLM2QnnLoweringPass.cpp b/mllm/backends/qnn/aot/passes/LLM2QnnLoweringPass.cpp index 971adb593..63b58e0b2 100644 --- a/mllm/backends/qnn/aot/passes/LLM2QnnLoweringPass.cpp +++ b/mllm/backends/qnn/aot/passes/LLM2QnnLoweringPass.cpp @@ -146,6 +146,17 @@ uint8_t LLM2QnnLoweringPass::run(const ir::node_ptr_t& op) { auto aot_graph = aot_env->captureAOTGraph("context.0", subgraph_name); + // Add sub-graph inputs + for (auto& input : region->inputs()) { + auto tensor_input = input->cast_(); + if (tensor_input) { aot_env->captureQnnAOTNodeTensor("context.0", subgraph_name, tensor_input); } + } + // Add sub-graph outputs + for (auto& output : region->outputs()) { + auto tensor_output = output->cast_(); + if (tensor_output) { aot_env->captureQnnAOTNodeTensor("context.0", subgraph_name, tensor_output); } + } + // Walk through all linalg operations in the subgraph subgraph_writer.walk( [&](ir::IRWriter& this_tough_writer, const ir::linalg::LinalgIROp::ptr_t& linalg_op) -> ir::IRWriter::WalkResult { From f0ff9035a19660b1e6abee0194b60c570e9fb324 Mon Sep 17 00:00:00 2001 From: oreomaker Date: Sun, 18 Jan 2026 11:35:46 +0800 Subject: [PATCH 
2/5] refactor(qnn-aot): consolidate configuration and simplify runner setup - Remove redundant temperature parameter from example application - Replace custom config structs with unified QnnAOTConfig - Move initialization of QNN backend after argument parsing - Simplify module construction by removing model path dependency - Add sampleGreedy method to QnnAOTModule for token sampling - Update tensor shapes and I/O handling for proper cache management - Remove unused includes and commented code for cleaner implementation --- examples/qwen3_qnn_aot/aot_run.cpp | 7 +-- mllm/backends/qnn/aot_rt/KVCacheManager.cpp | 2 +- mllm/backends/qnn/aot_rt/KVCacheManager.hpp | 14 +---- mllm/backends/qnn/aot_rt/PromptProcessor.cpp | 61 +++++--------------- mllm/backends/qnn/aot_rt/PromptProcessor.hpp | 19 +----- mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp | 28 +++++++++ mllm/backends/qnn/aot_rt/QnnAOTModule.cpp | 11 +++- mllm/backends/qnn/aot_rt/QnnAOTModule.hpp | 8 +-- mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp | 49 ++++------------ mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp | 12 +--- mllm/backends/qnn/aot_rt/TokenGenerator.cpp | 51 ++++++++-------- mllm/backends/qnn/aot_rt/TokenGenerator.hpp | 17 +----- 12 files changed, 104 insertions(+), 175 deletions(-) create mode 100644 mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp diff --git a/examples/qwen3_qnn_aot/aot_run.cpp b/examples/qwen3_qnn_aot/aot_run.cpp index 56203bc14..34d956269 100644 --- a/examples/qwen3_qnn_aot/aot_run.cpp +++ b/examples/qwen3_qnn_aot/aot_run.cpp @@ -14,23 +14,20 @@ MLLM_MAIN({ auto& model_path = Argparse::add("-m|--model").help("Model path").def("qwen3_qnn.mllm"); auto& tokenizer_path = Argparse::add("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json"); auto& config_path = Argparse::add("-c|--config").help("Config path").required(true); - auto& temperature = Argparse::add("--temperature").help("Temperature").def(0.8f); auto& ar_len = Argparse::add("--ar_len").help("Autoregressive length (chunk 
size)").def(128); Argparse::parse(argc, argv); - mllm::initQnnBackend(model_path.get()); - if (help.isSet()) { Argparse::printHelp(); return 0; } + mllm::initQnnBackend(model_path.get()); + auto qwen3_cfg = mllm::models::qwen3::Qwen3Config(config_path.get()); RunnerConfig config; - config.model_path = model_path.get(); - config.temperature = temperature.get(); config.num_layers = qwen3_cfg.num_hidden_layers; config.num_heads = qwen3_cfg.num_attention_heads; config.head_dim = qwen3_cfg.head_dim; diff --git a/mllm/backends/qnn/aot_rt/KVCacheManager.cpp b/mllm/backends/qnn/aot_rt/KVCacheManager.cpp index 787ca4148..7df115464 100644 --- a/mllm/backends/qnn/aot_rt/KVCacheManager.cpp +++ b/mllm/backends/qnn/aot_rt/KVCacheManager.cpp @@ -9,7 +9,7 @@ namespace mllm::qnn::aot { template -KVCacheManager::KVCacheManager(KVCacheConfig config) : config_(config) { +KVCacheManager::KVCacheManager(QnnAOTConfig config) : config_(config) { k_cache_.resize(config_.num_layers); v_cache_.resize(config_.num_layers); diff --git a/mllm/backends/qnn/aot_rt/KVCacheManager.hpp b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp index fb85ff9ac..8eddb1a95 100644 --- a/mllm/backends/qnn/aot_rt/KVCacheManager.hpp +++ b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp @@ -8,6 +8,7 @@ #include #include "mllm/core/Storage.hpp" #include "mllm/backends/base/Allocator.hpp" +#include "mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp" namespace mllm::qnn::aot { @@ -19,19 +20,10 @@ struct KVCache { T* output_buffer; }; -struct KVCacheConfig { - int32_t context_len; - int64_t head_dim; - int32_t max_ar_len; - int32_t max_cache_len; - int64_t num_heads; - int64_t num_layers; -}; - template class KVCacheManager { public: - explicit KVCacheManager(KVCacheConfig config); + explicit KVCacheManager(QnnAOTConfig config); ~KVCacheManager() = default; void initCache(mllm::Allocator* allocator, int32_t ar_len); @@ -59,7 +51,7 @@ class KVCacheManager { void updateKey(KVCache& k_cache, int32_t n_past, int32_t n_update, const 
std::vector& selected); void updateValue(KVCache& v_cache, int32_t n_past, int32_t n_update, const std::vector& selected); - KVCacheConfig config_; + QnnAOTConfig config_; size_t total_cache_size_; int32_t cur_ar_len_; std::vector> k_cache_; diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp index b13c66a0d..acf67bd56 100644 --- a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp +++ b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp @@ -5,21 +5,18 @@ #include "mllm/backends/qnn/aot_rt/PromptProcessor.hpp" #include "mllm/core/DataTypes.hpp" #include "mllm/core/SlicePrimitives.hpp" -#include "mllm/utils/Log.hpp" #include -#include #include #include #include -#include namespace mllm::qnn::aot { template -PromptProcessor::PromptProcessor(KVCacheManager* kv_manager, Config config) - : kv_manager_(kv_manager), config_(std::move(config)) { +PromptProcessor::PromptProcessor(KVCacheManager* kv_manager, QnnAOTConfig config) + : kv_manager_(kv_manager), config_(config) { std::string graph_name = "model.0.s" + std::to_string(config_.ar_len); - module_ = std::make_unique(config_.model_path, graph_name); + module_ = std::make_unique(graph_name); module_->to(kQNN); } @@ -32,20 +29,17 @@ void PromptProcessor::init_io() { input_ids.setName("input_ids"); input_tensors_.push_back(input_ids); - // // 2. Sliding Window Attention Mask - // input_tensors_.push_back(Tensor::empty({1, 1, config_.ar_len, config_.context_len}, kUInt16, kQNN).alloc()); - - // 3. Position IDs + // 2. Position IDs auto pos_ids = Tensor::empty({config_.ar_len}, kInt32, kQNN).alloc(); pos_ids.setName("position_ids"); input_tensors_.push_back(pos_ids); - // 4. Attention Mask + // 3. Attention Mask auto attn_mask = Tensor::empty({1, 1, config_.ar_len, config_.context_len}, kUInt16, kQNN).alloc(); attn_mask.setName("attention_mask"); input_tensors_.push_back(attn_mask); - // 5. KV Caches + // 4. 
KV Caches const auto& k_caches = kv_manager_->getKCache(); const auto& v_caches = kv_manager_->getVCache(); for (int l = 0; l < config_.num_layers; ++l) { @@ -74,17 +68,18 @@ void PromptProcessor::init_io() { logits.setName("logits"); output_tensors_.push_back(logits); - // 2. KV Caches + // 2. KV Caches, should be consistent with modeling file, or it will cause error for (int l = 0; l < config_.num_layers; ++l) { // K Output - auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len, config_.head_dim}, config_.kv_dtype, kQNN); + auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.ar_len}, config_.kv_dtype, kQNN); k_tensor.impl()->storage()->ptr_ = k_caches[l].output_buffer; k_tensor.impl()->storage()->mem_type_ = kManual; k_tensor.setName("present_key_" + std::to_string(l)); output_tensors_.push_back(k_tensor); - + } + for (int l = 0; l < config_.num_layers; ++l) { // V Output - auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len, config_.head_dim}, config_.kv_dtype, kQNN); + auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.ar_len, config_.head_dim}, config_.kv_dtype, kQNN); v_tensor.impl()->storage()->ptr_ = v_caches[l].output_buffer; v_tensor.impl()->storage()->mem_type_ = kManual; v_tensor.setName("present_value_" + std::to_string(l)); @@ -106,33 +101,10 @@ void PromptProcessor::prepare_io(const std::vector& prompt_tokens, i input_ids_ptr[i] = 0; // Padding } } - - // 2. Position IDs - int32_t* pos_ids_ptr = input_tensors_[1].ptr(); - for (int i = 0; i < config_.ar_len; ++i) { pos_ids_ptr[i] = (int32_t)(start_pos + i); } - - // 3. 
Attention Mask - // We need to re-calculate attention mask based on start_pos - std::vector attn_mask_data(config_.ar_len * config_.context_len); - std::vector attention_map(config_.ar_len); - for (int i = 0; i < config_.ar_len; ++i) { - if (i == 0) { - attention_map[i] = -1; - } else { - attention_map[i] = i - 1; - } - } - - kv_manager_->initAttentionMask(attn_mask_data.data(), attention_map, config_.ar_len, start_pos); - - uint16_t* attn_mask_ptr = input_tensors_[2].ptr(); - for (size_t k = 0; k < attn_mask_data.size(); ++k) { attn_mask_ptr[k] = (uint16_t)attn_mask_data[k]; } } template int64_t PromptProcessor::prefill(const std::vector& prompt_tokens, int64_t start_pos) { - MLLM_INFO("perform prefill"); - int64_t num_tokens = prompt_tokens.size(); int64_t current_pos = start_pos; int64_t processed_tokens = 0; @@ -142,8 +114,7 @@ int64_t PromptProcessor::prefill(const std::vector& prompt_tokens, i std::vector attention_map(config_.ar_len); std::iota(attention_map.begin(), attention_map.end(), -1); - kv_manager_->initAttentionMask(input_tensors_[3].ptr(), // TODO: use member rather than index - attention_map, config_.ar_len, start_pos); + kv_manager_->initAttentionMask(input_tensors_[2].ptr(), attention_map, config_.ar_len, start_pos); module_->setOutputTensors(output_tensors_); @@ -153,20 +124,20 @@ int64_t PromptProcessor::prefill(const std::vector& prompt_tokens, i prepare_io(prompt_tokens, processed_tokens, current_pos); // Run forward - output_tensors_ = (*module_)(input_tensors_); + auto module_input = input_tensors_; + output_tensors_ = (*module_)(module_input); int32_t n_update = chunk_size; kv_manager_->updateCache(config_.ar_len, current_pos, n_update, {}); - kv_manager_->updateAttentionMask(input_tensors_[3].ptr(), config_.ar_len, current_pos, n_update, - config_.sliding_window); + kv_manager_->updateAttentionMask(input_tensors_[2].ptr(), config_.ar_len, current_pos, n_update); processed_tokens += chunk_size; current_pos += chunk_size; } - auto logits = 
output_tensors_[0][{kAll, (num_tokens + config_.ar_len - 1) % config_.ar_len, kAll}]; + auto logits = output_tensors_[0].to(kCPU).squeeze(0)[{kAll, (num_tokens + config_.ar_len - 1) % config_.ar_len, kAll}]; auto cur_token = module_->sampleGreedy(logits); diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.hpp b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp index c867f0f0c..cfe08620b 100644 --- a/mllm/backends/qnn/aot_rt/PromptProcessor.hpp +++ b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp @@ -5,30 +5,17 @@ #include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp" #include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp" +#include "mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp" #include "mllm/core/Tensor.hpp" #include #include -#include namespace mllm::qnn::aot { template class PromptProcessor { public: - struct Config { - std::string model_path; - int32_t context_len; - int64_t num_heads; - int64_t num_layers; - int32_t ar_len; - int32_t vocab_size; - int32_t head_dim; - bool use_int64_token; - int sliding_window; - DataTypes kv_dtype = kUInt8; - }; - - PromptProcessor(KVCacheManager* kv_manager, Config config); + PromptProcessor(KVCacheManager* kv_manager, QnnAOTConfig config); /** * Prefill an LLM Module with the given text input. @@ -44,7 +31,7 @@ class PromptProcessor { private: std::unique_ptr module_; KVCacheManager* kv_manager_; - Config config_; + QnnAOTConfig config_; std::vector input_tensors_; std::vector output_tensors_; diff --git a/mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp b/mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp new file mode 100644 index 000000000..8943d6cec --- /dev/null +++ b/mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp @@ -0,0 +1,28 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/DataTypes.hpp" + +namespace mllm::qnn::aot { + +struct QnnAOTConfig { + int num_layers = 28; + int num_heads = 12; + int head_dim = 128; + int vocab_size = 151936; + + int context_len = 4096; + int ar_len = 128; // Chunk size for prefill + int sliding_window = 0; + + // Derived/Computed + int max_ar_len = 128; + int max_cache_len = 4096; + + DataTypes kv_dtype = kUInt8; + bool use_int64_token = true; +}; + +} // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp b/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp index f1cf6eb1d..8270bc31b 100644 --- a/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp +++ b/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp @@ -3,15 +3,22 @@ #include "mllm/utils/Log.hpp" #include "mllm/engine/Context.hpp" #include "mllm/backends/qnn/QNNBackend.hpp" +#include namespace mllm::qnn::aot { -QnnAOTModule::QnnAOTModule(const std::string& model_path, const std::string& graph_name) - : mllm::nn::Module(graph_name), model_path_(model_path), graph_name_(graph_name) {} +QnnAOTModule::QnnAOTModule(const std::string& graph_name) : mllm::nn::Module(graph_name), graph_name_(graph_name) {} std::vector QnnAOTModule::forward(const std::vector& inputs, const std::vector& args) { return output_tensors_; } +int64_t QnnAOTModule::sampleGreedy(mllm::Tensor& logits) { + auto logits_data = logits.ptr(); + int vocab_size = logits.shape().back(); + auto max_it = std::max_element(logits_data, logits_data + vocab_size); + return std::distance(logits_data, max_it); +} + } // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp index 0cfa464c5..9b8acc338 100644 --- a/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp +++ b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp @@ -11,7 +11,7 @@ namespace mllm::qnn::aot { class QnnAOTModule : public mllm::nn::Module, public models::ARGeneration { public: - QnnAOTModule(const std::string& model_path, const std::string& 
graph_name); + explicit QnnAOTModule(const std::string& graph_name); std::vector forward(const std::vector& inputs, const std::vector& args) override; @@ -21,15 +21,13 @@ class QnnAOTModule : public mllm::nn::Module, public models::ARGeneration { return {}; }; + int64_t sampleGreedy(Tensor& logits); + void setOutputTensors(const std::vector& output_tensors) { output_tensors_ = output_tensors; } private: - std::string model_path_; std::string graph_name_; - std::vector output_tensors_; - - std::string backend_path_; }; } // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp index 6f0bcfd57..fcac8286a 100644 --- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp +++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp @@ -14,24 +14,16 @@ Runner::Runner(const RunnerConfig& config, mllm::preprocessor::AutoTokenizer* to bool Runner::load() { // init KV cache manager - KVCacheConfig kv_config; - kv_config.context_len = config_.context_len; - kv_config.head_dim = config_.head_dim; - int32_t prompt_processor_ar_len = config_.ar_len; int32_t token_generator_ar_len = 1; if (prompt_processor_ar_len == config_.context_len) { - kv_config.max_cache_len = config_.context_len; + config_.max_cache_len = config_.context_len; } else { - kv_config.max_cache_len = config_.context_len - std::min(token_generator_ar_len, prompt_processor_ar_len); + config_.max_cache_len = config_.context_len - std::min(token_generator_ar_len, prompt_processor_ar_len); } - kv_config.max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len); - - kv_config.num_heads = config_.num_heads; - kv_config.num_layers = config_.num_layers; - - kv_manager_ = std::make_unique>(kv_config); + config_.max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len); + kv_manager_ = std::make_unique>(config_); auto backend = mllm::Context::instance().getBackend(mllm::kQNN); if (!backend) { @@ -40,37 +32,18 @@ bool Runner::load() { } // init 
prompt processor(prefill) - PromptProcessor::Config prefill_config; - prefill_config.model_path = config_.model_path; - prefill_config.context_len = config_.context_len; - prefill_config.num_heads = config_.num_heads; - prefill_config.num_layers = config_.num_layers; - prefill_config.ar_len = config_.ar_len; - prefill_config.vocab_size = config_.vocab_size; - prefill_config.head_dim = config_.head_dim; - prefill_config.use_int64_token = false; - prefill_config.sliding_window = config_.context_len; // no sliding window for now - - prompt_processor_ = std::make_unique>(kv_manager_.get(), prefill_config); + config_.use_int64_token = false; + config_.sliding_window = config_.context_len; // no sliding window for now - // init token generator(decode) - TokenGenerator::Config decode_config; - decode_config.model_path = config_.model_path; - decode_config.context_len = config_.context_len; - decode_config.num_heads = config_.num_heads; - decode_config.num_layers = config_.num_layers; - decode_config.vocab_size = config_.vocab_size; - decode_config.head_dim = config_.head_dim; - decode_config.use_int64_token = false; - decode_config.sliding_window = config_.context_len; + prompt_processor_ = std::make_unique>(kv_manager_.get(), config_); + // init token generator(decode) // TODO: EOS IDs auto eos_ids = std::make_unique>(); eos_ids->insert(151643); eos_ids->insert(151645); - token_generator_ = - std::make_unique>(tokenizer_, kv_manager_.get(), std::move(eos_ids), decode_config); + token_generator_ = std::make_unique>(tokenizer_, kv_manager_.get(), std::move(eos_ids), config_); kv_manager_->initCache(backend->allocator().get(), config_.ar_len); prompt_processor_->init_io(); @@ -96,9 +69,9 @@ void Runner::generate(std::vector& prompt_tokens, int32_t seq_len, token_callback(str); } - int64_t cur_pos = prompt_tokens.size(); + // int64_t cur_pos = prompt_tokens.size(); - token_generator_->generate(prompt_tokens, cur_pos, seq_len, token_callback, false); + // 
token_generator_->generate(prompt_tokens, cur_pos, seq_len, token_callback, false); } } // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp index dc41ad09f..2c51db7d8 100644 --- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp +++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp @@ -5,6 +5,7 @@ #include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp" #include "mllm/backends/qnn/aot_rt/PromptProcessor.hpp" #include "mllm/backends/qnn/aot_rt/TokenGenerator.hpp" +#include "mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp" #include "mllm/preprocessor/tokenizers/AutoTokenizer.hpp" #include #include @@ -13,16 +14,7 @@ namespace mllm::qnn::aot { -struct RunnerConfig { - std::string model_path; - float temperature = 0.8f; - int num_layers = 28; - int num_heads = 12; - int head_dim = 128; - int vocab_size = 151936; - int context_len = 4096; - int ar_len = 128; // Chunk size for prefill -}; +using RunnerConfig = QnnAOTConfig; class Runner { public: diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp index 98986ee41..e581b15a5 100644 --- a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp +++ b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp @@ -7,10 +7,10 @@ namespace mllm::qnn::aot { template TokenGenerator::TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer, KVCacheManager* kv_manager, - std::unique_ptr>&& eos_ids, Config config) - : tokenizer_(tokenizer), kv_manager_(kv_manager), eos_ids_(std::move(eos_ids)), config_(std::move(config)) { + std::unique_ptr>&& eos_ids, QnnAOTConfig config) + : tokenizer_(tokenizer), kv_manager_(kv_manager), eos_ids_(std::move(eos_ids)), config_(config) { std::string graph_name = "model.0.s1"; - module_ = std::make_unique(config_.model_path, graph_name); + module_ = std::make_unique(graph_name); module_->to(kQNN); } @@ -19,38 +19,34 @@ void TokenGenerator::init_io() { input_tensors_.reserve(4 + 2 * config_.num_layers); // 1. 
Input IDs - auto input_ids = Tensor::empty({1, 1, 1, 1}, kInt64, kQNN).alloc(); + auto input_ids = Tensor::empty({1, 1}, kInt32, kQNN).alloc(); input_ids.setName("input_ids"); input_tensors_.push_back(input_ids); - // // 2. Sliding Window Attention Mask - // auto sliding_window_mask = Tensor::empty({1, 1, 1, config_.context_len}, kUInt16, kQNN).alloc(); - // sliding_window_mask.setName("sliding_window_attention_mask"); - // input_tensors_.push_back(sliding_window_mask); + // 2. Position IDs + auto pos_ids = Tensor::empty({1}, kInt32, kQNN).alloc(); + pos_ids.setName("position_ids"); + input_tensors_.push_back(pos_ids); // 3. Attention Mask auto attn_mask = Tensor::empty({1, 1, 1, config_.context_len}, kUInt16, kQNN).alloc(); attn_mask.setName("attention_mask"); input_tensors_.push_back(attn_mask); - // 4. Position IDs - auto pos_ids = Tensor::empty({1, 1, 1, 1}, kInt32, kQNN).alloc(); - pos_ids.setName("position_ids"); - input_tensors_.push_back(pos_ids); - - // 5. KV Caches + // 4. KV Caches const auto& k_caches = kv_manager_->getKCache(); const auto& v_caches = kv_manager_->getVCache(); for (int l = 0; l < config_.num_layers; ++l) { // K - auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len, config_.head_dim}, config_.kv_dtype, kQNN); + auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len}, config_.kv_dtype, kQNN); k_tensor.impl()->storage()->ptr_ = k_caches[l].buffer; k_tensor.impl()->storage()->mem_type_ = kManual; k_tensor.setName("past_key_" + std::to_string(l)); input_tensors_.push_back(k_tensor); // V - auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len, config_.head_dim}, config_.kv_dtype, kQNN); + auto v_tensor = + Tensor::empty({1, (int)config_.num_heads, config_.context_len - 1, config_.head_dim}, config_.kv_dtype, kQNN); v_tensor.impl()->storage()->ptr_ = v_caches[l].buffer; v_tensor.impl()->storage()->mem_type_ = kManual; v_tensor.setName("past_value_" + 
std::to_string(l)); @@ -65,17 +61,18 @@ void TokenGenerator::init_io() { logits.setName("logits"); output_tensors_.push_back(logits); - // 2. KV Caches + // 2. KV Caches, should be consistent with the modeling file, or it will cause errors for (int l = 0; l < config_.num_layers; ++l) { // K Output - auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len, config_.head_dim}, config_.kv_dtype, kQNN); + auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, 1}, config_.kv_dtype, kQNN); k_tensor.impl()->storage()->ptr_ = k_caches[l].output_buffer; k_tensor.impl()->storage()->mem_type_ = kManual; k_tensor.setName("present_key_" + std::to_string(l)); output_tensors_.push_back(k_tensor); - + } + for (int l = 0; l < config_.num_layers; ++l) { // V Output - auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len, config_.head_dim}, config_.kv_dtype, kQNN); + auto v_tensor = Tensor::empty({1, (int)config_.num_heads, 1, config_.head_dim}, config_.kv_dtype, kQNN); v_tensor.impl()->storage()->ptr_ = v_caches[l].output_buffer; v_tensor.impl()->storage()->mem_type_ = kManual; v_tensor.setName("present_value_" + std::to_string(l)); @@ -95,12 +92,8 @@ void TokenGenerator::prepare_io(uint64_t cur_token, int64_t start_pos) { input_ids_ptr[0] = (int32_t)cur_token; // 2. Position IDs - int32_t* pos_ids_ptr = input_tensors_[3].ptr(); + int32_t* pos_ids_ptr = input_tensors_[1].ptr(); pos_ids_ptr[0] = (int32_t)start_pos; - - // 3.
Attention Mask - // Update attention mask for the current position - kv_manager_->updateAttentionMask(input_tensors_[2].ptr(), 1, start_pos, 1, config_.sliding_window); } template @@ -120,17 +113,19 @@ int64_t TokenGenerator::generate(std::vector& tokens, int64_t start prepare_io(next_token, current_pos); - output_tensors_ = module_->forward(input_tensors_, {}); + // Run forward + auto module_input = input_tensors_; + output_tensors_ = (*module_)(module_input); // Update KV Cache int32_t n_update = 1; kv_manager_->updateCache(1, current_pos, n_update, {}); // Get logits - auto logits_tensor = output_tensors_[0]; + auto logits = output_tensors_[0].to(kCPU).squeeze(0); // Sample - auto cur_token = module_->sampleGreedy(logits_tensor); + auto cur_token = module_->sampleGreedy(logits); next_token = cur_token; tokens.push_back(next_token); diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.hpp b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp index 5c23da325..da50836d2 100644 --- a/mllm/backends/qnn/aot_rt/TokenGenerator.hpp +++ b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp @@ -2,6 +2,7 @@ #include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp" #include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp" +#include "mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp" #include "mllm/preprocessor/tokenizers/AutoTokenizer.hpp" #include "mllm/core/Tensor.hpp" #include @@ -15,20 +16,8 @@ namespace mllm::qnn::aot { template class TokenGenerator { public: - struct Config { - std::string model_path; - int32_t context_len; - int64_t num_heads; - int64_t num_layers; - int32_t vocab_size; - int32_t head_dim; - bool use_int64_token; - int sliding_window; - DataTypes kv_dtype = kUInt8; - }; - TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer, KVCacheManager* kv_manager, - std::unique_ptr>&& eos_ids, Config config); + std::unique_ptr>&& eos_ids, QnnAOTConfig config); virtual ~TokenGenerator() = default; @@ -44,7 +33,7 @@ class TokenGenerator { std::unique_ptr module_; KVCacheManager* 
kv_manager_; std::unique_ptr> eos_ids_; - Config config_; + QnnAOTConfig config_; std::vector input_tensors_; std::vector output_tensors_; From 8c7181bd6dd41b45678dfa0c1c8430aaae44098e Mon Sep 17 00:00:00 2001 From: oreomaker Date: Mon, 19 Jan 2026 14:39:46 +0800 Subject: [PATCH 3/5] feat(mllm): add support for kUInt16 data type in Tensor formatter --- mllm/mllm.inl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllm/mllm.inl b/mllm/mllm.inl index 0cab49da0..4ecb3ce2c 100644 --- a/mllm/mllm.inl +++ b/mllm/mllm.inl @@ -322,6 +322,8 @@ struct formatter { return fmt::format_to(out, "{}", tensor.constAt(const_cast&>(indices))); case mllm::kInt16: return fmt::format_to(out, "{}", tensor.constAt(const_cast&>(indices))); + case mllm::kUInt16: + return fmt::format_to(out, "{}", tensor.constAt(const_cast&>(indices))); case mllm::kInt8: return fmt::format_to( out, "{}", From 3cfa876eaf3921ec218d972de8fa6a348368c9a0 Mon Sep 17 00:00:00 2001 From: oreomaker Date: Mon, 19 Jan 2026 14:41:27 +0800 Subject: [PATCH 4/5] feat(qnn-aot): update generate function to accept tensor input fix(qnn-aot): add position ID handling in PromptProcessor --- examples/qwen3_qnn_aot/aot_run.cpp | 8 ++------ mllm/backends/qnn/aot_rt/PromptProcessor.cpp | 11 ++++++++++- mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp | 11 +++++++---- mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp | 3 +-- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/examples/qwen3_qnn_aot/aot_run.cpp b/examples/qwen3_qnn_aot/aot_run.cpp index 34d956269..2d237af7e 100644 --- a/examples/qwen3_qnn_aot/aot_run.cpp +++ b/examples/qwen3_qnn_aot/aot_run.cpp @@ -49,12 +49,8 @@ MLLM_MAIN({ return 1; } - std::vector prompt_tokens; - auto sequence = input_tensor["sequence"]; - int64_t* ptr = sequence.ptr(); - for (int i = 0; i < sequence.shape()[1]; ++i) { prompt_tokens.push_back((uint64_t)ptr[i]); } - - runner.generate(prompt_tokens, config.context_len, [](const std::string& token) { std::cout << token << std::flush; }); + 
runner.generate(input_tensor["sequence"], config.context_len, + [](const std::string& token) { std::cout << token << std::flush; }); std::cout << "\n"; return 0; diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp index acf67bd56..d56880612 100644 --- a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp +++ b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp @@ -92,14 +92,18 @@ void PromptProcessor::prepare_io(const std::vector& prompt_tokens, i int64_t num_tokens = prompt_tokens.size(); int64_t chunk_size = std::min((int64_t)config_.ar_len, num_tokens - prompt_pos); - // 1. Input IDs int32_t* input_ids_ptr = input_tensors_[0].ptr(); + int32_t* pos_ids_ptr = input_tensors_[1].ptr(); for (int i = 0; i < config_.ar_len; ++i) { + // 1. Input IDs if (i < chunk_size) { input_ids_ptr[i] = (int32_t)prompt_tokens[prompt_pos + i]; } else { input_ids_ptr[i] = 0; // Padding } + + // 2. Position IDs + pos_ids_ptr[i] = start_pos + i; } } @@ -115,6 +119,9 @@ int64_t PromptProcessor::prefill(const std::vector& prompt_tokens, i std::vector attention_map(config_.ar_len); std::iota(attention_map.begin(), attention_map.end(), -1); kv_manager_->initAttentionMask(input_tensors_[2].ptr(), attention_map, config_.ar_len, start_pos); + // init window attention mask with current position + kv_manager_->initAttentionMask(input_tensors_[2].ptr(), attention_map, config_.ar_len, start_pos, + config_.sliding_window); module_->setOutputTensors(output_tensors_); @@ -132,6 +139,8 @@ int64_t PromptProcessor::prefill(const std::vector& prompt_tokens, i kv_manager_->updateCache(config_.ar_len, current_pos, n_update, {}); kv_manager_->updateAttentionMask(input_tensors_[2].ptr(), config_.ar_len, current_pos, n_update); + kv_manager_->updateAttentionMask(input_tensors_[2].ptr(), config_.ar_len, current_pos, n_update, + config_.sliding_window); processed_tokens += chunk_size; current_pos += chunk_size; diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp 
b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp index fcac8286a..58491cc7a 100644 --- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp +++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp @@ -4,8 +4,10 @@ #include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp" #include #include +#include "mllm/core/DataTypes.hpp" #include "mllm/core/DeviceTypes.hpp" #include "mllm/preprocessor/tokenizers/Unicode.hpp" +#include "mllm/utils/Common.hpp" #include "mllm/utils/Log.hpp" namespace mllm::qnn::aot { @@ -52,17 +54,18 @@ bool Runner::load() { return true; } -void Runner::generate(std::vector& prompt_tokens, int32_t seq_len, +void Runner::generate(const Tensor& prompt_tokens, int32_t seq_len, const std::function& token_callback) { + MLLM_RT_ASSERT(prompt_tokens.rank() == 2 && prompt_tokens.dtype() == kInt64); + int64_t start_pos = 0; std::vector prompt_tokens_i64; - prompt_tokens_i64.reserve(prompt_tokens.size()); - for (auto t : prompt_tokens) prompt_tokens_i64.push_back((int64_t)t); + prompt_tokens_i64.reserve(prompt_tokens.shape()[1]); + for (int i = 0; i < prompt_tokens.shape()[1]; i++) { prompt_tokens_i64.push_back(prompt_tokens.ptr()[i]); } int64_t next_token = prompt_processor_->prefill(prompt_tokens_i64, start_pos); - prompt_tokens.push_back((uint64_t)next_token); if (token_callback) { std::wstring wstr = tokenizer_->detokenize(next_token); std::string str = mllm::preprocessor::wideString2Utf8String(wstr); diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp index 2c51db7d8..c83d1567c 100644 --- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp +++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp @@ -22,8 +22,7 @@ class Runner { ~Runner() = default; bool load(); - void generate(std::vector& prompt_tokens, int32_t seq_len, - const std::function& token_callback); + void generate(const Tensor& prompt_tokens, int32_t seq_len, const std::function& token_callback); private: RunnerConfig config_; From 817a96883b0a281cfbcbbba04ab4a68e189629a5 Mon Sep 
17 00:00:00 2001 From: oreomaker Date: Tue, 20 Jan 2026 11:03:16 +0800 Subject: [PATCH 5/5] fix(mllm/backends/qnn): restructure KV cache initialization loop --- mllm/backends/qnn/aot_rt/PromptProcessor.cpp | 7 ++++--- mllm/backends/qnn/aot_rt/TokenGenerator.cpp | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp index d56880612..50396955d 100644 --- a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp +++ b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp @@ -42,16 +42,17 @@ void PromptProcessor::init_io() { // 4. KV Caches const auto& k_caches = kv_manager_->getKCache(); const auto& v_caches = kv_manager_->getVCache(); + // K for (int l = 0; l < config_.num_layers; ++l) { - // K auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len - config_.ar_len}, config_.kv_dtype, kQNN); k_tensor.impl()->storage()->ptr_ = k_caches[l].buffer; k_tensor.impl()->storage()->mem_type_ = kManual; k_tensor.setName("past_key_" + std::to_string(l)); input_tensors_.push_back(k_tensor); - - // V + } + // V + for (int l = 0; l < config_.num_layers; ++l) { auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len - config_.ar_len, config_.head_dim}, config_.kv_dtype, kQNN); v_tensor.impl()->storage()->ptr_ = v_caches[l].buffer; diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp index e581b15a5..d2cbbf43f 100644 --- a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp +++ b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp @@ -36,15 +36,16 @@ void TokenGenerator::init_io() { // 4. 
KV Caches const auto& k_caches = kv_manager_->getKCache(); const auto& v_caches = kv_manager_->getVCache(); + // K for (int l = 0; l < config_.num_layers; ++l) { - // K auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len}, config_.kv_dtype, kQNN); k_tensor.impl()->storage()->ptr_ = k_caches[l].buffer; k_tensor.impl()->storage()->mem_type_ = kManual; k_tensor.setName("past_key_" + std::to_string(l)); input_tensors_.push_back(k_tensor); - - // V + } + // V + for (int l = 0; l < config_.num_layers; ++l) { auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len - 1, config_.head_dim}, config_.kv_dtype, kQNN); v_tensor.impl()->storage()->ptr_ = v_caches[l].buffer;