Merged
15 changes: 4 additions & 11 deletions examples/qwen3_qnn_aot/aot_run.cpp
@@ -14,23 +14,20 @@ MLLM_MAIN({
auto& model_path = Argparse::add<std::string>("-m|--model").help("Model path").def("qwen3_qnn.mllm");
auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
auto& temperature = Argparse::add<float>("--temperature").help("Temperature").def(0.8f);
auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);

Argparse::parse(argc, argv);

mllm::initQnnBackend(model_path.get());

if (help.isSet()) {
Argparse::printHelp();
return 0;
}

mllm::initQnnBackend(model_path.get());

auto qwen3_cfg = mllm::models::qwen3::Qwen3Config(config_path.get());

RunnerConfig config;
config.model_path = model_path.get();
config.temperature = temperature.get();
config.num_layers = qwen3_cfg.num_hidden_layers;
config.num_heads = qwen3_cfg.num_attention_heads;
config.head_dim = qwen3_cfg.head_dim;
@@ -52,12 +49,8 @@ MLLM_MAIN({
return 1;
}

std::vector<uint64_t> prompt_tokens;
auto sequence = input_tensor["sequence"];
int64_t* ptr = sequence.ptr<int64_t>();
for (int i = 0; i < sequence.shape()[1]; ++i) { prompt_tokens.push_back((uint64_t)ptr[i]); }

runner.generate(prompt_tokens, config.context_len, [](const std::string& token) { std::cout << token << std::flush; });
runner.generate(input_tensor["sequence"], config.context_len,
[](const std::string& token) { std::cout << token << std::flush; });
std::cout << "\n";

return 0;
82 changes: 37 additions & 45 deletions mllm/backends/qnn/QNNBackend.cpp
@@ -437,6 +437,25 @@ bool QNNBackend::loadContext(const std::string& contextPath) {
return true;
}

void QNNBackend::saveContext(const std::string& contextPath) {
uint64_t binarySize, writtenSize;

runtime_->qnnInterface.contextGetBinarySize(context_, &binarySize);

std::unique_ptr<uint8_t[]> binaryBuffer(new uint8_t[binarySize]);

runtime_->qnnInterface.contextGetBinary(context_, reinterpret_cast<void*>(binaryBuffer.get()), binarySize, &writtenSize);

if (binarySize < writtenSize) {
MLLM_ERROR("QNN context binary size mismatch. Written {} bytes, expected {} bytes.", writtenSize, binarySize);
}
std::ofstream file(contextPath, std::ios::binary);
file.write(reinterpret_cast<char*>(binaryBuffer.get()), writtenSize);
file.close();

MLLM_INFO("QNN context saved to {} written {} bytes.", contextPath, writtenSize);
}
Comment on lines +440 to +457
⚠️ Potential issue | 🟠 Major

Missing error handling in saveContext - may silently fail or write corrupt data.

Several issues compared to the similar QnnAOTEnv::saveContext implementation in QnnWrappersAPI.cpp:

  1. No error check on contextGetBinarySize return value (line 443)
  2. No error check on contextGetBinary return value (line 447)
  3. Size mismatch at line 449 logs an error but continues writing potentially corrupt data
  4. No check if file opened successfully before writing (line 452)
Suggested fix with proper error handling
 void QNNBackend::saveContext(const std::string& contextPath) {
   uint64_t binarySize, writtenSize;

-  runtime_->qnnInterface.contextGetBinarySize(context_, &binarySize);
+  auto status = runtime_->qnnInterface.contextGetBinarySize(context_, &binarySize);
+  if (status != QNN_SUCCESS) {
+    MLLM_ERROR("Failed to get QNN context binary size.");
+    return;
+  }

   std::unique_ptr<uint8_t[]> binaryBuffer(new uint8_t[binarySize]);

-  runtime_->qnnInterface.contextGetBinary(context_, reinterpret_cast<void*>(binaryBuffer.get()), binarySize, &writtenSize);
+  status = runtime_->qnnInterface.contextGetBinary(context_, reinterpret_cast<void*>(binaryBuffer.get()), binarySize, &writtenSize);
+  if (status != QNN_SUCCESS) {
+    MLLM_ERROR("Failed to get QNN context binary.");
+    return;
+  }

   if (binarySize < writtenSize) {
     MLLM_ERROR("QNN context binary size mismatch. Written {}  bytes, expected {} bytes.", writtenSize, binarySize);
+    return;
   }
+
   std::ofstream file(contextPath, std::ios::binary);
+  if (!file.is_open()) {
+    MLLM_ERROR("Failed to open file {} for writing QNN context.", contextPath);
+    return;
+  }
   file.write(reinterpret_cast<char*>(binaryBuffer.get()), writtenSize);
   file.close();

   MLLM_INFO("QNN context saved to {} written {} bytes.", contextPath, writtenSize);
 }
🤖 Prompt for AI Agents
In `@mllm/backends/qnn/QNNBackend.cpp` around lines 440 - 457,
QNNBackend::saveContext must perform and act on error returns and validate file
open/size before writing: check the return value of
runtime_->qnnInterface.contextGetBinarySize(context_, &binarySize) and bail/log
on failure; after allocating binaryBuffer, check the return of
runtime_->qnnInterface.contextGetBinary(context_, ..., &writtenSize) and
bail/log on failure; if writtenSize != binarySize treat it as an error and do
not proceed to write the buffer; verify std::ofstream file(contextPath,
std::ios::binary).is_open() before file.write and handle/log/return on failure;
ensure all early exits free resources and log via MLLM_ERROR/MLLM_INFO as
appropriate.


std::shared_ptr<QNNModel> QNNBackend::createQnnGraph(const std::string& graphName) {
// If the graph already exists, return the existing model
if (qnnModelIndexMap_.find(graphName) != qnnModelIndexMap_.end()) {
@@ -538,6 +557,7 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&

std::vector<Qnn_Tensor_t> qnn_inputs;
std::vector<Qnn_Tensor_t> qnn_outputs;
// Prepare QNN inputs
for (int i = 0; i < model->getGraphInputTensorWrappers().size(); i++) {
auto wrapper = model->getGraphInputTensorWrappers()[i];
auto& wrapper_tensor = wrapper->getDataContainer();
@@ -549,6 +569,7 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
return;
}

// Case of executing retrieved graph created by AOT
// input wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }

@@ -557,59 +578,30 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
wrapper->alloc();
qnn_inputs.push_back(*(wrapper->getNativeTensor()));
}

// Prepare QNN outputs in QNN order
std::vector<Tensor> qnn_output_tensors; // Temporary storage for QNN outputs
// Prepare QNN outputs
for (int j = 0; j < model->getGraphOutputTensorWrappers().size(); j++) {
auto wrapper = model->getGraphOutputTensorWrappers()[j];
auto& wrapper_tensor = wrapper->getDataContainer();
const auto& runtime_output = outputs[j];

// Validate output tensors
if (runtime_output.isNil()) {
MLLM_ERROR("Output tensor {} is nil for graph '{}'", j, graphName);
return;
}

// output wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_output); }

// alloc and register qnn tensor
model->getGraphOutputTensorWrappers()[j]->alloc(); // QNNAllocator will handle registered memory descriptor
qnn_outputs.push_back(*(model->getGraphOutputTensorWrappers()[j]->getNativeTensor()));
qnn_output_tensors.push_back(model->getGraphOutputTensorWrappers()[j]->getDataContainer());
wrapper->alloc(); // QNNAllocator will handle registered memory descriptor
qnn_outputs.push_back(*(wrapper->getNativeTensor()));
}

CALL_QNN(runtime_->qnnInterface.graphExecute(model->getQnnGraph(), qnn_inputs.data(), qnn_inputs.size(), qnn_outputs.data(),
qnn_outputs.size(), runtime_->profileHandle, nullptr));

if (ProfilingLevel::OFF != profilingLevel_) { extractBackendProfilingInfo(runtime_->profileHandle); }

// Reorder outputs according to MLLM expected order
const auto& expectedOrder = model->getExpectedOutputOrder();

// Resize outputs to match QNN output count first
outputs.resize(qnn_output_tensors.size()); // Ensure outputs has enough space for all QNN outputs
if (!expectedOrder.empty() && expectedOrder.size() == qnn_output_tensors.size()) {
// Reorder outputs according to expected order
for (size_t i = 0; i < expectedOrder.size(); i++) {
const std::string& expected_name = expectedOrder[i];
int qnn_index = model->getQnnOutputIndex(expected_name);
if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
outputs[i] = qnn_output_tensors[qnn_index];
} else {
MLLM_ERROR("QNNBackend::graphExecute: Failed to find QNN output index for tensor '{}' in graph '{}'", expected_name,
graphName);
// If mapping fails, we cannot safely reorder outputs
// This is a critical error as we cannot determine the correct output order
MLLM_ERROR("Cannot reorder outputs: missing QNN output index for tensor '{}'. Output order may be incorrect.",
expected_name);
// Note: We still try to copy what we can, but the order may be wrong
if (i < qnn_output_tensors.size()) {
outputs[i] = qnn_output_tensors[i];
} else {
MLLM_ERROR("Output index {} out of bounds (size: {})", i, qnn_output_tensors.size());
}
}
}
} else {
// No expected order set or size mismatch, use QNN order as-is
if (expectedOrder.empty()) {
MLLM_WARN("QNNBackend::graphExecute: No expected output order set for graph '{}', using QNN order", graphName);
} else {
MLLM_WARN(
"QNNBackend::graphExecute: Expected output order size ({}) != outputs size ({}) for graph '{}', using QNN order",
expectedOrder.size(), outputs.size(), graphName);
}
for (size_t i = 0; i < qnn_output_tensors.size(); i++) { outputs[i] = qnn_output_tensors[i]; }
}
}

bool QNNBackend::addTensor(const std::string& graphName, const std::string& tensorName, Qnn_TensorType_t type,
1 change: 1 addition & 0 deletions mllm/backends/qnn/QNNBackend.hpp
@@ -89,6 +89,7 @@ class QNNBackend final : public Backend {

bool loadContext(const std::string& contextPath);
bool createContext();
void saveContext(const std::string& contextPath = "qnn_context.bin");

bool isWeightOnDevice() override { return false; }

1 change: 0 additions & 1 deletion mllm/backends/qnn/QNNUtils.cpp
@@ -483,7 +483,6 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::createStaticTensor(const std
}

void QNNTensorWrapper::alloc() {
if (isAlloc_) { MLLM_WARN("Tensor {} has already been allocated.", name_); }
MLLM_RT_ASSERT(dataContainer_.device() == kQNN);

// if storage is not allocated, allocate it
30 changes: 21 additions & 9 deletions mllm/backends/qnn/aot/QnnWrappersAPI.cpp
@@ -257,6 +257,11 @@ QnnAOTGraph::QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle
qnn_model_->initialize(contextHandle, graphName.c_str(), false);
}

void QnnAOTGraph::addTensor(const QnnAOTNodeTensor::ptr_t& tensor) {
qnn_model_->addTensorWrapper(tensor->getWrapper());
all_tensors_.insert({tensor->getWrapper()->getName(), tensor});
}

void QnnAOTGraph::addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op) {
std::vector<std::string> inputNames;
for (auto& in : qnn_op->inputs) inputNames.push_back(in->getWrapper()->getName());
@@ -622,23 +627,30 @@ QnnAOTNodeTensor::ptr_t QnnAOTEnv::captureQnnAOTNodeTensor(const std::string& qn
__qnn_enable_static_weight = true;
}

MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1);
MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1);
auto graph = contexts_[qnn_context_name]->graphs_[graph_name];

// If normal weight is cached, we return it directly
if (graph->all_tensors_.count(__qnn_tensor_name)) { return graph->all_tensors_[__qnn_tensor_name]; }

QnnAOTNodeTensor::ptr_t ret = nullptr;

// If static weight is cached, we return it directly.
if (__qnn_enable_static_weight) {
MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1);
if (contexts_[qnn_context_name]->static_tensor_.count(__qnn_tensor_name)) {
return contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name];
ret = contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name];
}
}

// If normal weight is cached, we return it directly
MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1);
MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1);
if (contexts_[qnn_context_name]->graphs_[graph_name]->all_tensors_.count(__qnn_tensor_name)) {
return contexts_[qnn_context_name]->graphs_[graph_name]->all_tensors_[__qnn_tensor_name];
// There is no Tensor in the cache yet; create it.
if (ret == nullptr) {
ret = QnnAOTNodeTensor::create(v, __qnn_enable_static_weight);

if (__qnn_enable_static_weight) { contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name] = ret; }
}

// There has no Tensor in the cache.
auto ret = QnnAOTNodeTensor::create(v, __qnn_enable_static_weight);
graph->addTensor(ret);

return ret;
}
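
To make the interleaved removals and additions above easier to follow, here is a self-contained model of the resulting lookup order. The stand-in types below are simplifications; the real QnnAOTEnv, QnnAOTGraph, and QnnAOTNodeTensor carry more state:

#include <map>
#include <memory>
#include <string>

struct Tensor {};  // stand-in for QnnAOTNodeTensor
using TensorPtr = std::shared_ptr<Tensor>;

struct Graph {  // stand-in for QnnAOTGraph
  std::map<std::string, TensorPtr> all_tensors;
  void addTensor(const std::string& name, const TensorPtr& t) { all_tensors[name] = t; }
};

struct Context {  // stand-in for one QNN AOT context
  std::map<std::string, std::shared_ptr<Graph>> graphs;
  std::map<std::string, TensorPtr> static_tensors;  // shared across graphs in one context
};

// Post-change behaviour: per-graph cache first, then the context-wide
// static-weight cache, then create; the result is always registered with the graph.
TensorPtr capture(Context& ctx, const std::string& graph_name,
                  const std::string& tensor_name, bool is_static_weight) {
  auto graph = ctx.graphs.at(graph_name);
  auto hit = graph->all_tensors.find(tensor_name);
  if (hit != graph->all_tensors.end()) { return hit->second; }

  TensorPtr ret;
  if (is_static_weight) {
    auto s = ctx.static_tensors.find(tensor_name);
    if (s != ctx.static_tensors.end()) { ret = s->second; }
  }
  if (!ret) {
    ret = std::make_shared<Tensor>();
    if (is_static_weight) { ctx.static_tensors[tensor_name] = ret; }
  }
  graph->addTensor(tensor_name, ret);  // every returned tensor is now recorded on the graph
  return ret;
}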
2 changes: 2 additions & 0 deletions mllm/backends/qnn/aot/QnnWrappersAPI.hpp
@@ -120,6 +120,8 @@ class QnnAOTGraph : public std::enable_shared_from_this<QnnAOTGraph> {

void addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op);

void addTensor(const QnnAOTNodeTensor::ptr_t& tensor);

bool compile();

bool is_compiled_ = false;
11 changes: 11 additions & 0 deletions mllm/backends/qnn/aot/passes/LLM2QnnLoweringPass.cpp
@@ -146,6 +146,17 @@ uint8_t LLM2QnnLoweringPass::run(const ir::node_ptr_t& op) {

auto aot_graph = aot_env->captureAOTGraph("context.0", subgraph_name);

// Add sub-graph inputs
for (auto& input : region->inputs()) {
auto tensor_input = input->cast_<ir::tensor::TensorValue>();
if (tensor_input) { aot_env->captureQnnAOTNodeTensor("context.0", subgraph_name, tensor_input); }
}
// Add sub-graph outputs
for (auto& output : region->outputs()) {
auto tensor_output = output->cast_<ir::tensor::TensorValue>();
if (tensor_output) { aot_env->captureQnnAOTNodeTensor("context.0", subgraph_name, tensor_output); }
}

// Walk through all linalg operations in the subgraph
subgraph_writer.walk<ir::linalg::LinalgIROp>(
[&](ir::IRWriter& this_tough_writer, const ir::linalg::LinalgIROp::ptr_t& linalg_op) -> ir::IRWriter::WalkResult {
2 changes: 1 addition & 1 deletion mllm/backends/qnn/aot_rt/KVCacheManager.cpp
@@ -9,7 +9,7 @@
namespace mllm::qnn::aot {

template<typename T>
KVCacheManager<T>::KVCacheManager(KVCacheConfig config) : config_(config) {
KVCacheManager<T>::KVCacheManager(QnnAOTConfig config) : config_(config) {
k_cache_.resize(config_.num_layers);
v_cache_.resize(config_.num_layers);

14 changes: 3 additions & 11 deletions mllm/backends/qnn/aot_rt/KVCacheManager.hpp
@@ -8,6 +8,7 @@
#include <memory>
#include "mllm/core/Storage.hpp"
#include "mllm/backends/base/Allocator.hpp"
#include "mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp"

namespace mllm::qnn::aot {

@@ -19,19 +20,10 @@ struct KVCache {
T* output_buffer;
};

struct KVCacheConfig {
int32_t context_len;
int64_t head_dim;
int32_t max_ar_len;
int32_t max_cache_len;
int64_t num_heads;
int64_t num_layers;
};

template<typename T>
class KVCacheManager {
public:
explicit KVCacheManager(KVCacheConfig config);
explicit KVCacheManager(QnnAOTConfig config);
~KVCacheManager() = default;

void initCache(mllm::Allocator* allocator, int32_t ar_len);
@@ -59,7 +51,7 @@ class KVCacheManager {
void updateKey(KVCache<T>& k_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);
void updateValue(KVCache<T>& v_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);

KVCacheConfig config_;
QnnAOTConfig config_;
size_t total_cache_size_;
int32_t cur_ar_len_;
std::vector<KVCache<T>> k_cache_;
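
For orientation, the removed KVCacheConfig fields hint at how each layer's cache buffers are sized. The sketch below assumes QnnAOTConfig still exposes at least these fields; the actual allocation in initCache may differ:

#include <cstddef>
#include <cstdint>

// Assumed subset of QnnAOTConfig, mirroring the removed KVCacheConfig fields.
struct QnnAOTConfigSketch {
  int32_t context_len;
  int64_t head_dim;
  int32_t max_ar_len;
  int32_t max_cache_len;
  int64_t num_heads;
  int64_t num_layers;
};

// Rough element count for one layer's K (or V) cache: every head keeps
// max_cache_len positions of head_dim values.
inline std::size_t perLayerCacheElems(const QnnAOTConfigSketch& cfg) {
  return static_cast<std::size_t>(cfg.num_heads)
         * static_cast<std::size_t>(cfg.max_cache_len)
         * static_cast<std::size_t>(cfg.head_dim);
}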