Merged
15 changes: 4 additions & 11 deletions examples/qwen3_qnn_aot/aot_run.cpp
@@ -14,23 +14,20 @@ MLLM_MAIN({
auto& model_path = Argparse::add<std::string>("-m|--model").help("Model path").def("qwen3_qnn.mllm");
auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
auto& temperature = Argparse::add<float>("--temperature").help("Temperature").def(0.8f);
auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);

Argparse::parse(argc, argv);

mllm::initQnnBackend(model_path.get());

if (help.isSet()) {
Argparse::printHelp();
return 0;
}

mllm::initQnnBackend(model_path.get());

auto qwen3_cfg = mllm::models::qwen3::Qwen3Config(config_path.get());

RunnerConfig config;
config.model_path = model_path.get();
config.temperature = temperature.get();
config.num_layers = qwen3_cfg.num_hidden_layers;
config.num_heads = qwen3_cfg.num_attention_heads;
config.head_dim = qwen3_cfg.head_dim;
@@ -52,12 +49,8 @@ MLLM_MAIN({
return 1;
}

std::vector<uint64_t> prompt_tokens;
auto sequence = input_tensor["sequence"];
int64_t* ptr = sequence.ptr<int64_t>();
for (int i = 0; i < sequence.shape()[1]; ++i) { prompt_tokens.push_back((uint64_t)ptr[i]); }

runner.generate(prompt_tokens, config.context_len, [](const std::string& token) { std::cout << token << std::flush; });
runner.generate(input_tensor["sequence"], config.context_len,
[](const std::string& token) { std::cout << token << std::flush; });
std::cout << "\n";

return 0;
82 changes: 37 additions & 45 deletions mllm/backends/qnn/QNNBackend.cpp
@@ -437,6 +437,25 @@ bool QNNBackend::loadContext(const std::string& contextPath) {
return true;
}

void QNNBackend::saveContext(const std::string& contextPath) {
uint64_t binarySize, writtenSize;

runtime_->qnnInterface.contextGetBinarySize(context_, &binarySize);

std::unique_ptr<uint8_t[]> binaryBuffer(new uint8_t[binarySize]);

runtime_->qnnInterface.contextGetBinary(context_, reinterpret_cast<void*>(binaryBuffer.get()), binarySize, &writtenSize);

if (binarySize < writtenSize) {
MLLM_ERROR("QNN context binary size mismatch. Written {} bytes, expected {} bytes.", writtenSize, binarySize);
}
std::ofstream file(contextPath, std::ios::binary);
file.write(reinterpret_cast<char*>(binaryBuffer.get()), writtenSize);
file.close();

MLLM_INFO("QNN context saved to {} written {} bytes.", contextPath, writtenSize);
}
Comment on lines +440 to +457
⚠️ Potential issue | 🟠 Major

Missing error handling in saveContext - may silently fail or write corrupt data.

Several issues compared to the similar QnnAOTEnv::saveContext implementation in QnnWrappersAPI.cpp:

  1. No error check on contextGetBinarySize return value (line 443)
  2. No error check on contextGetBinary return value (line 447)
  3. Size mismatch at line 449 logs an error but continues writing potentially corrupt data
  4. No check if file opened successfully before writing (line 452)
Suggested fix with proper error handling
 void QNNBackend::saveContext(const std::string& contextPath) {
   uint64_t binarySize, writtenSize;

-  runtime_->qnnInterface.contextGetBinarySize(context_, &binarySize);
+  auto status = runtime_->qnnInterface.contextGetBinarySize(context_, &binarySize);
+  if (status != QNN_SUCCESS) {
+    MLLM_ERROR("Failed to get QNN context binary size.");
+    return;
+  }

   std::unique_ptr<uint8_t[]> binaryBuffer(new uint8_t[binarySize]);

-  runtime_->qnnInterface.contextGetBinary(context_, reinterpret_cast<void*>(binaryBuffer.get()), binarySize, &writtenSize);
+  status = runtime_->qnnInterface.contextGetBinary(context_, reinterpret_cast<void*>(binaryBuffer.get()), binarySize, &writtenSize);
+  if (status != QNN_SUCCESS) {
+    MLLM_ERROR("Failed to get QNN context binary.");
+    return;
+  }

   if (binarySize < writtenSize) {
     MLLM_ERROR("QNN context binary size mismatch. Written {}  bytes, expected {} bytes.", writtenSize, binarySize);
+    return;
   }
+
   std::ofstream file(contextPath, std::ios::binary);
+  if (!file.is_open()) {
+    MLLM_ERROR("Failed to open file {} for writing QNN context.", contextPath);
+    return;
+  }
   file.write(reinterpret_cast<char*>(binaryBuffer.get()), writtenSize);
   file.close();

   MLLM_INFO("QNN context saved to {} written {} bytes.", contextPath, writtenSize);
 }
🤖 Prompt for AI Agents
In `@mllm/backends/qnn/QNNBackend.cpp` around lines 440 - 457,
QNNBackend::saveContext must perform and act on error returns and validate file
open/size before writing: check the return value of
runtime_->qnnInterface.contextGetBinarySize(context_, &binarySize) and bail/log
on failure; after allocating binaryBuffer, check the return of
runtime_->qnnInterface.contextGetBinary(context_, ..., &writtenSize) and
bail/log on failure; if writtenSize != binarySize treat it as an error and do
not proceed to write the buffer; verify std::ofstream file(contextPath,
std::ios::binary).is_open() before file.write and handle/log/return on failure;
ensure all early exits free resources and log via MLLM_ERROR/MLLM_INFO as
appropriate.


std::shared_ptr<QNNModel> QNNBackend::createQnnGraph(const std::string& graphName) {
// If the graph already exists, return the existing model
if (qnnModelIndexMap_.find(graphName) != qnnModelIndexMap_.end()) {
@@ -538,6 +557,7 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&

std::vector<Qnn_Tensor_t> qnn_inputs;
std::vector<Qnn_Tensor_t> qnn_outputs;
// Prepare QNN inputs
for (int i = 0; i < model->getGraphInputTensorWrappers().size(); i++) {
auto wrapper = model->getGraphInputTensorWrappers()[i];
auto& wrapper_tensor = wrapper->getDataContainer();
@@ -549,6 +569,7 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
return;
}

// Case of executing retrieved graph created by AOT
// input wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }

@@ -557,59 +578,30 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
wrapper->alloc();
qnn_inputs.push_back(*(wrapper->getNativeTensor()));
}

// Prepare QNN outputs in QNN order
std::vector<Tensor> qnn_output_tensors; // Temporary storage for QNN outputs
// Prepare QNN outputs
for (int j = 0; j < model->getGraphOutputTensorWrappers().size(); j++) {
auto wrapper = model->getGraphOutputTensorWrappers()[j];
auto& wrapper_tensor = wrapper->getDataContainer();
const auto& runtime_output = outputs[j];

// Validate output tensors
if (runtime_output.isNil()) {
MLLM_ERROR("Output tensor {} is nil for graph '{}'", j, graphName);
return;
}

// output wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_output); }

// alloc and register qnn tensor
model->getGraphOutputTensorWrappers()[j]->alloc(); // QNNAllocator will handle registered memory descriptor
qnn_outputs.push_back(*(model->getGraphOutputTensorWrappers()[j]->getNativeTensor()));
qnn_output_tensors.push_back(model->getGraphOutputTensorWrappers()[j]->getDataContainer());
wrapper->alloc(); // QNNAllocator will handle registered memory descriptor
qnn_outputs.push_back(*(wrapper->getNativeTensor()));
}

CALL_QNN(runtime_->qnnInterface.graphExecute(model->getQnnGraph(), qnn_inputs.data(), qnn_inputs.size(), qnn_outputs.data(),
qnn_outputs.size(), runtime_->profileHandle, nullptr));

if (ProfilingLevel::OFF != profilingLevel_) { extractBackendProfilingInfo(runtime_->profileHandle); }

// Reorder outputs according to MLLM expected order
const auto& expectedOrder = model->getExpectedOutputOrder();

// Resize outputs to match QNN output count first
outputs.resize(qnn_output_tensors.size()); // Ensure outputs has enough space for all QNN outputs
if (!expectedOrder.empty() && expectedOrder.size() == qnn_output_tensors.size()) {
// Reorder outputs according to expected order
for (size_t i = 0; i < expectedOrder.size(); i++) {
const std::string& expected_name = expectedOrder[i];
int qnn_index = model->getQnnOutputIndex(expected_name);
if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
outputs[i] = qnn_output_tensors[qnn_index];
} else {
MLLM_ERROR("QNNBackend::graphExecute: Failed to find QNN output index for tensor '{}' in graph '{}'", expected_name,
graphName);
// If mapping fails, we cannot safely reorder outputs
// This is a critical error as we cannot determine the correct output order
MLLM_ERROR("Cannot reorder outputs: missing QNN output index for tensor '{}'. Output order may be incorrect.",
expected_name);
// Note: We still try to copy what we can, but the order may be wrong
if (i < qnn_output_tensors.size()) {
outputs[i] = qnn_output_tensors[i];
} else {
MLLM_ERROR("Output index {} out of bounds (size: {})", i, qnn_output_tensors.size());
}
}
}
} else {
// No expected order set or size mismatch, use QNN order as-is
if (expectedOrder.empty()) {
MLLM_WARN("QNNBackend::graphExecute: No expected output order set for graph '{}', using QNN order", graphName);
} else {
MLLM_WARN(
"QNNBackend::graphExecute: Expected output order size ({}) != outputs size ({}) for graph '{}', using QNN order",
expectedOrder.size(), outputs.size(), graphName);
}
for (size_t i = 0; i < qnn_output_tensors.size(); i++) { outputs[i] = qnn_output_tensors[i]; }
}
}

bool QNNBackend::addTensor(const std::string& graphName, const std::string& tensorName, Qnn_TensorType_t type,
1 change: 1 addition & 0 deletions mllm/backends/qnn/QNNBackend.hpp
@@ -89,6 +89,7 @@ class QNNBackend final : public Backend {

bool loadContext(const std::string& contextPath);
bool createContext();
void saveContext(const std::string& contextPath = "qnn_context.bin");

bool isWeightOnDevice() override { return false; }

1 change: 0 additions & 1 deletion mllm/backends/qnn/QNNUtils.cpp
@@ -483,7 +483,6 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::createStaticTensor(const std
}

void QNNTensorWrapper::alloc() {
if (isAlloc_) { MLLM_WARN("Tensor {} has already been allocated.", name_); }
MLLM_RT_ASSERT(dataContainer_.device() == kQNN);

// if storage is not allocated, allocate it
30 changes: 21 additions & 9 deletions mllm/backends/qnn/aot/QnnWrappersAPI.cpp
@@ -257,6 +257,11 @@ QnnAOTGraph::QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle
qnn_model_->initialize(contextHandle, graphName.c_str(), false);
}

void QnnAOTGraph::addTensor(const QnnAOTNodeTensor::ptr_t& tensor) {
qnn_model_->addTensorWrapper(tensor->getWrapper());
all_tensors_.insert({tensor->getWrapper()->getName(), tensor});
}

void QnnAOTGraph::addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op) {
std::vector<std::string> inputNames;
for (auto& in : qnn_op->inputs) inputNames.push_back(in->getWrapper()->getName());
@@ -622,23 +627,30 @@ QnnAOTNodeTensor::ptr_t QnnAOTEnv::captureQnnAOTNodeTensor(const std::string& qn
__qnn_enable_static_weight = true;
}

MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1);
MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1);
auto graph = contexts_[qnn_context_name]->graphs_[graph_name];

// If normal weight is cached, we return it directly
if (graph->all_tensors_.count(__qnn_tensor_name)) { return graph->all_tensors_[__qnn_tensor_name]; }

QnnAOTNodeTensor::ptr_t ret = nullptr;

// If static weight is cached, we return it directly.
if (__qnn_enable_static_weight) {
MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1);
if (contexts_[qnn_context_name]->static_tensor_.count(__qnn_tensor_name)) {
return contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name];
ret = contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name];
}
}

// If normal weight is cached, we return it directly
MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1);
MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1);
if (contexts_[qnn_context_name]->graphs_[graph_name]->all_tensors_.count(__qnn_tensor_name)) {
return contexts_[qnn_context_name]->graphs_[graph_name]->all_tensors_[__qnn_tensor_name];
// There is no Tensor in the cache yet; create it.
if (ret == nullptr) {
ret = QnnAOTNodeTensor::create(v, __qnn_enable_static_weight);

if (__qnn_enable_static_weight) { contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name] = ret; }
}

// There has no Tensor in the cache.
auto ret = QnnAOTNodeTensor::create(v, __qnn_enable_static_weight);
graph->addTensor(ret);

return ret;
}
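
To make the interleaved removals and additions above easier to follow, here is a self-contained model of the resulting lookup order. The stand-in types below are simplifications; the real QnnAOTEnv, QnnAOTGraph, and QnnAOTNodeTensor carry more state:

#include <map>
#include <memory>
#include <string>

struct Tensor {};  // stand-in for QnnAOTNodeTensor
using TensorPtr = std::shared_ptr<Tensor>;

struct Graph {  // stand-in for QnnAOTGraph
  std::map<std::string, TensorPtr> all_tensors;
  void addTensor(const std::string& name, const TensorPtr& t) { all_tensors[name] = t; }
};

struct Context {  // stand-in for one QNN AOT context
  std::map<std::string, std::shared_ptr<Graph>> graphs;
  std::map<std::string, TensorPtr> static_tensors;  // shared across graphs in one context
};

// Post-change behaviour: per-graph cache first, then the context-wide
// static-weight cache, then create; the result is always registered with the graph.
TensorPtr capture(Context& ctx, const std::string& graph_name,
                  const std::string& tensor_name, bool is_static_weight) {
  auto graph = ctx.graphs.at(graph_name);
  auto hit = graph->all_tensors.find(tensor_name);
  if (hit != graph->all_tensors.end()) { return hit->second; }

  TensorPtr ret;
  if (is_static_weight) {
    auto s = ctx.static_tensors.find(tensor_name);
    if (s != ctx.static_tensors.end()) { ret = s->second; }
  }
  if (!ret) {
    ret = std::make_shared<Tensor>();
    if (is_static_weight) { ctx.static_tensors[tensor_name] = ret; }
  }
  graph->addTensor(tensor_name, ret);  // every returned tensor is now recorded on the graph
  return ret;
}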
2 changes: 2 additions & 0 deletions mllm/backends/qnn/aot/QnnWrappersAPI.hpp
@@ -120,6 +120,8 @@ class QnnAOTGraph : public std::enable_shared_from_this<QnnAOTGraph> {

void addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op);

void addTensor(const QnnAOTNodeTensor::ptr_t& tensor);

bool compile();

bool is_compiled_ = false;
11 changes: 11 additions & 0 deletions mllm/backends/qnn/aot/passes/LLM2QnnLoweringPass.cpp
@@ -146,6 +146,17 @@ uint8_t LLM2QnnLoweringPass::run(const ir::node_ptr_t& op) {

auto aot_graph = aot_env->captureAOTGraph("context.0", subgraph_name);

// Add sub-graph inputs
for (auto& input : region->inputs()) {
auto tensor_input = input->cast_<ir::tensor::TensorValue>();
if (tensor_input) { aot_env->captureQnnAOTNodeTensor("context.0", subgraph_name, tensor_input); }
}
// Add sub-graph outputs
for (auto& output : region->outputs()) {
auto tensor_output = output->cast_<ir::tensor::TensorValue>();
if (tensor_output) { aot_env->captureQnnAOTNodeTensor("context.0", subgraph_name, tensor_output); }
}

// Walk through all linalg operations in the subgraph
subgraph_writer.walk<ir::linalg::LinalgIROp>(
[&](ir::IRWriter& this_tough_writer, const ir::linalg::LinalgIROp::ptr_t& linalg_op) -> ir::IRWriter::WalkResult {
2 changes: 1 addition & 1 deletion mllm/backends/qnn/aot_rt/KVCacheManager.cpp
@@ -9,7 +9,7 @@
namespace mllm::qnn::aot {

template<typename T>
KVCacheManager<T>::KVCacheManager(KVCacheConfig config) : config_(config) {
KVCacheManager<T>::KVCacheManager(QnnAOTConfig config) : config_(config) {
k_cache_.resize(config_.num_layers);
v_cache_.resize(config_.num_layers);

14 changes: 3 additions & 11 deletions mllm/backends/qnn/aot_rt/KVCacheManager.hpp
@@ -8,6 +8,7 @@
#include <memory>
#include "mllm/core/Storage.hpp"
#include "mllm/backends/base/Allocator.hpp"
#include "mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp"

namespace mllm::qnn::aot {

@@ -19,19 +20,10 @@ struct KVCache {
T* output_buffer;
};

struct KVCacheConfig {
int32_t context_len;
int64_t head_dim;
int32_t max_ar_len;
int32_t max_cache_len;
int64_t num_heads;
int64_t num_layers;
};

template<typename T>
class KVCacheManager {
public:
explicit KVCacheManager(KVCacheConfig config);
explicit KVCacheManager(QnnAOTConfig config);
~KVCacheManager() = default;

void initCache(mllm::Allocator* allocator, int32_t ar_len);
@@ -59,7 +51,7 @@ class KVCacheManager {
void updateKey(KVCache<T>& k_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);
void updateValue(KVCache<T>& v_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);

KVCacheConfig config_;
QnnAOTConfig config_;
size_t total_cache_size_;
int32_t cur_ar_len_;
std::vector<KVCache<T>> k_cache_;
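
For orientation, the removed KVCacheConfig fields hint at how each layer's cache buffers are sized. The sketch below assumes QnnAOTConfig still exposes at least these fields; the actual allocation in initCache may differ:

#include <cstddef>
#include <cstdint>

// Assumed subset of QnnAOTConfig, mirroring the removed KVCacheConfig fields.
struct QnnAOTConfigSketch {
  int32_t context_len;
  int64_t head_dim;
  int32_t max_ar_len;
  int32_t max_cache_len;
  int64_t num_heads;
  int64_t num_layers;
};

// Rough element count for one layer's K (or V) cache: every head keeps
// max_cache_len positions of head_dim values.
inline std::size_t perLayerCacheElems(const QnnAOTConfigSketch& cfg) {
  return static_cast<std::size_t>(cfg.num_heads)
         * static_cast<std::size_t>(cfg.max_cache_len)
         * static_cast<std::size_t>(cfg.head_dim);
}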