From 28945a9cd28f45ceba7a7e7a540042ca506b8a46 Mon Sep 17 00:00:00 2001
From: zhushuang <974198603@qq.com>
Date: Wed, 11 Feb 2026 21:03:32 +0800
Subject: [PATCH 1/3] issue/224 - feat: add warmup before InfiniLM generation

---
 examples/jiuge.py | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/examples/jiuge.py b/examples/jiuge.py
index 8612fc26..f0b563b6 100644
--- a/examples/jiuge.py
+++ b/examples/jiuge.py
@@ -236,6 +236,43 @@ def test(
 
     model.reset_cache(cache_config)
 
+    # ---------------------------------------------------------------------------- #
+    # Warmup
+    # ---------------------------------------------------------------------------- #
+    warmup_steps = 1
+
+    # Choose a length that approximates the real workload.
+    # It should be long enough to trigger the correct kernel paths,
+    # but not so long that warmup becomes unnecessarily expensive.
+    avg_prompt_len = min(64, max(len(ids) for ids in input_ids_list))
+
+    # Use truncated versions of real prompts for warmup
+    warmup_ids = [
+        ids[:avg_prompt_len] if len(ids) >= avg_prompt_len else ids
+        for ids in input_ids_list
+    ]
+
+    input_ids_infini = infinicore.from_list(warmup_ids)
+
+    print("=================== warmup start ===================")
+
+    for _ in range(warmup_steps):
+        _ = model.generate(
+            input_ids_infini,
+            GenerationConfig(
+                max_new_tokens=2,  # warmup decode kernel
+                temperature=1,
+                top_k=1,
+                top_p=0.8,
+            ),
+            _measure_and_log_time=False,
+        )
+
+    print("=================== warmup done ====================")
+
+    # Reset KV cache
+    model.reset_cache(cache_config)
+
     # ---------------------------------------------------------------------------- #
     # Generate
     # ---------------------------------------------------------------------------- #

From d304ba00d657151c3e019d19eb393e0646d3131d Mon Sep 17 00:00:00 2001
From: zhushuang <974198603@qq.com>
Date: Wed, 11 Feb 2026 21:17:51 +0800
Subject: [PATCH 2/3] issue/224 - feat: use muDNN silu_and_mul to replace
 elementwise swiglu on Moore GPUs

---
 csrc/models/llama/llama_mlp.cpp | 35 +++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp
index a3ab7859..282e2eca 100644
--- a/csrc/models/llama/llama_mlp.cpp
+++ b/csrc/models/llama/llama_mlp.cpp
@@ -71,19 +71,34 @@ LlamaMLP::LlamaMLP(std::shared_ptr model_config,
 }
 
 infinicore::Tensor LlamaMLP::forward(const infinicore::Tensor &hidden_states) const {
-    // 1. Project to gate and up
-    auto hidden_states_mutable = hidden_states;
-    auto [gate, up] = gate_up_proj_->forward_split(hidden_states_mutable);
+    infinicore::Device::Type dev_type = hidden_states->device().getType();
+    if (dev_type == infinicore::Device::Type::MOORE) {
+        // 1. Project to a single combined gate_up tensor
+        auto hidden_states_mutable = hidden_states;
+        auto gate_up = gate_up_proj_->forward(hidden_states_mutable);
 
-    // 2. Apply SwiGLU: silu(gate) * up
-    // Note: swiglu kernel expects (up, gate) and computes gate * sigmoid(gate) * up
-    // So we pass (up, gate) to get the correct result: gate * sigmoid(gate) * up
-    auto intermediate = infinicore::op::swiglu(up, gate);
+        // 2. Apply the fused silu_and_mul operator, which applies SiLU to the
+        // first half of the last dimension and multiplies it by the second half.
+        // Mathematically equivalent to: result = SiLU(gate_up[..., :d]) * gate_up[..., d:]
+        auto intermediate = infinicore::op::silu_and_mul(gate_up);
 
-    // 3. Project down
-    auto output = down_proj_->forward(intermediate);
+        // 3. Project down
+        auto output = down_proj_->forward(intermediate);
+        return output;
+    } else {
+        // 1. Project to gate and up
+        auto hidden_states_mutable = hidden_states;
+        auto [gate, up] = gate_up_proj_->forward_split(hidden_states_mutable);
 
-    return output;
+        // 2. Apply SwiGLU: silu(gate) * up
+        // Note: swiglu kernel expects (up, gate) and computes gate * sigmoid(gate) * up
+        // So we pass (up, gate) to get the correct result: gate * sigmoid(gate) * up
+        auto intermediate = infinicore::op::swiglu(up, gate);
+
+        // 3. Project down
+        auto output = down_proj_->forward(intermediate);
+        return output;
+    }
 }
 
 } // namespace infinilm::models::llama
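Note on the MOORE branch above: a plain-Python reference may help review the fused
operator. This is an illustrative sketch only; silu_and_mul below is written from the
patch's own comment (result = SiLU(gate_up[..., :d]) * gate_up[..., d:]), not taken
from muDNN or the infinicore::op API, and the layout assumption (gate half first, up
half second along the last dimension) also comes from that comment:

    import math

    def silu(x: float) -> float:
        # SiLU(x) = x * sigmoid(x)
        return x / (1.0 + math.exp(-x))

    def silu_and_mul(gate_up: list) -> list:
        # Split the last dimension in half: [gate | up], per the patch comment.
        d = len(gate_up) // 2
        gate, up = gate_up[:d], gate_up[d:]
        return [silu(g) * u for g, u in zip(gate, up)]

    # For a 1-D "tensor" laid out as [g0, g1, g2, u0, u1, u2]:
    print(silu_and_mul([1.0, 2.0, 3.0, 0.5, 0.5, 0.5]))

Fusing the split, activation, and multiply into one kernel avoids extra elementwise
passes over the intermediate tensor, which is presumably why the Moore backend is
special-cased here.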
From a247d405ec0c282e617e267eeb89c5dbe74c3b26 Mon Sep 17 00:00:00 2001
From: zhushuang <974198603@qq.com>
Date: Thu, 12 Feb 2026 09:50:54 +0800
Subject: [PATCH 3/3] issue/224 - feat: add --warmup flag and disable warmup
 by default

---
 examples/jiuge.py | 62 ++++++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/examples/jiuge.py b/examples/jiuge.py
index f0b563b6..3b515d78 100644
--- a/examples/jiuge.py
+++ b/examples/jiuge.py
@@ -131,6 +131,11 @@ def get_args():
         default=1.0,
         help="sampling temperature",
     )
+    parser.add_argument(
+        "--warmup",
+        action="store_true",
+        help="Perform a warmup run before benchmarking/inference.",
+    )
     return parser.parse_args()
 
 
@@ -239,39 +244,40 @@ def test(
     # ---------------------------------------------------------------------------- #
     # Warmup
     # ---------------------------------------------------------------------------- #
-    warmup_steps = 1
-
-    # Choose a length that approximates the real workload.
-    # It should be long enough to trigger the correct kernel paths,
-    # but not so long that warmup becomes unnecessarily expensive.
-    avg_prompt_len = min(64, max(len(ids) for ids in input_ids_list))
-
-    # Use truncated versions of real prompts for warmup
-    warmup_ids = [
-        ids[:avg_prompt_len] if len(ids) >= avg_prompt_len else ids
-        for ids in input_ids_list
-    ]
+    if args.warmup:
+        warmup_steps = 1
+
+        # Choose a length that approximates the real workload.
+        # It should be long enough to trigger the correct kernel paths,
+        # but not so long that warmup becomes unnecessarily expensive.
+        avg_prompt_len = min(64, max(len(ids) for ids in input_ids_list))
+
+        # Use truncated versions of real prompts for warmup
+        warmup_ids = [
+            ids[:avg_prompt_len] if len(ids) >= avg_prompt_len else ids
+            for ids in input_ids_list
+        ]
 
-    input_ids_infini = infinicore.from_list(warmup_ids)
+        input_ids_infini = infinicore.from_list(warmup_ids)
 
-    print("=================== warmup start ===================")
+        print("=================== warmup start ===================")
 
-    for _ in range(warmup_steps):
-        _ = model.generate(
-            input_ids_infini,
-            GenerationConfig(
-                max_new_tokens=2,  # warmup decode kernel
-                temperature=1,
-                top_k=1,
-                top_p=0.8,
-            ),
-            _measure_and_log_time=False,
-        )
+        for _ in range(warmup_steps):
+            _ = model.generate(
+                input_ids_infini,
+                GenerationConfig(
+                    max_new_tokens=2,  # warmup decode kernel
+                    temperature=temperature,
+                    top_k=top_k,
+                    top_p=top_p,
+                ),
+                _measure_and_log_time=False,
+            )
 
-    print("=================== warmup done ====================")
+        print("=================== warmup done ====================")
 
-    # Reset KV cache
-    model.reset_cache(cache_config)
+        # Reset KV cache
+        model.reset_cache(cache_config)
 
     # ---------------------------------------------------------------------------- #
     # Generate
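With the series applied, warmup is opt-in: pass --warmup alongside the existing
arguments of examples/jiuge.py, and the default behavior is unchanged. The third
patch also switches the warmup GenerationConfig from hardcoded values to the run's
own temperature/top_k/top_p, so warmup exercises the same sampling path as the
measured run; warming up with top_k=1 while benchmarking a different sampler could
leave some kernels cold.

The timing pattern the flag enables looks roughly like the sketch below. It is a
self-contained illustration, not InfiniLM code: FakeModel and its sleep costs are
made up to stand in for one-time initialization (kernel compilation, cache
allocation) versus steady-state decode cost.

    import time

    class FakeModel:
        # Stand-in for a real model whose first call pays one-time costs.
        def __init__(self):
            self._initialized = False

        def generate(self, ids):
            if not self._initialized:
                time.sleep(0.2)   # one-time cost: kernel compile, cache alloc, ...
                self._initialized = True
            time.sleep(0.01)      # steady-state decode cost
            return ids

    model = FakeModel()
    warmup = True  # stands in for args.warmup

    if warmup:
        model.generate([1, 2, 3])  # absorb one-time costs; output discarded

    t0 = time.perf_counter()
    model.generate([1, 2, 3])
    print(f"measured latency: {time.perf_counter() - t0:.3f}s")

Without the warmup call the measured latency includes the 0.2 s one-time cost; with
it, the measurement reflects steady state, which is what the benchmark is after.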