From 26a8faf4fef8ebbcb7bca276b65e3b9125673d81 Mon Sep 17 00:00:00 2001
From: "peng.li24" <peng.li24@nio.com>
Date: Thu, 29 Jan 2026 09:02:52 +0000
Subject: [PATCH 1/3] =?UTF-8?q?deepxir:=E8=AE=BE=E8=AE=A1=E5=AE=8C?=
 =?UTF-8?q?=E5=96=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/executor-deepxcore.yml      |   2 +-
 .github/workflows/executor-heapmemcuda.yml    |   6 +-
 .github/workflows/executor-op-cuda-linux.yml  |   2 +-
 .../workflows/executor-op-ompsimd-linux.yml   |   2 +-
 docs/deepxIR/deepxir.md                       | 121 ++++++++++++++++++
 executor/deepxcore/README.md                  |  72 +++++++++++
 6 files changed, 199 insertions(+), 6 deletions(-)
 create mode 100644 docs/deepxIR/deepxir.md
 create mode 100644 executor/deepxcore/README.md

diff --git a/.github/workflows/executor-deepxcore.yml b/.github/workflows/executor-deepxcore.yml
index dcbe9ab..2f280f1 100644
--- a/.github/workflows/executor-deepxcore.yml
+++ b/.github/workflows/executor-deepxcore.yml
@@ -1,4 +1,4 @@
-name: Excuter/cppcommon Build
+name: executor/deepxcore Build
 on:
   push:
     paths:
diff --git a/.github/workflows/executor-heapmemcuda.yml b/.github/workflows/executor-heapmemcuda.yml
index bd3119d..fbfd2c4 100644
--- a/.github/workflows/executor-heapmemcuda.yml
+++ b/.github/workflows/executor-heapmemcuda.yml
@@ -1,4 +1,4 @@
-name: op/cuda-linux Build
+name: executor/heapmem-cuda Build
 on:
   push:
     paths:
@@ -61,8 +61,8 @@ jobs:
             cp -r include/* /usr/local/include/ && \
             cd /workspace && \
             
-            # 构建 common 库
-            cd executor/cpp-common && \
+            # 构建 deepxcore 库
+            cd executor/deepxcore && \
             mkdir -p build && cd build && \
             cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -GNinja .. && \
             ninja && \
diff --git a/.github/workflows/executor-op-cuda-linux.yml b/.github/workflows/executor-op-cuda-linux.yml
index 41cfa4a..51d7daf 100644
--- a/.github/workflows/executor-op-cuda-linux.yml
+++ b/.github/workflows/executor-op-cuda-linux.yml
@@ -1,4 +1,4 @@
-name: Excuter/cuda-linux Build
+name: executor/op-cuda-linux Build
 on:
   push:
     paths:
diff --git a/.github/workflows/executor-op-ompsimd-linux.yml b/.github/workflows/executor-op-ompsimd-linux.yml
index 414be7e..d673e2b 100644
--- a/.github/workflows/executor-op-ompsimd-linux.yml
+++ b/.github/workflows/executor-op-ompsimd-linux.yml
@@ -1,4 +1,4 @@
-name: Excuter/ompsimd-linux Build
+name: executor/op-mem-ompsimd-linux Build
 on:
   push:
     paths:
diff --git a/docs/deepxIR/deepxir.md b/docs/deepxIR/deepxir.md
new file mode 100644
index 0000000..80f3ce1
--- /dev/null
+++ b/docs/deepxIR/deepxir.md
@@ -0,0 +1,121 @@
+// ==================== 1. 类型系统 ====================
+// 基础数据类型
+type f16, f32, f64, bf16,bf8      // 浮点类型
+type i8, i16, i32, i64, u8    // 整数类型
+type bool                     // 布尔类型
+
+//类型约束
+f32|f64  //支持2种类型之一
+
+// Tensor类型模板
+type tensor<shape; elem_type>
+// shape格式: dim1xdim2x...xdimN 或 ? 表示动态维度
+// 示例: tensor<10x20xf32>, tensor<?x?xi32>
+
+// ==================== 2. ir定义格式 ====================
+deepxir ir_name(ro_p1:type1,ro_param2:type2,...) -> (w_p1:type3,w_p2:type4,...)
+{
+    // 函数体: IR操作序列
+    operation_name( ro_p1,  ro_p1)-> w_p1
+    operation_name( ro_p2,  ro_p2)-> w_p2
+}
+
+// ==================== 3. 具体示例 ====================
+
+// 示例1: 符合您要求的精确约束函数
+function  constrained_matmul(
+     A: tensor<?x1xFloat>,      // 第一个参数: <?x1> 且元素类型为f32或f64
+     B: tensor<1x?xFloat>       // 第二个参数: <1x?> 且元素类型与 A相同
+) -> tensor<?x?xFloat> {        // 返回值: <?x?> 且元素类型继承自输入
+    
+    // 函数体 - IR操作序列
+     0 = tensor.matmul( A,  B) {
+        transpose_a = false,
+        transpose_b = false
+    } : (tensor<?x1xFloat>, tensor<1x?xFloat>) -> tensor<?x?xFloat>
+    
+    return  0
+}
+
+// 示例2: 更复杂的函数，包含多个操作
+function  conv_relu(
+     input: tensor<1x32x32x3xf32>,
+     filter: tensor<3x3x3x16xf32>
+) -> tensor<1x30x30x16xf32> {
+    
+    // 卷积操作
+     conv = tensor.conv2d( input,  filter) {
+        stride = [1, 1],
+        padding = "valid",
+        dilation = [1, 1]
+    } : (tensor<1x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<1x30x30x16xf32>
+    
+    // ReLU激活
+     relu = tensor.relu( conv) : (tensor<1x30x30x16xf32>) -> tensor<1x30x30x16xf32>
+    
+    return  relu
+}
+
+// 示例3: 支持动态形状和类型推断
+function  dynamic_operations(
+     A: tensor<?x?xf32>,
+     B: tensor<?x?xf32>
+) -> tensor<?x?xf32> {
+    
+    // 元素级加法
+     add = tensor.add( A,  B) : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
+    
+    // 矩阵乘法
+     matmul = tensor.matmul( add,  A) : 
+        (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
+    
+    return  matmul
+}
+
+// 示例4: 带有属性约束的函数
+function  batch_norm(
+     input: tensor<?x?x?x?xf32>,
+     scale: tensor<?xf32>,
+     bias: tensor<?xf32>
+) -> tensor<?x?x?x?xf32> 
+attributes {
+    training = true,
+    epsilon = 1e-5 : f32,
+    momentum = 0.9 : f32
+} {
+     output = tensor.batch_norm( input,  scale,  bias) {
+        epsilon = 1e-5 : f32,
+        training = true
+    } : (tensor<?x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>) -> tensor<?x?x?x?xf32>
+    
+    return  output
+}
+
+// ==================== 4. 操作签名格式 ====================
+操作签名格式:
+ result = operation_name(operands) {attributes} : (input_types) -> output_type
+
+其中:
+- operands: 逗号分隔的输入SSA值 ( arg1,  arg2, ...)
+- attributes: 键值对属性 {key1 = value1, key2 = value2}
+- input_types: 逗号分隔的输入类型
+- output_type: 单个输出类型
+
+// ==================== 5. 类型推断规则 ====================
+type_rule matmul_shape_inference {
+    input_shapes = [tensor<?xMxT>, tensor<?xNxT>],
+    output_shape = tensor<?x?xT>,
+    constraints = [
+        M.dim1 == N.dim0,  // 矩阵乘法维度匹配
+        T in Numeric       // 元素类型为数值类型
+    ]
+}
+
+type_rule elementwise_broadcast {
+    input_shapes = [tensor<A_dimsxT>, tensor<B_dimsxT>],
+    output_shape = tensor<broadcast(A_dims, B_dims)xT>,
+    constraints = [
+        can_broadcast(A_dims, B_dims),  // 维度可广播
+        T in Numeric
+    ]
+}
\ No newline at end of file
diff --git a/executor/deepxcore/README.md b/executor/deepxcore/README.md
new file mode 100644
index 0000000..37391f7
--- /dev/null
+++ b/executor/deepxcore/README.md
@@ -0,0 +1,72 @@
+# deepxcore
+
+deepxcore 是 deepx 执行器层与统一存算面协议共享的 C++ 核心基础库。
+
+它的目标是提供稳定、跨执行器可复用的数据模型与协议对象，避免 CUDA/Metal/CPU 等具体实现细节泄漏到上层与其他组件，从而保证进程之间、代码组件之间的隔离。
+
+## 定位
+- 面向：执行器进程（heapmem-*、op-*）、统一存算面 SDK、调度/编译侧的 C++ 组件
+- 提供：dtype/shape/tensor 等基础数据结构、协议对象的结构化表达、配置与序列化基础设施
+- 不提供：具体硬件算子实现、显存/IPC 生命周期实现、调度编译逻辑
+
+## 职责
+
+### 1) 基础数据模型
+- `DType`：数据类型描述与大小/对齐等基础能力
+- `Shape`：维度/元素数量/bytes 计算、shape 合法性检查
+- `Tensor`：Tensor 元信息与句柄表达（不绑定具体设备实现）
+
+这些类型应作为所有执行器的共同语言，保证跨组件传递时语义一致。
+
+### 2) 统一存算面协议对象
+用于在统一寻址空间（如 Redis KV）与执行器之间传递的数据结构，例如：
+- tensor 元信息记录（name/key、dtype、shape、device、bytes、ctime 等）
+- 生命周期指令（create/get/delete 等）
+
+deepxcore 只负责“结构化表达与编解码”，不负责“真正分配/回收/IPC 映射”。
+
+### 3) 序列化/反序列化与配置
+- 将协议对象、元信息在 JSON/YAML/二进制之间进行编解码
+- 读取执行器/客户端的配置（例如地址、设备策略、协议版本等）
+
+目标是让其他组件不要各自实现一套解析与校验逻辑。
+
+### 4) 通用基础设施
+- 轻量的错误与返回值表达（Status/Result）
+- 字符串、文件系统等工具的薄封装
+
+要求保持依赖尽量少、接口稳定、与具体硬件/运行时解耦。
+
+## 非职责（边界）
+
+### 不做硬件绑定
+- 不直接依赖 CUDA/Metal/ROCm/NCCL 等
+- 不实现任何具体算子 kernel
+
+这些应由 `op-cuda`、`op-ompsimd`、`op-mem-mps` 等执行器承担。
+
+### 不做堆 tensor 生命周期与 IPC
+- 不管理持久堆 tensor 的分配/回收
+- 不负责 CUDA IPC handle 的创建/打开/关闭
+
+这些应由 `heapmem-cuda` 这类“统一寻址空间的 tensor 具体实现”承担。
+
+### 不做编译与调度
+- 不负责 deepxIR 的编译替换、fusion、分布式调度
+
+这些属于中端编译器与调度器。
+
+## 与其他组件的关系
+
+- heapmem-*：owner 侧负责堆 tensor 生命周期与跨进程共享；deepxcore 提供 dtype/shape/协议对象
+- op-*：算子执行器负责栈 tensor（中间变量）与 kernel；deepxcore 提供基础数据模型与统一的元信息表达
+- 前端/SDK：通过统一协议把计算图与 tensor 元信息写入统一寻址空间；deepxcore 是 C++ 侧共用的协议层
+
+## 目录
+- `src/`：核心库实现
+- `test/`：单元测试
+
+## 构建
+本库通过 CMake 构建，并作为其他执行器目标的依赖被链接。
+
+在上层执行器中使用时，通常只需要链接 `deepxcore` 目标，并包含对应头文件。
\ No newline at end of file

From 873866988713d4140c0ef75de1c8419497b7e586 Mon Sep 17 00:00:00 2001
From: "peng.li24" <peng.li24@nio.com>
Date: Fri, 30 Jan 2026 06:26:26 +0000
Subject: [PATCH 2/3] =?UTF-8?q?deepxir:=E8=AE=BE=E8=AE=A1=E5=AE=8C?=
 =?UTF-8?q?=E5=96=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/deepxIR/deepxir.md | 112 +++++++++-------------------------------
 1 file changed, 25 insertions(+), 87 deletions(-)

diff --git a/docs/deepxIR/deepxir.md b/docs/deepxIR/deepxir.md
index 80f3ce1..1c89938 100644
--- a/docs/deepxIR/deepxir.md
+++ b/docs/deepxIR/deepxir.md
@@ -7,11 +7,17 @@ type bool                     // 布尔类型
 //类型约束
 f32|f64  //支持2种类型之一
 
+
 // Tensor类型模板
 type tensor<shape; elem_type>
 // shape格式: dim1xdim2x...xdimN 或 ? 表示动态维度
 // 示例: tensor<10x20xf32>, tensor<?x?xi32>
 
+//动态维度的维度变量
+? //任意数字
+?1 //动态维度变量1
+?2 //动态维度变量2，用来告诉出现?2的tensor对应维度需要保持一致
+
 // ==================== 2. ir定义格式 ====================
 deepxir ir_name(ro_p1:type1,ro_param2:type2,...) -> (w_p1:type3,w_p2:type4,...)
 {
@@ -19,103 +25,35 @@ deepxir ir_name(ro_p1:type1,ro_param2:type2,...) -> (w_p1:type3,w_p2:type4,...)
     operation_name( ro_p1,  ro_p1)-> w_p1
     operation_name( ro_p2,  ro_p2)-> w_p2
 }
+deepxir是关键词，或者我们也可以使用function,func这些传统关键字
+用来定义新的ir名
 
-// ==================== 3. 具体示例 ====================
+deepxir的参数，遵循左读右写的规则，没有返回值
+deepxir的参数类型，既包括tensor，还有list<tensor>，也包括基础类型，以及list<基础类型>
 
-// 示例1: 符合您要求的精确约束函数
-function  constrained_matmul(
-     A: tensor<?x1xFloat>,      // 第一个参数: <?x1> 且元素类型为f32或f64
-     B: tensor<1x?xFloat>       // 第二个参数: <1x?> 且元素类型与 A相同
-) -> tensor<?x?xFloat> {        // 返回值: <?x?> 且元素类型继承自输入
-    
-    // 函数体 - IR操作序列
-     0 = tensor.matmul( A,  B) {
-        transpose_a = false,
-        transpose_b = false
-    } : (tensor<?x1xFloat>, tensor<1x?xFloat>) -> tensor<?x?xFloat>
-    
-    return  0
-}
+// ==================== 3.设计思考 ====================
 
-// 示例2: 更复杂的函数，包含多个操作
-function  conv_relu(
-     input: tensor<1x32x32x3xf32>,
-     filter: tensor<3x3x3x16xf32>
-) -> tensor<1x30x30x16xf32> {
-    
-    // 卷积操作
-     conv = tensor.conv2d( input,  filter) {
-        stride = [1, 1],
-        padding = "valid",
-        dilation = [1, 1]
-    } : (tensor<1x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<1x30x30x16xf32>
-    
-    // ReLU激活
-     relu = tensor.relu( conv) : (tensor<1x30x30x16xf32>) -> tensor<1x30x30x16xf32>
-    
-    return  relu
+// ==================== 4. 具体示例 ====================
+
+// 示例1: 包含多个操作
+deepxir conv_relu(input: tensor<1x32x32x3xf32>,filter: tensor<3x3x3x16xf32>) -> (out: tensor<1x30x30x16xf32>) {
+    tensor.new([1 30 30 16],f32)->conv
+    tensor.conv2d( input,  filter)->conv
+    tensor.relu(conv)-> out
 }
 
 // 示例3: 支持动态形状和类型推断
-function  dynamic_operations(
-     A: tensor<?x?xf32>,
-     B: tensor<?x?xf32>
-) -> tensor<?x?xf32> {
-    
-    // 元素级加法
-     add = tensor.add( A,  B) : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-    
-    // 矩阵乘法
-     matmul = tensor.matmul( add,  A) : 
-        (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-    
-    return  matmul
+deepxir dynamic_operations( A: tensor<?x?xf32>,B: tensor<?x?xf32>
+) -> (out: tensor<?x?xf32>) {
+    tensor.add( A,  B)-> %add
+    tensor.matmul( %add,  A)-> out
 }
 
 // 示例4: 带有属性约束的函数
-function  batch_norm(
+deepxir batch_norm(
      input: tensor<?x?x?x?xf32>,
      scale: tensor<?xf32>,
      bias: tensor<?xf32>
-) -> tensor<?x?x?x?xf32> 
-attributes {
-    training = true,
-    epsilon = 1e-5 : f32,
-    momentum = 0.9 : f32
-} {
-     output = tensor.batch_norm( input,  scale,  bias) {
-        epsilon = 1e-5 : f32,
-        training = true
-    } : (tensor<?x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>) -> tensor<?x?x?x?xf32>
-    
-    return  output
+) -> (output: tensor<?x?x?x?xf32>) {
+    tensor.batch_norm( input,  scale,  bias)-> output
 }
-
-// ==================== 4. 操作签名格式 ====================
-操作签名格式:
- result = operation_name(operands) {attributes} : (input_types) -> output_type
-
-其中:
-- operands: 逗号分隔的输入SSA值 ( arg1,  arg2, ...)
-- attributes: 键值对属性 {key1 = value1, key2 = value2}
-- input_types: 逗号分隔的输入类型
-- output_type: 单个输出类型
-
-// ==================== 5. 类型推断规则 ====================
-type_rule matmul_shape_inference {
-    input_shapes = [tensor<?xMxT>, tensor<?xNxT>],
-    output_shape = tensor<?x?xT>,
-    constraints = [
-        M.dim1 == N.dim0,  // 矩阵乘法维度匹配
-        T in Numeric       // 元素类型为数值类型
-    ]
-}
-
-type_rule elementwise_broadcast {
-    input_shapes = [tensor<A_dimsxT>, tensor<B_dimsxT>],
-    output_shape = tensor<broadcast(A_dims, B_dims)xT>,
-    constraints = [
-        can_broadcast(A_dims, B_dims),  // 维度可广播
-        T in Numeric
-    ]
-}
\ No newline at end of file

From e66eb61689be754eb78a621773ec8a11a645ccdf Mon Sep 17 00:00:00 2001
From: "peng.li24" <peng.li24@nio.com>
Date: Fri, 30 Jan 2026 09:25:52 +0000
Subject: [PATCH 3/3] =?UTF-8?q?deepxir:=E6=8E=A5=E8=BF=91=E7=AC=AC?=
 =?UTF-8?q?=E4=B8=80=E7=89=88=E6=96=B9=E6=A1=88=E5=AE=9A=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/deepxIR/deepxir.md | 165 +++++++++++++++++++++++++++++-----------
 docs/deepxIR/ir.md      | 115 ----------------------------
 2 files changed, 121 insertions(+), 159 deletions(-)
 delete mode 100644 docs/deepxIR/ir.md

diff --git a/docs/deepxIR/deepxir.md b/docs/deepxIR/deepxir.md
index 1c89938..58e6872 100644
--- a/docs/deepxIR/deepxir.md
+++ b/docs/deepxIR/deepxir.md
@@ -1,59 +1,136 @@
-// ==================== 1. 类型系统 ====================
-// 基础数据类型
-type f16, f32, f64, bf16,bf8      // 浮点类型
-type i8, i16, i32, i64, u8    // 整数类型
-type bool                     // 布尔类型
+# DeepX IR（deepxir）规范
 
-//类型约束
-f32|f64  //支持2种类型之一
+## 1. 类型系统
 
+### 基础数据类型
+```
+type f16, f32, f64, bf16, bf8    // 浮点类型
+type i8, i16, i32, i64, u8       // 整数类型
+type bool                       // 布尔类型
+```
 
-// Tensor类型模板
-type tensor<shape; elem_type>
-// shape格式: dim1xdim2x...xdimN 或 ? 表示动态维度
-// 示例: tensor<10x20xf32>, tensor<?x?xi32>
+### 动态长度类型
+```
+list<type>   // list 可与以上基础类型或 tensor 组合，例如 list<i32>、list<tensor>
+```
 
-//动态维度的维度变量
-? //任意数字
-?1 //动态维度变量1
-?2 //动态维度变量2，用来告诉出现?2的tensor对应维度需要保持一致
+### 类型约束
+```
+f32|f64   // 用 | 连接多个类型，表示取其中之一（可为两种或更多）
+```
 
-// ==================== 2. ir定义格式 ====================
-deepxir ir_name(ro_p1:type1,ro_param2:type2,...) -> (w_p1:type3,w_p2:type4,...)
+### Tensor 类型模板
+```
+type tensor<shape, elem_type>
+```
+- shape 格式：`dim1xdim2x...xdimN`，其中任一维可用 `?` 表示动态维度；实际书写时 shape 与 elem_type 之间同样用 `x` 连接，即最后一个 `x` 之后的部分是元素类型（精度）。
+- 示例：`tensor<10x20xf32>`, `tensor<?x?xi32>`
+
+tensor 也可以没有 shape 和 dtype 的约束，例如：
+```
+deepxir addscalar(A:tensor, b:i8|i16|i32|i64) -> (c:tensor) { ... }
+```
+表示任意 shape、任意 dtype 的 tensor 都可作为参数。
+
+### 动态维度变量
+- `?` 任意数字  
+- `?1` 动态维度变量 1  
+- `?2` 动态维度变量 2（用于表示同名变量处维度需一致）  
+- 示例：`tensor<?1x?2xf32>`
+
+## 2. IR 定义格式
+
+语法示例：
+```
+deepxir ir_name(ro_p1:type1, ro_p2:type2, ...) -> (w_p1:type3, w_p2:type4, ...)
 {
-    // 函数体: IR操作序列
-    operation_name( ro_p1,  ro_p1)-> w_p1
-    operation_name( ro_p2,  ro_p2)-> w_p2
+    // 函数体：IR 操作序列
+    operation_name(ro_p1, ro_p2) -> w_p1
+    operation_name(ro_p2, ro_p2) -> w_p2
 }
-deepxir是关键词，或者我们也可以使用function,func这些传统关键字
-用来定义新的ir名
-
-deepxir的参数，遵循左读右写的规则，没有返回值
-deepxir的参数类型，既包括tensor，还有list<tensor>，也包括基础类型，以及list<基础类型>
+```
+- `deepxir` 为关键字，也可使用 `function`、`func` 等。  
+- 参数遵循“左读右写”规则（无返回值；通过写入参数实现输出）。  
+- 参数类型支持：`tensor`、`list<tensor>`、基础类型，以及基础类型的 list。
 
-// ==================== 3.设计思考 ====================
+## 3. 设计思考
+DeepX IR 采用简洁的文本格式表示张量类型约束、运算定义与运算体，便于阅读与解析。
+deepxir 不是 SSA 形式：调用时依然遵循“左读右写”的参数列表规则，右侧的写入参数列表可以包含多个参数。
 
-// ==================== 4. 具体示例 ====================
+## 4. 具体示例
 
-// 示例1: 包含多个操作
-deepxir conv_relu(input: tensor<1x32x32x3xf32>,filter: tensor<3x3x3x16xf32>) -> (out: tensor<1x30x30x16xf32>) {
-    tensor.new([1 30 30 16],f32)->conv
-    tensor.conv2d( input,  filter)->conv
-    tensor.relu(conv)-> out
+### 示例 1：融合 Linear + 归一化
+```
+deepxir fused_linear_norm(
+    A: tensor<?1x?2xf32>,
+    W: tensor<?2x?3xf32>,
+    b: tensor<?3xf32>,
+    axis: i32,
+    keepdims: bool
+) -> (out: tensor<?1x?3xf32>) {
+    newtensor(?1x?3, f32)->(mm)
+    matmul(A, W)-> (mm)
+    newtensor(?1x?3, f32)-> bias
+    add(mm, b)-> bias
+    deltensor(mm)-> mm
+    newtensor(?1, f32)-> mean
+    sum(bias, axis, keepdims)-> mean
+    newtensor(?1x?3, f32)-> centered
+    sub(bias, mean)-> centered
+    deltensor(bias)-> bias
+    deltensor(mean)-> mean
+    newtensor(?1x?3, f32)-> sq
+    mul(centered, centered)-> sq
+    newtensor(?1, f32)-> var
+    sum(sq, axis, keepdims)-> var
+    deltensor(sq)-> sq
+    constant(1e-5)-> eps
+    newtensor(?1, f32)-> var_eps
+    add(var, eps)-> var_eps
+    deltensor(var)-> var
+    deltensor(eps)-> eps
+    newtensor(?1, f32)-> std
+    sqrt(var_eps)-> std
+    deltensor(var_eps)-> var_eps
+    // centered 与 std 在最后的 div 中仍会被读取，必须在 div 之后再释放
+    div(centered, std)-> out
+    deltensor(centered)-> centered
+    deltensor(std)-> std
 }
+```
 
-// 示例3: 支持动态形状和类型推断
-deepxir dynamic_operations( A: tensor<?x?xf32>,B: tensor<?x?xf32>
-) -> (out: tensor<?x?xf32>) {
-    tensor.add( A,  B)-> %add
-    tensor.matmul( %add,  A)-> out
+下面给出一个完整的 `deepxir` 调用示例：在一个 IR 中先构造输入张量和辅助参数，然后调用 `fused_linear_norm`，输出 `out`。
+
+```
+deepxir example_use_fused_linear_norm() -> (out: tensor<2x3xf32>) {
+    newtensor([2,4], f32)-> A
+    newtensor([4,3], f32)-> W
+    newtensor([3], f32)-> b
+    fused_linear_norm(A, W, b, 1, false) -> out
 }
+```
 
-// 示例4: 带有属性约束的函数
-deepxir batch_norm(
-     input: tensor<?x?x?x?xf32>,
-     scale: tensor<?xf32>,
-     bias: tensor<?xf32>
-) -> (output: tensor<?x?x?x?xf32>) {
-    tensor.batch_norm( input,  scale,  bias)-> output
+该示例展示了如何在 IR 中构造必要的张量/参数并调用 `fused_linear_norm`，其中 `out` 的类型为 `tensor<2x3xf32>`，其两个维度分别对应 `A` 的行数（`?1` = 2）和 `W` 的列数（`?3` = 3）。
+
+### 示例 2：融合 Attention score + Softmax
+```
+deepxir fused_attention_scores(
+    Q: tensor<?x?xf32>,
+    K: tensor<?x?xf32>,
+    axis: list<i32>,
+    keepdims: bool,
+    shape_scores: list<i32>,
+    shape_sum: list<i32>
+) -> (out: tensor<?x?xf32>) {
+    newtensor(shape_scores, f32)-> scores_tmp
+    matmul(Q, K)-> scores_tmp
+    newtensor(shape_scores, f32)-> exp_tmp
+    exp(scores_tmp)-> exp_tmp
+    deltensor(scores_tmp)-> scores_tmp
+    newtensor(shape_sum, f32)-> sum_tmp
+    sum(exp_tmp, axis, keepdims)-> sum_tmp
+    div(exp_tmp, sum_tmp)-> out
+    deltensor(exp_tmp)-> exp_tmp
+    deltensor(sum_tmp)-> sum_tmp
 }
+```
\ No newline at end of file
diff --git a/docs/deepxIR/ir.md b/docs/deepxIR/ir.md
deleted file mode 100644
index 712ecda..0000000
--- a/docs/deepxIR/ir.md
+++ /dev/null
@@ -1,115 +0,0 @@
-# DeepX IR (Intermediate Representation) 格式规范
-
-DeepX IR 采用简洁的文本格式来表示张量运算。主要分为函数定义(funcdef)和函数调用(funccall)两种模式。
-
-## 基本语法规则
-
-1. 使用 `->` 分隔输入参数和返回值
-2. 参数之间使用逗号(,)分隔
-3. 向量类型的值使用空格分隔元素
-4. 参数和返回值可选择性地用括号()包裹
-5. 可在指令后添加元数据，使用 `//` 分隔
-
-## 函数调用(funccall)模式
-
-函数调用模式用于实际执行操作，语法更简洁。
-
-示例:
-matmul A,B -> C
-sum(A,[1 2 3]) -> B
-newtensor 3 4 5 -> T1
-
-## 函数定义(funcdef)
-
-函数定义由executor层负责注册实现,用于声明操作的参数和返回值类型。executor通过注册funcdef来声明其支持的tensorfunc。
-
-因此需要设置参数、返回值的详细类型约束
-
-语法示例:
-```
-matmul(Tensor<float32|float64> A, Tensor<float32|float64> B) -> Tensor<float32|float64> C
-sum(Tensor<any> A, vector<int32> dim) -> Tensor<any> B
-newtensor(vector<int32> shape) -> Tensor<float32> T1
-```
-
-## 元数据格式
-
-可在指令后添加元数据信息:
-
-```
-matmul(A,B)->C //id=1 created_at=123456789 sent_at=123456790
-```
-
-支持的元数据字段:
-- id: 操作ID
-- author: 作者，部分tensorfunc的实现，如matmul，会有多实现，需要指定作者以根据环境指定最优实现
-- created_at: 创建时间戳
-- sent_at: 发送时间戳
-
-## 类型系统
-
-对于tensorfunc的类型系统，我们只关心与tensor相关的类型系统
-
-参考 executor/common/src/deepx/dtype.hpp
-
-```
-{
-    类型: 
-         var
-         vector
-         tensor
-         listtensor
-    精度:
-         float64
-         float32
-         float16
-         bfloat16
-         fp8
-         fp4
-         int64
-         int32
-         int16
-         int8
-         int4
-         string//可以用来引用其他var或tensor的name
-}
-```
-多精度支持可以用|分隔,如float32|float64
-
-
-## funcdef
-
-executor 负责定义其支持的tensorfunc
-
-1. 矩阵乘法:
-```
-# funcdef
-matmul(Tensor<float32|float64> A, Tensor<float32|float64> B) -> Tensor<float32|float64> C
-
-# funccall  
-matmul A,B -> C
-// rtf(remote tensor func)解析器会自动解析参数和返回值的列表
-// executor会从mem获取A，B，C这3个tensor，并执行matmul操作
-```
-
-2. 张量求和:
-```
-# funcdef
-sum(Tensor<any> input, vector<int32> dims,var<bool> keepdim) -> Tensor<any> output
-
-# funccall
-sum(T1,[0 1],true) -> T2
-// rtf(remote tensor func)解析器会自动解析参数和返回值的列表
-// 其中[0 1]会被解析为vector<int32>，便于executor执行时使用
-// true会被解析为var<bool> keepdim，便于executor执行时使用
-// executor会从mem获取T1，T2这2个tensor，并执行sum操作
-```
-
-3. 创建新张量:
-```
-# funcdef
-newtensor(vector<int32> shape) -> Tensor<float32> output
-
-# funccall
-newtensor 3 4 5 -> T1
-```
\ No newline at end of file