Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/excuter/op-mem-cuda/list.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@
| newtensor | none | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(var<string> shape)->(tensor<any> tensor1) |
| vecset | none | vecset(vector<any> value)->(vector<any> name) | shape = [3 4 5] | vecset(vector<any> value)->(vector<any> name) |
| matmul | cublas | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| sub | miaobyte | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1-T2 | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| argset | none | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |
2 changes: 2 additions & 0 deletions doc/excuter/op-mem-ompsimd/list.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,7 @@
| newtensor | none | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 =Tensor(shape=[...]) | newtensor(var<string> shape)->(tensor<any> tensor1) |
| vecset | none | vecset(vector<any> value)->(vector<any> name) | shape = [3 4 5] | vecset(vector<any> value)->(vector<any> name) |
| matmul | cblas | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) | T3=T1 @ T2 | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) |
| matmul | miaobyte | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| sub | miaobyte | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1-T2 | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) |
| argset | none | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |
7 changes: 5 additions & 2 deletions excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include "deepx/tensor.hpp"
#include "deepx/tensorfunc/authors.hpp"

#include "stdutil/error.hpp"
namespace deepx::tensorfunc
{
bool check_matmul_shape(const Shape &a, const Shape &b)
Expand All @@ -29,7 +29,10 @@ namespace deepx::tensorfunc
template <typename Author, typename T>
struct matmulDispatcher
{
static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
{
throw NotImplementError("matmul");
}
};

template <typename Author, typename T>
Expand Down
22 changes: 15 additions & 7 deletions excuter/op-mem-cuda/src/client/tfs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "deepx/tf/print.hpp"
#include "deepx/tf/init.hpp"
#include "deepx/tf/elementwise_basic.hpp"
#include "deepx/tf/matmul.hpp"
#include "deepx/dtype.hpp"
#include "deepx/tf/tffactory.hpp"
#include "deepx/tensorfunc/authors.hpp"
Expand Down Expand Up @@ -173,12 +174,19 @@ namespace deepx::tf
// opfactory.add_op(Powscalar_miaobyte<float>());
// opfactory.add_op(Powscalar_miaobyte<double>());
}
// // matmul
// void register_matmul(OpFactory &opfactory)
// {
// opfactory.add_op(MatMul<float>());
// opfactory.add_op(MatMul<double>());
// }
// matmul
void register_matmul(TfFactory &tffactory)
{
tffactory.add_tf(std::make_shared<MatMul<cublas>>(vector<Param>(
{
Param("A", DataCategory::Tensor, Precision::Any),
Param("B", DataCategory::Tensor, Precision::Any),
}),
vector<Param>(
{
Param("C", DataCategory::Tensor, Precision::Any),
})));
}
// // changeshape
void register_changeshape(TfFactory &tffactory)
{
Expand Down Expand Up @@ -207,7 +215,7 @@ namespace deepx::tf
register_init(tffactory);
register_util(tffactory);
register_elementwise(tffactory);
// register_matmul(opfactory);
register_matmul(tffactory);
register_changeshape(tffactory);
// register_reduce(opfactory);
return 0;
Expand Down
88 changes: 88 additions & 0 deletions excuter/op-mem-cuda/src/deepx/tf/matmul.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#ifndef DEEPX_TF_MATMUL_HPP
#define DEEPX_TF_MATMUL_HPP

#include <cuda_fp16.h>
#include <cuda_bf16.h>

#include <exception>
#include <stdexcept>

#include "deepx/tf/tf.hpp"
#include "deepx/dtype.hpp"
#include "deepx/dtype_cuda.hpp"
#include "deepx/tensorfunc/matmul_cublas.hpp"

namespace deepx::tf
{
template <typename Author>
class MatMul : public TF
{
public:
MatMul(const vector<Param> &args, const vector<Param> &returns)
{
this->name = "matmul";
this->author = Author::name();
this->args = args;
this->returns = returns;
}

MatMul(string text)
{
this->parse(text);
this->author = Author::name();
if (this->name != "matmul")
{
throw std::runtime_error("Invalid name: " + this->name);
}
}
string math_formula() const override
{
return "T3=T1 @ T2";
}
shared_ptr<TF> clone() const override
{
return make_shared<MatMul<Author>>(*this);
}
int run(shared_ptr<MemBase> mem, string &error) override
{
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
if (a_type != b_type || a_type != c_type)
{
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
return 1;
}
switch (a_type)
{
case Precision::Float64:
tensorfunc::matmul<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
break;
case Precision::Float32:
tensorfunc::matmul<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
break;
case Precision::Float16:
tensorfunc::matmul<Author, half>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<half>(this->args[1].textvalue), *mem->gettensor<half>(this->returns[0].textvalue));
break;
case Precision::BFloat16:
tensorfunc::matmul<Author, nv_bfloat16>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<nv_bfloat16>(this->args[1].textvalue), *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
break;
case Precision::Int64:
tensorfunc::matmul<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), *mem->gettensor<int64_t>(this->returns[0].textvalue));
break;
case Precision::Int32:
tensorfunc::matmul<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
break;
case Precision::Int16:
tensorfunc::matmul<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
break;
case Precision::Int8:
tensorfunc::matmul<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
break;
default:
error = "Unsupported dtype: " + precision_str(a_type);
return 1;
}
return 0;
}
};
}

#endif
32 changes: 24 additions & 8 deletions excuter/op-mem-ompsimd/src/client/tfs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "deepx/tf/changeshape.hpp"
#include "deepx/tf/elementwise.hpp"
#include "deepx/tf/tffactory.hpp"

#include "deepx/tf/matmul.hpp"
#include "deepx/tensorfunc/authors.hpp"
namespace deepx::tf
{
Expand Down Expand Up @@ -186,12 +186,28 @@ namespace deepx::tf
// opfactory.add_op(Powscalar_miaobyte<float>());
// opfactory.add_op(Powscalar_miaobyte<double>());
}
// // matmul
// void register_matmul(OpFactory &opfactory)
// {
// opfactory.add_op(MatMul<float>());
// opfactory.add_op(MatMul<double>());
// }
// matmul
// Registers two matmul backends with the factory: miaobyte (accepts any
// tensor precision) and cblas (restricted to float64/float32), each with
// signature matmul(A, B) -> (C).
void register_matmul(TfFactory &tffactory)
{
// miaobyte backend: generic implementation, any dtype allowed for A, B, C.
tffactory.add_tf(std::make_shared<MatMul<miaobyte>>(vector<Param>(
{
Param("A", DataCategory::Tensor, Precision::Any),
Param("B", DataCategory::Tensor, Precision::Any),
}),
vector<Param>(
{
Param("C", DataCategory::Tensor, Precision::Any),
})));
// cblas backend: BLAS-accelerated; precision mask limits operands to
// float64 or float32 (the only types cblas gemm supports here).
tffactory.add_tf(std::make_shared<MatMul<cblas>>(vector<Param>(
{
Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32),
Param("B", DataCategory::Tensor, Precision::Float64|Precision::Float32),
}),
vector<Param>(
{
Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32),
})));
}
// // changeshape
void register_changeshape(TfFactory &tffactory)
{
Expand Down Expand Up @@ -220,7 +236,7 @@ namespace deepx::tf
register_init(tffactory);
register_util(tffactory);
register_elementwise(tffactory);
// register_matmul(opfactory);
register_matmul(tffactory);
register_changeshape(tffactory);
// register_reduce(opfactory);
return 0;
Expand Down
12 changes: 6 additions & 6 deletions excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef DEEPX_TENSORFUNC_MATMUL_HPP
#define DEEPX_TENSORFUNC_MATMUL_HPP
#ifndef DEEPX_TENSORFUNC_MATMUL_CBLAS_HPP
#define DEEPX_TENSORFUNC_MATMUL_CBLAS_HPP

#include <cblas.h> // 如果使用 OpenBLAS
#include "deepx/tensor.hpp"
Expand Down Expand Up @@ -64,7 +64,7 @@ namespace deepx::tensorfunc
{
static void matmul(const Tensor<double> &a, const Tensor<double> &b, Tensor<double> &c)
{
if (!check_shape(a.shape, b.shape))
if (!check_matmul_shape(a.shape, b.shape))
{
throw std::invalid_argument("a.shape could matmul with b.shape");
}
Expand Down Expand Up @@ -150,7 +150,7 @@ namespace deepx::tensorfunc
{
static void matmuladd(const Tensor<float> &a, const Tensor<float> &b, const float &alpha, const float &beta, Tensor<float> &c)
{
if (!check_shape(a.shape, b.shape))
if (!check_matmul_shape(a.shape, b.shape))
{
throw std::invalid_argument("a.shape could matmul with b.shape");
}
Expand Down Expand Up @@ -208,7 +208,7 @@ namespace deepx::tensorfunc
{
static void matmuladd(const Tensor<double> &a, const Tensor<double> &b, const double &alpha, const double &beta, Tensor<double> &c)
{
if (!check_shape(a.shape, b.shape))
if (!check_matmul_shape(a.shape, b.shape))
{
throw std::invalid_argument("a.shape could matmul with b.shape");
}
Expand Down Expand Up @@ -261,4 +261,4 @@ namespace deepx::tensorfunc
}
};
}
#endif // DEEPX_TENSORFUNC_MATMUL_HPP
#endif // DEEPX_TENSORFUNC_MATMUL_CBLAS_HPP
80 changes: 80 additions & 0 deletions excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#ifndef DEEPX_TF_MATMUL_HPP
#define DEEPX_TF_MATMUL_HPP

#include "deepx/tf/tf.hpp"
#include "deepx/dtype.hpp"
#include "deepx/dtype_ompsimd.hpp"
#include "deepx/tensorfunc/matmul.hpp"
#include "deepx/tensorfunc/matmul_cblas.hpp"
#include "deepx/tensorfunc/matmul_miaobyte.hpp"
namespace deepx::tf
{
template <typename Author>
class MatMul : public TF
{
public:
MatMul(const vector<Param> &args, const vector<Param> &returns)
{
this->name = "matmul";
this->author = Author::name();
this->args = args;
this->returns = returns;
}

MatMul(string text)
{
this->parse(text);
this->author = Author::name();
if (this->name != "matmul")
{
throw std::runtime_error("Invalid name: " + this->name);
}
}
string math_formula() const override
{
return "T3=T1 @ T2";
}
shared_ptr<TF> clone() const override
{
return make_shared<MatMul<Author>>(*this);
}
int run(shared_ptr<MemBase> mem, string &error) override
{
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
if (a_type != b_type || a_type != c_type)
{
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
return 1;
}
switch (a_type)
{
case Precision::Float64:
tensorfunc::matmul<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
break;
case Precision::Float32:
tensorfunc::matmul<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
break;
case Precision::Int64:
tensorfunc::matmul<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), *mem->gettensor<int64_t>(this->returns[0].textvalue));
break;
case Precision::Int32:
tensorfunc::matmul<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
break;
case Precision::Int16:
tensorfunc::matmul<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
break;
case Precision::Int8:
tensorfunc::matmul<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
break;
default:
error = "Unsupported dtype: " + precision_str(a_type);
return 1;
}
return 0;
}
};
}

#endif
Loading