diff --git a/doc/excuter/deepx.op.drawio b/doc/excuter/deepx.op.drawio index e30564a9..86117f1f 100644 --- a/doc/excuter/deepx.op.drawio +++ b/doc/excuter/deepx.op.drawio @@ -1,79 +1,122 @@ - + - - + + - - - - - + + - - + + - - + + - - + + - - - - - - + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + - - + + - - + + - + + + + + + + + + + + + + + + + - - + + + + + + + + + - - + + + + - - + + + + + + + + - - + + - - + + diff --git a/doc/excuter/deepx.op.drawio.svg b/doc/excuter/deepx.op.drawio.svg index 8c207b90..f6fe1fd0 100644 --- a/doc/excuter/deepx.op.drawio.svg +++ b/doc/excuter/deepx.op.drawio.svg @@ -1,4 +1,4 @@ -
excuter 
cuda
excuter 
cpu
excuter 
cuda
class Op{
string name;
vector<string> args;
vector<string> returns;
bool grad;
vector<string> args_grad;
vector<string> returns_grad;

virtual void forward();
virtual void backward();
}

openblas的matmul实现
template <T >
class Matmul_cblas:Op
cblas实现

tensorfunc/elementwise.hpp

template<T>
void matmul_cblas(const Tensor<T> &a, const Tensor<T> &b, Tensor<T> &c)

cblas对double的特化

template <>
void matmul<double>(const Tensor<double> &a, const Tensor<double> &b, Tensor<double> &c)
某author a的实现
template <T >
class Matmul_authora:Op
默认的matmul
会选其中一个作为默认
template <T >
class Matmul:Op
某author a的实现

tensorfunc/elementwise.hpp

template<T>
void matmul_authora(const Tensor<T> &a, const Tensor<T> &b, Tensor<T> &c)

authora对double的特化

template <>
  void matmul_authora<double>(const Tensor<double> &a, const Tensor<double> &b, Tensor<double> &c)
tensorfunc<T>
tensorfunc 特化
Op
\ No newline at end of file +
excuter 
ompsimd
tensorfunc<T>
tensorfunc 特化
TF
cpp-common
excuter 
cuda
TFfactory
List
init.hpp
elementwise.hpp
matmul.hpp

io.hpp

reduce.hpp

changeshape.hpp
authora
init_authora.hpp
elementwise_authora.hpp
matmul_authora.hpp

io_authora.hpp

reduce_authora.hpp

changeshape_authora.hpp
authorb
matmul_authora.hpp

io_authora.hpp

reduce_authora.hpp

changeshape_authora.hpp
TF
+ name
+ args
+ returns
template author 特化
%3CmxGraphModel%3E%3Croot%3E%3CmxCell%20id%3D%220%22%2F%3E%3CmxCell%20id%3D%221%22%20parent%3D%220%22%2F%3E%3CmxCell%20id%3D%222%22%20style%3D%22edgeStyle%3DorthogonalEdgeStyle%3Brounded%3D0%3BorthogonalLoop%3D1%3BjettySize%3Dauto%3Bhtml%3D1%3BentryX%3D0%3BentryY%3D0.123%3BentryDx%3D0%3BentryDy%3D0%3BentryPerimeter%3D0%3BexitX%3D1.006%3BexitY%3D0.145%3BexitDx%3D0%3BexitDy%3D0%3BexitPerimeter%3D0%3B%22%20edge%3D%221%22%20parent%3D%221%22%3E%3CmxGeometry%20relative%3D%221%22%20as%3D%22geometry%22%3E%3CmxPoint%20x%3D%22-332%22%20y%3D%22283%22%20as%3D%22sourcePoint%22%2F%3E%3CmxPoint%20x%3D%22-150%22%20y%3D%22280%22%20as%3D%22targetPoint%22%2F%3E%3CArray%20as%3D%22points%22%3E%3CmxPoint%20x%3D%22-332%22%20y%3D%22280%22%2F%3E%3C%2FArray%3E%3C%2FmxGeometry%3E%3C%2FmxCell%3E%3CmxCell%20id%3D%223%22%20value%3D%22template%20author%20%E7%89%B9%E5%8C%96%22%20style%3D%22edgeLabel%3Bhtml%3D1%3Balign%3Dcenter%3BverticalAlign%3Dmiddle%3Bresizable%3D0%3Bpoints%3D%5B%5D%3B%22%20vertex%3D%221%22%20connectable%3D%220%22%20parent%3D%222%22%3E%3CmxGeometry%20x%3D%220.4298%22%20y%3D%221%22%20relative%3D%221%22%20as%3D%22geometry%22%3E%3CmxPoint%20x%3D%22-37%22%20y%3D%221%22%20as%3D%22offset%22%2F%3E%3C%2FmxGeometry%3E%3C%2FmxCell%3E%3C%2Froot%3E%3C%2FmxGraphModel%3E
%3CmxGraphModel%3E%3Croot%3E%3CmxCell%20id%3D%220%22%2F%3E%3CmxCell%20id%3D%221%22%20parent%3D%220%22%2F%3E%3CmxCell%20id%3D%222%22%20style%3D%22edgeStyle%3DorthogonalEdgeStyle%3Brounded%3D0%3BorthogonalLoop%3D1%3BjettySize%3Dauto%3Bhtml%3D1%3BentryX%3D0%3BentryY%3D0.123%3BentryDx%3D0%3BentryDy%3D0%3BentryPerimeter%3D0%3BexitX%3D1.006%3BexitY%3D0.145%3BexitDx%3D0%3BexitDy%3D0%3BexitPerimeter%3D0%3B%22%20edge%3D%221%22%20parent%3D%221%22%3E%3CmxGeometry%20relative%3D%221%22%20as%3D%22geometry%22%3E%3CmxPoint%20x%3D%22-332%22%20y%3D%22283%22%20as%3D%22sourcePoint%22%2F%3E%3CmxPoint%20x%3D%22-150%22%20y%3D%22280%22%20as%3D%22targetPoint%22%2F%3E%3CArray%20as%3D%22points%22%3E%3CmxPoint%20x%3D%22-332%22%20y%3D%22280%22%2F%3E%3C%2FArray%3E%3C%2FmxGeometry%3E%3C%2FmxCell%3E%3CmxCell%20id%3D%223%22%20value%3D%22template%20author%20%E7%89%B9%E5%8C%96%22%20style%3D%22edgeLabel%3Bhtml%3D1%3Balign%3Dcenter%3BverticalAlign%3Dmiddle%3Bresizable%3D0%3Bpoints%3D%5B%5D%3B%22%20vertex%3D%221%22%20connectable%3D%220%22%20parent%3D%222%22%3E%3CmxGeometry%20x%3D%220.4298%22%20y%3D%221%22%20relative%3D%221%22%20as%3D%22geometry%22%3E%3CmxPoint%20x%3D%22-37%22%20y%3D%221%22%20as%3D%22offset%22%2F%3E%3C%2FmxGeometry%3E%3C%2FmxCell%3E%3C%2Froot%3E%3C%2FmxGraphModel%3E
template author 精度特化
Add:TF
+ run override
Sub:TF
+ run override
\ No newline at end of file diff --git a/doc/excuter/deepx.op.jpg b/doc/excuter/deepx.op.jpg new file mode 100644 index 00000000..85d4d9bc Binary files /dev/null and b/doc/excuter/deepx.op.jpg differ diff --git a/doc/excuter/excuter.md b/doc/excuter/excuter.md index 700a60bd..0e604e04 100644 --- a/doc/excuter/excuter.md +++ b/doc/excuter/excuter.md @@ -60,9 +60,16 @@ todo #### 4.front对接测试 -1.先启动excuter +1.先启动excuter可执行文件, 位于excuter/op-mem-{cuda/ompsimd}/build,可执行文件名同excuter名 2.然后测试front中py的对应算子脚本(front/py/examples 目录) +可以按照顺序,以此测试 + +1_tensor + +2_ir + +3_functional diff --git a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp new file mode 100644 index 00000000..d9fdb47f --- /dev/null +++ b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp @@ -0,0 +1,39 @@ +#ifndef DEEPX_TENSORFUNC_IO_HPP +#define DEEPX_TENSORFUNC_IO_HPP + +#include "deepx/tensor.hpp" + +namespace deepx::tensorfunc{ + + template + struct printDispatcher{ + static void print(const Tensor &t, const std::string &f="")=delete; + }; + + template + void print(const Tensor &t, const std::string &f=""){ + printDispatcher::print(t, f); + } + + template + struct saveDispatcher{ + static void save(Tensor &tensor,const std::string &path,int filebegin=0)=delete; + }; + + template + void save(Tensor &tensor,const std::string &path,int filebegin=0){ + saveDispatcher::save(tensor, path, filebegin); + } + + template + struct loadDispatcher{ + static Tensor load(const std::string &path,int filebegin=0)=delete; + }; + + template + Tensor load(const std::string &path,int filebegin=0){ + return loadDispatcher::load(path, filebegin); + } +} + +#endif // DEEPX_TENSORFUNC_IO_HPP diff --git a/excuter/cpp-common/src/deepx/tensorfunc/print.hpp b/excuter/cpp-common/src/deepx/tensorfunc/print.hpp deleted file mode 100644 index 559f3912..00000000 --- a/excuter/cpp-common/src/deepx/tensorfunc/print.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_PRINT_HPP -#define 
DEEPX_TENSORFUNC_PRINT_HPP - -#include "deepx/tensor.hpp" - -namespace deepx::tensorfunc{ - - template - struct printDispatcher{ - static void print(const Tensor &t, const std::string &f="")=delete; - }; - - template - void print(const Tensor &t, const std::string &f=""){ - printDispatcher::print(t, f); - } -} - -#endif // DEEPX_TENSORFUNC_PRINT_HPP diff --git a/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp index c9f3b2a7..f1570693 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp @@ -7,50 +7,51 @@ namespace deepx::tensorfunc { + + template - struct reducesumDispatcher + struct reducemaxDispatcher { - static void reducesum(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + static void reducemax(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) = delete; }; template - void reducesum(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + void reducemax(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) { - reducesumDispatcher::reducesum(A, axis, keepdims, B); + reducemaxDispatcher::reducemax(A, dims, B, keepdims); } template - struct reduceprodDispatcher + struct reduceminDispatcher { - static void reduceprod(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + static void reducemin(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) = delete; }; - template - void reduceprod(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + void reducemin(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) { - reduceprodDispatcher::reduceprod(A, axis, keepdims, B); + reduceminDispatcher::reducemin(A, dims, B, keepdims); } - + template - struct reducemaxDispatcher + struct sumDispatcher { - static void reducemax(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = 
delete; + static void reducesum(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) = delete; }; template - void reducemax(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + void sum(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) { - reducemaxDispatcher::reducemax(A, axis, keepdims, B); + sumDispatcher::sum(A, dims, B, keepdims); } template - struct reduceminDispatcher + struct prodDispatcher { - static void reducemin(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + static void prod(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) = delete; }; + template - void reducemin(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + void prod(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) { - reduceminDispatcher::reducemin(A, axis, keepdims, B); + prodDispatcher::prod(A, dims, B, keepdims); } - } #endif // DEEPX_TENSORFUNC_REDUCE_HPP diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 5a4e5540..44560d7f 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -1,7 +1,7 @@ #include "deepx/tf/arg.hpp" #include "deepx/tf/tf.hpp" #include "deepx/tf/new.hpp" -#include "deepx/tf/print.hpp" +#include "deepx/tf/io.hpp" #include "deepx/tf/init.hpp" #include "deepx/tf/elementwise_basic.hpp" #include "deepx/tf/elementwise_sqrt.hpp" diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/file.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/file.hpp deleted file mode 100644 index 62695e64..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/file.hpp +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_FILE_HPP -#define DEEPX_TENSORFUNC_FILE_HPP - -#include -#include - -#include -namespace deepx::tensorfunc -{ - template - void save(Tensor &tensor,const std::string &path); - - template - Tensor load(const std::string &path); - 
-} - -#endif \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp new file mode 100644 index 00000000..e4aa4080 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp @@ -0,0 +1,137 @@ +#ifndef DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP +#define DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP + +#include +#include +#include +#include +#include +#include +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/io.hpp" + +namespace deepx::tensorfunc +{ + template + struct printDispatcher + { + static void print(const Tensor &t, const std::string &f = "") + { + int bytes = precision_bits(t.shape.dtype) / 8; + size_t total_bytes = t.shape.size * bytes; + + // 统一分配CPU内存 + unsigned char *host_data = new unsigned char[total_bytes]; + if (host_data == nullptr) + { + throw std::runtime_error("Failed to allocate host memory"); + } + + // 统一复制数据到CPU + cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + delete[] host_data; + throw std::runtime_error("Failed to copy data from device to host"); + } + + // 对于half和bf16类型需要转换为float + if (t.shape.dtype == Precision::Float16 || t.shape.dtype == Precision::BFloat16) + { + float *host_float = new float[t.shape.size]; + if (host_float == nullptr) + { + delete[] host_data; + throw std::runtime_error("Failed to allocate host memory for float conversion"); + } + + // 在CPU上进行类型转换 + if (t.shape.dtype == Precision::Float16) + { + for (size_t i = 0; i < t.shape.size; i++) + { + host_float[i] = __half2float(((half *)host_data)[i]); + } + } + else + { // BFloat16 + for (size_t i = 0; i < t.shape.size; i++) + { + host_float[i] = __bfloat162float(((nv_bfloat16 *)host_data)[i]); + } + } + + // 打印转换后的float数据 + stdutil::print(t.shape.shape, host_float, Precision::Float32, f.empty() ? 
"%.4f" : f); + delete[] host_float; + } + else + { + // 其他类型直接打印 + stdutil::print(t.shape.shape, host_data, t.shape.dtype, f); + } + + delete[] host_data; + } + }; + + template + struct saveDispatcher + { + static void save(Tensor &tensor, const std::string &path, int filebegin = 0) + { + // 保存shape + std::string shapepath = path + ".shape"; + std::string shapedata = tensor.shape.toYaml(); + std::ofstream shape_fs(shapepath, std::ios::binary); + shape_fs.write(shapedata.c_str(), shapedata.size()); + shape_fs.close(); + + // 保存data + std::string datapath = path + ".data"; + std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); + + if (!data_fs.is_open()) + { + // 如果文件不存在,则创建新文件 + data_fs.open(datapath, std::ios::binary | std::ios::out); + } + data_fs.seekp(filebegin); + data_fs.write(reinterpret_cast(tensor.data), tensor.shape.size * sizeof(T)); + data_fs.close(); + } + }; + template + struct loadDispatcher + { + static Tensor load(const std::string &path, int filebegin = 0) + { + // 加载shape + std::string shapepath = path + ".shape"; + std::ifstream shape_fs(shapepath, std::ios::binary); + std::string shapedata((std::istreambuf_iterator(shape_fs)), std::istreambuf_iterator()); + + Shape shape; + shape.fromYaml(shapedata); + shape_fs.close(); + + // 加载data + Tensor tensor = New(shape); + std::string datapath = path + ".data"; + std::ifstream data_fs(datapath, std::ios::binary); + + if (!data_fs.is_open()) + { + throw std::runtime_error("无法打开数据文件: " + datapath); + } + + // 设置读取位置 + data_fs.seekg(filebegin); + data_fs.read(reinterpret_cast(tensor.data), shape.size * sizeof(T)); + data_fs.close(); + + return tensor; + } + }; +} +#endif // DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/print_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/print_miaobyte.hpp deleted file mode 100644 index 98487d63..00000000 --- 
a/excuter/op-mem-cuda/src/deepx/tensorfunc/print_miaobyte.hpp +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_PRINT_DEFAULT_HPP -#define DEEPX_TENSORFUNC_PRINT_DEFAULT_HPP - -#include -#include -#include -#include -#include -#include -#include "deepx/tensorfunc/authors.hpp" -#include "deepx/tensorfunc/print.hpp" - -namespace deepx::tensorfunc -{ - template - struct printDispatcher - { - static void print(const Tensor &t, const std::string &f = "") - { - int bytes = precision_bits(t.shape.dtype) / 8; - size_t total_bytes = t.shape.size * bytes; - - // 统一分配CPU内存 - unsigned char *host_data = new unsigned char[total_bytes]; - if (host_data == nullptr) { - throw std::runtime_error("Failed to allocate host memory"); - } - - // 统一复制数据到CPU - cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - delete[] host_data; - throw std::runtime_error("Failed to copy data from device to host"); - } - - // 对于half和bf16类型需要转换为float - if (t.shape.dtype == Precision::Float16 || t.shape.dtype == Precision::BFloat16) { - float* host_float = new float[t.shape.size]; - if (host_float == nullptr) { - delete[] host_data; - throw std::runtime_error("Failed to allocate host memory for float conversion"); - } - - // 在CPU上进行类型转换 - if (t.shape.dtype == Precision::Float16) { - for(size_t i = 0; i < t.shape.size; i++) { - host_float[i] = __half2float(((half*)host_data)[i]); - } - } else { // BFloat16 - for(size_t i = 0; i < t.shape.size; i++) { - host_float[i] = __bfloat162float(((nv_bfloat16*)host_data)[i]); - } - } - - // 打印转换后的float数据 - stdutil::print(t.shape.shape, host_float, Precision::Float32, f.empty() ? 
"%.4f" : f); - delete[] host_float; - } - else { - // 其他类型直接打印 - stdutil::print(t.shape.shape, host_data, t.shape.dtype, f); - } - - delete[] host_data; - } - }; -} - -#endif // DEEPX_TENSORFUNC_PRINT_DEFAULT_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce.hpp deleted file mode 100644 index 8a39fccf..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_REDUCE_HPP -#define DEEPX_TENSORFUNC_REDUCE_HPP - -#include -#include -#include -#include - -#include "deepx/tensor.hpp" -#include "deepx/shape_reduce.hpp" -#include "deepx/tensorfunc/init.hpp" - -namespace deepx::tensorfunc -{ - - template - void sum(const Tensor &tensor, const std::vector &dims, Tensor &result); - - - template - void product(const Tensor &tensor, const std::vector &dims, Tensor &result); -} -#endif \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp new file mode 100644 index 00000000..7b2a9f58 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp @@ -0,0 +1,68 @@ +#ifndef DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP +#define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP + +#include +#include +#include + +#include "deepx/tensor.hpp" +#include "deepx/shape_reduce.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include + +#include "deepx/tensorfunc/reduce.hpp" + +namespace deepx::tensorfunc +{ + + template < typename T> + struct reducemaxDispatcher + { + static void reducemax(const Tensor &A, const std::vector &dims, Tensor &B,const bool keepdims) { + if (axis < 0) { + axis += A.shape.dim; + } + if (axis >= A.shape.dim) { + throw std::invalid_argument("Invalid axis for reducemax"); + } + + } + }; + + + template < typename T> + struct reduceminDispatcher + { + static void reducemin(const Tensor &A, const 
std::vector &dims, Tensor &B,const bool keepdims) { + if (axis < 0) { + axis += A.shape.dim; + } + if (axis >= A.shape.dim) { + throw std::invalid_argument("Invalid axis for reducemin"); + } + + } + }; + + + template + struct sumDispatcher + { + static void sum(const Tensor &tensor, const std::vector &dims, Tensor &result,const bool keepdims) + { + + + } + }; + + + template + struct prodDispatcher + { + static void prod(const Tensor &tensor, const std::vector &dims, Tensor &result,const bool keepdims) + { + + } + }; +} +#endif DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/shape.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/shape.hpp deleted file mode 100644 index 3ad23019..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/shape.hpp +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_SHAPE_HPP -#define DEEPX_TENSORFUNC_SHAPE_HPP - -#include -#include - -#include "deepx/tensor.hpp" -#include "deepx/tensorfunc/new.hpp" -namespace deepx::tensorfunc -{ - template - void transpose(const Tensor &tensor, Tensor &result, const std::vector &dimOrder); - - template - void concat(const std::vector *> &tensors, const int axis, Tensor &result); - - template - void split(const Tensor &tensor, const int axis, std::vector *> &results); -} - -#endif // DEEPX_TENSORFUNC_TRANSPOSE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tf/print.hpp b/excuter/op-mem-cuda/src/deepx/tf/io.hpp similarity index 92% rename from excuter/op-mem-cuda/src/deepx/tf/print.hpp rename to excuter/op-mem-cuda/src/deepx/tf/io.hpp index 5746b435..6118471a 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/print.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/io.hpp @@ -1,9 +1,9 @@ -#ifndef DEEPX_TF_PRINT_HPP -#define DEEPX_TF_PRINT_HPP +#ifndef DEEPX_TF_IO_HPP +#define DEEPX_TF_IO_HPP #include "deepx/tf/tf.hpp" -#include "deepx/tensorfunc/print.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include 
"deepx/tensorfunc/io.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/authors.hpp" namespace deepx::tf { diff --git a/excuter/op-mem-cuda/test/tensorfunc/0_new.cpp b/excuter/op-mem-cuda/test/tensorfunc/0_new.cpp index 4ec44898..5896850c 100644 --- a/excuter/op-mem-cuda/test/tensorfunc/0_new.cpp +++ b/excuter/op-mem-cuda/test/tensorfunc/0_new.cpp @@ -1,7 +1,7 @@ #include "deepx/tensorfunc/init.hpp" #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" #include "deepx/tensorfunc/authors.hpp" diff --git a/excuter/op-mem-cuda/test/tensorfunc/1_cublas_add.cpp b/excuter/op-mem-cuda/test/tensorfunc/1_cublas_add.cpp index 9ab2a44c..5904b91c 100644 --- a/excuter/op-mem-cuda/test/tensorfunc/1_cublas_add.cpp +++ b/excuter/op-mem-cuda/test/tensorfunc/1_cublas_add.cpp @@ -1,7 +1,7 @@ #include "deepx/tensorfunc/init_miaobyte.hpp" #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/elementwise.hpp" #include "deepx/tensorfunc/elementwise_cublas_basic.hpp" using namespace deepx::tensorfunc; diff --git a/excuter/op-mem-cuda/test/tensorfunc/1_cublas_matmul.cpp b/excuter/op-mem-cuda/test/tensorfunc/1_cublas_matmul.cpp index cac9cae5..3a2ac4e0 100644 --- a/excuter/op-mem-cuda/test/tensorfunc/1_cublas_matmul.cpp +++ b/excuter/op-mem-cuda/test/tensorfunc/1_cublas_matmul.cpp @@ -1,7 +1,7 @@ #include "deepx/tensorfunc/init_miaobyte.hpp" #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/matmul.hpp" #include "deepx/tensorfunc/matmul_cublas.hpp" diff --git a/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp 
b/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp index 33ccab95..f18671c5 100644 --- a/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp +++ b/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp @@ -1,7 +1,7 @@ #include "deepx/tensorfunc/init_miaobyte.hpp" #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/changeshape_miaobyte.hpp" using namespace deepx::tensorfunc; using namespace deepx; diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 0489f169..d81306ee 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -4,7 +4,7 @@ #include "deepx/tf/arg.hpp" #include "deepx/tf/new.hpp" #include "deepx/tf/init.hpp" -#include "deepx/tf/print.hpp" +#include "deepx/tf/io.hpp" #include "deepx/tf/changeshape.hpp" #include "deepx/tf/elementwise.hpp" #include "deepx/tf/tffactory.hpp" diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/file.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/file.hpp deleted file mode 100644 index 5bfea0e2..00000000 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/file.hpp +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_FILE_HPP -#define DEEPX_TENSORFUNC_FILE_HPP - -#include -#include - -#include -namespace deepx::tensorfunc -{ - template - void save(Tensor &tensor,const std::string &path) - { - std::string shapepath = path + ".shape"; - std::string shapedata = tensor.shape.toYaml(); - std::ofstream shape_fs(shapepath, std::ios::binary); - shape_fs.write(shapedata.c_str(), shapedata.size()); - shape_fs.close(); - - std::string datapath = path + ".data"; - std::ofstream data_fs(datapath, std::ios::binary); - data_fs.write(reinterpret_cast(tensor.data), tensor.shape.size * sizeof(T)); - data_fs.close(); - } - template - Tensor load(const std::string &path) - { - - std::string shapepath = 
path + ".shape"; - std::ifstream shape_fs(shapepath, std::ios::binary); - std::string shapedata((std::istreambuf_iterator(shape_fs)), std::istreambuf_iterator()); - - Shape shape; - shape.fromYaml(shapedata); - shape_fs.close(); - - Tensor tensor=New(shape); - std::string datapath = path + ".data"; - std::ifstream data_fs(datapath, std::ios::binary); - - data_fs.read(reinterpret_cast(tensor.data), shape.size * sizeof(T)); - data_fs.close(); - - - return tensor; - } -} - -#endif \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp new file mode 100644 index 00000000..e6feebcc --- /dev/null +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp @@ -0,0 +1,98 @@ +#ifndef DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP +#define DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP + +#include + +#include "deepx/tensor.hpp" +#include "stdutil/vector.hpp" +#include "stdutil/print.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/io.hpp" +#include "deepx/tensorfunc/new.hpp" + +namespace deepx::tensorfunc +{ + // 通用模板特化 + template + struct printDispatcher + { + static void print(const Tensor &t, const std::string &f = "") + { + Tensor vt; + vt.data = t.data; + vt.shape = t.shape; + vt.deleter = nullptr; + stdutil::print(t.shape.shape, t.data, t.shape.dtype, f); + } + }; + + // void类型的完全特化 + template <> + struct printDispatcher + { + static void print(const Tensor &t, const std::string &f = "") + { + stdutil::print(t.shape.shape, t.data, t.shape.dtype, f); + } + }; + + template + struct saveDispatcher + { + static void save(Tensor &tensor, const std::string &path, int filebegin = 0) + { + // 保存shape + std::string shapepath = path + ".shape"; + std::string shapedata = tensor.shape.toYaml(); + std::ofstream shape_fs(shapepath, std::ios::binary); + shape_fs.write(shapedata.c_str(), shapedata.size()); + shape_fs.close(); + + // 保存data + std::string datapath = path + 
".data"; + std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); + + if (!data_fs.is_open()) + { + // 如果文件不存在,则创建新文件 + data_fs.open(datapath, std::ios::binary | std::ios::out); + } + data_fs.seekp(filebegin); + data_fs.write(reinterpret_cast(tensor.data), tensor.shape.size * sizeof(T)); + data_fs.close(); + } + }; + template + struct loadDispatcher + { + static Tensor load(const std::string &path, int filebegin = 0) + { + // 加载shape + std::string shapepath = path + ".shape"; + std::ifstream shape_fs(shapepath, std::ios::binary); + std::string shapedata((std::istreambuf_iterator(shape_fs)), std::istreambuf_iterator()); + + Shape shape; + shape.fromYaml(shapedata); + shape_fs.close(); + + // 加载data + Tensor tensor = New(shape); + std::string datapath = path + ".data"; + std::ifstream data_fs(datapath, std::ios::binary); + + if (!data_fs.is_open()) + { + throw std::runtime_error("无法打开数据文件: " + datapath); + } + + // 设置读取位置 + data_fs.seekg(filebegin); + data_fs.read(reinterpret_cast(tensor.data), shape.size * sizeof(T)); + data_fs.close(); + + return tensor; + } + }; +} +#endif // DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/print_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/print_miaobyte.hpp deleted file mode 100644 index e2e5e576..00000000 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/print_miaobyte.hpp +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_PRINT_MIAOBYTE_HPP -#define DEEPX_TENSORFUNC_PRINT_MIAOBYTE_HPP - -#include - -#include "deepx/tensor.hpp" -#include "stdutil/vector.hpp" -#include "stdutil/print.hpp" -#include "deepx/tensorfunc/authors.hpp" -#include "deepx/tensorfunc/print.hpp" - -namespace deepx::tensorfunc -{ - // 通用模板特化 - template - struct printDispatcher - { - static void print(const Tensor &t, const std::string &f = "") - { - Tensor vt; - vt.data = t.data; - vt.shape = t.shape; - vt.deleter = nullptr; - 
stdutil::print(t.shape.shape, t.data, t.shape.dtype, f); - } - }; - - // void类型的完全特化 - template <> - struct printDispatcher - { - static void print(const Tensor &t, const std::string &f = "") - { - stdutil::print(t.shape.shape, t.data, t.shape.dtype, f); - } - }; -} -#endif // DEEPX_TENSORFUNC_PRINT_DEFAULT_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce.hpp deleted file mode 100644 index aebd110a..00000000 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce.hpp +++ /dev/null @@ -1,188 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_REDUCE_HPP -#define DEEPX_TENSORFUNC_REDUCE_HPP - -#include -#include -#include -#include -#include - -#include "deepx/tensor.hpp" -#include "deepx/shape_reduce.hpp" -#include "deepx/tensorfunc/init_miaobyte.hpp" - -namespace deepx::tensorfunc -{ - using namespace hwy::HWY_NAMESPACE; - - template - void sum(const Tensor &tensor, const std::vector &dims, Tensor &result) - { - constant(result,T(0)); - - std::vector sorted_dims = dims; - if (dims.size()==0){ - sorted_dims=arrange(tensor.shape.dim); - } - // 从大到小排序 - std::sort(sorted_dims.begin(), sorted_dims.end(), std::greater()); - std::vector sumMap = reduceDimMap(tensor.shape, sorted_dims); - // 如果dims的最后一个元素是tensor.shape.dim-1,则说明求和的数据不连续(不对齐),无法simd(需要不停跳跃) - - const ScalableTag _tag; - size_t minshape_1=Lanes(_tag); - // if (true) - if (sorted_dims.rbegin()[0] == tensor.shape.dim - 1 - ||tensor.shape.dim>sorted_dims.size() - ||tensor.shape[-1]>=minshape_1 - ) - { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &sumMap](const int idx_linear, const std::vector &indices, std::vector &newIndices) - { - // 计算输出索引 - - for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { - if (sumMap[i]==0) { - newIndices[j++]=indices[i]; - } - } - // 累加求和 - int outputIdx=result.shape.linearat(newIndices); -#pragma omp atomic - result.data[outputIdx]+=tensor.data[idx_linear]; }, 
result.shape.dim); - } - else - { - //这里有bug,todo - // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &sumMap](const int idx_linear, const std::vector &indices, std::vector &newIndices) - { - // 计算输出索引 - for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { - if (sumMap[i]==0) { - newIndices[j++]=indices[i]; - } - } - int outputIdx = result.shape.linearat(newIndices); - - - int shape_last = tensor.shape[-1]; - const ScalableTag tag; - const size_t lanes = Lanes(tag); - size_t j = 0; - T sum=0; - // 前部分:处理到对齐 - while (j < shape_last && !IsAligned(tag, tensor.data + idx_linear + j)) - { - sum+=tensor.data[idx_linear + j]; - ++j; - } - - // 中间部分:SIMD - size_t aligned_end = shape_last - (shape_last % lanes); - auto sum_vec = Zero(tag); // 初始化累加向量为0 - for (; j + lanes <= aligned_end; j += lanes) - { - auto vec = Load(tag, tensor.data + idx_linear + j); - sum_vec = Add(sum_vec, vec); // 向量累加 - } - -// 将向量累加结果写回 - sum+= ReduceSum(tag, sum_vec); // 使用ReduceSum替代GetLane(SumOfLane()) - - - // 尾部分:处理剩余 - for (; j < shape_last; ++j) - { - sum+=tensor.data[idx_linear + j]; - } - #pragma omp atomic - result.data[outputIdx]+=sum; - - }, result.shape.dim); - } - } - - template - void product(const Tensor &tensor, const std::vector &dims, Tensor &result) - { - - std::vector sorted_dims = dims; - if (dims.size()==0){ - sorted_dims=arrange(tensor.shape.dim); - } - // 从大到小排序 - std::sort(sorted_dims.begin(), sorted_dims.end(), std::greater()); - std::vector sumMap = reduceDimMap(tensor.shape, sorted_dims); - // 如果dims的最后一个元素是tensor.shape.dim-1,则说明求和的数据不连续(不对齐),无法simd(需要不停跳跃) - constant(result,T(1)); - if (sorted_dims.at(sorted_dims.size() - 1) == tensor.shape.dim - 1&&tensor.shape.dim>sorted_dims.size()) - { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &sumMap](const int idx_linear, const std::vector &indices, std::vector &newIndices) - { - // 计算输出索引 - - for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { - if 
(sumMap[i]==0) { - newIndices[j++]=indices[i]; - } - } - // 累加求和 - int outputIdx=result.shape.linearat(newIndices); -#pragma omp atomic - result.data[outputIdx]*=tensor.data[idx_linear]; }, result.shape.dim); - } - else - { - // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &sumMap](const int i, const std::vector &indices, std::vector &newIndices) - { - // 计算输出索引 - - for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { - if (sumMap[i]==0) { - newIndices[j++]=indices[i]; - } - } - // 累加求和 - int outputIdx = result.shape.linearat(newIndices); - - - int shape_last = tensor.shape[-1]; - const ScalableTag tag; - const size_t lanes = Lanes(tag); - size_t j = 0; - T product=1; - // 前部分:处理到对齐 - while (j < shape_last && !IsAligned(tag, tensor.data + i + j)) - { - product*=tensor.data[i + j]; - ++j; - } - - // 中间部分:SIMD - size_t aligned_end = shape_last - (shape_last % lanes); - auto product_vec = One(tag); // 初始化累乘向量为1 - for (; j + lanes <= aligned_end; j += lanes) - { - auto vec = Load(tag, tensor.data + i + j); - product_vec = Mul(product_vec, vec); // 向量累乘 - } - -// 将向量累乘结果写回 - product*= ReduceMul(tag, product_vec); - - - // 尾部分:处理剩余 - for (; j < shape_last; ++j) - { - product*=tensor.data[i + j]; - } - #pragma omp atomic - result.data[outputIdx]*=product; - - }, result.shape.dim); - } - } -} -#endif \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp new file mode 100644 index 00000000..3eb77628 --- /dev/null +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp @@ -0,0 +1,190 @@ +#ifndef DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP +#define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP + +#include +#include +#include +#include +#include + +#include "deepx/tensor.hpp" +#include "deepx/shape_reduce.hpp" +#include "deepx/tensorfunc/reduce.hpp" +#include "deepx/tensorfunc/init_miaobyte.hpp" + +namespace 
deepx::tensorfunc +{ + using namespace hwy::HWY_NAMESPACE; + + template + struct sumDispatcher + { + static void sum(const Tensor &tensor, const std::vector &dims, Tensor &result,const bool keepdims) + { + constant(result, T(0)); + + std::vector sorted_dims = dims; + if (dims.size() == 0) + { + sorted_dims = arrange(tensor.shape.dim); + } + // 从大到小排序 + std::sort(sorted_dims.begin(), sorted_dims.end(), std::greater()); + std::vector sumMap = reduceDimMap(tensor.shape, sorted_dims); + // 如果dims的最后一个元素是tensor.shape.dim-1,则说明求和的数据不连续(不对齐),无法simd(需要不停跳跃) + + const ScalableTag _tag; + size_t minshape_1 = Lanes(_tag); + // if (true) + if (sorted_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > sorted_dims.size() || tensor.shape[-1] >= minshape_1) + { + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &sumMap](const int idx_linear, const std::vector &indices, std::vector &newIndices) + { + // 计算输出索引 + + for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { + if (sumMap[i]==0) { + newIndices[j++]=indices[i]; + } + } + // 累加求和 + int outputIdx=result.shape.linearat(newIndices); +#pragma omp atomic + result.data[outputIdx]+=tensor.data[idx_linear]; }, result.shape.dim); + } + else + { + // 这里有bug,todo + // 如果数据连续(对齐),则可以simd + tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &sumMap](const int idx_linear, const std::vector &indices, std::vector &newIndices) + { + // 计算输出索引 + for (size_t i = 0, j = 0; i < tensor.shape.dim; ++i) + { + if (sumMap[i] == 0) + { + newIndices[j++] = indices[i]; + } + } + int outputIdx = result.shape.linearat(newIndices); + + int shape_last = tensor.shape[-1]; + const ScalableTag tag; + const size_t lanes = Lanes(tag); + size_t j = 0; + T sum = 0; + // 前部分:处理到对齐 + while (j < shape_last && !IsAligned(tag, tensor.data + idx_linear + j)) + { + sum += tensor.data[idx_linear + j]; + ++j; + } + + // 中间部分:SIMD + size_t aligned_end = shape_last - (shape_last % lanes); + auto sum_vec = Zero(tag); // 初始化累加向量为0 + 
for (; j + lanes <= aligned_end; j += lanes) + { + auto vec = Load(tag, tensor.data + idx_linear + j); + sum_vec = Add(sum_vec, vec); // 向量累加 + } + + // 将向量累加结果写回 + sum += ReduceSum(tag, sum_vec); // 使用ReduceSum替代GetLane(SumOfLane()) + + // 尾部分:处理剩余 + for (; j < shape_last; ++j) + { + sum += tensor.data[idx_linear + j]; + } +#pragma omp atomic + result.data[outputIdx] += sum; }, result.shape.dim); + } + } + }; + + template + struct prodDispatcher + { + static void prod(const Tensor &tensor, const std::vector &dims, Tensor &result,const bool keepdims) + { + + std::vector sorted_dims = dims; + if (dims.size() == 0) + { + sorted_dims = arrange(tensor.shape.dim); + } + // 从大到小排序 + std::sort(sorted_dims.begin(), sorted_dims.end(), std::greater()); + std::vector sumMap = reduceDimMap(tensor.shape, sorted_dims); + // 如果dims的最后一个元素是tensor.shape.dim-1,则说明求和的数据不连续(不对齐),无法simd(需要不停跳跃) + constant(result, T(1)); + if (sorted_dims.at(sorted_dims.size() - 1) == tensor.shape.dim - 1 && tensor.shape.dim > sorted_dims.size()) + { + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &sumMap](const int idx_linear, const std::vector &indices, std::vector &newIndices) + { + // 计算输出索引 + + for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { + if (sumMap[i]==0) { + newIndices[j++]=indices[i]; + } + } + // 累加求和 + int outputIdx=result.shape.linearat(newIndices); +#pragma omp atomic + result.data[outputIdx]*=tensor.data[idx_linear]; }, result.shape.dim); + } + else + { + // 如果数据连续(对齐),则可以simd + tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &sumMap](const int i, const std::vector &indices, std::vector &newIndices) + { + // 计算输出索引 + + for (size_t i = 0, j = 0; i < tensor.shape.dim; ++i) + { + if (sumMap[i] == 0) + { + newIndices[j++] = indices[i]; + } + } + // 累加求和 + int outputIdx = result.shape.linearat(newIndices); + + int shape_last = tensor.shape[-1]; + const ScalableTag tag; + const size_t lanes = Lanes(tag); + size_t j = 0; + T product = 1; + // 
前部分:处理到对齐 + while (j < shape_last && !IsAligned(tag, tensor.data + i + j)) + { + product *= tensor.data[i + j]; + ++j; + } + + // 中间部分:SIMD + size_t aligned_end = shape_last - (shape_last % lanes); + auto product_vec = One(tag); // 初始化累乘向量为1 + for (; j + lanes <= aligned_end; j += lanes) + { + auto vec = Load(tag, tensor.data + i + j); + product_vec = Mul(product_vec, vec); // 向量累乘 + } + + // 将向量累乘结果写回 + product *= ReduceMul(tag, product_vec); + + // 尾部分:处理剩余 + for (; j < shape_last; ++j) + { + product *= tensor.data[i + j]; + } +#pragma omp atomic + result.data[outputIdx] *= product; }, result.shape.dim); + } + } + }; +} +#endif \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/print.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp similarity index 91% rename from excuter/op-mem-ompsimd/src/deepx/tf/print.hpp rename to excuter/op-mem-ompsimd/src/deepx/tf/io.hpp index 5746b435..ba180f3d 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/print.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp @@ -1,9 +1,9 @@ -#ifndef DEEPX_TF_PRINT_HPP -#define DEEPX_TF_PRINT_HPP +#ifndef DEEPX_TF_IO_HPP +#define DEEPX_TF_IO_HPP #include "deepx/tf/tf.hpp" -#include "deepx/tensorfunc/print.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/authors.hpp" namespace deepx::tf { @@ -62,4 +62,4 @@ namespace deepx::tf } }; } -#endif +#endif // DEEPX_TF_IO_HPP diff --git a/excuter/op-mem-ompsimd/test/op/1_mem.cpp b/excuter/op-mem-ompsimd/test/op/1_mem.cpp index 327210f5..6433d086 100644 --- a/excuter/op-mem-ompsimd/test/op/1_mem.cpp +++ b/excuter/op-mem-ompsimd/test/op/1_mem.cpp @@ -3,7 +3,7 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/authors.hpp" using namespace 
deepx::mem; diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp index 128c18d1..05a0df43 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp @@ -5,9 +5,8 @@ #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" #include "deepx/tensorfunc/authors.hpp" -#include "deepx/tensorfunc/file.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" using namespace deepx; using namespace deepx::tensorfunc; @@ -15,11 +14,11 @@ void test_tensor_new(){ Tensor tensor=New({2, 3}); constant(tensor,1); print(tensor); - save(tensor,"tensor"); + save(tensor,"tensor"); Tensor tensor2=New({2, 3}); constant(tensor2,2); print(tensor2); - save(tensor2,"tensor2"); + save(tensor2,"tensor2"); } void test_arange() { diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp index 969a0ad4..b1d5dcb8 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp @@ -4,8 +4,8 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" -#include "deepx/tensorfunc/file.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" +#include "deepx/tensorfunc/authors.hpp" using namespace deepx; @@ -14,11 +14,11 @@ void test_tensor_range(){ Tensor tensor=New({2, 3}); constant(tensor,1); print(tensor); - save(tensor,"2_tensor_range.1"); + save(tensor,"2_tensor_range.1"); Tensor tensor2=New({2, 3}); constant(tensor2,2); print(tensor2); - save(tensor2,"2_tensor_range.2"); + save(tensor2,"2_tensor_range.2"); } int main(){ diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp index 
11df39bc..6657ecce 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp @@ -1,13 +1,13 @@ #include #include "deepx/tensor.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/new.hpp" -#include "deepx/tensorfunc/file.hpp" +#include "deepx/tensorfunc/authors.hpp" int main(){ deepx::Tensor t=deepx::tensorfunc::New({2, 3,4}); std::iota(t.data, t.data+t.shape.size, 0); deepx::tensorfunc::print(t); - deepx::tensorfunc::save(t,"3_tensor_print"); + deepx::tensorfunc::save(t,"3_tensor_print"); return 0; } \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_add.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_add.cpp index 784f642c..35a57404 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_add.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_add.cpp @@ -4,7 +4,7 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/elementwise.hpp" #include "deepx/tensorfunc/elementwise_miaobyte.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/init.hpp" #include "tensorutil.hpp" diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp index 3921c69d..0b1b791e 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp @@ -3,7 +3,7 @@ #include #include -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" @@ -12,8 +12,7 @@ #include "deepx/tensorfunc/init_miaobyte.hpp" #include "deepx/tensorfunc/authors.hpp" #include "deepx/shape_matmul.hpp" -#include "deepx/tensorfunc/file.hpp" - + using namespace deepx; using 
namespace deepx::tensorfunc; /* @@ -50,10 +49,10 @@ void test_tensor_matmul(){ void bench_tensor_matmul(int i) { Tensor tensor= New({i,i}); uniform(tensor,0,1); - save(tensor,"4_tensor_matmul"+std::to_string(i)+"tensor"); + save(tensor,"4_tensor_matmul"+std::to_string(i)+"tensor"); Tensor tensor2= New({i,i}); uniform(tensor2,0,1); - save(tensor2,"4_tensor_matmul"+std::to_string(i)+"tensor2"); + save(tensor2,"4_tensor_matmul"+std::to_string(i)+"tensor2"); Tensor tensor3= New(matmul_shape(tensor.shape, tensor2.shape).shape); std::cout<<("matmul ", i, "x", i); auto start = std::chrono::high_resolution_clock::now(); @@ -61,7 +60,7 @@ void bench_tensor_matmul(int i) { matmul(tensor, tensor2, tensor3); auto end=std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; - save(tensor3,"4_tensor_matmul"+std::to_string(i)+"result"); + save(tensor3,"4_tensor_matmul"+std::to_string(i)+"result"); std::cout << "time:" << duration.count() << " seconds" << std::endl; } diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_max.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_max.cpp index 015e009f..a2438adf 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_max.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_max.cpp @@ -3,7 +3,7 @@ #include "deepx/tensorfunc/elementwise_miaobyte.hpp" #include "deepx/tensor.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/authors.hpp" #include "tensorutil.hpp" diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_mul.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_mul.cpp index 4e469d58..c93e42d9 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_mul.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_mul.cpp @@ -4,7 +4,7 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/elementwise.hpp" #include 
"deepx/tensorfunc/elementwise_miaobyte.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" #include "tensorutil.hpp" diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_sub.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_sub.cpp index e57b91ef..0cbd2b86 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_sub.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_sub.cpp @@ -3,7 +3,7 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/elementwise.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/init.hpp" #include "deepx/tensorfunc/authors.hpp" diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp index 0a3f0949..02fc55ef 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp @@ -5,14 +5,14 @@ #include #include "deepx/tensor.hpp" -#include "deepx/tensorfunc/reduce.hpp" +#include "deepx/tensorfunc/reduce_miaobyte.hpp" #include "stdutil/vector.hpp" #include "deepx/vector_combination.hpp" #include "deepx/shape_reduce.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" -#include "deepx/tensorfunc/file.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" +#include "deepx/tensorfunc/authors.hpp" #include @@ -33,7 +33,7 @@ void test_sum() std::cout <<"sum(t,"<< comb <<")"<< std::endl; Shape sumshape=reduceShape(shape,comb); Tensor r = New(sumshape.shape); - sum(tensor, comb, r); + sum(tensor, comb,r); print(r); } /* @@ -59,8 +59,8 @@ void benchmark_sum(int i){ { Shape sShape = reduceShape(shape, comb); Tensor r=New(sShape.shape); - sum(tensor, comb,r); - 
save(r,"5_tensor_sum"+std::to_string(i)+"result"); + sum(tensor, comb,r); + save(r,"5_tensor_sum"+std::to_string(i)+"result"); } auto end=std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/6_tensor_broadcast.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/6_tensor_broadcast.cpp index 2b7c6ebb..6658be10 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/6_tensor_broadcast.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/6_tensor_broadcast.cpp @@ -2,7 +2,7 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/changeshape.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" #include "deepx/tensorfunc/elementwise.hpp" diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp index 90188489..d843b6e1 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp @@ -6,7 +6,7 @@ #include "deepx/tensorfunc/changeshape_miaobyte.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/authors.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" #include "stdutil/vector.hpp" #include "tensorutil.hpp" #include "deepx/shape_transpose.hpp" diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp index 3a6bafdc..9456e408 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp @@ -9,7 +9,7 @@ #include "deepx/shape_concat.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" -#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/io_miaobyte.hpp" 
#include "stdutil/vector.hpp" #include "deepx/mem/mem.hpp" #include "deepx/mem/mem_ompsimd.hpp" diff --git a/front/py/deepx/autograd/graph.py b/front/py/deepx/autograd/graph.py index e5b0f86e..332407a3 100644 --- a/front/py/deepx/autograd/graph.py +++ b/front/py/deepx/autograd/graph.py @@ -6,6 +6,7 @@ class Graph: # 类属性存储默认实例 _default_graph = None + @classmethod def get_default(cls): """获取或创建默认计算图(线程不安全)""" @@ -87,5 +88,6 @@ def graph_method(f): return f + # 初始化默认图 Graph._default_graph = Graph() \ No newline at end of file diff --git a/src/deepx/tensorfunc/changeshape_miaobyte.cu b/src/deepx/tensorfunc/changeshape_miaobyte.cu deleted file mode 100644 index 0519ecba..00000000 --- a/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file