diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index 5ef7ea8..6cd9729 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -96,11 +96,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("cuda error"); - } + throwcudaerror("Failed to launch transpose kernel",cudaGetLastError()); } template void launch_transpose(const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); @@ -224,12 +220,9 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("cuda error"); - } + throwcudaerror("Failed to launch concat kernel",cudaGetLastError()); } + template void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); template void launch_concat(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); template void launch_concat(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); @@ -335,11 +328,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); } - cudaError_t err = cudaGetLastError(); - if (err 
!= cudaSuccess) - { - throw std::runtime_error("cuda error"); - } + throwcudaerror("Failed to launch broadcastTo kernel",cudaGetLastError()); } template void launch_broadcastTo(const double *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, @@ -489,12 +478,9 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("cuda error"); - } + throwcudaerror("Failed to launch indexselect kernel",cudaGetLastError()); } + template void launch_indexselect(const double *input, const int *inputStrides, const int inputDim, const int64_t *index, const int *indexStrides, const int indexDim, const int gatherAxis, diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp index 5f4b3cd..e44cd1a 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp @@ -68,17 +68,7 @@ namespace deepx::tensorfunc using std::shared_ptr; - inline std::pair> device_offload(unsigned char *data,int size) - { - shared_ptr host_data(new unsigned char[size]); - cudaMemcpy(host_data.get(), data, size, cudaMemcpyDeviceToHost); - cudaError_t err=cudaGetLastError(); - if(err!=cudaSuccess){ - throw std::runtime_error("Failed to copy data from device to host"); - - } - return {size, host_data}; - } + inline void throwcudaerror(const std::string& msg,cudaError_t err){ if (err != cudaSuccess) @@ -86,6 +76,13 @@ namespace deepx::tensorfunc throw std::runtime_error(msg + "\n" + std::string(cudaGetErrorString(err))); } } + inline std::pair> device_offload(unsigned char *data,int size) + { + shared_ptr host_data(new unsigned char[size]); + cudaMemcpy(host_data.get(), data, size, cudaMemcpyDeviceToHost); + throwcudaerror("Failed to copy data from device to host",cudaGetLastError()); + return {size, host_data}; + } 
} #endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu index 16011b3..a9b1ca2 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu @@ -27,12 +27,7 @@ namespace deepx::tensorfunc void launch_todtype(const T* a, Dtype* c,const int size){ auto [numBlocks, blockSize] = BestDims(size); todtype_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch todtype kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch todtype kernel",cudaGetLastError()); } template void launch_todtype(const double *a, float *c, const int size); template void launch_todtype(const double *a, half *c, const int size); @@ -114,13 +109,7 @@ namespace deepx::tensorfunc // 启动kernel auto [numBlocks, blockSize] = BestDims(size); add_kernel<<>>(a, b, c, size); - // 检查kernel执行是否成功 - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch add kernel",cudaGetLastError()); } template void launch_add(const double *a, const double *b, double *c, const int size); @@ -147,12 +136,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); addscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch addscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch addscalar kernel",cudaGetLastError()); } template void launch_addscalar(const double *a, const double scalar, double *c, const int size); template void launch_addscalar(const float *a, const float scalar, float *c, const 
int size); @@ -178,12 +162,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); sub_kernel<<>>(a, b, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch sub kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch sub kernel",cudaGetLastError()); } template void launch_sub(const double *a, const double *b, double *c, const int size); template void launch_sub(const float *a, const float *b, float *c, const int size); @@ -209,13 +188,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); subscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch subscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch subscalar kernel",cudaGetLastError()); } + template void launch_subscalar(const double *a, const double scalar, double *c, const int size); template void launch_subscalar(const float *a, const float scalar, float *c, const int size); template void launch_subscalar(const half *a, const half scalar, half *c, const int size); @@ -239,11 +214,7 @@ namespace deepx::tensorfunc void launch_rsubscalar(const T scalar, const T* a, T* c,const int size){ auto [numBlocks, blockSize] = BestDims(size); rsubscalar_kernel<<>>(scalar, a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch rsubscalar kernel: "+std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch rsubscalar kernel",cudaGetLastError()); } template void launch_rsubscalar(const double scalar, const double* a, double* c,const int size); template void launch_rsubscalar(const float scalar, const float* a, float* c,const int size); @@ -271,13 +242,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); 
mul_kernel<<>>(a, b, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch mul kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch mul kernel",cudaGetLastError()); } + template void launch_mul(const double *a, const double *b, double *c, const int size); template void launch_mul(const float *a, const float *b, float *c, const int size); template void launch_mul(const half *a, const half *b, half *c, const int size); @@ -302,12 +269,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); mulscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch mulscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch mulscalar kernel",cudaGetLastError()); } template void launch_mulscalar(const double *a, const double scalar, double *c, const int size); template void launch_mulscalar(const float *a, const float scalar, float *c, const int size); @@ -333,13 +295,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); div_kernel<<>>(a, b, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch div kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch div kernel",cudaGetLastError()); } + template void launch_div(const double *a, const double *b, double *c, const int size); template void launch_div(const float *a, const float *b, float *c, const int size); template void launch_div(const half *a, const half *b, half *c, const int size); @@ -364,13 +322,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); divscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to 
launch divscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch divscalar kernel",cudaGetLastError()); } + template void launch_divscalar(const double *a, const double scalar, double *c, const int size); template void launch_divscalar(const float *a, const float scalar, float *c, const int size); template void launch_divscalar(const half *a, const half scalar, half *c, const int size); @@ -395,13 +349,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); rdivscalar_kernel<<>>(scalar, a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch rdivscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch rdivscalar kernel",cudaGetLastError()); } + template void launch_rdivscalar(const double scalar, const double *a, double *c, const int size); template void launch_rdivscalar(const float scalar, const float *a, float *c, const int size); template void launch_rdivscalar(const half scalar, const half *a, half *c, const int size); @@ -421,17 +371,21 @@ namespace deepx::tensorfunc } } + template <> + __global__ void invert_kernel(const bool *A, bool *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { + C[idx] = !A[idx]; + } + } + template void launch_invert(const T *a, T *c, const int size) { auto [numBlocks, blockSize] = BestDims(size); invert_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch invert kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch invert kernel",cudaGetLastError()); } template void launch_invert(const int64_t *a, int64_t *c, const int size); template void launch_invert(const int32_t *a, int32_t *c, const int size); diff --git 
a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu index a33a872..a514755 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu @@ -21,12 +21,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); max_kernel<<>>(A, B, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch max kernel",cudaGetLastError()); } template void launch_max(const double *A, const double *B, double *C, const int size); @@ -53,12 +48,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); maxscalar_kernel<<>>(A, scalar, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch maxscalar kernel",cudaGetLastError()); } template void launch_maxscalar(const double *A, const double scalar, double *C, const int size); @@ -85,12 +75,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); min_kernel<<>>(A, B, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch min kernel",cudaGetLastError()); } template void launch_min(const double *A, const double *B, double *C, const int size); @@ -117,12 +102,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); minscalar_kernel<<>>(A, scalar, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add
kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch minscalar kernel",cudaGetLastError()); } template void launch_minscalar(const double *A, const double scalar, double *C, const int size); @@ -175,12 +155,7 @@ namespace deepx::tensorfunc { equalwithepsilon_kernel<<>>(A, B, epsilon, mask, size); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch equal kernel",cudaGetLastError()); } template void launch_equal(const double *A, const double *B, const float epsilon, bool *mask, const int size); @@ -233,12 +208,7 @@ namespace deepx::tensorfunc { equalscalarwithepsilon_kernel<<>>(A, scalar, epsilon, mask, size); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch equalscalar kernel",cudaGetLastError()); } template void launch_equalscalar(const double *A, const double scalar, const float epsilon, bool *mask, const int size); @@ -291,12 +261,7 @@ namespace deepx::tensorfunc { notequalwithepsilon_kernel<<>>(A, B, epsilon, mask, size); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch notequal kernel",cudaGetLastError()); } template void launch_notequal(const double *A, const double *B, const float epsilon, bool *mask, const int size); @@ -349,12 +314,7 @@ namespace deepx::tensorfunc { notequalscalarwithepsilon_kernel<<>>(A, scalar, epsilon, mask, size); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch notequalscalar kernel",cudaGetLastError()); }
template void launch_notequalscalar(const double *A, const double scalar, const float epsilon, bool *mask, const int size); @@ -382,12 +342,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); less_kernel<<>>(A, B, mask, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch less kernel",cudaGetLastError()); } template void launch_less(const double *A, const double *B, bool *mask, const int size); @@ -416,12 +371,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); lessscalar_kernel<<>>(A, scalar, mask, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch lessscalar kernel",cudaGetLastError()); } template void launch_lessscalar(const double *A, const double scalar, bool *mask, const int size); @@ -449,12 +399,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); greater_kernel<<>>(A, B, mask, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch greater kernel",cudaGetLastError()); } template void launch_greater(const double *A, const double *B, bool *mask, const int size); @@ -482,12 +427,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); greaterscalar_kernel<<>>(A, scalar, mask, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch greaterscalar kernel",cudaGetLastError()); } template void
launch_greaterscalar(const double *A, const double scalar, bool *mask, const int size); @@ -516,13 +456,9 @@ namespace deepx::tensorfunc auto [numBlocks, blockSize] = BestDims(size); cudaVector tensorsdataList(tensorsdata, numTensors, cudaMemcpyHostToDevice); switch_kernel<<>>(tensorsdataList.data, numTensors, cases, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch switch kernel",cudaGetLastError()); } + template void launch_switch(const double **tensorsdata, const int numTensors, const int32_t *cases, double *C, const int size); template void launch_switch(const float **tensorsdata, const int numTensors, const int32_t *cases, float *C, const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu index 00bd232..18ca1a1 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu @@ -45,19 +45,16 @@ namespace deepx::tensorfunc } template - void launch_sin(int numBlocks, int blockSize, const T* a, T* c, const int size){ + void launch_sin(const T* a, T* c, const int size){ + auto [numBlocks, blockSize] = BestDims(size); sin_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch sin kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch sin kernel",cudaGetLastError()); } - template void launch_sin(int numBlocks, int blockSize, const double* a, double* c, const int size); - template void launch_sin(int numBlocks, int blockSize, const float* a, float* c, const int size); - template void launch_sin<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - 
template void launch_sin(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + template void launch_sin(const double* a, double* c, const int size); + template void launch_sin(const float* a, float* c, const int size); + template void launch_sin<__half>(const __half* a, __half* c, const int size); + template void launch_sin(const nv_bfloat16* a, nv_bfloat16* c, const int size); // cos template __global__ void cos_kernel(const T* A, T* C, const int size); @@ -91,18 +88,15 @@ namespace deepx::tensorfunc } } template - void launch_cos(int numBlocks, int blockSize, const T* a, T* c, const int size){ + void launch_cos(const T* a, T* c, const int size){ + auto [numBlocks, blockSize] = BestDims(size); cos_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch cos kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch cos kernel",cudaGetLastError()); } - template void launch_cos(int numBlocks, int blockSize, const double* a, double* c, const int size); - template void launch_cos(int numBlocks, int blockSize, const float* a, float* c, const int size); - template void launch_cos<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - template void launch_cos(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + template void launch_cos(const double* a, double* c, const int size); + template void launch_cos(const float* a, float* c, const int size); + template void launch_cos<__half>(const __half* a, __half* c, const int size); + template void launch_cos(const nv_bfloat16* a, nv_bfloat16* c, const int size); // tan template __global__ void tan_kernel(const T* A, T* C, const int size); @@ -123,17 +117,14 @@ namespace deepx::tensorfunc template - void launch_tan(int numBlocks, int blockSize, const T* a, T* c, const int size){ + void launch_tan(const T* a, T* c, const 
int size){ + auto [numBlocks, blockSize] = BestDims(size); tan_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch tan kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch tan kernel",cudaGetLastError()); } - template void launch_tan(int numBlocks, int blockSize, const double* a, double* c, const int size); - template void launch_tan(int numBlocks, int blockSize, const float* a, float* c, const int size); - + + template void launch_tan( const double* a, double* c, const int size); + template void launch_tan(const float* a, float* c, const int size); } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh index a66e996..f0e68d1 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh @@ -11,60 +11,26 @@ namespace deepx::tensorfunc { + // sin template __global__ void sin_kernel(const T* A, T* C, const int size); template - void launch_sin(int numBlocks, int blockSize, const T* a, T* c, const int size); + void launch_sin(const T* a, T* c, const int size); - template <> - void launch_sin(int numBlocks, int blockSize, const double* a, double* c, const int size); - - template <> - void launch_sin(int numBlocks, int blockSize, const float* a, float* c, const int size); - - template <> - void launch_sin(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); - - template <> - void launch_sin<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); template __global__ void cos_kernel(const T* A, T* C, const int size); template - void launch_cos(int numBlocks, int blockSize, const T* a, T* c, const int size); - - template <> - void launch_cos(int 
numBlocks, int blockSize, const double* a, double* c, const int size); - - template <> - void launch_cos(int numBlocks, int blockSize, const float* a, float* c, const int size); - - template <> - void launch_cos(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + void launch_cos( const T* a, T* c, const int size); - template <> - void launch_cos<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - + // tan template __global__ void tan_kernel(const T* A, T* C, const int size); template - void launch_tan(int numBlocks, int blockSize, const T* a, T* c, const int size); - - template <> - void launch_tan(int numBlocks, int blockSize, const double* a, double* c, const int size); - - template <> - void launch_tan(int numBlocks, int blockSize, const float* a, float* c, const int size); - - template <> - void launch_tan(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); - - template <> - void launch_tan<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - + void launch_tan( const T* a, T* c, const int size); } #endif \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp index 4a71c66..b126217 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp @@ -19,9 +19,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("sin"); } - const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_sin(numBlocks, blockSize, A.data, C.data, A.shape.size); + launch_sin(A.data, C.data, A.shape.size); } }; @@ -33,9 +31,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("cos"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_cos(numBlocks, blockSize, A.data, C.data, A.shape.size); + launch_cos(A.data, C.data, A.shape.size); } }; @@ -47,9 +43,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("tan"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_tan(numBlocks, blockSize, A.data, C.data, A.shape.size); + launch_tan(A.data, C.data, A.shape.size); } }; diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index 45b24be..c655b60 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -24,12 +24,8 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); sqrt_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch sqrt kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch sqrt kernel",cudaGetLastError()); + } template void launch_sqrt(const double *a, double *c, const int size); template void launch_sqrt(const float *a, float *c, const int size); @@ -51,12 +47,8 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); pow_kernel<<>>(a, b, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to 
launch pow kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch pow kernel",cudaGetLastError()); + } template void launch_pow(const double *a, const double *b, double *c, const int size); template void launch_pow(const float *a, const float *b, float *c, const int size); @@ -76,12 +68,8 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); powscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch powscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch powscalar kernel",cudaGetLastError()); + } template void launch_powscalar(const double *a, const double scalar, double *c, const int size); template void launch_powscalar(const float *a, const float scalar, float *c, const int size); @@ -101,12 +89,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); rpowscalar_kernel<<>>(scalar, a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch rpowscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch rpowscalar kernel",cudaGetLastError()); } template void launch_rpowscalar(const double scalar, const double *a, double *c, const int size); template void launch_rpowscalar(const float scalar, const float *a, float *c, const int size); @@ -126,12 +109,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); log_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch log kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch log kernel",cudaGetLastError()); } template void launch_log(const double *a, double *c, const int size); template void launch_log(const float *a, float *c, const int size); 
@@ -152,12 +130,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); exp_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch exp kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch exp kernel",cudaGetLastError()); } template void launch_exp(const double *a, double *c, const int size); template void launch_exp(const float *a, float *c, const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu index 7f8d261..cc7d708 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu @@ -24,12 +24,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); kernel_constant<<>>(a, value, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to launch constant kernel"); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to synchronize device"); + throwcudaerror("Failed to launch constant kernel",cudaGetLastError()); } template void launch_constant(double *a, const double value, const int size); @@ -65,12 +60,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); dropout_kernel<<>>(a, p, seed, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch dropout kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch dropout kernel",cudaGetLastError()); } template void launch_dropout(double *a, const float p, const unsigned int seed, const int size); template void launch_dropout(float *a, const float p, const unsigned int seed, const int size); @@ -98,12 +88,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = 
BestDims(size); kernel_arange<<>>(a, static_cast(start), static_cast(step), size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to launch arange kernel"); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to synchronize device"); + throwcudaerror("Failed to launch arange kernel",cudaGetLastError()); } template void launch_arange(double *a, const double start, const double step, const int size); template void launch_arange(float *a, const float start, const float step, const int size); @@ -138,12 +123,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); kernel_uniform<<>>(a, float(low), float(high), seed, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to launch uniform kernel"); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to synchronize device"); + throwcudaerror("Failed to launch uniform kernel",cudaGetLastError()); } template void launch_uniform(double *a, const double low, const double high, const unsigned int seed, const int size); template void launch_uniform(float *a, const float low, const float high, const unsigned int seed, const int size); @@ -176,12 +156,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); kernel_normal<<>>(a, float(mean), float(stddev), seed, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to launch normal kernel"); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to synchronize device"); + throwcudaerror("Failed to launch normal kernel",cudaGetLastError()); } template void launch_normal(double *a, const double mean, const double stddev, const unsigned int seed, const int size); template void launch_normal(float *a, const float mean, const float stddev, const unsigned int 
seed, const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp index 065923e..da58c2c 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp @@ -22,7 +22,6 @@ namespace deepx::tensorfunc static void print(const Tensor &t, const std::string &f = "") { int64_t total_bytes = t.shape.bytes(); - // 统一分配CPU内存 unsigned char* device_data=reinterpret_cast(t.data); auto [_,host_data]= device_offload(device_data,total_bytes); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu index c6047ac..7aa90c4 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu @@ -100,6 +100,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } + throwcudaerror("Failed to launch sum kernel",cudaGetLastError()); } template void launch_sum(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, @@ -196,6 +197,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } + throwcudaerror("Failed to launch prod kernel",cudaGetLastError()); } template void launch_prod(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, @@ -304,6 +306,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } + throwcudaerror("Failed to launch max kernel",cudaGetLastError()); }; template void launch_reducemax(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, @@ -412,6 +415,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dim too large, max 
support " + std::to_string(MAX_DIM)); } + throwcudaerror("Failed to launch min kernel",cudaGetLastError()); } template void launch_reducemin(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index 7f47046..864334c 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -339,6 +339,28 @@ namespace deepx::tensorfunc } }; + template <> + struct invertDispatcher + { + static void invert(const Tensor &A, Tensor &C) + { + if (A.shape == C.shape) + { + A.shape.rangeElementwiseParallel([&A, &C](int idx, int idx_end) + { + for (int j=0;j struct sqrtDispatcher>> { @@ -379,6 +401,8 @@ namespace deepx::tensorfunc } } }; + + // sqrt template struct sqrtDispatcher>> { diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py index ba40d67..d7dbd78 100644 --- a/front/py/deepx/tensor/tensor.py +++ b/front/py/deepx/tensor/tensor.py @@ -119,8 +119,7 @@ def __radd__(self, other:Union[Number,'Tensor']): def __sub__(self, other:Union[Number,'Tensor']): return self.sub(other) def __rsub__(self, other:Union[Number,'Tensor']): - x=self.mul(-1) - return x.add(other) + return self.rsub(other) def __mul__(self, other:Union[Number,'Tensor']): return self.mul(other) def __rmul__(self, other:Union[Number,'Tensor']): diff --git a/front/py/examples/1_tensor/1_new.py b/front/py/examples/1_tensor/1_new.py index aed5e7c..6a99a44 100644 --- a/front/py/examples/1_tensor/1_new.py +++ b/front/py/examples/1_tensor/1_new.py @@ -18,7 +18,7 @@ def printall(t): def newtensor(dtype): from deepx.nn.functional import newtensor - t=newtensor(1,2,3,dtype=dtype) + t=newtensor((1,2,3),dtype=dtype) printall(t) diff --git a/front/py/examples/4_transformer/llama/llama_rope.py 
b/front/py/examples/4_transformer/llama/llama_rope.py index 53c8995..e0f0598 100644 --- a/front/py/examples/4_transformer/llama/llama_rope.py +++ b/front/py/examples/4_transformer/llama/llama_rope.py @@ -1,4 +1,4 @@ -from .llama_rope_torch import dir,config +from llama_rope_torch import dir,config ############-------DEEPX-------################ from deepx.nn.modules import Embedding,Module @@ -14,7 +14,8 @@ def __init__(self,configdict:dict): super().__init__() self.embed_tokens = Embedding(configdict["vocab_size"], configdict["hidden_size"],weight=embed_tokens_weight) self.rotary_emb = LlamaRotaryEmbedding(config=configdict) - + print("rotary_emb.inv_freq") + self.rotary_emb.inv_freq.print() def forward(self,x): inputs_embeds = self.embed_tokens(x) hidden_states = inputs_embeds diff --git a/front/py/examples/4_transformer/llama/llama_rope_torch.py b/front/py/examples/4_transformer/llama/llama_rope_torch.py index 4e9301c..3f894e3 100644 --- a/front/py/examples/4_transformer/llama/llama_rope_torch.py +++ b/front/py/examples/4_transformer/llama/llama_rope_torch.py @@ -56,7 +56,6 @@ def __init__(self, config: LlamaConfig): print(self.rotary_emb.inv_freq) def forward(self, x): inputs_embeds = self.embed_tokens(x) - print(inputs_embeds) hidden_states = inputs_embeds # create position embeddings to be shared across the decoder layers position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)