Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 6 additions & 20 deletions excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,7 @@ namespace deepx::tensorfunc
default:
throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM));
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("cuda error");
}
throwcudaerror("Failed to launch transpose kernel",cudaGetLastError());
}

template void launch_transpose<double>(const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
Expand Down Expand Up @@ -224,12 +220,9 @@ namespace deepx::tensorfunc
default:
throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM));
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("cuda error");
}
throwcudaerror("Failed to launch concat kernel",cudaGetLastError());
}

template void launch_concat<double>(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
template void launch_concat<float>(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
template void launch_concat<nv_bfloat16>(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
Expand Down Expand Up @@ -335,11 +328,7 @@ namespace deepx::tensorfunc
default:
throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM));
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("cuda error");
}
throwcudaerror("Failed to launch broadcastTo kernel",cudaGetLastError());
}
template void launch_broadcastTo<double>(const double *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
Expand Down Expand Up @@ -489,12 +478,9 @@ namespace deepx::tensorfunc
default:
throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM));
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("cuda error");
}
throwcudaerror("Failed to launch indexselect kernel",cudaGetLastError());
}

template void launch_indexselect<double, int64_t>(const double *input, const int *inputStrides, const int inputDim,
const int64_t *index, const int *indexStrides, const int indexDim,
const int gatherAxis,
Expand Down
19 changes: 8 additions & 11 deletions excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,24 +68,21 @@ namespace deepx::tensorfunc

using std::shared_ptr;

inline std::pair<int, std::shared_ptr<unsigned char[]>> device_offload(unsigned char *data,int size)
{
shared_ptr<unsigned char[]> host_data(new unsigned char[size]);
cudaMemcpy(host_data.get(), data, size, cudaMemcpyDeviceToHost);
cudaError_t err=cudaGetLastError();
if(err!=cudaSuccess){
throw std::runtime_error("Failed to copy data from device to host");

}
return {size, host_data};
}


// No-op on cudaSuccess; otherwise raises std::runtime_error combining the
// caller-supplied message with the CUDA runtime's description of `err`.
inline void throwcudaerror(const std::string& msg, cudaError_t err)
{
    if (err == cudaSuccess)
        return;
    throw std::runtime_error(msg + "\n" + std::string(cudaGetErrorString(err)));
}
// Copies `size` bytes from device pointer `data` into a freshly allocated
// host buffer and returns {size, buffer}.
// Throws std::runtime_error (via throwcudaerror) if the copy fails.
inline std::pair<int, std::shared_ptr<unsigned char[]>> device_offload(unsigned char *data, int size)
{
    shared_ptr<unsigned char[]> host_data(new unsigned char[size]);
    // Check cudaMemcpy's own return value rather than cudaGetLastError():
    // cudaGetLastError() clears the sticky error state and can surface an
    // unrelated failure left over from an earlier asynchronous call.
    throwcudaerror("Failed to copy data from device to host",
                   cudaMemcpy(host_data.get(), data, size, cudaMemcpyDeviceToHost));
    return {size, host_data};
}
}

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,7 @@ namespace deepx::tensorfunc
void launch_todtype(const T* a, Dtype* c,const int size){
auto [numBlocks, blockSize] = BestDims(size);
todtype_kernel<<<numBlocks, blockSize>>>(a, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch todtype kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch todtype kernel",cudaGetLastError());
}
template void launch_todtype<double, float>(const double *a, float *c, const int size);
template void launch_todtype<double, half>(const double *a, half *c, const int size);
Expand Down Expand Up @@ -114,13 +109,7 @@ namespace deepx::tensorfunc
// 启动kernel
auto [numBlocks, blockSize] = BestDims(size);
add_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
// 检查kernel执行是否成功
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch add kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch add kernel",cudaGetLastError());
}

template void launch_add<double>(const double *a, const double *b, double *c, const int size);
Expand All @@ -147,12 +136,7 @@ namespace deepx::tensorfunc
{
auto [numBlocks, blockSize] = BestDims(size);
addscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch addscalar kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch addscalar kernel",cudaGetLastError());
}
template void launch_addscalar<double>(const double *a, const double scalar, double *c, const int size);
template void launch_addscalar<float>(const float *a, const float scalar, float *c, const int size);
Expand All @@ -178,12 +162,7 @@ namespace deepx::tensorfunc
{
auto [numBlocks, blockSize] = BestDims(size);
sub_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch sub kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch sub kernel",cudaGetLastError());
}
template void launch_sub<double>(const double *a, const double *b, double *c, const int size);
template void launch_sub<float>(const float *a, const float *b, float *c, const int size);
Expand All @@ -209,13 +188,9 @@ namespace deepx::tensorfunc
{
auto [numBlocks, blockSize] = BestDims(size);
subscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch subscalar kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch subscalar kernel",cudaGetLastError());
}

template void launch_subscalar<double>(const double *a, const double scalar, double *c, const int size);
template void launch_subscalar<float>(const float *a, const float scalar, float *c, const int size);
template void launch_subscalar<half>(const half *a, const half scalar, half *c, const int size);
Expand All @@ -239,11 +214,7 @@ namespace deepx::tensorfunc
void launch_rsubscalar(const T scalar, const T* a, T* c,const int size){
auto [numBlocks, blockSize] = BestDims(size);
rsubscalar_kernel<<<numBlocks, blockSize>>>(scalar, a, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch rsubscalar kernel: "+std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch rsubscalar kernel",cudaGetLastError());
}
template void launch_rsubscalar<double>(const double scalar, const double* a, double* c,const int size);
template void launch_rsubscalar<float>(const float scalar, const float* a, float* c,const int size);
Expand Down Expand Up @@ -271,13 +242,9 @@ namespace deepx::tensorfunc
{
auto [numBlocks, blockSize] = BestDims(size);
mul_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch mul kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch mul kernel",cudaGetLastError());
}

template void launch_mul<double>(const double *a, const double *b, double *c, const int size);
template void launch_mul<float>(const float *a, const float *b, float *c, const int size);
template void launch_mul<half>(const half *a, const half *b, half *c, const int size);
Expand All @@ -302,12 +269,7 @@ namespace deepx::tensorfunc
{
auto [numBlocks, blockSize] = BestDims(size);
mulscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch mulscalar kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch mulscalar kernel",cudaGetLastError());
}
template void launch_mulscalar<double>(const double *a, const double scalar, double *c, const int size);
template void launch_mulscalar<float>(const float *a, const float scalar, float *c, const int size);
Expand All @@ -333,13 +295,9 @@ namespace deepx::tensorfunc
{
auto [numBlocks, blockSize] = BestDims(size);
div_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch div kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch div kernel",cudaGetLastError());
}

template void launch_div<double>(const double *a, const double *b, double *c, const int size);
template void launch_div<float>(const float *a, const float *b, float *c, const int size);
template void launch_div<half>(const half *a, const half *b, half *c, const int size);
Expand All @@ -364,13 +322,9 @@ namespace deepx::tensorfunc
{
auto [numBlocks, blockSize] = BestDims(size);
divscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch divscalar kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch divscalar kernel",cudaGetLastError());
}

template void launch_divscalar<double>(const double *a, const double scalar, double *c, const int size);
template void launch_divscalar<float>(const float *a, const float scalar, float *c, const int size);
template void launch_divscalar<half>(const half *a, const half scalar, half *c, const int size);
Expand All @@ -395,13 +349,9 @@ namespace deepx::tensorfunc
{
auto [numBlocks, blockSize] = BestDims(size);
rdivscalar_kernel<<<numBlocks, blockSize>>>(scalar, a, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch rdivscalar kernel: " +
std::string(cudaGetErrorString(err)));
}
throwcudaerror("Failed to launch rdivscalar kernel",cudaGetLastError());
}

template void launch_rdivscalar<double>(const double scalar, const double *a, double *c, const int size);
template void launch_rdivscalar<float>(const float scalar, const float *a, float *c, const int size);
template void launch_rdivscalar<half>(const half scalar, const half *a, half *c, const int size);
Expand All @@ -421,17 +371,21 @@ namespace deepx::tensorfunc
}
}

// Specialization of invert_kernel for bool: writes the logical NOT of each
// input element. Uses a grid-stride loop so any launch configuration covers
// all `size` elements.
template <>
__global__ void invert_kernel<bool>(const bool *A, bool *C, const int size)
{
    const int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < size)
    {
        C[idx] = !A[idx];
        idx += stride;
    }
}

// Host wrapper: launches invert_kernel over `size` elements with a
// heuristically chosen grid/block (BestDims) and throws on launch failure.
template <typename T>
void launch_invert(const T *a, T *c, const int size)
{
    auto [numBlocks, blockSize] = BestDims(size);
    invert_kernel<<<numBlocks, blockSize>>>(a, c, size);
    // Single post-launch check. The previous code checked twice: the manual
    // cudaGetLastError() block both threw AND cleared the error state, which
    // made the follow-up throwcudaerror(..., cudaGetLastError()) dead code.
    // One throwcudaerror call matches the file's other launch wrappers.
    throwcudaerror("Failed to launch invert kernel", cudaGetLastError());
}
template void launch_invert<int64_t>(const int64_t *a, int64_t *c, const int size);
template void launch_invert<int32_t>(const int32_t *a, int32_t *c, const int size);
Expand Down
Loading
Loading