1 change: 1 addition & 0 deletions doc/excuter/op-mem-cuda/list.md
@@ -4,6 +4,7 @@

| Operation | Author | Func Def | Math Formula | IR Instruction |
|-----------|--------|------------|--------------|----------------|
| broadcastTo | miaobyte | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) |
| concat | miaobyte | concat(listtensor<any> tensors, var<int32> axis)->(tensor<any> result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor<any> tensors, var<int32> axis)->(tensor<any> result) |
| transpose | miaobyte | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) |
| reshape | miaobyte | reshape(tensor<any> A, vector<int32> shape)->(tensor<any> B) | T1.reshape(shape)->T2 | reshape(tensor<any> A, vector<int32> shape)->(tensor<any> B) |
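As a reading aid for the new broadcastTo row above: broadcasting expands a smaller tensor to new_shape by aligning the shapes on their trailing dimensions, with missing or size-1 dimensions stretched. A minimal hedged sketch of that compatibility rule in C++ (the helper name is made up for illustration; deepx's actual validation code is not part of this diff):

```cpp
#include <vector>

// Hypothetical helper: can `from` be broadcast to `to`?
// Shapes are aligned on their trailing dimensions; only size-1 (or
// missing leading) dimensions of `from` may stretch.
bool canBroadcastTo(const std::vector<int> &from, const std::vector<int> &to)
{
    if (from.size() > to.size())
        return false;
    for (size_t i = 0; i < from.size(); ++i)
    {
        int f = from[from.size() - 1 - i];
        int t = to[to.size() - 1 - i];
        if (f != t && f != 1)
            return false;
    }
    return true;
}

// Example matching the table row T2 = T1.broadcastTo(new_shape=[4,3,2]):
// canBroadcastTo({3, 2}, {4, 3, 2}) == true
```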
5 changes: 3 additions & 2 deletions doc/excuter/op-mem-ompsimd/list.md
@@ -4,15 +4,16 @@

| Operation | Author | Func Def | Math Formula | IR Instruction |
|-----------|--------|------------|--------------|----------------|
| broadcastTo | miaobyte | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) |
| concat | miaobyte | concat(listtensor<any> tensors, var<int32> axis)->(tensor<any> result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor<any> tensors, var<int32> axis)->(tensor<any> result) |
| transpose | miaobyte | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) |
| transpose | miaobyte | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) |
| add | cblas | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
| comparescalar | miaobyte | comparescalar(tensor<any> A, var<any> scalar)->(tensor<float32> mask) | mask=compare(T1,scalar) | comparescalar(tensor<any> A, var<any> scalar)->(tensor<float32> mask) |
| uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
| addscalar | miaobyte | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) | T3=T1+scalar | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
| log | miaobyte | log(tensor<any> A)->(tensor<any> C) | T3=log(T1) | log(tensor<any> A)->(tensor<any> C) |
| reshape | miaobyte | reshape(tensor<any> A, vector<int32> shape)->() | T2=T1.reshape(shape) | reshape(tensor<any> A, vector<int32> shape)->() |
| reshape | miaobyte | reshape(tensor<any> A, vector<int32> shape)->(tensor<any> B) | T1.reshape(shape)->T2 | reshape(tensor<any> A, vector<int32> shape)->(tensor<any> B) |
| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
| divscalar | miaobyte | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
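For the concat row above, the result keeps every input dimension except axis, whose sizes are summed. A small hedged sketch of that shape rule (the helper is illustrative, not a deepx API, and assumes all inputs share rank and agree on every non-concat dimension):

```cpp
#include <cassert>
#include <vector>

// Illustrative helper: output shape of concat([T1, T2, ...], axis).
std::vector<int> concatShape(const std::vector<std::vector<int>> &shapes, int axis)
{
    std::vector<int> out = shapes.at(0);
    for (size_t i = 1; i < shapes.size(); ++i)
    {
        assert(shapes[i].size() == out.size());
        out[axis] += shapes[i][axis]; // only the concat axis grows
    }
    return out;
}

// e.g. concatShape({{2, 3, 4}, {2, 5, 4}}, 1) -> {2, 8, 4}
```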
2 changes: 1 addition & 1 deletion excuter/cpp-common/src/deepx/shape.cpp
@@ -64,7 +64,7 @@ namespace deepx
}
int Shape::linearat(const std::vector<int> &indices) const{
int idx=0;
for(int i=0;i<dim;i++){
for(int i=0;i<indices.size();i++){
idx+=indices[i]*strides[i];
}
return idx;
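The one-line change above replaces the loop bound dim with indices.size(), so linearat no longer reads past the end of a shorter indices vector, and a partial (leading) index now yields the offset of the corresponding sub-block. A standalone hedged sketch of the same arithmetic, assuming row-major strides (the helper below is illustrative, not the Shape member itself):

```cpp
#include <vector>

// Row-major convention assumed: strides[i] = product of shape[i+1..end].
int linearAtSketch(const std::vector<int> &strides, const std::vector<int> &indices)
{
    int idx = 0;
    // Iterate over the indices actually supplied, not the full rank.
    for (size_t i = 0; i < indices.size(); ++i)
        idx += indices[i] * strides[i];
    return idx;
}

// For shape [4,3,2] the row-major strides are [6,2,1]:
//   linearAtSketch({6, 2, 1}, {2, 1, 1}) == 15  // full index
//   linearAtSketch({6, 2, 1}, {2})       == 12  // start of block [2, :, :]
```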
3 changes: 1 addition & 2 deletions excuter/cpp-common/src/deepx/shape_broadcast.hpp
@@ -15,8 +15,7 @@ namespace deepx {
xTo1 = 2,
};
std::vector<BroadcastMap> broadcastMap(const std::vector<int> &a, const std::vector<int> &b);
void fromBroadcastIndices(const std::vector<BroadcastMap> &broadcastMap, const std::vector<int> &broadcastIndices, std::vector<int> &oldIndices );


}

#endif // DEEPX_OP_CPU_SHAPE_HPP
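This header keeps the BroadcastMap enum (xTox, nullTo1, xTo1) and the host-side broadcastMap(a, b) declaration; the fromBroadcastIndices declaration removed here reappears as a __host__ __device__ helper in the CUDA changeshape source later in this diff. As a reading aid, here is a hedged sketch of how the per-target-dimension map could be derived; the real implementation lives in shape_broadcast.cpp, which this diff does not touch, so both the rules and the argument order are assumptions:

```cpp
#include <vector>
#include "deepx/shape_broadcast.hpp" // BroadcastMap: xTox, nullTo1, xTo1

// Sketch only: build one BroadcastMap entry per dimension of the target
// shape `b` when broadcasting source shape `a` to it (trailing alignment).
std::vector<deepx::BroadcastMap> broadcastMapSketch(const std::vector<int> &a,
                                                    const std::vector<int> &b)
{
    std::vector<deepx::BroadcastMap> bm(b.size());
    const int offset = static_cast<int>(b.size()) - static_cast<int>(a.size());
    for (int i = 0; i < static_cast<int>(b.size()); ++i)
    {
        if (i < offset)
            bm[i] = deepx::BroadcastMap::nullTo1; // dimension absent in the source
        else if (a[i - offset] == 1 && b[i] != 1)
            bm[i] = deepx::BroadcastMap::xTo1;    // size-1 source dimension is stretched
        else
            bm[i] = deepx::BroadcastMap::xTox;    // sizes match: index copies through
    }
    return bm;
}
```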
14 changes: 14 additions & 0 deletions excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp
@@ -68,6 +68,20 @@ namespace deepx::tensorfunc
splitDispatcher<Author, T>::split(A, axis, num_outputs, B);
}

template <typename Author, typename T>
struct broadcastToDispatcher
{
static void broadcastTo(const Tensor<T> &A, const vector<int> &new_shape, Tensor<T> &B) = delete;
};

template <typename Author, typename T>
void broadcastTo(const Tensor<T> &A, const vector<int> &new_shape, Tensor<T> &B)
{
broadcastToDispatcher<Author, T>::broadcastTo(A, new_shape, B);
}



template <typename Author, typename T>
struct expandDispatcher
{
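The broadcastToDispatcher primary template deletes its static method, so broadcastTo&lt;Author, T&gt;(A, new_shape, B) compiles only for (Author, T) pairs that provide a specialization; backends opt in by specializing the struct. A minimal self-contained illustration of that dispatch-by-specialization pattern follows (the op and author names are invented for the example and are not deepx's real backends):

```cpp
#include <iostream>

struct cpuAuthor {}; // stand-in for an author tag such as miaobyte

// Primary template: deleted, so unsupported (Author, T) pairs fail to compile.
template <typename Author, typename T>
struct scaleDispatcher
{
    static void scale(T *data, int n, T factor) = delete;
};

// Opt-in specialization for one author.
template <typename T>
struct scaleDispatcher<cpuAuthor, T>
{
    static void scale(T *data, int n, T factor)
    {
        for (int i = 0; i < n; ++i)
            data[i] *= factor;
    }
};

// Thin forwarding function, mirroring broadcastTo<Author, T>(...) above.
template <typename Author, typename T>
void scale(T *data, int n, T factor)
{
    scaleDispatcher<Author, T>::scale(data, n, factor);
}

int main()
{
    float v[3] = {1.0f, 2.0f, 3.0f};
    scale<cpuAuthor>(v, 3, 2.0f); // OK: specialization exists
    // scale<int>(v, 3, 2.0f);    // would not compile: primary template is deleted
    std::cout << v[0] << " " << v[1] << " " << v[2] << "\n";
    return 0;
}
```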
9 changes: 9 additions & 0 deletions excuter/op-mem-cuda/src/client/tfs.cpp
@@ -361,6 +361,15 @@ namespace deepx::tf
{
Param("result", DataCategory::Tensor, Precision::Any),
})));
tffactory.add_tf(std::make_shared<BroadcastTo<miaobyte>>(vector<Param>(
{
Param("A", DataCategory::Tensor, Precision::Any),
Param("new_shape", DataCategory::Vector, Precision::Int32),
}),
vector<Param>(
{
Param("B", DataCategory::Tensor, Precision::Any),
})));
}
// // reduce
// void register_reduce(OpFactory &opfactory)
13 changes: 0 additions & 13 deletions excuter/op-mem-cuda/src/deepx/tensorfunc/broadcast.hpp

This file was deleted.

151 changes: 131 additions & 20 deletions excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
@@ -7,6 +7,8 @@
#include "deepx/tensorfunc/authors.hpp"
#include "deepx/tensorfunc/tensor_cuda.cuh"
#include "deepx/tensorfunc/vector_cuda.cuh"
#include "deepx/shape_broadcast.hpp"

namespace deepx::tensorfunc
{
// transpose
@@ -150,39 +152,37 @@ namespace deepx::tensorfunc
currentTensorIndices.copyFromDevice(outputIndices.data, dim);
currentTensorIndices[axis] = concatIdxCurrentTensor;

int idxCurrentTensor = linearAt(inputStrides+tensorIdx*dim, dim, currentTensorIndices.data);
int idxCurrentTensor = linearAt(inputStrides + tensorIdx * dim, dim, currentTensorIndices.data);

int idx = linearAt(outputStrides, dim, outputIndices.data);
outputData[idx] = tensorsData[tensorIdx][idxCurrentTensor];
}
}


template <typename T>
void launch_concat(
const T **tensorsData,
const int *inputStrides,
T *outputData,
const int *outputStrides,
const int dim,
const int outputLen,
const int axis,
const int numTensors,
const int *shapeAtAxis)
{
const T **tensorsData,
const int *inputStrides,
T *outputData,
const int *outputStrides,
const int dim,
const int outputLen,
const int axis,
const int numTensors,
const int *shapeAtAxis)
{
auto [numBlocks, blockSize] = BestDims(outputLen);

//output
// output
cudaVector<int> outputStrides_d(outputStrides, dim, cudaMemcpyHostToDevice);

//input
//datas
cudaVector<const T*> tensorsDataList(tensorsData, numTensors, cudaMemcpyHostToDevice);
//strides
cudaVector<int> inputStrides_d(inputStrides, numTensors*dim, cudaMemcpyHostToDevice);

// input
// datas
cudaVector<const T *> tensorsDataList(tensorsData, numTensors, cudaMemcpyHostToDevice);
// strides
cudaVector<int> inputStrides_d(inputStrides, numTensors * dim, cudaMemcpyHostToDevice);

//shapeAtAxis
// shapeAtAxis
cudaVector<int> shapeAtAxis_d(shapeAtAxis, numTensors, cudaMemcpyHostToDevice);

int powDim = nextPowerOf2(dim);
@@ -227,5 +227,116 @@ namespace deepx::tensorfunc
template void launch_concat<int16_t>(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
template void launch_concat<int8_t>(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);

// broadcastTo
__host__ __device__ void fromBroadcastIndices(const BroadcastMap *broadcastMap, const int *broadcastIndices, const int broadcastIndicesDim, int *indices)
{
// Map an index in the broadcast (output) shape back to an index in the
// original (input) shape; e.g. with map [nullTo1, xTox, xTo1], output
// index (i, j, k) maps to input index (j, 0).
for (int i = 0, j = 0; i < broadcastIndicesDim; ++i)
{
switch (broadcastMap[i])
{
case xTox: // dimension present in both shapes: copy the index through
indices[j++] = broadcastIndices[i];
break;
case nullTo1: // dimension absent from the input shape: no input index
break;
case xTo1: // size-1 input dimension: always index 0
indices[j++] = 0;
break;
}
}
}

template <int DIM, typename T>
__global__ void broadcastTo_kernel(const T *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
T *output, const int *outputStrides, const int outputDim, const int outputlen)
{
const int grid_stride = gridDim.x * blockDim.x;
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
for (; thread_id < outputlen; thread_id += grid_stride)
{
int output_indices[DIM];
linearTo(outputStrides, outputDim, output_indices, thread_id);
int input_indices[DIM];
fromBroadcastIndices(broadcastMap, output_indices, outputDim, input_indices);
int inputIdx = linearAt(inputStrides, inputDim, input_indices);
int outputIdx = linearAt(outputStrides, outputDim, output_indices);
output[outputIdx] = input[inputIdx];
}
}

template <typename T>
void launch_broadcastTo(const T *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
T *output, const int *outputStrides, const int outputDim, const int outputlen)
{
auto [numBlocks, blockSize] = BestDims(outputlen);

// output
cudaVector<int> outputStrides_d(outputStrides, outputDim, cudaMemcpyHostToDevice);

// broadcastMap
cudaVector<BroadcastMap> broadcastMap_d(broadcastMap, outputDim, cudaMemcpyHostToDevice);

// input
cudaVector<int> inputStrides_d(inputStrides, inputDim, cudaMemcpyHostToDevice);

int powDim = nextPowerOf2(outputDim);
// Pick the kernel instantiation whose compile-time DIM (the next power of two) can hold outputDim indices
switch (powDim)
{
case 1:
broadcastTo_kernel<1, T><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen);
break;
case 2:
broadcastTo_kernel<2, T><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen);
break;
case 4:
broadcastTo_kernel<4, T><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen);
break;
case 8:
broadcastTo_kernel<8, T><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen);
break;
case 16:
broadcastTo_kernel<16, T><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen);
break;
case 32:
broadcastTo_kernel<32, T><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen);
break;
case 64:
broadcastTo_kernel<64, T><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen);
break;
case 128:
broadcastTo_kernel<128, T><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen);
break;
default:
throw std::runtime_error("dim too large, max support 128");
}
}
template void launch_broadcastTo<double>(const double *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
double *output, const int *outputStrides, const int outputDim, const int outputlen);
template void launch_broadcastTo<float>(const float *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
float *output, const int *outputStrides, const int outputDim, const int outputlen);
template void launch_broadcastTo<nv_bfloat16>(const nv_bfloat16 *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen);
template void launch_broadcastTo<__half>(const __half *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
__half *output, const int *outputStrides, const int outputDim, const int outputlen);
template void launch_broadcastTo<int64_t>(const int64_t *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
int64_t *output, const int *outputStrides, const int outputDim, const int outputlen);
template void launch_broadcastTo<int32_t>(const int32_t *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
int32_t *output, const int *outputStrides, const int outputDim, const int outputlen);
template void launch_broadcastTo<int16_t>(const int16_t *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
int16_t *output, const int *outputStrides, const int outputDim, const int outputlen);
template void launch_broadcastTo<int8_t>(const int8_t *input, const int *inputStrides, const int inputDim,
const BroadcastMap *broadcastMap,
int8_t *output, const int *outputStrides, const int outputDim, const int outputlen);
}
#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP
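Putting the pieces together, here is a hedged host-side sketch of driving launch_broadcastTo for the documented example T2 = T1.broadcastTo(new_shape=[4,3,2]) with a [3,2] input. It assumes row-major strides, that deepx::broadcastMap(a, b) takes the source shape first and returns one entry per target dimension, that a declaration of launch_broadcastTo is visible to the caller, and that d_in/d_out are device buffers already allocated and filled; the real BroadcastTo&lt;miaobyte&gt; tf presumably does the equivalent through Tensor&lt;T&gt; metadata.

```cpp
#include <vector>
#include "deepx/shape_broadcast.hpp" // deepx::broadcastMap, BroadcastMap

// Sketch only, not the real BroadcastTo<miaobyte> implementation.
// d_in  : device buffer holding the 3*2 input values
// d_out : device buffer with room for 4*3*2 output values
void broadcastToExample(const float *d_in, float *d_out)
{
    std::vector<int> inShape    = {3, 2};
    std::vector<int> inStrides  = {2, 1};    // row-major strides (assumed)
    std::vector<int> outShape   = {4, 3, 2};
    std::vector<int> outStrides = {6, 2, 1};
    const int outputLen = 4 * 3 * 2;

    // For [3,2] -> [4,3,2] this should yield [nullTo1, xTox, xTox].
    std::vector<deepx::BroadcastMap> bm = deepx::broadcastMap(inShape, outShape);

    deepx::tensorfunc::launch_broadcastTo<float>(
        d_in, inStrides.data(), static_cast<int>(inShape.size()),
        bm.data(),
        d_out, outStrides.data(), static_cast<int>(outShape.size()), outputLen);
}
```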