Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/excuter-cuda-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: Excuter/cuda-linux Build
on: [push, pull_request]

env:
CUDA_VERSION: "12.1.0"
CUDA_VERSION: "12.6.0"
CUDA_MAJOR_VERSION: "12"
CUDNN_VERSION: "8.9.7.29"
CUTLASS_VERSION: "3.4.1"
Expand All @@ -29,7 +29,7 @@ jobs:
run: |
docker run --rm -v ${{ github.workspace }}:/workspace \
-w /workspace \
nvidia/cuda:12.1.0-devel-ubuntu22.04 \
nvidia/cuda:12.6.0-devel-ubuntu22.04 \
/bin/bash -c "
# 安装系统依赖
apt-get update && \
Expand Down
48 changes: 28 additions & 20 deletions doc/excuter/op-mem-cuda/list.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,35 @@
| Operation | Author | Func Def | Math Formula | IR Instruction |
|-----------|--------|------------|--------------|----------------|
| matmul | cublas | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| exp | miaobyte | exp(tensor<any> A)->(tensor<any> C) | T3=exp(T1) | exp(tensor<any> A)->(tensor<any> C) |
| pow | miaobyte | pow(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=pow(T1, T2) | pow(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| powscalar | miaobyte | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=pow(T1, scalar) | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| rdivscalar | miaobyte | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
| div | miaobyte | div(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1/T2 | div(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| sub | miaobyte | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1-T2 | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| argset | none | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |
| mulscalar | miaobyte | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1*scalar | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
| sqrt | miaobyte | sqrt(tensor<any> A)->(tensor<any> C) | T3=sqrt(T1) | sqrt(tensor<any> A)->(tensor<any> C) |
| vecset | none | vecset(vector<any> value)->(vector<any> name) | shape = [3 4 5] | vecset(vector<any> value)->(vector<any> name) |
| newtensor | none | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(var<string> shape)->(tensor<any> tensor1) |
| print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
| print | miaobyte | print(tensor<any> , var<string> )->() | print(T1) | print(tensor<any> , var<string> )->() |
| divscalar | miaobyte | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=scalar/T1 | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |
| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
| subscalar | miaobyte | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1-scalar | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
| log | miaobyte | log(tensor<any> A)->(tensor<any> C) | T3=log(T1) | log(tensor<any> A)->(tensor<any> C) |
| uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
| add | cublas | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
| uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
| addscalar | miaobyte | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
| log | miaobyte | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=log(T1) | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
| divscalar | miaobyte | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=scalar/T1 | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| sin | miaobyte | sin(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=sin(T1) | sin(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
| tan | miaobyte | tan(tensor<float64|float32> A)->(tensor<float64|float32> C) | T3=tan(T1) | tan(tensor<float64|float32> A)->(tensor<float64|float32> C) |
| print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
| print | miaobyte | print(tensor<any> , var<string> )->() | print(T1) | print(tensor<any> , var<string> )->() |
| newtensor | none | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(var<string> shape)->(tensor<any> tensor1) |
| vecset | none | vecset(vector<any> value)->(vector<any> name) | shape = [3 4 5] | vecset(vector<any> value)->(vector<any> name) |
| subscalar | miaobyte | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1-scalar | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
| sqrt | miaobyte | sqrt(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=sqrt(T1) | sqrt(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
| argset | none | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |
| sub | miaobyte | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1-T2 | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| mulscalar | miaobyte | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1*scalar | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
| div | miaobyte | div(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1/T2 | div(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |
| powscalar | miaobyte | powscalar(tensor<float64|float32> A, var<float64|float32> scalar)->(tensor<float64|float32> C) | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|float32> scalar)->(tensor<float64|float32> C) |
| max | miaobyte | max(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=max(T1, T2) | max(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| pow | miaobyte | pow(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) | T3=pow(T1, T2) | pow(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) |
| maxscalar | miaobyte | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=max(T1, scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| mul | miaobyte | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1*T2 | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| exp | miaobyte | exp(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=exp(T1) | exp(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
| rdivscalar | miaobyte | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
| minscalar | miaobyte | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| cos | miaobyte | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=cos(T1) | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
| min | miaobyte | min(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=min(T1, T2) | min(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| compare | miaobyte | compare(tensor<any> A, tensor<any> B)->(tensor<int8> mask) | mask=compare(T1, T2) | compare(tensor<any> A, tensor<any> B)->(tensor<int8> mask) |
5 changes: 5 additions & 0 deletions doc/excuter/op-mem-ompsimd/list.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
| concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() |
| matmul | cblas | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) | T3=T1 @ T2 | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) |
| matmul | miaobyte | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| compare | miaobyte | compare(tensor<any> A, tensor<any> B)->(tensor<int8> mask) | mask=compare(T1,T2) | compare(tensor<any> A, tensor<any> B)->(tensor<int8> mask) |
| min | miaobyte | min(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=min(T1,T2) | min(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| minscalar | miaobyte | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=min(T1,scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| exp | miaobyte | exp(tensor<any> A)->(tensor<any> C) | T3=exp(T1) | exp(tensor<any> A)->(tensor<any> C) |
| maxscalar | miaobyte | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=max(T1,scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| pow | miaobyte | pow(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1^T2 | pow(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| powscalar | miaobyte | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| rdivscalar | miaobyte | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
Expand All @@ -21,6 +25,7 @@
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 =Tensor(shape=[...]) | newtensor(var<string> shape)->(tensor<any> tensor1) |
| print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
| print | miaobyte | print(tensor<any> , var<string> )->() | print(T1) | print(tensor<any> , var<string> )->() |
| max | miaobyte | max(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=max(T1,T2) | max(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| divscalar | miaobyte | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1,value) | constant(tensor<any> t, var<any> value)->() |
| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
Expand Down
125 changes: 18 additions & 107 deletions excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,42 +87,8 @@ namespace deepx::tensorfunc
mulscalarDispatcher<Author, T>::mulscalar(input, value, output);
}

template <typename Author, typename T>
struct muladdDispatcher
{
static void muladd(const Tensor<T> &A, const Tensor<T> &B, const Tensor<T> &C, Tensor<T> &D) = delete;
};

template <typename Author, typename T>
void muladd(const Tensor<T> &A, const Tensor<T> &B, const Tensor<T> &C, Tensor<T> &D)
{
muladdDispatcher<Author, T>::muladd(A, B, C, D);
}

template <typename Author, typename T>
struct muladdscalarDispatcher
{
static void muladdscalar(const Tensor<T> &A, const Tensor<T> &B, const T alpha, const Tensor<T> &C, const T beta, Tensor<T> &D) = delete;
};

template <typename Author, typename T>
void muladdscalar(const Tensor<T> &A, const Tensor<T> &B, const T alpha, const Tensor<T> &C, const T beta, Tensor<T> &D)
{
muladdscalarDispatcher<Author, T>::muladdscalar(A, B, alpha, C, beta, D);
}

template <typename Author, typename T>
struct mulscalaraddDispatcher
{
static void mulscalaradd(const Tensor<T> &A, const T alpha, const Tensor<T> &B, const T beta, Tensor<T> &C) = delete;
};

template <typename Author, typename T>
void mulscalaradd(const Tensor<T> &A, const T alpha, const Tensor<T> &B, const T beta, Tensor<T> &C)
{
mulscalaraddDispatcher<Author, T>::mulscalaradd(A, alpha, B, beta, C);
}



template <typename Author, typename T>
struct divDispatcher
{
Expand Down Expand Up @@ -159,42 +125,7 @@ namespace deepx::tensorfunc
rdivscalarDispatcher<Author, T>::rdivscalar(value, input, output);
}

template <typename Author, typename T>
struct divaddDispatcher
{
static void divadd(const Tensor<T> &A, const Tensor<T> &B, const Tensor<T> &C, Tensor<T> &D) = delete;
};

template <typename Author, typename T>
void divadd(const Tensor<T> &A, const Tensor<T> &B, const Tensor<T> &C, Tensor<T> &D)
{
divaddDispatcher<Author, T>::divadd(A, B, C, D);
}

template <typename Author, typename T>
struct divscalaraddDispatcher
{
static void divscalaradd(const Tensor<T> &A, const T alpha, const Tensor<T> &B, const T beta, Tensor<T> &C) = delete;
};

template <typename Author, typename T>
void divscalaradd(const Tensor<T> &A, const T alpha, const Tensor<T> &B, const T beta, Tensor<T> &C)
{
divscalaraddDispatcher<Author, T>::divscalaradd(A, alpha, B, beta, C);
}

template <typename Author, typename T>
struct divaddbetaDispatcher
{
static void divaddbeta(const Tensor<T> &A, const Tensor<T> &B, const T alpha, const Tensor<T> &C, const T beta, Tensor<T> &D) = delete;
};

template <typename Author, typename T>
void divaddbeta(const Tensor<T> &A, const Tensor<T> &B, const T alpha, const Tensor<T> &C, const T beta, Tensor<T> &D)
{
divaddbetaDispatcher<Author, T>::divaddbeta(A, B, alpha, C, beta, D);
}


template <typename Author, typename T,typename = void>
struct sqrtDispatcher
{
Expand Down Expand Up @@ -303,17 +234,7 @@ namespace deepx::tensorfunc
maxDispatcher<Author, T>::max(A, B, C);
}

template <typename Author, typename T>
struct maxgradDispatcher
{
static void maxgrad(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &A_grad, Tensor<T> &B_grad, const Tensor<T> &output_grad) = delete;
};

template <typename Author, typename T>
void maxgrad(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &A_grad, Tensor<T> &B_grad, const Tensor<T> &output_grad)
{
maxgradDispatcher<Author, T>::maxgrad(A, B, A_grad, B_grad, output_grad);
}


template <typename Author, typename T>
struct maxscalarDispatcher
Expand All @@ -327,17 +248,7 @@ namespace deepx::tensorfunc
maxscalarDispatcher<Author, T>::maxscalar(A, b, C);
}

template <typename Author, typename T>
struct maxscalargradDispatcher
{
static void maxscalargrad(const Tensor<T> &A, const T b, Tensor<T> &A_grad, const Tensor<T> &output_grad) = delete;
};

template <typename Author, typename T>
void maxscalargrad(const Tensor<T> &A, const T b, Tensor<T> &A_grad, const Tensor<T> &output_grad)
{
maxscalargradDispatcher<Author, T>::maxscalargrad(A, b, A_grad, output_grad);
}


template <typename Author, typename T>
struct minDispatcher
Expand All @@ -352,39 +263,39 @@ namespace deepx::tensorfunc
}

template <typename Author, typename T>
struct mingradDispatcher
struct minscalarDispatcher
{
static void mingrad(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &A_grad, Tensor<T> &B_grad, const Tensor<T> &output_grad) = delete;
static void minscalar(const Tensor<T> &A, T b, Tensor<T> &C) = delete;
};

template <typename Author, typename T>
void mingrad(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &A_grad, Tensor<T> &B_grad, const Tensor<T> &output_grad)
void minscalar(const Tensor<T> &A, T b, Tensor<T> &C)
{
mingradDispatcher<Author, T>::mingrad(A, B, A_grad, B_grad, output_grad);
minscalarDispatcher<Author, T>::minscalar(A, b, C);
}

template <typename Author, typename T>
struct minscalarDispatcher
struct compareDispatcher
{
static void minscalar(const Tensor<T> &A, T b, Tensor<T> &C) = delete;
static void compare(const Tensor<T> &A, const Tensor<T> &B, Tensor<float> &mask) = delete;
};

template <typename Author, typename T>
void minscalar(const Tensor<T> &A, T b, Tensor<T> &C)
void compare(const Tensor<T> &A, const Tensor<T> &B,Tensor<float> &mask)
{
minscalarDispatcher<Author, T>::minscalar(A, b, C);
compareDispatcher<Author, T>::compare(A, B, mask);
}

template <typename Author, typename T>
struct minscalargradDispatcher
struct comparescalarDispatcher
{
static void minscalargrad(const Tensor<T> &A, const T b, Tensor<T> &A_grad, const Tensor<T> &output_grad) = delete;
static void comparescalar(const Tensor<T> &A, const T scalar, Tensor<float> &mask) = delete;
};

template <typename Author, typename T>
void minscalargrad(const Tensor<T> &A, const T b, Tensor<T> &A_grad, const Tensor<T> &output_grad)
void comparescalar(const Tensor<T> &A, const T scalar, Tensor<float> &mask)
{
minscalargradDispatcher<Author, T>::minscalargrad(A, b, A_grad, output_grad);
comparescalarDispatcher<Author, T>::comparescalar(A, scalar, mask);
}
} // namespace deepx::tensorfunc

Expand Down
Loading