diff --git a/include/infinicore/ops/addcdiv.hpp b/include/infinicore/ops/addcdiv.hpp new file mode 100644 index 000000000..ad1529d0a --- /dev/null +++ b/include/infinicore/ops/addcdiv.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Addcdiv { +public: + using schema = void (*)(Tensor, Tensor, Tensor, Tensor, float); + static void execute(Tensor input, Tensor t1, Tensor t2, Tensor output, float value); + static common::OpDispatcher &dispatcher(); +}; + +Tensor addcdiv(Tensor input, Tensor t1, Tensor t2, float value); +void addcdiv_(Tensor input, Tensor t1, Tensor t2, Tensor output, float value); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/atan2.hpp b/include/infinicore/ops/atan2.hpp new file mode 100644 index 000000000..825041742 --- /dev/null +++ b/include/infinicore/ops/atan2.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Atan2 { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor input, Tensor other, Tensor output); + static common::OpDispatcher &dispatcher(); +}; + +Tensor atan2(Tensor input, Tensor other); +void atan2_(Tensor input, Tensor other, Tensor output); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/binary_cross_entropy.hpp b/include/infinicore/ops/binary_cross_entropy.hpp new file mode 100644 index 000000000..54be2ec2a --- /dev/null +++ b/include/infinicore/ops/binary_cross_entropy.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" +#include +#include + +namespace infinicore::op { + +class BinaryCrossEntropy { +public: + using schema = void (*)(Tensor, Tensor, std::optional, Tensor, std::string); + static void execute(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction); + static common::OpDispatcher &dispatcher(); +}; + +Tensor binary_cross_entropy(Tensor input, Tensor target, std::optional weight, std::string reduction); +void binary_cross_entropy_(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/bucketize.hpp b/include/infinicore/ops/bucketize.hpp new file mode 100644 index 000000000..6d53d4810 --- /dev/null +++ b/include/infinicore/ops/bucketize.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Bucketize { +public: + using schema = void (*)(Tensor, Tensor, Tensor, bool); + static void execute(Tensor input, Tensor boundaries, Tensor output, bool right); + static common::OpDispatcher &dispatcher(); +}; + +Tensor bucketize(Tensor input, Tensor boundaries, bool right = false); +void bucketize_(Tensor input, Tensor boundaries, Tensor output, bool right = false); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/minimum.hpp b/include/infinicore/ops/minimum.hpp new file mode 100644 index 000000000..54f1ab272 --- /dev/null +++ b/include/infinicore/ops/minimum.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Minimum { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor input, Tensor other, Tensor output); + static common::OpDispatcher 
&dispatcher(); +}; + +Tensor minimum(Tensor input, Tensor other); +void minimum_(Tensor input, Tensor other, Tensor output); + +} // namespace infinicore::op \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index c6b01d5aa..7575bf4c6 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -44,8 +44,12 @@ ) from infinicore.ops.add import add from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_ +from infinicore.ops.addcdiv import addcdiv +from infinicore.ops.atan2 import atan2 from infinicore.ops.attention import attention +from infinicore.ops.bucketize import bucketize from infinicore.ops.matmul import matmul +from infinicore.ops.minimum import minimum from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow from infinicore.ops.paged_attention import paged_attention @@ -134,6 +138,10 @@ "strided_empty", "strided_from_blob", "zeros", + "minimum", + "atan2", + "addcdiv", + "bucketize", ] use_ntops = False diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 255079790..2f55e8dc1 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,3 +1,4 @@ +from .binary_cross_entropy import binary_cross_entropy from .causal_softmax import causal_softmax from .embedding import embedding from .linear import linear @@ -17,4 +18,5 @@ "embedding", "rope", "RopeAlgo", + "binary_cross_entropy", ] diff --git a/python/infinicore/nn/functional/binary_cross_entropy.py b/python/infinicore/nn/functional/binary_cross_entropy.py new file mode 100644 index 000000000..4f9a53d85 --- /dev/null +++ b/python/infinicore/nn/functional/binary_cross_entropy.py @@ -0,0 +1,47 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def binary_cross_entropy( + input: Tensor, + target: Tensor, + weight: Tensor | None = None, + size_average=None, + reduce=None, + reduction: str = "mean", + *, + out=None, +) -> Tensor: + r"""Apply the binary_cross_entropy function.""" + + if size_average is not None or reduce is not None: + if reduce is False: + reduction = "none" + elif size_average is True or size_average is None: + reduction = "mean" + else: + reduction = "sum" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.binary_cross_entropy( + input, target, weight=weight, reduction=reduction, out=out + ) + + weight_underlying = weight._underlying if weight is not None else None + + if out is None: + return Tensor( + _infinicore.binary_cross_entropy( + input._underlying, target._underlying, weight_underlying, reduction + ) + ) + + _infinicore.binary_cross_entropy_( + input._underlying, + target._underlying, + weight_underlying, + out._underlying, + reduction, + ) + return out diff --git a/python/infinicore/ops/addcdiv.py b/python/infinicore/ops/addcdiv.py new file mode 100644 index 000000000..7323eceda --- /dev/null +++ b/python/infinicore/ops/addcdiv.py @@ -0,0 +1,38 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def addcdiv( + input: Tensor, + tensor1: Tensor, + tensor2: Tensor, + *, + value=1.0, + out=None, +) -> Tensor: + r"""Apply the addcdiv function.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.addcdiv( + input, tensor1, tensor2, value=value, out=out + ) + + if out is None: + return Tensor( + 
_infinicore.addcdiv( + input._underlying, + tensor1._underlying, + tensor2._underlying, + float(value), + ) + ) + + _infinicore.addcdiv_( + input._underlying, + tensor1._underlying, + tensor2._underlying, + out._underlying, + float(value), + ) + return out diff --git a/python/infinicore/ops/atan2.py b/python/infinicore/ops/atan2.py new file mode 100644 index 000000000..ca7c122b6 --- /dev/null +++ b/python/infinicore/ops/atan2.py @@ -0,0 +1,21 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def atan2( + input: Tensor, + other: Tensor, + *, + out=None, +) -> Tensor: + r"""Apply the atan2 function.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.atan2(input, other, out=out) + + if out is None: + return Tensor(_infinicore.atan2(input._underlying, other._underlying)) + + _infinicore.atan2_(input._underlying, other._underlying, out._underlying) + return out diff --git a/python/infinicore/ops/bucketize.py b/python/infinicore/ops/bucketize.py new file mode 100644 index 000000000..9f4d411b3 --- /dev/null +++ b/python/infinicore/ops/bucketize.py @@ -0,0 +1,28 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def bucketize( + input: Tensor, + boundaries: Tensor, + *, + out=None, + right=False, +) -> Tensor: + r"""Apply the bucketize function.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.bucketize(input, boundaries, out=out, right=right) + + if out is None: + return Tensor( + _infinicore.bucketize( + input._underlying, boundaries._underlying, bool(right) + ) + ) + + _infinicore.bucketize_( + input._underlying, boundaries._underlying, out._underlying, bool(right) + ) + return out diff --git a/python/infinicore/ops/minimum.py b/python/infinicore/ops/minimum.py new file mode 100644 index 000000000..f09e6d316 --- /dev/null +++ b/python/infinicore/ops/minimum.py @@ -0,0 +1,21 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def minimum( + input: Tensor, + other: Tensor, + *, + out=None, +) -> Tensor: + r"""Apply the minimum function.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.minimum(input, other, out=out) + + if out is None: + return Tensor(_infinicore.minimum(input._underlying, other._underlying)) + + _infinicore.minimum_(input._underlying, other._underlying, out._underlying) + return out diff --git a/src/infinicore/ops/addcdiv/addcdiv.cc b/src/infinicore/ops/addcdiv/addcdiv.cc new file mode 100644 index 000000000..37b7c774e --- /dev/null +++ b/src/infinicore/ops/addcdiv/addcdiv.cc @@ -0,0 +1,47 @@ +#include "infinicore/ops/addcdiv.hpp" +#include "../../utils.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Addcdiv::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Addcdiv::execute(Tensor input, Tensor t1, Tensor t2, Tensor output, float value) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, t1, t2, output, value); +} + +static Shape broadcast_shape_3(const Shape &a, const Shape &b, const Shape &c) { + int ndim = std::max({a.size(), b.size(), c.size()}); + Shape out_shape; + + for (int i = 0; i < ndim; ++i) { + int dim_a = (i < ndim - a.size()) ? 1 : a[i - (ndim - a.size())]; + int dim_b = (i < ndim - b.size()) ? 
1 : b[i - (ndim - b.size())]; + int dim_c = (i < ndim - c.size()) ? 1 : c[i - (ndim - c.size())]; + + int target = std::max({dim_a, dim_b, dim_c}); + + if ((dim_a != target && dim_a != 1) || (dim_b != target && dim_b != 1) || (dim_c != target && dim_c != 1)) { + throw std::runtime_error("Shapes are not broadcastable"); + } + out_shape.push_back(target); + } + return out_shape; +} + +Tensor addcdiv(Tensor input, Tensor t1, Tensor t2, float value) { + Shape out_shape = broadcast_shape_3(input->shape(), t1->shape(), t2->shape()); + auto output = Tensor::empty(out_shape, input->dtype(), input->device()); + addcdiv_(input, t1, t2, output, value); + return output; +} + +void addcdiv_(Tensor input, Tensor t1, Tensor t2, Tensor output, float value) { + Addcdiv::execute(input, t1, t2, output, value); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/addcdiv/addcdiv_cpu.cc b/src/infinicore/ops/addcdiv/addcdiv_cpu.cc new file mode 100644 index 000000000..fa9cd5c2a --- /dev/null +++ b/src/infinicore/ops/addcdiv/addcdiv_cpu.cc @@ -0,0 +1,111 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/addcdiv.hpp" +#include +#include +#include + +namespace infinicore::op::addcdiv_impl::cpu { + +template +inline T addcdiv_op(T in, T t1, T t2, float value) { + // out = input + value * (t1 / t2) + float val_in = utils::cast(in); + float val_t1 = utils::cast(t1); + float val_t2 = utils::cast(t2); + + float res = val_in + value * (val_t1 / val_t2); + return utils::cast(res); +} + +template +void addcdiv_kernel(const T *in_ptr, const T *t1_ptr, const T *t2_ptr, T *out_ptr, float value, size_t numel) { +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + out_ptr[i] = addcdiv_op(in_ptr[i], t1_ptr[i], t2_ptr[i], value); + } +} + +template +void addcdiv_strided_kernel(const T *in_ptr, const T *t1_ptr, const T *t2_ptr, T *out_ptr, float value, + const Shape &in_shape, const Strides &in_strides, + const Shape &t1_shape, const Strides &t1_strides, + const Shape &t2_shape, const Strides &t2_strides, + const Shape &out_shape, const Strides &out_strides, + size_t numel) { + int ndim = out_shape.size(); + int in_dim_offset = ndim - in_shape.size(); + int t1_dim_offset = ndim - t1_shape.size(); + int t2_dim_offset = ndim - t2_shape.size(); + +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + size_t temp_idx = i; + size_t in_offset = 0; + size_t t1_offset = 0; + size_t t2_offset = 0; + size_t out_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % out_shape[d]; + temp_idx /= out_shape[d]; + + out_offset += coord * out_strides[d]; + + if (d >= in_dim_offset && in_shape[d - in_dim_offset] > 1) { + in_offset += coord * in_strides[d - in_dim_offset]; + } + + if (d >= t1_dim_offset && t1_shape[d - t1_dim_offset] > 1) { + t1_offset += coord * t1_strides[d - t1_dim_offset]; + } + + if (d >= t2_dim_offset && t2_shape[d - t2_dim_offset] > 1) { + t2_offset += coord * t2_strides[d - t2_dim_offset]; + } + } + + out_ptr[out_offset] = addcdiv_op(in_ptr[in_offset], t1_ptr[t1_offset], t2_ptr[t2_offset], value); + } +} + +void calculate_addcdiv(Tensor input, Tensor t1, Tensor t2, Tensor output, float value) { + auto dtype = input->dtype(); + if (t1->dtype() != dtype || t2->dtype() != dtype || output->dtype() != dtype) { + throw std::runtime_error("Dtype mismatch in addcdiv op"); + } + + size_t numel = output->numel(); + + bool exact_match = (input->shape() == t1->shape()) && (t1->shape() == t2->shape()) 
&& (t2->shape() == output->shape()); + bool all_contiguous = input->is_contiguous() && t1->is_contiguous() && t2->is_contiguous() && output->is_contiguous(); + + if (exact_match && all_contiguous) { + if (dtype == DataType::F32) { + addcdiv_kernel((float *)input->data(), (float *)t1->data(), (float *)t2->data(), (float *)output->data(), value, numel); + } else if (dtype == DataType::F16) { + addcdiv_kernel((fp16_t *)input->data(), (fp16_t *)t1->data(), (fp16_t *)t2->data(), (fp16_t *)output->data(), value, numel); + } else { + throw std::runtime_error("Unsupported dtype for addcdiv contiguous"); + } + } else { + if (dtype == DataType::F32) { + addcdiv_strided_kernel( + (float *)input->data(), (float *)t1->data(), (float *)t2->data(), (float *)output->data(), value, + input->shape(), input->strides(), t1->shape(), t1->strides(), t2->shape(), t2->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F16) { + addcdiv_strided_kernel( + (fp16_t *)input->data(), (fp16_t *)t1->data(), (fp16_t *)t2->data(), (fp16_t *)output->data(), value, + input->shape(), input->strides(), t1->shape(), t1->strides(), t2->shape(), t2->strides(), output->shape(), output->strides(), numel); + } else { + throw std::runtime_error("Unsupported dtype for addcdiv strided"); + } + } +} + +static bool registered = []() { + Addcdiv::dispatcher().registerDevice(Device::Type::CPU, &calculate_addcdiv); + return true; +}(); + +} // namespace infinicore::op::addcdiv_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/atan2/atan2.cc b/src/infinicore/ops/atan2/atan2.cc new file mode 100644 index 000000000..bc761448d --- /dev/null +++ b/src/infinicore/ops/atan2/atan2.cc @@ -0,0 +1,46 @@ +#include "infinicore/ops/atan2.hpp" +#include "../../utils.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Atan2::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Atan2::execute(Tensor input, Tensor other, Tensor output) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, other, output); +} + +static Shape broadcast_shape(const Shape &a, const Shape &b) { + Shape out_shape; + int ndim_a = a.size(); + int ndim_b = b.size(); + int max_ndim = std::max(ndim_a, ndim_b); + + for (int i = 0; i < max_ndim; ++i) { + int dim_a = (i < max_ndim - ndim_a) ? 1 : a[i - (max_ndim - ndim_a)]; + int dim_b = (i < max_ndim - ndim_b) ? 
1 : b[i - (max_ndim - ndim_b)]; + + if (dim_a != dim_b && dim_a != 1 && dim_b != 1) { + throw std::runtime_error("Shapes are not broadcastable"); + } + out_shape.push_back(std::max(dim_a, dim_b)); + } + return out_shape; +} + +Tensor atan2(Tensor input, Tensor other) { + Shape out_shape = broadcast_shape(input->shape(), other->shape()); + auto output = Tensor::empty(out_shape, input->dtype(), input->device()); + atan2_(input, other, output); + return output; +} + +void atan2_(Tensor input, Tensor other, Tensor output) { + Atan2::execute(input, other, output); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/atan2/atan2_cpu.cc b/src/infinicore/ops/atan2/atan2_cpu.cc new file mode 100644 index 000000000..d9992ce83 --- /dev/null +++ b/src/infinicore/ops/atan2/atan2_cpu.cc @@ -0,0 +1,130 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/atan2.hpp" +#include +#include +#include +#include + +namespace infinicore::op::atan2_impl::cpu { + +template +inline T atan2_op(T a, T b) { + CompT val_a = utils::cast(a); + CompT val_b = utils::cast(b); + return utils::cast(std::atan2(val_a, val_b)); +} + +template +void atan2_kernel(const T *input_ptr, const T *other_ptr, T *output_ptr, size_t numel) { +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + output_ptr[i] = atan2_op(input_ptr[i], other_ptr[i]); + } +} + +template +void atan2_strided_kernel(const T *in_data, const T *other_data, T *out_data, + const Shape &in_shape, const Strides &in_strides, + const Shape &other_shape, const Strides &other_strides, + const Shape &out_shape, const Strides &out_strides, + size_t numel) { + int ndim = out_shape.size(); + int in_dim_offset = ndim - in_shape.size(); + int other_dim_offset = ndim - other_shape.size(); + +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + size_t temp_idx = i; + size_t in_offset = 0; + size_t other_offset = 0; + size_t out_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % out_shape[d]; + temp_idx /= out_shape[d]; + + out_offset += coord * out_strides[d]; + + if (d >= in_dim_offset) { + int local_d = d - in_dim_offset; + if (in_shape[local_d] > 1) { + in_offset += coord * in_strides[local_d]; + } + } + + if (d >= other_dim_offset) { + int local_d = d - other_dim_offset; + if (other_shape[local_d] > 1) { + other_offset += coord * other_strides[local_d]; + } + } + } + out_data[out_offset] = atan2_op(in_data[in_offset], other_data[other_offset]); + } +} + +void calculate_atan2(Tensor input, Tensor other, Tensor output) { + auto dtype = input->dtype(); + if (other->dtype() != dtype || output->dtype() != dtype) { + throw std::runtime_error("Dtype mismatch in atan2 op"); + } + + size_t numel = output->numel(); + + bool exact_match = (input->shape() == other->shape()) && (other->shape() == output->shape()); + bool all_contiguous = input->is_contiguous() && other->is_contiguous() && output->is_contiguous(); + + if (exact_match && all_contiguous) { + if (dtype == DataType::F32) { + atan2_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::F64) { + atan2_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::F16) { + atan2_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else 
if (dtype == DataType::BF16) { + atan2_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else { + throw std::runtime_error("Unsupported dtype for atan2 contiguous"); + } + } else { + if (dtype == DataType::F32) { + atan2_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F64) { + atan2_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F16) { + atan2_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::BF16) { + atan2_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else { + throw std::runtime_error("Unsupported dtype for atan2 strided"); + } + } +} + +static bool registered = []() { + Atan2::dispatcher().registerDevice(Device::Type::CPU, &calculate_atan2); + return true; +}(); + +} // namespace infinicore::op::atan2_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy.cc b/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy.cc new file mode 100644 index 000000000..41892822b --- /dev/null +++ b/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy.cc @@ -0,0 +1,31 @@ +#include "infinicore/ops/binary_cross_entropy.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &BinaryCrossEntropy::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void BinaryCrossEntropy::execute(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, target, weight, output, reduction); +} + +Tensor binary_cross_entropy(Tensor input, Tensor target, std::optional weight, std::string reduction) { + Shape output_shape = {}; + if (reduction == "none") { + output_shape = input->shape(); + } + + auto output = Tensor::empty(output_shape, input->dtype(), input->device()); + binary_cross_entropy_(input, target, weight, output, reduction); + return output; +} + +void binary_cross_entropy_(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction) { + BinaryCrossEntropy::execute(input, target, weight, output, reduction); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy_cpu.cc b/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy_cpu.cc new file mode 100644 index 000000000..b1827ca8e --- /dev/null +++ b/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy_cpu.cc @@ -0,0 +1,189 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/binary_cross_entropy.hpp" +#include +#include +#include +#include 
+#include +#include + +namespace infinicore::op::binary_cross_entropy_impl::cpu { + +inline float bf16_to_f32(uint16_t val) { + uint32_t bits = static_cast(val) << 16; + float f; + std::memcpy(&f, &bits, sizeof(f)); + return f; +} + +// PyTorch's CPU kernel behaves close to truncation for element-wise BF16 output +inline uint16_t f32_to_bf16_trunc(float val) { + if (std::isnan(val)) { + return 0x7FC0; + } + uint32_t bits; + std::memcpy(&bits, &val, sizeof(bits)); + return static_cast(bits >> 16); +} + +// Round the reduction result to BF16 with round-to-nearest-even (RNE) +inline uint16_t f32_to_bf16_rne(float val) { + union { + float f; + uint32_t u; + } x; + x.f = val; + if (std::isnan(val)) { + return 0x7FC0; + } + + uint32_t lsb = (x.u >> 16) & 1; + uint32_t rounding_bias = 0x7FFF + lsb; + x.u += rounding_bias; + return static_cast(x.u >> 16); +} + +template +void bce_kernel(const Tensor &input, const Tensor &target, std::optional weight, Tensor &output, std::string reduction) { + + const void *input_raw = input->data(); + const void *target_raw = target->data(); + T *output_data = reinterpret_cast(output->data()); + + const void *weight_raw = nullptr; + if (weight.has_value() && weight.value()) { + weight_raw = weight.value()->data(); + } + + size_t numel = input->numel(); + auto input_strides = input->strides(); + auto target_strides = target->strides(); + auto shape = input->shape(); + int ndim = input->ndim(); + + bool contiguous = input->is_contiguous() && target->is_contiguous(); + if (weight_raw && !weight.value()->is_contiguous()) { + contiguous = false; + } + + auto dtype = input->dtype(); + + AccT total_loss = 0; + + auto read_val = [&](const void *ptr, size_t offset) -> AccT { + if (dtype == DataType::BF16) { + const uint16_t *p = reinterpret_cast(ptr); + return static_cast(bf16_to_f32(p[offset])); + } else { + return utils::cast(reinterpret_cast(ptr)[offset]); + } + }; + + auto write_val_elementwise = [&](size_t offset, AccT val) { + if (dtype == DataType::BF16) { + reinterpret_cast(output_data)[offset] = f32_to_bf16_trunc(static_cast(val)); + } else { + output_data[offset] = utils::cast(val); + } + }; + + const AccT eps = static_cast(1e-12); + const AccT one = static_cast(1.0); + + if (contiguous) { +#pragma omp parallel for reduction(+ : total_loss) + for (size_t i = 0; i < numel; ++i) { + AccT x = read_val(input_raw, i); + AccT y = read_val(target_raw, i); + + AccT term1 = std::max(x, eps); + AccT term2 = std::max(one - x, eps); + + AccT loss = -(y * std::log(term1) + (one - y) * std::log(term2)); + + if (weight_raw) { + AccT w = read_val(weight_raw, i); + loss *= w; + } + + if (reduction == "none") { + write_val_elementwise(i, loss); + } else { + total_loss += loss; + } + } + } else { +#pragma omp parallel for reduction(+ : total_loss) + for (size_t i = 0; i < numel; ++i) { + size_t temp_idx = i; + size_t input_offset = 0; + size_t target_offset = 0; + size_t weight_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % shape[d]; + temp_idx /= shape[d]; + input_offset += coord * input_strides[d]; + target_offset += coord * target_strides[d]; + if (weight_raw) { + weight_offset += coord * weight.value()->strides()[d]; + } + } + + AccT x = read_val(input_raw, input_offset); + AccT y = read_val(target_raw, target_offset); + + AccT term1 = std::max(x, eps); + AccT term2 = std::max(one - x, eps); + + AccT loss = -(y * std::log(term1) + (one - y) * std::log(term2)); + + if (weight_raw) { + AccT w = read_val(weight_raw, weight_offset); + loss *= w; + } + + if (reduction == "none") { + write_val_elementwise(i, loss); + } else { + total_loss += loss;
+ } + } + } + + if (reduction != "none") { + if (reduction == "mean") { + total_loss /= static_cast(numel); + } + + if (dtype == DataType::BF16) { + *reinterpret_cast(output_data) = f32_to_bf16_rne(static_cast(total_loss)); + } else { + *output_data = utils::cast(total_loss); + } + } +} + +void calculate(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction) { + auto dtype = input->dtype(); + + if (dtype == DataType::F32) { + bce_kernel(input, target, weight, output, reduction); + } else if (dtype == DataType::F16) { + bce_kernel(input, target, weight, output, reduction); + } else if (dtype == DataType::BF16) { + bce_kernel(input, target, weight, output, reduction); + } else if (dtype == DataType::F64) { + bce_kernel(input, target, weight, output, reduction); + } else { + throw std::runtime_error("Unsupported dtype for binary_cross_entropy"); + } +} + +static bool registered = []() { + BinaryCrossEntropy::dispatcher().registerDevice(Device::Type::CPU, &calculate); + return true; +}(); + +} // namespace infinicore::op::binary_cross_entropy_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/bucketize/bucketize.cc b/src/infinicore/ops/bucketize/bucketize.cc new file mode 100644 index 000000000..c851d14c7 --- /dev/null +++ b/src/infinicore/ops/bucketize/bucketize.cc @@ -0,0 +1,26 @@ +#include "infinicore/ops/bucketize.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Bucketize::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Bucketize::execute(Tensor input, Tensor boundaries, Tensor output, bool right) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, boundaries, output, right); +} + +Tensor bucketize(Tensor input, Tensor boundaries, bool right) { + auto output = Tensor::empty(input->shape(), DataType::I64, input->device()); + bucketize_(input, boundaries, output, right); + return output; +} + +void bucketize_(Tensor input, Tensor boundaries, Tensor output, bool right) { + Bucketize::execute(input, boundaries, output, right); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/bucketize/bucketize_cpu.cc b/src/infinicore/ops/bucketize/bucketize_cpu.cc new file mode 100644 index 000000000..2ed64e068 --- /dev/null +++ b/src/infinicore/ops/bucketize/bucketize_cpu.cc @@ -0,0 +1,124 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/bucketize.hpp" +#include +#include +#include +#include +#include + +namespace infinicore::op::bucketize_impl::cpu { + +template +void bucketize_contiguous_kernel(const T *in_ptr, const T *bound_ptr, int64_t *out_ptr, + size_t numel, size_t bound_len, bool right) { + const T *bound_end = bound_ptr + bound_len; + +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + T val = in_ptr[i]; + const T *result_ptr; + + if (right) { + result_ptr = std::upper_bound(bound_ptr, bound_end, val); + } else { + result_ptr = std::lower_bound(bound_ptr, bound_end, val); + } + + out_ptr[i] = static_cast(result_ptr - bound_ptr); + } +} + +template +void bucketize_strided_kernel(const T *in_ptr, const T *bound_ptr, int64_t *out_ptr, + const Shape &in_shape, const Strides &in_strides, + const Shape &out_shape, const Strides &out_strides, + size_t numel, size_t bound_len, bool right) { + int ndim = out_shape.size(); + const T *bound_end = bound_ptr + bound_len; + +#pragma omp parallel for + for (size_t i = 0; 
i < numel; ++i) { + size_t temp_idx = i; + size_t in_offset = 0; + size_t out_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % out_shape[d]; + temp_idx /= out_shape[d]; + + out_offset += coord * out_strides[d]; + in_offset += coord * in_strides[d]; + } + + T val = in_ptr[in_offset]; + const T *result_ptr; + + if (right) { + result_ptr = std::upper_bound(bound_ptr, bound_end, val); + } else { + result_ptr = std::lower_bound(bound_ptr, bound_end, val); + } + + out_ptr[out_offset] = static_cast(result_ptr - bound_ptr); + } +} + +void calculate_bucketize(Tensor input, Tensor boundaries, Tensor output, bool right) { + if (output->dtype() != DataType::I64) { + throw std::runtime_error("Bucketize output must be int64"); + } + + Tensor boundaries_contig = boundaries; + if (!boundaries->is_contiguous()) { + boundaries_contig = boundaries->contiguous(); + } + + size_t bound_len = boundaries_contig->numel(); + size_t numel = input->numel(); + auto dtype = input->dtype(); + + std::vector sorted_boundaries(bound_len); + const float *raw_bound_ptr = reinterpret_cast(boundaries_contig->data()); + + std::memcpy(sorted_boundaries.data(), raw_bound_ptr, bound_len * sizeof(float)); + + std::sort(sorted_boundaries.begin(), sorted_boundaries.end()); + + const float *bound_ptr = sorted_boundaries.data(); + + bool in_out_contiguous = input->is_contiguous() && output->is_contiguous(); + + if (in_out_contiguous) { + int64_t *out_ptr = reinterpret_cast(output->data()); + if (dtype == DataType::F32) { + bucketize_contiguous_kernel( + (float *)input->data(), + bound_ptr, + out_ptr, numel, bound_len, right); + } else if (dtype == DataType::F16) { + throw std::runtime_error("F16 bucketize cpu not implemented yet"); + } else { + throw std::runtime_error("Unsupported input dtype"); + } + } else { + int64_t *out_ptr = reinterpret_cast(output->data()); + if (dtype == DataType::F32) { + bucketize_strided_kernel( + (float *)input->data(), + bound_ptr, + out_ptr, + input->shape(), input->strides(), output->shape(), output->strides(), + numel, bound_len, right); + } else { + throw std::runtime_error("Unsupported input dtype"); + } + } +} + +static bool registered = []() { + Bucketize::dispatcher().registerDevice(Device::Type::CPU, &calculate_bucketize); + return true; +}(); + +} // namespace infinicore::op::bucketize_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/minimum/minimum.cc b/src/infinicore/ops/minimum/minimum.cc new file mode 100644 index 000000000..eb5ddf40f --- /dev/null +++ b/src/infinicore/ops/minimum/minimum.cc @@ -0,0 +1,46 @@ +#include "infinicore/ops/minimum.hpp" +#include "../../utils.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Minimum::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Minimum::execute(Tensor input, Tensor other, Tensor output) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, other, output); +} + +static Shape broadcast_shape(const Shape &a, const Shape &b) { + Shape out_shape; + int ndim_a = a.size(); + int ndim_b = b.size(); + int max_ndim = std::max(ndim_a, ndim_b); + + for (int i = 0; i < max_ndim; ++i) { + int dim_a = (i < max_ndim - ndim_a) ? 1 : a[i - (max_ndim - ndim_a)]; + int dim_b = (i < max_ndim - ndim_b) ? 
1 : b[i - (max_ndim - ndim_b)]; + + if (dim_a != dim_b && dim_a != 1 && dim_b != 1) { + throw std::runtime_error("Shapes are not broadcastable"); + } + out_shape.push_back(std::max(dim_a, dim_b)); + } + return out_shape; +} + +Tensor minimum(Tensor input, Tensor other) { + Shape out_shape = broadcast_shape(input->shape(), other->shape()); + auto output = Tensor::empty(out_shape, input->dtype(), input->device()); + minimum_(input, other, output); + return output; +} + +void minimum_(Tensor input, Tensor other, Tensor output) { + Minimum::execute(input, other, output); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/minimum/minimum_cpu.cc b/src/infinicore/ops/minimum/minimum_cpu.cc new file mode 100644 index 000000000..1b8ab4473 --- /dev/null +++ b/src/infinicore/ops/minimum/minimum_cpu.cc @@ -0,0 +1,130 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/minimum.hpp" +#include +#include +#include +#include + +namespace infinicore::op::minimum_impl::cpu { + +template +inline T min_op(T a, T b) { + CompT val_a = utils::cast(a); + CompT val_b = utils::cast(b); + return utils::cast(std::min(val_a, val_b)); +} + +template +void minimum_kernel(const T *input_ptr, const T *other_ptr, T *output_ptr, size_t numel) { +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + output_ptr[i] = min_op(input_ptr[i], other_ptr[i]); + } +} + +template +void minimum_strided_kernel(const T *in_data, const T *other_data, T *out_data, + const Shape &in_shape, const Strides &in_strides, + const Shape &other_shape, const Strides &other_strides, + const Shape &out_shape, const Strides &out_strides, + size_t numel) { + int ndim = out_shape.size(); + int in_dim_offset = ndim - in_shape.size(); + int other_dim_offset = ndim - other_shape.size(); + +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + size_t temp_idx = i; + size_t in_offset = 0; + size_t other_offset = 0; + size_t out_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % out_shape[d]; + temp_idx /= out_shape[d]; + + out_offset += coord * out_strides[d]; + + if (d >= in_dim_offset) { + int local_d = d - in_dim_offset; + if (in_shape[local_d] > 1) { + in_offset += coord * in_strides[local_d]; + } + } + + if (d >= other_dim_offset) { + int local_d = d - other_dim_offset; + if (other_shape[local_d] > 1) { + other_offset += coord * other_strides[local_d]; + } + } + } + out_data[out_offset] = min_op(in_data[in_offset], other_data[other_offset]); + } +} + +void calculate_minimum(Tensor input, Tensor other, Tensor output) { + auto dtype = input->dtype(); + if (other->dtype() != dtype || output->dtype() != dtype) { + throw std::runtime_error("Dtype mismatch in minimum op"); + } + + size_t numel = output->numel(); + + bool exact_match = (input->shape() == other->shape()) && (other->shape() == output->shape()); + bool all_contiguous = input->is_contiguous() && other->is_contiguous() && output->is_contiguous(); + + if (exact_match && all_contiguous) { + if (dtype == DataType::F32) { + minimum_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::F64) { + minimum_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::F16) { + minimum_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + 
reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::BF16) { + minimum_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else { + throw std::runtime_error("Unsupported dtype for minimum contiguous"); + } + } else { + if (dtype == DataType::F32) { + minimum_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F64) { + minimum_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F16) { + minimum_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::BF16) { + minimum_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else { + throw std::runtime_error("Unsupported dtype for minimum strided"); + } + } +} + +static bool registered = []() { + Minimum::dispatcher().registerDevice(Device::Type::CPU, &calculate_minimum); + return true; +}(); + +} // namespace infinicore::op::minimum_impl::cpu \ No newline at end of file diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 3d6ebe79a..848984916 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -4,11 +4,16 @@ #include "ops/add.hpp" #include "ops/add_rms_norm.hpp" +#include "ops/addcdiv.hpp" +#include "ops/atan2.hpp" #include "ops/attention.hpp" +#include "ops/binary_cross_entropy.hpp" +#include "ops/bucketize.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" #include "ops/linear.hpp" #include "ops/matmul.hpp" +#include "ops/minimum.hpp" #include "ops/mul.hpp" #include "ops/paged_attention.hpp" #include "ops/paged_attention_prefill.hpp" @@ -42,6 +47,11 @@ inline void bind(py::module &m) { bind_swiglu(m); bind_rope(m); bind_embedding(m); + bind_minimum(m); + bind_atan2(m); + bind_addcdiv(m); + bind_bucketize(m); + bind_binary_cross_entropy(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/addcdiv.hpp b/src/infinicore/pybind11/ops/addcdiv.hpp new file mode 100644 index 000000000..5f816875b --- /dev/null +++ b/src/infinicore/pybind11/ops/addcdiv.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "infinicore/ops/addcdiv.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_addcdiv(py::module &m) { + m.def("addcdiv", + &op::addcdiv, + py::arg("input"), + py::arg("tensor1"), + py::arg("tensor2"), + py::arg("value") = 1.0f, + R"doc(Computes input + value * (tensor1 / tensor2).)doc"); + + m.def("addcdiv_", + &op::addcdiv_, + py::arg("input"), + py::arg("tensor1"), + py::arg("tensor2"), + py::arg("output"), + py::arg("value") = 1.0f, + R"doc(In-place version of addcdiv.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/atan2.hpp 
b/src/infinicore/pybind11/ops/atan2.hpp new file mode 100644 index 000000000..5d5324ec7 --- /dev/null +++ b/src/infinicore/pybind11/ops/atan2.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "infinicore/ops/atan2.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_atan2(py::module &m) { + m.def("atan2", + &op::atan2, + py::arg("input"), + py::arg("other"), + R"doc(Computes the element-wise arc tangent of input/other. Returns the angle in radians.)doc"); + + m.def("atan2_", + &op::atan2_, + py::arg("input"), + py::arg("other"), + py::arg("output"), + R"doc(In-place version of atan2.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/binary_cross_entropy.hpp b/src/infinicore/pybind11/ops/binary_cross_entropy.hpp new file mode 100644 index 000000000..a449fe7c0 --- /dev/null +++ b/src/infinicore/pybind11/ops/binary_cross_entropy.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "infinicore/ops/binary_cross_entropy.hpp" +#include +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_binary_cross_entropy(py::module &m) { + m.def("binary_cross_entropy", + &op::binary_cross_entropy, + py::arg("input"), + py::arg("target"), + py::arg("weight") = py::none(), + py::arg("reduction") = "mean", + R"doc(Calculates Binary Cross Entropy.)doc"); + + m.def("binary_cross_entropy_", + &op::binary_cross_entropy_, + py::arg("input"), + py::arg("target"), + py::arg("weight"), + py::arg("output"), + py::arg("reduction") = "mean", + R"doc(In-place Binary Cross Entropy calculation.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/bucketize.hpp b/src/infinicore/pybind11/ops/bucketize.hpp new file mode 100644 index 000000000..d6ff77705 --- /dev/null +++ b/src/infinicore/pybind11/ops/bucketize.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/bucketize.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_bucketize(py::module &m) { + m.def("bucketize", + &op::bucketize, + py::arg("input"), + py::arg("boundaries"), + py::arg("right") = false, + R"doc(Returns the indices of the buckets to which each value in the input belongs.)doc"); + + m.def("bucketize_", + &op::bucketize_, + py::arg("input"), + py::arg("boundaries"), + py::arg("output"), + py::arg("right") = false, + R"doc(In-place version of bucketize.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/minimum.hpp b/src/infinicore/pybind11/ops/minimum.hpp new file mode 100644 index 000000000..232615fb1 --- /dev/null +++ b/src/infinicore/pybind11/ops/minimum.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "infinicore/ops/minimum.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_minimum(py::module &m) { + m.def("minimum", + &op::minimum, + py::arg("input"), + py::arg("other"), + R"doc(Computes the element-wise minimum of input and other.)doc"); + + m.def("minimum_", + &op::minimum_, + py::arg("input"), + py::arg("other"), + py::arg("output"), + R"doc(In-place version of minimum.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/test/infinicore/ops/addcdiv.py b/test/infinicore/ops/addcdiv.py index 47d574eb1..38af49331 100644 --- a/test/infinicore/ops/addcdiv.py +++ b/test/infinicore/ops/addcdiv.py @@ -124,9 +124,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return 
torch.addcdiv(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.addcdiv(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.addcdiv(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/atan2.py b/test/infinicore/ops/atan2.py index 09f15a319..5d2e48477 100644 --- a/test/infinicore/ops/atan2.py +++ b/test/infinicore/ops/atan2.py @@ -115,9 +115,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.atan2(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.atan2(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.atan2(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/binary_cross_entropy.py b/test/infinicore/ops/binary_cross_entropy.py index cca511f38..971a97fb1 100644 --- a/test/infinicore/ops/binary_cross_entropy.py +++ b/test/infinicore/ops/binary_cross_entropy.py @@ -69,9 +69,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.binary_cross_entropy(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.binary_cross_entropy(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.nn.functional.binary_cross_entropy(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/bucketize.py b/test/infinicore/ops/bucketize.py index fe6ccb6da..011b1c4d9 100644 --- a/test/infinicore/ops/bucketize.py +++ b/test/infinicore/ops/bucketize.py @@ -50,12 +50,14 @@ def __init__(self): def get_test_cases(self): return parse_test_cases() - def torch_operator(self, *args, **kwargs): - return torch.bucketize(*args, **kwargs) - - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.bucketize(*args, **kwargs) + def torch_operator(self, input, boundaries, *args, **kwargs): + # Sort the boundaries for the PyTorch reference (torch.bucketize requires a sorted boundaries tensor) + sorted_boundaries, _ = torch.sort(boundaries) + return torch.bucketize(input, sorted_boundaries, *args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.bucketize(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/minimum.py b/test/infinicore/ops/minimum.py index 4538665b6..1d51bc1c0 100644 --- a/test/infinicore/ops/minimum.py +++ b/test/infinicore/ops/minimum.py @@ -96,9 +96,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.minimum(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.minimum(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.minimum(*args, **kwargs) def main():
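
Reviewer note: a minimal usage sketch of the Python entry points added by this patch. The operator signatures follow the new bindings above; the helper function and the way the input tensors are obtained are illustrative assumptions, not part of the change.

    import infinicore
    import infinicore.nn.functional as F

    def smoke_test(a, b, c, boundaries, target):
        # Assumed inputs (hypothetical helper, not part of this patch):
        # a, b, c, target are float tensors of one broadcastable shape,
        # a and target lie in [0, 1] for binary_cross_entropy,
        # boundaries is a 1-D float tensor (the CPU kernel sorts a copy itself).
        lo = infinicore.minimum(a, b)                            # element-wise minimum
        angle = infinicore.atan2(a, b)                           # element-wise arctangent of a / b, in radians
        d = infinicore.addcdiv(a, b, c, value=0.5)               # a + 0.5 * (b / c); c should be non-zero
        idx = infinicore.bucketize(a, boundaries, right=False)   # int64 bucket indices
        loss = F.binary_cross_entropy(a, target, weight=None, reduction="mean")
        return lo, angle, d, idx, loss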