diff --git a/include/infinicore/ops/addcdiv.hpp b/include/infinicore/ops/addcdiv.hpp new file mode 100644 index 000000000..ad1529d0a --- /dev/null +++ b/include/infinicore/ops/addcdiv.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Addcdiv { +public: + using schema = void (*)(Tensor, Tensor, Tensor, Tensor, float); + static void execute(Tensor input, Tensor t1, Tensor t2, Tensor output, float value); + static common::OpDispatcher &dispatcher(); +}; + +Tensor addcdiv(Tensor input, Tensor t1, Tensor t2, float value); +void addcdiv_(Tensor input, Tensor t1, Tensor t2, Tensor output, float value); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/atan2.hpp b/include/infinicore/ops/atan2.hpp new file mode 100644 index 000000000..825041742 --- /dev/null +++ b/include/infinicore/ops/atan2.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Atan2 { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor input, Tensor other, Tensor output); + static common::OpDispatcher &dispatcher(); +}; + +Tensor atan2(Tensor input, Tensor other); +void atan2_(Tensor input, Tensor other, Tensor output); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/binary_cross_entropy.hpp b/include/infinicore/ops/binary_cross_entropy.hpp new file mode 100644 index 000000000..54be2ec2a --- /dev/null +++ b/include/infinicore/ops/binary_cross_entropy.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" +#include +#include + +namespace infinicore::op { + +class BinaryCrossEntropy { +public: + using schema = void (*)(Tensor, Tensor, std::optional, Tensor, std::string); + static void execute(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction); + static common::OpDispatcher &dispatcher(); +}; + +Tensor binary_cross_entropy(Tensor input, Tensor target, std::optional weight, std::string reduction); +void binary_cross_entropy_(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/bucketize.hpp b/include/infinicore/ops/bucketize.hpp new file mode 100644 index 000000000..6d53d4810 --- /dev/null +++ b/include/infinicore/ops/bucketize.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Bucketize { +public: + using schema = void (*)(Tensor, Tensor, Tensor, bool); + static void execute(Tensor input, Tensor boundaries, Tensor output, bool right); + static common::OpDispatcher &dispatcher(); +}; + +Tensor bucketize(Tensor input, Tensor boundaries, bool right = false); +void bucketize_(Tensor input, Tensor boundaries, Tensor output, bool right = false); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/minimum.hpp b/include/infinicore/ops/minimum.hpp new file mode 100644 index 000000000..54f1ab272 --- /dev/null +++ b/include/infinicore/ops/minimum.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Minimum { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor input, Tensor other, Tensor output); + static common::OpDispatcher 
&dispatcher(); +}; + +Tensor minimum(Tensor input, Tensor other); +void minimum_(Tensor input, Tensor other, Tensor output); + +} // namespace infinicore::op \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index c6b01d5aa..7575bf4c6 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -44,8 +44,12 @@ ) from infinicore.ops.add import add from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_ +from infinicore.ops.addcdiv import addcdiv +from infinicore.ops.atan2 import atan2 from infinicore.ops.attention import attention +from infinicore.ops.bucketize import bucketize from infinicore.ops.matmul import matmul +from infinicore.ops.minimum import minimum from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow from infinicore.ops.paged_attention import paged_attention @@ -134,6 +138,10 @@ "strided_empty", "strided_from_blob", "zeros", + "minimum", + "atan2", + "addcdiv", + "bucketize", ] use_ntops = False diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 255079790..2f55e8dc1 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,3 +1,4 @@ +from .binary_cross_entropy import binary_cross_entropy from .causal_softmax import causal_softmax from .embedding import embedding from .linear import linear @@ -17,4 +18,5 @@ "embedding", "rope", "RopeAlgo", + "binary_cross_entropy", ] diff --git a/python/infinicore/nn/functional/binary_cross_entropy.py b/python/infinicore/nn/functional/binary_cross_entropy.py new file mode 100644 index 000000000..4f9a53d85 --- /dev/null +++ b/python/infinicore/nn/functional/binary_cross_entropy.py @@ -0,0 +1,47 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def binary_cross_entropy( + input: Tensor, + target: Tensor, + weight: Tensor | None = None, + size_average=None, + reduce=None, + reduction: str = "mean", + *, + out=None, +) -> Tensor: + r"""Apply the binary_cross_entropy function.""" + + if size_average is not None or reduce is not None: + if reduce is False: + reduction = "none" + elif size_average is True or size_average is None: + reduction = "mean" + else: + reduction = "sum" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.binary_cross_entropy( + input, target, weight=weight, reduction=reduction, out=out + ) + + weight_underlying = weight._underlying if weight is not None else None + + if out is None: + return Tensor( + _infinicore.binary_cross_entropy( + input._underlying, target._underlying, weight_underlying, reduction + ) + ) + + _infinicore.binary_cross_entropy_( + input._underlying, + target._underlying, + weight_underlying, + out._underlying, + reduction, + ) + return out diff --git a/python/infinicore/ops/addcdiv.py b/python/infinicore/ops/addcdiv.py new file mode 100644 index 000000000..7323eceda --- /dev/null +++ b/python/infinicore/ops/addcdiv.py @@ -0,0 +1,38 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def addcdiv( + input: Tensor, + tensor1: Tensor, + tensor2: Tensor, + *, + value=1.0, + out=None, +) -> Tensor: + r"""Apply the addcdiv function.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.addcdiv( + input, tensor1, tensor2, value=value, out=out + ) + + if out is None: + return Tensor( + 
_infinicore.addcdiv( + input._underlying, + tensor1._underlying, + tensor2._underlying, + float(value), + ) + ) + + _infinicore.addcdiv_( + input._underlying, + tensor1._underlying, + tensor2._underlying, + out._underlying, + float(value), + ) + return out diff --git a/python/infinicore/ops/atan2.py b/python/infinicore/ops/atan2.py new file mode 100644 index 000000000..ca7c122b6 --- /dev/null +++ b/python/infinicore/ops/atan2.py @@ -0,0 +1,21 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def atan2( + input: Tensor, + other: Tensor, + *, + out=None, +) -> Tensor: + r"""Apply the atan2 function.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.atan2(input, other, out=out) + + if out is None: + return Tensor(_infinicore.atan2(input._underlying, other._underlying)) + + _infinicore.atan2_(input._underlying, other._underlying, out._underlying) + return out diff --git a/python/infinicore/ops/bucketize.py b/python/infinicore/ops/bucketize.py new file mode 100644 index 000000000..9f4d411b3 --- /dev/null +++ b/python/infinicore/ops/bucketize.py @@ -0,0 +1,28 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def bucketize( + input: Tensor, + boundaries: Tensor, + *, + out=None, + right=False, +) -> Tensor: + r"""Apply the bucketize function.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.bucketize(input, boundaries, out=out, right=right) + + if out is None: + return Tensor( + _infinicore.bucketize( + input._underlying, boundaries._underlying, bool(right) + ) + ) + + _infinicore.bucketize_( + input._underlying, boundaries._underlying, out._underlying, bool(right) + ) + return out diff --git a/python/infinicore/ops/minimum.py b/python/infinicore/ops/minimum.py new file mode 100644 index 000000000..f09e6d316 --- /dev/null +++ b/python/infinicore/ops/minimum.py @@ -0,0 +1,21 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def minimum( + input: Tensor, + other: Tensor, + *, + out=None, +) -> Tensor: + r"""Apply the minimum function.""" + + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.minimum(input, other, out=out) + + if out is None: + return Tensor(_infinicore.minimum(input._underlying, other._underlying)) + + _infinicore.minimum_(input._underlying, other._underlying, out._underlying) + return out diff --git a/src/infinicore/ops/addcdiv/addcdiv.cc b/src/infinicore/ops/addcdiv/addcdiv.cc new file mode 100644 index 000000000..37b7c774e --- /dev/null +++ b/src/infinicore/ops/addcdiv/addcdiv.cc @@ -0,0 +1,47 @@ +#include "infinicore/ops/addcdiv.hpp" +#include "../../utils.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Addcdiv::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Addcdiv::execute(Tensor input, Tensor t1, Tensor t2, Tensor output, float value) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, t1, t2, output, value); +} + +static Shape broadcast_shape_3(const Shape &a, const Shape &b, const Shape &c) { + int ndim = std::max({a.size(), b.size(), c.size()}); + Shape out_shape; + + for (int i = 0; i < ndim; ++i) { + int dim_a = (i < ndim - a.size()) ? 1 : a[i - (ndim - a.size())]; + int dim_b = (i < ndim - b.size()) ? 
1 : b[i - (ndim - b.size())]; + int dim_c = (i < ndim - c.size()) ? 1 : c[i - (ndim - c.size())]; + + int target = std::max({dim_a, dim_b, dim_c}); + + if ((dim_a != target && dim_a != 1) || (dim_b != target && dim_b != 1) || (dim_c != target && dim_c != 1)) { + throw std::runtime_error("Shapes are not broadcastable"); + } + out_shape.push_back(target); + } + return out_shape; +} + +Tensor addcdiv(Tensor input, Tensor t1, Tensor t2, float value) { + Shape out_shape = broadcast_shape_3(input->shape(), t1->shape(), t2->shape()); + auto output = Tensor::empty(out_shape, input->dtype(), input->device()); + addcdiv_(input, t1, t2, output, value); + return output; +} + +void addcdiv_(Tensor input, Tensor t1, Tensor t2, Tensor output, float value) { + Addcdiv::execute(input, t1, t2, output, value); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/addcdiv/addcdiv_cpu.cc b/src/infinicore/ops/addcdiv/addcdiv_cpu.cc new file mode 100644 index 000000000..fa9cd5c2a --- /dev/null +++ b/src/infinicore/ops/addcdiv/addcdiv_cpu.cc @@ -0,0 +1,111 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/addcdiv.hpp" +#include +#include +#include + +namespace infinicore::op::addcdiv_impl::cpu { + +template +inline T addcdiv_op(T in, T t1, T t2, float value) { + // out = input + value * (t1 / t2) + float val_in = utils::cast(in); + float val_t1 = utils::cast(t1); + float val_t2 = utils::cast(t2); + + float res = val_in + value * (val_t1 / val_t2); + return utils::cast(res); +} + +template +void addcdiv_kernel(const T *in_ptr, const T *t1_ptr, const T *t2_ptr, T *out_ptr, float value, size_t numel) { +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + out_ptr[i] = addcdiv_op(in_ptr[i], t1_ptr[i], t2_ptr[i], value); + } +} + +template +void addcdiv_strided_kernel(const T *in_ptr, const T *t1_ptr, const T *t2_ptr, T *out_ptr, float value, + const Shape &in_shape, const Strides &in_strides, + const Shape &t1_shape, const Strides &t1_strides, + const Shape &t2_shape, const Strides &t2_strides, + const Shape &out_shape, const Strides &out_strides, + size_t numel) { + int ndim = out_shape.size(); + int in_dim_offset = ndim - in_shape.size(); + int t1_dim_offset = ndim - t1_shape.size(); + int t2_dim_offset = ndim - t2_shape.size(); + +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + size_t temp_idx = i; + size_t in_offset = 0; + size_t t1_offset = 0; + size_t t2_offset = 0; + size_t out_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % out_shape[d]; + temp_idx /= out_shape[d]; + + out_offset += coord * out_strides[d]; + + if (d >= in_dim_offset && in_shape[d - in_dim_offset] > 1) { + in_offset += coord * in_strides[d - in_dim_offset]; + } + + if (d >= t1_dim_offset && t1_shape[d - t1_dim_offset] > 1) { + t1_offset += coord * t1_strides[d - t1_dim_offset]; + } + + if (d >= t2_dim_offset && t2_shape[d - t2_dim_offset] > 1) { + t2_offset += coord * t2_strides[d - t2_dim_offset]; + } + } + + out_ptr[out_offset] = addcdiv_op(in_ptr[in_offset], t1_ptr[t1_offset], t2_ptr[t2_offset], value); + } +} + +void calculate_addcdiv(Tensor input, Tensor t1, Tensor t2, Tensor output, float value) { + auto dtype = input->dtype(); + if (t1->dtype() != dtype || t2->dtype() != dtype || output->dtype() != dtype) { + throw std::runtime_error("Dtype mismatch in addcdiv op"); + } + + size_t numel = output->numel(); + + bool exact_match = (input->shape() == t1->shape()) && (t1->shape() == t2->shape()) 
&& (t2->shape() == output->shape()); + bool all_contiguous = input->is_contiguous() && t1->is_contiguous() && t2->is_contiguous() && output->is_contiguous(); + + if (exact_match && all_contiguous) { + if (dtype == DataType::F32) { + addcdiv_kernel((float *)input->data(), (float *)t1->data(), (float *)t2->data(), (float *)output->data(), value, numel); + } else if (dtype == DataType::F16) { + addcdiv_kernel((fp16_t *)input->data(), (fp16_t *)t1->data(), (fp16_t *)t2->data(), (fp16_t *)output->data(), value, numel); + } else { + throw std::runtime_error("Unsupported dtype for addcdiv contiguous"); + } + } else { + if (dtype == DataType::F32) { + addcdiv_strided_kernel( + (float *)input->data(), (float *)t1->data(), (float *)t2->data(), (float *)output->data(), value, + input->shape(), input->strides(), t1->shape(), t1->strides(), t2->shape(), t2->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F16) { + addcdiv_strided_kernel( + (fp16_t *)input->data(), (fp16_t *)t1->data(), (fp16_t *)t2->data(), (fp16_t *)output->data(), value, + input->shape(), input->strides(), t1->shape(), t1->strides(), t2->shape(), t2->strides(), output->shape(), output->strides(), numel); + } else { + throw std::runtime_error("Unsupported dtype for addcdiv strided"); + } + } +} + +static bool registered = []() { + Addcdiv::dispatcher().registerDevice(Device::Type::CPU, &calculate_addcdiv); + return true; +}(); + +} // namespace infinicore::op::addcdiv_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/atan2/atan2.cc b/src/infinicore/ops/atan2/atan2.cc new file mode 100644 index 000000000..bc761448d --- /dev/null +++ b/src/infinicore/ops/atan2/atan2.cc @@ -0,0 +1,46 @@ +#include "infinicore/ops/atan2.hpp" +#include "../../utils.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Atan2::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Atan2::execute(Tensor input, Tensor other, Tensor output) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, other, output); +} + +static Shape broadcast_shape(const Shape &a, const Shape &b) { + Shape out_shape; + int ndim_a = a.size(); + int ndim_b = b.size(); + int max_ndim = std::max(ndim_a, ndim_b); + + for (int i = 0; i < max_ndim; ++i) { + int dim_a = (i < max_ndim - ndim_a) ? 1 : a[i - (max_ndim - ndim_a)]; + int dim_b = (i < max_ndim - ndim_b) ? 
1 : b[i - (max_ndim - ndim_b)]; + + if (dim_a != dim_b && dim_a != 1 && dim_b != 1) { + throw std::runtime_error("Shapes are not broadcastable"); + } + out_shape.push_back(std::max(dim_a, dim_b)); + } + return out_shape; +} + +Tensor atan2(Tensor input, Tensor other) { + Shape out_shape = broadcast_shape(input->shape(), other->shape()); + auto output = Tensor::empty(out_shape, input->dtype(), input->device()); + atan2_(input, other, output); + return output; +} + +void atan2_(Tensor input, Tensor other, Tensor output) { + Atan2::execute(input, other, output); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/atan2/atan2_cpu.cc b/src/infinicore/ops/atan2/atan2_cpu.cc new file mode 100644 index 000000000..d9992ce83 --- /dev/null +++ b/src/infinicore/ops/atan2/atan2_cpu.cc @@ -0,0 +1,130 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/atan2.hpp" +#include +#include +#include +#include + +namespace infinicore::op::atan2_impl::cpu { + +template +inline T atan2_op(T a, T b) { + CompT val_a = utils::cast(a); + CompT val_b = utils::cast(b); + return utils::cast(std::atan2(val_a, val_b)); +} + +template +void atan2_kernel(const T *input_ptr, const T *other_ptr, T *output_ptr, size_t numel) { +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + output_ptr[i] = atan2_op(input_ptr[i], other_ptr[i]); + } +} + +template +void atan2_strided_kernel(const T *in_data, const T *other_data, T *out_data, + const Shape &in_shape, const Strides &in_strides, + const Shape &other_shape, const Strides &other_strides, + const Shape &out_shape, const Strides &out_strides, + size_t numel) { + int ndim = out_shape.size(); + int in_dim_offset = ndim - in_shape.size(); + int other_dim_offset = ndim - other_shape.size(); + +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + size_t temp_idx = i; + size_t in_offset = 0; + size_t other_offset = 0; + size_t out_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % out_shape[d]; + temp_idx /= out_shape[d]; + + out_offset += coord * out_strides[d]; + + if (d >= in_dim_offset) { + int local_d = d - in_dim_offset; + if (in_shape[local_d] > 1) { + in_offset += coord * in_strides[local_d]; + } + } + + if (d >= other_dim_offset) { + int local_d = d - other_dim_offset; + if (other_shape[local_d] > 1) { + other_offset += coord * other_strides[local_d]; + } + } + } + out_data[out_offset] = atan2_op(in_data[in_offset], other_data[other_offset]); + } +} + +void calculate_atan2(Tensor input, Tensor other, Tensor output) { + auto dtype = input->dtype(); + if (other->dtype() != dtype || output->dtype() != dtype) { + throw std::runtime_error("Dtype mismatch in atan2 op"); + } + + size_t numel = output->numel(); + + bool exact_match = (input->shape() == other->shape()) && (other->shape() == output->shape()); + bool all_contiguous = input->is_contiguous() && other->is_contiguous() && output->is_contiguous(); + + if (exact_match && all_contiguous) { + if (dtype == DataType::F32) { + atan2_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::F64) { + atan2_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::F16) { + atan2_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else 
if (dtype == DataType::BF16) { + atan2_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else { + throw std::runtime_error("Unsupported dtype for atan2 contiguous"); + } + } else { + if (dtype == DataType::F32) { + atan2_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F64) { + atan2_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F16) { + atan2_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::BF16) { + atan2_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else { + throw std::runtime_error("Unsupported dtype for atan2 strided"); + } + } +} + +static bool registered = []() { + Atan2::dispatcher().registerDevice(Device::Type::CPU, &calculate_atan2); + return true; +}(); + +} // namespace infinicore::op::atan2_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy.cc b/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy.cc new file mode 100644 index 000000000..41892822b --- /dev/null +++ b/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy.cc @@ -0,0 +1,31 @@ +#include "infinicore/ops/binary_cross_entropy.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &BinaryCrossEntropy::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void BinaryCrossEntropy::execute(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, target, weight, output, reduction); +} + +Tensor binary_cross_entropy(Tensor input, Tensor target, std::optional weight, std::string reduction) { + Shape output_shape = {}; + if (reduction == "none") { + output_shape = input->shape(); + } + + auto output = Tensor::empty(output_shape, input->dtype(), input->device()); + binary_cross_entropy_(input, target, weight, output, reduction); + return output; +} + +void binary_cross_entropy_(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction) { + BinaryCrossEntropy::execute(input, target, weight, output, reduction); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy_cpu.cc b/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy_cpu.cc new file mode 100644 index 000000000..b1827ca8e --- /dev/null +++ b/src/infinicore/ops/binary_cross_entropy/binary_cross_entropy_cpu.cc @@ -0,0 +1,189 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/binary_cross_entropy.hpp" +#include +#include +#include +#include 
+#include +#include + +namespace infinicore::op::binary_cross_entropy_impl::cpu { + +inline float bf16_to_f32(uint16_t val) { + uint32_t bits = static_cast(val) << 16; + float f; + std::memcpy(&f, &bits, sizeof(f)); + return f; +} + +// PyTorch's CPU kernel behaves close to truncation for element-wise BF16 output +inline uint16_t f32_to_bf16_trunc(float val) { + if (std::isnan(val)) { + return 0x7FC0; + } + uint32_t bits; + std::memcpy(&bits, &val, sizeof(bits)); + return static_cast(bits >> 16); +} + +// Round the reduction result to BF16 with round-to-nearest-even (RNE) +inline uint16_t f32_to_bf16_rne(float val) { + union { + float f; + uint32_t u; + } x; + x.f = val; + if (std::isnan(val)) { + return 0x7FC0; + } + + uint32_t lsb = (x.u >> 16) & 1; + uint32_t rounding_bias = 0x7FFF + lsb; + x.u += rounding_bias; + return static_cast(x.u >> 16); +} + +template +void bce_kernel(const Tensor &input, const Tensor &target, std::optional weight, Tensor &output, std::string reduction) { + + const void *input_raw = input->data(); + const void *target_raw = target->data(); + T *output_data = reinterpret_cast(output->data()); + + const void *weight_raw = nullptr; + if (weight.has_value() && weight.value()) { + weight_raw = weight.value()->data(); + } + + size_t numel = input->numel(); + auto input_strides = input->strides(); + auto target_strides = target->strides(); + auto shape = input->shape(); + int ndim = input->ndim(); + + bool contiguous = input->is_contiguous() && target->is_contiguous(); + if (weight_raw && !weight.value()->is_contiguous()) { + contiguous = false; + } + + auto dtype = input->dtype(); + + AccT total_loss = 0; + + auto read_val = [&](const void *ptr, size_t offset) -> AccT { + if (dtype == DataType::BF16) { + const uint16_t *p = reinterpret_cast(ptr); + return static_cast(bf16_to_f32(p[offset])); + } else { + return utils::cast(reinterpret_cast(ptr)[offset]); + } + }; + + auto write_val_elementwise = [&](size_t offset, AccT val) { + if (dtype == DataType::BF16) { + reinterpret_cast(output_data)[offset] = f32_to_bf16_trunc(static_cast(val)); + } else { + output_data[offset] = utils::cast(val); + } + }; + + const AccT eps = static_cast(1e-12); + const AccT one = static_cast(1.0); + + if (contiguous) { +#pragma omp parallel for reduction(+ : total_loss) + for (size_t i = 0; i < numel; ++i) { + AccT x = read_val(input_raw, i); + AccT y = read_val(target_raw, i); + + AccT term1 = std::max(x, eps); + AccT term2 = std::max(one - x, eps); + + AccT loss = -(y * std::log(term1) + (one - y) * std::log(term2)); + + if (weight_raw) { + AccT w = read_val(weight_raw, i); + loss *= w; + } + + if (reduction == "none") { + write_val_elementwise(i, loss); + } else { + total_loss += loss; + } + } + } else { +#pragma omp parallel for reduction(+ : total_loss) + for (size_t i = 0; i < numel; ++i) { + size_t temp_idx = i; + size_t input_offset = 0; + size_t target_offset = 0; + size_t weight_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % shape[d]; + temp_idx /= shape[d]; + input_offset += coord * input_strides[d]; + target_offset += coord * target_strides[d]; + if (weight_raw) { + weight_offset += coord * weight.value()->strides()[d]; + } + } + + AccT x = read_val(input_raw, input_offset); + AccT y = read_val(target_raw, target_offset); + + AccT term1 = std::max(x, eps); + AccT term2 = std::max(one - x, eps); + + AccT loss = -(y * std::log(term1) + (one - y) * std::log(term2)); + + if (weight_raw) { + AccT w = read_val(weight_raw, weight_offset); + loss *= w; + } + + if (reduction == "none") { + write_val_elementwise(i, loss); + } else { + total_loss += loss;
+ } + } + } + + if (reduction != "none") { + if (reduction == "mean") { + total_loss /= static_cast(numel); + } + + if (dtype == DataType::BF16) { + *reinterpret_cast(output_data) = f32_to_bf16_rne(static_cast(total_loss)); + } else { + *output_data = utils::cast(total_loss); + } + } +} + +void calculate(Tensor input, Tensor target, std::optional weight, Tensor output, std::string reduction) { + auto dtype = input->dtype(); + + if (dtype == DataType::F32) { + bce_kernel(input, target, weight, output, reduction); + } else if (dtype == DataType::F16) { + bce_kernel(input, target, weight, output, reduction); + } else if (dtype == DataType::BF16) { + bce_kernel(input, target, weight, output, reduction); + } else if (dtype == DataType::F64) { + bce_kernel(input, target, weight, output, reduction); + } else { + throw std::runtime_error("Unsupported dtype for binary_cross_entropy"); + } +} + +static bool registered = []() { + BinaryCrossEntropy::dispatcher().registerDevice(Device::Type::CPU, &calculate); + return true; +}(); + +} // namespace infinicore::op::binary_cross_entropy_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/bucketize/bucketize.cc b/src/infinicore/ops/bucketize/bucketize.cc new file mode 100644 index 000000000..c851d14c7 --- /dev/null +++ b/src/infinicore/ops/bucketize/bucketize.cc @@ -0,0 +1,26 @@ +#include "infinicore/ops/bucketize.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Bucketize::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Bucketize::execute(Tensor input, Tensor boundaries, Tensor output, bool right) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, boundaries, output, right); +} + +Tensor bucketize(Tensor input, Tensor boundaries, bool right) { + auto output = Tensor::empty(input->shape(), DataType::I64, input->device()); + bucketize_(input, boundaries, output, right); + return output; +} + +void bucketize_(Tensor input, Tensor boundaries, Tensor output, bool right) { + Bucketize::execute(input, boundaries, output, right); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/bucketize/bucketize_cpu.cc b/src/infinicore/ops/bucketize/bucketize_cpu.cc new file mode 100644 index 000000000..2ed64e068 --- /dev/null +++ b/src/infinicore/ops/bucketize/bucketize_cpu.cc @@ -0,0 +1,124 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/bucketize.hpp" +#include +#include +#include +#include +#include + +namespace infinicore::op::bucketize_impl::cpu { + +template +void bucketize_contiguous_kernel(const T *in_ptr, const T *bound_ptr, int64_t *out_ptr, + size_t numel, size_t bound_len, bool right) { + const T *bound_end = bound_ptr + bound_len; + +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + T val = in_ptr[i]; + const T *result_ptr; + + if (right) { + result_ptr = std::upper_bound(bound_ptr, bound_end, val); + } else { + result_ptr = std::lower_bound(bound_ptr, bound_end, val); + } + + out_ptr[i] = static_cast(result_ptr - bound_ptr); + } +} + +template +void bucketize_strided_kernel(const T *in_ptr, const T *bound_ptr, int64_t *out_ptr, + const Shape &in_shape, const Strides &in_strides, + const Shape &out_shape, const Strides &out_strides, + size_t numel, size_t bound_len, bool right) { + int ndim = out_shape.size(); + const T *bound_end = bound_ptr + bound_len; + +#pragma omp parallel for + for (size_t i = 0; 
i < numel; ++i) { + size_t temp_idx = i; + size_t in_offset = 0; + size_t out_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % out_shape[d]; + temp_idx /= out_shape[d]; + + out_offset += coord * out_strides[d]; + in_offset += coord * in_strides[d]; + } + + T val = in_ptr[in_offset]; + const T *result_ptr; + + if (right) { + result_ptr = std::upper_bound(bound_ptr, bound_end, val); + } else { + result_ptr = std::lower_bound(bound_ptr, bound_end, val); + } + + out_ptr[out_offset] = static_cast(result_ptr - bound_ptr); + } +} + +void calculate_bucketize(Tensor input, Tensor boundaries, Tensor output, bool right) { + if (output->dtype() != DataType::I64) { + throw std::runtime_error("Bucketize output must be int64"); + } + + Tensor boundaries_contig = boundaries; + if (!boundaries->is_contiguous()) { + boundaries_contig = boundaries->contiguous(); + } + + size_t bound_len = boundaries_contig->numel(); + size_t numel = input->numel(); + auto dtype = input->dtype(); + + std::vector sorted_boundaries(bound_len); + const float *raw_bound_ptr = reinterpret_cast(boundaries_contig->data()); + + std::memcpy(sorted_boundaries.data(), raw_bound_ptr, bound_len * sizeof(float)); + + std::sort(sorted_boundaries.begin(), sorted_boundaries.end()); + + const float *bound_ptr = sorted_boundaries.data(); + + bool in_out_contiguous = input->is_contiguous() && output->is_contiguous(); + + if (in_out_contiguous) { + int64_t *out_ptr = reinterpret_cast(output->data()); + if (dtype == DataType::F32) { + bucketize_contiguous_kernel( + (float *)input->data(), + bound_ptr, + out_ptr, numel, bound_len, right); + } else if (dtype == DataType::F16) { + throw std::runtime_error("F16 bucketize cpu not implemented yet"); + } else { + throw std::runtime_error("Unsupported input dtype"); + } + } else { + int64_t *out_ptr = reinterpret_cast(output->data()); + if (dtype == DataType::F32) { + bucketize_strided_kernel( + (float *)input->data(), + bound_ptr, + out_ptr, + input->shape(), input->strides(), output->shape(), output->strides(), + numel, bound_len, right); + } else { + throw std::runtime_error("Unsupported input dtype"); + } + } +} + +static bool registered = []() { + Bucketize::dispatcher().registerDevice(Device::Type::CPU, &calculate_bucketize); + return true; +}(); + +} // namespace infinicore::op::bucketize_impl::cpu \ No newline at end of file diff --git a/src/infinicore/ops/minimum/minimum.cc b/src/infinicore/ops/minimum/minimum.cc new file mode 100644 index 000000000..eb5ddf40f --- /dev/null +++ b/src/infinicore/ops/minimum/minimum.cc @@ -0,0 +1,46 @@ +#include "infinicore/ops/minimum.hpp" +#include "../../utils.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Minimum::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Minimum::execute(Tensor input, Tensor other, Tensor output) { + infinicore::context::setDevice(input->device()); + dispatcher().lookup(input->device().getType())(input, other, output); +} + +static Shape broadcast_shape(const Shape &a, const Shape &b) { + Shape out_shape; + int ndim_a = a.size(); + int ndim_b = b.size(); + int max_ndim = std::max(ndim_a, ndim_b); + + for (int i = 0; i < max_ndim; ++i) { + int dim_a = (i < max_ndim - ndim_a) ? 1 : a[i - (max_ndim - ndim_a)]; + int dim_b = (i < max_ndim - ndim_b) ? 
1 : b[i - (max_ndim - ndim_b)]; + + if (dim_a != dim_b && dim_a != 1 && dim_b != 1) { + throw std::runtime_error("Shapes are not broadcastable"); + } + out_shape.push_back(std::max(dim_a, dim_b)); + } + return out_shape; +} + +Tensor minimum(Tensor input, Tensor other) { + Shape out_shape = broadcast_shape(input->shape(), other->shape()); + auto output = Tensor::empty(out_shape, input->dtype(), input->device()); + minimum_(input, other, output); + return output; +} + +void minimum_(Tensor input, Tensor other, Tensor output) { + Minimum::execute(input, other, output); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/minimum/minimum_cpu.cc b/src/infinicore/ops/minimum/minimum_cpu.cc new file mode 100644 index 000000000..1b8ab4473 --- /dev/null +++ b/src/infinicore/ops/minimum/minimum_cpu.cc @@ -0,0 +1,130 @@ +#include "../../../utils.h" +#include "infinicore/device.hpp" +#include "infinicore/ops/minimum.hpp" +#include +#include +#include +#include + +namespace infinicore::op::minimum_impl::cpu { + +template +inline T min_op(T a, T b) { + CompT val_a = utils::cast(a); + CompT val_b = utils::cast(b); + return utils::cast(std::min(val_a, val_b)); +} + +template +void minimum_kernel(const T *input_ptr, const T *other_ptr, T *output_ptr, size_t numel) { +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + output_ptr[i] = min_op(input_ptr[i], other_ptr[i]); + } +} + +template +void minimum_strided_kernel(const T *in_data, const T *other_data, T *out_data, + const Shape &in_shape, const Strides &in_strides, + const Shape &other_shape, const Strides &other_strides, + const Shape &out_shape, const Strides &out_strides, + size_t numel) { + int ndim = out_shape.size(); + int in_dim_offset = ndim - in_shape.size(); + int other_dim_offset = ndim - other_shape.size(); + +#pragma omp parallel for + for (size_t i = 0; i < numel; ++i) { + size_t temp_idx = i; + size_t in_offset = 0; + size_t other_offset = 0; + size_t out_offset = 0; + + for (int d = ndim - 1; d >= 0; --d) { + size_t coord = temp_idx % out_shape[d]; + temp_idx /= out_shape[d]; + + out_offset += coord * out_strides[d]; + + if (d >= in_dim_offset) { + int local_d = d - in_dim_offset; + if (in_shape[local_d] > 1) { + in_offset += coord * in_strides[local_d]; + } + } + + if (d >= other_dim_offset) { + int local_d = d - other_dim_offset; + if (other_shape[local_d] > 1) { + other_offset += coord * other_strides[local_d]; + } + } + } + out_data[out_offset] = min_op(in_data[in_offset], other_data[other_offset]); + } +} + +void calculate_minimum(Tensor input, Tensor other, Tensor output) { + auto dtype = input->dtype(); + if (other->dtype() != dtype || output->dtype() != dtype) { + throw std::runtime_error("Dtype mismatch in minimum op"); + } + + size_t numel = output->numel(); + + bool exact_match = (input->shape() == other->shape()) && (other->shape() == output->shape()); + bool all_contiguous = input->is_contiguous() && other->is_contiguous() && output->is_contiguous(); + + if (exact_match && all_contiguous) { + if (dtype == DataType::F32) { + minimum_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::F64) { + minimum_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::F16) { + minimum_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + 
reinterpret_cast(output->data()), numel); + } else if (dtype == DataType::BF16) { + minimum_kernel( + reinterpret_cast(input->data()), + reinterpret_cast(other->data()), + reinterpret_cast(output->data()), numel); + } else { + throw std::runtime_error("Unsupported dtype for minimum contiguous"); + } + } else { + if (dtype == DataType::F32) { + minimum_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F64) { + minimum_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::F16) { + minimum_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else if (dtype == DataType::BF16) { + minimum_strided_kernel( + reinterpret_cast(input->data()), reinterpret_cast(other->data()), reinterpret_cast(output->data()), + input->shape(), input->strides(), other->shape(), other->strides(), output->shape(), output->strides(), numel); + } else { + throw std::runtime_error("Unsupported dtype for minimum strided"); + } + } +} + +static bool registered = []() { + Minimum::dispatcher().registerDevice(Device::Type::CPU, &calculate_minimum); + return true; +}(); + +} // namespace infinicore::op::minimum_impl::cpu \ No newline at end of file diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 3d6ebe79a..848984916 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -4,11 +4,16 @@ #include "ops/add.hpp" #include "ops/add_rms_norm.hpp" +#include "ops/addcdiv.hpp" +#include "ops/atan2.hpp" #include "ops/attention.hpp" +#include "ops/binary_cross_entropy.hpp" +#include "ops/bucketize.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" #include "ops/linear.hpp" #include "ops/matmul.hpp" +#include "ops/minimum.hpp" #include "ops/mul.hpp" #include "ops/paged_attention.hpp" #include "ops/paged_attention_prefill.hpp" @@ -42,6 +47,11 @@ inline void bind(py::module &m) { bind_swiglu(m); bind_rope(m); bind_embedding(m); + bind_minimum(m); + bind_atan2(m); + bind_addcdiv(m); + bind_bucketize(m); + bind_binary_cross_entropy(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/addcdiv.hpp b/src/infinicore/pybind11/ops/addcdiv.hpp new file mode 100644 index 000000000..5f816875b --- /dev/null +++ b/src/infinicore/pybind11/ops/addcdiv.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "infinicore/ops/addcdiv.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_addcdiv(py::module &m) { + m.def("addcdiv", + &op::addcdiv, + py::arg("input"), + py::arg("tensor1"), + py::arg("tensor2"), + py::arg("value") = 1.0f, + R"doc(Computes input + value * (tensor1 / tensor2).)doc"); + + m.def("addcdiv_", + &op::addcdiv_, + py::arg("input"), + py::arg("tensor1"), + py::arg("tensor2"), + py::arg("output"), + py::arg("value") = 1.0f, + R"doc(In-place version of addcdiv.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/atan2.hpp 
b/src/infinicore/pybind11/ops/atan2.hpp new file mode 100644 index 000000000..5d5324ec7 --- /dev/null +++ b/src/infinicore/pybind11/ops/atan2.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "infinicore/ops/atan2.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_atan2(py::module &m) { + m.def("atan2", + &op::atan2, + py::arg("input"), + py::arg("other"), + R"doc(Computes the element-wise arc tangent of input/other. Returns the angle in radians.)doc"); + + m.def("atan2_", + &op::atan2_, + py::arg("input"), + py::arg("other"), + py::arg("output"), + R"doc(In-place version of atan2.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/binary_cross_entropy.hpp b/src/infinicore/pybind11/ops/binary_cross_entropy.hpp new file mode 100644 index 000000000..a449fe7c0 --- /dev/null +++ b/src/infinicore/pybind11/ops/binary_cross_entropy.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "infinicore/ops/binary_cross_entropy.hpp" +#include +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_binary_cross_entropy(py::module &m) { + m.def("binary_cross_entropy", + &op::binary_cross_entropy, + py::arg("input"), + py::arg("target"), + py::arg("weight") = py::none(), + py::arg("reduction") = "mean", + R"doc(Calculates Binary Cross Entropy.)doc"); + + m.def("binary_cross_entropy_", + &op::binary_cross_entropy_, + py::arg("input"), + py::arg("target"), + py::arg("weight"), + py::arg("output"), + py::arg("reduction") = "mean", + R"doc(In-place Binary Cross Entropy calculation.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/bucketize.hpp b/src/infinicore/pybind11/ops/bucketize.hpp new file mode 100644 index 000000000..d6ff77705 --- /dev/null +++ b/src/infinicore/pybind11/ops/bucketize.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/bucketize.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_bucketize(py::module &m) { + m.def("bucketize", + &op::bucketize, + py::arg("input"), + py::arg("boundaries"), + py::arg("right") = false, + R"doc(Returns the indices of the buckets to which each value in the input belongs.)doc"); + + m.def("bucketize_", + &op::bucketize_, + py::arg("input"), + py::arg("boundaries"), + py::arg("output"), + py::arg("right") = false, + R"doc(In-place version of bucketize.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/minimum.hpp b/src/infinicore/pybind11/ops/minimum.hpp new file mode 100644 index 000000000..232615fb1 --- /dev/null +++ b/src/infinicore/pybind11/ops/minimum.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "infinicore/ops/minimum.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { + +inline void bind_minimum(py::module &m) { + m.def("minimum", + &op::minimum, + py::arg("input"), + py::arg("other"), + R"doc(Computes the element-wise minimum of input and other.)doc"); + + m.def("minimum_", + &op::minimum_, + py::arg("input"), + py::arg("other"), + py::arg("output"), + R"doc(In-place version of minimum.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/test/infinicore/ops/addcdiv.py b/test/infinicore/ops/addcdiv.py index 47d574eb1..38af49331 100644 --- a/test/infinicore/ops/addcdiv.py +++ b/test/infinicore/ops/addcdiv.py @@ -124,9 +124,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return 
torch.addcdiv(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.addcdiv(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.addcdiv(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/atan2.py b/test/infinicore/ops/atan2.py index 09f15a319..5d2e48477 100644 --- a/test/infinicore/ops/atan2.py +++ b/test/infinicore/ops/atan2.py @@ -115,9 +115,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.atan2(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.atan2(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.atan2(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/binary_cross_entropy.py b/test/infinicore/ops/binary_cross_entropy.py index cca511f38..971a97fb1 100644 --- a/test/infinicore/ops/binary_cross_entropy.py +++ b/test/infinicore/ops/binary_cross_entropy.py @@ -69,9 +69,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.binary_cross_entropy(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.binary_cross_entropy(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.nn.functional.binary_cross_entropy(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/bucketize.py b/test/infinicore/ops/bucketize.py index fe6ccb6da..011b1c4d9 100644 --- a/test/infinicore/ops/bucketize.py +++ b/test/infinicore/ops/bucketize.py @@ -50,12 +50,14 @@ def __init__(self): def get_test_cases(self): return parse_test_cases() - def torch_operator(self, *args, **kwargs): - return torch.bucketize(*args, **kwargs) - - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.bucketize(*args, **kwargs) + def torch_operator(self, input, boundaries, *args, **kwargs): + # Sort the boundaries for the PyTorch reference (torch.bucketize requires a sorted boundaries tensor) + sorted_boundaries, _ = torch.sort(boundaries) + return torch.bucketize(input, sorted_boundaries, *args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.bucketize(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/minimum.py b/test/infinicore/ops/minimum.py index 4538665b6..1d51bc1c0 100644 --- a/test/infinicore/ops/minimum.py +++ b/test/infinicore/ops/minimum.py @@ -96,9 +96,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.minimum(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.minimum(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.minimum(*args, **kwargs) def main():
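
Reviewer note: a minimal usage sketch of the Python entry points added by this patch. The operator signatures follow the new bindings above; the helper function and the way the input tensors are obtained are illustrative assumptions, not part of the change.

    import infinicore
    import infinicore.nn.functional as F

    def smoke_test(a, b, c, boundaries, target):
        # Assumed inputs (hypothetical helper, not part of this patch):
        # a, b, c, target are float tensors of one broadcastable shape,
        # a and target lie in [0, 1] for binary_cross_entropy,
        # boundaries is a 1-D float tensor (the CPU kernel sorts a copy itself).
        lo = infinicore.minimum(a, b)                            # element-wise minimum
        angle = infinicore.atan2(a, b)                           # element-wise arctangent of a / b, in radians
        d = infinicore.addcdiv(a, b, c, value=0.5)               # a + 0.5 * (b / c); c should be non-zero
        idx = infinicore.bucketize(a, boundaries, right=False)   # int64 bucket indices
        loss = F.binary_cross_entropy(a, target, weight=None, reduction="mean")
        return lo, angle, d, idx, loss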