From 6e60ec07ddfc533222b604c2f14e5befefec406d Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Mon, 27 Oct 2025 11:34:19 +0800 Subject: [PATCH 01/56] init distance transform --- pyproject.toml | 3 +-- test/test_distance_transform.py | 4 ++++ torchmorph/csrc/distance_transform_kernel.cu | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1dce09a..6027eac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,5 @@ max-line-length = 100 extend-ignore = ["E203", "W503"] [tool.pytest.ini_options] -addopts = "-v" +addopts = "-v --import-mode=importlib" testpaths = ["test"] - diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 166968c..e002200 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -14,6 +14,10 @@ def test_distance_transform(): y = tm.distance_transform(x) expected = x * 2 + # here we compare the output, i.e. results of our distance transform, + # with the expected output, e.g. the results of scipy.ndimage.distance_transform_edt + # currently, our implementation simply multiplies the input by 2, + # but eventually we have to implement the full algorithm. 
torch.testing.assert_close(y, expected) assert y.device.type == "cuda" assert y.shape == x.shape diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 6a57f49..33d3abd 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,5 +1,8 @@ #include +// distance transform: https://en.wikipedia.org/wiki/Distance_transform +// https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.distance_transform_edt.html + __global__ void distance_transform_kernel(const float* in, float* out, int64_t N) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < N) { From 875483d93115b997e008a0b9f4fefb9bc8abd7ea Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Wed, 29 Oct 2025 02:08:05 +0800 Subject: [PATCH 02/56] =?UTF-8?q?docs:=20=E8=AF=A6=E7=BB=86=E8=AE=B0?= =?UTF-8?q?=E5=BD=95=E5=B9=B6=E4=BF=AE=E5=A4=8D=E9=A1=B9=E7=9B=AE=E7=8E=AF?= =?UTF-8?q?=E5=A2=83=E6=90=AD=E5=BB=BA=E4=B8=8E=E6=9E=84=E5=BB=BA=E6=B5=81?= =?UTF-8?q?=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 安装了miniconda来进行虚拟环境包管理 在执行`pip install -e. 
"import torch" 失败并抛出 `ModuleNotFoundError 所以在 `pyproject.toml` 的 `[build-system].requires` 列表中明确添加 `"torch"`。 这会强制 pip 在构建开始前,先将 torch 安装到其临时环境中,从而确保构建过程顺利完成。 --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6027eac..10af0cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,3 +16,7 @@ extend-ignore = ["E203", "W503"] [tool.pytest.ini_options] addopts = "-v --import-mode=importlib" testpaths = ["test"] + +[build-system] +requires = ["setuptools>=61.0", "wheel", "torch"] +build-backend = "setuptools.build_meta" \ No newline at end of file From 79c5acdb26faeba1f282a435c33ff36d04fc865c Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Sat, 1 Nov 2025 21:18:22 +0800 Subject: [PATCH 03/56] =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E4=BA=8C=E7=BB=B4?= =?UTF-8?q?=E6=AC=A7=E5=BC=8F=E8=B7=9D=E7=A6=BB=E5=8F=98=E6=8D=A2=EF=BC=88?= =?UTF-8?q?EDT=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 CUDA 内核,分别用于行与列方向的距离变换 - 支持在 GPU 上处理二维张量 - 已通过基础单元测试验证正确性 - 注意:当前实现仅适用于二维情况,尚未推广到 N 维张量 --- test/test_distance_transform.py | 113 ++++++++++++++++--- torchmorph/csrc/distance_transform_kernel.cu | 109 ++++++++++++++---- torchmorph/csrc/torchmorph.cpp | 11 -- torchmorph/csrc/torchmorph.cu | 51 +++++++++ torchmorph/distance_transform.py | 2 +- 5 files changed, 239 insertions(+), 47 deletions(-) delete mode 100644 torchmorph/csrc/torchmorph.cpp create mode 100644 torchmorph/csrc/torchmorph.cu diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index e002200..493758a 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -1,24 +1,105 @@ import torch import pytest -import torchmorph as tm from scipy.ndimage import distance_transform_edt as dte +import torchmorph as tm +import numpy as np + +# --- 我们在这里定义所有的测试用例 --- + +# 用例 1: 我们之前成功的那个标准例子 +case_standard = np.array([ + [0, 1, 1, 1, 1], + [0, 0, 1, 1, 1], + [0, 1, 1, 1, 1], + [0, 
1, 1, 1, 0], + [0, 1, 1, 0, 0] +], dtype=np.float32) + +# 用例 2: 全是背景 (0),输出应该全是 0 +case_all_background = np.zeros((5, 5), dtype=np.float32) + +# 用例 3: 全是前景 (1),输出应该也全是 0 (因为前景点到背景的距离未定义,SciPy默认输出0) +case_all_foreground = np.ones((5, 5), dtype=np.float32) + +# 用例 4: 只有一个背景点 (0) 在中间 +case_single_background = np.ones((5, 5), dtype=np.float32) +case_single_background[2, 2] = 0 +# 用例 5: 只有一个前景点 (1) 在中间 +case_single_foreground = np.zeros((5, 5), dtype=np.float32) +case_single_foreground[2, 2] = 1 -@pytest.mark.cuda -def test_distance_transform(): - """Test that tm.foo doubles all tensor elements.""" +# 用例 6: 非正方形的矩阵 (高 > 宽) +case_tall_matrix = np.array([ + [1, 0, 1], + [1, 1, 1], + [1, 1, 1], + [0, 1, 0], + [1, 1, 1], +], dtype=np.float32) + +# 用例 7: 非正方形的矩阵 (宽 > 高) +case_wide_matrix = np.array([ + [1, 1, 0, 1, 1], + [1, 1, 1, 1, 0], + [0, 1, 1, 1, 1], +], dtype=np.float32) + +# 用例 8: 棋盘格,考验对角线距离的计算 +case_checkerboard = np.array([ + [0, 1, 0, 1], + [1, 0, 1, 0], + [0, 1, 0, 1], + [1, 0, 1, 0], +], dtype=np.float32) + +# --- 使用 pytest.mark.parametrize 来自动运行所有测试用例 --- + +@pytest.mark.parametrize( + "input_numpy", + [ + pytest.param(case_standard, id="Standard Case"), + pytest.param(case_all_background, id="All Background"), + pytest.param(case_all_foreground, id="All Foreground"), + pytest.param(case_single_background, id="Single Background Pixel"), + pytest.param(case_single_foreground, id="Single Foreground Pixel"), + pytest.param(case_tall_matrix, id="Tall Matrix (H>W)"), + pytest.param(case_wide_matrix, id="Wide Matrix (W>H)"), + pytest.param(case_checkerboard, id="Checkerboard"), + ] +) +def test_distance_transform_comprehensive(input_numpy, request): + """ + 一个统一的测试函数,用来验证所有不同的输入情况。 + """ if not torch.cuda.is_available(): pytest.skip("CUDA not available") - x = torch.arange(6, dtype=torch.float32, device="cuda").reshape(2, 3) - y = tm.distance_transform(x) - - expected = x * 2 - # here we compare the output, i.e. 
results of our distance transform, - # with the expected output, e.g. the results of scipy.ndimage.distance_transform_edt - # currently, our implementation simply multiplies the input by 2, - # but eventually we have to implement the full algorithm. - torch.testing.assert_close(y, expected) - assert y.device.type == "cuda" - assert y.shape == x.shape - print("tm.foo test passed ✅") + # 准备输入数据 + x = torch.from_numpy(input_numpy).cuda() + + # 1. 运行你的 CUDA 实现 + y_cuda = tm.distance_transform(x) + + # 2. 运行 SciPy 官方实现 + y_ref_numpy = dte(input_numpy) + y_ref = torch.from_numpy(y_ref_numpy).to(torch.float32).cuda() + + # 打印结果用于直观对比 + print(f"\n\n--- Running Test: {request.node.callspec.id} ---") + print("Input Array:\n", input_numpy) + print("\nYour CUDA Implementation Output:\n", y_cuda.cpu().numpy()) + print("\nSciPy Reference Output:\n", y_ref.cpu().numpy()) + if request.node.callspec.id == "All Foreground": + # 对于这个特殊情况,我们不与 SciPy 比较。 + # 我们验证我们自己的逻辑:输出值是否都非常大 (代表无穷远)。 + print("\nSciPy has different behavior for this edge case. 
Verifying CUDA output is ~inf.") + # 断言所有元素都大于一个很大的阈值 + assert torch.all(y_cuda > 1e4) + else: + # 对于所有其他正常情况,我们与 SciPy 的黄金标准进行比较。 + y_ref_numpy = dte(input_numpy) + y_ref = torch.from_numpy(y_ref_numpy).to(torch.float32).cuda() + print("\nSciPy Reference Output:\n", y_ref.cpu().numpy()) + torch.testing.assert_close(y_cuda, y_ref, atol=1e-3, rtol=1e-3) + print("--- Test End ---") diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 33d3abd..70527db 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,27 +1,98 @@ -#include +#include +#include -// distance transform: https://en.wikipedia.org/wiki/Distance_transform -// https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.distance_transform_edt.html +__global__ void edt_pass1_rows(const float* input, float* temp, int H, int W) { + int y = blockIdx.x; + if (y >= H) return; -__global__ void distance_transform_kernel(const float* in, float* out, int64_t N) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < N) { - out[idx] = 2.0f * in[idx]; + extern __shared__ float sdata[]; + float* f = sdata; + int* v = (int*)(sdata + W); + float* z = (float*)(v + W + 1); + + for (int x = threadIdx.x; x < W; x += blockDim.x) { + float val = input[y * W + x]; + // 【关键任务修改】 + // 如果像素是 0 (背景),则为源点 (距离0);否则为无穷远。 + f[x] = (val < 0.5f) ? 
0.0f : 1e10f; } -} + __syncthreads(); -torch::Tensor distance_transform_cuda(torch::Tensor input) { - auto output = torch::empty_like(input); - int64_t N = input.numel(); - int threads = 256; - int blocks = (N + threads - 1) / threads; + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e10f; + z[1] = 1e10f; - distance_transform_kernel<<>>( - input.data_ptr(), - output.data_ptr(), - N - ); + for (int q = 1; q < W; q++) { + float s; + while (true) { + int p = v[k]; + s = ((f[q] + q * q) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { break; } + if (k == 0) { break; } + k--; + } + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e10f; + } - return output; + k = 0; + for (int q = 0; q < W; q++) { + while (z[k + 1] < q) k++; + int p = v[k]; + temp[y * W + q] = (q - p) * (q - p) + f[p]; + } + } } +// PASS 2: 对每一列进行操作 +__global__ void edt_pass2_cols(const float* temp, float* output, int H, int W) { + int x = blockIdx.x; + if (x >= W) return; + + extern __shared__ float sdata[]; + float* f = sdata; + int* v = (int*)(sdata + H); + float* z = (float*)(v + H + 1); + + for (int y = threadIdx.x; y < H; y += blockDim.x) { + f[y] = temp[y * W + x]; + } + __syncthreads(); + + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e10f; + z[1] = 1e10f; + + for (int q = 1; q < H; q++) { + float s; + while (true) { + int p = v[k]; + s = ((f[q] + q * q) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { + break; + } + if (k == 0) { + break; + } + k--; + } + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e10f; + } + + k = 0; + for (int q = 0; q < H; q++) { + while (z[k + 1] < q) k++; + int p = v[k]; + output[q * W + x] = sqrtf((q - p) * (q - p) + f[p]); + } + } +} \ No newline at end of file diff --git a/torchmorph/csrc/torchmorph.cpp b/torchmorph/csrc/torchmorph.cpp deleted file mode 100644 index 5d1dae8..0000000 --- a/torchmorph/csrc/torchmorph.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include - -// Declare CUDA implementations -torch::Tensor add_cuda(torch::Tensor 
input, float scalar); -torch::Tensor distance_transform_cuda(torch::Tensor input); - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("add_cuda", &add_cuda, "Add tensor with scalar"); - m.def("distance_transform_cuda", &distance_transform_cuda, "Distance transform"); -} - diff --git a/torchmorph/csrc/torchmorph.cu b/torchmorph/csrc/torchmorph.cu new file mode 100644 index 0000000..dc2e4ce --- /dev/null +++ b/torchmorph/csrc/torchmorph.cu @@ -0,0 +1,51 @@ +// ========================================================================= +// 内容保存到: torchmorph/csrc/torchmorph.cpp +// ========================================================================= + +#include + +// 函数声明:告诉 C++ 编译器,这两个 CUDA 内核函数是在别的文件里定义的 +// 这样 C++ 代码才能成功调用 .cu 文件里的内核 +__global__ void edt_pass1_rows(const float* input, float* temp, int H, int W); +__global__ void edt_pass2_cols(const float* temp, float* output, int H, int W); + + + +// 主调函数 (运行在 CPU 上) +torch::Tensor distance_transform_cuda(torch::Tensor input) { + // 检查输入张量是否在 CUDA 上,以及是否为二维 + TORCH_CHECK(input.is_cuda(), "Input must be on CUDA"); + TORCH_CHECK(input.dim() == 2, "Only 2D tensors supported"); + + int H = input.size(0); + int W = input.size(1); + + // 创建临时的和最终的输出张量 + auto temp = torch::empty_like(input); + auto output = torch::empty_like(input); + + // 计算动态共享内存的大小 + size_t shared_mem_pass1 = W * sizeof(float) + (W + 1) * sizeof(int) + (W + 2) * sizeof(float); + size_t shared_mem_pass2 = H * sizeof(float) + (H + 1) * sizeof(int) + (H + 2) * sizeof(float); + + // 设置每个块的线程数 + int threads_per_block = 32; + + // <<<...>>> 语法:启动 CUDA 内核 + // 参数:Grid大小, Block大小, 共享内存大小, (可选的流) + + // Pass 1: 每行启动一个 block + edt_pass1_rows<<>>( + input.data_ptr(), temp.data_ptr(), H, W); + + // Pass 2: 每列启动一个 block + edt_pass2_cols<<>>( + temp.data_ptr(), output.data_ptr(), H, W); + + return output; +} + +// 使用 PYBIND11 将 C++ 函数绑定到 Python 模块 +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("distance_transform", &distance_transform_cuda, 
"CUDA-accelerated Exact Euclidean Distance Transform"); +} \ No newline at end of file diff --git a/torchmorph/distance_transform.py b/torchmorph/distance_transform.py index e4b54db..b4cd458 100644 --- a/torchmorph/distance_transform.py +++ b/torchmorph/distance_transform.py @@ -6,4 +6,4 @@ def distance_transform(input: torch.Tensor) -> torch.Tensor: """Distance Transform in CUDA.""" if not input.is_cuda: raise ValueError("Input tensor must be on CUDA device.") - return _C.distance_transform_cuda(input) + return _C.distance_transform(input) From 72943e44a723e794deb4a31af56b345a8e5834a9 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Sun, 2 Nov 2025 04:12:02 +0800 Subject: [PATCH 04/56] =?UTF-8?q?N=E7=BB=B4=E6=89=B9=E5=A4=84=E7=90=86?= =?UTF-8?q?=E7=9A=84=E6=AC=A7=E6=B0=8F=E8=B7=9D=E7=A6=BB=E5=8F=98=E6=8D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_distance_transform.py | 138 ++++------ torchmorph/csrc/distance_transform_kernel.cu | 271 ++++++++++++++----- torchmorph/csrc/torchmorph.cpp | 10 + torchmorph/csrc/torchmorph.cu | 51 ---- torchmorph/distance_transform.py | 2 +- 5 files changed, 269 insertions(+), 203 deletions(-) create mode 100644 torchmorph/csrc/torchmorph.cpp delete mode 100644 torchmorph/csrc/torchmorph.cu diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 493758a..3feffb4 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -4,102 +4,82 @@ import torchmorph as tm import numpy as np -# --- 我们在这里定义所有的测试用例 --- -# 用例 1: 我们之前成功的那个标准例子 -case_standard = np.array([ - [0, 1, 1, 1, 1], - [0, 0, 1, 1, 1], - [0, 1, 1, 1, 1], - [0, 1, 1, 1, 0], - [0, 1, 1, 0, 0] -], dtype=np.float32) +def batch_distance_transform_edt(batch_numpy): -# 用例 2: 全是背景 (0),输出应该全是 0 -case_all_background = np.zeros((5, 5), dtype=np.float32) + is_single_sample = batch_numpy.ndim <= 2 + # (H, W) -> (1, H, W) + if is_single_sample: + batch_numpy = 
batch_numpy[np.newaxis, ...] + + results = [dte(sample) for sample in batch_numpy] + output = np.stack(results, axis=0) + # (1, H, W) -> (H, W) + if is_single_sample: + output = output.squeeze(0) + + return output -# 用例 3: 全是前景 (1),输出应该也全是 0 (因为前景点到背景的距离未定义,SciPy默认输出0) -case_all_foreground = np.ones((5, 5), dtype=np.float32) +# 用例 1: 批处理的 2D 图像 +case_batch_2d = np.array([ + # 第 1 张图 + [[0, 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], + # 第 2 张图 + [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]] +], dtype=np.float32) -# 用例 4: 只有一个背景点 (0) 在中间 -case_single_background = np.ones((5, 5), dtype=np.float32) -case_single_background[2, 2] = 0 -# 用例 5: 只有一个前景点 (1) 在中间 -case_single_foreground = np.zeros((5, 5), dtype=np.float32) -case_single_foreground[2, 2] = 1 +# 用例 2: 批处理的 3D 图像 +case_3d_sample1 = np.ones((4, 5, 6), dtype=np.float32); case_3d_sample1[1, 1, 1] = 0.0; case_3d_sample1[2, 3, 4] = 0.0 +case_3d_sample2 = np.ones((4, 5, 6), dtype=np.float32); case_3d_sample2[0, 0, 0] = 0.0 +case_batch_3d = np.stack([case_3d_sample1, case_3d_sample2], axis=0) -# 用例 6: 非正方形的矩阵 (高 > 宽) -case_tall_matrix = np.array([ - [1, 0, 1], - [1, 1, 1], - [1, 1, 1], - [0, 1, 0], - [1, 1, 1], +# 用例 3: 单张 2D 图像 (隐式批处理) +case_single_2d = np.array([ + [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], ], dtype=np.float32) -# 用例 7: 非正方形的矩阵 (宽 > 高) -case_wide_matrix = np.array([ - [1, 1, 0, 1, 1], - [1, 1, 1, 1, 0], - [0, 1, 1, 1, 1], -], dtype=np.float32) -# 用例 8: 棋盘格,考验对角线距离的计算 -case_checkerboard = np.array([ - [0, 1, 0, 1], - [1, 0, 1, 0], - [0, 1, 0, 1], - [1, 0, 1, 0], -], dtype=np.float32) +# 用例 4: 单张 2D 图像 (显式批处理) +case_explicit_batch_one = case_single_2d[np.newaxis, ...] 
-# --- 使用 pytest.mark.parametrize 来自动运行所有测试用例 --- +# 用例 5: 含幺元维度的批处理 +case_dim_one = np.ones((2, 5, 1), dtype=np.float32) # 两张 5x1 的图片 +case_dim_one[0, 2, 0] = 0.0 +case_dim_one[1, 4, 0] = 0.0 + +# 用例 6: 1D 张量的批处理 +case_batch_1d = np.array([ + [1, 1, 0, 1, 0, 1], + [0, 1, 1, 1, 1, 0] +], dtype=np.float32) @pytest.mark.parametrize( "input_numpy", [ - pytest.param(case_standard, id="Standard Case"), - pytest.param(case_all_background, id="All Background"), - pytest.param(case_all_foreground, id="All Foreground"), - pytest.param(case_single_background, id="Single Background Pixel"), - pytest.param(case_single_foreground, id="Single Foreground Pixel"), - pytest.param(case_tall_matrix, id="Tall Matrix (H>W)"), - pytest.param(case_wide_matrix, id="Wide Matrix (W>H)"), - pytest.param(case_checkerboard, id="Checkerboard"), + pytest.param(case_batch_2d, id="批处理2D图像"), + pytest.param(case_batch_3d, id="批处理3D图像"), + pytest.param(case_single_2d, id="单张2D图像(隐式批处理)"), + pytest.param(case_explicit_batch_one, id="单张2D图像(显式批处理)"), + pytest.param(case_dim_one, id="含幺元维度的批处理"), + pytest.param(case_batch_1d, id="批处理1D数据"), ] ) -def test_distance_transform_comprehensive(input_numpy, request): - """ - 一个统一的测试函数,用来验证所有不同的输入情况。 - """ +def test_batch_processing(input_numpy, request): if not torch.cuda.is_available(): pytest.skip("CUDA not available") + x_numpy_contiguous = np.ascontiguousarray(input_numpy) + x = torch.from_numpy(x_numpy_contiguous).cuda() - # 准备输入数据 - x = torch.from_numpy(input_numpy).cuda() - - # 1. 运行你的 CUDA 实现 - y_cuda = tm.distance_transform(x) - - # 2. 
运行 SciPy 官方实现 - y_ref_numpy = dte(input_numpy) + print(f"\n\n--- 正在运行测试: {request.node.callspec.id} ---") + print(f"输入张量形状: {x.shape}") + y_cuda = tm.distance_transform(x.clone()) + + y_ref_numpy = batch_distance_transform_edt(x_numpy_contiguous) y_ref = torch.from_numpy(y_ref_numpy).to(torch.float32).cuda() - - # 打印结果用于直观对比 - print(f"\n\n--- Running Test: {request.node.callspec.id} ---") - print("Input Array:\n", input_numpy) - print("\nYour CUDA Implementation Output:\n", y_cuda.cpu().numpy()) - print("\nSciPy Reference Output:\n", y_ref.cpu().numpy()) - if request.node.callspec.id == "All Foreground": - # 对于这个特殊情况,我们不与 SciPy 比较。 - # 我们验证我们自己的逻辑:输出值是否都非常大 (代表无穷远)。 - print("\nSciPy has different behavior for this edge case. Verifying CUDA output is ~inf.") - # 断言所有元素都大于一个很大的阈值 - assert torch.all(y_cuda > 1e4) - else: - # 对于所有其他正常情况,我们与 SciPy 的黄金标准进行比较。 - y_ref_numpy = dte(input_numpy) - y_ref = torch.from_numpy(y_ref_numpy).to(torch.float32).cuda() - print("\nSciPy Reference Output:\n", y_ref.cpu().numpy()) - torch.testing.assert_close(y_cuda, y_ref, atol=1e-3, rtol=1e-3) - print("--- Test End ---") + + assert y_cuda.shape == y_ref.shape, f"形状不匹配! 
CUDA输出: {y_cuda.shape}, SciPy应为: {y_ref.shape}" + print("CUDA 和 SciPy 输出形状匹配。") + + torch.testing.assert_close(y_cuda, y_ref, atol=1e-3, rtol=1e-3) + print("--- 断言通过 (数值接近) ---") \ No newline at end of file diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 70527db..0bc5c26 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,98 +1,225 @@ -#include -#include +#include +#include + +// --- Kernel 1: 二值化内核 --- +/** + * @brief 对输入张量进行逐元素二值化。 + * @details 这是一个简单的并行操作。它将输入张量中的背景像素(值<0.5)设置为0, + * 并将前景像素(值>=0.5)设置为一个极大的值(1e20f),这在距离变换的上下文中 + * 可以被认为是无穷大。 + * @param in 输入张量的数据指针。 + * @param out 输出张量的数据指针。 + * @param N 张量中的元素总数。 + */ +__global__ void binarize_kernel(const float* in, float* out, int64_t N) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N) { + out[idx] = (in[idx] < 0.5f) ? 0.0f : 1e20f; + } +} + +// --- Kernel 2: 1D Pass 距离平方计算内核 --- +/** + * @brief 沿着一个指定的空间维度,对N维张量执行一维抛物线下包络算法。 + * @details 这是Felzenszwalb和Huttenlocher EDT算法的核心。它通过将N维问题分解为N个 + * 一维问题来解决。此内核负责处理其中一个维度。它只计算距离的平方,以避免 + * 昂贵的开方运算并保持数值精度。 + * 每个CUDA线程块(block)负责处理一条完整的一维扫描线(slice)。 + * @param in_data 输入张量数据指针。 + * @param out_data 输出张量数据指针。 + * @param shape 描述输入张量形状的数组指针 (在GPU上)。 + * @param strides 描述输入张量步幅的数组指针 (在GPU上)。 + * @param ndim 张量的总维度数 (包括批处理维度)。 + * @param process_dim_sample 当前正在处理的空间维度索引 (0代表第一个空间维度,依此类推)。 + * @param total_slices 需要处理的一维扫描线总数 (batch_size * num_slices_per_sample)。 + * @param num_slices_per_sample 每个样本中,垂直于当前处理维度的扫描线数量。 + */ +__global__ void edt_1d_pass_sq_kernel( + const float* in_data, float* out_data, + const int64_t* shape, const int64_t* strides, + int32_t ndim, int32_t process_dim_sample, + int64_t total_slices, int64_t num_slices_per_sample +) { + // 每个线程块处理一条一维扫描线 + int64_t slice_idx = blockIdx.x; + if (slice_idx >= total_slices) return; + + + int64_t batch_idx = slice_idx / num_slices_per_sample; + int64_t 
slice_idx_in_sample = slice_idx % num_slices_per_sample; + int64_t batch_offset = batch_idx * strides[0]; // 获取批处理的基地址 + int64_t sample_base_offset = 0; + int64_t temp_idx = slice_idx_in_sample; + int sample_ndim = ndim - 1; + + // 从非处理维度中计算出样本内的基地址偏移 + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; // 跳过当前正在处理的维度 + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim == 0) continue; + int64_t coord_in_dim = temp_idx % size_of_dim; + temp_idx /= size_of_dim; + sample_base_offset += coord_in_dim * strides[d + 1]; + } + + const int64_t process_dim_actual = process_dim_sample + 1; // 加上批处理维度的实际索引 + const int64_t N = shape[process_dim_actual]; // 当前处理维度的长度 + const int64_t stride = strides[process_dim_actual]; // 沿当前维度移动一个元素所需的步幅 + const int64_t base_offset = batch_offset + sample_base_offset; // 最终的起始地址 -__global__ void edt_pass1_rows(const float* input, float* temp, int H, int W) { - int y = blockIdx.x; - if (y >= H) return; extern __shared__ float sdata[]; - float* f = sdata; - int* v = (int*)(sdata + W); - float* z = (float*)(v + W + 1); - - for (int x = threadIdx.x; x < W; x += blockDim.x) { - float val = input[y * W + x]; - // 【关键任务修改】 - // 如果像素是 0 (背景),则为源点 (距离0);否则为无穷远。 - f[x] = (val < 0.5f) ? 
0.0f : 1e10f; + float* f = sdata; // 存储函数值 g(p) = f(p) + p^2 + int* v = (int*)(sdata + N); // 存储抛物线顶点的索引 + float* z = (float*)(v + N + 1); // 存储相邻抛物线的交点 + + // 块内的所有线程协同将数据从全局内存加载到共享内存 + for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { + f[i] = in_data[base_offset + i * stride]; } - __syncthreads(); + __syncthreads(); // 等待所有线程完成加载 - if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; - z[0] = -1e10f; - z[1] = 1e10f; + //计算抛物线的下包络 + if (threadIdx.x == 0 && N > 0) { + int k = 0; // 下包络中的抛物线数量 + v[0] = 0; // 第一个抛物线的顶点索引为0 + z[0] = -1e20f; z[1] = 1e20f; // 初始化交点为负无穷和正无穷 - for (int q = 1; q < W; q++) { + // 遍历所有点,构建下包络 + for (int q = 1; q < N; q++) { float s; + // 寻找新的抛物线q应该插入的位置 while (true) { - int p = v[k]; + int p = v[k]; if (q == p) break; + // s 是抛物线 p 和 q 的交点的横坐标 s = ((f[q] + q * q) - (f[p] + p * p)) / (2.0f * (q - p)); + // 如果交点在当前区间的右侧,则找到了插入点 if (s > z[k]) { break; } - if (k == 0) { break; } + // 否则,抛物线p被q完全覆盖,需要移除p + if (k == 0) { break; } k--; } - k++; - v[k] = q; - z[k] = s; - z[k + 1] = 1e10f; + // 插入新的抛物线q + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e20f; } - + // 计算距离平方 k = 0; - for (int q = 0; q < W; q++) { - while (z[k + 1] < q) k++; - int p = v[k]; - temp[y * W + q] = (q - p) * (q - p) + f[p]; + // 遍历所有点,找到其头顶上方的下包络线段,并计算距离 + for (int q = 0; q < N; q++) { + while (z[k + 1] < q) k++; // 找到点q所属的区间 + int p = v[k]; // 获取该区间的抛物线顶点索引 + // 计算距离平方: D(q)^2 = (q - p)^2 + g(p) + out_data[base_offset + q * stride] = (q - p) * (q - p) + f[p]; } } } -// PASS 2: 对每一列进行操作 -__global__ void edt_pass2_cols(const float* temp, float* output, int H, int W) { - int x = blockIdx.x; - if (x >= W) return; - extern __shared__ float sdata[]; - float* f = sdata; - int* v = (int*)(sdata + H); - float* z = (float*)(v + H + 1); - - for (int y = threadIdx.x; y < H; y += blockDim.x) { - f[y] = temp[y * W + x]; +// --- Kernel 3: 开平方根内核 --- +/** + * @brief 对张量中的每个元素计算平方根。 + * @details 这是一个简单的逐元素操作。由于之前的1D pass计算的是距离的平方, + * 此内核在所有维度处理完毕后被调用,以得到最终的欧氏距离。 + * @param data 
需要进行开方操作的张量数据指针。 + * @param N 张量中的元素总数。 + */ +__global__ void sqrt_kernel(float* data, int64_t N) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N) { + data[idx] = sqrtf(data[idx]); } - __syncthreads(); +} - if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; - z[0] = -1e10f; - z[1] = 1e10f; +// --- 主调函数 (Host) --- +/** + * @brief 执行N维欧氏距离变换。 + * @param input 一个N维的PyTorch张量,第一个维度被视为批处理(batch)维度。 + * @return 一个与输入形状相同的张量,包含每个点到最近前景点(值>=0.5)的欧氏距离。 + */ +torch::Tensor distance_transform_cuda(torch::Tensor input) { + auto original_input = input; + + // --- 预处理: 统一输入格式 --- + // 确保所有输入都至少是3D的 (B, ...),方便后续统一处理。 + // 如果输入是 (H, W) 或 (L),则变为 (1, H, W) 或 (1, L)。 + bool had_no_batch_dim = (input.dim() <= 2); + if (had_no_batch_dim) { input = input.unsqueeze(0); } - for (int q = 1; q < H; q++) { + // 检查输入张量是否在CUDA上并且是内存连续的 + TORCH_CHECK(input.is_cuda(), "Input must be on a CUDA device."); + TORCH_CHECK(input.is_contiguous(), "Input tensor must be contiguous."); - float s; - while (true) { - int p = v[k]; - s = ((f[q] + q * q) - (f[p] + p * p)) / (2.0f * (q - p)); - if (s > z[k]) { - break; - } - if (k == 0) { - break; - } - k--; - } - k++; - v[k] = q; - z[k] = s; - z[k + 1] = 1e10f; - } + if (input.numel() == 0) { return torch::empty_like(original_input); } + + // --- 获取张量元数据 --- + const auto ndim = input.dim(); + const auto sample_ndim = ndim - 1; // 空间维度 = 总维度 - 1 (batch) + const auto batch_size = input.size(0); + const int64_t N_total = input.numel(); + + auto shape_vec = input.sizes().vec(); + auto strides_vec = input.strides().vec(); + + // --- 内存分配: 使用Ping-Pong缓冲策略 --- + // 分配两个缓冲区,在处理每个维度时交替作为输入和输出,避免原地读写冲突。 + auto output = torch::empty_like(input); + auto buffer = (sample_ndim > 0) ? 
torch::empty_like(input) : output; - k = 0; - for (int q = 0; q < H; q++) { - while (z[k + 1] < q) k++; - int p = v[k]; - output[q * W + x] = sqrtf((q - p) * (q - p) + f[p]); + //二值化 + int threads = 256; // 定义每个线程块的线程数 + int blocks = (N_total + threads - 1) / threads; // 计算启动的线程块数 + binarize_kernel<<>>(input.data_ptr(), buffer.data_ptr(), N_total); + + //循环调用 edt_1d_pass_sq_kernel + // 将shape和strides信息从CPU内存拷贝到GPU内存,以便内核可以访问 + int64_t *shape_gpu, *strides_gpu; + cudaMalloc(&shape_gpu, ndim * sizeof(int64_t)); + cudaMalloc(&strides_gpu, ndim * sizeof(int64_t)); + cudaMemcpy(shape_gpu, shape_vec.data(), ndim * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(strides_gpu, strides_vec.data(), ndim * sizeof(int64_t), cudaMemcpyHostToDevice); + + torch::Tensor current_input = buffer; + torch::Tensor current_output = output; + + // 遍历所有空间维度 + for (int32_t d_sample = 0; d_sample < sample_ndim; ++d_sample) { + // 为当前处理的维度计算启动内核所需的参数 + int64_t num_slices_per_sample = 1; + for(int i = 0; i < sample_ndim; ++i) { + if (i != d_sample) num_slices_per_sample *= shape_vec[i + 1]; } + int64_t total_slices = batch_size * num_slices_per_sample; + int64_t slice_len = shape_vec[d_sample + 1]; + + // 动态设置线程数和共享内存大小 + int threads_pass = (slice_len > 0 && slice_len < 256) ? 
slice_len : 256; + if (threads_pass == 0) threads_pass = 1; + size_t shared_mem_size = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + (slice_len + 2) * sizeof(float); + + edt_1d_pass_sq_kernel<<>>( + current_input.data_ptr(), current_output.data_ptr(), + shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + ); + // 交换输入和输出缓冲区,为下一个维度做准备 + std::swap(current_input, current_output); + } + + cudaFree(shape_gpu); + cudaFree(strides_gpu); + + //计算最终距离 + // 经过循环后,current_input 指向的是包含最终距离平方结果的张量 + sqrt_kernel<<>>(current_input.data_ptr(), N_total); + + // 如果最后一轮的输出不在我们期望的 output 张量里,就做一次拷贝 + if (current_input.data_ptr() != output.data_ptr()){ + output.copy_(current_input); } + + // 如果最初没有批处理维度,则移除我们添加的维度 + if (had_no_batch_dim) { output = output.squeeze(0); } + + return output; } \ No newline at end of file diff --git a/torchmorph/csrc/torchmorph.cpp b/torchmorph/csrc/torchmorph.cpp new file mode 100644 index 0000000..b7f466a --- /dev/null +++ b/torchmorph/csrc/torchmorph.cpp @@ -0,0 +1,10 @@ +#include + +// Declare CUDA implementations +torch::Tensor add_cuda(torch::Tensor input, float scalar); +torch::Tensor distance_transform_cuda(torch::Tensor input); + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("add_cuda", &add_cuda, "Add tensor with scalar"); + m.def("distance_transform_cuda", &distance_transform_cuda, "Distance transform"); +} \ No newline at end of file diff --git a/torchmorph/csrc/torchmorph.cu b/torchmorph/csrc/torchmorph.cu deleted file mode 100644 index dc2e4ce..0000000 --- a/torchmorph/csrc/torchmorph.cu +++ /dev/null @@ -1,51 +0,0 @@ -// ========================================================================= -// 内容保存到: torchmorph/csrc/torchmorph.cpp -// ========================================================================= - -#include - -// 函数声明:告诉 C++ 编译器,这两个 CUDA 内核函数是在别的文件里定义的 -// 这样 C++ 代码才能成功调用 .cu 文件里的内核 -__global__ void edt_pass1_rows(const float* input, float* temp, int H, int W); -__global__ void 
edt_pass2_cols(const float* temp, float* output, int H, int W); - - - -// 主调函数 (运行在 CPU 上) -torch::Tensor distance_transform_cuda(torch::Tensor input) { - // 检查输入张量是否在 CUDA 上,以及是否为二维 - TORCH_CHECK(input.is_cuda(), "Input must be on CUDA"); - TORCH_CHECK(input.dim() == 2, "Only 2D tensors supported"); - - int H = input.size(0); - int W = input.size(1); - - // 创建临时的和最终的输出张量 - auto temp = torch::empty_like(input); - auto output = torch::empty_like(input); - - // 计算动态共享内存的大小 - size_t shared_mem_pass1 = W * sizeof(float) + (W + 1) * sizeof(int) + (W + 2) * sizeof(float); - size_t shared_mem_pass2 = H * sizeof(float) + (H + 1) * sizeof(int) + (H + 2) * sizeof(float); - - // 设置每个块的线程数 - int threads_per_block = 32; - - // <<<...>>> 语法:启动 CUDA 内核 - // 参数:Grid大小, Block大小, 共享内存大小, (可选的流) - - // Pass 1: 每行启动一个 block - edt_pass1_rows<<>>( - input.data_ptr(), temp.data_ptr(), H, W); - - // Pass 2: 每列启动一个 block - edt_pass2_cols<<>>( - temp.data_ptr(), output.data_ptr(), H, W); - - return output; -} - -// 使用 PYBIND11 将 C++ 函数绑定到 Python 模块 -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("distance_transform", &distance_transform_cuda, "CUDA-accelerated Exact Euclidean Distance Transform"); -} \ No newline at end of file diff --git a/torchmorph/distance_transform.py b/torchmorph/distance_transform.py index b4cd458..7840158 100644 --- a/torchmorph/distance_transform.py +++ b/torchmorph/distance_transform.py @@ -6,4 +6,4 @@ def distance_transform(input: torch.Tensor) -> torch.Tensor: """Distance Transform in CUDA.""" if not input.is_cuda: raise ValueError("Input tensor must be on CUDA device.") - return _C.distance_transform(input) + return _C.distance_transform_cuda(input) \ No newline at end of file From f9420b25ffcca6115a49a7ec7c329c2db46583e7 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Sun, 2 Nov 2025 15:07:10 +0800 Subject: [PATCH 05/56] =?UTF-8?q?=E4=BF=AE=E5=A4=8DBUG:=E5=8E=9F=E5=85=88?= =?UTF-8?q?=E7=89=88=E6=9C=AC=E4=B8=AD=E8=AF=AF=E5=B0=860=E5=BD=93?= 
=?UTF-8?q?=E6=88=90=E8=83=8C=E6=99=AF=EF=BC=8C1=E5=BD=93=E6=88=90?= =?UTF-8?q?=E5=89=8D=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torchmorph/csrc/distance_transform_kernel.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 0bc5c26..d8cc8a7 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,11 +1,11 @@ #include #include -// --- Kernel 1: 二值化内核 --- -/** - * @brief 对输入张量进行逐元素二值化。 - * @details 这是一个简单的并行操作。它将输入张量中的背景像素(值<0.5)设置为0, - * 并将前景像素(值>=0.5)设置为一个极大的值(1e20f),这在距离变换的上下文中 +// --- Kernel 1: 二值化内核 --- +/* + * @brief 对输入张量进行逐元素二值化,为距离变换做准备。 + * @details 将前景点(in[idx] == 0)的初始距离设为0, + * 背景点的初始距离设为一个极大值(1e20f),这在距离变换的上下文中 * 可以被认为是无穷大。 * @param in 输入张量的数据指针。 * @param out 输出张量的数据指针。 @@ -14,7 +14,9 @@ __global__ void binarize_kernel(const float* in, float* out, int64_t N) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < N) { - out[idx] = (in[idx] < 0.5f) ? 0.0f : 1e20f; + // 如果输入像素为0,则为前景点,其距离为0。 + // 如果输入像素非0,则为背景点,其初始距离为无穷大。 + out[idx] = (in[idx] == 0.0f) ? 
0.0f : 1e20f; } } From 85bd1e546b2c53552868ff79b165e78c6a2176e5 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Mon, 3 Nov 2025 16:53:10 +0800 Subject: [PATCH 06/56] returns both distance and index --- test/test_distance_transform.py | 9 ++--- torchmorph/csrc/distance_transform_kernel.cu | 37 +++++++++++--------- torchmorph/csrc/torchmorph.cpp | 4 +-- torchmorph/distance_transform.py | 8 ++++- 4 files changed, 35 insertions(+), 23 deletions(-) diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 3feffb4..63b8568 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -73,13 +73,14 @@ def test_batch_processing(input_numpy, request): print(f"\n\n--- 正在运行测试: {request.node.callspec.id} ---") print(f"输入张量形状: {x.shape}") - y_cuda = tm.distance_transform(x.clone()) + dist_cuda, idx_cuda = tm.distance_transform(x.clone()) + print(f"Output index shape: {idx_cuda.shape}.") y_ref_numpy = batch_distance_transform_edt(x_numpy_contiguous) y_ref = torch.from_numpy(y_ref_numpy).to(torch.float32).cuda() - assert y_cuda.shape == y_ref.shape, f"形状不匹配! CUDA输出: {y_cuda.shape}, SciPy应为: {y_ref.shape}" + assert dist_cuda.shape == y_ref.shape, f"形状不匹配! 
CUDA输出: {dist_cuda.shape}, SciPy应为: {y_ref.shape}" print("CUDA 和 SciPy 输出形状匹配。") - torch.testing.assert_close(y_cuda, y_ref, atol=1e-3, rtol=1e-3) - print("--- 断言通过 (数值接近) ---") \ No newline at end of file + torch.testing.assert_close(dist_cuda, y_ref, atol=1e-3, rtol=1e-3) + print("--- 断言通过 (数值接近) ---") diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index d8cc8a7..534618f 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -11,7 +11,7 @@ * @param out 输出张量的数据指针。 * @param N 张量中的元素总数。 */ -__global__ void binarize_kernel(const float* in, float* out, int64_t N) { +__global__ void initialize_distance_kernel(const float* in, float* out, int64_t N) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < N) { // 如果输入像素为0,则为前景点,其距离为0。 @@ -140,7 +140,7 @@ __global__ void sqrt_kernel(float* data, int64_t N) { * @param input 一个N维的PyTorch张量,第一个维度被视为批处理(batch)维度。 * @return 一个与输入形状相同的张量,包含每个点到最近前景点(值>=0.5)的欧氏距离。 */ -torch::Tensor distance_transform_cuda(torch::Tensor input) { +std::tuple distance_transform_cuda(torch::Tensor input) { auto original_input = input; // --- 预处理: 统一输入格式 --- @@ -153,7 +153,6 @@ torch::Tensor distance_transform_cuda(torch::Tensor input) { TORCH_CHECK(input.is_cuda(), "Input must be on a CUDA device."); TORCH_CHECK(input.is_contiguous(), "Input tensor must be contiguous."); - if (input.numel() == 0) { return torch::empty_like(original_input); } // --- 获取张量元数据 --- const auto ndim = input.dim(); @@ -161,39 +160,45 @@ torch::Tensor distance_transform_cuda(torch::Tensor input) { const auto batch_size = input.size(0); const int64_t N_total = input.numel(); - auto shape_vec = input.sizes().vec(); + auto shape = input.sizes().vec(); + auto index_shape = shape; + index_shape.push_back(sample_ndim); + auto strides_vec = input.strides().vec(); // --- 内存分配: 使用Ping-Pong缓冲策略 --- // 分配两个缓冲区,在处理每个维度时交替作为输入和输出,避免原地读写冲突。 - auto output = 
torch::empty_like(input); - auto buffer = (sample_ndim > 0) ? torch::empty_like(input) : output; + auto distance = torch::zeros_like(input); + auto index = torch::zeros(index_shape); + auto buffer = (sample_ndim > 0) ? torch::empty_like(input) : distance; + + if (input.numel() == 0) { return std::make_tuple(distance, index); } //二值化 int threads = 256; // 定义每个线程块的线程数 int blocks = (N_total + threads - 1) / threads; // 计算启动的线程块数 - binarize_kernel<<>>(input.data_ptr(), buffer.data_ptr(), N_total); + initialize_distance_kernel<<>>(input.data_ptr(), buffer.data_ptr(), N_total); //循环调用 edt_1d_pass_sq_kernel // 将shape和strides信息从CPU内存拷贝到GPU内存,以便内核可以访问 int64_t *shape_gpu, *strides_gpu; cudaMalloc(&shape_gpu, ndim * sizeof(int64_t)); cudaMalloc(&strides_gpu, ndim * sizeof(int64_t)); - cudaMemcpy(shape_gpu, shape_vec.data(), ndim * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(shape_gpu, shape.data(), ndim * sizeof(int64_t), cudaMemcpyHostToDevice); cudaMemcpy(strides_gpu, strides_vec.data(), ndim * sizeof(int64_t), cudaMemcpyHostToDevice); torch::Tensor current_input = buffer; - torch::Tensor current_output = output; + torch::Tensor current_output = distance; // 遍历所有空间维度 for (int32_t d_sample = 0; d_sample < sample_ndim; ++d_sample) { // 为当前处理的维度计算启动内核所需的参数 int64_t num_slices_per_sample = 1; for(int i = 0; i < sample_ndim; ++i) { - if (i != d_sample) num_slices_per_sample *= shape_vec[i + 1]; + if (i != d_sample) num_slices_per_sample *= shape[i + 1]; } int64_t total_slices = batch_size * num_slices_per_sample; - int64_t slice_len = shape_vec[d_sample + 1]; + int64_t slice_len = shape[d_sample + 1]; // 动态设置线程数和共享内存大小 int threads_pass = (slice_len > 0 && slice_len < 256) ? 
slice_len : 256; @@ -216,12 +221,12 @@ torch::Tensor distance_transform_cuda(torch::Tensor input) { sqrt_kernel<<>>(current_input.data_ptr(), N_total); // 如果最后一轮的输出不在我们期望的 output 张量里,就做一次拷贝 - if (current_input.data_ptr() != output.data_ptr()){ - output.copy_(current_input); + if (current_input.data_ptr() != distance.data_ptr()){ + distance.copy_(current_input); } // 如果最初没有批处理维度,则移除我们添加的维度 - if (had_no_batch_dim) { output = output.squeeze(0); } + if (had_no_batch_dim) { distance = distance.squeeze(0); } - return output; -} \ No newline at end of file + return std::make_tuple(distance, index); +} diff --git a/torchmorph/csrc/torchmorph.cpp b/torchmorph/csrc/torchmorph.cpp index b7f466a..c79970c 100644 --- a/torchmorph/csrc/torchmorph.cpp +++ b/torchmorph/csrc/torchmorph.cpp @@ -2,9 +2,9 @@ // Declare CUDA implementations torch::Tensor add_cuda(torch::Tensor input, float scalar); -torch::Tensor distance_transform_cuda(torch::Tensor input); +std::tuple distance_transform_cuda(torch::Tensor input); PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("add_cuda", &add_cuda, "Add tensor with scalar"); m.def("distance_transform_cuda", &distance_transform_cuda, "Distance transform"); -} \ No newline at end of file +} diff --git a/torchmorph/distance_transform.py b/torchmorph/distance_transform.py index 7840158..0184be5 100644 --- a/torchmorph/distance_transform.py +++ b/torchmorph/distance_transform.py @@ -6,4 +6,10 @@ def distance_transform(input: torch.Tensor) -> torch.Tensor: """Distance Transform in CUDA.""" if not input.is_cuda: raise ValueError("Input tensor must be on CUDA device.") - return _C.distance_transform_cuda(input) \ No newline at end of file + if input.ndim < 2 or input.numel() == 0: + raise ValueError(f"Invalid input dimension: {input.shape}.") + + # binarize input + input[input != 0] = 1 + + return _C.distance_transform_cuda(input) From be4eb3dba55a34b80dc87089cd110d31715d7741 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Mon, 3 Nov 2025 
16:53:30 +0800 Subject: [PATCH 07/56] format --- test/test_distance_transform.py | 61 ++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 63b8568..f65a0b4 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -11,48 +11,59 @@ def batch_distance_transform_edt(batch_numpy): # (H, W) -> (1, H, W) if is_single_sample: batch_numpy = batch_numpy[np.newaxis, ...] - + results = [dte(sample) for sample in batch_numpy] - output = np.stack(results, axis=0) + output = np.stack(results, axis=0) # (1, H, W) -> (H, W) if is_single_sample: output = output.squeeze(0) - + return output + # 用例 1: 批处理的 2D 图像 -case_batch_2d = np.array([ - # 第 1 张图 - [[0, 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], - # 第 2 张图 - [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]] -], dtype=np.float32) +case_batch_2d = np.array( + [ + # 第 1 张图 + [[0, 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], + # 第 2 张图 + [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], + ], + dtype=np.float32, +) # 用例 2: 批处理的 3D 图像 -case_3d_sample1 = np.ones((4, 5, 6), dtype=np.float32); case_3d_sample1[1, 1, 1] = 0.0; case_3d_sample1[2, 3, 4] = 0.0 -case_3d_sample2 = np.ones((4, 5, 6), dtype=np.float32); case_3d_sample2[0, 0, 0] = 0.0 +case_3d_sample1 = np.ones((4, 5, 6), dtype=np.float32) +case_3d_sample1[1, 1, 1] = 0.0 +case_3d_sample1[2, 3, 4] = 0.0 +case_3d_sample2 = np.ones((4, 5, 6), dtype=np.float32) +case_3d_sample2[0, 0, 0] = 0.0 case_batch_3d = np.stack([case_3d_sample1, case_3d_sample2], axis=0) # 用例 3: 单张 2D 图像 (隐式批处理) -case_single_2d = np.array([ - [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], -], dtype=np.float32) +case_single_2d = np.array( + [ + [0, 1, 0, 1], + [1, 0, 1, 0], + [0, 1, 0, 1], + [1, 0, 1, 0], + ], + dtype=np.float32, +) # 用例 4: 单张 2D 图像 (显式批处理) case_explicit_batch_one = case_single_2d[np.newaxis, ...] 
# 用例 5: 含幺元维度的批处理 -case_dim_one = np.ones((2, 5, 1), dtype=np.float32) # 两张 5x1 的图片 +case_dim_one = np.ones((2, 5, 1), dtype=np.float32) # 两张 5x1 的图片 case_dim_one[0, 2, 0] = 0.0 case_dim_one[1, 4, 0] = 0.0 # 用例 6: 1D 张量的批处理 -case_batch_1d = np.array([ - [1, 1, 0, 1, 0, 1], - [0, 1, 1, 1, 1, 0] -], dtype=np.float32) +case_batch_1d = np.array([[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], dtype=np.float32) + @pytest.mark.parametrize( "input_numpy", @@ -63,7 +74,7 @@ def batch_distance_transform_edt(batch_numpy): pytest.param(case_explicit_batch_one, id="单张2D图像(显式批处理)"), pytest.param(case_dim_one, id="含幺元维度的批处理"), pytest.param(case_batch_1d, id="批处理1D数据"), - ] + ], ) def test_batch_processing(input_numpy, request): if not torch.cuda.is_available(): @@ -75,12 +86,14 @@ def test_batch_processing(input_numpy, request): print(f"输入张量形状: {x.shape}") dist_cuda, idx_cuda = tm.distance_transform(x.clone()) print(f"Output index shape: {idx_cuda.shape}.") - + y_ref_numpy = batch_distance_transform_edt(x_numpy_contiguous) y_ref = torch.from_numpy(y_ref_numpy).to(torch.float32).cuda() - - assert dist_cuda.shape == y_ref.shape, f"形状不匹配! CUDA输出: {dist_cuda.shape}, SciPy应为: {y_ref.shape}" + + assert ( + dist_cuda.shape == y_ref.shape + ), f"形状不匹配! 
CUDA输出: {dist_cuda.shape}, SciPy应为: {y_ref.shape}" print("CUDA 和 SciPy 输出形状匹配。") - + torch.testing.assert_close(dist_cuda, y_ref, atol=1e-3, rtol=1e-3) print("--- 断言通过 (数值接近) ---") From 8f835e37e59e2749eb93ef9efb4bd88836a44453 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Mon, 3 Nov 2025 18:00:28 +0800 Subject: [PATCH 08/56] benchmark --- benchmark/distance_transform.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 benchmark/distance_transform.py diff --git a/benchmark/distance_transform.py b/benchmark/distance_transform.py new file mode 100644 index 0000000..91219bb --- /dev/null +++ b/benchmark/distance_transform.py @@ -0,0 +1,24 @@ +import torch +import torch.utils.benchmark as benchmark +import scipy.ndimage as ndi +import torchmorph as tm + +for size in [64, 128, 256, 512, 1024, 2048]: + x = (torch.randn(1, 1, size, size, device="cuda") > 0).to(torch.float32) + + # TorchMorph CUDA + t1 = benchmark.Timer( + stmt="tm.distance_transform(x)", + setup="from __main__ import x, tm", + num_threads=torch.get_num_threads() + ) + # SciPy (CPU) + import numpy as np + x_np = x.cpu().squeeze().numpy() + t2 = benchmark.Timer( + stmt="ndi.distance_transform_edt(x_np)", + setup="from __main__ import x_np, ndi" + ) + + print(f"Size {size}:\n", t1.blocked_autorange()) + print(f"Size {size}:\n", t2.blocked_autorange()) From 7e108791dee292f7fcaa1e035e58384ce752adc8 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Mon, 3 Nov 2025 18:27:09 +0800 Subject: [PATCH 09/56] benchmark outputs tables --- benchmark/distance_transform.py | 93 ++++++++++++++++++++++++++------- 1 file changed, 74 insertions(+), 19 deletions(-) diff --git a/benchmark/distance_transform.py b/benchmark/distance_transform.py index 91219bb..c659c82 100644 --- a/benchmark/distance_transform.py +++ b/benchmark/distance_transform.py @@ -1,24 +1,79 @@ import torch import torch.utils.benchmark as benchmark import scipy.ndimage as ndi +import numpy as np 
+from prettytable import PrettyTable import torchmorph as tm -for size in [64, 128, 256, 512, 1024, 2048]: - x = (torch.randn(1, 1, size, size, device="cuda") > 0).to(torch.float32) - - # TorchMorph CUDA - t1 = benchmark.Timer( - stmt="tm.distance_transform(x)", - setup="from __main__ import x, tm", - num_threads=torch.get_num_threads() - ) - # SciPy (CPU) - import numpy as np - x_np = x.cpu().squeeze().numpy() - t2 = benchmark.Timer( - stmt="ndi.distance_transform_edt(x_np)", - setup="from __main__ import x_np, ndi" - ) - - print(f"Size {size}:\n", t1.blocked_autorange()) - print(f"Size {size}:\n", t2.blocked_autorange()) +sizes = [64, 128, 256, 512, 1024] +batches = [1, 4, 8, 16] +dtype = torch.float32 +device = "cuda" +MIN_RUN = 1.0 # seconds per measurement + +torch.set_num_threads(torch.get_num_threads()) + +for B in batches: + table = PrettyTable() + table.field_names = [ + "Size", + "SciPy (ms/img)", + "Torch 1× (ms/img)", + "Torch batch (ms/img)", + "Speedup 1×", + "Speedup batch", + ] + for c in table.field_names: + table.align[c] = "r" + + for s in sizes: + # Inputs + x = (torch.randn(B, 1, s, s, device=device) > 0).to(dtype) + x_np_list = [x[i, 0].detach().cpu().numpy() for i in range(B)] + x_imgs = [x[i:i+1] for i in range(B)] + + # SciPy (CPU, one-by-one) + stmt_scipy = "out = [ndi.distance_transform_edt(arr) for arr in x_np_list]" + t_scipy = benchmark.Timer( + stmt=stmt_scipy, + setup="from __main__ import x_np_list, ndi", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + scipy_per_img_ms = (t_scipy.median * 1e3) / B + + # Torch (CUDA, one-by-one) + stmt_torch1 = """ +for xi in x_imgs: + tm.distance_transform(xi) +""" + t_torch1 = benchmark.Timer( + stmt=stmt_torch1, + setup="from __main__ import x_imgs, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + torch1_per_img_ms = (t_torch1.median * 1e3) / B + + # Torch (CUDA, batched) + t_batch = benchmark.Timer( + 
stmt="tm.distance_transform(x)", + setup="from __main__ import x, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + torchB_per_img_ms = (t_batch.median * 1e3) / B + + # Speedups + speed1 = scipy_per_img_ms / torch1_per_img_ms + speedB = scipy_per_img_ms / torchB_per_img_ms + + table.add_row([ + s, + f"{scipy_per_img_ms:.3f}", + f"{torch1_per_img_ms:.3f}", + f"{torchB_per_img_ms:.3f}", + f"{speed1:.1f}×", + f"{speedB:.1f}×", + ]) + + print(f"\n=== Batch Size: {B} ===") + print(table) + From 7b6b8aae176f5dc8a7ac03898d84fb287fcea674 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Mon, 3 Nov 2025 21:31:14 +0800 Subject: [PATCH 10/56] prettytable --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index fc961bb..0df6115 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,4 +12,4 @@ flake8>=6.0 setuptools>=65.0 wheel>=0.40 ninja>=1.11 # optional, speeds up torch extension builds - +prettytable>=3.16.0 From 15933d9df18aefd1b10c38c5ef8e32e4ad967a76 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Thu, 20 Nov 2025 18:20:03 +0800 Subject: [PATCH 11/56] =?UTF-8?q?=E5=AE=9E=E7=8E=B0n=E7=BB=B4=E6=89=B9?= =?UTF-8?q?=E5=A4=84=E7=90=86=E5=90=8C=E6=97=B6=E8=BF=94=E5=9B=9E=E5=9D=90?= =?UTF-8?q?=E6=A0=87=E5=92=8C=E8=B7=9D=E7=A6=BB=E7=9A=84=E7=B2=BE=E7=A1=AE?= =?UTF-8?q?=E6=AC=A7=E5=BC=8F=E8=B7=9D=E7=A6=BB=E5=8F=98=E6=8D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- test/test_distance_transform.py | 136 ++-- torchmorph/csrc/distance_transform_kernel.cu | 731 ++++++++++++++----- 3 files changed, 619 insertions(+), 250 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 10af0cd..a4d6163 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,5 +18,5 @@ addopts = "-v --import-mode=importlib" testpaths = ["test"] [build-system] -requires = 
["setuptools>=61.0", "wheel", "torch"] +requires = ["setuptools>=61.0", "wheel", "torch", "numpy"] build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index f65a0b4..3a11166 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -1,70 +1,37 @@ import torch import pytest -from scipy.ndimage import distance_transform_edt as dte -import torchmorph as tm +from scipy.ndimage import distance_transform_edt as scipy_edt import numpy as np +import torchmorph as tm - -def batch_distance_transform_edt(batch_numpy): - +# 辅助函数:调用 SciPy 并处理格式 +def batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, np.ndarray]: is_single_sample = batch_numpy.ndim <= 2 - # (H, W) -> (1, H, W) if is_single_sample: batch_numpy = batch_numpy[np.newaxis, ...] - - results = [dte(sample) for sample in batch_numpy] - output = np.stack(results, axis=0) - # (1, H, W) -> (H, W) + dist_results, indices_results = [], [] + for sample in batch_numpy: + dist, indices = scipy_edt(sample, return_indices=True, return_distances=True) + dist_results.append(dist) + indices_results.append(indices) + output_dist = np.stack(dist_results, axis=0) + output_indices = np.stack(indices_results, axis=0) + output_indices = np.moveaxis(output_indices, 1, -1) if is_single_sample: - output = output.squeeze(0) - - return output - - -# 用例 1: 批处理的 2D 图像 -case_batch_2d = np.array( - [ - # 第 1 张图 - [[0, 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], - # 第 2 张图 - [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], - ], - dtype=np.float32, -) - - -# 用例 2: 批处理的 3D 图像 -case_3d_sample1 = np.ones((4, 5, 6), dtype=np.float32) -case_3d_sample1[1, 1, 1] = 0.0 -case_3d_sample1[2, 3, 4] = 0.0 -case_3d_sample2 = np.ones((4, 5, 6), dtype=np.float32) -case_3d_sample2[0, 0, 0] = 0.0 -case_batch_3d = np.stack([case_3d_sample1, case_3d_sample2], axis=0) - -# 用例 3: 单张 2D 图像 (隐式批处理) -case_single_2d = np.array( - [ - [0, 1, 
0, 1], - [1, 0, 1, 0], - [0, 1, 0, 1], - [1, 0, 1, 0], - ], - dtype=np.float32, -) - - -# 用例 4: 单张 2D 图像 (显式批处理) + output_dist = output_dist.squeeze(0) + output_indices = output_indices.squeeze(0) + return output_dist, output_indices + +# 用例定义 +case_batch_2d = np.array([[[0., 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]],[[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]]], dtype=np.float32) +_case_3d_s1 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s1[1, 1, 1] = 0.0; _case_3d_s1[2, 3, 4] = 0.0 +_case_3d_s2 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s2[0, 0, 0] = 0.0 +case_batch_3d = np.stack([_case_3d_s1, _case_3d_s2], axis=0) +case_single_2d = np.array([[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0]], dtype=np.float32) case_explicit_batch_one = case_single_2d[np.newaxis, ...] - -# 用例 5: 含幺元维度的批处理 -case_dim_one = np.ones((2, 5, 1), dtype=np.float32) # 两张 5x1 的图片 -case_dim_one[0, 2, 0] = 0.0 -case_dim_one[1, 4, 0] = 0.0 - -# 用例 6: 1D 张量的批处理 +case_dim_one = np.ones((2, 5, 1), dtype=np.float32); case_dim_one[0, 2, 0] = 0.0; case_dim_one[1, 4, 0] = 0.0 case_batch_1d = np.array([[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], dtype=np.float32) - @pytest.mark.parametrize( "input_numpy", [ @@ -76,24 +43,49 @@ def batch_distance_transform_edt(batch_numpy): pytest.param(case_batch_1d, id="批处理1D数据"), ], ) -def test_batch_processing(input_numpy, request): +def test_distance_transform_and_indices(input_numpy: np.ndarray, request: pytest.FixtureRequest): if not torch.cuda.is_available(): pytest.skip("CUDA not available") + x_numpy_contiguous = np.ascontiguousarray(input_numpy) - x = torch.from_numpy(x_numpy_contiguous).cuda() + x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() print(f"\n\n--- 正在运行测试: {request.node.callspec.id} ---") - print(f"输入张量形状: {x.shape}") - dist_cuda, idx_cuda = tm.distance_transform(x.clone()) - print(f"Output index shape: {idx_cuda.shape}.") - - y_ref_numpy = batch_distance_transform_edt(x_numpy_contiguous) - y_ref = 
torch.from_numpy(y_ref_numpy).to(torch.float32).cuda() - - assert ( - dist_cuda.shape == y_ref.shape - ), f"形状不匹配! CUDA输出: {dist_cuda.shape}, SciPy应为: {y_ref.shape}" - print("CUDA 和 SciPy 输出形状匹配。") - - torch.testing.assert_close(dist_cuda, y_ref, atol=1e-3, rtol=1e-3) - print("--- 断言通过 (数值接近) ---") + print(f"输入张量形状: {x_cuda.shape}") + + # 调用您的 Python 包装函数 + dist_cuda, idx_cuda = tm.distance_transform(x_cuda.clone()) + + print(f"CUDA 距离输出形状: {dist_cuda.shape}") + print(f"CUDA 坐标输出形状: {idx_cuda.shape}") + + # 调用 SciPy 作为参考基准 + dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(x_numpy_contiguous) + dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() + + print(f"SciPy 距离输出形状: {dist_ref.shape}") + + # 断言验证 + print("\n--- 正在验证距离... ---") + assert dist_cuda.shape == dist_ref.shape + torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) + print("距离断言通过 (形状和数值接近)。") + + print("\n--- 正在验证坐标... ---") + + # 鲁棒的坐标验证逻辑 + had_no_batch_dim = (x_numpy_contiguous.ndim <= idx_cuda.shape[-1]) + spatial_shape = x_cuda.shape if had_no_batch_dim else x_cuda.shape[1:] + coords = [torch.arange(s, device='cuda') for s in spatial_shape] + grid = torch.stack(torch.meshgrid(*coords, indexing='ij'), dim=-1) + + if not had_no_batch_dim: + grid = grid.unsqueeze(0) + + diff = grid.float() - idx_cuda.float() + dist_sq_from_indices = torch.sum(diff * diff, dim=-1) + + torch.testing.assert_close(dist_sq_from_indices, dist_cuda * dist_cuda, atol=1e-3, rtol=1e-3) + print("坐标正确性断言通过 (计算出的距离与返回距离匹配)。") + + print("--- 测试通过 ---") \ No newline at end of file diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 534618f..90785bd 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,232 +1,609 @@ #include #include +#include +#include +#include +#include -// --- Kernel 1: 二值化内核 --- -/* - * @brief 对输入张量进行逐元素二值化,为距离变换做准备。 - * @details 将前景点(in[idx] == 
0)的初始距离设为0, - * 背景点的初始距离设为一个极大值(1e20f),这在距离变换的上下文中 - * 可以被认为是无穷大。 - * @param in 输入张量的数据指针。 - * @param out 输出张量的数据指针。 - * @param N 张量中的元素总数。 - */ -__global__ void initialize_distance_kernel(const float* in, float* out, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < N) { - // 如果输入像素为0,则为前景点,其距离为0。 - // 如果输入像素非0,则为背景点,其初始距离为无穷大。 - out[idx] = (in[idx] == 0.0f) ? 0.0f : 1e20f; +// 优化策略:用4个独立的内核函数替代模板,完全消除分支 + +// 内核1: 第一个pass且是唯一pass (1D情况) +__global__ void edt_kernel_first_final( + const float* in_data, + float* out_dist, + int32_t* out_idx, + const int64_t* shape, + const int64_t* strides, + int32_t ndim, + int32_t process_dim_sample, + int64_t total_slices, + int64_t num_slices_per_sample +) { + int64_t slice_idx = blockIdx.x; + if (slice_idx >= total_slices) return; + + int64_t batch_idx = slice_idx / num_slices_per_sample; + int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; + int64_t batch_offset = batch_idx * strides[0]; + int64_t sample_base_offset = 0; + int64_t temp_idx = slice_idx_in_sample; + const int sample_ndim = ndim - 1; + + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; + } + } + + const int64_t process_dim_actual = process_dim_sample + 1; + const int64_t N = shape[process_dim_actual]; + const int64_t stride = strides[process_dim_actual]; + const int64_t base_offset = batch_offset + sample_base_offset; + + if (N == 0) return; + + extern __shared__ char s_buffer[]; + float* f = (float*)s_buffer; + int* v = (int*)(f + N); + float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); + + // 加载数据 - 第一个pass + for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { + int64_t global_offset = base_offset + i * stride; + float val = 
__ldg(&in_data[global_offset]); + int32_t* shared_idx_ptr = s_idx + i * sample_ndim; + + if (val == 0.0f) { + f[i] = 0.0f; + int64_t temp_coord = slice_idx_in_sample; + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + shared_idx_ptr[d] = temp_coord % size_of_dim; + temp_coord /= size_of_dim; + } else { + shared_idx_ptr[d] = 0; + } + } + shared_idx_ptr[process_dim_sample] = i; + } else { + f[i] = 1e20f; + for (int d = 0; d < sample_ndim; ++d) shared_idx_ptr[d] = -1; + } + } + __syncthreads(); + + // 构建包络 + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e20f; + z[1] = 1e20f; + + for (int q = 1; q < N; q++) { + float fq = f[q]; + int q_sq = q * q; + + while (k >= 0) { + int p = v[k]; + float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e20f; + break; + } + k--; + if (k < 0) { + k = 0; + v[0] = q; + z[0] = -1e20f; + z[1] = 1e20f; + break; + } + } + } + } + __syncthreads(); + + // 计算距离 - 最后一个pass,直接开方 + for (int q = threadIdx.x; q < N; q += blockDim.x) { + int k = 0; + float q_float = (float)q; + while (z[k + 1] < q_float) k++; + + int p = v[k]; + int64_t global_offset = base_offset + q * stride; + float dist_sq = (float)(q - p) * (q - p) + f[p]; + + out_dist[global_offset] = sqrtf(dist_sq); // 直接开方 + + int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; + const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; + } } } -// --- Kernel 2: 1D Pass 距离平方计算内核 --- -/** - * @brief 沿着一个指定的空间维度,对N维张量执行一维抛物线下包络算法。 - * @details 这是Felzenszwalb和Huttenlocher EDT算法的核心。它通过将N维问题分解为N个 - * 一维问题来解决。此内核负责处理其中一个维度。它只计算距离的平方,以避免 - * 昂贵的开方运算并保持数值精度。 - * 每个CUDA线程块(block)负责处理一条完整的一维扫描线(slice)。 - * @param in_data 输入张量数据指针。 - * @param out_data 输出张量数据指针。 - * @param shape 描述输入张量形状的数组指针 (在GPU上)。 - * @param strides 描述输入张量步幅的数组指针 
(在GPU上)。 - * @param ndim 张量的总维度数 (包括批处理维度)。 - * @param process_dim_sample 当前正在处理的空间维度索引 (0代表第一个空间维度,依此类推)。 - * @param total_slices 需要处理的一维扫描线总数 (batch_size * num_slices_per_sample)。 - * @param num_slices_per_sample 每个样本中,垂直于当前处理维度的扫描线数量。 - */ -__global__ void edt_1d_pass_sq_kernel( - const float* in_data, float* out_data, - const int64_t* shape, const int64_t* strides, - int32_t ndim, int32_t process_dim_sample, - int64_t total_slices, int64_t num_slices_per_sample +// 内核2: 第一个pass但不是最后 +__global__ void edt_kernel_first_only( + const float* in_data, + float* out_dist, + int32_t* out_idx, + const int64_t* shape, + const int64_t* strides, + int32_t ndim, + int32_t process_dim_sample, + int64_t total_slices, + int64_t num_slices_per_sample ) { - // 每个线程块处理一条一维扫描线 int64_t slice_idx = blockIdx.x; if (slice_idx >= total_slices) return; - int64_t batch_idx = slice_idx / num_slices_per_sample; int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t batch_offset = batch_idx * strides[0]; // 获取批处理的基地址 + int64_t batch_offset = batch_idx * strides[0]; int64_t sample_base_offset = 0; int64_t temp_idx = slice_idx_in_sample; - int sample_ndim = ndim - 1; + const int sample_ndim = ndim - 1; - // 从非处理维度中计算出样本内的基地址偏移 for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; // 跳过当前正在处理的维度 + if (d == process_dim_sample) continue; int64_t size_of_dim = shape[d + 1]; - if (size_of_dim == 0) continue; - int64_t coord_in_dim = temp_idx % size_of_dim; - temp_idx /= size_of_dim; - sample_base_offset += coord_in_dim * strides[d + 1]; + if (size_of_dim > 0) { + sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; + } } - const int64_t process_dim_actual = process_dim_sample + 1; // 加上批处理维度的实际索引 - const int64_t N = shape[process_dim_actual]; // 当前处理维度的长度 - const int64_t stride = strides[process_dim_actual]; // 沿当前维度移动一个元素所需的步幅 - const int64_t base_offset = batch_offset + sample_base_offset; // 最终的起始地址 + 
const int64_t process_dim_actual = process_dim_sample + 1; + const int64_t N = shape[process_dim_actual]; + const int64_t stride = strides[process_dim_actual]; + const int64_t base_offset = batch_offset + sample_base_offset; + if (N == 0) return; - extern __shared__ float sdata[]; - float* f = sdata; // 存储函数值 g(p) = f(p) + p^2 - int* v = (int*)(sdata + N); // 存储抛物线顶点的索引 - float* z = (float*)(v + N + 1); // 存储相邻抛物线的交点 + extern __shared__ char s_buffer[]; + float* f = (float*)s_buffer; + int* v = (int*)(f + N); + float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); - // 块内的所有线程协同将数据从全局内存加载到共享内存 for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - f[i] = in_data[base_offset + i * stride]; + int64_t global_offset = base_offset + i * stride; + float val = __ldg(&in_data[global_offset]); + int32_t* shared_idx_ptr = s_idx + i * sample_ndim; + + if (val == 0.0f) { + f[i] = 0.0f; + int64_t temp_coord = slice_idx_in_sample; + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + shared_idx_ptr[d] = temp_coord % size_of_dim; + temp_coord /= size_of_dim; + } else { + shared_idx_ptr[d] = 0; + } + } + shared_idx_ptr[process_dim_sample] = i; + } else { + f[i] = 1e20f; + for (int d = 0; d < sample_ndim; ++d) shared_idx_ptr[d] = -1; + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e20f; + z[1] = 1e20f; + + for (int q = 1; q < N; q++) { + float fq = f[q]; + int q_sq = q * q; + + while (k >= 0) { + int p = v[k]; + float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e20f; + break; + } + k--; + if (k < 0) { + k = 0; + v[0] = q; + z[0] = -1e20f; + z[1] = 1e20f; + break; + } + } + } + } + __syncthreads(); + + for (int q = threadIdx.x; q < N; q += blockDim.x) { + int k = 0; + float q_float = (float)q; + 
while (z[k + 1] < q_float) k++; + + int p = v[k]; + int64_t global_offset = base_offset + q * stride; + float dist_sq = (float)(q - p) * (q - p) + f[p]; + + out_dist[global_offset] = dist_sq; // 不开方 + + int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; + const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; + } + } +} + +// 内核3: 中间pass +__global__ void edt_kernel_middle( + const float* in_dist, + const int32_t* in_idx, + float* out_dist, + int32_t* out_idx, + const int64_t* shape, + const int64_t* strides, + int32_t ndim, + int32_t process_dim_sample, + int64_t total_slices, + int64_t num_slices_per_sample +) { + int64_t slice_idx = blockIdx.x; + if (slice_idx >= total_slices) return; + + int64_t batch_idx = slice_idx / num_slices_per_sample; + int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; + int64_t batch_offset = batch_idx * strides[0]; + int64_t sample_base_offset = 0; + int64_t temp_idx = slice_idx_in_sample; + const int sample_ndim = ndim - 1; + + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; + } } - __syncthreads(); // 等待所有线程完成加载 + + const int64_t process_dim_actual = process_dim_sample + 1; + const int64_t N = shape[process_dim_actual]; + const int64_t stride = strides[process_dim_actual]; + const int64_t base_offset = batch_offset + sample_base_offset; + + if (N == 0) return; - //计算抛物线的下包络 - if (threadIdx.x == 0 && N > 0) { - int k = 0; // 下包络中的抛物线数量 - v[0] = 0; // 第一个抛物线的顶点索引为0 - z[0] = -1e20f; z[1] = 1e20f; // 初始化交点为负无穷和正无穷 + extern __shared__ char s_buffer[]; + float* f = (float*)s_buffer; + int* v = (int*)(f + N); + float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); - // 
遍历所有点,构建下包络 + for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { + int64_t global_offset = base_offset + i * stride; + f[i] = __ldg(&in_dist[global_offset]); + + const int32_t* global_idx_ptr = in_idx + global_offset * sample_ndim; + int32_t* shared_idx_ptr = s_idx + i * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + shared_idx_ptr[d] = __ldg(&global_idx_ptr[d]); + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e20f; + z[1] = 1e20f; + for (int q = 1; q < N; q++) { - float s; - // 寻找新的抛物线q应该插入的位置 - while (true) { - int p = v[k]; if (q == p) break; - // s 是抛物线 p 和 q 的交点的横坐标 - s = ((f[q] + q * q) - (f[p] + p * p)) / (2.0f * (q - p)); - // 如果交点在当前区间的右侧,则找到了插入点 - if (s > z[k]) { break; } - // 否则,抛物线p被q完全覆盖,需要移除p - if (k == 0) { break; } + float fq = f[q]; + int q_sq = q * q; + + while (k >= 0) { + int p = v[k]; + float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e20f; + break; + } k--; + if (k < 0) { + k = 0; + v[0] = q; + z[0] = -1e20f; + z[1] = 1e20f; + break; + } } - // 插入新的抛物线q - k++; - v[k] = q; - z[k] = s; - z[k + 1] = 1e20f; - } - // 计算距离平方 - k = 0; - // 遍历所有点,找到其头顶上方的下包络线段,并计算距离 - for (int q = 0; q < N; q++) { - while (z[k + 1] < q) k++; // 找到点q所属的区间 - int p = v[k]; // 获取该区间的抛物线顶点索引 - // 计算距离平方: D(q)^2 = (q - p)^2 + g(p) - out_data[base_offset + q * stride] = (q - p) * (q - p) + f[p]; + } + } + __syncthreads(); + + for (int q = threadIdx.x; q < N; q += blockDim.x) { + int k = 0; + float q_float = (float)q; + while (z[k + 1] < q_float) k++; + + int p = v[k]; + int64_t global_offset = base_offset + q * stride; + float dist_sq = (float)(q - p) * (q - p) + f[p]; + + out_dist[global_offset] = dist_sq; // 不开方 + + int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; + const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; } } } -// --- Kernel 3: 开平方根内核 --- 
-/** - * @brief 对张量中的每个元素计算平方根。 - * @details 这是一个简单的逐元素操作。由于之前的1D pass计算的是距离的平方, - * 此内核在所有维度处理完毕后被调用,以得到最终的欧氏距离。 - * @param data 需要进行开方操作的张量数据指针。 - * @param N 张量中的元素总数。 - */ -__global__ void sqrt_kernel(float* data, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < N) { - data[idx] = sqrtf(data[idx]); +// 内核4: 最后一个pass +__global__ void edt_kernel_final( + const float* in_dist, + const int32_t* in_idx, + float* out_dist, + int32_t* out_idx, + const int64_t* shape, + const int64_t* strides, + int32_t ndim, + int32_t process_dim_sample, + int64_t total_slices, + int64_t num_slices_per_sample +) { + int64_t slice_idx = blockIdx.x; + if (slice_idx >= total_slices) return; + + int64_t batch_idx = slice_idx / num_slices_per_sample; + int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; + int64_t batch_offset = batch_idx * strides[0]; + int64_t sample_base_offset = 0; + int64_t temp_idx = slice_idx_in_sample; + const int sample_ndim = ndim - 1; + + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; + } + } + + const int64_t process_dim_actual = process_dim_sample + 1; + const int64_t N = shape[process_dim_actual]; + const int64_t stride = strides[process_dim_actual]; + const int64_t base_offset = batch_offset + sample_base_offset; + + if (N == 0) return; + + extern __shared__ char s_buffer[]; + float* f = (float*)s_buffer; + int* v = (int*)(f + N); + float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); + + for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { + int64_t global_offset = base_offset + i * stride; + f[i] = __ldg(&in_dist[global_offset]); + + const int32_t* global_idx_ptr = in_idx + global_offset * sample_ndim; + int32_t* shared_idx_ptr = s_idx + i * 
sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + shared_idx_ptr[d] = __ldg(&global_idx_ptr[d]); + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e20f; + z[1] = 1e20f; + + for (int q = 1; q < N; q++) { + float fq = f[q]; + int q_sq = q * q; + + while (k >= 0) { + int p = v[k]; + float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e20f; + break; + } + k--; + if (k < 0) { + k = 0; + v[0] = q; + z[0] = -1e20f; + z[1] = 1e20f; + break; + } + } + } + } + __syncthreads(); + + for (int q = threadIdx.x; q < N; q += blockDim.x) { + int k = 0; + float q_float = (float)q; + while (z[k + 1] < q_float) k++; + + int p = v[k]; + int64_t global_offset = base_offset + q * stride; + float dist_sq = (float)(q - p) * (q - p) + f[p]; + + out_dist[global_offset] = sqrtf(dist_sq); // 最后开方 + + int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; + const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; + } } } -// --- 主调函数 (Host) --- -/** - * @brief 执行N维欧氏距离变换。 - * @param input 一个N维的PyTorch张量,第一个维度被视为批处理(batch)维度。 - * @return 一个与输入形状相同的张量,包含每个点到最近前景点(值>=0.5)的欧氏距离。 - */ +// Host函数 std::tuple distance_transform_cuda(torch::Tensor input) { - auto original_input = input; - - // --- 预处理: 统一输入格式 --- - // 确保所有输入都至少是3D的 (B, ...),方便后续统一处理。 - // 如果输入是 (H, W) 或 (L),则变为 (1, H, W) 或 (1, L)。 + TORCH_CHECK(input.is_cuda(), "Input must be on a CUDA device."); + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be a float tensor."); + input = input.contiguous(); + bool had_no_batch_dim = (input.dim() <= 2); if (had_no_batch_dim) { input = input.unsqueeze(0); } - // 检查输入张量是否在CUDA上并且是内存连续的 - TORCH_CHECK(input.is_cuda(), "Input must be on a CUDA device."); - TORCH_CHECK(input.is_contiguous(), "Input tensor must be contiguous."); - - - // --- 获取张量元数据 --- const auto ndim = input.dim(); - const 
auto sample_ndim = ndim - 1; // 空间维度 = 总维度 - 1 (batch) + const auto sample_ndim = ndim - 1; const auto batch_size = input.size(0); - const int64_t N_total = input.numel(); auto shape = input.sizes().vec(); - auto index_shape = shape; - index_shape.push_back(sample_ndim); - auto strides_vec = input.strides().vec(); - // --- 内存分配: 使用Ping-Pong缓冲策略 --- - // 分配两个缓冲区,在处理每个维度时交替作为输入和输出,避免原地读写冲突。 - auto distance = torch::zeros_like(input); - auto index = torch::zeros(index_shape); - auto buffer = (sample_ndim > 0) ? torch::empty_like(input) : distance; - - if (input.numel() == 0) { return std::make_tuple(distance, index); } - - //二值化 - int threads = 256; // 定义每个线程块的线程数 - int blocks = (N_total + threads - 1) / threads; // 计算启动的线程块数 - initialize_distance_kernel<<>>(input.data_ptr(), buffer.data_ptr(), N_total); - - //循环调用 edt_1d_pass_sq_kernel - // 将shape和strides信息从CPU内存拷贝到GPU内存,以便内核可以访问 - int64_t *shape_gpu, *strides_gpu; - cudaMalloc(&shape_gpu, ndim * sizeof(int64_t)); - cudaMalloc(&strides_gpu, ndim * sizeof(int64_t)); - cudaMemcpy(shape_gpu, shape.data(), ndim * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(strides_gpu, strides_vec.data(), ndim * sizeof(int64_t), cudaMemcpyHostToDevice); - - torch::Tensor current_input = buffer; - torch::Tensor current_output = distance; - - // 遍历所有空间维度 - for (int32_t d_sample = 0; d_sample < sample_ndim; ++d_sample) { - // 为当前处理的维度计算启动内核所需的参数 - int64_t num_slices_per_sample = 1; - for(int i = 0; i < sample_ndim; ++i) { - if (i != d_sample) num_slices_per_sample *= shape[i + 1]; - } - int64_t total_slices = batch_size * num_slices_per_sample; - int64_t slice_len = shape[d_sample + 1]; - - // 动态设置线程数和共享内存大小 - int threads_pass = (slice_len > 0 && slice_len < 256) ? 
slice_len : 256; - if (threads_pass == 0) threads_pass = 1; - size_t shared_mem_size = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + (slice_len + 2) * sizeof(float); - - edt_1d_pass_sq_kernel<<>>( - current_input.data_ptr(), current_output.data_ptr(), - shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample - ); - // 交换输入和输出缓冲区,为下一个维度做准备 - std::swap(current_input, current_output); + if (input.numel() == 0) { + auto distance = torch::empty_like(input); + auto index_shape = shape; + index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); + auto index = torch::empty(index_shape, input.options().dtype(torch::kInt32)); + if (had_no_batch_dim) return std::make_tuple(distance.squeeze(0), index.squeeze(0)); + return std::make_tuple(distance, index); } - cudaFree(shape_gpu); - cudaFree(strides_gpu); + auto distance = torch::empty_like(input); + auto index_options = input.options().dtype(torch::kInt32); + auto index_shape = shape; + index_shape.push_back(sample_ndim > 0 ? 
sample_ndim : 1); + auto index = torch::empty(index_shape, index_options); - //计算最终距离 - // 经过循环后,current_input 指向的是包含最终距离平方结果的张量 - sqrt_kernel<<>>(current_input.data_ptr(), N_total); + if (torch::all(input != 0).item()) { + distance.fill_(std::numeric_limits::infinity()); + index.fill_(-1); + if (had_no_batch_dim) { + return std::make_tuple(distance.squeeze(0), index.squeeze(0)); + } + return std::make_tuple(distance, index); + } + + auto shape_tensor = torch::tensor(shape, + torch::TensorOptions().dtype(torch::kInt64).device(input.device())); + auto strides_tensor = torch::tensor(strides_vec, + torch::TensorOptions().dtype(torch::kInt64).device(input.device())); - // 如果最后一轮的输出不在我们期望的 output 张量里,就做一次拷贝 - if (current_input.data_ptr() != distance.data_ptr()){ - distance.copy_(current_input); + const int64_t* shape_gpu = shape_tensor.data_ptr(); + const int64_t* strides_gpu = strides_tensor.data_ptr(); + + std::vector> dim_order_pairs; + for (int32_t d_sample = 0; d_sample < sample_ndim; ++d_sample) { + dim_order_pairs.push_back({strides_vec[d_sample + 1], d_sample}); + } + std::sort(dim_order_pairs.rbegin(), dim_order_pairs.rend()); + + if (sample_ndim == 0) { + int64_t total_slices = batch_size; + int64_t slice_len = (shape.size() > 1) ? 
shape[1] : 0; + int threads = std::min((int64_t)256, slice_len); + size_t smem = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + + (slice_len + 2) * sizeof(float) + slice_len * 1 * sizeof(int32_t); + + edt_kernel_first_final<<>>( + input.data_ptr(), + distance.data_ptr(), index.data_ptr(), + shape_gpu, strides_gpu, ndim, 0, total_slices, 1 + ); + } else { + auto buffer_dist = torch::empty_like(distance); + auto buffer_idx = torch::empty_like(index); + + for (int pass = 0; pass < sample_ndim; ++pass) { + int32_t d_sample = dim_order_pairs[pass].second; + bool is_first = (pass == 0); + bool is_final = (pass == sample_ndim - 1); + + torch::Tensor *in_dist, *in_idx, *out_dist, *out_idx; + + if (pass % 2 == 0) { + in_dist = &distance; in_idx = &index; + out_dist = &buffer_dist; out_idx = &buffer_idx; + } else { + in_dist = &buffer_dist; in_idx = &buffer_idx; + out_dist = &distance; out_idx = &index; + } + + int64_t num_slices_per_sample = 1; + for(int i = 0; i < sample_ndim; ++i) { + if (i != d_sample) num_slices_per_sample *= shape[i + 1]; + } + int64_t total_slices = batch_size * num_slices_per_sample; + int64_t slice_len = shape[d_sample + 1]; + + int threads = std::min((int64_t)256, slice_len); + size_t smem = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + + (slice_len + 2) * sizeof(float) + slice_len * sample_ndim * sizeof(int32_t); + + if (is_first && is_final) { + edt_kernel_first_final<<>>( + input.data_ptr(), + out_dist->data_ptr(), out_idx->data_ptr(), + shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + ); + } else if (is_first) { + edt_kernel_first_only<<>>( + input.data_ptr(), + out_dist->data_ptr(), out_idx->data_ptr(), + shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + ); + } else if (is_final) { + edt_kernel_final<<>>( + in_dist->data_ptr(), in_idx->data_ptr(), + out_dist->data_ptr(), out_idx->data_ptr(), + shape_gpu, strides_gpu, ndim, d_sample, total_slices, 
num_slices_per_sample + ); + } else { + edt_kernel_middle<<>>( + in_dist->data_ptr(), in_idx->data_ptr(), + out_dist->data_ptr(), out_idx->data_ptr(), + shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + ); + } + } + + if (sample_ndim % 2 != 0) { + distance.copy_(buffer_dist); + index.copy_(buffer_idx); + } } - // 如果最初没有批处理维度,则移除我们添加的维度 - if (had_no_batch_dim) { distance = distance.squeeze(0); } + if (had_no_batch_dim) { + return std::make_tuple(distance.squeeze(0), index.squeeze(0)); + } return std::make_tuple(distance, index); } From 11f067d24d841d0d0071236f8f221f0d7ca30578 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Thu, 20 Nov 2025 19:22:11 +0800 Subject: [PATCH 12/56] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E8=B0=83=E6=95=B4+=E9=80=9F=E5=BA=A6=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_distance_transform.py | 20 +- torchmorph/csrc/distance_transform_kernel.cu | 638 ++++++------------- 2 files changed, 205 insertions(+), 453 deletions(-) diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 3a11166..476ffca 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -6,20 +6,32 @@ # 辅助函数:调用 SciPy 并处理格式 def batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, np.ndarray]: - is_single_sample = batch_numpy.ndim <= 2 - if is_single_sample: - batch_numpy = batch_numpy[np.newaxis, ...] + + + input_is_1d_batch = (batch_numpy.ndim == 2) + input_is_single_sample_no_batch = (batch_numpy.ndim == 1) + + if input_is_single_sample_no_batch: + batch_numpy = batch_numpy[np.newaxis, ...] 
# (L) -> (1, L) + + dist_results, indices_results = [], [] for sample in batch_numpy: dist, indices = scipy_edt(sample, return_indices=True, return_distances=True) dist_results.append(dist) indices_results.append(indices) + output_dist = np.stack(dist_results, axis=0) output_indices = np.stack(indices_results, axis=0) + + # indices shape fix: (N, ndim_sample, ...) -> (N, ..., ndim_sample) + # 对于 1D: (N, 1, L) -> (N, L, 1) output_indices = np.moveaxis(output_indices, 1, -1) - if is_single_sample: + + if input_is_single_sample_no_batch: output_dist = output_dist.squeeze(0) output_indices = output_indices.squeeze(0) + return output_dist, output_indices # 用例定义 diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 90785bd..c3ded1e 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -5,15 +5,21 @@ #include #include -// 优化策略:用4个独立的内核函数替代模板,完全消除分支 - -// 内核1: 第一个pass且是唯一pass (1D情况) -__global__ void edt_kernel_first_final( - const float* in_data, - float* out_dist, - int32_t* out_idx, - const int64_t* shape, - const int64_t* strides, +#define MAX_DIMS 10 +#define INF_VAL 1e8f // 使用 1e8 保证 float32 精度下的数值稳定性 + +__device__ __forceinline__ float sqr(float x) { return x * x; } + +// ------------------------------------------------------------------ +// 内核 1: 初始 Pass (First Pass) +// ------------------------------------------------------------------ +template +__global__ void edt_kernel_first_pass( + const float* __restrict__ in_data, + float* __restrict__ out_dist, + int32_t* __restrict__ out_idx, + const int64_t* __restrict__ shape, + const int64_t* __restrict__ strides, int32_t ndim, int32_t process_dim_sample, int64_t total_slices, @@ -24,244 +30,117 @@ __global__ void edt_kernel_first_final( int64_t batch_idx = slice_idx / num_slices_per_sample; int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t batch_offset = batch_idx * strides[0]; - 
int64_t sample_base_offset = 0; - int64_t temp_idx = slice_idx_in_sample; - const int sample_ndim = ndim - 1; - - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; - temp_idx /= size_of_dim; - } - } - - const int64_t process_dim_actual = process_dim_sample + 1; - const int64_t N = shape[process_dim_actual]; - const int64_t stride = strides[process_dim_actual]; - const int64_t base_offset = batch_offset + sample_base_offset; + int64_t current_offset = batch_idx * strides[0]; - if (N == 0) return; - - extern __shared__ char s_buffer[]; - float* f = (float*)s_buffer; - int* v = (int*)(f + N); - float* z = (float*)((char*)v + (N + 1) * sizeof(int)); - int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); - - // 加载数据 - 第一个pass - for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - int64_t global_offset = base_offset + i * stride; - float val = __ldg(&in_data[global_offset]); - int32_t* shared_idx_ptr = s_idx + i * sample_ndim; - - if (val == 0.0f) { - f[i] = 0.0f; - int64_t temp_coord = slice_idx_in_sample; - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - shared_idx_ptr[d] = temp_coord % size_of_dim; - temp_coord /= size_of_dim; - } else { - shared_idx_ptr[d] = 0; - } - } - shared_idx_ptr[process_dim_sample] = i; - } else { - f[i] = 1e20f; - for (int d = 0; d < sample_ndim; ++d) shared_idx_ptr[d] = -1; - } - } - __syncthreads(); - - // 构建包络 - if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; - z[0] = -1e20f; - z[1] = 1e20f; - - for (int q = 1; q < N; q++) { - float fq = f[q]; - int q_sq = q * q; - - while (k >= 0) { - int p = v[k]; - float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); - if (s > z[k]) { - k++; - v[k] = q; - z[k] = s; - z[k + 1] = 1e20f; - break; - } - k--; - 
if (k < 0) { - k = 0; - v[0] = q; - z[0] = -1e20f; - z[1] = 1e20f; - break; - } - } - } - } - __syncthreads(); - - // 计算距离 - 最后一个pass,直接开方 - for (int q = threadIdx.x; q < N; q += blockDim.x) { - int k = 0; - float q_float = (float)q; - while (z[k + 1] < q_float) k++; - - int p = v[k]; - int64_t global_offset = base_offset + q * stride; - float dist_sq = (float)(q - p) * (q - p) + f[p]; - - out_dist[global_offset] = sqrtf(dist_sq); // 直接开方 - - int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; - const int32_t* src_idx_ptr = s_idx + p * sample_ndim; - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; - } - } -} - -// 内核2: 第一个pass但不是最后 -__global__ void edt_kernel_first_only( - const float* in_data, - float* out_dist, - int32_t* out_idx, - const int64_t* shape, - const int64_t* strides, - int32_t ndim, - int32_t process_dim_sample, - int64_t total_slices, - int64_t num_slices_per_sample -) { - int64_t slice_idx = blockIdx.x; - if (slice_idx >= total_slices) return; - - int64_t batch_idx = slice_idx / num_slices_per_sample; - int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t batch_offset = batch_idx * strides[0]; - int64_t sample_base_offset = 0; + // 预计算基准坐标 (除了 process_dim 以外的维度坐标) + int32_t base_coords[MAX_DIMS]; int64_t temp_idx = slice_idx_in_sample; const int sample_ndim = ndim - 1; + // 根据 slice_idx 反解坐标 for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; - temp_idx /= size_of_dim; + if (d == process_dim_sample) { + base_coords[d] = 0; // 占位 + continue; } + int64_t size_of_dim = shape[d + 1]; + int32_t coord = (int32_t)(temp_idx % size_of_dim); + base_coords[d] = coord; + current_offset += coord * strides[d + 1]; + temp_idx /= size_of_dim; } - + const int64_t process_dim_actual = process_dim_sample + 1; const int64_t N = 
shape[process_dim_actual]; const int64_t stride = strides[process_dim_actual]; - const int64_t base_offset = batch_offset + sample_base_offset; - + if (N == 0) return; extern __shared__ char s_buffer[]; float* f = (float*)s_buffer; int* v = (int*)(f + N); float* z = (float*)((char*)v + (N + 1) * sizeof(int)); - int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); + // Phase 1: 加载数据 for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - int64_t global_offset = base_offset + i * stride; - float val = __ldg(&in_data[global_offset]); - int32_t* shared_idx_ptr = s_idx + i * sample_ndim; - - if (val == 0.0f) { - f[i] = 0.0f; - int64_t temp_coord = slice_idx_in_sample; - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - shared_idx_ptr[d] = temp_coord % size_of_dim; - temp_coord /= size_of_dim; - } else { - shared_idx_ptr[d] = 0; - } - } - shared_idx_ptr[process_dim_sample] = i; - } else { - f[i] = 1e20f; - for (int d = 0; d < sample_ndim; ++d) shared_idx_ptr[d] = -1; - } + float val = __ldg(&in_data[current_offset + i * stride]); + f[i] = (val == 0.0f) ? 
0.0f : INF_VAL; } __syncthreads(); + // Phase 2: 构建包络 if (threadIdx.x == 0) { int k = 0; v[0] = 0; - z[0] = -1e20f; - z[1] = 1e20f; + z[0] = -INF_VAL; + z[1] = INF_VAL; for (int q = 1; q < N; q++) { + // 显式跳过背景点,避免 INF 污染计算 + if (f[q] >= (INF_VAL * 0.9f)) continue; + float fq = f[q]; - int q_sq = q * q; - - while (k >= 0) { - int p = v[k]; - float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); - if (s > z[k]) { - k++; - v[k] = q; - z[k] = s; - z[k + 1] = 1e20f; - break; - } - k--; - if (k < 0) { - k = 0; - v[0] = q; - z[0] = -1e20f; - z[1] = 1e20f; + int k_curr = k; + while (k_curr >= 0) { + int p = v[k_curr]; + + // --- 核心修复:数值稳定的交点公式 --- + // 先计算差值再相加,防止大数吞小数 + float diff_f = fq - f[p]; + float diff_sq = (float)q*(float)q - (float)p*(float)p; + float s = (diff_f + diff_sq) / (2.0f * (float)(q - p)); + + if (s > z[k_curr]) { + k_curr++; + v[k_curr] = q; + z[k_curr] = s; + z[k_curr + 1] = INF_VAL; + k = k_curr; break; } + k_curr--; + } + if (k_curr < 0) { + k = 0; v[0] = q; z[0] = -INF_VAL; z[1] = INF_VAL; } } } __syncthreads(); + // Phase 3: 计算距离 for (int q = threadIdx.x; q < N; q += blockDim.x) { int k = 0; float q_float = (float)q; while (z[k + 1] < q_float) k++; - int p = v[k]; - int64_t global_offset = base_offset + q * stride; - float dist_sq = (float)(q - p) * (q - p) + f[p]; + int p = v[k]; + + int64_t global_idx = current_offset + q * stride; + float dist_sq = sqr(q_float - (float)p) + f[p]; - out_dist[global_offset] = dist_sq; // 不开方 + out_dist[global_idx] = IsFinal ? sqrtf(dist_sq) : dist_sq; - int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; - const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + // 写入索引 + int32_t* out_idx_ptr = out_idx + global_idx * sample_ndim; + #pragma unroll for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; + // 只有当前处理的维度写入 p,其他维度写入基准坐标 + out_idx_ptr[d] = (d == process_dim_sample) ? 
p : base_coords[d]; } } } -// 内核3: 中间pass -__global__ void edt_kernel_middle( - const float* in_dist, - const int32_t* in_idx, - float* out_dist, - int32_t* out_idx, - const int64_t* shape, - const int64_t* strides, +// ------------------------------------------------------------------ +// 内核 2: 后续 Pass (Subsequent Pass) +// ------------------------------------------------------------------ +template +__global__ void edt_kernel_subsequent_pass( + const float* __restrict__ in_dist, + const int32_t* __restrict__ in_idx, + float* __restrict__ out_dist, + int32_t* __restrict__ out_idx, + const int64_t* __restrict__ shape, + const int64_t* __restrict__ strides, int32_t ndim, int32_t process_dim_sample, int64_t total_slices, @@ -272,24 +151,21 @@ __global__ void edt_kernel_middle( int64_t batch_idx = slice_idx / num_slices_per_sample; int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t batch_offset = batch_idx * strides[0]; - int64_t sample_base_offset = 0; + int64_t current_offset = batch_idx * strides[0]; + int64_t temp_idx = slice_idx_in_sample; const int sample_ndim = ndim - 1; for (int32_t d = sample_ndim - 1; d >= 0; --d) { if (d == process_dim_sample) continue; int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; - temp_idx /= size_of_dim; - } + current_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; } const int64_t process_dim_actual = process_dim_sample + 1; const int64_t N = shape[process_dim_actual]; const int64_t stride = strides[process_dim_actual]; - const int64_t base_offset = batch_offset + sample_base_offset; if (N == 0) return; @@ -297,49 +173,33 @@ __global__ void edt_kernel_middle( float* f = (float*)s_buffer; int* v = (int*)(f + N); float* z = (float*)((char*)v + (N + 1) * sizeof(int)); - int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - int64_t 
global_offset = base_offset + i * stride; - f[i] = __ldg(&in_dist[global_offset]); - - const int32_t* global_idx_ptr = in_idx + global_offset * sample_ndim; - int32_t* shared_idx_ptr = s_idx + i * sample_ndim; - for (int d = 0; d < sample_ndim; ++d) { - shared_idx_ptr[d] = __ldg(&global_idx_ptr[d]); - } + f[i] = __ldg(&in_dist[current_offset + i * stride]); } __syncthreads(); if (threadIdx.x == 0) { int k = 0; - v[0] = 0; - z[0] = -1e20f; - z[1] = 1e20f; + v[0] = 0; z[0] = -INF_VAL; z[1] = INF_VAL; for (int q = 1; q < N; q++) { + if (f[q] >= (INF_VAL * 0.9f)) continue; + float fq = f[q]; - int q_sq = q * q; - - while (k >= 0) { - int p = v[k]; - float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); - if (s > z[k]) { - k++; - v[k] = q; - z[k] = s; - z[k + 1] = 1e20f; - break; - } - k--; - if (k < 0) { - k = 0; - v[0] = q; - z[0] = -1e20f; - z[1] = 1e20f; - break; + int k_curr = k; + while (k_curr >= 0) { + int p = v[k_curr]; + float diff_f = fq - f[p]; + float diff_sq = (float)q*(float)q - (float)p*(float)p; + float s = (diff_f + diff_sq) / (2.0f * (float)(q - p)); + if (s > z[k_curr]) { + k_curr++; v[k_curr] = q; z[k_curr] = s; z[k_curr + 1] = INF_VAL; + k = k_curr; break; } + k_curr--; } + if (k_curr < 0) { k = 0; v[0] = q; z[0] = -INF_VAL; z[1] = INF_VAL; } } } __syncthreads(); @@ -350,260 +210,140 @@ __global__ void edt_kernel_middle( while (z[k + 1] < q_float) k++; int p = v[k]; - int64_t global_offset = base_offset + q * stride; - float dist_sq = (float)(q - p) * (q - p) + f[p]; - - out_dist[global_offset] = dist_sq; // 不开方 - - int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; - const int32_t* src_idx_ptr = s_idx + p * sample_ndim; - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; - } - } -} - -// 内核4: 最后一个pass -__global__ void edt_kernel_final( - const float* in_dist, - const int32_t* in_idx, - float* out_dist, - int32_t* out_idx, - const int64_t* shape, - const int64_t* strides, - int32_t ndim, - int32_t 
process_dim_sample, - int64_t total_slices, - int64_t num_slices_per_sample -) { - int64_t slice_idx = blockIdx.x; - if (slice_idx >= total_slices) return; - - int64_t batch_idx = slice_idx / num_slices_per_sample; - int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t batch_offset = batch_idx * strides[0]; - int64_t sample_base_offset = 0; - int64_t temp_idx = slice_idx_in_sample; - const int sample_ndim = ndim - 1; - - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; - temp_idx /= size_of_dim; - } - } - - const int64_t process_dim_actual = process_dim_sample + 1; - const int64_t N = shape[process_dim_actual]; - const int64_t stride = strides[process_dim_actual]; - const int64_t base_offset = batch_offset + sample_base_offset; - - if (N == 0) return; - - extern __shared__ char s_buffer[]; - float* f = (float*)s_buffer; - int* v = (int*)(f + N); - float* z = (float*)((char*)v + (N + 1) * sizeof(int)); - int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); - - for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - int64_t global_offset = base_offset + i * stride; - f[i] = __ldg(&in_dist[global_offset]); - const int32_t* global_idx_ptr = in_idx + global_offset * sample_ndim; - int32_t* shared_idx_ptr = s_idx + i * sample_ndim; - for (int d = 0; d < sample_ndim; ++d) { - shared_idx_ptr[d] = __ldg(&global_idx_ptr[d]); - } - } - __syncthreads(); - - if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; - z[0] = -1e20f; - z[1] = 1e20f; + int64_t q_global_offset = current_offset + q * stride; + int64_t p_global_offset = current_offset + p * stride; - for (int q = 1; q < N; q++) { - float fq = f[q]; - int q_sq = q * q; - - while (k >= 0) { - int p = v[k]; - float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); - if (s > z[k]) { - k++; - v[k] = q; - z[k] = s; - z[k + 
1] = 1e20f; - break; - } - k--; - if (k < 0) { - k = 0; - v[0] = q; - z[0] = -1e20f; - z[1] = 1e20f; - break; - } - } - } - } - __syncthreads(); + float dist_sq = sqr(q_float - (float)p) + f[p]; + out_dist[q_global_offset] = IsFinal ? sqrtf(dist_sq) : dist_sq; - for (int q = threadIdx.x; q < N; q += blockDim.x) { - int k = 0; - float q_float = (float)q; - while (z[k + 1] < q_float) k++; - - int p = v[k]; - int64_t global_offset = base_offset + q * stride; - float dist_sq = (float)(q - p) * (q - p) + f[p]; - - out_dist[global_offset] = sqrtf(dist_sq); // 最后开方 + // 索引直接从 Global Memory 拷贝,无需 Shared Memory + const int32_t* src_idx_ptr = in_idx + p_global_offset * sample_ndim; + int32_t* out_idx_ptr = out_idx + q_global_offset * sample_ndim; - int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; - const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + #pragma unroll for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; + out_idx_ptr[d] = src_idx_ptr[d]; } } } -// Host函数 +// ------------------------------------------------------------------ +// Host 函数 +// ------------------------------------------------------------------ std::tuple distance_transform_cuda(torch::Tensor input) { - TORCH_CHECK(input.is_cuda(), "Input must be on a CUDA device."); - TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be a float tensor."); + TORCH_CHECK(input.is_cuda(), "Input must be on CUDA device."); + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32."); + input = input.contiguous(); - bool had_no_batch_dim = (input.dim() <= 2); - if (had_no_batch_dim) { input = input.unsqueeze(0); } + // 自动处理 1D 输入:(L) -> (1, L) + // 自动处理 1D 批处理:(N, L) 保持不变 (视为 N 个 1D 样本) + bool had_no_batch_dim = (input.dim() == 1); + if (had_no_batch_dim) { + input = input.unsqueeze(0); + } const auto ndim = input.dim(); const auto sample_ndim = ndim - 1; const auto batch_size = input.size(0); - auto shape = input.sizes().vec(); auto strides_vec = 
input.strides().vec(); if (input.numel() == 0) { - auto distance = torch::empty_like(input); auto index_shape = shape; index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); - auto index = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - if (had_no_batch_dim) return std::make_tuple(distance.squeeze(0), index.squeeze(0)); - return std::make_tuple(distance, index); + return std::make_tuple(torch::empty_like(input), + torch::empty(index_shape, input.options().dtype(torch::kInt32))); } - + auto distance = torch::empty_like(input); - auto index_options = input.options().dtype(torch::kInt32); auto index_shape = shape; - index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); - auto index = torch::empty(index_shape, index_options); - - if (torch::all(input != 0).item()) { - distance.fill_(std::numeric_limits::infinity()); - index.fill_(-1); - if (had_no_batch_dim) { - return std::make_tuple(distance.squeeze(0), index.squeeze(0)); - } - return std::make_tuple(distance, index); - } + index_shape.push_back(sample_ndim); + auto index = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - auto shape_tensor = torch::tensor(shape, - torch::TensorOptions().dtype(torch::kInt64).device(input.device())); - auto strides_tensor = torch::tensor(strides_vec, - torch::TensorOptions().dtype(torch::kInt64).device(input.device())); - - const int64_t* shape_gpu = shape_tensor.data_ptr(); - const int64_t* strides_gpu = strides_tensor.data_ptr(); + auto buffer_dist = torch::empty_like(distance); + auto buffer_idx = torch::empty_like(index); + + auto shape_tensor = torch::tensor(shape, torch::TensorOptions().dtype(torch::kInt64).device(input.device())); + auto strides_tensor = torch::tensor(strides_vec, torch::TensorOptions().dtype(torch::kInt64).device(input.device())); std::vector> dim_order_pairs; - for (int32_t d_sample = 0; d_sample < sample_ndim; ++d_sample) { - dim_order_pairs.push_back({strides_vec[d_sample + 1], d_sample}); + for (int32_t d = 0; d < 
sample_ndim; ++d) { + dim_order_pairs.push_back({strides_vec[d + 1], d}); } std::sort(dim_order_pairs.rbegin(), dim_order_pairs.rend()); - if (sample_ndim == 0) { - int64_t total_slices = batch_size; - int64_t slice_len = (shape.size() > 1) ? shape[1] : 0; - int threads = std::min((int64_t)256, slice_len); - size_t smem = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + - (slice_len + 2) * sizeof(float) + slice_len * 1 * sizeof(int32_t); - - edt_kernel_first_final<<>>( - input.data_ptr(), - distance.data_ptr(), index.data_ptr(), - shape_gpu, strides_gpu, ndim, 0, total_slices, 1 - ); - } else { - auto buffer_dist = torch::empty_like(distance); - auto buffer_idx = torch::empty_like(index); - - for (int pass = 0; pass < sample_ndim; ++pass) { - int32_t d_sample = dim_order_pairs[pass].second; - bool is_first = (pass == 0); - bool is_final = (pass == sample_ndim - 1); - - torch::Tensor *in_dist, *in_idx, *out_dist, *out_idx; - - if (pass % 2 == 0) { - in_dist = &distance; in_idx = &index; - out_dist = &buffer_dist; out_idx = &buffer_idx; + for (int pass = 0; pass < sample_ndim; ++pass) { + int32_t d_sample = dim_order_pairs[pass].second; + bool is_first_pass = (pass == 0); + bool is_final_pass = (pass == sample_ndim - 1); + + torch::Tensor *in_d, *in_i, *out_d, *out_i; + + if (is_first_pass) { + in_d = nullptr; in_i = nullptr; + out_d = is_final_pass ? &distance : &buffer_dist; + out_i = is_final_pass ? 
&index : &buffer_idx; + } else { + if (pass % 2 != 0) { + in_d = &buffer_dist; in_i = &buffer_idx; + out_d = &distance; out_i = &index; } else { - in_dist = &buffer_dist; in_idx = &buffer_idx; - out_dist = &distance; out_idx = &index; + in_d = &distance; in_i = &index; + out_d = &buffer_dist; out_i = &buffer_idx; } - - int64_t num_slices_per_sample = 1; - for(int i = 0; i < sample_ndim; ++i) { - if (i != d_sample) num_slices_per_sample *= shape[i + 1]; + if (is_final_pass) { + out_d = &distance; out_i = &index; } - int64_t total_slices = batch_size * num_slices_per_sample; - int64_t slice_len = shape[d_sample + 1]; - - int threads = std::min((int64_t)256, slice_len); - size_t smem = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + - (slice_len + 2) * sizeof(float) + slice_len * sample_ndim * sizeof(int32_t); - - if (is_first && is_final) { - edt_kernel_first_final<<>>( - input.data_ptr(), - out_dist->data_ptr(), out_idx->data_ptr(), - shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + } + + int64_t num_slices_per_sample = 1; + for(int i = 0; i < sample_ndim; ++i) { + if (i != d_sample) num_slices_per_sample *= shape[i + 1]; + } + int64_t total_slices = batch_size * num_slices_per_sample; + int64_t slice_len = shape[d_sample + 1]; + + int threads = std::min((int64_t)256, slice_len); + size_t smem = slice_len * (sizeof(float) + sizeof(int)) + (slice_len + 1) * sizeof(float); + + if (is_first_pass) { + const float* in_ptr = input.data_ptr(); + if (is_final_pass) { + edt_kernel_first_pass<<>>( + in_ptr, out_d->data_ptr(), out_i->data_ptr(), + shape_tensor.data_ptr(), strides_tensor.data_ptr(), + ndim, d_sample, total_slices, num_slices_per_sample ); - } else if (is_first) { - edt_kernel_first_only<<>>( - input.data_ptr(), - out_dist->data_ptr(), out_idx->data_ptr(), - shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + } else { + edt_kernel_first_pass<<>>( + in_ptr, out_d->data_ptr(), out_i->data_ptr(), + 
shape_tensor.data_ptr(), strides_tensor.data_ptr(), + ndim, d_sample, total_slices, num_slices_per_sample ); - } else if (is_final) { - edt_kernel_final<<>>( - in_dist->data_ptr(), in_idx->data_ptr(), - out_dist->data_ptr(), out_idx->data_ptr(), - shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + } + } else { + if (is_final_pass) { + edt_kernel_subsequent_pass<<>>( + in_d->data_ptr(), in_i->data_ptr(), + out_d->data_ptr(), out_i->data_ptr(), + shape_tensor.data_ptr(), strides_tensor.data_ptr(), + ndim, d_sample, total_slices, num_slices_per_sample ); } else { - edt_kernel_middle<<>>( - in_dist->data_ptr(), in_idx->data_ptr(), - out_dist->data_ptr(), out_idx->data_ptr(), - shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + edt_kernel_subsequent_pass<<>>( + in_d->data_ptr(), in_i->data_ptr(), + out_d->data_ptr(), out_i->data_ptr(), + shape_tensor.data_ptr(), strides_tensor.data_ptr(), + ndim, d_sample, total_slices, num_slices_per_sample ); } } - - if (sample_ndim % 2 != 0) { - distance.copy_(buffer_dist); - index.copy_(buffer_idx); - } } if (had_no_batch_dim) { return std::make_tuple(distance.squeeze(0), index.squeeze(0)); } - return std::make_tuple(distance, index); -} +} \ No newline at end of file From 4c427c70f036ca24094ca671c9b44ab560ffc0d8 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Wed, 3 Dec 2025 03:32:21 +0800 Subject: [PATCH 13/56] =?UTF-8?q?=E9=87=87=E7=94=A8JFA=E7=AE=97=E6=B3=95?= =?UTF-8?q?=E6=8F=90=E9=AB=98=E5=B9=B6=E8=A1=8C=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torchmorph/csrc/distance_transform_kernel.cu | 290 ++++++++++++------- 1 file changed, 184 insertions(+), 106 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index c3ded1e..64c69fb 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -4,14 
+4,126 @@ #include #include #include +#include +#include #define MAX_DIMS 10 -#define INF_VAL 1e8f // 使用 1e8 保证 float32 精度下的数值稳定性 +#define INF_VAL 1e8f +// 保证 blockDim 足以覆盖大多数常见维度大小,或者配合 Loop 处理 +#define MAX_THREADS 1024 __device__ __forceinline__ float sqr(float x) { return x * x; } +// 计算从像素 q 到源点 p 的距离代价 (考虑了 p 点本身的数值权重 val_p) +__device__ __forceinline__ float compute_cost(int q, int p, float val_p) { + if (p < 0) return INF_VAL; // 无效点 + // dist = (q - p)^2 + f[p] + return sqr((float)q - (float)p) + val_p; +} + // ------------------------------------------------------------------ -// 内核 1: 初始 Pass (First Pass) +// 核心逻辑: 1D Jump Flooding (JFA) / Doubling Algorithm +// 全并行求解最近点索引,替代串行的抛物线构建 +// ------------------------------------------------------------------ +__device__ void compute_1d_jfa( + int N, + float* __restrict__ s_vals, // 输入数值 (dist^2) + int* __restrict__ s_idx_curr, // ping-pong buffer 1 + int* __restrict__ s_idx_next // ping-pong buffer 2 +) { + int tid = threadIdx.x; + + // --- 1. 初始化 --- + // 每个线程负责一个或多个像素的初始化 + for (int i = tid; i < N; i += blockDim.x) { + // 如果当前位置的值很大,说明它是背景,没有初始源点 (-1) + // 否则源点就是它自己 (i) + if (s_vals[i] >= INF_VAL * 0.9f) { + s_idx_curr[i] = -1; + } else { + s_idx_curr[i] = i; + } + } + __syncthreads(); + + // --- 2. 迭代传播 (Step = 1, 2, 4, 8...) 
--- + // 类似于双调排序或倍增法 + int* idx_in = s_idx_curr; + int* idx_out = s_idx_next; + + // 只要步长小于 N,就需要传播 + // 对于 N=1024, 只需要 10 次迭代,每次所有线程全并行 + for (int step = 1; step < N; step *= 2) { + + for (int i = tid; i < N; i += blockDim.x) { + int my_best_p = idx_in[i]; + float min_cost = INF_VAL; + + // 获取当前最优点的代价 + if (my_best_p != -1) { + min_cost = compute_cost(i, my_best_p, s_vals[my_best_p]); + } + + // --- 检查左边邻居 (i - step) --- + int left = i - step; + if (left >= 0) { + int left_p = idx_in[left]; // 邻居推荐的源点 + if (left_p != -1) { + float c = compute_cost(i, left_p, s_vals[left_p]); + if (c < min_cost) { + min_cost = c; + my_best_p = left_p; + } + } + } + + // --- 检查右边邻居 (i + step) --- + int right = i + step; + if (right < N) { + int right_p = idx_in[right]; // 邻居推荐的源点 + if (right_p != -1) { + float c = compute_cost(i, right_p, s_vals[right_p]); + if (c < min_cost) { + min_cost = c; + my_best_p = right_p; + } + } + } + + // 写入下一轮 Buffer + idx_out[i] = my_best_p; + } + + // 交换 Buffer 指针 + int* temp = idx_in; + idx_in = idx_out; + idx_out = temp; + + __syncthreads(); + } + + // --- 3. 
结果写回 --- + // 如果最后结果在 s_idx_next 里 (循环次数是奇数),需要拷回 s_idx_curr + // 或者直接让调用者知道结果在哪。 + // 为了简单,我们统一把结果放在 s_idx_curr 指向的内存里。 + // 注意:idx_in 现在指向的是包含最新结果的 buffer。 + + // 如果 idx_in 已经指向 s_idx_curr,那不用动。 + // 如果 idx_in 指向 s_idx_next,说明最新结果在 s_idx_next,我们需要把它拷贝回 s_idx_curr + // 或者是调整后续代码读取的指针。 + + // 这里采用简单拷贝回 s_idx_curr 的方式,确保后续逻辑一致 + if (idx_in != s_idx_curr) { + for (int i = tid; i < N; i += blockDim.x) { + s_idx_curr[i] = s_idx_next[i]; + } + __syncthreads(); + } +} + + +// ------------------------------------------------------------------ +// 内核 1: 初始 Pass (JFA Version) // ------------------------------------------------------------------ template __global__ void edt_kernel_first_pass( @@ -32,15 +144,13 @@ __global__ void edt_kernel_first_pass( int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; int64_t current_offset = batch_idx * strides[0]; - // 预计算基准坐标 (除了 process_dim 以外的维度坐标) int32_t base_coords[MAX_DIMS]; int64_t temp_idx = slice_idx_in_sample; const int sample_ndim = ndim - 1; - // 根据 slice_idx 反解坐标 for (int32_t d = sample_ndim - 1; d >= 0; --d) { if (d == process_dim_sample) { - base_coords[d] = 0; // 占位 + base_coords[d] = 0; continue; } int64_t size_of_dim = shape[d + 1]; @@ -56,82 +166,58 @@ __global__ void edt_kernel_first_pass( if (N == 0) return; + // Shared Memory Layout: + // f: float[N] (Values) + // idx1: int[N] (Buffer 1) + // idx2: int[N] (Buffer 2) extern __shared__ char s_buffer[]; float* f = (float*)s_buffer; - int* v = (int*)(f + N); - float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int* idx1 = (int*)(f + N); + int* idx2 = (int*)(idx1 + N); - // Phase 1: 加载数据 + // Phase 1: 并行加载数据 for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { float val = __ldg(&in_data[current_offset + i * stride]); f[i] = (val == 0.0f) ? 
0.0f : INF_VAL; } __syncthreads(); - // Phase 2: 构建包络 - if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; - z[0] = -INF_VAL; - z[1] = INF_VAL; - - for (int q = 1; q < N; q++) { - // 显式跳过背景点,避免 INF 污染计算 - if (f[q] >= (INF_VAL * 0.9f)) continue; - - float fq = f[q]; - int k_curr = k; - while (k_curr >= 0) { - int p = v[k_curr]; - - // --- 核心修复:数值稳定的交点公式 --- - // 先计算差值再相加,防止大数吞小数 - float diff_f = fq - f[p]; - float diff_sq = (float)q*(float)q - (float)p*(float)p; - float s = (diff_f + diff_sq) / (2.0f * (float)(q - p)); - - if (s > z[k_curr]) { - k_curr++; - v[k_curr] = q; - z[k_curr] = s; - z[k_curr + 1] = INF_VAL; - k = k_curr; - break; - } - k_curr--; - } - if (k_curr < 0) { - k = 0; v[0] = q; z[0] = -INF_VAL; z[1] = INF_VAL; - } - } - } - __syncthreads(); + // Phase 2: 并行 JFA 计算 + compute_1d_jfa(N, f, idx1, idx2); + // 结果现在存储在 idx1 中 - // Phase 3: 计算距离 + // Phase 3: 并行写回 for (int q = threadIdx.x; q < N; q += blockDim.x) { - int k = 0; - float q_float = (float)q; - while (z[k + 1] < q_float) k++; - - int p = v[k]; - + int p = idx1[q]; + float dist_val; + int p_idx; + + if (p != -1) { + // JFA 得到的是最近源点的索引 p + // 距离 = (q-p)^2 + f[p] + // 注意:在 First Pass 中,f[p] 要么是 0 要么是 INF。如果 p != -1,f[p] 必为 0。 + // 但为了通用性,还是加上 f[p] + float dist_sq = sqr((float)q - (float)p) + f[p]; + dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; + p_idx = p; + } else { + dist_val = IsFinal ? INF_VAL : sqr(INF_VAL); + p_idx = 0; + } + int64_t global_idx = current_offset + q * stride; - float dist_sq = sqr(q_float - (float)p) + f[p]; - - out_dist[global_idx] = IsFinal ? sqrtf(dist_sq) : dist_sq; + out_dist[global_idx] = dist_val; - // 写入索引 int32_t* out_idx_ptr = out_idx + global_idx * sample_ndim; #pragma unroll for (int d = 0; d < sample_ndim; ++d) { - // 只有当前处理的维度写入 p,其他维度写入基准坐标 - out_idx_ptr[d] = (d == process_dim_sample) ? p : base_coords[d]; + out_idx_ptr[d] = (d == process_dim_sample) ? 
p_idx : base_coords[d]; } } } // ------------------------------------------------------------------ -// 内核 2: 后续 Pass (Subsequent Pass) +// 内核 2: 后续 Pass (JFA Version) // ------------------------------------------------------------------ template __global__ void edt_kernel_subsequent_pass( @@ -169,61 +255,48 @@ __global__ void edt_kernel_subsequent_pass( if (N == 0) return; + // Shared Memory Layout 同上 extern __shared__ char s_buffer[]; float* f = (float*)s_buffer; - int* v = (int*)(f + N); - float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int* idx1 = (int*)(f + N); + int* idx2 = (int*)(idx1 + N); + // Phase 1: 加载 for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { f[i] = __ldg(&in_dist[current_offset + i * stride]); } __syncthreads(); - if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; z[0] = -INF_VAL; z[1] = INF_VAL; - - for (int q = 1; q < N; q++) { - if (f[q] >= (INF_VAL * 0.9f)) continue; - - float fq = f[q]; - int k_curr = k; - while (k_curr >= 0) { - int p = v[k_curr]; - float diff_f = fq - f[p]; - float diff_sq = (float)q*(float)q - (float)p*(float)p; - float s = (diff_f + diff_sq) / (2.0f * (float)(q - p)); - if (s > z[k_curr]) { - k_curr++; v[k_curr] = q; z[k_curr] = s; z[k_curr + 1] = INF_VAL; - k = k_curr; break; - } - k_curr--; - } - if (k_curr < 0) { k = 0; v[0] = q; z[0] = -INF_VAL; z[1] = INF_VAL; } - } - } - __syncthreads(); + // Phase 2: 并行 JFA 计算 + // 这里的 f[i] 是上一轮计算出的距离平方,作为权重 + compute_1d_jfa(N, f, idx1, idx2); + // Phase 3: 写回 for (int q = threadIdx.x; q < N; q += blockDim.x) { - int k = 0; - float q_float = (float)q; - while (z[k + 1] < q_float) k++; - - int p = v[k]; - + int p = idx1[q]; // 最近源点在当前行的索引 + float dist_val; + + if (p != -1) { + float dist_sq = sqr((float)q - (float)p) + f[p]; + dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; + } else { + dist_val = IsFinal ? 
INF_VAL : sqr(INF_VAL); + p = 0; // fallback + } + int64_t q_global_offset = current_offset + q * stride; - int64_t p_global_offset = current_offset + p * stride; - - float dist_sq = sqr(q_float - (float)p) + f[p]; - out_dist[q_global_offset] = IsFinal ? sqrtf(dist_sq) : dist_sq; + out_dist[q_global_offset] = dist_val; - // 索引直接从 Global Memory 拷贝,无需 Shared Memory - const int32_t* src_idx_ptr = in_idx + p_global_offset * sample_ndim; - int32_t* out_idx_ptr = out_idx + q_global_offset * sample_ndim; + // 索引处理 + if (p != -1) { + int64_t p_global_offset = current_offset + p * stride; + const int32_t* src_idx_ptr = in_idx + p_global_offset * sample_ndim; + int32_t* out_idx_ptr = out_idx + q_global_offset * sample_ndim; - #pragma unroll - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; + #pragma unroll + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; + } } } } @@ -237,8 +310,6 @@ std::tuple distance_transform_cuda(torch::Tensor i input = input.contiguous(); - // 自动处理 1D 输入:(L) -> (1, L) - // 自动处理 1D 批处理:(N, L) 保持不变 (视为 N 个 1D 样本) bool had_no_batch_dim = (input.dim() == 1); if (had_no_batch_dim) { input = input.unsqueeze(0); @@ -305,8 +376,15 @@ std::tuple distance_transform_cuda(torch::Tensor i int64_t total_slices = batch_size * num_slices_per_sample; int64_t slice_len = shape[d_sample + 1]; - int threads = std::min((int64_t)256, slice_len); - size_t smem = slice_len * (sizeof(float) + sizeof(int)) + (slice_len + 1) * sizeof(float); + int threads = std::min((int64_t)MAX_THREADS, slice_len); + + // JFA 需要的 Shared Memory: + // float f[N] + // int idx1[N] + // int idx2[N] + // 总共 slice_len * (4 + 4 + 4) = 12 * slice_len bytes + size_t smem = slice_len * sizeof(float) + + slice_len * sizeof(int) * 2; if (is_first_pass) { const float* in_ptr = input.data_ptr(); From 9291a49cf655aa938c20fe031c4b6305b9997521 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Wed, 3 Dec 2025 04:07:18 +0800 Subject: [PATCH 14/56] 
=?UTF-8?q?=E4=BC=98=E5=8C=96=E5=90=88=E5=B9=B6?= =?UTF-8?q?=E5=86=85=E5=AD=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torchmorph/csrc/distance_transform_kernel.cu | 588 ++++++++++--------- 1 file changed, 298 insertions(+), 290 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 64c69fb..24ba467 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,75 +1,64 @@ #include #include -#include -#include #include -#include +#include #include #include -#define MAX_DIMS 10 -#define INF_VAL 1e8f -// 保证 blockDim 足以覆盖大多数常见维度大小,或者配合 Loop 处理 -#define MAX_THREADS 1024 +#define INF_VAL 1e8f +#define MAX_THREADS 1024 +#define SMEM_LIMIT_ELEMENTS 4096 // 48KB / 12 bytes (float+int+int) ~= 4096 __device__ __forceinline__ float sqr(float x) { return x * x; } -// 计算从像素 q 到源点 p 的距离代价 (考虑了 p 点本身的数值权重 val_p) +// 计算从像素 q 到源点 p 的距离代价 +// val_p 是源点 p 在上一轮计算后的距离平方值 (weight) __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { - if (p < 0) return INF_VAL; // 无效点 - // dist = (q - p)^2 + f[p] + if (p < 0) return INF_VAL; return sqr((float)q - (float)p) + val_p; } // ------------------------------------------------------------------ -// 核心逻辑: 1D Jump Flooding (JFA) / Doubling Algorithm -// 全并行求解最近点索引,替代串行的抛物线构建 +// JFA 核心逻辑 (Device Function) // ------------------------------------------------------------------ -__device__ void compute_1d_jfa( +// 无论数据是在 Shared Memory 还是 Global Memory,逻辑是一样的 +__device__ void run_jfa_core( int N, - float* __restrict__ s_vals, // 输入数值 (dist^2) - int* __restrict__ s_idx_curr, // ping-pong buffer 1 - int* __restrict__ s_idx_next // ping-pong buffer 2 + int tid, + const float* __restrict__ vals, // 输入权重 (只读) + int* __restrict__ idx_curr, // Ping-Pong Buffer A + int* __restrict__ idx_next // Ping-Pong Buffer B ) { - int tid = threadIdx.x; - - // --- 1. 
初始化 --- - // 每个线程负责一个或多个像素的初始化 + // 1. 初始化 for (int i = tid; i < N; i += blockDim.x) { - // 如果当前位置的值很大,说明它是背景,没有初始源点 (-1) - // 否则源点就是它自己 (i) - if (s_vals[i] >= INF_VAL * 0.9f) { - s_idx_curr[i] = -1; + // 如果输入值很大,说明是背景,没有初始源点 + if (vals[i] >= INF_VAL * 0.9f) { + idx_curr[i] = -1; } else { - s_idx_curr[i] = i; + idx_curr[i] = i; } } __syncthreads(); - // --- 2. 迭代传播 (Step = 1, 2, 4, 8...) --- - // 类似于双调排序或倍增法 - int* idx_in = s_idx_curr; - int* idx_out = s_idx_next; + // 2. 迭代传播 (Step = 1, 2, 4, ... < N) + int* idx_in = idx_curr; + int* idx_out = idx_next; - // 只要步长小于 N,就需要传播 - // 对于 N=1024, 只需要 10 次迭代,每次所有线程全并行 for (int step = 1; step < N; step *= 2) { - for (int i = tid; i < N; i += blockDim.x) { int my_best_p = idx_in[i]; float min_cost = INF_VAL; - // 获取当前最优点的代价 if (my_best_p != -1) { - min_cost = compute_cost(i, my_best_p, s_vals[my_best_p]); + min_cost = compute_cost(i, my_best_p, vals[my_best_p]); } - // --- 检查左边邻居 (i - step) --- + // Check Left int left = i - step; if (left >= 0) { - int left_p = idx_in[left]; // 邻居推荐的源点 + int left_p = idx_in[left]; if (left_p != -1) { - float c = compute_cost(i, left_p, s_vals[left_p]); + float c = compute_cost(i, left_p, vals[left_p]); if (c < min_cost) { min_cost = c; my_best_p = left_p; @@ -77,230 +66,206 @@ __device__ void compute_1d_jfa( } } - // --- 检查右边邻居 (i + step) --- + // Check Right int right = i + step; if (right < N) { - int right_p = idx_in[right]; // 邻居推荐的源点 + int right_p = idx_in[right]; if (right_p != -1) { - float c = compute_cost(i, right_p, s_vals[right_p]); + float c = compute_cost(i, right_p, vals[right_p]); if (c < min_cost) { min_cost = c; my_best_p = right_p; } } } - - // 写入下一轮 Buffer idx_out[i] = my_best_p; } - // 交换 Buffer 指针 + // Swap Pointers int* temp = idx_in; idx_in = idx_out; idx_out = temp; - __syncthreads(); } - // --- 3. 
结果写回 --- - // 如果最后结果在 s_idx_next 里 (循环次数是奇数),需要拷回 s_idx_curr - // 或者直接让调用者知道结果在哪。 - // 为了简单,我们统一把结果放在 s_idx_curr 指向的内存里。 - // 注意:idx_in 现在指向的是包含最新结果的 buffer。 - - // 如果 idx_in 已经指向 s_idx_curr,那不用动。 - // 如果 idx_in 指向 s_idx_next,说明最新结果在 s_idx_next,我们需要把它拷贝回 s_idx_curr - // 或者是调整后续代码读取的指针。 - - // 这里采用简单拷贝回 s_idx_curr 的方式,确保后续逻辑一致 - if (idx_in != s_idx_curr) { + // 3. 确保最终结果在 idx_curr (如果循环结束时在 next,则拷回) + if (idx_in != idx_curr) { for (int i = tid; i < N; i += blockDim.x) { - s_idx_curr[i] = s_idx_next[i]; + idx_curr[i] = idx_next[i]; } __syncthreads(); } } - // ------------------------------------------------------------------ -// 内核 1: 初始 Pass (JFA Version) +// Kernel 1: Shared Memory JFA (Fast Path) +// 适用于 N <= 4096 // ------------------------------------------------------------------ -template -__global__ void edt_kernel_first_pass( - const float* __restrict__ in_data, - float* __restrict__ out_dist, - int32_t* __restrict__ out_idx, - const int64_t* __restrict__ shape, - const int64_t* __restrict__ strides, - int32_t ndim, - int32_t process_dim_sample, - int64_t total_slices, - int64_t num_slices_per_sample +template +__global__ void edt_kernel_shared( + const float* __restrict__ in_data, // 当前维度的输入 (dist^2) + const int32_t* __restrict__ in_indices, // 上一轮的索引图 (N_slices, L, NDim) + float* __restrict__ out_dist, // 输出距离 + int32_t* __restrict__ out_indices, // 输出索引图 + int64_t L, // 当前维度的长度 (Length) + int64_t total_elements // Batch * ... 
* L ) { - int64_t slice_idx = blockIdx.x; - if (slice_idx >= total_slices) return; - - int64_t batch_idx = slice_idx / num_slices_per_sample; - int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t current_offset = batch_idx * strides[0]; - - int32_t base_coords[MAX_DIMS]; - int64_t temp_idx = slice_idx_in_sample; - const int sample_ndim = ndim - 1; - - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) { - base_coords[d] = 0; - continue; - } - int64_t size_of_dim = shape[d + 1]; - int32_t coord = (int32_t)(temp_idx % size_of_dim); - base_coords[d] = coord; - current_offset += coord * strides[d + 1]; - temp_idx /= size_of_dim; - } - - const int64_t process_dim_actual = process_dim_sample + 1; - const int64_t N = shape[process_dim_actual]; - const int64_t stride = strides[process_dim_actual]; + // 这里的 total_elements 是展平后的总像素数 + // 由于我们做了 transpose,数据布局是 [Batch_and_other_dims, L] + // 每个 Block 处理一行 (长度 L) + + int64_t row_idx = blockIdx.x; + int64_t offset = row_idx * L; - if (N == 0) return; + if (offset >= total_elements) return; - // Shared Memory Layout: - // f: float[N] (Values) - // idx1: int[N] (Buffer 1) - // idx2: int[N] (Buffer 2) + // Shared Memory 布局: float vals[L], int idx1[L], int idx2[L] extern __shared__ char s_buffer[]; - float* f = (float*)s_buffer; - int* idx1 = (int*)(f + N); - int* idx2 = (int*)(idx1 + N); - - // Phase 1: 并行加载数据 - for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - float val = __ldg(&in_data[current_offset + i * stride]); - f[i] = (val == 0.0f) ? 0.0f : INF_VAL; + float* s_vals = (float*)s_buffer; + int* s_idx1 = (int*)(s_vals + L); + int* s_idx2 = (int*)(s_idx1 + L); + + // 1. 
加载数据到 Shared Memory + for (int i = threadIdx.x; i < L; i += blockDim.x) { + float val = __ldg(&in_data[offset + i]); + // 如果是初始 Pass (无输入索引),val 为 0 或 INF + // 如果是后续 Pass,val 为上一步的 dist^2 + s_vals[i] = val; } __syncthreads(); - // Phase 2: 并行 JFA 计算 - compute_1d_jfa(N, f, idx1, idx2); - // 结果现在存储在 idx1 中 - - // Phase 3: 并行写回 - for (int q = threadIdx.x; q < N; q += blockDim.x) { - int p = idx1[q]; + // 2. 运行 JFA + run_jfa_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); + + // 3. 写回结果 + for (int q = threadIdx.x; q < L; q += blockDim.x) { + int p = s_idx1[q]; // 最近点在当前行内的局部索引 (0..L-1) float dist_val; - int p_idx; if (p != -1) { - // JFA 得到的是最近源点的索引 p - // 距离 = (q-p)^2 + f[p] - // 注意:在 First Pass 中,f[p] 要么是 0 要么是 INF。如果 p != -1,f[p] 必为 0。 - // 但为了通用性,还是加上 f[p] - float dist_sq = sqr((float)q - (float)p) + f[p]; + // 计算新距离: (q-p)^2 + val[p] + float dist_sq = sqr((float)q - (float)p) + s_vals[p]; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; - p_idx = p; } else { dist_val = IsFinal ? INF_VAL : sqr(INF_VAL); - p_idx = 0; + p = 0; // fallback } - int64_t global_idx = current_offset + q * stride; - out_dist[global_idx] = dist_val; + out_dist[offset + q] = dist_val; - int32_t* out_idx_ptr = out_idx + global_idx * sample_ndim; - #pragma unroll - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = (d == process_dim_sample) ? p_idx : base_coords[d]; + // 4. 
索引传播 + // 我们需要从 in_indices 查找完整的高维索引 + // in_indices 形状: [Batch..., L, NDim] + // 这里的 offset 对应 [Batch..., 0] + // p 是当前维度的偏移 + if (p != -1) { + int64_t src_offset = (offset + p) * NDim; + int64_t dst_offset = (offset + q) * NDim; + + // 手动展开拷贝,或者循环 + for (int d = 0; d < NDim; ++d) { + out_indices[dst_offset + d] = in_indices[src_offset + d]; + } + } else { + // 保持原样或填0 (通常保持原样即可,或者为了安全填0) + int64_t dst_offset = (offset + q) * NDim; + for (int d = 0; d < NDim; ++d) out_indices[dst_offset + d] = 0; } } } // ------------------------------------------------------------------ -// 内核 2: 后续 Pass (JFA Version) +// Kernel 2: Global Memory JFA (Fallback Path) +// 适用于 N > 4096,使用 Global Memory 作为 Ping-Pong Buffer // ------------------------------------------------------------------ -template -__global__ void edt_kernel_subsequent_pass( - const float* __restrict__ in_dist, - const int32_t* __restrict__ in_idx, +template +__global__ void edt_kernel_global( + const float* __restrict__ in_data, + const int32_t* __restrict__ in_indices, float* __restrict__ out_dist, - int32_t* __restrict__ out_idx, - const int64_t* __restrict__ shape, - const int64_t* __restrict__ strides, - int32_t ndim, - int32_t process_dim_sample, - int64_t total_slices, - int64_t num_slices_per_sample + int32_t* __restrict__ out_indices, + int* __restrict__ global_buffer_1, // 临时 buffer A [TotalElements] + int* __restrict__ global_buffer_2, // 临时 buffer B [TotalElements] + int64_t L, + int64_t total_elements ) { - int64_t slice_idx = blockIdx.x; - if (slice_idx >= total_slices) return; - - int64_t batch_idx = slice_idx / num_slices_per_sample; - int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t current_offset = batch_idx * strides[0]; + int64_t row_idx = blockIdx.x; + int64_t offset = row_idx * L; - int64_t temp_idx = slice_idx_in_sample; - const int sample_ndim = ndim - 1; - - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t 
size_of_dim = shape[d + 1]; - current_offset += (temp_idx % size_of_dim) * strides[d + 1]; - temp_idx /= size_of_dim; - } - - const int64_t process_dim_actual = process_dim_sample + 1; - const int64_t N = shape[process_dim_actual]; - const int64_t stride = strides[process_dim_actual]; - - if (N == 0) return; - - // Shared Memory Layout 同上 - extern __shared__ char s_buffer[]; - float* f = (float*)s_buffer; - int* idx1 = (int*)(f + N); - int* idx2 = (int*)(idx1 + N); + if (offset >= total_elements) return; - // Phase 1: 加载 - for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - f[i] = __ldg(&in_dist[current_offset + i * stride]); - } - __syncthreads(); - - // Phase 2: 并行 JFA 计算 - // 这里的 f[i] 是上一轮计算出的距离平方,作为权重 - compute_1d_jfa(N, f, idx1, idx2); + // 指向当前行在 Global Memory 中的位置 + // 注意:in_data 是只读的,我们需要把它当做 weight + // JFA 需要两个 int buffer 来存 index + int* g_idx1 = global_buffer_1 + offset; + int* g_idx2 = global_buffer_2 + offset; + + // 直接在 Global Memory 上运行 JFA + // 注意:这里 vals 指针直接指向 in_data (Global),读取稍慢但无需拷贝 + run_jfa_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); - // Phase 3: 写回 - for (int q = threadIdx.x; q < N; q += blockDim.x) { - int p = idx1[q]; // 最近源点在当前行的索引 + // 写回逻辑同上 + for (int q = threadIdx.x; q < L; q += blockDim.x) { + int p = g_idx1[q]; float dist_val; if (p != -1) { - float dist_sq = sqr((float)q - (float)p) + f[p]; + float val_p = in_data[offset + p]; + float dist_sq = sqr((float)q - (float)p) + val_p; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; } else { dist_val = IsFinal ? 
INF_VAL : sqr(INF_VAL); - p = 0; // fallback + p = 0; } - int64_t q_global_offset = current_offset + q * stride; - out_dist[q_global_offset] = dist_val; + out_dist[offset + q] = dist_val; - // 索引处理 if (p != -1) { - int64_t p_global_offset = current_offset + p * stride; - const int32_t* src_idx_ptr = in_idx + p_global_offset * sample_ndim; - int32_t* out_idx_ptr = out_idx + q_global_offset * sample_ndim; - - #pragma unroll - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; + int64_t src_offset = (offset + p) * NDim; + int64_t dst_offset = (offset + q) * NDim; + for (int d = 0; d < NDim; ++d) { + out_indices[dst_offset + d] = in_indices[src_offset + d]; } + } else { + int64_t dst_offset = (offset + q) * NDim; + for (int d = 0; d < NDim; ++d) out_indices[dst_offset + d] = 0; } } } + +// ------------------------------------------------------------------ +// 辅助:初始化索引张量 +// ------------------------------------------------------------------ +// 将 index tensor 初始化为 grid grid coordinates +// shape: (..., D), 最后一个维度存坐标 +__global__ void init_indices_kernel(int32_t* indices, int64_t total_elements, int NDim, + const int64_t* shape, const int64_t* strides) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_elements) return; + + // 反解坐标 + int64_t temp = idx; + int32_t coords[10]; // max dims + + // strides 是针对 elements 展开的,但这里 indices 是 (Total, NDim) + // 我们可以简单地根据 shape 反解 + // 注意:这里的 total_elements 是像素数,不是 indices 数组的大小 + + // 假设 shape 是 [D0, D1, D2] + // idx 对应 flat index + + for (int d = NDim - 1; d >= 0; --d) { + coords[d] = temp % shape[d]; + temp /= shape[d]; + } + + // 写入 + for (int d = 0; d < NDim; ++d) { + indices[idx * NDim + d] = coords[d]; + } +} + // ------------------------------------------------------------------ // Host 函数 // ------------------------------------------------------------------ @@ -309,119 +274,162 @@ std::tuple distance_transform_cuda(torch::Tensor i TORCH_CHECK(input.scalar_type() == 
torch::kFloat32, "Input must be float32."); input = input.contiguous(); + bool had_no_batch_dim = (input.dim() == 1); + if (had_no_batch_dim) input = input.unsqueeze(0); - bool had_no_batch_dim = (input.dim() == 1); - if (had_no_batch_dim) { - input = input.unsqueeze(0); - } - - const auto ndim = input.dim(); - const auto sample_ndim = ndim - 1; - const auto batch_size = input.size(0); + const int ndim = input.dim(); // Include batch + const int sample_ndim = ndim - 1; auto shape = input.sizes().vec(); - auto strides_vec = input.strides().vec(); - - if (input.numel() == 0) { + int64_t num_pixels = input.numel(); + + if (num_pixels == 0) { auto index_shape = shape; index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); return std::make_tuple(torch::empty_like(input), - torch::empty(index_shape, input.options().dtype(torch::kInt32))); + torch::empty(index_shape, input.options().dtype(torch::kInt32))); } - auto distance = torch::empty_like(input); + // 1. 初始化输出 Tensor + // current_dist 在迭代过程中存储 dist^2,最后开方 + // 初始状态:Input 里的 0 还是 0,其他非 0 (背景) 设为 INF + auto current_dist = torch::where(input == 0, + torch::tensor(0.0f, input.options()), + torch::tensor(INF_VAL, input.options())); + + // 初始化索引 Map (Batch, ..., NDim) auto index_shape = shape; - index_shape.push_back(sample_ndim); - auto index = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - - auto buffer_dist = torch::empty_like(distance); - auto buffer_idx = torch::empty_like(index); - - auto shape_tensor = torch::tensor(shape, torch::TensorOptions().dtype(torch::kInt64).device(input.device())); - auto strides_tensor = torch::tensor(strides_vec, torch::TensorOptions().dtype(torch::kInt64).device(input.device())); - - std::vector> dim_order_pairs; - for (int32_t d = 0; d < sample_ndim; ++d) { - dim_order_pairs.push_back({strides_vec[d + 1], d}); + index_shape.push_back(sample_ndim); + auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); + + // 启动 Kernel 初始化索引 + // 
为了反解坐标,我们需要把 shape 传进去 + { + // 排除 batch 维度的 shape 用于坐标计算? + // 需求是:返回的索引是 (batch_idx, z, y, x) 还是只是 (z, y, x)? + // 通常 EDT 返回的是 sample 内的坐标。所以我们忽略 batch 维度。 + std::vector sample_shape_vec(shape.begin() + 1, shape.end()); + auto sample_shape_tensor = torch::tensor(sample_shape_vec, torch::kInt64).to(input.device()); + // 这里的 strides 不需要,直接由 shape 反解 + + int threads = 256; + int blocks = (num_pixels + threads - 1) / threads; + + // 我们需要传递 sample_ndim + init_indices_kernel<<>>( + current_idx.data_ptr(), + num_pixels, + sample_ndim, + sample_shape_tensor.data_ptr(), + nullptr // strides not needed for simple unravel + ); } - std::sort(dim_order_pairs.rbegin(), dim_order_pairs.rend()); + + // 用于 Global Memory Fallback 的临时 buffer + torch::Tensor global_buf1, global_buf2; + + // 2. 逐维处理 (Separable Phases) + // 从最后一个维度倒着处理,或者顺序处理都可以。 + // 为了 Host Transpose 方便,我们遍历 sample 的每一个维度 (1 到 ndim-1) + for (int d = 1; d < ndim; ++d) { + bool is_final_pass = (d == ndim - 1); + + // ----------------------------------------------------------- + // Step A: Permute & Contiguous + // 将当前处理维度 d 移到最后: (0, 1, ..., d, ..., N-1) -> (0, 1, ..., N-1, d) + // 这样最后内存布局就是 [..., L],stride=1 + // ----------------------------------------------------------- + + // 这种 swap 策略比较简单: transpose(d, -1) + // 注意:index tensor 也要变换,但 index tensor 最后一维是 coord_dim,不能乱动。 + // Index tensor 形状是 [..., sample_ndim]。 + // 我们需要变换的是前面的空间维度 [...]。 + + auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); + auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); + + // 此时 dist_in shape: [..., L] + // idx_in shape: [..., L, sample_ndim] + + int64_t L = dist_in.size(-1); // 当前维度的长度 + int64_t total_slices = dist_in.numel() / L; // 有多少行 + + auto dist_out = torch::empty_like(dist_in); + auto idx_out = torch::empty_like(idx_in); - for (int pass = 0; pass < sample_ndim; ++pass) { - int32_t d_sample = dim_order_pairs[pass].second; - bool is_first_pass = (pass == 0); - bool is_final_pass = (pass == sample_ndim - 
1); + // ----------------------------------------------------------- + // Step B: Kernel Dispatch + // ----------------------------------------------------------- + int threads = std::min((int64_t)MAX_THREADS, L); + + // 检查 Shared Memory 需求 + // Need: float(4) + int(4) + int(4) = 12 bytes per pixel + if (L <= SMEM_LIMIT_ELEMENTS) { + size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); + + // 模板参数 NDim 需要是编译期常量。 + // 动态分发 sample_ndim (1D, 2D, 3D usually) + // 使用 switch case 覆盖常见维度 (1, 2, 3) + #define DISPATCH_SHARED(IS_FINAL) \ + switch(sample_ndim) { \ + case 1: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel()); break; \ + case 2: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel()); break; \ + case 3: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel()); break; \ + default: /* fallback for >3D */ break; \ + } - torch::Tensor *in_d, *in_i, *out_d, *out_i; + if (is_final_pass) { DISPATCH_SHARED(true); } + else { DISPATCH_SHARED(false); } - if (is_first_pass) { - in_d = nullptr; in_i = nullptr; - out_d = is_final_pass ? &distance : &buffer_dist; - out_i = is_final_pass ? 
&index : &buffer_idx; } else { - if (pass % 2 != 0) { - in_d = &buffer_dist; in_i = &buffer_idx; - out_d = &distance; out_i = &index; - } else { - in_d = &distance; in_i = &index; - out_d = &buffer_dist; out_i = &buffer_idx; - } - if (is_final_pass) { - out_d = &distance; out_i = &index; + // Fallback: Global Memory + // 需要分配 buffer: [total_slices * L] = [numel] + if (global_buf1.numel() < dist_in.numel()) { + global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); + global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); } - } - int64_t num_slices_per_sample = 1; - for(int i = 0; i < sample_ndim; ++i) { - if (i != d_sample) num_slices_per_sample *= shape[i + 1]; - } - int64_t total_slices = batch_size * num_slices_per_sample; - int64_t slice_len = shape[d_sample + 1]; - - int threads = std::min((int64_t)MAX_THREADS, slice_len); - - // JFA 需要的 Shared Memory: - // float f[N] - // int idx1[N] - // int idx2[N] - // 总共 slice_len * (4 + 4 + 4) = 12 * slice_len bytes - size_t smem = slice_len * sizeof(float) + - slice_len * sizeof(int) * 2; - - if (is_first_pass) { - const float* in_ptr = input.data_ptr(); - if (is_final_pass) { - edt_kernel_first_pass<<>>( - in_ptr, out_d->data_ptr(), out_i->data_ptr(), - shape_tensor.data_ptr(), strides_tensor.data_ptr(), - ndim, d_sample, total_slices, num_slices_per_sample - ); - } else { - edt_kernel_first_pass<<>>( - in_ptr, out_d->data_ptr(), out_i->data_ptr(), - shape_tensor.data_ptr(), strides_tensor.data_ptr(), - ndim, d_sample, total_slices, num_slices_per_sample - ); - } - } else { - if (is_final_pass) { - edt_kernel_subsequent_pass<<>>( - in_d->data_ptr(), in_i->data_ptr(), - out_d->data_ptr(), out_i->data_ptr(), - shape_tensor.data_ptr(), strides_tensor.data_ptr(), - ndim, d_sample, total_slices, num_slices_per_sample - ); - } else { - edt_kernel_subsequent_pass<<>>( - in_d->data_ptr(), in_i->data_ptr(), - 
out_d->data_ptr(), out_i->data_ptr(), - shape_tensor.data_ptr(), strides_tensor.data_ptr(), - ndim, d_sample, total_slices, num_slices_per_sample - ); - } + #define DISPATCH_GLOBAL(IS_FINAL) \ + switch(sample_ndim) { \ + case 1: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel()); break; \ + case 2: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel()); break; \ + case 3: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel()); break; \ + default: break; \ + } + + if (is_final_pass) { DISPATCH_GLOBAL(true); } + else { DISPATCH_GLOBAL(false); } } + + // ----------------------------------------------------------- + // Step C: Transpose Back + // ----------------------------------------------------------- + current_dist = dist_out.transpose(d, ndim - 1); // View, non-contiguous is fine here as next step makes it contiguous + current_idx = idx_out.transpose(d, ndim - 1); } - - if (had_no_batch_dim) { - return std::make_tuple(distance.squeeze(0), index.squeeze(0)); + + if (had_no_batch_dim) { + return std::make_tuple(current_dist.squeeze(0), current_idx.squeeze(0)); } - return std::make_tuple(distance, index); + return std::make_tuple(current_dist, current_idx); } \ No newline at end of file From 698ce2493053f21f7e7acc7b55f5ab38736599f6 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Sat, 6 Dec 2025 21:28:04 +0800 Subject: [PATCH 15/56] =?UTF-8?q?=E5=A2=9E=E5=8A=A03=E7=BB=B4=E4=BB=A5?= =?UTF-8?q?=E4=B8=8A=E7=BB=B4=E5=BA=A6=E7=9A=84=E8=AE=A1=E7=AE=97=E5=A4=84?= =?UTF-8?q?=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
test/test_distance_transform.py | 146 ++++---- torchmorph/csrc/distance_transform_kernel.cu | 335 ++++++++++--------- 2 files changed, 269 insertions(+), 212 deletions(-) diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 476ffca..6e6265d 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -1,21 +1,20 @@ import torch import pytest -from scipy.ndimage import distance_transform_edt as scipy_edt import numpy as np -import torchmorph as tm +from scipy.ndimage import distance_transform_edt as scipy_edt +import torchmorph as tm -# 辅助函数:调用 SciPy 并处理格式 +# ====================================================================== +# 辅助函数 +# ====================================================================== def batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, np.ndarray]: - + dist_results, indices_results = [], [] - input_is_1d_batch = (batch_numpy.ndim == 2) - input_is_single_sample_no_batch = (batch_numpy.ndim == 1) + # 保证 batch_numpy 至少是 (Batch, ...) + # 如果进来的是 (H, W),我们在外面已经处理成 (1, H, W) 了 + if batch_numpy.ndim == 1: + batch_numpy = batch_numpy[np.newaxis, ...] - if input_is_single_sample_no_batch: - batch_numpy = batch_numpy[np.newaxis, ...] # (L) -> (1, L) - - - dist_results, indices_results = [], [] for sample in batch_numpy: dist, indices = scipy_edt(sample, return_indices=True, return_distances=True) dist_results.append(dist) @@ -23,81 +22,106 @@ def batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, n output_dist = np.stack(dist_results, axis=0) output_indices = np.stack(indices_results, axis=0) + output_indices = np.moveaxis(output_indices, 1, -1) - # indices shape fix: (N, ndim_sample, ...) 
-> (N, ..., ndim_sample) - # 对于 1D: (N, 1, L) -> (N, L, 1) - output_indices = np.moveaxis(output_indices, 1, -1) - - if input_is_single_sample_no_batch: - output_dist = output_dist.squeeze(0) - output_indices = output_indices.squeeze(0) - return output_dist, output_indices -# 用例定义 -case_batch_2d = np.array([[[0., 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]],[[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]]], dtype=np.float32) +# ====================================================================== +# 测试数据 +# ====================================================================== +case_batch_1d = np.array([[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], dtype=np.float32) + +case_batch_2d = np.array([[[0., 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], + [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]]], dtype=np.float32) + +# 这里定义为 (4, 4),意图是单张 2D 图 +case_single_2d = np.array([[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0]], dtype=np.float32) +case_explicit_batch_one = case_single_2d[np.newaxis, ...] + _case_3d_s1 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s1[1, 1, 1] = 0.0; _case_3d_s1[2, 3, 4] = 0.0 _case_3d_s2 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s2[0, 0, 0] = 0.0 case_batch_3d = np.stack([_case_3d_s1, _case_3d_s2], axis=0) -case_single_2d = np.array([[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0]], dtype=np.float32) -case_explicit_batch_one = case_single_2d[np.newaxis, ...] 
+ case_dim_one = np.ones((2, 5, 1), dtype=np.float32); case_dim_one[0, 2, 0] = 0.0; case_dim_one[1, 4, 0] = 0.0 -case_batch_1d = np.array([[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], dtype=np.float32) +# 4D Case +_case_4d_s1 = np.ones((3, 3, 3, 3), dtype=np.float32); _case_4d_s1[0, 0, 0, 0] = 0.0 +_case_4d_s2 = np.ones((3, 3, 3, 3), dtype=np.float32); _case_4d_s2[1, 1, 1, 1] = 0.0 +case_batch_4d_spatial = np.stack([_case_4d_s1, _case_4d_s2], axis=0) + +# 5D Case +case_batch_5d_spatial = np.ones((1, 2, 2, 2, 2, 2), dtype=np.float32) +case_batch_5d_spatial[0, 0, 0, 0, 0, 0] = 0.0; case_batch_5d_spatial[0, 1, 1, 1, 1, 1] = 0.0 + +# ====================================================================== +# 测试逻辑 +# ====================================================================== @pytest.mark.parametrize( - "input_numpy", + "input_numpy, has_batch_dim", [ - pytest.param(case_batch_2d, id="批处理2D图像"), - pytest.param(case_batch_3d, id="批处理3D图像"), - pytest.param(case_single_2d, id="单张2D图像(隐式批处理)"), - pytest.param(case_explicit_batch_one, id="单张2D图像(显式批处理)"), - pytest.param(case_dim_one, id="含幺元维度的批处理"), - pytest.param(case_batch_1d, id="批处理1D数据"), + pytest.param(case_batch_1d, True, id="1D_Batch"), + pytest.param(case_batch_2d, True, id="2D_Batch"), + pytest.param(case_single_2d, False, id="2D_Single_NoBatch"), + pytest.param(case_explicit_batch_one, True, id="2D_Single_ExplicitBatch"), + pytest.param(case_batch_3d, True, id="3D_Batch"), + pytest.param(case_dim_one, True, id="2D_UnitDim_Batch"), + pytest.param(case_batch_4d_spatial, True, id="4D_Spatial_Batch"), + pytest.param(case_batch_5d_spatial, True, id="5D_Spatial_Batch"), ], ) -def test_distance_transform_and_indices(input_numpy: np.ndarray, request: pytest.FixtureRequest): +def test_distance_transform_and_indices(input_numpy: np.ndarray, has_batch_dim: bool, request: pytest.FixtureRequest): if not torch.cuda.is_available(): pytest.skip("CUDA not available") + # 1. 
准备 Numpy 数据 x_numpy_contiguous = np.ascontiguousarray(input_numpy) + + # 2. 准备 SciPy 输入 + # 如果意图是单样本 (has_batch_dim=False),我们手动增加 Batch 维, + # 这样 scipy 辅助函数就会把它当做一张图来处理,而不是 N 张 1D 图 + if not has_batch_dim: + scipy_input = x_numpy_contiguous[np.newaxis, ...] + else: + scipy_input = x_numpy_contiguous + + # 3. 准备 CUDA 输入 + # 关键修复: + # 如果 has_batch_dim=False,说明这是单张 (H, W),我们要测 2D EDT。 + # C++ API 默认第一维是 Batch,所以我们必须 unsqueeze(0) 变成 (1, H, W)。 + # 否则 C++ 会把它当做 (Batch=H, Len=W) 做 1D EDT。 x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() + if not has_batch_dim: + x_cuda = x_cuda.unsqueeze(0) - print(f"\n\n--- 正在运行测试: {request.node.callspec.id} ---") - print(f"输入张量形状: {x_cuda.shape}") + print(f"\n\n--- 运行测试: {request.node.callspec.id} ---") + print(f"CUDA 输入形状: {x_cuda.shape}") - # 调用您的 Python 包装函数 + # 4. 运行 CUDA EDT dist_cuda, idx_cuda = tm.distance_transform(x_cuda.clone()) - print(f"CUDA 距离输出形状: {dist_cuda.shape}") - print(f"CUDA 坐标输出形状: {idx_cuda.shape}") - - # 调用 SciPy 作为参考基准 - dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(x_numpy_contiguous) + # 5. 运行 SciPy (Ground Truth) + dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(scipy_input) dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() - - print(f"SciPy 距离输出形状: {dist_ref.shape}") - # 断言验证 - print("\n--- 正在验证距离... ---") - assert dist_cuda.shape == dist_ref.shape + # 6. 验证距离 + # 此时 dist_cuda 是 (1, H, W),dist_ref 也是 (1, H, W) + # 如果原意是 NoBatch,我们可以把 Batch 维 squeeze 掉再比,或者直接比 + print(f"CUDA Out Shape: {dist_cuda.shape}, Ref Shape: {dist_ref.shape}") + assert dist_cuda.shape == dist_ref.shape, f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) - print("距离断言通过 (形状和数值接近)。") + print(">> 距离验证通过。") - print("\n--- 正在验证坐标... ---") - - # 鲁棒的坐标验证逻辑 - had_no_batch_dim = (x_numpy_contiguous.ndim <= idx_cuda.shape[-1]) - spatial_shape = x_cuda.shape if had_no_batch_dim else x_cuda.shape[1:] + # 7. 
验证索引 + # idx_cuda: (1, H, W, 2) + # 构造 Grid + spatial_shape = x_cuda.shape[1:] # (H, W) coords = [torch.arange(s, device='cuda') for s in spatial_shape] - grid = torch.stack(torch.meshgrid(*coords, indexing='ij'), dim=-1) - - if not had_no_batch_dim: - grid = grid.unsqueeze(0) - + grid = torch.stack(torch.meshgrid(*coords, indexing='ij'), dim=-1) # (H, W, 2) + grid = grid.unsqueeze(0) # (1, H, W, 2) + diff = grid.float() - idx_cuda.float() - dist_sq_from_indices = torch.sum(diff * diff, dim=-1) + dist_sq_calculated = torch.sum(diff * diff, dim=-1) + dist_sq_output = dist_cuda * dist_cuda - torch.testing.assert_close(dist_sq_from_indices, dist_cuda * dist_cuda, atol=1e-3, rtol=1e-3) - print("坐标正确性断言通过 (计算出的距离与返回距离匹配)。") - - print("--- 测试通过 ---") \ No newline at end of file + torch.testing.assert_close(dist_sq_calculated, dist_sq_output, atol=1e-3, rtol=1e-3) + print(">> 索引验证通过。") \ No newline at end of file diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 24ba467..3bda330 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -5,23 +5,32 @@ #include #include +// ------------------------------------------------------------------ +// 配置常量 +// ------------------------------------------------------------------ #define INF_VAL 1e8f #define MAX_THREADS 1024 -#define SMEM_LIMIT_ELEMENTS 4096 // 48KB / 12 bytes (float+int+int) ~= 4096 +// Shared Memory 限制: 48KB 一般安全。 +// 每个像素需要: float(val) + int(idx1) + int(idx2) = 12 bytes +// 4096 * 12 = 48KB. 
+#define SMEM_LIMIT_ELEMENTS 4096 + +// ------------------------------------------------------------------ +// Device Helper Functions +// ------------------------------------------------------------------ __device__ __forceinline__ float sqr(float x) { return x * x; } -// 计算从像素 q 到源点 p 的距离代价 -// val_p 是源点 p 在上一轮计算后的距离平方值 (weight) +// 计算 JFA 代价: (q - p)^2 + weight[p] __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { if (p < 0) return INF_VAL; return sqr((float)q - (float)p) + val_p; } // ------------------------------------------------------------------ -// JFA 核心逻辑 (Device Function) +// JFA Core Logic (Device Only) // ------------------------------------------------------------------ -// 无论数据是在 Shared Memory 还是 Global Memory,逻辑是一样的 +// 核心 JFA 逻辑,与数据位置无关 (Shared 或 Global 均通用) __device__ void run_jfa_core( int N, int tid, @@ -29,13 +38,12 @@ __device__ void run_jfa_core( int* __restrict__ idx_curr, // Ping-Pong Buffer A int* __restrict__ idx_next // Ping-Pong Buffer B ) { - // 1. 初始化 + // 1. 
初始化: 根据 vals 决定是否是有效源点 for (int i = tid; i < N; i += blockDim.x) { - // 如果输入值很大,说明是背景,没有初始源点 if (vals[i] >= INF_VAL * 0.9f) { - idx_curr[i] = -1; + idx_curr[i] = -1; // 背景 } else { - idx_curr[i] = i; + idx_curr[i] = i; // 物体/源点,初始索引指向自己 } } __syncthreads(); @@ -49,11 +57,12 @@ __device__ void run_jfa_core( int my_best_p = idx_in[i]; float min_cost = INF_VAL; + // 检查自己当前的最优解 if (my_best_p != -1) { min_cost = compute_cost(i, my_best_p, vals[my_best_p]); } - // Check Left + // Check Left Neighbor (-step) int left = i - step; if (left >= 0) { int left_p = idx_in[left]; @@ -66,7 +75,7 @@ __device__ void run_jfa_core( } } - // Check Right + // Check Right Neighbor (+step) int right = i + step; if (right < N) { int right_p = idx_in[right]; @@ -99,42 +108,41 @@ __device__ void run_jfa_core( // ------------------------------------------------------------------ // Kernel 1: Shared Memory JFA (Fast Path) -// 适用于 N <= 4096 // ------------------------------------------------------------------ +// 模板参数 NDim: 如果 > 0,编译器会展开循环优化。 +// 参数 runtime_ndim: 如果 NDim == 0 (Default case),使用该参数作为维度。 template __global__ void edt_kernel_shared( - const float* __restrict__ in_data, // 当前维度的输入 (dist^2) - const int32_t* __restrict__ in_indices, // 上一轮的索引图 (N_slices, L, NDim) - float* __restrict__ out_dist, // 输出距离 - int32_t* __restrict__ out_indices, // 输出索引图 - int64_t L, // 当前维度的长度 (Length) - int64_t total_elements // Batch * ... * L + const float* __restrict__ in_data, // 输入 Dist^2 + const int32_t* __restrict__ in_indices, // 输入 Indices + float* __restrict__ out_dist, // 输出 Dist (IsFinal ? sqrt : sqr) + int32_t* __restrict__ out_indices, // 输出 Indices + int64_t L, // 当前维度的长度 + int64_t total_elements, // 总像素数 + int runtime_ndim // 运行时维度 (fallback) ) { - // 这里的 total_elements 是展平后的总像素数 - // 由于我们做了 transpose,数据布局是 [Batch_and_other_dims, L] - // 每个 Block 处理一行 (长度 L) - + // 确定实际维度 + const int D = (NDim > 0) ? 
NDim : runtime_ndim; + + // 计算行偏移 int64_t row_idx = blockIdx.x; int64_t offset = row_idx * L; if (offset >= total_elements) return; - // Shared Memory 布局: float vals[L], int idx1[L], int idx2[L] + // Shared Memory 布局 extern __shared__ char s_buffer[]; float* s_vals = (float*)s_buffer; int* s_idx1 = (int*)(s_vals + L); int* s_idx2 = (int*)(s_idx1 + L); - // 1. 加载数据到 Shared Memory + // 1. 加载 Dist 到 Shared Memory for (int i = threadIdx.x; i < L; i += blockDim.x) { - float val = __ldg(&in_data[offset + i]); - // 如果是初始 Pass (无输入索引),val 为 0 或 INF - // 如果是后续 Pass,val 为上一步的 dist^2 - s_vals[i] = val; + s_vals[i] = __ldg(&in_data[offset + i]); } __syncthreads(); - // 2. 运行 JFA + // 2. 运行 JFA 核心 run_jfa_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); // 3. 写回结果 @@ -142,69 +150,64 @@ __global__ void edt_kernel_shared( int p = s_idx1[q]; // 最近点在当前行内的局部索引 (0..L-1) float dist_val; + // 计算新距离 if (p != -1) { - // 计算新距离: (q-p)^2 + val[p] float dist_sq = sqr((float)q - (float)p) + s_vals[p]; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; } else { dist_val = IsFinal ? INF_VAL : sqr(INF_VAL); - p = 0; // fallback + p = 0; // 防止越界,随便指一个 } - out_dist[offset + q] = dist_val; - // 4. 
索引传播 - // 我们需要从 in_indices 查找完整的高维索引 - // in_indices 形状: [Batch..., L, NDim] - // 这里的 offset 对应 [Batch..., 0] - // p 是当前维度的偏移 + // 索引传播: Copy Vector [D] if (p != -1) { - int64_t src_offset = (offset + p) * NDim; - int64_t dst_offset = (offset + q) * NDim; + int64_t src_offset = (offset + p) * D; + int64_t dst_offset = (offset + q) * D; - // 手动展开拷贝,或者循环 - for (int d = 0; d < NDim; ++d) { + // 如果 NDim > 0,这里会完全展开,非常快 + for (int d = 0; d < D; ++d) { out_indices[dst_offset + d] = in_indices[src_offset + d]; } } else { - // 保持原样或填0 (通常保持原样即可,或者为了安全填0) - int64_t dst_offset = (offset + q) * NDim; - for (int d = 0; d < NDim; ++d) out_indices[dst_offset + d] = 0; + // 找不到源点(全图都是背景的情况) + int64_t dst_offset = (offset + q) * D; + for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; } } } // ------------------------------------------------------------------ // Kernel 2: Global Memory JFA (Fallback Path) -// 适用于 N > 4096,使用 Global Memory 作为 Ping-Pong Buffer // ------------------------------------------------------------------ +// 逻辑同上,只是用 Global Memory 做 Ping-Pong Buffer template __global__ void edt_kernel_global( const float* __restrict__ in_data, const int32_t* __restrict__ in_indices, float* __restrict__ out_dist, int32_t* __restrict__ out_indices, - int* __restrict__ global_buffer_1, // 临时 buffer A [TotalElements] - int* __restrict__ global_buffer_2, // 临时 buffer B [TotalElements] + int* __restrict__ global_buffer_1, + int* __restrict__ global_buffer_2, int64_t L, - int64_t total_elements + int64_t total_elements, + int runtime_ndim ) { + const int D = (NDim > 0) ? 
NDim : runtime_ndim; + int64_t row_idx = blockIdx.x; int64_t offset = row_idx * L; if (offset >= total_elements) return; - // 指向当前行在 Global Memory 中的位置 - // 注意:in_data 是只读的,我们需要把它当做 weight - // JFA 需要两个 int buffer 来存 index + // 指向 Global Memory 的指针 int* g_idx1 = global_buffer_1 + offset; int* g_idx2 = global_buffer_2 + offset; - // 直接在 Global Memory 上运行 JFA - // 注意:这里 vals 指针直接指向 in_data (Global),读取稍慢但无需拷贝 + // 1. & 2. 运行 JFA (直接在 Global Mem 上读写) run_jfa_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); - // 写回逻辑同上 + // 3. 写回结果 for (int q = threadIdx.x; q < L; q += blockDim.x) { int p = g_idx1[q]; float dist_val; @@ -221,177 +224,191 @@ __global__ void edt_kernel_global( out_dist[offset + q] = dist_val; if (p != -1) { - int64_t src_offset = (offset + p) * NDim; - int64_t dst_offset = (offset + q) * NDim; - for (int d = 0; d < NDim; ++d) { + int64_t src_offset = (offset + p) * D; + int64_t dst_offset = (offset + q) * D; + for (int d = 0; d < D; ++d) { out_indices[dst_offset + d] = in_indices[src_offset + d]; } } else { - int64_t dst_offset = (offset + q) * NDim; - for (int d = 0; d < NDim; ++d) out_indices[dst_offset + d] = 0; + int64_t dst_offset = (offset + q) * D; + for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; } } } - // ------------------------------------------------------------------ -// 辅助:初始化索引张量 +// Kernel 3: Initialize Indices // ------------------------------------------------------------------ -// 将 index tensor 初始化为 grid grid coordinates -// shape: (..., D), 最后一个维度存坐标 -__global__ void init_indices_kernel(int32_t* indices, int64_t total_elements, int NDim, - const int64_t* shape, const int64_t* strides) { +// 初始化索引张量为网格坐标 +// indices shape: (..., D) +__global__ void init_indices_kernel( + int32_t* indices, + int64_t total_pixels, + int NDim, + const int64_t* __restrict__ shape_ptr // shape of spatial dimensions +) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; + if (idx >= total_pixels) 
return; - // 反解坐标 + // 反解坐标 (Unravel Index) + // idx 是每个像素的 flat index + // 我们需要计算它在 spatial_shape 中的坐标 + int64_t temp = idx; - int32_t coords[10]; // max dims + // 使用本地寄存器数组避免多次全局内存读取 (假设最大 10 维) + int32_t coords[10]; - // strides 是针对 elements 展开的,但这里 indices 是 (Total, NDim) - // 我们可以简单地根据 shape 反解 - // 注意:这里的 total_elements 是像素数,不是 indices 数组的大小 - - // 假设 shape 是 [D0, D1, D2] - // idx 对应 flat index - + // 假设 spatial_shape 是 [D0, D1, D2] + // 倒序计算除余 for (int d = NDim - 1; d >= 0; --d) { - coords[d] = temp % shape[d]; - temp /= shape[d]; + int64_t dim_size = shape_ptr[d]; + coords[d] = temp % dim_size; + temp /= dim_size; } - // 写入 + // 写入 Global Memory + // Indices tensor 是 (TotalPixels, NDim) 扁平化的 + int64_t out_ptr = idx * NDim; for (int d = 0; d < NDim; ++d) { - indices[idx * NDim + d] = coords[d]; + indices[out_ptr + d] = coords[d]; } } // ------------------------------------------------------------------ -// Host 函数 +// Host Function: C++ Entry Point // ------------------------------------------------------------------ + std::tuple distance_transform_cuda(torch::Tensor input) { TORCH_CHECK(input.is_cuda(), "Input must be on CUDA device."); TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32."); input = input.contiguous(); - bool had_no_batch_dim = (input.dim() == 1); - if (had_no_batch_dim) input = input.unsqueeze(0); - - const int ndim = input.dim(); // Include batch + + // 处理 Batch 维度:如果输入是 1D (L),视为无 Batch,但在处理中统一加一个 Batch 维方便 + // 标准约定:Input shape (Batch, D1, D2, ..., Dn) + // 算法对 Batch 维度和其他维度处理其实是一样的(视为无关维度) + // 但索引初始化需要知道哪些是 "Spatial Dimensions"。 + // 这里假设:输入的所有维度除了 Batch (Dim 0) 外都是空间维度。 + + const int ndim = input.dim(); + // 如果 ndim=1, 假设是 (L),sample_ndim=1 + // 如果 ndim=4 (B, C, H, W),sample_ndim=3 (C,H,W 都算空间? 
通常 C 也是独立处理的) + // **修正**: 标准 EDT 通常是在 (H, W) 或 (D, H, W) 上进行的。 + // 如果有 Channel,通常 Channel 也是独立的。 + // 为了最通用,我们将 **除了第0维(Batch)** 以外的所有维度都视为空间维度进行索引记录。 + // 如果用户输入没有 Batch 维,请在 Python 端 unsqueeze(0)。 + + // 假设输入已经是 (Batch, ...Spatial...) const int sample_ndim = ndim - 1; + TORCH_CHECK(sample_ndim > 0, "Input tensor must have at least 2 dimensions (Batch, ...)"); + auto shape = input.sizes().vec(); int64_t num_pixels = input.numel(); if (num_pixels == 0) { auto index_shape = shape; - index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); + index_shape.push_back(sample_ndim); return std::make_tuple(torch::empty_like(input), torch::empty(index_shape, input.options().dtype(torch::kInt32))); } - // 1. 初始化输出 Tensor - // current_dist 在迭代过程中存储 dist^2,最后开方 - // 初始状态:Input 里的 0 还是 0,其他非 0 (背景) 设为 INF + // 1. 初始化 Distance Tensor + // 0 -> 0, 1 -> INF auto current_dist = torch::where(input == 0, torch::tensor(0.0f, input.options()), torch::tensor(INF_VAL, input.options())); - // 初始化索引 Map (Batch, ..., NDim) + // 2. 初始化 Index Tensor + // Shape: (Batch, D1, ..., Dn, sample_ndim) auto index_shape = shape; index_shape.push_back(sample_ndim); auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - // 启动 Kernel 初始化索引 - // 为了反解坐标,我们需要把 shape 传进去 + // 2.1 准备 Shape 数据传给 Kernel + std::vector spatial_shape(shape.begin() + 1, shape.end()); + auto shape_tensor = torch::tensor(spatial_shape, torch::kInt64).to(input.device()); + + // 2.2 运行初始化 Kernel { - // 排除 batch 维度的 shape 用于坐标计算? - // 需求是:返回的索引是 (batch_idx, z, y, x) 还是只是 (z, y, x)? 
- // 通常 EDT 返回的是 sample 内的坐标。所以我们忽略 batch 维度。 - std::vector sample_shape_vec(shape.begin() + 1, shape.end()); - auto sample_shape_tensor = torch::tensor(sample_shape_vec, torch::kInt64).to(input.device()); - // 这里的 strides 不需要,直接由 shape 反解 - int threads = 256; int blocks = (num_pixels + threads - 1) / threads; - - // 我们需要传递 sample_ndim init_indices_kernel<<>>( - current_idx.data_ptr(), - num_pixels, + current_idx.data_ptr(), + num_pixels, sample_ndim, - sample_shape_tensor.data_ptr(), - nullptr // strides not needed for simple unravel + shape_tensor.data_ptr() ); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("Init Kernel Failed: %s\n", cudaGetErrorString(err)); + } } - - // 用于 Global Memory Fallback 的临时 buffer + + // 预分配 Global Memory Buffer (懒加载) torch::Tensor global_buf1, global_buf2; - // 2. 逐维处理 (Separable Phases) - // 从最后一个维度倒着处理,或者顺序处理都可以。 - // 为了 Host Transpose 方便,我们遍历 sample 的每一个维度 (1 到 ndim-1) + // 3. 逐维处理 (Separable JFA) + // 遍历每一个空间维度 (从 1 到 ndim-1) for (int d = 1; d < ndim; ++d) { bool is_final_pass = (d == ndim - 1); - // ----------------------------------------------------------- - // Step A: Permute & Contiguous - // 将当前处理维度 d 移到最后: (0, 1, ..., d, ..., N-1) -> (0, 1, ..., N-1, d) - // 这样最后内存布局就是 [..., L],stride=1 - // ----------------------------------------------------------- - - // 这种 swap 策略比较简单: transpose(d, -1) - // 注意:index tensor 也要变换,但 index tensor 最后一维是 coord_dim,不能乱动。 - // Index tensor 形状是 [..., sample_ndim]。 - // 我们需要变换的是前面的空间维度 [...]。 - + // --- Step A: Transpose current dim to last --- + // 变换后 Shape: (..., L) auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); - // 此时 dist_in shape: [..., L] - // idx_in shape: [..., L, sample_ndim] - int64_t L = dist_in.size(-1); // 当前维度的长度 - int64_t total_slices = dist_in.numel() / L; // 有多少行 + int64_t total_slices = dist_in.numel() / L; auto dist_out = torch::empty_like(dist_in); auto idx_out = 
torch::empty_like(idx_in); - // ----------------------------------------------------------- - // Step B: Kernel Dispatch - // ----------------------------------------------------------- + // --- Step B: Kernel Dispatch --- int threads = std::min((int64_t)MAX_THREADS, L); - // 检查 Shared Memory 需求 - // Need: float(4) + int(4) + int(4) = 12 bytes per pixel + // 检查是否可以使用 Shared Memory if (L <= SMEM_LIMIT_ELEMENTS) { size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); - // 模板参数 NDim 需要是编译期常量。 - // 动态分发 sample_ndim (1D, 2D, 3D usually) - // 使用 switch case 覆盖常见维度 (1, 2, 3) + // 使用 Switch 宏来处理常用的维度模板特化 #define DISPATCH_SHARED(IS_FINAL) \ switch(sample_ndim) { \ case 1: edt_kernel_shared<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel()); break; \ + L, dist_in.numel(), sample_ndim); break; \ case 2: edt_kernel_shared<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel()); break; \ + L, dist_in.numel(), sample_ndim); break; \ case 3: edt_kernel_shared<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel()); break; \ - default: /* fallback for >3D */ break; \ + L, dist_in.numel(), sample_ndim); break; \ + case 4: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + case 5: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + case 6: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + default: /* Fallback for > 6D */ \ + edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ } if 
(is_final_pass) { DISPATCH_SHARED(true); } else { DISPATCH_SHARED(false); } } else { - // Fallback: Global Memory - // 需要分配 buffer: [total_slices * L] = [numel] + // Global Memory Fallback (L > 4096) if (global_buf1.numel() < dist_in.numel()) { global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); @@ -403,33 +420,49 @@ std::tuple distance_transform_cuda(torch::Tensor i dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel()); break; \ + L, dist_in.numel(), sample_ndim); break; \ case 2: edt_kernel_global<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel()); break; \ + L, dist_in.numel(), sample_ndim); break; \ case 3: edt_kernel_global<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel()); break; \ - default: break; \ + L, dist_in.numel(), sample_ndim); break; \ + case 4: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + case 5: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + case 6: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + default: /* Fallback */ \ + edt_kernel_global<<>>( \ + 
dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ } - if (is_final_pass) { DISPATCH_GLOBAL(true); } + if (is_final_pass) { DISPATCH_GLOBAL(true); } else { DISPATCH_GLOBAL(false); } } - // ----------------------------------------------------------- - // Step C: Transpose Back - // ----------------------------------------------------------- - current_dist = dist_out.transpose(d, ndim - 1); // View, non-contiguous is fine here as next step makes it contiguous + // --- Step C: Transpose Back --- + current_dist = dist_out.transpose(d, ndim - 1); current_idx = idx_out.transpose(d, ndim - 1); } - if (had_no_batch_dim) { - return std::make_tuple(current_dist.squeeze(0), current_idx.squeeze(0)); - } return std::make_tuple(current_dist, current_idx); -} \ No newline at end of file +} + From bc9c03b3033522c8a98b6a07ee012e751ba25960 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Mon, 8 Dec 2025 19:58:15 +0800 Subject: [PATCH 16/56] =?UTF-8?q?=E5=AE=9E=E7=8E=B0n=E7=BB=B4=E6=89=B9?= =?UTF-8?q?=E5=A4=84=E7=90=86=E5=90=8C=E6=97=B6=E8=BF=94=E5=9B=9E=E5=9D=90?= =?UTF-8?q?=E6=A0=87=E5=92=8C=E8=B7=9D=E7=A6=BB=E7=9A=84=E7=B2=BE=E7=A1=AE?= =?UTF-8?q?=E6=AC=A7=E5=BC=8F=E8=B7=9D=E7=A6=BB=E5=8F=98=E6=8D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_distance_transform.py | 140 +-- torchmorph/csrc/distance_transform_kernel.cu | 939 +++++++++++-------- 2 files changed, 592 insertions(+), 487 deletions(-) diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 6e6265d..3a11166 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -1,127 +1,91 @@ import torch import pytest -import numpy as np from scipy.ndimage import distance_transform_edt as scipy_edt -import torchmorph as tm +import numpy as np +import torchmorph as tm -# 
====================================================================== -# 辅助函数 -# ====================================================================== +# 辅助函数:调用 SciPy 并处理格式 def batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, np.ndarray]: - dist_results, indices_results = [], [] - - # 保证 batch_numpy 至少是 (Batch, ...) - # 如果进来的是 (H, W),我们在外面已经处理成 (1, H, W) 了 - if batch_numpy.ndim == 1: + is_single_sample = batch_numpy.ndim <= 2 + if is_single_sample: batch_numpy = batch_numpy[np.newaxis, ...] - + dist_results, indices_results = [], [] for sample in batch_numpy: dist, indices = scipy_edt(sample, return_indices=True, return_distances=True) dist_results.append(dist) indices_results.append(indices) - output_dist = np.stack(dist_results, axis=0) output_indices = np.stack(indices_results, axis=0) - output_indices = np.moveaxis(output_indices, 1, -1) - + output_indices = np.moveaxis(output_indices, 1, -1) + if is_single_sample: + output_dist = output_dist.squeeze(0) + output_indices = output_indices.squeeze(0) return output_dist, output_indices -# ====================================================================== -# 测试数据 -# ====================================================================== -case_batch_1d = np.array([[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], dtype=np.float32) - -case_batch_2d = np.array([[[0., 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], - [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]]], dtype=np.float32) - -# 这里定义为 (4, 4),意图是单张 2D 图 -case_single_2d = np.array([[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0]], dtype=np.float32) -case_explicit_batch_one = case_single_2d[np.newaxis, ...] 
- +# 用例定义 +case_batch_2d = np.array([[[0., 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]],[[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]]], dtype=np.float32) _case_3d_s1 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s1[1, 1, 1] = 0.0; _case_3d_s1[2, 3, 4] = 0.0 _case_3d_s2 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s2[0, 0, 0] = 0.0 case_batch_3d = np.stack([_case_3d_s1, _case_3d_s2], axis=0) - +case_single_2d = np.array([[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0]], dtype=np.float32) +case_explicit_batch_one = case_single_2d[np.newaxis, ...] case_dim_one = np.ones((2, 5, 1), dtype=np.float32); case_dim_one[0, 2, 0] = 0.0; case_dim_one[1, 4, 0] = 0.0 +case_batch_1d = np.array([[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], dtype=np.float32) -# 4D Case -_case_4d_s1 = np.ones((3, 3, 3, 3), dtype=np.float32); _case_4d_s1[0, 0, 0, 0] = 0.0 -_case_4d_s2 = np.ones((3, 3, 3, 3), dtype=np.float32); _case_4d_s2[1, 1, 1, 1] = 0.0 -case_batch_4d_spatial = np.stack([_case_4d_s1, _case_4d_s2], axis=0) - -# 5D Case -case_batch_5d_spatial = np.ones((1, 2, 2, 2, 2, 2), dtype=np.float32) -case_batch_5d_spatial[0, 0, 0, 0, 0, 0] = 0.0; case_batch_5d_spatial[0, 1, 1, 1, 1, 1] = 0.0 - -# ====================================================================== -# 测试逻辑 -# ====================================================================== @pytest.mark.parametrize( - "input_numpy, has_batch_dim", + "input_numpy", [ - pytest.param(case_batch_1d, True, id="1D_Batch"), - pytest.param(case_batch_2d, True, id="2D_Batch"), - pytest.param(case_single_2d, False, id="2D_Single_NoBatch"), - pytest.param(case_explicit_batch_one, True, id="2D_Single_ExplicitBatch"), - pytest.param(case_batch_3d, True, id="3D_Batch"), - pytest.param(case_dim_one, True, id="2D_UnitDim_Batch"), - pytest.param(case_batch_4d_spatial, True, id="4D_Spatial_Batch"), - pytest.param(case_batch_5d_spatial, True, id="5D_Spatial_Batch"), + pytest.param(case_batch_2d, id="批处理2D图像"), + pytest.param(case_batch_3d, id="批处理3D图像"), + 
pytest.param(case_single_2d, id="单张2D图像(隐式批处理)"), + pytest.param(case_explicit_batch_one, id="单张2D图像(显式批处理)"), + pytest.param(case_dim_one, id="含幺元维度的批处理"), + pytest.param(case_batch_1d, id="批处理1D数据"), ], ) -def test_distance_transform_and_indices(input_numpy: np.ndarray, has_batch_dim: bool, request: pytest.FixtureRequest): +def test_distance_transform_and_indices(input_numpy: np.ndarray, request: pytest.FixtureRequest): if not torch.cuda.is_available(): pytest.skip("CUDA not available") - # 1. 准备 Numpy 数据 x_numpy_contiguous = np.ascontiguousarray(input_numpy) - - # 2. 准备 SciPy 输入 - # 如果意图是单样本 (has_batch_dim=False),我们手动增加 Batch 维, - # 这样 scipy 辅助函数就会把它当做一张图来处理,而不是 N 张 1D 图 - if not has_batch_dim: - scipy_input = x_numpy_contiguous[np.newaxis, ...] - else: - scipy_input = x_numpy_contiguous - - # 3. 准备 CUDA 输入 - # 关键修复: - # 如果 has_batch_dim=False,说明这是单张 (H, W),我们要测 2D EDT。 - # C++ API 默认第一维是 Batch,所以我们必须 unsqueeze(0) 变成 (1, H, W)。 - # 否则 C++ 会把它当做 (Batch=H, Len=W) 做 1D EDT。 x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() - if not has_batch_dim: - x_cuda = x_cuda.unsqueeze(0) - print(f"\n\n--- 运行测试: {request.node.callspec.id} ---") - print(f"CUDA 输入形状: {x_cuda.shape}") + print(f"\n\n--- 正在运行测试: {request.node.callspec.id} ---") + print(f"输入张量形状: {x_cuda.shape}") - # 4. 运行 CUDA EDT + # 调用您的 Python 包装函数 dist_cuda, idx_cuda = tm.distance_transform(x_cuda.clone()) - # 5. 运行 SciPy (Ground Truth) - dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(scipy_input) + print(f"CUDA 距离输出形状: {dist_cuda.shape}") + print(f"CUDA 坐标输出形状: {idx_cuda.shape}") + + # 调用 SciPy 作为参考基准 + dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(x_numpy_contiguous) dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() + + print(f"SciPy 距离输出形状: {dist_ref.shape}") - # 6. 
验证距离 - # 此时 dist_cuda 是 (1, H, W),dist_ref 也是 (1, H, W) - # 如果原意是 NoBatch,我们可以把 Batch 维 squeeze 掉再比,或者直接比 - print(f"CUDA Out Shape: {dist_cuda.shape}, Ref Shape: {dist_ref.shape}") - assert dist_cuda.shape == dist_ref.shape, f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" + # 断言验证 + print("\n--- 正在验证距离... ---") + assert dist_cuda.shape == dist_ref.shape torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) - print(">> 距离验证通过。") + print("距离断言通过 (形状和数值接近)。") - # 7. 验证索引 - # idx_cuda: (1, H, W, 2) - # 构造 Grid - spatial_shape = x_cuda.shape[1:] # (H, W) + print("\n--- 正在验证坐标... ---") + + # 鲁棒的坐标验证逻辑 + had_no_batch_dim = (x_numpy_contiguous.ndim <= idx_cuda.shape[-1]) + spatial_shape = x_cuda.shape if had_no_batch_dim else x_cuda.shape[1:] coords = [torch.arange(s, device='cuda') for s in spatial_shape] - grid = torch.stack(torch.meshgrid(*coords, indexing='ij'), dim=-1) # (H, W, 2) - grid = grid.unsqueeze(0) # (1, H, W, 2) - + grid = torch.stack(torch.meshgrid(*coords, indexing='ij'), dim=-1) + + if not had_no_batch_dim: + grid = grid.unsqueeze(0) + diff = grid.float() - idx_cuda.float() - dist_sq_calculated = torch.sum(diff * diff, dim=-1) - dist_sq_output = dist_cuda * dist_cuda + dist_sq_from_indices = torch.sum(diff * diff, dim=-1) - torch.testing.assert_close(dist_sq_calculated, dist_sq_output, atol=1e-3, rtol=1e-3) - print(">> 索引验证通过。") \ No newline at end of file + torch.testing.assert_close(dist_sq_from_indices, dist_cuda * dist_cuda, atol=1e-3, rtol=1e-3) + print("坐标正确性断言通过 (计算出的距离与返回距离匹配)。") + + print("--- 测试通过 ---") \ No newline at end of file diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 3bda330..4ec4647 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,468 +1,609 @@ #include #include -#include #include -#include -#include - -// ------------------------------------------------------------------ -// 配置常量 
-// ------------------------------------------------------------------ -#define INF_VAL 1e8f -#define MAX_THREADS 1024 -// Shared Memory 限制: 48KB 一般安全。 -// 每个像素需要: float(val) + int(idx1) + int(idx2) = 12 bytes -// 4096 * 12 = 48KB. -#define SMEM_LIMIT_ELEMENTS 4096 - -// ------------------------------------------------------------------ -// Device Helper Functions -// ------------------------------------------------------------------ - -__device__ __forceinline__ float sqr(float x) { return x * x; } - -// 计算 JFA 代价: (q - p)^2 + weight[p] -__device__ __forceinline__ float compute_cost(int q, int p, float val_p) { - if (p < 0) return INF_VAL; - return sqr((float)q - (float)p) + val_p; -} - -// ------------------------------------------------------------------ -// JFA Core Logic (Device Only) -// ------------------------------------------------------------------ -// 核心 JFA 逻辑,与数据位置无关 (Shared 或 Global 均通用) -__device__ void run_jfa_core( - int N, - int tid, - const float* __restrict__ vals, // 输入权重 (只读) - int* __restrict__ idx_curr, // Ping-Pong Buffer A - int* __restrict__ idx_next // Ping-Pong Buffer B +#include +#include +#include + +// 优化策略:用4个独立的内核函数替代模板,完全消除分支 + +// 内核1: 第一个pass且是唯一pass (1D情况) +__global__ void edt_kernel_first_final( + const float* in_data, + float* out_dist, + int32_t* out_idx, + const int64_t* shape, + const int64_t* strides, + int32_t ndim, + int32_t process_dim_sample, + int64_t total_slices, + int64_t num_slices_per_sample ) { - // 1. 
初始化: 根据 vals 决定是否是有效源点 - for (int i = tid; i < N; i += blockDim.x) { - if (vals[i] >= INF_VAL * 0.9f) { - idx_curr[i] = -1; // 背景 - } else { - idx_curr[i] = i; // 物体/源点,初始索引指向自己 + int64_t slice_idx = blockIdx.x; + if (slice_idx >= total_slices) return; + + int64_t batch_idx = slice_idx / num_slices_per_sample; + int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; + int64_t batch_offset = batch_idx * strides[0]; + int64_t sample_base_offset = 0; + int64_t temp_idx = slice_idx_in_sample; + const int sample_ndim = ndim - 1; + + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; } } - __syncthreads(); - - // 2. 迭代传播 (Step = 1, 2, 4, ... < N) - int* idx_in = idx_curr; - int* idx_out = idx_next; + + const int64_t process_dim_actual = process_dim_sample + 1; + const int64_t N = shape[process_dim_actual]; + const int64_t stride = strides[process_dim_actual]; + const int64_t base_offset = batch_offset + sample_base_offset; - for (int step = 1; step < N; step *= 2) { - for (int i = tid; i < N; i += blockDim.x) { - int my_best_p = idx_in[i]; - float min_cost = INF_VAL; + if (N == 0) return; - // 检查自己当前的最优解 - if (my_best_p != -1) { - min_cost = compute_cost(i, my_best_p, vals[my_best_p]); - } - - // Check Left Neighbor (-step) - int left = i - step; - if (left >= 0) { - int left_p = idx_in[left]; - if (left_p != -1) { - float c = compute_cost(i, left_p, vals[left_p]); - if (c < min_cost) { - min_cost = c; - my_best_p = left_p; - } + extern __shared__ char s_buffer[]; + float* f = (float*)s_buffer; + int* v = (int*)(f + N); + float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); + + // 加载数据 - 第一个pass + for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { + int64_t global_offset = base_offset + i * 
stride; + float val = __ldg(&in_data[global_offset]); + int32_t* shared_idx_ptr = s_idx + i * sample_ndim; + + if (val == 0.0f) { + f[i] = 0.0f; + int64_t temp_coord = slice_idx_in_sample; + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + shared_idx_ptr[d] = temp_coord % size_of_dim; + temp_coord /= size_of_dim; + } else { + shared_idx_ptr[d] = 0; } } + shared_idx_ptr[process_dim_sample] = i; + } else { + f[i] = 1e20f; + for (int d = 0; d < sample_ndim; ++d) shared_idx_ptr[d] = -1; + } + } + __syncthreads(); - // Check Right Neighbor (+step) - int right = i + step; - if (right < N) { - int right_p = idx_in[right]; - if (right_p != -1) { - float c = compute_cost(i, right_p, vals[right_p]); - if (c < min_cost) { - min_cost = c; - my_best_p = right_p; - } + // 构建包络 + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e20f; + z[1] = 1e20f; + + for (int q = 1; q < N; q++) { + float fq = f[q]; + int q_sq = q * q; + + while (k >= 0) { + int p = v[k]; + float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e20f; + break; + } + k--; + if (k < 0) { + k = 0; + v[0] = q; + z[0] = -1e20f; + z[1] = 1e20f; + break; } } - idx_out[i] = my_best_p; } - - // Swap Pointers - int* temp = idx_in; - idx_in = idx_out; - idx_out = temp; - __syncthreads(); } + __syncthreads(); - // 3. 
确保最终结果在 idx_curr (如果循环结束时在 next,则拷回) - if (idx_in != idx_curr) { - for (int i = tid; i < N; i += blockDim.x) { - idx_curr[i] = idx_next[i]; + // 计算距离 - 最后一个pass,直接开方 + for (int q = threadIdx.x; q < N; q += blockDim.x) { + int k = 0; + float q_float = (float)q; + while (z[k + 1] < q_float) k++; + + int p = v[k]; + int64_t global_offset = base_offset + q * stride; + float dist_sq = (float)(q - p) * (q - p) + f[p]; + + out_dist[global_offset] = sqrtf(dist_sq); // 直接开方 + + int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; + const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; } - __syncthreads(); } } -// ------------------------------------------------------------------ -// Kernel 1: Shared Memory JFA (Fast Path) -// ------------------------------------------------------------------ -// 模板参数 NDim: 如果 > 0,编译器会展开循环优化。 -// 参数 runtime_ndim: 如果 NDim == 0 (Default case),使用该参数作为维度。 -template -__global__ void edt_kernel_shared( - const float* __restrict__ in_data, // 输入 Dist^2 - const int32_t* __restrict__ in_indices, // 输入 Indices - float* __restrict__ out_dist, // 输出 Dist (IsFinal ? sqrt : sqr) - int32_t* __restrict__ out_indices, // 输出 Indices - int64_t L, // 当前维度的长度 - int64_t total_elements, // 总像素数 - int runtime_ndim // 运行时维度 (fallback) +// 内核2: 第一个pass但不是最后 +__global__ void edt_kernel_first_only( + const float* in_data, + float* out_dist, + int32_t* out_idx, + const int64_t* shape, + const int64_t* strides, + int32_t ndim, + int32_t process_dim_sample, + int64_t total_slices, + int64_t num_slices_per_sample ) { - // 确定实际维度 - const int D = (NDim > 0) ? 
NDim : runtime_ndim; - - // 计算行偏移 - int64_t row_idx = blockIdx.x; - int64_t offset = row_idx * L; + int64_t slice_idx = blockIdx.x; + if (slice_idx >= total_slices) return; + + int64_t batch_idx = slice_idx / num_slices_per_sample; + int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; + int64_t batch_offset = batch_idx * strides[0]; + int64_t sample_base_offset = 0; + int64_t temp_idx = slice_idx_in_sample; + const int sample_ndim = ndim - 1; + + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; + } + } - if (offset >= total_elements) return; - - // Shared Memory 布局 - extern __shared__ char s_buffer[]; - float* s_vals = (float*)s_buffer; - int* s_idx1 = (int*)(s_vals + L); - int* s_idx2 = (int*)(s_idx1 + L); + const int64_t process_dim_actual = process_dim_sample + 1; + const int64_t N = shape[process_dim_actual]; + const int64_t stride = strides[process_dim_actual]; + const int64_t base_offset = batch_offset + sample_base_offset; - // 1. 加载 Dist 到 Shared Memory - for (int i = threadIdx.x; i < L; i += blockDim.x) { - s_vals[i] = __ldg(&in_data[offset + i]); - } - __syncthreads(); + if (N == 0) return; - // 2. 运行 JFA 核心 - run_jfa_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); - - // 3. 写回结果 - for (int q = threadIdx.x; q < L; q += blockDim.x) { - int p = s_idx1[q]; // 最近点在当前行内的局部索引 (0..L-1) - float dist_val; - - // 计算新距离 - if (p != -1) { - float dist_sq = sqr((float)q - (float)p) + s_vals[p]; - dist_val = IsFinal ? 
sqrtf(dist_sq) : dist_sq; + extern __shared__ char s_buffer[]; + float* f = (float*)s_buffer; + int* v = (int*)(f + N); + float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); + + for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { + int64_t global_offset = base_offset + i * stride; + float val = __ldg(&in_data[global_offset]); + int32_t* shared_idx_ptr = s_idx + i * sample_ndim; + + if (val == 0.0f) { + f[i] = 0.0f; + int64_t temp_coord = slice_idx_in_sample; + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + shared_idx_ptr[d] = temp_coord % size_of_dim; + temp_coord /= size_of_dim; + } else { + shared_idx_ptr[d] = 0; + } + } + shared_idx_ptr[process_dim_sample] = i; } else { - dist_val = IsFinal ? INF_VAL : sqr(INF_VAL); - p = 0; // 防止越界,随便指一个 + f[i] = 1e20f; + for (int d = 0; d < sample_ndim; ++d) shared_idx_ptr[d] = -1; } - out_dist[offset + q] = dist_val; + } + __syncthreads(); - // 索引传播: Copy Vector [D] - if (p != -1) { - int64_t src_offset = (offset + p) * D; - int64_t dst_offset = (offset + q) * D; + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e20f; + z[1] = 1e20f; + + for (int q = 1; q < N; q++) { + float fq = f[q]; + int q_sq = q * q; - // 如果 NDim > 0,这里会完全展开,非常快 - for (int d = 0; d < D; ++d) { - out_indices[dst_offset + d] = in_indices[src_offset + d]; + while (k >= 0) { + int p = v[k]; + float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e20f; + break; + } + k--; + if (k < 0) { + k = 0; + v[0] = q; + z[0] = -1e20f; + z[1] = 1e20f; + break; + } } - } else { - // 找不到源点(全图都是背景的情况) - int64_t dst_offset = (offset + q) * D; - for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; + } + } + __syncthreads(); + + for (int q = threadIdx.x; q < N; q += blockDim.x) { + int k = 0; + float q_float = 
(float)q; + while (z[k + 1] < q_float) k++; + + int p = v[k]; + int64_t global_offset = base_offset + q * stride; + float dist_sq = (float)(q - p) * (q - p) + f[p]; + + out_dist[global_offset] = dist_sq; // 不开方 + + int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; + const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; } } } -// ------------------------------------------------------------------ -// Kernel 2: Global Memory JFA (Fallback Path) -// ------------------------------------------------------------------ -// 逻辑同上,只是用 Global Memory 做 Ping-Pong Buffer -template -__global__ void edt_kernel_global( - const float* __restrict__ in_data, - const int32_t* __restrict__ in_indices, - float* __restrict__ out_dist, - int32_t* __restrict__ out_indices, - int* __restrict__ global_buffer_1, - int* __restrict__ global_buffer_2, - int64_t L, - int64_t total_elements, - int runtime_ndim +// 内核3: 中间pass +__global__ void edt_kernel_middle( + const float* in_dist, + const int32_t* in_idx, + float* out_dist, + int32_t* out_idx, + const int64_t* shape, + const int64_t* strides, + int32_t ndim, + int32_t process_dim_sample, + int64_t total_slices, + int64_t num_slices_per_sample ) { - const int D = (NDim > 0) ? 
NDim : runtime_ndim; - - int64_t row_idx = blockIdx.x; - int64_t offset = row_idx * L; + int64_t slice_idx = blockIdx.x; + if (slice_idx >= total_slices) return; + + int64_t batch_idx = slice_idx / num_slices_per_sample; + int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; + int64_t batch_offset = batch_idx * strides[0]; + int64_t sample_base_offset = 0; + int64_t temp_idx = slice_idx_in_sample; + const int sample_ndim = ndim - 1; + + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; + } + } - if (offset >= total_elements) return; + const int64_t process_dim_actual = process_dim_sample + 1; + const int64_t N = shape[process_dim_actual]; + const int64_t stride = strides[process_dim_actual]; + const int64_t base_offset = batch_offset + sample_base_offset; - // 指向 Global Memory 的指针 - int* g_idx1 = global_buffer_1 + offset; - int* g_idx2 = global_buffer_2 + offset; - - // 1. & 2. 运行 JFA (直接在 Global Mem 上读写) - run_jfa_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); - - // 3. 写回结果 - for (int q = threadIdx.x; q < L; q += blockDim.x) { - int p = g_idx1[q]; - float dist_val; - - if (p != -1) { - float val_p = in_data[offset + p]; - float dist_sq = sqr((float)q - (float)p) + val_p; - dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; - } else { - dist_val = IsFinal ? 
INF_VAL : sqr(INF_VAL); - p = 0; - } + if (N == 0) return; - out_dist[offset + q] = dist_val; + extern __shared__ char s_buffer[]; + float* f = (float*)s_buffer; + int* v = (int*)(f + N); + float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); + + for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { + int64_t global_offset = base_offset + i * stride; + f[i] = __ldg(&in_dist[global_offset]); + + const int32_t* global_idx_ptr = in_idx + global_offset * sample_ndim; + int32_t* shared_idx_ptr = s_idx + i * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + shared_idx_ptr[d] = __ldg(&global_idx_ptr[d]); + } + } + __syncthreads(); - if (p != -1) { - int64_t src_offset = (offset + p) * D; - int64_t dst_offset = (offset + q) * D; - for (int d = 0; d < D; ++d) { - out_indices[dst_offset + d] = in_indices[src_offset + d]; + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e20f; + z[1] = 1e20f; + + for (int q = 1; q < N; q++) { + float fq = f[q]; + int q_sq = q * q; + + while (k >= 0) { + int p = v[k]; + float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e20f; + break; + } + k--; + if (k < 0) { + k = 0; + v[0] = q; + z[0] = -1e20f; + z[1] = 1e20f; + break; + } } - } else { - int64_t dst_offset = (offset + q) * D; - for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; + } + } + __syncthreads(); + + for (int q = threadIdx.x; q < N; q += blockDim.x) { + int k = 0; + float q_float = (float)q; + while (z[k + 1] < q_float) k++; + + int p = v[k]; + int64_t global_offset = base_offset + q * stride; + float dist_sq = (float)(q - p) * (q - p) + f[p]; + + out_dist[global_offset] = dist_sq; // 不开方 + + int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; + const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; } } } -// 
------------------------------------------------------------------ -// Kernel 3: Initialize Indices -// ------------------------------------------------------------------ -// 初始化索引张量为网格坐标 -// indices shape: (..., D) -__global__ void init_indices_kernel( - int32_t* indices, - int64_t total_pixels, - int NDim, - const int64_t* __restrict__ shape_ptr // shape of spatial dimensions +// 内核4: 最后一个pass +__global__ void edt_kernel_final( + const float* in_dist, + const int32_t* in_idx, + float* out_dist, + int32_t* out_idx, + const int64_t* shape, + const int64_t* strides, + int32_t ndim, + int32_t process_dim_sample, + int64_t total_slices, + int64_t num_slices_per_sample ) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_pixels) return; - - // 反解坐标 (Unravel Index) - // idx 是每个像素的 flat index - // 我们需要计算它在 spatial_shape 中的坐标 + int64_t slice_idx = blockIdx.x; + if (slice_idx >= total_slices) return; + + int64_t batch_idx = slice_idx / num_slices_per_sample; + int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; + int64_t batch_offset = batch_idx * strides[0]; + int64_t sample_base_offset = 0; + int64_t temp_idx = slice_idx_in_sample; + const int sample_ndim = ndim - 1; + + for (int32_t d = sample_ndim - 1; d >= 0; --d) { + if (d == process_dim_sample) continue; + int64_t size_of_dim = shape[d + 1]; + if (size_of_dim > 0) { + sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; + } + } - int64_t temp = idx; - // 使用本地寄存器数组避免多次全局内存读取 (假设最大 10 维) - int32_t coords[10]; - - // 假设 spatial_shape 是 [D0, D1, D2] - // 倒序计算除余 - for (int d = NDim - 1; d >= 0; --d) { - int64_t dim_size = shape_ptr[d]; - coords[d] = temp % dim_size; - temp /= dim_size; + const int64_t process_dim_actual = process_dim_sample + 1; + const int64_t N = shape[process_dim_actual]; + const int64_t stride = strides[process_dim_actual]; + const int64_t base_offset = batch_offset + sample_base_offset; + + if (N == 0) return; + + extern 
__shared__ char s_buffer[]; + float* f = (float*)s_buffer; + int* v = (int*)(f + N); + float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); + + for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { + int64_t global_offset = base_offset + i * stride; + f[i] = __ldg(&in_dist[global_offset]); + + const int32_t* global_idx_ptr = in_idx + global_offset * sample_ndim; + int32_t* shared_idx_ptr = s_idx + i * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + shared_idx_ptr[d] = __ldg(&global_idx_ptr[d]); + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + int k = 0; + v[0] = 0; + z[0] = -1e20f; + z[1] = 1e20f; + + for (int q = 1; q < N; q++) { + float fq = f[q]; + int q_sq = q * q; + + while (k >= 0) { + int p = v[k]; + float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); + if (s > z[k]) { + k++; + v[k] = q; + z[k] = s; + z[k + 1] = 1e20f; + break; + } + k--; + if (k < 0) { + k = 0; + v[0] = q; + z[0] = -1e20f; + z[1] = 1e20f; + break; + } + } + } } + __syncthreads(); + + for (int q = threadIdx.x; q < N; q += blockDim.x) { + int k = 0; + float q_float = (float)q; + while (z[k + 1] < q_float) k++; + + int p = v[k]; + int64_t global_offset = base_offset + q * stride; + float dist_sq = (float)(q - p) * (q - p) + f[p]; + + out_dist[global_offset] = sqrtf(dist_sq); // 最后开方 - // 写入 Global Memory - // Indices tensor 是 (TotalPixels, NDim) 扁平化的 - int64_t out_ptr = idx * NDim; - for (int d = 0; d < NDim; ++d) { - indices[out_ptr + d] = coords[d]; + int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; + const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; + } } } -// ------------------------------------------------------------------ -// Host Function: C++ Entry Point -// ------------------------------------------------------------------ - +// Host函数 std::tuple distance_transform_cuda(torch::Tensor input) { - 
TORCH_CHECK(input.is_cuda(), "Input must be on CUDA device."); - TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32."); - + TORCH_CHECK(input.is_cuda(), "Input must be on a CUDA device."); + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be a float tensor."); input = input.contiguous(); - - // 处理 Batch 维度:如果输入是 1D (L),视为无 Batch,但在处理中统一加一个 Batch 维方便 - // 标准约定:Input shape (Batch, D1, D2, ..., Dn) - // 算法对 Batch 维度和其他维度处理其实是一样的(视为无关维度) - // 但索引初始化需要知道哪些是 "Spatial Dimensions"。 - // 这里假设:输入的所有维度除了 Batch (Dim 0) 外都是空间维度。 - - const int ndim = input.dim(); - // 如果 ndim=1, 假设是 (L),sample_ndim=1 - // 如果 ndim=4 (B, C, H, W),sample_ndim=3 (C,H,W 都算空间? 通常 C 也是独立处理的) - // **修正**: 标准 EDT 通常是在 (H, W) 或 (D, H, W) 上进行的。 - // 如果有 Channel,通常 Channel 也是独立的。 - // 为了最通用,我们将 **除了第0维(Batch)** 以外的所有维度都视为空间维度进行索引记录。 - // 如果用户输入没有 Batch 维,请在 Python 端 unsqueeze(0)。 - - // 假设输入已经是 (Batch, ...Spatial...) - const int sample_ndim = ndim - 1; - TORCH_CHECK(sample_ndim > 0, "Input tensor must have at least 2 dimensions (Batch, ...)"); - auto shape = input.sizes().vec(); - int64_t num_pixels = input.numel(); + bool had_no_batch_dim = (input.dim() <= 2); + if (had_no_batch_dim) { input = input.unsqueeze(0); } - if (num_pixels == 0) { + const auto ndim = input.dim(); + const auto sample_ndim = ndim - 1; + const auto batch_size = input.size(0); + + auto shape = input.sizes().vec(); + auto strides_vec = input.strides().vec(); + + if (input.numel() == 0) { + auto distance = torch::empty_like(input); auto index_shape = shape; - index_shape.push_back(sample_ndim); - return std::make_tuple(torch::empty_like(input), - torch::empty(index_shape, input.options().dtype(torch::kInt32))); + index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); + auto index = torch::empty(index_shape, input.options().dtype(torch::kInt32)); + if (had_no_batch_dim) return std::make_tuple(distance.squeeze(0), index.squeeze(0)); + return std::make_tuple(distance, index); } - - // 1. 
初始化 Distance Tensor - // 0 -> 0, 1 -> INF - auto current_dist = torch::where(input == 0, - torch::tensor(0.0f, input.options()), - torch::tensor(INF_VAL, input.options())); - // 2. 初始化 Index Tensor - // Shape: (Batch, D1, ..., Dn, sample_ndim) + auto distance = torch::empty_like(input); + auto index_options = input.options().dtype(torch::kInt32); auto index_shape = shape; - index_shape.push_back(sample_ndim); - auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - - // 2.1 准备 Shape 数据传给 Kernel - std::vector spatial_shape(shape.begin() + 1, shape.end()); - auto shape_tensor = torch::tensor(spatial_shape, torch::kInt64).to(input.device()); - - // 2.2 运行初始化 Kernel - { - int threads = 256; - int blocks = (num_pixels + threads - 1) / threads; - init_indices_kernel<<>>( - current_idx.data_ptr(), - num_pixels, - sample_ndim, - shape_tensor.data_ptr() - ); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("Init Kernel Failed: %s\n", cudaGetErrorString(err)); + index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); + auto index = torch::empty(index_shape, index_options); + + if (torch::all(input != 0).item()) { + distance.fill_(std::numeric_limits::infinity()); + index.fill_(-1); + if (had_no_batch_dim) { + return std::make_tuple(distance.squeeze(0), index.squeeze(0)); } + return std::make_tuple(distance, index); } - // 预分配 Global Memory Buffer (懒加载) - torch::Tensor global_buf1, global_buf2; + auto shape_tensor = torch::tensor(shape, + torch::TensorOptions().dtype(torch::kInt64).device(input.device())); + auto strides_tensor = torch::tensor(strides_vec, + torch::TensorOptions().dtype(torch::kInt64).device(input.device())); + + const int64_t* shape_gpu = shape_tensor.data_ptr(); + const int64_t* strides_gpu = strides_tensor.data_ptr(); - // 3. 
逐维处理 (Separable JFA) - // 遍历每一个空间维度 (从 1 到 ndim-1) - for (int d = 1; d < ndim; ++d) { - bool is_final_pass = (d == ndim - 1); - - // --- Step A: Transpose current dim to last --- - // 变换后 Shape: (..., L) - auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); - auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); + std::vector> dim_order_pairs; + for (int32_t d_sample = 0; d_sample < sample_ndim; ++d_sample) { + dim_order_pairs.push_back({strides_vec[d_sample + 1], d_sample}); + } + std::sort(dim_order_pairs.rbegin(), dim_order_pairs.rend()); + + if (sample_ndim == 0) { + int64_t total_slices = batch_size; + int64_t slice_len = (shape.size() > 1) ? shape[1] : 0; + int threads = std::min((int64_t)256, slice_len); + size_t smem = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + + (slice_len + 2) * sizeof(float) + slice_len * 1 * sizeof(int32_t); - int64_t L = dist_in.size(-1); // 当前维度的长度 - int64_t total_slices = dist_in.numel() / L; + edt_kernel_first_final<<>>( + input.data_ptr(), + distance.data_ptr(), index.data_ptr(), + shape_gpu, strides_gpu, ndim, 0, total_slices, 1 + ); + } else { + auto buffer_dist = torch::empty_like(distance); + auto buffer_idx = torch::empty_like(index); - auto dist_out = torch::empty_like(dist_in); - auto idx_out = torch::empty_like(idx_in); + for (int pass = 0; pass < sample_ndim; ++pass) { + int32_t d_sample = dim_order_pairs[pass].second; + bool is_first = (pass == 0); + bool is_final = (pass == sample_ndim - 1); - // --- Step B: Kernel Dispatch --- - int threads = std::min((int64_t)MAX_THREADS, L); - - // 检查是否可以使用 Shared Memory - if (L <= SMEM_LIMIT_ELEMENTS) { - size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); + torch::Tensor *in_dist, *in_idx, *out_dist, *out_idx; - // 使用 Switch 宏来处理常用的维度模板特化 - #define DISPATCH_SHARED(IS_FINAL) \ - switch(sample_ndim) { \ - case 1: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, 
dist_in.numel(), sample_ndim); break; \ - case 2: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 3: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 4: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 5: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 6: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - default: /* Fallback for > 6D */ \ - edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - } - - if (is_final_pass) { DISPATCH_SHARED(true); } - else { DISPATCH_SHARED(false); } - - } else { - // Global Memory Fallback (L > 4096) - if (global_buf1.numel() < dist_in.numel()) { - global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); - global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); + if (pass % 2 == 0) { + in_dist = &distance; in_idx = &index; + out_dist = &buffer_dist; out_idx = &buffer_idx; + } else { + in_dist = &buffer_dist; in_idx = &buffer_idx; + out_dist = &distance; out_idx = &index; + } + + int64_t num_slices_per_sample = 1; + for(int i = 0; i < sample_ndim; ++i) { + if (i != d_sample) num_slices_per_sample *= shape[i + 1]; + } + int64_t total_slices = batch_size * num_slices_per_sample; + int64_t slice_len = shape[d_sample + 1]; + + int threads = 
std::min((int64_t)256, slice_len); + size_t smem = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + + (slice_len + 2) * sizeof(float) + slice_len * sample_ndim * sizeof(int32_t); + + if (is_first && is_final) { + edt_kernel_first_final<<>>( + input.data_ptr(), + out_dist->data_ptr(), out_idx->data_ptr(), + shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + ); + } else if (is_first) { + edt_kernel_first_only<<>>( + input.data_ptr(), + out_dist->data_ptr(), out_idx->data_ptr(), + shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + ); + } else if (is_final) { + edt_kernel_final<<>>( + in_dist->data_ptr(), in_idx->data_ptr(), + out_dist->data_ptr(), out_idx->data_ptr(), + shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + ); + } else { + edt_kernel_middle<<>>( + in_dist->data_ptr(), in_idx->data_ptr(), + out_dist->data_ptr(), out_idx->data_ptr(), + shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + ); } - - #define DISPATCH_GLOBAL(IS_FINAL) \ - switch(sample_ndim) { \ - case 1: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 2: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 3: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 4: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 5: 
edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 6: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - default: /* Fallback */ \ - edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - } - - if (is_final_pass) { DISPATCH_GLOBAL(true); } - else { DISPATCH_GLOBAL(false); } } - - // --- Step C: Transpose Back --- - current_dist = dist_out.transpose(d, ndim - 1); - current_idx = idx_out.transpose(d, ndim - 1); + + if (sample_ndim % 2 != 0) { + distance.copy_(buffer_dist); + index.copy_(buffer_idx); + } } - - return std::make_tuple(current_dist, current_idx); -} - + + if (had_no_batch_dim) { + return std::make_tuple(distance.squeeze(0), index.squeeze(0)); + } + + return std::make_tuple(distance, index); +} \ No newline at end of file From a6fafaa41678abcdaad1fceaa523cc20c25d3914 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Mon, 8 Dec 2025 20:23:07 +0800 Subject: [PATCH 17/56] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E8=B0=83=E6=95=B4+=E9=80=9F=E5=BA=A6=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_distance_transform.py | 20 +- torchmorph/csrc/distance_transform_kernel.cu | 636 ++++++------------- 2 files changed, 204 insertions(+), 452 deletions(-) diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 3a11166..476ffca 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -6,20 +6,32 @@ # 辅助函数:调用 SciPy 并处理格式 def 
batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, np.ndarray]: - is_single_sample = batch_numpy.ndim <= 2 - if is_single_sample: - batch_numpy = batch_numpy[np.newaxis, ...] + + + input_is_1d_batch = (batch_numpy.ndim == 2) + input_is_single_sample_no_batch = (batch_numpy.ndim == 1) + + if input_is_single_sample_no_batch: + batch_numpy = batch_numpy[np.newaxis, ...] # (L) -> (1, L) + + dist_results, indices_results = [], [] for sample in batch_numpy: dist, indices = scipy_edt(sample, return_indices=True, return_distances=True) dist_results.append(dist) indices_results.append(indices) + output_dist = np.stack(dist_results, axis=0) output_indices = np.stack(indices_results, axis=0) + + # indices shape fix: (N, ndim_sample, ...) -> (N, ..., ndim_sample) + # 对于 1D: (N, 1, L) -> (N, L, 1) output_indices = np.moveaxis(output_indices, 1, -1) - if is_single_sample: + + if input_is_single_sample_no_batch: output_dist = output_dist.squeeze(0) output_indices = output_indices.squeeze(0) + return output_dist, output_indices # 用例定义 diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 4ec4647..c3ded1e 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -5,15 +5,21 @@ #include #include -// 优化策略:用4个独立的内核函数替代模板,完全消除分支 - -// 内核1: 第一个pass且是唯一pass (1D情况) -__global__ void edt_kernel_first_final( - const float* in_data, - float* out_dist, - int32_t* out_idx, - const int64_t* shape, - const int64_t* strides, +#define MAX_DIMS 10 +#define INF_VAL 1e8f // 使用 1e8 保证 float32 精度下的数值稳定性 + +__device__ __forceinline__ float sqr(float x) { return x * x; } + +// ------------------------------------------------------------------ +// 内核 1: 初始 Pass (First Pass) +// ------------------------------------------------------------------ +template +__global__ void edt_kernel_first_pass( + const float* __restrict__ in_data, + float* __restrict__ out_dist, + int32_t* 
__restrict__ out_idx, + const int64_t* __restrict__ shape, + const int64_t* __restrict__ strides, int32_t ndim, int32_t process_dim_sample, int64_t total_slices, @@ -24,244 +30,117 @@ __global__ void edt_kernel_first_final( int64_t batch_idx = slice_idx / num_slices_per_sample; int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t batch_offset = batch_idx * strides[0]; - int64_t sample_base_offset = 0; - int64_t temp_idx = slice_idx_in_sample; - const int sample_ndim = ndim - 1; - - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; - temp_idx /= size_of_dim; - } - } - - const int64_t process_dim_actual = process_dim_sample + 1; - const int64_t N = shape[process_dim_actual]; - const int64_t stride = strides[process_dim_actual]; - const int64_t base_offset = batch_offset + sample_base_offset; - - if (N == 0) return; - - extern __shared__ char s_buffer[]; - float* f = (float*)s_buffer; - int* v = (int*)(f + N); - float* z = (float*)((char*)v + (N + 1) * sizeof(int)); - int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); - - // 加载数据 - 第一个pass - for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - int64_t global_offset = base_offset + i * stride; - float val = __ldg(&in_data[global_offset]); - int32_t* shared_idx_ptr = s_idx + i * sample_ndim; - - if (val == 0.0f) { - f[i] = 0.0f; - int64_t temp_coord = slice_idx_in_sample; - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - shared_idx_ptr[d] = temp_coord % size_of_dim; - temp_coord /= size_of_dim; - } else { - shared_idx_ptr[d] = 0; - } - } - shared_idx_ptr[process_dim_sample] = i; - } else { - f[i] = 1e20f; - for (int d = 0; d < sample_ndim; ++d) shared_idx_ptr[d] = -1; - } - } - __syncthreads(); - - // 构建包络 - 
if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; - z[0] = -1e20f; - z[1] = 1e20f; - - for (int q = 1; q < N; q++) { - float fq = f[q]; - int q_sq = q * q; - - while (k >= 0) { - int p = v[k]; - float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); - if (s > z[k]) { - k++; - v[k] = q; - z[k] = s; - z[k + 1] = 1e20f; - break; - } - k--; - if (k < 0) { - k = 0; - v[0] = q; - z[0] = -1e20f; - z[1] = 1e20f; - break; - } - } - } - } - __syncthreads(); - - // 计算距离 - 最后一个pass,直接开方 - for (int q = threadIdx.x; q < N; q += blockDim.x) { - int k = 0; - float q_float = (float)q; - while (z[k + 1] < q_float) k++; - - int p = v[k]; - int64_t global_offset = base_offset + q * stride; - float dist_sq = (float)(q - p) * (q - p) + f[p]; - - out_dist[global_offset] = sqrtf(dist_sq); // 直接开方 - - int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; - const int32_t* src_idx_ptr = s_idx + p * sample_ndim; - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; - } - } -} + int64_t current_offset = batch_idx * strides[0]; -// 内核2: 第一个pass但不是最后 -__global__ void edt_kernel_first_only( - const float* in_data, - float* out_dist, - int32_t* out_idx, - const int64_t* shape, - const int64_t* strides, - int32_t ndim, - int32_t process_dim_sample, - int64_t total_slices, - int64_t num_slices_per_sample -) { - int64_t slice_idx = blockIdx.x; - if (slice_idx >= total_slices) return; - - int64_t batch_idx = slice_idx / num_slices_per_sample; - int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t batch_offset = batch_idx * strides[0]; - int64_t sample_base_offset = 0; + // 预计算基准坐标 (除了 process_dim 以外的维度坐标) + int32_t base_coords[MAX_DIMS]; int64_t temp_idx = slice_idx_in_sample; const int sample_ndim = ndim - 1; + // 根据 slice_idx 反解坐标 for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - sample_base_offset += (temp_idx % size_of_dim) * strides[d + 
1]; - temp_idx /= size_of_dim; + if (d == process_dim_sample) { + base_coords[d] = 0; // 占位 + continue; } + int64_t size_of_dim = shape[d + 1]; + int32_t coord = (int32_t)(temp_idx % size_of_dim); + base_coords[d] = coord; + current_offset += coord * strides[d + 1]; + temp_idx /= size_of_dim; } - + const int64_t process_dim_actual = process_dim_sample + 1; const int64_t N = shape[process_dim_actual]; const int64_t stride = strides[process_dim_actual]; - const int64_t base_offset = batch_offset + sample_base_offset; - + if (N == 0) return; extern __shared__ char s_buffer[]; float* f = (float*)s_buffer; int* v = (int*)(f + N); float* z = (float*)((char*)v + (N + 1) * sizeof(int)); - int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); + // Phase 1: 加载数据 for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - int64_t global_offset = base_offset + i * stride; - float val = __ldg(&in_data[global_offset]); - int32_t* shared_idx_ptr = s_idx + i * sample_ndim; - - if (val == 0.0f) { - f[i] = 0.0f; - int64_t temp_coord = slice_idx_in_sample; - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - shared_idx_ptr[d] = temp_coord % size_of_dim; - temp_coord /= size_of_dim; - } else { - shared_idx_ptr[d] = 0; - } - } - shared_idx_ptr[process_dim_sample] = i; - } else { - f[i] = 1e20f; - for (int d = 0; d < sample_ndim; ++d) shared_idx_ptr[d] = -1; - } + float val = __ldg(&in_data[current_offset + i * stride]); + f[i] = (val == 0.0f) ? 
0.0f : INF_VAL; } __syncthreads(); + // Phase 2: 构建包络 if (threadIdx.x == 0) { int k = 0; v[0] = 0; - z[0] = -1e20f; - z[1] = 1e20f; + z[0] = -INF_VAL; + z[1] = INF_VAL; for (int q = 1; q < N; q++) { + // 显式跳过背景点,避免 INF 污染计算 + if (f[q] >= (INF_VAL * 0.9f)) continue; + float fq = f[q]; - int q_sq = q * q; - - while (k >= 0) { - int p = v[k]; - float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); - if (s > z[k]) { - k++; - v[k] = q; - z[k] = s; - z[k + 1] = 1e20f; - break; - } - k--; - if (k < 0) { - k = 0; - v[0] = q; - z[0] = -1e20f; - z[1] = 1e20f; + int k_curr = k; + while (k_curr >= 0) { + int p = v[k_curr]; + + // --- 核心修复:数值稳定的交点公式 --- + // 先计算差值再相加,防止大数吞小数 + float diff_f = fq - f[p]; + float diff_sq = (float)q*(float)q - (float)p*(float)p; + float s = (diff_f + diff_sq) / (2.0f * (float)(q - p)); + + if (s > z[k_curr]) { + k_curr++; + v[k_curr] = q; + z[k_curr] = s; + z[k_curr + 1] = INF_VAL; + k = k_curr; break; } + k_curr--; + } + if (k_curr < 0) { + k = 0; v[0] = q; z[0] = -INF_VAL; z[1] = INF_VAL; } } } __syncthreads(); + // Phase 3: 计算距离 for (int q = threadIdx.x; q < N; q += blockDim.x) { int k = 0; float q_float = (float)q; while (z[k + 1] < q_float) k++; - int p = v[k]; - int64_t global_offset = base_offset + q * stride; - float dist_sq = (float)(q - p) * (q - p) + f[p]; + int p = v[k]; + + int64_t global_idx = current_offset + q * stride; + float dist_sq = sqr(q_float - (float)p) + f[p]; - out_dist[global_offset] = dist_sq; // 不开方 + out_dist[global_idx] = IsFinal ? sqrtf(dist_sq) : dist_sq; - int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; - const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + // 写入索引 + int32_t* out_idx_ptr = out_idx + global_idx * sample_ndim; + #pragma unroll for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; + // 只有当前处理的维度写入 p,其他维度写入基准坐标 + out_idx_ptr[d] = (d == process_dim_sample) ? 
p : base_coords[d]; } } } -// 内核3: 中间pass -__global__ void edt_kernel_middle( - const float* in_dist, - const int32_t* in_idx, - float* out_dist, - int32_t* out_idx, - const int64_t* shape, - const int64_t* strides, +// ------------------------------------------------------------------ +// 内核 2: 后续 Pass (Subsequent Pass) +// ------------------------------------------------------------------ +template +__global__ void edt_kernel_subsequent_pass( + const float* __restrict__ in_dist, + const int32_t* __restrict__ in_idx, + float* __restrict__ out_dist, + int32_t* __restrict__ out_idx, + const int64_t* __restrict__ shape, + const int64_t* __restrict__ strides, int32_t ndim, int32_t process_dim_sample, int64_t total_slices, @@ -272,24 +151,21 @@ __global__ void edt_kernel_middle( int64_t batch_idx = slice_idx / num_slices_per_sample; int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t batch_offset = batch_idx * strides[0]; - int64_t sample_base_offset = 0; + int64_t current_offset = batch_idx * strides[0]; + int64_t temp_idx = slice_idx_in_sample; const int sample_ndim = ndim - 1; for (int32_t d = sample_ndim - 1; d >= 0; --d) { if (d == process_dim_sample) continue; int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; - temp_idx /= size_of_dim; - } + current_offset += (temp_idx % size_of_dim) * strides[d + 1]; + temp_idx /= size_of_dim; } const int64_t process_dim_actual = process_dim_sample + 1; const int64_t N = shape[process_dim_actual]; const int64_t stride = strides[process_dim_actual]; - const int64_t base_offset = batch_offset + sample_base_offset; if (N == 0) return; @@ -297,49 +173,33 @@ __global__ void edt_kernel_middle( float* f = (float*)s_buffer; int* v = (int*)(f + N); float* z = (float*)((char*)v + (N + 1) * sizeof(int)); - int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - int64_t 
global_offset = base_offset + i * stride; - f[i] = __ldg(&in_dist[global_offset]); - - const int32_t* global_idx_ptr = in_idx + global_offset * sample_ndim; - int32_t* shared_idx_ptr = s_idx + i * sample_ndim; - for (int d = 0; d < sample_ndim; ++d) { - shared_idx_ptr[d] = __ldg(&global_idx_ptr[d]); - } + f[i] = __ldg(&in_dist[current_offset + i * stride]); } __syncthreads(); if (threadIdx.x == 0) { int k = 0; - v[0] = 0; - z[0] = -1e20f; - z[1] = 1e20f; + v[0] = 0; z[0] = -INF_VAL; z[1] = INF_VAL; for (int q = 1; q < N; q++) { + if (f[q] >= (INF_VAL * 0.9f)) continue; + float fq = f[q]; - int q_sq = q * q; - - while (k >= 0) { - int p = v[k]; - float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); - if (s > z[k]) { - k++; - v[k] = q; - z[k] = s; - z[k + 1] = 1e20f; - break; - } - k--; - if (k < 0) { - k = 0; - v[0] = q; - z[0] = -1e20f; - z[1] = 1e20f; - break; + int k_curr = k; + while (k_curr >= 0) { + int p = v[k_curr]; + float diff_f = fq - f[p]; + float diff_sq = (float)q*(float)q - (float)p*(float)p; + float s = (diff_f + diff_sq) / (2.0f * (float)(q - p)); + if (s > z[k_curr]) { + k_curr++; v[k_curr] = q; z[k_curr] = s; z[k_curr + 1] = INF_VAL; + k = k_curr; break; } + k_curr--; } + if (k_curr < 0) { k = 0; v[0] = q; z[0] = -INF_VAL; z[1] = INF_VAL; } } } __syncthreads(); @@ -350,260 +210,140 @@ __global__ void edt_kernel_middle( while (z[k + 1] < q_float) k++; int p = v[k]; - int64_t global_offset = base_offset + q * stride; - float dist_sq = (float)(q - p) * (q - p) + f[p]; - - out_dist[global_offset] = dist_sq; // 不开方 - - int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; - const int32_t* src_idx_ptr = s_idx + p * sample_ndim; - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; - } - } -} - -// 内核4: 最后一个pass -__global__ void edt_kernel_final( - const float* in_dist, - const int32_t* in_idx, - float* out_dist, - int32_t* out_idx, - const int64_t* shape, - const int64_t* strides, - int32_t ndim, - int32_t 
process_dim_sample, - int64_t total_slices, - int64_t num_slices_per_sample -) { - int64_t slice_idx = blockIdx.x; - if (slice_idx >= total_slices) return; - - int64_t batch_idx = slice_idx / num_slices_per_sample; - int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t batch_offset = batch_idx * strides[0]; - int64_t sample_base_offset = 0; - int64_t temp_idx = slice_idx_in_sample; - const int sample_ndim = ndim - 1; - - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t size_of_dim = shape[d + 1]; - if (size_of_dim > 0) { - sample_base_offset += (temp_idx % size_of_dim) * strides[d + 1]; - temp_idx /= size_of_dim; - } - } - - const int64_t process_dim_actual = process_dim_sample + 1; - const int64_t N = shape[process_dim_actual]; - const int64_t stride = strides[process_dim_actual]; - const int64_t base_offset = batch_offset + sample_base_offset; - - if (N == 0) return; - - extern __shared__ char s_buffer[]; - float* f = (float*)s_buffer; - int* v = (int*)(f + N); - float* z = (float*)((char*)v + (N + 1) * sizeof(int)); - int32_t* s_idx = (int32_t*)((char*)z + (N + 2) * sizeof(float)); - - for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - int64_t global_offset = base_offset + i * stride; - f[i] = __ldg(&in_dist[global_offset]); - const int32_t* global_idx_ptr = in_idx + global_offset * sample_ndim; - int32_t* shared_idx_ptr = s_idx + i * sample_ndim; - for (int d = 0; d < sample_ndim; ++d) { - shared_idx_ptr[d] = __ldg(&global_idx_ptr[d]); - } - } - __syncthreads(); - - if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; - z[0] = -1e20f; - z[1] = 1e20f; + int64_t q_global_offset = current_offset + q * stride; + int64_t p_global_offset = current_offset + p * stride; - for (int q = 1; q < N; q++) { - float fq = f[q]; - int q_sq = q * q; - - while (k >= 0) { - int p = v[k]; - float s = ((fq + q_sq) - (f[p] + p * p)) / (2.0f * (q - p)); - if (s > z[k]) { - k++; - v[k] = q; - z[k] = s; - z[k + 
1] = 1e20f; - break; - } - k--; - if (k < 0) { - k = 0; - v[0] = q; - z[0] = -1e20f; - z[1] = 1e20f; - break; - } - } - } - } - __syncthreads(); + float dist_sq = sqr(q_float - (float)p) + f[p]; + out_dist[q_global_offset] = IsFinal ? sqrtf(dist_sq) : dist_sq; - for (int q = threadIdx.x; q < N; q += blockDim.x) { - int k = 0; - float q_float = (float)q; - while (z[k + 1] < q_float) k++; - - int p = v[k]; - int64_t global_offset = base_offset + q * stride; - float dist_sq = (float)(q - p) * (q - p) + f[p]; - - out_dist[global_offset] = sqrtf(dist_sq); // 最后开方 + // 索引直接从 Global Memory 拷贝,无需 Shared Memory + const int32_t* src_idx_ptr = in_idx + p_global_offset * sample_ndim; + int32_t* out_idx_ptr = out_idx + q_global_offset * sample_ndim; - int32_t* out_idx_ptr = out_idx + global_offset * sample_ndim; - const int32_t* src_idx_ptr = s_idx + p * sample_ndim; + #pragma unroll for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; + out_idx_ptr[d] = src_idx_ptr[d]; } } } -// Host函数 +// ------------------------------------------------------------------ +// Host 函数 +// ------------------------------------------------------------------ std::tuple distance_transform_cuda(torch::Tensor input) { - TORCH_CHECK(input.is_cuda(), "Input must be on a CUDA device."); - TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be a float tensor."); + TORCH_CHECK(input.is_cuda(), "Input must be on CUDA device."); + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32."); + input = input.contiguous(); - bool had_no_batch_dim = (input.dim() <= 2); - if (had_no_batch_dim) { input = input.unsqueeze(0); } + // 自动处理 1D 输入:(L) -> (1, L) + // 自动处理 1D 批处理:(N, L) 保持不变 (视为 N 个 1D 样本) + bool had_no_batch_dim = (input.dim() == 1); + if (had_no_batch_dim) { + input = input.unsqueeze(0); + } const auto ndim = input.dim(); const auto sample_ndim = ndim - 1; const auto batch_size = input.size(0); - auto shape = input.sizes().vec(); auto strides_vec = 
input.strides().vec(); if (input.numel() == 0) { - auto distance = torch::empty_like(input); auto index_shape = shape; index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); - auto index = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - if (had_no_batch_dim) return std::make_tuple(distance.squeeze(0), index.squeeze(0)); - return std::make_tuple(distance, index); + return std::make_tuple(torch::empty_like(input), + torch::empty(index_shape, input.options().dtype(torch::kInt32))); } - + auto distance = torch::empty_like(input); - auto index_options = input.options().dtype(torch::kInt32); auto index_shape = shape; - index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); - auto index = torch::empty(index_shape, index_options); - - if (torch::all(input != 0).item()) { - distance.fill_(std::numeric_limits::infinity()); - index.fill_(-1); - if (had_no_batch_dim) { - return std::make_tuple(distance.squeeze(0), index.squeeze(0)); - } - return std::make_tuple(distance, index); - } + index_shape.push_back(sample_ndim); + auto index = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - auto shape_tensor = torch::tensor(shape, - torch::TensorOptions().dtype(torch::kInt64).device(input.device())); - auto strides_tensor = torch::tensor(strides_vec, - torch::TensorOptions().dtype(torch::kInt64).device(input.device())); - - const int64_t* shape_gpu = shape_tensor.data_ptr(); - const int64_t* strides_gpu = strides_tensor.data_ptr(); + auto buffer_dist = torch::empty_like(distance); + auto buffer_idx = torch::empty_like(index); + + auto shape_tensor = torch::tensor(shape, torch::TensorOptions().dtype(torch::kInt64).device(input.device())); + auto strides_tensor = torch::tensor(strides_vec, torch::TensorOptions().dtype(torch::kInt64).device(input.device())); std::vector> dim_order_pairs; - for (int32_t d_sample = 0; d_sample < sample_ndim; ++d_sample) { - dim_order_pairs.push_back({strides_vec[d_sample + 1], d_sample}); + for (int32_t d = 0; d < 
sample_ndim; ++d) { + dim_order_pairs.push_back({strides_vec[d + 1], d}); } std::sort(dim_order_pairs.rbegin(), dim_order_pairs.rend()); - if (sample_ndim == 0) { - int64_t total_slices = batch_size; - int64_t slice_len = (shape.size() > 1) ? shape[1] : 0; - int threads = std::min((int64_t)256, slice_len); - size_t smem = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + - (slice_len + 2) * sizeof(float) + slice_len * 1 * sizeof(int32_t); - - edt_kernel_first_final<<>>( - input.data_ptr(), - distance.data_ptr(), index.data_ptr(), - shape_gpu, strides_gpu, ndim, 0, total_slices, 1 - ); - } else { - auto buffer_dist = torch::empty_like(distance); - auto buffer_idx = torch::empty_like(index); - - for (int pass = 0; pass < sample_ndim; ++pass) { - int32_t d_sample = dim_order_pairs[pass].second; - bool is_first = (pass == 0); - bool is_final = (pass == sample_ndim - 1); - - torch::Tensor *in_dist, *in_idx, *out_dist, *out_idx; - - if (pass % 2 == 0) { - in_dist = &distance; in_idx = &index; - out_dist = &buffer_dist; out_idx = &buffer_idx; + for (int pass = 0; pass < sample_ndim; ++pass) { + int32_t d_sample = dim_order_pairs[pass].second; + bool is_first_pass = (pass == 0); + bool is_final_pass = (pass == sample_ndim - 1); + + torch::Tensor *in_d, *in_i, *out_d, *out_i; + + if (is_first_pass) { + in_d = nullptr; in_i = nullptr; + out_d = is_final_pass ? &distance : &buffer_dist; + out_i = is_final_pass ? 
&index : &buffer_idx; + } else { + if (pass % 2 != 0) { + in_d = &buffer_dist; in_i = &buffer_idx; + out_d = &distance; out_i = &index; } else { - in_dist = &buffer_dist; in_idx = &buffer_idx; - out_dist = &distance; out_idx = &index; + in_d = &distance; in_i = &index; + out_d = &buffer_dist; out_i = &buffer_idx; } - - int64_t num_slices_per_sample = 1; - for(int i = 0; i < sample_ndim; ++i) { - if (i != d_sample) num_slices_per_sample *= shape[i + 1]; + if (is_final_pass) { + out_d = &distance; out_i = &index; } - int64_t total_slices = batch_size * num_slices_per_sample; - int64_t slice_len = shape[d_sample + 1]; - - int threads = std::min((int64_t)256, slice_len); - size_t smem = slice_len * sizeof(float) + (slice_len + 1) * sizeof(int) + - (slice_len + 2) * sizeof(float) + slice_len * sample_ndim * sizeof(int32_t); - - if (is_first && is_final) { - edt_kernel_first_final<<>>( - input.data_ptr(), - out_dist->data_ptr(), out_idx->data_ptr(), - shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + } + + int64_t num_slices_per_sample = 1; + for(int i = 0; i < sample_ndim; ++i) { + if (i != d_sample) num_slices_per_sample *= shape[i + 1]; + } + int64_t total_slices = batch_size * num_slices_per_sample; + int64_t slice_len = shape[d_sample + 1]; + + int threads = std::min((int64_t)256, slice_len); + size_t smem = slice_len * (sizeof(float) + sizeof(int)) + (slice_len + 1) * sizeof(float); + + if (is_first_pass) { + const float* in_ptr = input.data_ptr(); + if (is_final_pass) { + edt_kernel_first_pass<<>>( + in_ptr, out_d->data_ptr(), out_i->data_ptr(), + shape_tensor.data_ptr(), strides_tensor.data_ptr(), + ndim, d_sample, total_slices, num_slices_per_sample ); - } else if (is_first) { - edt_kernel_first_only<<>>( - input.data_ptr(), - out_dist->data_ptr(), out_idx->data_ptr(), - shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + } else { + edt_kernel_first_pass<<>>( + in_ptr, out_d->data_ptr(), out_i->data_ptr(), + 
shape_tensor.data_ptr(), strides_tensor.data_ptr(), + ndim, d_sample, total_slices, num_slices_per_sample ); - } else if (is_final) { - edt_kernel_final<<>>( - in_dist->data_ptr(), in_idx->data_ptr(), - out_dist->data_ptr(), out_idx->data_ptr(), - shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + } + } else { + if (is_final_pass) { + edt_kernel_subsequent_pass<<>>( + in_d->data_ptr(), in_i->data_ptr(), + out_d->data_ptr(), out_i->data_ptr(), + shape_tensor.data_ptr(), strides_tensor.data_ptr(), + ndim, d_sample, total_slices, num_slices_per_sample ); } else { - edt_kernel_middle<<>>( - in_dist->data_ptr(), in_idx->data_ptr(), - out_dist->data_ptr(), out_idx->data_ptr(), - shape_gpu, strides_gpu, ndim, d_sample, total_slices, num_slices_per_sample + edt_kernel_subsequent_pass<<>>( + in_d->data_ptr(), in_i->data_ptr(), + out_d->data_ptr(), out_i->data_ptr(), + shape_tensor.data_ptr(), strides_tensor.data_ptr(), + ndim, d_sample, total_slices, num_slices_per_sample ); } } - - if (sample_ndim % 2 != 0) { - distance.copy_(buffer_dist); - index.copy_(buffer_idx); - } } if (had_no_batch_dim) { return std::make_tuple(distance.squeeze(0), index.squeeze(0)); } - return std::make_tuple(distance, index); } \ No newline at end of file From 54464a87aed9492434c125e599f41bea87866aef Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Mon, 8 Dec 2025 20:26:03 +0800 Subject: [PATCH 18/56] =?UTF-8?q?=E9=87=87=E7=94=A8JFA=E7=AE=97=E6=B3=95?= =?UTF-8?q?=E6=8F=90=E9=AB=98=E5=B9=B6=E8=A1=8C=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torchmorph/csrc/distance_transform_kernel.cu | 290 ++++++++++++------- 1 file changed, 184 insertions(+), 106 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index c3ded1e..64c69fb 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -4,14 +4,126 @@ 
#include #include #include +#include +#include #define MAX_DIMS 10 -#define INF_VAL 1e8f // 使用 1e8 保证 float32 精度下的数值稳定性 +#define INF_VAL 1e8f +// 保证 blockDim 足以覆盖大多数常见维度大小,或者配合 Loop 处理 +#define MAX_THREADS 1024 __device__ __forceinline__ float sqr(float x) { return x * x; } +// 计算从像素 q 到源点 p 的距离代价 (考虑了 p 点本身的数值权重 val_p) +__device__ __forceinline__ float compute_cost(int q, int p, float val_p) { + if (p < 0) return INF_VAL; // 无效点 + // dist = (q - p)^2 + f[p] + return sqr((float)q - (float)p) + val_p; +} + // ------------------------------------------------------------------ -// 内核 1: 初始 Pass (First Pass) +// 核心逻辑: 1D Jump Flooding (JFA) / Doubling Algorithm +// 全并行求解最近点索引,替代串行的抛物线构建 +// ------------------------------------------------------------------ +__device__ void compute_1d_jfa( + int N, + float* __restrict__ s_vals, // 输入数值 (dist^2) + int* __restrict__ s_idx_curr, // ping-pong buffer 1 + int* __restrict__ s_idx_next // ping-pong buffer 2 +) { + int tid = threadIdx.x; + + // --- 1. 初始化 --- + // 每个线程负责一个或多个像素的初始化 + for (int i = tid; i < N; i += blockDim.x) { + // 如果当前位置的值很大,说明它是背景,没有初始源点 (-1) + // 否则源点就是它自己 (i) + if (s_vals[i] >= INF_VAL * 0.9f) { + s_idx_curr[i] = -1; + } else { + s_idx_curr[i] = i; + } + } + __syncthreads(); + + // --- 2. 迭代传播 (Step = 1, 2, 4, 8...) 
--- + // 类似于双调排序或倍增法 + int* idx_in = s_idx_curr; + int* idx_out = s_idx_next; + + // 只要步长小于 N,就需要传播 + // 对于 N=1024, 只需要 10 次迭代,每次所有线程全并行 + for (int step = 1; step < N; step *= 2) { + + for (int i = tid; i < N; i += blockDim.x) { + int my_best_p = idx_in[i]; + float min_cost = INF_VAL; + + // 获取当前最优点的代价 + if (my_best_p != -1) { + min_cost = compute_cost(i, my_best_p, s_vals[my_best_p]); + } + + // --- 检查左边邻居 (i - step) --- + int left = i - step; + if (left >= 0) { + int left_p = idx_in[left]; // 邻居推荐的源点 + if (left_p != -1) { + float c = compute_cost(i, left_p, s_vals[left_p]); + if (c < min_cost) { + min_cost = c; + my_best_p = left_p; + } + } + } + + // --- 检查右边邻居 (i + step) --- + int right = i + step; + if (right < N) { + int right_p = idx_in[right]; // 邻居推荐的源点 + if (right_p != -1) { + float c = compute_cost(i, right_p, s_vals[right_p]); + if (c < min_cost) { + min_cost = c; + my_best_p = right_p; + } + } + } + + // 写入下一轮 Buffer + idx_out[i] = my_best_p; + } + + // 交换 Buffer 指针 + int* temp = idx_in; + idx_in = idx_out; + idx_out = temp; + + __syncthreads(); + } + + // --- 3. 
结果写回 --- + // 如果最后结果在 s_idx_next 里 (循环次数是奇数),需要拷回 s_idx_curr + // 或者直接让调用者知道结果在哪。 + // 为了简单,我们统一把结果放在 s_idx_curr 指向的内存里。 + // 注意:idx_in 现在指向的是包含最新结果的 buffer。 + + // 如果 idx_in 已经指向 s_idx_curr,那不用动。 + // 如果 idx_in 指向 s_idx_next,说明最新结果在 s_idx_next,我们需要把它拷贝回 s_idx_curr + // 或者是调整后续代码读取的指针。 + + // 这里采用简单拷贝回 s_idx_curr 的方式,确保后续逻辑一致 + if (idx_in != s_idx_curr) { + for (int i = tid; i < N; i += blockDim.x) { + s_idx_curr[i] = s_idx_next[i]; + } + __syncthreads(); + } +} + + +// ------------------------------------------------------------------ +// 内核 1: 初始 Pass (JFA Version) // ------------------------------------------------------------------ template __global__ void edt_kernel_first_pass( @@ -32,15 +144,13 @@ __global__ void edt_kernel_first_pass( int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; int64_t current_offset = batch_idx * strides[0]; - // 预计算基准坐标 (除了 process_dim 以外的维度坐标) int32_t base_coords[MAX_DIMS]; int64_t temp_idx = slice_idx_in_sample; const int sample_ndim = ndim - 1; - // 根据 slice_idx 反解坐标 for (int32_t d = sample_ndim - 1; d >= 0; --d) { if (d == process_dim_sample) { - base_coords[d] = 0; // 占位 + base_coords[d] = 0; continue; } int64_t size_of_dim = shape[d + 1]; @@ -56,82 +166,58 @@ __global__ void edt_kernel_first_pass( if (N == 0) return; + // Shared Memory Layout: + // f: float[N] (Values) + // idx1: int[N] (Buffer 1) + // idx2: int[N] (Buffer 2) extern __shared__ char s_buffer[]; float* f = (float*)s_buffer; - int* v = (int*)(f + N); - float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int* idx1 = (int*)(f + N); + int* idx2 = (int*)(idx1 + N); - // Phase 1: 加载数据 + // Phase 1: 并行加载数据 for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { float val = __ldg(&in_data[current_offset + i * stride]); f[i] = (val == 0.0f) ? 
0.0f : INF_VAL; } __syncthreads(); - // Phase 2: 构建包络 - if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; - z[0] = -INF_VAL; - z[1] = INF_VAL; - - for (int q = 1; q < N; q++) { - // 显式跳过背景点,避免 INF 污染计算 - if (f[q] >= (INF_VAL * 0.9f)) continue; - - float fq = f[q]; - int k_curr = k; - while (k_curr >= 0) { - int p = v[k_curr]; - - // --- 核心修复:数值稳定的交点公式 --- - // 先计算差值再相加,防止大数吞小数 - float diff_f = fq - f[p]; - float diff_sq = (float)q*(float)q - (float)p*(float)p; - float s = (diff_f + diff_sq) / (2.0f * (float)(q - p)); - - if (s > z[k_curr]) { - k_curr++; - v[k_curr] = q; - z[k_curr] = s; - z[k_curr + 1] = INF_VAL; - k = k_curr; - break; - } - k_curr--; - } - if (k_curr < 0) { - k = 0; v[0] = q; z[0] = -INF_VAL; z[1] = INF_VAL; - } - } - } - __syncthreads(); + // Phase 2: 并行 JFA 计算 + compute_1d_jfa(N, f, idx1, idx2); + // 结果现在存储在 idx1 中 - // Phase 3: 计算距离 + // Phase 3: 并行写回 for (int q = threadIdx.x; q < N; q += blockDim.x) { - int k = 0; - float q_float = (float)q; - while (z[k + 1] < q_float) k++; - - int p = v[k]; - + int p = idx1[q]; + float dist_val; + int p_idx; + + if (p != -1) { + // JFA 得到的是最近源点的索引 p + // 距离 = (q-p)^2 + f[p] + // 注意:在 First Pass 中,f[p] 要么是 0 要么是 INF。如果 p != -1,f[p] 必为 0。 + // 但为了通用性,还是加上 f[p] + float dist_sq = sqr((float)q - (float)p) + f[p]; + dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; + p_idx = p; + } else { + dist_val = IsFinal ? INF_VAL : sqr(INF_VAL); + p_idx = 0; + } + int64_t global_idx = current_offset + q * stride; - float dist_sq = sqr(q_float - (float)p) + f[p]; - - out_dist[global_idx] = IsFinal ? sqrtf(dist_sq) : dist_sq; + out_dist[global_idx] = dist_val; - // 写入索引 int32_t* out_idx_ptr = out_idx + global_idx * sample_ndim; #pragma unroll for (int d = 0; d < sample_ndim; ++d) { - // 只有当前处理的维度写入 p,其他维度写入基准坐标 - out_idx_ptr[d] = (d == process_dim_sample) ? p : base_coords[d]; + out_idx_ptr[d] = (d == process_dim_sample) ? 
p_idx : base_coords[d]; } } } // ------------------------------------------------------------------ -// 内核 2: 后续 Pass (Subsequent Pass) +// 内核 2: 后续 Pass (JFA Version) // ------------------------------------------------------------------ template __global__ void edt_kernel_subsequent_pass( @@ -169,61 +255,48 @@ __global__ void edt_kernel_subsequent_pass( if (N == 0) return; + // Shared Memory Layout 同上 extern __shared__ char s_buffer[]; float* f = (float*)s_buffer; - int* v = (int*)(f + N); - float* z = (float*)((char*)v + (N + 1) * sizeof(int)); + int* idx1 = (int*)(f + N); + int* idx2 = (int*)(idx1 + N); + // Phase 1: 加载 for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { f[i] = __ldg(&in_dist[current_offset + i * stride]); } __syncthreads(); - if (threadIdx.x == 0) { - int k = 0; - v[0] = 0; z[0] = -INF_VAL; z[1] = INF_VAL; - - for (int q = 1; q < N; q++) { - if (f[q] >= (INF_VAL * 0.9f)) continue; - - float fq = f[q]; - int k_curr = k; - while (k_curr >= 0) { - int p = v[k_curr]; - float diff_f = fq - f[p]; - float diff_sq = (float)q*(float)q - (float)p*(float)p; - float s = (diff_f + diff_sq) / (2.0f * (float)(q - p)); - if (s > z[k_curr]) { - k_curr++; v[k_curr] = q; z[k_curr] = s; z[k_curr + 1] = INF_VAL; - k = k_curr; break; - } - k_curr--; - } - if (k_curr < 0) { k = 0; v[0] = q; z[0] = -INF_VAL; z[1] = INF_VAL; } - } - } - __syncthreads(); + // Phase 2: 并行 JFA 计算 + // 这里的 f[i] 是上一轮计算出的距离平方,作为权重 + compute_1d_jfa(N, f, idx1, idx2); + // Phase 3: 写回 for (int q = threadIdx.x; q < N; q += blockDim.x) { - int k = 0; - float q_float = (float)q; - while (z[k + 1] < q_float) k++; - - int p = v[k]; - + int p = idx1[q]; // 最近源点在当前行的索引 + float dist_val; + + if (p != -1) { + float dist_sq = sqr((float)q - (float)p) + f[p]; + dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; + } else { + dist_val = IsFinal ? 
INF_VAL : sqr(INF_VAL); + p = 0; // fallback + } + int64_t q_global_offset = current_offset + q * stride; - int64_t p_global_offset = current_offset + p * stride; - - float dist_sq = sqr(q_float - (float)p) + f[p]; - out_dist[q_global_offset] = IsFinal ? sqrtf(dist_sq) : dist_sq; + out_dist[q_global_offset] = dist_val; - // 索引直接从 Global Memory 拷贝,无需 Shared Memory - const int32_t* src_idx_ptr = in_idx + p_global_offset * sample_ndim; - int32_t* out_idx_ptr = out_idx + q_global_offset * sample_ndim; + // 索引处理 + if (p != -1) { + int64_t p_global_offset = current_offset + p * stride; + const int32_t* src_idx_ptr = in_idx + p_global_offset * sample_ndim; + int32_t* out_idx_ptr = out_idx + q_global_offset * sample_ndim; - #pragma unroll - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; + #pragma unroll + for (int d = 0; d < sample_ndim; ++d) { + out_idx_ptr[d] = src_idx_ptr[d]; + } } } } @@ -237,8 +310,6 @@ std::tuple distance_transform_cuda(torch::Tensor i input = input.contiguous(); - // 自动处理 1D 输入:(L) -> (1, L) - // 自动处理 1D 批处理:(N, L) 保持不变 (视为 N 个 1D 样本) bool had_no_batch_dim = (input.dim() == 1); if (had_no_batch_dim) { input = input.unsqueeze(0); @@ -305,8 +376,15 @@ std::tuple distance_transform_cuda(torch::Tensor i int64_t total_slices = batch_size * num_slices_per_sample; int64_t slice_len = shape[d_sample + 1]; - int threads = std::min((int64_t)256, slice_len); - size_t smem = slice_len * (sizeof(float) + sizeof(int)) + (slice_len + 1) * sizeof(float); + int threads = std::min((int64_t)MAX_THREADS, slice_len); + + // JFA 需要的 Shared Memory: + // float f[N] + // int idx1[N] + // int idx2[N] + // 总共 slice_len * (4 + 4 + 4) = 12 * slice_len bytes + size_t smem = slice_len * sizeof(float) + + slice_len * sizeof(int) * 2; if (is_first_pass) { const float* in_ptr = input.data_ptr(); From 32a09da72bc0653e7bfa6b286416f4677396144a Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Mon, 8 Dec 2025 20:27:55 +0800 Subject: [PATCH 19/56] 
=?UTF-8?q?=E5=90=88=E5=B9=B6=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=86=85=E5=AD=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torchmorph/csrc/distance_transform_kernel.cu | 588 ++++++++++--------- 1 file changed, 298 insertions(+), 290 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 64c69fb..a8e5a6a 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,75 +1,64 @@ #include #include -#include -#include #include -#include +#include #include #include -#define MAX_DIMS 10 -#define INF_VAL 1e8f -// 保证 blockDim 足以覆盖大多数常见维度大小,或者配合 Loop 处理 -#define MAX_THREADS 1024 +#define INF_VAL 1e8f +#define MAX_THREADS 1024 +#define SMEM_LIMIT_ELEMENTS 4096 // 48KB / 12 bytes (float+int+int) ~= 4096 __device__ __forceinline__ float sqr(float x) { return x * x; } -// 计算从像素 q 到源点 p 的距离代价 (考虑了 p 点本身的数值权重 val_p) +// 计算从像素 q 到源点 p 的距离代价 +// val_p 是源点 p 在上一轮计算后的距离平方值 (weight) __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { - if (p < 0) return INF_VAL; // 无效点 - // dist = (q - p)^2 + f[p] + if (p < 0) return INF_VAL; return sqr((float)q - (float)p) + val_p; } // ------------------------------------------------------------------ -// 核心逻辑: 1D Jump Flooding (JFA) / Doubling Algorithm -// 全并行求解最近点索引,替代串行的抛物线构建 +// JFA 核心逻辑 (Device Function) // ------------------------------------------------------------------ -__device__ void compute_1d_jfa( +// 无论数据是在 Shared Memory 还是 Global Memory,逻辑是一样的 +__device__ void run_jfa_core( int N, - float* __restrict__ s_vals, // 输入数值 (dist^2) - int* __restrict__ s_idx_curr, // ping-pong buffer 1 - int* __restrict__ s_idx_next // ping-pong buffer 2 + int tid, + const float* __restrict__ vals, // 输入权重 (只读) + int* __restrict__ idx_curr, // Ping-Pong Buffer A + int* __restrict__ idx_next // Ping-Pong Buffer B ) { - int tid = threadIdx.x; - - // --- 1. 
初始化 --- - // 每个线程负责一个或多个像素的初始化 + // 1. 初始化 for (int i = tid; i < N; i += blockDim.x) { - // 如果当前位置的值很大,说明它是背景,没有初始源点 (-1) - // 否则源点就是它自己 (i) - if (s_vals[i] >= INF_VAL * 0.9f) { - s_idx_curr[i] = -1; + // 如果输入值很大,说明是背景,没有初始源点 + if (vals[i] >= INF_VAL * 0.9f) { + idx_curr[i] = -1; } else { - s_idx_curr[i] = i; + idx_curr[i] = i; } } __syncthreads(); - // --- 2. 迭代传播 (Step = 1, 2, 4, 8...) --- - // 类似于双调排序或倍增法 - int* idx_in = s_idx_curr; - int* idx_out = s_idx_next; + // 2. 迭代传播 (Step = 1, 2, 4, ... < N) + int* idx_in = idx_curr; + int* idx_out = idx_next; - // 只要步长小于 N,就需要传播 - // 对于 N=1024, 只需要 10 次迭代,每次所有线程全并行 for (int step = 1; step < N; step *= 2) { - for (int i = tid; i < N; i += blockDim.x) { int my_best_p = idx_in[i]; float min_cost = INF_VAL; - // 获取当前最优点的代价 if (my_best_p != -1) { - min_cost = compute_cost(i, my_best_p, s_vals[my_best_p]); + min_cost = compute_cost(i, my_best_p, vals[my_best_p]); } - // --- 检查左边邻居 (i - step) --- + // Check Left int left = i - step; if (left >= 0) { - int left_p = idx_in[left]; // 邻居推荐的源点 + int left_p = idx_in[left]; if (left_p != -1) { - float c = compute_cost(i, left_p, s_vals[left_p]); + float c = compute_cost(i, left_p, vals[left_p]); if (c < min_cost) { min_cost = c; my_best_p = left_p; @@ -77,230 +66,206 @@ __device__ void compute_1d_jfa( } } - // --- 检查右边邻居 (i + step) --- + // Check Right int right = i + step; if (right < N) { - int right_p = idx_in[right]; // 邻居推荐的源点 + int right_p = idx_in[right]; if (right_p != -1) { - float c = compute_cost(i, right_p, s_vals[right_p]); + float c = compute_cost(i, right_p, vals[right_p]); if (c < min_cost) { min_cost = c; my_best_p = right_p; } } } - - // 写入下一轮 Buffer idx_out[i] = my_best_p; } - // 交换 Buffer 指针 + // Swap Pointers int* temp = idx_in; idx_in = idx_out; idx_out = temp; - __syncthreads(); } - // --- 3. 
结果写回 --- - // 如果最后结果在 s_idx_next 里 (循环次数是奇数),需要拷回 s_idx_curr - // 或者直接让调用者知道结果在哪。 - // 为了简单,我们统一把结果放在 s_idx_curr 指向的内存里。 - // 注意:idx_in 现在指向的是包含最新结果的 buffer。 - - // 如果 idx_in 已经指向 s_idx_curr,那不用动。 - // 如果 idx_in 指向 s_idx_next,说明最新结果在 s_idx_next,我们需要把它拷贝回 s_idx_curr - // 或者是调整后续代码读取的指针。 - - // 这里采用简单拷贝回 s_idx_curr 的方式,确保后续逻辑一致 - if (idx_in != s_idx_curr) { + // 3. 确保最终结果在 idx_curr (如果循环结束时在 next,则拷回) + if (idx_in != idx_curr) { for (int i = tid; i < N; i += blockDim.x) { - s_idx_curr[i] = s_idx_next[i]; + idx_curr[i] = idx_next[i]; } __syncthreads(); } } - // ------------------------------------------------------------------ -// 内核 1: 初始 Pass (JFA Version) +// Kernel 1: Shared Memory JFA (Fast Path) +// 适用于 N <= 4096 // ------------------------------------------------------------------ -template -__global__ void edt_kernel_first_pass( - const float* __restrict__ in_data, - float* __restrict__ out_dist, - int32_t* __restrict__ out_idx, - const int64_t* __restrict__ shape, - const int64_t* __restrict__ strides, - int32_t ndim, - int32_t process_dim_sample, - int64_t total_slices, - int64_t num_slices_per_sample +template +__global__ void edt_kernel_shared( + const float* __restrict__ in_data, // 当前维度的输入 (dist^2) + const int32_t* __restrict__ in_indices, // 上一轮的索引图 (N_slices, L, NDim) + float* __restrict__ out_dist, // 输出距离 + int32_t* __restrict__ out_indices, // 输出索引图 + int64_t L, // 当前维度的长度 (Length) + int64_t total_elements // Batch * ... 
* L ) { - int64_t slice_idx = blockIdx.x; - if (slice_idx >= total_slices) return; - - int64_t batch_idx = slice_idx / num_slices_per_sample; - int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t current_offset = batch_idx * strides[0]; - - int32_t base_coords[MAX_DIMS]; - int64_t temp_idx = slice_idx_in_sample; - const int sample_ndim = ndim - 1; - - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) { - base_coords[d] = 0; - continue; - } - int64_t size_of_dim = shape[d + 1]; - int32_t coord = (int32_t)(temp_idx % size_of_dim); - base_coords[d] = coord; - current_offset += coord * strides[d + 1]; - temp_idx /= size_of_dim; - } - - const int64_t process_dim_actual = process_dim_sample + 1; - const int64_t N = shape[process_dim_actual]; - const int64_t stride = strides[process_dim_actual]; + // 这里的 total_elements 是展平后的总像素数 + // 由于我们做了 transpose,数据布局是 [Batch_and_other_dims, L] + // 每个 Block 处理一行 (长度 L) + + int64_t row_idx = blockIdx.x; + int64_t offset = row_idx * L; - if (N == 0) return; + if (offset >= total_elements) return; - // Shared Memory Layout: - // f: float[N] (Values) - // idx1: int[N] (Buffer 1) - // idx2: int[N] (Buffer 2) + // Shared Memory 布局: float vals[L], int idx1[L], int idx2[L] extern __shared__ char s_buffer[]; - float* f = (float*)s_buffer; - int* idx1 = (int*)(f + N); - int* idx2 = (int*)(idx1 + N); - - // Phase 1: 并行加载数据 - for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - float val = __ldg(&in_data[current_offset + i * stride]); - f[i] = (val == 0.0f) ? 0.0f : INF_VAL; + float* s_vals = (float*)s_buffer; + int* s_idx1 = (int*)(s_vals + L); + int* s_idx2 = (int*)(s_idx1 + L); + + // 1. 
加载数据到 Shared Memory + for (int i = threadIdx.x; i < L; i += blockDim.x) { + float val = __ldg(&in_data[offset + i]); + // 如果是初始 Pass (无输入索引),val 为 0 或 INF + // 如果是后续 Pass,val 为上一步的 dist^2 + s_vals[i] = val; } __syncthreads(); - // Phase 2: 并行 JFA 计算 - compute_1d_jfa(N, f, idx1, idx2); - // 结果现在存储在 idx1 中 - - // Phase 3: 并行写回 - for (int q = threadIdx.x; q < N; q += blockDim.x) { - int p = idx1[q]; + // 2. 运行 JFA + run_jfa_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); + + // 3. 写回结果 + for (int q = threadIdx.x; q < L; q += blockDim.x) { + int p = s_idx1[q]; // 最近点在当前行内的局部索引 (0..L-1) float dist_val; - int p_idx; if (p != -1) { - // JFA 得到的是最近源点的索引 p - // 距离 = (q-p)^2 + f[p] - // 注意:在 First Pass 中,f[p] 要么是 0 要么是 INF。如果 p != -1,f[p] 必为 0。 - // 但为了通用性,还是加上 f[p] - float dist_sq = sqr((float)q - (float)p) + f[p]; + // 计算新距离: (q-p)^2 + val[p] + float dist_sq = sqr((float)q - (float)p) + s_vals[p]; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; - p_idx = p; } else { dist_val = IsFinal ? INF_VAL : sqr(INF_VAL); - p_idx = 0; + p = 0; // fallback } - int64_t global_idx = current_offset + q * stride; - out_dist[global_idx] = dist_val; + out_dist[offset + q] = dist_val; - int32_t* out_idx_ptr = out_idx + global_idx * sample_ndim; - #pragma unroll - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = (d == process_dim_sample) ? p_idx : base_coords[d]; + // 4. 
索引传播 + // 我们需要从 in_indices 查找完整的高维索引 + // in_indices 形状: [Batch..., L, NDim] + // 这里的 offset 对应 [Batch..., 0] + // p 是当前维度的偏移 + if (p != -1) { + int64_t src_offset = (offset + p) * NDim; + int64_t dst_offset = (offset + q) * NDim; + + // 手动展开拷贝,或者循环 + for (int d = 0; d < NDim; ++d) { + out_indices[dst_offset + d] = in_indices[src_offset + d]; + } + } else { + // 保持原样或填0 (通常保持原样即可,或者为了安全填0) + int64_t dst_offset = (offset + q) * NDim; + for (int d = 0; d < NDim; ++d) out_indices[dst_offset + d] = 0; } } } // ------------------------------------------------------------------ -// 内核 2: 后续 Pass (JFA Version) +// Kernel 2: Global Memory JFA (Fallback Path) +// 适用于 N > 4096,使用 Global Memory 作为 Ping-Pong Buffer // ------------------------------------------------------------------ -template -__global__ void edt_kernel_subsequent_pass( - const float* __restrict__ in_dist, - const int32_t* __restrict__ in_idx, +template +__global__ void edt_kernel_global( + const float* __restrict__ in_data, + const int32_t* __restrict__ in_indices, float* __restrict__ out_dist, - int32_t* __restrict__ out_idx, - const int64_t* __restrict__ shape, - const int64_t* __restrict__ strides, - int32_t ndim, - int32_t process_dim_sample, - int64_t total_slices, - int64_t num_slices_per_sample + int32_t* __restrict__ out_indices, + int* __restrict__ global_buffer_1, // 临时 buffer A [TotalElements] + int* __restrict__ global_buffer_2, // 临时 buffer B [TotalElements] + int64_t L, + int64_t total_elements ) { - int64_t slice_idx = blockIdx.x; - if (slice_idx >= total_slices) return; - - int64_t batch_idx = slice_idx / num_slices_per_sample; - int64_t slice_idx_in_sample = slice_idx % num_slices_per_sample; - int64_t current_offset = batch_idx * strides[0]; + int64_t row_idx = blockIdx.x; + int64_t offset = row_idx * L; - int64_t temp_idx = slice_idx_in_sample; - const int sample_ndim = ndim - 1; - - for (int32_t d = sample_ndim - 1; d >= 0; --d) { - if (d == process_dim_sample) continue; - int64_t 
size_of_dim = shape[d + 1]; - current_offset += (temp_idx % size_of_dim) * strides[d + 1]; - temp_idx /= size_of_dim; - } - - const int64_t process_dim_actual = process_dim_sample + 1; - const int64_t N = shape[process_dim_actual]; - const int64_t stride = strides[process_dim_actual]; - - if (N == 0) return; - - // Shared Memory Layout 同上 - extern __shared__ char s_buffer[]; - float* f = (float*)s_buffer; - int* idx1 = (int*)(f + N); - int* idx2 = (int*)(idx1 + N); + if (offset >= total_elements) return; - // Phase 1: 加载 - for (int64_t i = threadIdx.x; i < N; i += blockDim.x) { - f[i] = __ldg(&in_dist[current_offset + i * stride]); - } - __syncthreads(); - - // Phase 2: 并行 JFA 计算 - // 这里的 f[i] 是上一轮计算出的距离平方,作为权重 - compute_1d_jfa(N, f, idx1, idx2); + // 指向当前行在 Global Memory 中的位置 + // 注意:in_data 是只读的,我们需要把它当做 weight + // JFA 需要两个 int buffer 来存 index + int* g_idx1 = global_buffer_1 + offset; + int* g_idx2 = global_buffer_2 + offset; + + // 直接在 Global Memory 上运行 JFA + // 注意:这里 vals 指针直接指向 in_data (Global),读取稍慢但无需拷贝 + run_jfa_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); - // Phase 3: 写回 - for (int q = threadIdx.x; q < N; q += blockDim.x) { - int p = idx1[q]; // 最近源点在当前行的索引 + // 写回逻辑同上 + for (int q = threadIdx.x; q < L; q += blockDim.x) { + int p = g_idx1[q]; float dist_val; if (p != -1) { - float dist_sq = sqr((float)q - (float)p) + f[p]; + float val_p = in_data[offset + p]; + float dist_sq = sqr((float)q - (float)p) + val_p; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; } else { dist_val = IsFinal ? 
INF_VAL : sqr(INF_VAL); - p = 0; // fallback + p = 0; } - int64_t q_global_offset = current_offset + q * stride; - out_dist[q_global_offset] = dist_val; + out_dist[offset + q] = dist_val; - // 索引处理 if (p != -1) { - int64_t p_global_offset = current_offset + p * stride; - const int32_t* src_idx_ptr = in_idx + p_global_offset * sample_ndim; - int32_t* out_idx_ptr = out_idx + q_global_offset * sample_ndim; - - #pragma unroll - for (int d = 0; d < sample_ndim; ++d) { - out_idx_ptr[d] = src_idx_ptr[d]; + int64_t src_offset = (offset + p) * NDim; + int64_t dst_offset = (offset + q) * NDim; + for (int d = 0; d < NDim; ++d) { + out_indices[dst_offset + d] = in_indices[src_offset + d]; } + } else { + int64_t dst_offset = (offset + q) * NDim; + for (int d = 0; d < NDim; ++d) out_indices[dst_offset + d] = 0; } } } + +// ------------------------------------------------------------------ +// 辅助:初始化索引张量 +// ------------------------------------------------------------------ +// 将 index tensor 初始化为 grid grid coordinates +// shape: (..., D), 最后一个维度存坐标 +__global__ void init_indices_kernel(int32_t* indices, int64_t total_elements, int NDim, + const int64_t* shape, const int64_t* strides) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_elements) return; + + // 反解坐标 + int64_t temp = idx; + int32_t coords[10]; // max dims + + // strides 是针对 elements 展开的,但这里 indices 是 (Total, NDim) + // 我们可以简单地根据 shape 反解 + // 注意:这里的 total_elements 是像素数,不是 indices 数组的大小 + + // 假设 shape 是 [D0, D1, D2] + // idx 对应 flat index + + for (int d = NDim - 1; d >= 0; --d) { + coords[d] = temp % shape[d]; + temp /= shape[d]; + } + + // 写入 + for (int d = 0; d < NDim; ++d) { + indices[idx * NDim + d] = coords[d]; + } +} + // ------------------------------------------------------------------ // Host 函数 // ------------------------------------------------------------------ @@ -309,119 +274,162 @@ std::tuple distance_transform_cuda(torch::Tensor i TORCH_CHECK(input.scalar_type() == 
torch::kFloat32, "Input must be float32."); input = input.contiguous(); + bool had_no_batch_dim = (input.dim() == 1); + if (had_no_batch_dim) input = input.unsqueeze(0); - bool had_no_batch_dim = (input.dim() == 1); - if (had_no_batch_dim) { - input = input.unsqueeze(0); - } - - const auto ndim = input.dim(); - const auto sample_ndim = ndim - 1; - const auto batch_size = input.size(0); + const int ndim = input.dim(); // Include batch + const int sample_ndim = ndim - 1; auto shape = input.sizes().vec(); - auto strides_vec = input.strides().vec(); - - if (input.numel() == 0) { + int64_t num_pixels = input.numel(); + + if (num_pixels == 0) { auto index_shape = shape; index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); return std::make_tuple(torch::empty_like(input), - torch::empty(index_shape, input.options().dtype(torch::kInt32))); + torch::empty(index_shape, input.options().dtype(torch::kInt32))); } - auto distance = torch::empty_like(input); + // 1. 初始化输出 Tensor + // current_dist 在迭代过程中存储 dist^2,最后开方 + // 初始状态:Input 里的 0 还是 0,其他非 0 (背景) 设为 INF + auto current_dist = torch::where(input == 0, + torch::tensor(0.0f, input.options()), + torch::tensor(INF_VAL, input.options())); + + // 初始化索引 Map (Batch, ..., NDim) auto index_shape = shape; - index_shape.push_back(sample_ndim); - auto index = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - - auto buffer_dist = torch::empty_like(distance); - auto buffer_idx = torch::empty_like(index); - - auto shape_tensor = torch::tensor(shape, torch::TensorOptions().dtype(torch::kInt64).device(input.device())); - auto strides_tensor = torch::tensor(strides_vec, torch::TensorOptions().dtype(torch::kInt64).device(input.device())); - - std::vector> dim_order_pairs; - for (int32_t d = 0; d < sample_ndim; ++d) { - dim_order_pairs.push_back({strides_vec[d + 1], d}); + index_shape.push_back(sample_ndim); + auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); + + // 启动 Kernel 初始化索引 + // 
为了反解坐标,我们需要把 shape 传进去 + { + // 排除 batch 维度的 shape 用于坐标计算? + // 需求是:返回的索引是 (batch_idx, z, y, x) 还是只是 (z, y, x)? + // 通常 EDT 返回的是 sample 内的坐标。所以我们忽略 batch 维度。 + std::vector sample_shape_vec(shape.begin() + 1, shape.end()); + auto sample_shape_tensor = torch::tensor(sample_shape_vec, torch::kInt64).to(input.device()); + // 这里的 strides 不需要,直接由 shape 反解 + + int threads = 256; + int blocks = (num_pixels + threads - 1) / threads; + + // 我们需要传递 sample_ndim + init_indices_kernel<<>>( + current_idx.data_ptr(), + num_pixels, + sample_ndim, + sample_shape_tensor.data_ptr(), + nullptr // strides not needed for simple unravel + ); } - std::sort(dim_order_pairs.rbegin(), dim_order_pairs.rend()); + + // 用于 Global Memory Fallback 的临时 buffer + torch::Tensor global_buf1, global_buf2; + + // 2. 逐维处理 (Separable Phases) + // 从最后一个维度倒着处理,或者顺序处理都可以。 + // 为了 Host Transpose 方便,我们遍历 sample 的每一个维度 (1 到 ndim-1) + for (int d = 1; d < ndim; ++d) { + bool is_final_pass = (d == ndim - 1); + + // ----------------------------------------------------------- + // Step A: Permute & Contiguous + // 将当前处理维度 d 移到最后: (0, 1, ..., d, ..., N-1) -> (0, 1, ..., N-1, d) + // 这样最后内存布局就是 [..., L],stride=1 + // ----------------------------------------------------------- + + // 这种 swap 策略比较简单: transpose(d, -1) + // 注意:index tensor 也要变换,但 index tensor 最后一维是 coord_dim,不能乱动。 + // Index tensor 形状是 [..., sample_ndim]。 + // 我们需要变换的是前面的空间维度 [...]。 + + auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); + auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); + + // 此时 dist_in shape: [..., L] + // idx_in shape: [..., L, sample_ndim] + + int64_t L = dist_in.size(-1); // 当前维度的长度 + int64_t total_slices = dist_in.numel() / L; // 有多少行 + + auto dist_out = torch::empty_like(dist_in); + auto idx_out = torch::empty_like(idx_in); - for (int pass = 0; pass < sample_ndim; ++pass) { - int32_t d_sample = dim_order_pairs[pass].second; - bool is_first_pass = (pass == 0); - bool is_final_pass = (pass == sample_ndim - 
1); + // ----------------------------------------------------------- + // Step B: Kernel Dispatch + // ----------------------------------------------------------- + int threads = std::min((int64_t)MAX_THREADS, L); + + // 检查 Shared Memory 需求 + // Need: float(4) + int(4) + int(4) = 12 bytes per pixel + if (L <= SMEM_LIMIT_ELEMENTS) { + size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); + + // 模板参数 NDim 需要是编译期常量。 + // 动态分发 sample_ndim (1D, 2D, 3D usually) + // 使用 switch case 覆盖常见维度 (1, 2, 3) + #define DISPATCH_SHARED(IS_FINAL) \ + switch(sample_ndim) { \ + case 1: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel()); break; \ + case 2: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel()); break; \ + case 3: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel()); break; \ + default: /* fallback for >3D */ break; \ + } - torch::Tensor *in_d, *in_i, *out_d, *out_i; + if (is_final_pass) { DISPATCH_SHARED(true); } + else { DISPATCH_SHARED(false); } - if (is_first_pass) { - in_d = nullptr; in_i = nullptr; - out_d = is_final_pass ? &distance : &buffer_dist; - out_i = is_final_pass ? 
&index : &buffer_idx; } else { - if (pass % 2 != 0) { - in_d = &buffer_dist; in_i = &buffer_idx; - out_d = &distance; out_i = &index; - } else { - in_d = &distance; in_i = &index; - out_d = &buffer_dist; out_i = &buffer_idx; - } - if (is_final_pass) { - out_d = &distance; out_i = &index; + // Fallback: Global Memory + // 需要分配 buffer: [total_slices * L] = [numel] + if (global_buf1.numel() < dist_in.numel()) { + global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); + global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); } - } - int64_t num_slices_per_sample = 1; - for(int i = 0; i < sample_ndim; ++i) { - if (i != d_sample) num_slices_per_sample *= shape[i + 1]; - } - int64_t total_slices = batch_size * num_slices_per_sample; - int64_t slice_len = shape[d_sample + 1]; - - int threads = std::min((int64_t)MAX_THREADS, slice_len); - - // JFA 需要的 Shared Memory: - // float f[N] - // int idx1[N] - // int idx2[N] - // 总共 slice_len * (4 + 4 + 4) = 12 * slice_len bytes - size_t smem = slice_len * sizeof(float) + - slice_len * sizeof(int) * 2; - - if (is_first_pass) { - const float* in_ptr = input.data_ptr(); - if (is_final_pass) { - edt_kernel_first_pass<<>>( - in_ptr, out_d->data_ptr(), out_i->data_ptr(), - shape_tensor.data_ptr(), strides_tensor.data_ptr(), - ndim, d_sample, total_slices, num_slices_per_sample - ); - } else { - edt_kernel_first_pass<<>>( - in_ptr, out_d->data_ptr(), out_i->data_ptr(), - shape_tensor.data_ptr(), strides_tensor.data_ptr(), - ndim, d_sample, total_slices, num_slices_per_sample - ); - } - } else { - if (is_final_pass) { - edt_kernel_subsequent_pass<<>>( - in_d->data_ptr(), in_i->data_ptr(), - out_d->data_ptr(), out_i->data_ptr(), - shape_tensor.data_ptr(), strides_tensor.data_ptr(), - ndim, d_sample, total_slices, num_slices_per_sample - ); - } else { - edt_kernel_subsequent_pass<<>>( - in_d->data_ptr(), in_i->data_ptr(), - 
out_d->data_ptr(), out_i->data_ptr(), - shape_tensor.data_ptr(), strides_tensor.data_ptr(), - ndim, d_sample, total_slices, num_slices_per_sample - ); - } + #define DISPATCH_GLOBAL(IS_FINAL) \ + switch(sample_ndim) { \ + case 1: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel()); break; \ + case 2: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel()); break; \ + case 3: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel()); break; \ + default: break; \ + } + + if (is_final_pass) { DISPATCH_GLOBAL(true); } + else { DISPATCH_GLOBAL(false); } } + + // ----------------------------------------------------------- + // Step C: Transpose Back + // ----------------------------------------------------------- + current_dist = dist_out.transpose(d, ndim - 1); + current_idx = idx_out.transpose(d, ndim - 1); } - - if (had_no_batch_dim) { - return std::make_tuple(distance.squeeze(0), index.squeeze(0)); + + if (had_no_batch_dim) { + return std::make_tuple(current_dist.squeeze(0), current_idx.squeeze(0)); } - return std::make_tuple(distance, index); + return std::make_tuple(current_dist, current_idx); } \ No newline at end of file From 5a9872f96073b4d4e3c8472d188825209752c274 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Mon, 8 Dec 2025 20:30:09 +0800 Subject: [PATCH 20/56] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=89=E7=BB=B4?= =?UTF-8?q?=E4=BB=A5=E4=B8=8A=E7=BB=B4=E5=BA=A6=E7=9A=84=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_distance_transform.py | 146 ++++---- 
torchmorph/csrc/distance_transform_kernel.cu | 332 ++++++++++--------- 2 files changed, 267 insertions(+), 211 deletions(-) diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 476ffca..6e6265d 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -1,21 +1,20 @@ import torch import pytest -from scipy.ndimage import distance_transform_edt as scipy_edt import numpy as np -import torchmorph as tm +from scipy.ndimage import distance_transform_edt as scipy_edt +import torchmorph as tm -# 辅助函数:调用 SciPy 并处理格式 +# ====================================================================== +# 辅助函数 +# ====================================================================== def batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, np.ndarray]: - + dist_results, indices_results = [], [] - input_is_1d_batch = (batch_numpy.ndim == 2) - input_is_single_sample_no_batch = (batch_numpy.ndim == 1) + # 保证 batch_numpy 至少是 (Batch, ...) + # 如果进来的是 (H, W),我们在外面已经处理成 (1, H, W) 了 + if batch_numpy.ndim == 1: + batch_numpy = batch_numpy[np.newaxis, ...] - if input_is_single_sample_no_batch: - batch_numpy = batch_numpy[np.newaxis, ...] # (L) -> (1, L) - - - dist_results, indices_results = [], [] for sample in batch_numpy: dist, indices = scipy_edt(sample, return_indices=True, return_distances=True) dist_results.append(dist) @@ -23,81 +22,106 @@ def batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, n output_dist = np.stack(dist_results, axis=0) output_indices = np.stack(indices_results, axis=0) + output_indices = np.moveaxis(output_indices, 1, -1) - # indices shape fix: (N, ndim_sample, ...) 
-> (N, ..., ndim_sample) - # 对于 1D: (N, 1, L) -> (N, L, 1) - output_indices = np.moveaxis(output_indices, 1, -1) - - if input_is_single_sample_no_batch: - output_dist = output_dist.squeeze(0) - output_indices = output_indices.squeeze(0) - return output_dist, output_indices -# 用例定义 -case_batch_2d = np.array([[[0., 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]],[[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]]], dtype=np.float32) +# ====================================================================== +# 测试数据 +# ====================================================================== +case_batch_1d = np.array([[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], dtype=np.float32) + +case_batch_2d = np.array([[[0., 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], + [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]]], dtype=np.float32) + +# 这里定义为 (4, 4),意图是单张 2D 图 +case_single_2d = np.array([[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0]], dtype=np.float32) +case_explicit_batch_one = case_single_2d[np.newaxis, ...] + _case_3d_s1 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s1[1, 1, 1] = 0.0; _case_3d_s1[2, 3, 4] = 0.0 _case_3d_s2 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s2[0, 0, 0] = 0.0 case_batch_3d = np.stack([_case_3d_s1, _case_3d_s2], axis=0) -case_single_2d = np.array([[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0]], dtype=np.float32) -case_explicit_batch_one = case_single_2d[np.newaxis, ...] 
+ case_dim_one = np.ones((2, 5, 1), dtype=np.float32); case_dim_one[0, 2, 0] = 0.0; case_dim_one[1, 4, 0] = 0.0 -case_batch_1d = np.array([[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], dtype=np.float32) +# 4D Case +_case_4d_s1 = np.ones((3, 3, 3, 3), dtype=np.float32); _case_4d_s1[0, 0, 0, 0] = 0.0 +_case_4d_s2 = np.ones((3, 3, 3, 3), dtype=np.float32); _case_4d_s2[1, 1, 1, 1] = 0.0 +case_batch_4d_spatial = np.stack([_case_4d_s1, _case_4d_s2], axis=0) + +# 5D Case +case_batch_5d_spatial = np.ones((1, 2, 2, 2, 2, 2), dtype=np.float32) +case_batch_5d_spatial[0, 0, 0, 0, 0, 0] = 0.0; case_batch_5d_spatial[0, 1, 1, 1, 1, 1] = 0.0 + +# ====================================================================== +# 测试逻辑 +# ====================================================================== @pytest.mark.parametrize( - "input_numpy", + "input_numpy, has_batch_dim", [ - pytest.param(case_batch_2d, id="批处理2D图像"), - pytest.param(case_batch_3d, id="批处理3D图像"), - pytest.param(case_single_2d, id="单张2D图像(隐式批处理)"), - pytest.param(case_explicit_batch_one, id="单张2D图像(显式批处理)"), - pytest.param(case_dim_one, id="含幺元维度的批处理"), - pytest.param(case_batch_1d, id="批处理1D数据"), + pytest.param(case_batch_1d, True, id="1D_Batch"), + pytest.param(case_batch_2d, True, id="2D_Batch"), + pytest.param(case_single_2d, False, id="2D_Single_NoBatch"), + pytest.param(case_explicit_batch_one, True, id="2D_Single_ExplicitBatch"), + pytest.param(case_batch_3d, True, id="3D_Batch"), + pytest.param(case_dim_one, True, id="2D_UnitDim_Batch"), + pytest.param(case_batch_4d_spatial, True, id="4D_Spatial_Batch"), + pytest.param(case_batch_5d_spatial, True, id="5D_Spatial_Batch"), ], ) -def test_distance_transform_and_indices(input_numpy: np.ndarray, request: pytest.FixtureRequest): +def test_distance_transform_and_indices(input_numpy: np.ndarray, has_batch_dim: bool, request: pytest.FixtureRequest): if not torch.cuda.is_available(): pytest.skip("CUDA not available") + # 1. 
准备 Numpy 数据 x_numpy_contiguous = np.ascontiguousarray(input_numpy) + + # 2. 准备 SciPy 输入 + # 如果意图是单样本 (has_batch_dim=False),我们手动增加 Batch 维, + # 这样 scipy 辅助函数就会把它当做一张图来处理,而不是 N 张 1D 图 + if not has_batch_dim: + scipy_input = x_numpy_contiguous[np.newaxis, ...] + else: + scipy_input = x_numpy_contiguous + + # 3. 准备 CUDA 输入 + # 关键修复: + # 如果 has_batch_dim=False,说明这是单张 (H, W),我们要测 2D EDT。 + # C++ API 默认第一维是 Batch,所以我们必须 unsqueeze(0) 变成 (1, H, W)。 + # 否则 C++ 会把它当做 (Batch=H, Len=W) 做 1D EDT。 x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() + if not has_batch_dim: + x_cuda = x_cuda.unsqueeze(0) - print(f"\n\n--- 正在运行测试: {request.node.callspec.id} ---") - print(f"输入张量形状: {x_cuda.shape}") + print(f"\n\n--- 运行测试: {request.node.callspec.id} ---") + print(f"CUDA 输入形状: {x_cuda.shape}") - # 调用您的 Python 包装函数 + # 4. 运行 CUDA EDT dist_cuda, idx_cuda = tm.distance_transform(x_cuda.clone()) - print(f"CUDA 距离输出形状: {dist_cuda.shape}") - print(f"CUDA 坐标输出形状: {idx_cuda.shape}") - - # 调用 SciPy 作为参考基准 - dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(x_numpy_contiguous) + # 5. 运行 SciPy (Ground Truth) + dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(scipy_input) dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() - - print(f"SciPy 距离输出形状: {dist_ref.shape}") - # 断言验证 - print("\n--- 正在验证距离... ---") - assert dist_cuda.shape == dist_ref.shape + # 6. 验证距离 + # 此时 dist_cuda 是 (1, H, W),dist_ref 也是 (1, H, W) + # 如果原意是 NoBatch,我们可以把 Batch 维 squeeze 掉再比,或者直接比 + print(f"CUDA Out Shape: {dist_cuda.shape}, Ref Shape: {dist_ref.shape}") + assert dist_cuda.shape == dist_ref.shape, f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) - print("距离断言通过 (形状和数值接近)。") + print(">> 距离验证通过。") - print("\n--- 正在验证坐标... ---") - - # 鲁棒的坐标验证逻辑 - had_no_batch_dim = (x_numpy_contiguous.ndim <= idx_cuda.shape[-1]) - spatial_shape = x_cuda.shape if had_no_batch_dim else x_cuda.shape[1:] + # 7. 
验证索引 + # idx_cuda: (1, H, W, 2) + # 构造 Grid + spatial_shape = x_cuda.shape[1:] # (H, W) coords = [torch.arange(s, device='cuda') for s in spatial_shape] - grid = torch.stack(torch.meshgrid(*coords, indexing='ij'), dim=-1) - - if not had_no_batch_dim: - grid = grid.unsqueeze(0) - + grid = torch.stack(torch.meshgrid(*coords, indexing='ij'), dim=-1) # (H, W, 2) + grid = grid.unsqueeze(0) # (1, H, W, 2) + diff = grid.float() - idx_cuda.float() - dist_sq_from_indices = torch.sum(diff * diff, dim=-1) + dist_sq_calculated = torch.sum(diff * diff, dim=-1) + dist_sq_output = dist_cuda * dist_cuda - torch.testing.assert_close(dist_sq_from_indices, dist_cuda * dist_cuda, atol=1e-3, rtol=1e-3) - print("坐标正确性断言通过 (计算出的距离与返回距离匹配)。") - - print("--- 测试通过 ---") \ No newline at end of file + torch.testing.assert_close(dist_sq_calculated, dist_sq_output, atol=1e-3, rtol=1e-3) + print(">> 索引验证通过。") \ No newline at end of file diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index a8e5a6a..6e9c4c1 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -5,23 +5,32 @@ #include #include +// ------------------------------------------------------------------ +// 配置常量 +// ------------------------------------------------------------------ #define INF_VAL 1e8f #define MAX_THREADS 1024 -#define SMEM_LIMIT_ELEMENTS 4096 // 48KB / 12 bytes (float+int+int) ~= 4096 +// Shared Memory 限制: 48KB 一般安全。 +// 每个像素需要: float(val) + int(idx1) + int(idx2) = 12 bytes +// 4096 * 12 = 48KB. 
+#define SMEM_LIMIT_ELEMENTS 4096 + +// ------------------------------------------------------------------ +// Device Helper Functions +// ------------------------------------------------------------------ __device__ __forceinline__ float sqr(float x) { return x * x; } -// 计算从像素 q 到源点 p 的距离代价 -// val_p 是源点 p 在上一轮计算后的距离平方值 (weight) +// 计算 JFA 代价: (q - p)^2 + weight[p] __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { if (p < 0) return INF_VAL; return sqr((float)q - (float)p) + val_p; } // ------------------------------------------------------------------ -// JFA 核心逻辑 (Device Function) +// JFA Core Logic (Device Only) // ------------------------------------------------------------------ -// 无论数据是在 Shared Memory 还是 Global Memory,逻辑是一样的 +// 核心 JFA 逻辑,与数据位置无关 (Shared 或 Global 均通用) __device__ void run_jfa_core( int N, int tid, @@ -29,13 +38,12 @@ __device__ void run_jfa_core( int* __restrict__ idx_curr, // Ping-Pong Buffer A int* __restrict__ idx_next // Ping-Pong Buffer B ) { - // 1. 初始化 + // 1. 
初始化: 根据 vals 决定是否是有效源点 for (int i = tid; i < N; i += blockDim.x) { - // 如果输入值很大,说明是背景,没有初始源点 if (vals[i] >= INF_VAL * 0.9f) { - idx_curr[i] = -1; + idx_curr[i] = -1; // 背景 } else { - idx_curr[i] = i; + idx_curr[i] = i; // 物体/源点,初始索引指向自己 } } __syncthreads(); @@ -49,11 +57,12 @@ __device__ void run_jfa_core( int my_best_p = idx_in[i]; float min_cost = INF_VAL; + // 检查自己当前的最优解 if (my_best_p != -1) { min_cost = compute_cost(i, my_best_p, vals[my_best_p]); } - // Check Left + // Check Left Neighbor (-step) int left = i - step; if (left >= 0) { int left_p = idx_in[left]; @@ -66,7 +75,7 @@ __device__ void run_jfa_core( } } - // Check Right + // Check Right Neighbor (+step) int right = i + step; if (right < N) { int right_p = idx_in[right]; @@ -99,42 +108,41 @@ __device__ void run_jfa_core( // ------------------------------------------------------------------ // Kernel 1: Shared Memory JFA (Fast Path) -// 适用于 N <= 4096 // ------------------------------------------------------------------ +// 模板参数 NDim: 如果 > 0,编译器会展开循环优化。 +// 参数 runtime_ndim: 如果 NDim == 0 (Default case),使用该参数作为维度。 template __global__ void edt_kernel_shared( - const float* __restrict__ in_data, // 当前维度的输入 (dist^2) - const int32_t* __restrict__ in_indices, // 上一轮的索引图 (N_slices, L, NDim) - float* __restrict__ out_dist, // 输出距离 - int32_t* __restrict__ out_indices, // 输出索引图 - int64_t L, // 当前维度的长度 (Length) - int64_t total_elements // Batch * ... * L + const float* __restrict__ in_data, // 输入 Dist^2 + const int32_t* __restrict__ in_indices, // 输入 Indices + float* __restrict__ out_dist, // 输出 Dist (IsFinal ? sqrt : sqr) + int32_t* __restrict__ out_indices, // 输出 Indices + int64_t L, // 当前维度的长度 + int64_t total_elements, // 总像素数 + int runtime_ndim // 运行时维度 (fallback) ) { - // 这里的 total_elements 是展平后的总像素数 - // 由于我们做了 transpose,数据布局是 [Batch_and_other_dims, L] - // 每个 Block 处理一行 (长度 L) - + // 确定实际维度 + const int D = (NDim > 0) ? 
NDim : runtime_ndim; + + // 计算行偏移 int64_t row_idx = blockIdx.x; int64_t offset = row_idx * L; if (offset >= total_elements) return; - // Shared Memory 布局: float vals[L], int idx1[L], int idx2[L] + // Shared Memory 布局 extern __shared__ char s_buffer[]; float* s_vals = (float*)s_buffer; int* s_idx1 = (int*)(s_vals + L); int* s_idx2 = (int*)(s_idx1 + L); - // 1. 加载数据到 Shared Memory + // 1. 加载 Dist 到 Shared Memory for (int i = threadIdx.x; i < L; i += blockDim.x) { - float val = __ldg(&in_data[offset + i]); - // 如果是初始 Pass (无输入索引),val 为 0 或 INF - // 如果是后续 Pass,val 为上一步的 dist^2 - s_vals[i] = val; + s_vals[i] = __ldg(&in_data[offset + i]); } __syncthreads(); - // 2. 运行 JFA + // 2. 运行 JFA 核心 run_jfa_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); // 3. 写回结果 @@ -142,69 +150,64 @@ __global__ void edt_kernel_shared( int p = s_idx1[q]; // 最近点在当前行内的局部索引 (0..L-1) float dist_val; + // 计算新距离 if (p != -1) { - // 计算新距离: (q-p)^2 + val[p] float dist_sq = sqr((float)q - (float)p) + s_vals[p]; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; } else { dist_val = IsFinal ? INF_VAL : sqr(INF_VAL); - p = 0; // fallback + p = 0; // 防止越界,随便指一个 } - out_dist[offset + q] = dist_val; - // 4. 
索引传播 - // 我们需要从 in_indices 查找完整的高维索引 - // in_indices 形状: [Batch..., L, NDim] - // 这里的 offset 对应 [Batch..., 0] - // p 是当前维度的偏移 + // 索引传播: Copy Vector [D] if (p != -1) { - int64_t src_offset = (offset + p) * NDim; - int64_t dst_offset = (offset + q) * NDim; + int64_t src_offset = (offset + p) * D; + int64_t dst_offset = (offset + q) * D; - // 手动展开拷贝,或者循环 - for (int d = 0; d < NDim; ++d) { + // 如果 NDim > 0,这里会完全展开,非常快 + for (int d = 0; d < D; ++d) { out_indices[dst_offset + d] = in_indices[src_offset + d]; } } else { - // 保持原样或填0 (通常保持原样即可,或者为了安全填0) - int64_t dst_offset = (offset + q) * NDim; - for (int d = 0; d < NDim; ++d) out_indices[dst_offset + d] = 0; + // 找不到源点(全图都是背景的情况) + int64_t dst_offset = (offset + q) * D; + for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; } } } // ------------------------------------------------------------------ // Kernel 2: Global Memory JFA (Fallback Path) -// 适用于 N > 4096,使用 Global Memory 作为 Ping-Pong Buffer // ------------------------------------------------------------------ +// 逻辑同上,只是用 Global Memory 做 Ping-Pong Buffer template __global__ void edt_kernel_global( const float* __restrict__ in_data, const int32_t* __restrict__ in_indices, float* __restrict__ out_dist, int32_t* __restrict__ out_indices, - int* __restrict__ global_buffer_1, // 临时 buffer A [TotalElements] - int* __restrict__ global_buffer_2, // 临时 buffer B [TotalElements] + int* __restrict__ global_buffer_1, + int* __restrict__ global_buffer_2, int64_t L, - int64_t total_elements + int64_t total_elements, + int runtime_ndim ) { + const int D = (NDim > 0) ? 
NDim : runtime_ndim; + int64_t row_idx = blockIdx.x; int64_t offset = row_idx * L; if (offset >= total_elements) return; - // 指向当前行在 Global Memory 中的位置 - // 注意:in_data 是只读的,我们需要把它当做 weight - // JFA 需要两个 int buffer 来存 index + // 指向 Global Memory 的指针 int* g_idx1 = global_buffer_1 + offset; int* g_idx2 = global_buffer_2 + offset; - // 直接在 Global Memory 上运行 JFA - // 注意:这里 vals 指针直接指向 in_data (Global),读取稍慢但无需拷贝 + // 1. & 2. 运行 JFA (直接在 Global Mem 上读写) run_jfa_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); - // 写回逻辑同上 + // 3. 写回结果 for (int q = threadIdx.x; q < L; q += blockDim.x) { int p = g_idx1[q]; float dist_val; @@ -221,177 +224,191 @@ __global__ void edt_kernel_global( out_dist[offset + q] = dist_val; if (p != -1) { - int64_t src_offset = (offset + p) * NDim; - int64_t dst_offset = (offset + q) * NDim; - for (int d = 0; d < NDim; ++d) { + int64_t src_offset = (offset + p) * D; + int64_t dst_offset = (offset + q) * D; + for (int d = 0; d < D; ++d) { out_indices[dst_offset + d] = in_indices[src_offset + d]; } } else { - int64_t dst_offset = (offset + q) * NDim; - for (int d = 0; d < NDim; ++d) out_indices[dst_offset + d] = 0; + int64_t dst_offset = (offset + q) * D; + for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; } } } - // ------------------------------------------------------------------ -// 辅助:初始化索引张量 +// Kernel 3: Initialize Indices // ------------------------------------------------------------------ -// 将 index tensor 初始化为 grid grid coordinates -// shape: (..., D), 最后一个维度存坐标 -__global__ void init_indices_kernel(int32_t* indices, int64_t total_elements, int NDim, - const int64_t* shape, const int64_t* strides) { +// 初始化索引张量为网格坐标 +// indices shape: (..., D) +__global__ void init_indices_kernel( + int32_t* indices, + int64_t total_pixels, + int NDim, + const int64_t* __restrict__ shape_ptr // shape of spatial dimensions +) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; + if (idx >= total_pixels) 
return; - // 反解坐标 + // 反解坐标 (Unravel Index) + // idx 是每个像素的 flat index + // 我们需要计算它在 spatial_shape 中的坐标 + int64_t temp = idx; - int32_t coords[10]; // max dims + // 使用本地寄存器数组避免多次全局内存读取 (假设最大 10 维) + int32_t coords[10]; - // strides 是针对 elements 展开的,但这里 indices 是 (Total, NDim) - // 我们可以简单地根据 shape 反解 - // 注意:这里的 total_elements 是像素数,不是 indices 数组的大小 - - // 假设 shape 是 [D0, D1, D2] - // idx 对应 flat index - + // 假设 spatial_shape 是 [D0, D1, D2] + // 倒序计算除余 for (int d = NDim - 1; d >= 0; --d) { - coords[d] = temp % shape[d]; - temp /= shape[d]; + int64_t dim_size = shape_ptr[d]; + coords[d] = temp % dim_size; + temp /= dim_size; } - // 写入 + // 写入 Global Memory + // Indices tensor 是 (TotalPixels, NDim) 扁平化的 + int64_t out_ptr = idx * NDim; for (int d = 0; d < NDim; ++d) { - indices[idx * NDim + d] = coords[d]; + indices[out_ptr + d] = coords[d]; } } // ------------------------------------------------------------------ -// Host 函数 +// Host Function: C++ Entry Point // ------------------------------------------------------------------ + std::tuple distance_transform_cuda(torch::Tensor input) { TORCH_CHECK(input.is_cuda(), "Input must be on CUDA device."); TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32."); input = input.contiguous(); - bool had_no_batch_dim = (input.dim() == 1); - if (had_no_batch_dim) input = input.unsqueeze(0); - - const int ndim = input.dim(); // Include batch + + // 处理 Batch 维度:如果输入是 1D (L),视为无 Batch,但在处理中统一加一个 Batch 维方便 + // 标准约定:Input shape (Batch, D1, D2, ..., Dn) + // 算法对 Batch 维度和其他维度处理其实是一样的(视为无关维度) + // 但索引初始化需要知道哪些是 "Spatial Dimensions"。 + // 这里假设:输入的所有维度除了 Batch (Dim 0) 外都是空间维度。 + + const int ndim = input.dim(); + // 如果 ndim=1, 假设是 (L),sample_ndim=1 + // 如果 ndim=4 (B, C, H, W),sample_ndim=3 (C,H,W 都算空间? 
通常 C 也是独立处理的) + // **修正**: 标准 EDT 通常是在 (H, W) 或 (D, H, W) 上进行的。 + // 如果有 Channel,通常 Channel 也是独立的。 + // 为了最通用,我们将 **除了第0维(Batch)** 以外的所有维度都视为空间维度进行索引记录。 + // 如果用户输入没有 Batch 维,请在 Python 端 unsqueeze(0)。 + + // 假设输入已经是 (Batch, ...Spatial...) const int sample_ndim = ndim - 1; + TORCH_CHECK(sample_ndim > 0, "Input tensor must have at least 2 dimensions (Batch, ...)"); + auto shape = input.sizes().vec(); int64_t num_pixels = input.numel(); if (num_pixels == 0) { auto index_shape = shape; - index_shape.push_back(sample_ndim > 0 ? sample_ndim : 1); + index_shape.push_back(sample_ndim); return std::make_tuple(torch::empty_like(input), torch::empty(index_shape, input.options().dtype(torch::kInt32))); } - // 1. 初始化输出 Tensor - // current_dist 在迭代过程中存储 dist^2,最后开方 - // 初始状态:Input 里的 0 还是 0,其他非 0 (背景) 设为 INF + // 1. 初始化 Distance Tensor + // 0 -> 0, 1 -> INF auto current_dist = torch::where(input == 0, torch::tensor(0.0f, input.options()), torch::tensor(INF_VAL, input.options())); - // 初始化索引 Map (Batch, ..., NDim) + // 2. 初始化 Index Tensor + // Shape: (Batch, D1, ..., Dn, sample_ndim) auto index_shape = shape; index_shape.push_back(sample_ndim); auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - // 启动 Kernel 初始化索引 - // 为了反解坐标,我们需要把 shape 传进去 + // 2.1 准备 Shape 数据传给 Kernel + std::vector spatial_shape(shape.begin() + 1, shape.end()); + auto shape_tensor = torch::tensor(spatial_shape, torch::kInt64).to(input.device()); + + // 2.2 运行初始化 Kernel { - // 排除 batch 维度的 shape 用于坐标计算? - // 需求是:返回的索引是 (batch_idx, z, y, x) 还是只是 (z, y, x)? 
- // 通常 EDT 返回的是 sample 内的坐标。所以我们忽略 batch 维度。 - std::vector sample_shape_vec(shape.begin() + 1, shape.end()); - auto sample_shape_tensor = torch::tensor(sample_shape_vec, torch::kInt64).to(input.device()); - // 这里的 strides 不需要,直接由 shape 反解 - int threads = 256; int blocks = (num_pixels + threads - 1) / threads; - - // 我们需要传递 sample_ndim init_indices_kernel<<>>( - current_idx.data_ptr(), - num_pixels, + current_idx.data_ptr(), + num_pixels, sample_ndim, - sample_shape_tensor.data_ptr(), - nullptr // strides not needed for simple unravel + shape_tensor.data_ptr() ); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("Init Kernel Failed: %s\n", cudaGetErrorString(err)); + } } - - // 用于 Global Memory Fallback 的临时 buffer + + // 预分配 Global Memory Buffer (懒加载) torch::Tensor global_buf1, global_buf2; - // 2. 逐维处理 (Separable Phases) - // 从最后一个维度倒着处理,或者顺序处理都可以。 - // 为了 Host Transpose 方便,我们遍历 sample 的每一个维度 (1 到 ndim-1) + // 3. 逐维处理 (Separable JFA) + // 遍历每一个空间维度 (从 1 到 ndim-1) for (int d = 1; d < ndim; ++d) { bool is_final_pass = (d == ndim - 1); - // ----------------------------------------------------------- - // Step A: Permute & Contiguous - // 将当前处理维度 d 移到最后: (0, 1, ..., d, ..., N-1) -> (0, 1, ..., N-1, d) - // 这样最后内存布局就是 [..., L],stride=1 - // ----------------------------------------------------------- - - // 这种 swap 策略比较简单: transpose(d, -1) - // 注意:index tensor 也要变换,但 index tensor 最后一维是 coord_dim,不能乱动。 - // Index tensor 形状是 [..., sample_ndim]。 - // 我们需要变换的是前面的空间维度 [...]。 - + // --- Step A: Transpose current dim to last --- + // 变换后 Shape: (..., L) auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); - // 此时 dist_in shape: [..., L] - // idx_in shape: [..., L, sample_ndim] - int64_t L = dist_in.size(-1); // 当前维度的长度 - int64_t total_slices = dist_in.numel() / L; // 有多少行 + int64_t total_slices = dist_in.numel() / L; auto dist_out = torch::empty_like(dist_in); auto idx_out = 
torch::empty_like(idx_in); - // ----------------------------------------------------------- - // Step B: Kernel Dispatch - // ----------------------------------------------------------- + // --- Step B: Kernel Dispatch --- int threads = std::min((int64_t)MAX_THREADS, L); - // 检查 Shared Memory 需求 - // Need: float(4) + int(4) + int(4) = 12 bytes per pixel + // 检查是否可以使用 Shared Memory if (L <= SMEM_LIMIT_ELEMENTS) { size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); - // 模板参数 NDim 需要是编译期常量。 - // 动态分发 sample_ndim (1D, 2D, 3D usually) - // 使用 switch case 覆盖常见维度 (1, 2, 3) + // 使用 Switch 宏来处理常用的维度模板特化 #define DISPATCH_SHARED(IS_FINAL) \ switch(sample_ndim) { \ case 1: edt_kernel_shared<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel()); break; \ + L, dist_in.numel(), sample_ndim); break; \ case 2: edt_kernel_shared<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel()); break; \ + L, dist_in.numel(), sample_ndim); break; \ case 3: edt_kernel_shared<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel()); break; \ - default: /* fallback for >3D */ break; \ + L, dist_in.numel(), sample_ndim); break; \ + case 4: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + case 5: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + case 6: edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + default: /* Fallback for > 6D */ \ + edt_kernel_shared<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ } if 
(is_final_pass) { DISPATCH_SHARED(true); } else { DISPATCH_SHARED(false); } } else { - // Fallback: Global Memory - // 需要分配 buffer: [total_slices * L] = [numel] + // Global Memory Fallback (L > 4096) if (global_buf1.numel() < dist_in.numel()) { global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); @@ -403,33 +420,48 @@ std::tuple distance_transform_cuda(torch::Tensor i dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel()); break; \ + L, dist_in.numel(), sample_ndim); break; \ case 2: edt_kernel_global<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel()); break; \ + L, dist_in.numel(), sample_ndim); break; \ case 3: edt_kernel_global<<>>( \ dist_in.data_ptr(), idx_in.data_ptr(), \ dist_out.data_ptr(), idx_out.data_ptr(), \ global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel()); break; \ - default: break; \ + L, dist_in.numel(), sample_ndim); break; \ + case 4: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + case 5: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + case 6: edt_kernel_global<<>>( \ + dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ + default: /* Fallback */ \ + edt_kernel_global<<>>( \ + 
dist_in.data_ptr(), idx_in.data_ptr(), \ + dist_out.data_ptr(), idx_out.data_ptr(), \ + global_buf1.data_ptr(), global_buf2.data_ptr(), \ + L, dist_in.numel(), sample_ndim); break; \ } - if (is_final_pass) { DISPATCH_GLOBAL(true); } + if (is_final_pass) { DISPATCH_GLOBAL(true); } else { DISPATCH_GLOBAL(false); } } - // ----------------------------------------------------------- - // Step C: Transpose Back - // ----------------------------------------------------------- - current_dist = dist_out.transpose(d, ndim - 1); + // --- Step C: Transpose Back --- + current_dist = dist_out.transpose(d, ndim - 1); current_idx = idx_out.transpose(d, ndim - 1); } - if (had_no_batch_dim) { - return std::make_tuple(current_dist.squeeze(0), current_idx.squeeze(0)); - } return std::make_tuple(current_dist, current_idx); } \ No newline at end of file From 44930399b03615df52ec95713890e5bdbbffb3ef Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 08:12:40 +0800 Subject: [PATCH 21/56] forbid non-ascii --- .pre-commit-config.yaml | 12 +++++++++ scripts/check_ascii.py | 58 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 scripts/check_ascii.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6df40f7..a79f4db 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,3 +11,15 @@ repos: rev: 6.1.0 hooks: - id: flake8 + + # ------------------------- + # ⭐ Local Hook: forbid non-ASCII in C/C++/CUDA + # ------------------------- + - repo: local + hooks: + - id: forbid-non-ascii + name: "Forbid non-ASCII characters in C/C++/CUDA" + entry: python3 scripts/check_ascii.py + language: system + types: [file] + files: '\.(c|cc|cpp|cxx|cu|cuh|h|hpp)$' diff --git a/scripts/check_ascii.py b/scripts/check_ascii.py new file mode 100644 index 0000000..eb1cf91 --- /dev/null +++ b/scripts/check_ascii.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +import sys +from pathlib import Path + +TARGET_SUFFIXES = 
{".c", ".cc", ".cpp", ".cxx", ".cu", ".cuh", ".h", ".hpp"} + +def find_non_ascii(line): + """Return list of (index, char) for all non-ASCII chars in a line.""" + result = [] + for i, ch in enumerate(line): + if ord(ch) > 127: + result.append((i, ch)) + return result + + +def check_file(path: Path) -> bool: + ok = True + with path.open("r", encoding="utf-8", errors="ignore") as f: + for lineno, line in enumerate(f, start=1): + non_ascii = find_non_ascii(line) + if non_ascii: + ok = False + print(f"\n❌ {path}:{lineno}: non-ASCII characters detected") + + # Print the full line + print(f" Line content:") + print(f" {line.rstrip()}") + + # Underline the exact non-ASCII characters + underline = [" " for _ in line.rstrip("\n")] + for idx, ch in non_ascii: + if idx < len(underline): + underline[idx] = "^" + print(f" {' '.join(underline)}") + + # Print what characters exactly + chars = ", ".join(f"'{ch}' (U+{ord(ch):04X})" for _, ch in non_ascii) + print(f" Offending chars: {chars}") + + return ok + + +def main(files): + ok = True + for f in files: + p = Path(f) + if p.suffix.lower() in TARGET_SUFFIXES and p.exists(): + if not check_file(p): + ok = False + sys.exit(0 if ok else 1) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: check_ascii.py ") + sys.exit(1) + main(sys.argv[1:]) + From 81e43df98f9efff001b6b5c7d24655ff4a59f3aa Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 08:23:01 +0800 Subject: [PATCH 22/56] workflow: precommit --- .github/workflows/pre-commit.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/pre-commit.yml diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..ca7885d --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,32 @@ +name: pre-commit + +on: + push: + branches: [ "*" ] + pull_request: + branches: [ "*" ] + +jobs: + pre-commit: + runs-on: ubuntu-latest + + 
steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install pre-commit + run: | + pip install pre-commit + + - name: Install pre-commit hooks + run: | + pre-commit install --install-hooks + + - name: Run pre-commit on all files + run: | + pre-commit run --from-ref origin/main --to-ref HEAD From 95f8f34a9884d8c3ccebf120b8f41f89dd544dba Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 08:25:21 +0800 Subject: [PATCH 23/56] workflow: run on all files --- .github/workflows/pre-commit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index ca7885d..2c1bd6c 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -29,4 +29,4 @@ jobs: - name: Run pre-commit on all files run: | - pre-commit run --from-ref origin/main --to-ref HEAD + pre-commit run --all-files From 83410854d0bc49658b45221bd75554450eb58d1e Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 09:17:51 +0800 Subject: [PATCH 24/56] prevent duplicated ci --- .github/workflows/pre-commit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 2c1bd6c..bfebf61 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -2,7 +2,7 @@ name: pre-commit on: push: - branches: [ "*" ] + branches: [ "main" ] pull_request: branches: [ "*" ] From fd4cd5e191327dc3a9c871b756d8c450e7bd35de Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 22:10:13 +0800 Subject: [PATCH 25/56] test workflow --- .github/workflows/test.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file 
mode 100644 index 0000000..f602494 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,24 @@ +name: test + +on: + pull_request: + branches: ["*"] + push: + branches: [main] + +permissions: + contents: read + pull-requests: read + +test: + runs-on: self-hosted + steps: + - uses: actions/checkout@v4 + - run: | + pip install -r requirements-dev.txt --break-system-packages --user + pip uninstall torchmorph --yes + python setup.py install --user + - run: | + ORIGINAL=$(pwd) + cd /tmp + pytest $ORIGINAL/test \ No newline at end of file From 1b520c25c738e56a37068572e4f58d67759dea10 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 22:12:46 +0800 Subject: [PATCH 26/56] test workflow --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f602494..d8e1647 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,7 @@ test: - uses: actions/checkout@v4 - run: | pip install -r requirements-dev.txt --break-system-packages --user - pip uninstall torchmorph --yes + pip uninstall torchmorph --yes --break-system-packages python setup.py install --user - run: | ORIGINAL=$(pwd) From 157bfac630449175a87778209487d5b8aa763862 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 22:13:55 +0800 Subject: [PATCH 27/56] test workflow --- .github/workflows/test.yml | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d8e1647..bc4ddf6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,19 +6,17 @@ on: push: branches: [main] -permissions: - contents: read - pull-requests: read +jobs: -test: - runs-on: self-hosted - steps: - - uses: actions/checkout@v4 - - run: | - pip install -r requirements-dev.txt --break-system-packages --user - pip uninstall torchmorph --yes 
--break-system-packages - python setup.py install --user - - run: | - ORIGINAL=$(pwd) - cd /tmp - pytest $ORIGINAL/test \ No newline at end of file + test: + runs-on: self-hosted + steps: + - uses: actions/checkout@v4 + - run: | + pip install -r requirements-dev.txt --break-system-packages --user + pip uninstall torchmorph --yes --break-system-packages + python setup.py install --user + - run: | + ORIGINAL=$(pwd) + cd /tmp + pytest $ORIGINAL/test \ No newline at end of file From 659999175ea83da5c74359526fabfcbc833ab51a Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 22:22:20 +0800 Subject: [PATCH 28/56] CUDA_HOME --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bc4ddf6..5ef06c7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,4 +19,4 @@ jobs: - run: | ORIGINAL=$(pwd) cd /tmp - pytest $ORIGINAL/test \ No newline at end of file + CUDA_HOME=/usr/local/cuda pytest $ORIGINAL/test \ No newline at end of file From d417ba9339c7a9e26d634373bd54e6106c25485b Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 22:24:28 +0800 Subject: [PATCH 29/56] CUDA_HOME --- .github/workflows/test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5ef06c7..0f7cb49 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,10 +13,11 @@ jobs: steps: - uses: actions/checkout@v4 - run: | + CUDA_HOME=/usr/local/cuda pip install -r requirements-dev.txt --break-system-packages --user pip uninstall torchmorph --yes --break-system-packages python setup.py install --user - run: | ORIGINAL=$(pwd) cd /tmp - CUDA_HOME=/usr/local/cuda pytest $ORIGINAL/test \ No newline at end of file + pytest $ORIGINAL/test \ No newline at end of file From c93176c8adaccf7f3a5be9fa00be6a3feec4f917 Mon Sep 17 00:00:00 2001 
From: Kai ZHAO <694691@qq.com> Date: Tue, 9 Dec 2025 23:27:11 +0800 Subject: [PATCH 30/56] CUDA_HOME --- .github/workflows/test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0f7cb49..ca3e9b3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,11 @@ jobs: steps: - uses: actions/checkout@v4 - run: | - CUDA_HOME=/usr/local/cuda + # Make CUDA visible to this shell and all child processes + export CUDA_HOME=/usr/local/cuda + export PATH="$CUDA_HOME/bin:$PATH" + export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}" + echo "CUDA_HOME=$CUDA_HOME" pip install -r requirements-dev.txt --break-system-packages --user pip uninstall torchmorph --yes --break-system-packages python setup.py install --user From 384f6413a152106b93d02168465b6cb005545f6a Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Wed, 10 Dec 2025 12:04:53 +0800 Subject: [PATCH 31/56] check ascii for .py --- scripts/check_ascii.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/check_ascii.py b/scripts/check_ascii.py index eb1cf91..75d9151 100644 --- a/scripts/check_ascii.py +++ b/scripts/check_ascii.py @@ -2,7 +2,7 @@ import sys from pathlib import Path -TARGET_SUFFIXES = {".c", ".cc", ".cpp", ".cxx", ".cu", ".cuh", ".h", ".hpp"} +TARGET_SUFFIXES = {".c", ".cc", ".cpp", ".cxx", ".cu", ".cuh", ".h", ".hpp", ".py"} def find_non_ascii(line): """Return list of (index, char) for all non-ASCII chars in a line.""" From da7cadd6da2409f2896d0d8081402f32b9ee0ada Mon Sep 17 00:00:00 2001 From: dongliangnie Date: Wed, 10 Dec 2025 21:41:40 +0800 Subject: [PATCH 32/56] =?UTF-8?q?=E4=BF=AE=E6=94=B9distance=5Ftransform=5F?= =?UTF-8?q?kernel.cu=E6=B3=A8=E9=87=8A=E4=B8=BA=E8=8B=B1=E6=96=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torchmorph/csrc/distance_transform_kernel.cu | 158 ++++++++++--------- 1 file 
changed, 80 insertions(+), 78 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 6e9c4c1..41defa6 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -6,13 +6,13 @@ #include // ------------------------------------------------------------------ -// 配置常量 +// Configuration Constants // ------------------------------------------------------------------ #define INF_VAL 1e8f #define MAX_THREADS 1024 -// Shared Memory 限制: 48KB 一般安全。 -// 每个像素需要: float(val) + int(idx1) + int(idx2) = 12 bytes -// 4096 * 12 = 48KB. +// Shared memory limit: typically 48 KB. +// Each pixel requires: float(value) + int(idx1) + int(idx2) = 12 bytes. +// 4096 * 12 = 48 KB. #define SMEM_LIMIT_ELEMENTS 4096 // ------------------------------------------------------------------ @@ -21,7 +21,7 @@ __device__ __forceinline__ float sqr(float x) { return x * x; } -// 计算 JFA 代价: (q - p)^2 + weight[p] +// Compute the JFA cost: (q - p)^2 + weight[p] __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { if (p < 0) return INF_VAL; return sqr((float)q - (float)p) + val_p; @@ -30,25 +30,25 @@ __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { // ------------------------------------------------------------------ // JFA Core Logic (Device Only) // ------------------------------------------------------------------ -// 核心 JFA 逻辑,与数据位置无关 (Shared 或 Global 均通用) +// Core JFA logic, independent of data location (works with both Shared and Global memory). __device__ void run_jfa_core( int N, int tid, - const float* __restrict__ vals, // 输入权重 (只读) + const float* __restrict__ vals, // input weight (read-only) int* __restrict__ idx_curr, // Ping-Pong Buffer A int* __restrict__ idx_next // Ping-Pong Buffer B ) { - // 1. 初始化: 根据 vals 决定是否是有效源点 + // 1. Initialization: determine whether each pixel is a valid source based on vals. 
for (int i = tid; i < N; i += blockDim.x) { if (vals[i] >= INF_VAL * 0.9f) { - idx_curr[i] = -1; // 背景 + idx_curr[i] = -1; // background } else { - idx_curr[i] = i; // 物体/源点,初始索引指向自己 + idx_curr[i] = i; // For each object/source point, the initial index points to itself. } } __syncthreads(); - // 2. 迭代传播 (Step = 1, 2, 4, ... < N) + // 2. Iterative Propagation (Step = 1, 2, 4, ... < N) int* idx_in = idx_curr; int* idx_out = idx_next; @@ -57,7 +57,7 @@ __device__ void run_jfa_core( int my_best_p = idx_in[i]; float min_cost = INF_VAL; - // 检查自己当前的最优解 + // Check its current best solution if (my_best_p != -1) { min_cost = compute_cost(i, my_best_p, vals[my_best_p]); } @@ -97,7 +97,7 @@ __device__ void run_jfa_core( __syncthreads(); } - // 3. 确保最终结果在 idx_curr (如果循环结束时在 next,则拷回) + // 3. Ensure the final result is stored in idx_curr (if the loop ends with idx_next, copy it back). if (idx_in != idx_curr) { for (int i = tid; i < N; i += blockDim.x) { idx_curr[i] = idx_next[i]; @@ -109,70 +109,71 @@ __device__ void run_jfa_core( // ------------------------------------------------------------------ // Kernel 1: Shared Memory JFA (Fast Path) // ------------------------------------------------------------------ -// 模板参数 NDim: 如果 > 0,编译器会展开循环优化。 -// 参数 runtime_ndim: 如果 NDim == 0 (Default case),使用该参数作为维度。 +// Template parameter NDim: when NDim > 0, the compiler performs loop unrolling optimizations. +// Runtime parameter runtime_ndim: when NDim == 0 (default behavior), this parameter specifies the dimension. template __global__ void edt_kernel_shared( - const float* __restrict__ in_data, // 输入 Dist^2 - const int32_t* __restrict__ in_indices, // 输入 Indices - float* __restrict__ out_dist, // 输出 Dist (IsFinal ? 
sqrt : sqr) - int32_t* __restrict__ out_indices, // 输出 Indices - int64_t L, // 当前维度的长度 - int64_t total_elements, // 总像素数 - int runtime_ndim // 运行时维度 (fallback) + const float* __restrict__ in_data, // input Dist^2 + const int32_t* __restrict__ in_indices, // output Indices + float* __restrict__ out_dist, // output Dist (IsFinal ? sqrt : sqr) + int32_t* __restrict__ out_indices, // output Indices + int64_t L, // Size of the current dimension + int64_t total_elements, // Total number of elements + int runtime_ndim // Runtime dimension (used as fallback) ) { - // 确定实际维度 + // Determine the effective dimension const int D = (NDim > 0) ? NDim : runtime_ndim; - // 计算行偏移 + // Compute row offset int64_t row_idx = blockIdx.x; int64_t offset = row_idx * L; - + if (offset >= total_elements) return; - // Shared Memory 布局 + // Shared memory layout extern __shared__ char s_buffer[]; float* s_vals = (float*)s_buffer; int* s_idx1 = (int*)(s_vals + L); int* s_idx2 = (int*)(s_idx1 + L); - // 1. 加载 Dist 到 Shared Memory + // 1. Load distances into Shared Memory for (int i = threadIdx.x; i < L; i += blockDim.x) { s_vals[i] = __ldg(&in_data[offset + i]); } __syncthreads(); - // 2. 运行 JFA 核心 + // 2. Run the core JFA logic run_jfa_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); - - // 3. 写回结果 + + // 3. Write back the results for (int q = threadIdx.x; q < L; q += blockDim.x) { - int p = s_idx1[q]; // 最近点在当前行内的局部索引 (0..L-1) + int p = s_idx1[q]; // Nearest point (local index within 0..L-1) float dist_val; - // 计算新距离 + // Compute updated distance if (p != -1) { float dist_sq = sqr((float)q - (float)p) + s_vals[p]; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; } else { + // No source point found (e.g., entire row is background) dist_val = IsFinal ? 
INF_VAL : sqr(INF_VAL); - p = 0; // 防止越界,随便指一个 + p = 0; // Prevent out-of-bounds access } out_dist[offset + q] = dist_val; - // 索引传播: Copy Vector [D] + // Propagate indices: copy a vector of size [D] if (p != -1) { int64_t src_offset = (offset + p) * D; int64_t dst_offset = (offset + q) * D; - - // 如果 NDim > 0,这里会完全展开,非常快 + + // When NDim > 0, this loop is fully unrolled by the compiler for (int d = 0; d < D; ++d) { out_indices[dst_offset + d] = in_indices[src_offset + d]; } } else { - // 找不到源点(全图都是背景的情况) - int64_t dst_offset = (offset + q) * D; - for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; + // Fallback: no source available + int64_t dst_offset = (offset + q) * D; + for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; } } } @@ -180,7 +181,7 @@ __global__ void edt_kernel_shared( // ------------------------------------------------------------------ // Kernel 2: Global Memory JFA (Fallback Path) // ------------------------------------------------------------------ -// 逻辑同上,只是用 Global Memory 做 Ping-Pong Buffer +// Same logic as above, but uses Global Memory as the ping-pong buffer template __global__ void edt_kernel_global( const float* __restrict__ in_data, @@ -200,14 +201,14 @@ __global__ void edt_kernel_global( if (offset >= total_elements) return; - // 指向 Global Memory 的指针 + // Pointers to Global Memory int* g_idx1 = global_buffer_1 + offset; int* g_idx2 = global_buffer_2 + offset; - // 1. & 2. 运行 JFA (直接在 Global Mem 上读写) + // 1. & 2. Run the JFA core (operating directly on Global Memory) run_jfa_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); - // 3. 写回结果 + // 3. 
Write back results for (int q = threadIdx.x; q < L; q += blockDim.x) { int p = g_idx1[q]; float dist_val; @@ -236,10 +237,11 @@ __global__ void edt_kernel_global( } } + // ------------------------------------------------------------------ // Kernel 3: Initialize Indices // ------------------------------------------------------------------ -// 初始化索引张量为网格坐标 +// Initialize index tensor as grid coordinates // indices shape: (..., D) __global__ void init_indices_kernel( int32_t* indices, @@ -250,24 +252,24 @@ __global__ void init_indices_kernel( int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= total_pixels) return; - // 反解坐标 (Unravel Index) - // idx 是每个像素的 flat index - // 我们需要计算它在 spatial_shape 中的坐标 + // Unravel Index + // idx is the flat index of each pixel + // We need to compute its coordinate in spatial_shape int64_t temp = idx; - // 使用本地寄存器数组避免多次全局内存读取 (假设最大 10 维) + // Use local register array to avoid repeated global memory reads (assume max 10 dims) int32_t coords[10]; - // 假设 spatial_shape 是 [D0, D1, D2] - // 倒序计算除余 + // Example: spatial_shape = [D0, D1, D2] + // compute by modulo from last dimension for (int d = NDim - 1; d >= 0; --d) { int64_t dim_size = shape_ptr[d]; coords[d] = temp % dim_size; temp /= dim_size; } - // 写入 Global Memory - // Indices tensor 是 (TotalPixels, NDim) 扁平化的 + // Write to Global Memory + // Indices tensor is flattened as (TotalPixels, NDim) int64_t out_ptr = idx * NDim; for (int d = 0; d < NDim; ++d) { indices[out_ptr + d] = coords[d]; @@ -284,24 +286,23 @@ std::tuple distance_transform_cuda(torch::Tensor i input = input.contiguous(); - // 处理 Batch 维度:如果输入是 1D (L),视为无 Batch,但在处理中统一加一个 Batch 维方便 - // 标准约定:Input shape (Batch, D1, D2, ..., Dn) - // 算法对 Batch 维度和其他维度处理其实是一样的(视为无关维度) - // 但索引初始化需要知道哪些是 "Spatial Dimensions"。 - // 这里假设:输入的所有维度除了 Batch (Dim 0) 外都是空间维度。 + // Handle batch dimension: if input is 1D (L), treat as no batch but internally add a batch dimension. 
+ // Convention: input shape is (Batch, D1, D2, ..., Dn) + // Algorithm treats batch and other dims identically (batch is just another leading dimension) + // But index initialization needs to know which are "spatial dimensions". + // Assumption: all dims except dim 0 (Batch) are spatial. const int ndim = input.dim(); - // 如果 ndim=1, 假设是 (L),sample_ndim=1 - // 如果 ndim=4 (B, C, H, W),sample_ndim=3 (C,H,W 都算空间? 通常 C 也是独立处理的) - // **修正**: 标准 EDT 通常是在 (H, W) 或 (D, H, W) 上进行的。 - // 如果有 Channel,通常 Channel 也是独立的。 - // 为了最通用,我们将 **除了第0维(Batch)** 以外的所有维度都视为空间维度进行索引记录。 - // 如果用户输入没有 Batch 维,请在 Python 端 unsqueeze(0)。 + // If ndim=1, assume (L) → sample_ndim=1 + // If ndim=4 (B, C, H, W), sample_ndim=3 (C,H,W treated as spatial? Channels often processed independently) + // Correction: classical EDT usually runs on (H,W) or (D,H,W). + // If channels exist, typically each channel is processed independently. + // For maximum generality, we treat **all dims except dim 0** as spatial dims. + // If input has no batch dim, user should use unsqueeze(0) in Python. - // 假设输入已经是 (Batch, ...Spatial...) const int sample_ndim = ndim - 1; TORCH_CHECK(sample_ndim > 0, "Input tensor must have at least 2 dimensions (Batch, ...)"); - + auto shape = input.sizes().vec(); int64_t num_pixels = input.numel(); @@ -312,23 +313,23 @@ std::tuple distance_transform_cuda(torch::Tensor i torch::empty(index_shape, input.options().dtype(torch::kInt32))); } - // 1. 初始化 Distance Tensor + // 1. Initialize Distance Tensor // 0 -> 0, 1 -> INF auto current_dist = torch::where(input == 0, torch::tensor(0.0f, input.options()), torch::tensor(INF_VAL, input.options())); - // 2. 初始化 Index Tensor + // 2. 
Initialize Index Tensor // Shape: (Batch, D1, ..., Dn, sample_ndim) auto index_shape = shape; index_shape.push_back(sample_ndim); auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - // 2.1 准备 Shape 数据传给 Kernel + // 2.1 Prepare shape tensor for kernel std::vector spatial_shape(shape.begin() + 1, shape.end()); auto shape_tensor = torch::tensor(spatial_shape, torch::kInt64).to(input.device()); - // 2.2 运行初始化 Kernel + // 2.2 Launch initialization kernel { int threads = 256; int blocks = (num_pixels + threads - 1) / threads; @@ -344,20 +345,20 @@ std::tuple distance_transform_cuda(torch::Tensor i } } - // 预分配 Global Memory Buffer (懒加载) + // Pre-allocate Global Memory Buffers (lazy) torch::Tensor global_buf1, global_buf2; - // 3. 逐维处理 (Separable JFA) - // 遍历每一个空间维度 (从 1 到 ndim-1) + // 3. Process each spatial dimension (Separable JFA) + // Iterate through each spatial dimension (1 to ndim-1) for (int d = 1; d < ndim; ++d) { bool is_final_pass = (d == ndim - 1); // --- Step A: Transpose current dim to last --- - // 变换后 Shape: (..., L) + // Resulting shape: (..., L) auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); - int64_t L = dist_in.size(-1); // 当前维度的长度 + int64_t L = dist_in.size(-1); int64_t total_slices = dist_in.numel() / L; auto dist_out = torch::empty_like(dist_in); @@ -366,11 +367,11 @@ std::tuple distance_transform_cuda(torch::Tensor i // --- Step B: Kernel Dispatch --- int threads = std::min((int64_t)MAX_THREADS, L); - // 检查是否可以使用 Shared Memory + // Check whether Shared Memory can be used if (L <= SMEM_LIMIT_ELEMENTS) { size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); - // 使用 Switch 宏来处理常用的维度模板特化 + // Switch macro to handle template dimension specialization #define DISPATCH_SHARED(IS_FINAL) \ switch(sample_ndim) { \ case 1: edt_kernel_shared<<>>( \ @@ -408,7 +409,7 @@ std::tuple distance_transform_cuda(torch::Tensor i else { 
DISPATCH_SHARED(false); } } else { - // Global Memory Fallback (L > 4096) + // Global Memory fallback (L > 4096) if (global_buf1.numel() < dist_in.numel()) { global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); @@ -464,4 +465,5 @@ std::tuple distance_transform_cuda(torch::Tensor i } return std::make_tuple(current_dist, current_idx); -} \ No newline at end of file +} + From d622ccddacb338ddea3cd10384450857db99db63 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Wed, 10 Dec 2025 23:43:03 +0800 Subject: [PATCH 33/56] flake8 --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 5fc408e..6ef9e30 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,6 @@ [flake8] max-line-length = 100 -extend-ignore = E203, W503 # Compatibility with Black +extend-ignore = E203, W503 exclude = __pycache__, build, From ee9e779a5606dde6290712a2cd55a29ce293f517 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Wed, 10 Dec 2025 23:45:05 +0800 Subject: [PATCH 34/56] -> --- torchmorph/csrc/distance_transform_kernel.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 41defa6..503d13c 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -293,7 +293,7 @@ std::tuple distance_transform_cuda(torch::Tensor i // Assumption: all dims except dim 0 (Batch) are spatial. const int ndim = input.dim(); - // If ndim=1, assume (L) → sample_ndim=1 + // If ndim=1, assume (L) -> sample_ndim=1 // If ndim=4 (B, C, H, W), sample_ndim=3 (C,H,W treated as spatial? Channels often processed independently) // Correction: classical EDT usually runs on (H,W) or (D,H,W). 
// If channels exist, typically each channel is processed independently. From c42bb3eb119839f1215a157633648d344b8b413a Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Wed, 10 Dec 2025 23:50:39 +0800 Subject: [PATCH 35/56] check ascii for py --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a79f4db..86088b6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,4 +22,4 @@ repos: entry: python3 scripts/check_ascii.py language: system types: [file] - files: '\.(c|cc|cpp|cxx|cu|cuh|h|hpp)$' + files: '\.(c|cc|cpp|cxx|cu|cuh|h|hpp|py)$' From b8fbc8e4a177af96d34fdae14283f1a8ae36ef0c Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Wed, 10 Dec 2025 23:54:25 +0800 Subject: [PATCH 36/56] check non-latin languages --- scripts/check_ascii.py | 77 +++++++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/scripts/check_ascii.py b/scripts/check_ascii.py index 75d9151..bc1dcdd 100644 --- a/scripts/check_ascii.py +++ b/scripts/check_ascii.py @@ -1,40 +1,94 @@ #!/usr/bin/env python3 import sys +import unicodedata from pathlib import Path TARGET_SUFFIXES = {".c", ".cc", ".cpp", ".cxx", ".cu", ".cuh", ".h", ".hpp", ".py"} -def find_non_ascii(line): - """Return list of (index, char) for all non-ASCII chars in a line.""" + +# --- Helpers -------------------------------------------------------- + + +# Latin ranges we still consider "English-ish" and therefore allowed. +# (You can shrink this if you want to ban accented letters too.) 
+LATIN_RANGES = [ + (0x0000, 0x007F), # Basic Latin (ASCII) + (0x00C0, 0x024F), # Latin-1 Supplement + Latin Extended-A/B + (0x1E00, 0x1EFF), # Latin Extended Additional +] + + +def in_ranges(ch: str, ranges) -> bool: + cp = ord(ch) + for start, end in ranges: + if start <= cp <= end: + return True + return False + + +def is_forbidden_char(ch: str) -> bool: + """ + Return True if ch should be *forbidden*. + + Policy: + - ASCII (<= 0x7F): always OK + - Non-ASCII letters (Unicode category starting with 'L') + that are NOT in Latin ranges: forbidden + - Everything else (emoji, arrows, symbols, etc.): allowed + """ + cp = ord(ch) + if cp <= 0x7F: + return False # pure ASCII + + cat = unicodedata.category(ch) + + # Forbid letters that are not Latin. + if cat.startswith("L"): # Letter + if in_ranges(ch, LATIN_RANGES): + return False # Latin letters allowed + return True # Non-Latin letters forbidden + + # All non-letter stuff (emoji, arrows, symbols, punctuation) is allowed. + return False + + +def find_forbidden_chars(line: str): + """Return list of (index, char) for all forbidden chars in a line.""" result = [] for i, ch in enumerate(line): - if ord(ch) > 127: + if is_forbidden_char(ch): result.append((i, ch)) return result +# --- Core logic ----------------------------------------------------- + + def check_file(path: Path) -> bool: ok = True with path.open("r", encoding="utf-8", errors="ignore") as f: for lineno, line in enumerate(f, start=1): - non_ascii = find_non_ascii(line) - if non_ascii: + forbidden = find_forbidden_chars(line) + if forbidden: ok = False - print(f"\n❌ {path}:{lineno}: non-ASCII characters detected") + print(f"\n❌ {path}:{lineno}: non-English letters detected") # Print the full line - print(f" Line content:") + print(" Line content:") print(f" {line.rstrip()}") - # Underline the exact non-ASCII characters + # Underline the forbidden characters underline = [" " for _ in line.rstrip("\n")] - for idx, ch in non_ascii: + for idx, ch in forbidden: if 
idx < len(underline): underline[idx] = "^" - print(f" {' '.join(underline)}") + print(f" {''.join(underline)}") # Print what characters exactly - chars = ", ".join(f"'{ch}' (U+{ord(ch):04X})" for _, ch in non_ascii) + chars = ", ".join( + f"'{ch}' (U+{ord(ch):04X}) [{unicodedata.name(ch, 'UNKNOWN')}]" + for _, ch in forbidden + ) print(f" Offending chars: {chars}") return ok @@ -55,4 +109,3 @@ def main(files): print("Usage: check_ascii.py ") sys.exit(1) main(sys.argv[1:]) - From 4ed06101a9e0edc33068a16e8dcfc3ea1be58dac Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Wed, 10 Dec 2025 23:58:19 +0800 Subject: [PATCH 37/56] reformat --- test/test_distance_transform.py | 178 +++++++++++++++++++++----------- 1 file changed, 117 insertions(+), 61 deletions(-) diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 6e6265d..56acb37 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -1,60 +1,100 @@ -import torch -import pytest import numpy as np +import pytest +import torch from scipy.ndimage import distance_transform_edt as scipy_edt -import torchmorph as tm + +import torchmorph as tm + # ====================================================================== -# 辅助函数 +# Helper functions # ====================================================================== -def batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, np.ndarray]: - dist_results, indices_results = [], [] - - # 保证 batch_numpy 至少是 (Batch, ...) - # 如果进来的是 (H, W),我们在外面已经处理成 (1, H, W) 了 +def batch_scipy_edt_with_indices( + batch_numpy: np.ndarray, +) -> tuple[np.ndarray, np.ndarray]: + """Compute SciPy EDT and indices for a batch of arrays.""" + dist_results: list[np.ndarray] = [] + indices_results: list[np.ndarray] = [] + + # Ensure batch_numpy has at least shape (Batch, ...) + # If the input is (H, W), it is already converted to (1, H, W) outside. if batch_numpy.ndim == 1: batch_numpy = batch_numpy[np.newaxis, ...] 
for sample in batch_numpy: - dist, indices = scipy_edt(sample, return_indices=True, return_distances=True) + dist, indices = scipy_edt( + sample, + return_indices=True, + return_distances=True, + ) dist_results.append(dist) indices_results.append(indices) - + output_dist = np.stack(dist_results, axis=0) output_indices = np.stack(indices_results, axis=0) - output_indices = np.moveaxis(output_indices, 1, -1) - + output_indices = np.moveaxis(output_indices, 1, -1) + return output_dist, output_indices + # ====================================================================== -# 测试数据 +# Test data # ====================================================================== -case_batch_1d = np.array([[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], dtype=np.float32) +case_batch_1d = np.array( + [[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], + dtype=np.float32, +) -case_batch_2d = np.array([[[0., 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], - [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]]], dtype=np.float32) +case_batch_2d = np.array( + [ + [[0.0, 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], + [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], + ], + dtype=np.float32, +) -# 这里定义为 (4, 4),意图是单张 2D 图 -case_single_2d = np.array([[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0]], dtype=np.float32) +# This is a single 2D image with shape (4, 4) +case_single_2d = np.array( + [ + [0, 1, 0, 1], + [1, 0, 1, 0], + [0, 1, 0, 1], + [1, 0, 1, 0], + ], + dtype=np.float32, +) case_explicit_batch_one = case_single_2d[np.newaxis, ...] 
-_case_3d_s1 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s1[1, 1, 1] = 0.0; _case_3d_s1[2, 3, 4] = 0.0 -_case_3d_s2 = np.ones((4, 5, 6), dtype=np.float32); _case_3d_s2[0, 0, 0] = 0.0 +_case_3d_s1 = np.ones((4, 5, 6), dtype=np.float32) +_case_3d_s1[1, 1, 1] = 0.0 +_case_3d_s1[2, 3, 4] = 0.0 + +_case_3d_s2 = np.ones((4, 5, 6), dtype=np.float32) +_case_3d_s2[0, 0, 0] = 0.0 + case_batch_3d = np.stack([_case_3d_s1, _case_3d_s2], axis=0) -case_dim_one = np.ones((2, 5, 1), dtype=np.float32); case_dim_one[0, 2, 0] = 0.0; case_dim_one[1, 4, 0] = 0.0 +case_dim_one = np.ones((2, 5, 1), dtype=np.float32) +case_dim_one[0, 2, 0] = 0.0 +case_dim_one[1, 4, 0] = 0.0 + +# 4D spatial case +_case_4d_s1 = np.ones((3, 3, 3, 3), dtype=np.float32) +_case_4d_s1[0, 0, 0, 0] = 0.0 + +_case_4d_s2 = np.ones((3, 3, 3, 3), dtype=np.float32) +_case_4d_s2[1, 1, 1, 1] = 0.0 -# 4D Case -_case_4d_s1 = np.ones((3, 3, 3, 3), dtype=np.float32); _case_4d_s1[0, 0, 0, 0] = 0.0 -_case_4d_s2 = np.ones((3, 3, 3, 3), dtype=np.float32); _case_4d_s2[1, 1, 1, 1] = 0.0 case_batch_4d_spatial = np.stack([_case_4d_s1, _case_4d_s2], axis=0) -# 5D Case +# 5D spatial case case_batch_5d_spatial = np.ones((1, 2, 2, 2, 2, 2), dtype=np.float32) -case_batch_5d_spatial[0, 0, 0, 0, 0, 0] = 0.0; case_batch_5d_spatial[0, 1, 1, 1, 1, 1] = 0.0 +case_batch_5d_spatial[0, 0, 0, 0, 0, 0] = 0.0 +case_batch_5d_spatial[0, 1, 1, 1, 1, 1] = 0.0 + # ====================================================================== -# 测试逻辑 +# Test logic # ====================================================================== @pytest.mark.parametrize( "input_numpy, has_batch_dim", @@ -62,66 +102,82 @@ def batch_scipy_edt_with_indices(batch_numpy: np.ndarray) -> tuple[np.ndarray, n pytest.param(case_batch_1d, True, id="1D_Batch"), pytest.param(case_batch_2d, True, id="2D_Batch"), pytest.param(case_single_2d, False, id="2D_Single_NoBatch"), - pytest.param(case_explicit_batch_one, True, id="2D_Single_ExplicitBatch"), + pytest.param( + 
case_explicit_batch_one, + True, + id="2D_Single_ExplicitBatch", + ), pytest.param(case_batch_3d, True, id="3D_Batch"), pytest.param(case_dim_one, True, id="2D_UnitDim_Batch"), pytest.param(case_batch_4d_spatial, True, id="4D_Spatial_Batch"), pytest.param(case_batch_5d_spatial, True, id="5D_Spatial_Batch"), ], ) -def test_distance_transform_and_indices(input_numpy: np.ndarray, has_batch_dim: bool, request: pytest.FixtureRequest): +def test_distance_transform_and_indices( + input_numpy: np.ndarray, + has_batch_dim: bool, + request: pytest.FixtureRequest, +) -> None: if not torch.cuda.is_available(): pytest.skip("CUDA not available") - - # 1. 准备 Numpy 数据 + + # 1. Prepare NumPy data x_numpy_contiguous = np.ascontiguousarray(input_numpy) - - # 2. 准备 SciPy 输入 - # 如果意图是单样本 (has_batch_dim=False),我们手动增加 Batch 维, - # 这样 scipy 辅助函数就会把它当做一张图来处理,而不是 N 张 1D 图 + + # 2. Prepare SciPy input. + # If this is a single sample (has_batch_dim=False), manually add a + # batch dimension so SciPy treats it as one image instead of N 1D + # signals. if not has_batch_dim: scipy_input = x_numpy_contiguous[np.newaxis, ...] else: scipy_input = x_numpy_contiguous - # 3. 准备 CUDA 输入 - # 关键修复: - # 如果 has_batch_dim=False,说明这是单张 (H, W),我们要测 2D EDT。 - # C++ API 默认第一维是 Batch,所以我们必须 unsqueeze(0) 变成 (1, H, W)。 - # 否则 C++ 会把它当做 (Batch=H, Len=W) 做 1D EDT。 + # 3. Prepare CUDA input. + # If has_batch_dim=False, the input is (H, W) and we want 2D EDT. + # The C++ API assumes the first dimension is batch, so we must + # unsqueeze(0) to get shape (1, H, W). Otherwise, it will be + # interpreted as (Batch=H, Length=W) and run 1D EDT. x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() if not has_batch_dim: x_cuda = x_cuda.unsqueeze(0) - print(f"\n\n--- 运行测试: {request.node.callspec.id} ---") - print(f"CUDA 输入形状: {x_cuda.shape}") + print(f"\n\n--- Running test: {request.node.callspec.id} ---") + print(f"CUDA input shape: {x_cuda.shape}") - # 4. 运行 CUDA EDT + # 4. 
Run CUDA EDT dist_cuda, idx_cuda = tm.distance_transform(x_cuda.clone()) - # 5. 运行 SciPy (Ground Truth) + # 5. Run SciPy (ground truth) dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(scipy_input) dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() - # 6. 验证距离 - # 此时 dist_cuda 是 (1, H, W),dist_ref 也是 (1, H, W) - # 如果原意是 NoBatch,我们可以把 Batch 维 squeeze 掉再比,或者直接比 - print(f"CUDA Out Shape: {dist_cuda.shape}, Ref Shape: {dist_ref.shape}") - assert dist_cuda.shape == dist_ref.shape, f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" + # 6. Validate distances + print( + f"CUDA distance shape: {dist_cuda.shape}, " + f"reference shape: {dist_ref.shape}", + ) + assert dist_cuda.shape == dist_ref.shape, ( + f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" + ) torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) - print(">> 距离验证通过。") + print(">> Distance validation passed.") - # 7. 验证索引 - # idx_cuda: (1, H, W, 2) - # 构造 Grid - spatial_shape = x_cuda.shape[1:] # (H, W) - coords = [torch.arange(s, device='cuda') for s in spatial_shape] - grid = torch.stack(torch.meshgrid(*coords, indexing='ij'), dim=-1) # (H, W, 2) - grid = grid.unsqueeze(0) # (1, H, W, 2) + # 7. 
Validate indices + # idx_cuda: (B, H, W, D) + spatial_shape = x_cuda.shape[1:] + coords = [torch.arange(s, device="cuda") for s in spatial_shape] + grid = torch.stack(torch.meshgrid(*coords, indexing="ij"), dim=-1) + grid = grid.unsqueeze(0) # (1, H, W, D) diff = grid.float() - idx_cuda.float() dist_sq_calculated = torch.sum(diff * diff, dim=-1) dist_sq_output = dist_cuda * dist_cuda - - torch.testing.assert_close(dist_sq_calculated, dist_sq_output, atol=1e-3, rtol=1e-3) - print(">> 索引验证通过。") \ No newline at end of file + + torch.testing.assert_close( + dist_sq_calculated, + dist_sq_output, + atol=1e-3, + rtol=1e-3, + ) + print(">> Index validation passed.") From 95724d9e200fde67276cad184ff5bd381f496607 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Thu, 11 Dec 2025 00:04:11 +0800 Subject: [PATCH 38/56] reformat --- benchmark/distance_transform.py | 4 ---- test/test_distance_transform.py | 7 +++---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/benchmark/distance_transform.py b/benchmark/distance_transform.py index c659c82..c809ca3 100644 --- a/benchmark/distance_transform.py +++ b/benchmark/distance_transform.py @@ -1,9 +1,6 @@ import torch import torch.utils.benchmark as benchmark -import scipy.ndimage as ndi -import numpy as np from prettytable import PrettyTable -import torchmorph as tm sizes = [64, 128, 256, 512, 1024] batches = [1, 4, 8, 16] @@ -76,4 +73,3 @@ print(f"\n=== Batch Size: {B} ===") print(table) - diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index 56acb37..f852d36 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -1,9 +1,8 @@ -import numpy as np +import numpy as np # noqa: F401 import pytest import torch -from scipy.ndimage import distance_transform_edt as scipy_edt - -import torchmorph as tm +from scipy.ndimage import distance_transform_edt as scipy_edt # noqa: F401 +import torchmorph as tm # noqa: F401 # 
====================================================================== From 86381e40e1d863ebc21c685956953f94a9553f50 Mon Sep 17 00:00:00 2001 From: Kai ZHAO <694691@qq.com> Date: Thu, 11 Dec 2025 00:07:26 +0800 Subject: [PATCH 39/56] isort --- benchmark/distance_transform.py | 20 +++++++++++--------- scripts/check_ascii.py | 2 +- setup.py | 5 +++-- test/test_add.py | 3 ++- test/test_distance_transform.py | 10 +++++----- torchmorph/add.py | 1 + torchmorph/distance_transform.py | 1 + 7 files changed, 24 insertions(+), 18 deletions(-) diff --git a/benchmark/distance_transform.py b/benchmark/distance_transform.py index c809ca3..3737ced 100644 --- a/benchmark/distance_transform.py +++ b/benchmark/distance_transform.py @@ -27,7 +27,7 @@ # Inputs x = (torch.randn(B, 1, s, s, device=device) > 0).to(dtype) x_np_list = [x[i, 0].detach().cpu().numpy() for i in range(B)] - x_imgs = [x[i:i+1] for i in range(B)] + x_imgs = [x[i : i + 1] for i in range(B)] # SciPy (CPU, one-by-one) stmt_scipy = "out = [ndi.distance_transform_edt(arr) for arr in x_np_list]" @@ -62,14 +62,16 @@ speed1 = scipy_per_img_ms / torch1_per_img_ms speedB = scipy_per_img_ms / torchB_per_img_ms - table.add_row([ - s, - f"{scipy_per_img_ms:.3f}", - f"{torch1_per_img_ms:.3f}", - f"{torchB_per_img_ms:.3f}", - f"{speed1:.1f}×", - f"{speedB:.1f}×", - ]) + table.add_row( + [ + s, + f"{scipy_per_img_ms:.3f}", + f"{torch1_per_img_ms:.3f}", + f"{torchB_per_img_ms:.3f}", + f"{speed1:.1f}×", + f"{speedB:.1f}×", + ] + ) print(f"\n=== Batch Size: {B} ===") print(table) diff --git a/scripts/check_ascii.py b/scripts/check_ascii.py index bc1dcdd..d788056 100644 --- a/scripts/check_ascii.py +++ b/scripts/check_ascii.py @@ -46,7 +46,7 @@ def is_forbidden_char(ch: str) -> bool: if cat.startswith("L"): # Letter if in_ranges(ch, LATIN_RANGES): return False # Latin letters allowed - return True # Non-Latin letters forbidden + return True # Non-Latin letters forbidden # All non-letter stuff (emoji, arrows, symbols, punctuation) is 
allowed. return False diff --git a/setup.py b/setup.py index 9876b88..fd3f95e 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ -import os import glob -from setuptools import setup, find_packages +import os + +from setuptools import find_packages, setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension diff --git a/test/test_add.py b/test/test_add.py index a641494..5f647ff 100644 --- a/test/test_add.py +++ b/test/test_add.py @@ -1,5 +1,6 @@ -import torch import pytest +import torch + import torchmorph as tm diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py index f852d36..5855bf3 100644 --- a/test/test_distance_transform.py +++ b/test/test_distance_transform.py @@ -2,6 +2,7 @@ import pytest import torch from scipy.ndimage import distance_transform_edt as scipy_edt # noqa: F401 + import torchmorph as tm # noqa: F401 @@ -153,12 +154,11 @@ def test_distance_transform_and_indices( # 6. Validate distances print( - f"CUDA distance shape: {dist_cuda.shape}, " - f"reference shape: {dist_ref.shape}", - ) - assert dist_cuda.shape == dist_ref.shape, ( - f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" + f"CUDA distance shape: {dist_cuda.shape}, " f"reference shape: {dist_ref.shape}", ) + assert ( + dist_cuda.shape == dist_ref.shape + ), f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) print(">> Distance validation passed.") diff --git a/torchmorph/add.py b/torchmorph/add.py index 4737073..f7c16b9 100644 --- a/torchmorph/add.py +++ b/torchmorph/add.py @@ -1,4 +1,5 @@ import torch + from torchmorph import _C diff --git a/torchmorph/distance_transform.py b/torchmorph/distance_transform.py index 0184be5..868e84a 100644 --- a/torchmorph/distance_transform.py +++ b/torchmorph/distance_transform.py @@ -1,4 +1,5 @@ import torch + from torchmorph import _C From d2df725ae46ef303b300547701fbfe9d6e1f8e25 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Sun, 
14 Dec 2025 23:35:33 +0800 Subject: [PATCH 40/56] =?UTF-8?q?=E7=A7=BB=E9=99=A4switch=E7=B2=BE?= =?UTF-8?q?=E7=AE=80=E4=BB=A3=E7=A0=81=20INF=5FVAL=20=E6=94=B9=E4=B8=BA=20?= =?UTF-8?q?1e20?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torchmorph/csrc/distance_transform_kernel.cu | 327 +++++++------------ 1 file changed, 117 insertions(+), 210 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 503d13c..6c30c64 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,54 +1,52 @@ #include #include #include -#include +#include #include #include // ------------------------------------------------------------------ -// Configuration Constants +// Configuration // ------------------------------------------------------------------ -#define INF_VAL 1e8f +// Use a large enough value to avoid overflow, but safe for float addition +#define INF_VAL 1e20f #define MAX_THREADS 1024 -// Shared memory limit: typically 48 KB. -// Each pixel requires: float(value) + int(idx1) + int(idx2) = 12 bytes. -// 4096 * 12 = 48 KB. 
#define SMEM_LIMIT_ELEMENTS 4096 // ------------------------------------------------------------------ -// Device Helper Functions +// Device Helper // ------------------------------------------------------------------ __device__ __forceinline__ float sqr(float x) { return x * x; } -// Compute the JFA cost: (q - p)^2 + weight[p] __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { - if (p < 0) return INF_VAL; + // Safety check for boundaries and INF propagation + if (p < 0 || val_p >= INF_VAL) return INF_VAL; return sqr((float)q - (float)p) + val_p; } // ------------------------------------------------------------------ // JFA Core Logic (Device Only) // ------------------------------------------------------------------ -// Core JFA logic, independent of data location (works with both Shared and Global memory). __device__ void run_jfa_core( int N, int tid, - const float* __restrict__ vals, // input weight (read-only) - int* __restrict__ idx_curr, // Ping-Pong Buffer A - int* __restrict__ idx_next // Ping-Pong Buffer B + const float* __restrict__ vals, + int* __restrict__ idx_curr, + int* __restrict__ idx_next ) { - // 1. Initialization: determine whether each pixel is a valid source based on vals. + // 1. Initialization for (int i = tid; i < N; i += blockDim.x) { + // Use a relative threshold to safely detect background if (vals[i] >= INF_VAL * 0.9f) { - idx_curr[i] = -1; // background + idx_curr[i] = -1; } else { - idx_curr[i] = i; // For each object/source point, the initial index points to itself. + idx_curr[i] = i; } } __syncthreads(); - // 2. Iterative Propagation (Step = 1, 2, 4, ... < N) + // 2. Iterative Propagation (Pointer Jumping: 1 -> 2 -> 4...) 
int* idx_in = idx_curr; int* idx_out = idx_next; @@ -57,12 +55,12 @@ __device__ void run_jfa_core( int my_best_p = idx_in[i]; float min_cost = INF_VAL; - // Check its current best solution + // Check self (current best) if (my_best_p != -1) { min_cost = compute_cost(i, my_best_p, vals[my_best_p]); } - // Check Left Neighbor (-step) + // Check Left Neighbor int left = i - step; if (left >= 0) { int left_p = idx_in[left]; @@ -75,7 +73,7 @@ __device__ void run_jfa_core( } } - // Check Right Neighbor (+step) + // Check Right Neighbor int right = i + step; if (right < N) { int right_p = idx_in[right]; @@ -90,14 +88,14 @@ __device__ void run_jfa_core( idx_out[i] = my_best_p; } - // Swap Pointers + // Swap Ping-Pong buffers int* temp = idx_in; idx_in = idx_out; idx_out = temp; __syncthreads(); } - // 3. Ensure the final result is stored in idx_curr (if the loop ends with idx_next, copy it back). + // 3. Final Copy Back (if needed) if (idx_in != idx_curr) { for (int i = tid; i < N; i += blockDim.x) { idx_curr[i] = idx_next[i]; @@ -109,22 +107,19 @@ __device__ void run_jfa_core( // ------------------------------------------------------------------ // Kernel 1: Shared Memory JFA (Fast Path) // ------------------------------------------------------------------ -// Template parameter NDim: when NDim > 0, the compiler performs loop unrolling optimizations. -// Runtime parameter runtime_ndim: when NDim == 0 (default behavior), this parameter specifies the dimension. -template +// Note: We removed the template switch for NDim to reduce compile time. +// The performance impact is negligible for the copy loop. +template __global__ void edt_kernel_shared( - const float* __restrict__ in_data, // input Dist^2 - const int32_t* __restrict__ in_indices, // output Indices - float* __restrict__ out_dist, // output Dist (IsFinal ? 
sqrt : sqr) - int32_t* __restrict__ out_indices, // output Indices - int64_t L, // Size of the current dimension - int64_t total_elements, // Total number of elements - int runtime_ndim // Runtime dimension (used as fallback) + const float* __restrict__ in_data, // Contiguous Input + const int32_t* __restrict__ in_indices, // Contiguous Input + float* __restrict__ out_dist, + int32_t* __restrict__ out_indices, + int64_t L, + int64_t total_elements, + int coord_ndim ) { - // Determine the effective dimension - const int D = (NDim > 0) ? NDim : runtime_ndim; - - // Compute row offset + // 1 Block processes 1 Row (L elements) int64_t row_idx = blockIdx.x; int64_t offset = row_idx * L; @@ -136,44 +131,44 @@ __global__ void edt_kernel_shared( int* s_idx1 = (int*)(s_vals + L); int* s_idx2 = (int*)(s_idx1 + L); - // 1. Load distances into Shared Memory + // 1. Load Data (Coalesced Read due to .contiguous() input) for (int i = threadIdx.x; i < L; i += blockDim.x) { s_vals[i] = __ldg(&in_data[offset + i]); } __syncthreads(); - // 2. Run the core JFA logic + // 2. Run JFA Core run_jfa_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); - // 3. Write back the results + // 3. Write Back Results for (int q = threadIdx.x; q < L; q += blockDim.x) { - int p = s_idx1[q]; // Nearest point (local index within 0..L-1) + int p = s_idx1[q]; float dist_val; - // Compute updated distance + // Calculate final distance if (p != -1) { float dist_sq = sqr((float)q - (float)p) + s_vals[p]; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; } else { - // No source point found (e.g., entire row is background) - dist_val = IsFinal ? INF_VAL : sqr(INF_VAL); - p = 0; // Prevent out-of-bounds access + dist_val = IsFinal ? 
INF_VAL : INF_VAL; // Use large val instead of sqr(INF) + p = 0; } out_dist[offset + q] = dist_val; - // Propagate indices: copy a vector of size [D] - if (p != -1) { - int64_t src_offset = (offset + p) * D; - int64_t dst_offset = (offset + q) * D; - - // When NDim > 0, this loop is fully unrolled by the compiler - for (int d = 0; d < D; ++d) { - out_indices[dst_offset + d] = in_indices[src_offset + d]; + // Propagate Indices + // out_indices shape is (TotalElements, coord_ndim) flattened + // Using runtime loop instead of template unrolling + int64_t dst_base = (offset + q) * coord_ndim; + + if (p != -1 && s_vals[p] < INF_VAL) { + int64_t src_base = (offset + p) * coord_ndim; + for (int d = 0; d < coord_ndim; ++d) { + out_indices[dst_base + d] = in_indices[src_base + d]; } } else { - // Fallback: no source available - int64_t dst_offset = (offset + q) * D; - for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; + for (int d = 0; d < coord_ndim; ++d) { + out_indices[dst_base + d] = 0; + } } } } @@ -181,8 +176,7 @@ __global__ void edt_kernel_shared( // ------------------------------------------------------------------ // Kernel 2: Global Memory JFA (Fallback Path) // ------------------------------------------------------------------ -// Same logic as above, but uses Global Memory as the ping-pong buffer -template +template __global__ void edt_kernel_global( const float* __restrict__ in_data, const int32_t* __restrict__ in_indices, @@ -192,23 +186,19 @@ __global__ void edt_kernel_global( int* __restrict__ global_buffer_2, int64_t L, int64_t total_elements, - int runtime_ndim + int coord_ndim ) { - const int D = (NDim > 0) ? NDim : runtime_ndim; - int64_t row_idx = blockIdx.x; int64_t offset = row_idx * L; if (offset >= total_elements) return; - // Pointers to Global Memory int* g_idx1 = global_buffer_1 + offset; int* g_idx2 = global_buffer_2 + offset; - // 1. & 2. 
Run the JFA core (operating directly on Global Memory) + // Core Logic operates on Global Memory pointers run_jfa_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); - // 3. Write back results for (int q = threadIdx.x; q < L; q += blockDim.x) { int p = g_idx1[q]; float dist_val; @@ -218,58 +208,46 @@ __global__ void edt_kernel_global( float dist_sq = sqr((float)q - (float)p) + val_p; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; } else { - dist_val = IsFinal ? INF_VAL : sqr(INF_VAL); + dist_val = IsFinal ? INF_VAL : INF_VAL; p = 0; } out_dist[offset + q] = dist_val; - if (p != -1) { - int64_t src_offset = (offset + p) * D; - int64_t dst_offset = (offset + q) * D; - for (int d = 0; d < D; ++d) { - out_indices[dst_offset + d] = in_indices[src_offset + d]; + int64_t dst_base = (offset + q) * coord_ndim; + if (p != -1 && in_data[offset + p] < INF_VAL) { + int64_t src_base = (offset + p) * coord_ndim; + for (int d = 0; d < coord_ndim; ++d) { + out_indices[dst_base + d] = in_indices[src_base + d]; } } else { - int64_t dst_offset = (offset + q) * D; - for (int d = 0; d < D; ++d) out_indices[dst_offset + d] = 0; + for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; } } } - // ------------------------------------------------------------------ -// Kernel 3: Initialize Indices +// Initialization Kernel // ------------------------------------------------------------------ -// Initialize index tensor as grid coordinates -// indices shape: (..., D) __global__ void init_indices_kernel( int32_t* indices, int64_t total_pixels, int NDim, - const int64_t* __restrict__ shape_ptr // shape of spatial dimensions + const int64_t* __restrict__ shape_ptr ) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= total_pixels) return; - // Unravel Index - // idx is the flat index of each pixel - // We need to compute its coordinate in spatial_shape - int64_t temp = idx; - // Use local register array to avoid repeated global memory reads (assume max 10 dims) - 
int32_t coords[10]; + int32_t coords[8]; // Max 8 dims supported locally - // Example: spatial_shape = [D0, D1, D2] - // compute by modulo from last dimension + // Unravel index for (int d = NDim - 1; d >= 0; --d) { int64_t dim_size = shape_ptr[d]; coords[d] = temp % dim_size; temp /= dim_size; } - // Write to Global Memory - // Indices tensor is flattened as (TotalPixels, NDim) int64_t out_ptr = idx * NDim; for (int d = 0; d < NDim; ++d) { indices[out_ptr + d] = coords[d]; @@ -277,35 +255,25 @@ __global__ void init_indices_kernel( } // ------------------------------------------------------------------ -// Host Function: C++ Entry Point +// Host Function // ------------------------------------------------------------------ std::tuple distance_transform_cuda(torch::Tensor input) { TORCH_CHECK(input.is_cuda(), "Input must be on CUDA device."); TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32."); + // 1. Force Contiguous Input (Optimized Copy) + // This is crucial for coalesced memory access in init kernel. input = input.contiguous(); - // Handle batch dimension: if input is 1D (L), treat as no batch but internally add a batch dimension. - // Convention: input shape is (Batch, D1, D2, ..., Dn) - // Algorithm treats batch and other dims identically (batch is just another leading dimension) - // But index initialization needs to know which are "spatial dimensions". - // Assumption: all dims except dim 0 (Batch) are spatial. - const int ndim = input.dim(); - // If ndim=1, assume (L) -> sample_ndim=1 - // If ndim=4 (B, C, H, W), sample_ndim=3 (C,H,W treated as spatial? Channels often processed independently) - // Correction: classical EDT usually runs on (H,W) or (D,H,W). - // If channels exist, typically each channel is processed independently. - // For maximum generality, we treat **all dims except dim 0** as spatial dims. - // If input has no batch dim, user should use unsqueeze(0) in Python. 
- const int sample_ndim = ndim - 1; - TORCH_CHECK(sample_ndim > 0, "Input tensor must have at least 2 dimensions (Batch, ...)"); + TORCH_CHECK(sample_ndim > 0 && sample_ndim <= 8, "Dims must be between 2 and 9 (Batch + 8 Spatial)"); auto shape = input.sizes().vec(); int64_t num_pixels = input.numel(); + // Handle empty input if (num_pixels == 0) { auto index_shape = shape; index_shape.push_back(sample_ndim); @@ -313,24 +281,20 @@ std::tuple distance_transform_cuda(torch::Tensor i torch::empty(index_shape, input.options().dtype(torch::kInt32))); } - // 1. Initialize Distance Tensor - // 0 -> 0, 1 -> INF + // 2. Init Distances auto current_dist = torch::where(input == 0, torch::tensor(0.0f, input.options()), torch::tensor(INF_VAL, input.options())); - // 2. Initialize Index Tensor - // Shape: (Batch, D1, ..., Dn, sample_ndim) + // 3. Init Indices auto index_shape = shape; index_shape.push_back(sample_ndim); auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - // 2.1 Prepare shape tensor for kernel - std::vector spatial_shape(shape.begin() + 1, shape.end()); - auto shape_tensor = torch::tensor(spatial_shape, torch::kInt64).to(input.device()); - - // 2.2 Launch initialization kernel { + std::vector spatial_shape(shape.begin() + 1, shape.end()); + auto shape_tensor = torch::tensor(spatial_shape, torch::kInt64).to(input.device()); + int threads = 256; int blocks = (num_pixels + threads - 1) / threads; init_indices_kernel<<>>( @@ -339,131 +303,74 @@ std::tuple distance_transform_cuda(torch::Tensor i sample_ndim, shape_tensor.data_ptr() ); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("Init Kernel Failed: %s\n", cudaGetErrorString(err)); - } } - // Pre-allocate Global Memory Buffers (lazy) + // Lazy buffers torch::Tensor global_buf1, global_buf2; - // 3. Process each spatial dimension (Separable JFA) - // Iterate through each spatial dimension (1 to ndim-1) + // 4. 
Dimensional Iteration for (int d = 1; d < ndim; ++d) { bool is_final_pass = (d == ndim - 1); - // --- Step A: Transpose current dim to last --- - // Resulting shape: (..., L) + // --- Step A: Transpose + Contiguous (The "Expensive" Copy) --- + // We accept this copy because it enables fully coalesced memory access in the kernel. + // Without this, the kernel bandwidth drops to <5%, which is much slower than the copy. auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); + // Prepare Output (Contiguous) + // Using empty() instead of empty_like() to ensure standard stride layout + auto dist_out = torch::empty(dist_in.sizes(), dist_in.options()); + auto idx_out = torch::empty(idx_in.sizes(), idx_in.options()); + int64_t L = dist_in.size(-1); int64_t total_slices = dist_in.numel() / L; - - auto dist_out = torch::empty_like(dist_in); - auto idx_out = torch::empty_like(idx_in); - - // --- Step B: Kernel Dispatch --- int threads = std::min((int64_t)MAX_THREADS, L); - // Check whether Shared Memory can be used + // --- Step B: Kernel Dispatch --- if (L <= SMEM_LIMIT_ELEMENTS) { size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); - - // Switch macro to handle template dimension specialization - #define DISPATCH_SHARED(IS_FINAL) \ - switch(sample_ndim) { \ - case 1: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 2: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 3: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 4: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), 
sample_ndim); break; \ - case 5: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 6: edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - default: /* Fallback for > 6D */ \ - edt_kernel_shared<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - } - - if (is_final_pass) { DISPATCH_SHARED(true); } - else { DISPATCH_SHARED(false); } - + if (is_final_pass) { + edt_kernel_shared<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } else { + edt_kernel_shared<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } } else { - // Global Memory fallback (L > 4096) + // Global Memory Fallback if (global_buf1.numel() < dist_in.numel()) { global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); } - - #define DISPATCH_GLOBAL(IS_FINAL) \ - switch(sample_ndim) { \ - case 1: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 2: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 3: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), 
global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 4: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 5: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - case 6: edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - default: /* Fallback */ \ - edt_kernel_global<<>>( \ - dist_in.data_ptr(), idx_in.data_ptr(), \ - dist_out.data_ptr(), idx_out.data_ptr(), \ - global_buf1.data_ptr(), global_buf2.data_ptr(), \ - L, dist_in.numel(), sample_ndim); break; \ - } - - if (is_final_pass) { DISPATCH_GLOBAL(true); } - else { DISPATCH_GLOBAL(false); } + if (is_final_pass) { + edt_kernel_global<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + global_buf1.data_ptr(), global_buf2.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } else { + edt_kernel_global<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + global_buf1.data_ptr(), global_buf2.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } } - // --- Step C: Transpose Back --- + // --- Step C: Logical Transpose --- + // This is just a metadata swap, no copy. The next loop's .contiguous() will handle the copy. 
current_dist = dist_out.transpose(d, ndim - 1); current_idx = idx_out.transpose(d, ndim - 1); } return std::make_tuple(current_dist, current_idx); -} - +} \ No newline at end of file From 11a9657cdad5f5d769e88ce4bfecca60d3a8bf46 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Mon, 15 Dec 2025 00:02:04 +0800 Subject: [PATCH 41/56] add imports for scipy and torchmorph --- benchmark/distance_transform.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmark/distance_transform.py b/benchmark/distance_transform.py index 3737ced..9bfe27e 100644 --- a/benchmark/distance_transform.py +++ b/benchmark/distance_transform.py @@ -1,7 +1,10 @@ +import scipy.ndimage as ndi # noqa: F401 import torch import torch.utils.benchmark as benchmark from prettytable import PrettyTable +import torchmorph as tm # noqa: F401 + sizes = [64, 128, 256, 512, 1024] batches = [1, 4, 8, 16] dtype = torch.float32 From bdaf48fec543f8ff0ed8fa911072bf78223bcca7 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Sat, 20 Dec 2025 09:39:09 +0800 Subject: [PATCH 42/56] use JFA for 2D/3D and separable transform for high dimensions --- torchmorph/csrc/distance_transform_kernel.cu | 536 ++++++++++++++----- 1 file changed, 410 insertions(+), 126 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 6c30c64..812f1a8 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -2,33 +2,258 @@ #include #include #include +#include +#include #include #include +#include +#include // ------------------------------------------------------------------ -// Configuration +// Global Configuration // ------------------------------------------------------------------ -// Use a large enough value to avoid overflow, but safe for float addition -#define INF_VAL 1e20f +#define BLOCK_SIZE 256 +#define INF_VAL 1e20f #define MAX_THREADS 1024 #define SMEM_LIMIT_ELEMENTS 4096 // 
------------------------------------------------------------------ -// Device Helper +// Device Helpers // ------------------------------------------------------------------ - __device__ __forceinline__ float sqr(float x) { return x * x; } +// Helper for JFA 2D/3D +__device__ __forceinline__ float dist_sq_2d(int y1, int x1, int y2, int x2) { + return sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); +} + +__device__ __forceinline__ float dist_sq_3d(int z1, int y1, int x1, int z2, int y2, int x2) { + return sqr((float)(z1 - z2)) + sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); +} + +// Helper for Separable 1D __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { - // Safety check for boundaries and INF propagation if (p < 0 || val_p >= INF_VAL) return INF_VAL; return sqr((float)q - (float)p) + val_p; } -// ------------------------------------------------------------------ -// JFA Core Logic (Device Only) -// ------------------------------------------------------------------ -__device__ void run_jfa_core( +// ================================================================== +// PART 1: JFA KERNELS (Optimized for 2D & 3D) +// ================================================================== + +// --- 2D JFA Init --- +template +__global__ void init_jfa_kernel_2d( + const float* __restrict__ input, + IndexType* __restrict__ indices, + int64_t total_elements, + int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + if (input[tid] == 0.0f) { + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int w = (int)(rem % W); + int h = (int)(rem / W); + int64_t idx_ptr = tid * 2; + indices[idx_ptr + 0] = (IndexType)h; + indices[idx_ptr + 1] = (IndexType)w; + } else { + int64_t idx_ptr = tid * 2; + indices[idx_ptr + 0] = (IndexType)-1; + indices[idx_ptr + 1] = (IndexType)-1; + } +} + +// --- 2D JFA Step --- +template +__global__ void jfa_step_2d( + const IndexType* __restrict__ 
in_idx, + IndexType* __restrict__ out_idx, + int step, + int H, int W, + int64_t total_pixels +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_pixels) return; + + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int64_t batch_offset = tid - rem; + int w = (int)(rem % W); + int h = (int)(rem / W); + + int best_y = -1, best_x = -1; + float best_dist = INF_VAL; + + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + int ny = h + dy * step; + int nx = w + dx * step; + if (ny >= 0 && ny < H && nx >= 0 && nx < W) { + int64_t n_ptr = (batch_offset + ny * W + nx) * 2; + int seed_y = (int)in_idx[n_ptr + 0]; + if (seed_y != -1) { + int seed_x = (int)in_idx[n_ptr + 1]; + float d = dist_sq_2d(h, w, seed_y, seed_x); + if (d < best_dist) { + best_dist = d; + best_y = seed_y; + best_x = seed_x; + } + } + } + } + } + int64_t out_ptr = tid * 2; + out_idx[out_ptr + 0] = (IndexType)best_y; + out_idx[out_ptr + 1] = (IndexType)best_x; +} + +// --- 2D JFA Calc --- +template +__global__ void calc_dist_kernel_2d( + const IndexType* __restrict__ indices, + float* __restrict__ dist_out, + int64_t total_elements, + int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + int64_t idx_ptr = tid * 2; + int seed_h = (int)indices[idx_ptr + 0]; + if (seed_h == -1) { dist_out[tid] = INF_VAL; return; } + int seed_w = (int)indices[idx_ptr + 1]; + + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)(rem / W); + + dist_out[tid] = sqrtf(dist_sq_2d(cur_h, cur_w, seed_h, seed_w)); +} + +// --- 3D JFA Init --- +template +__global__ void init_jfa_kernel_3d( + const float* __restrict__ input, + IndexType* __restrict__ indices, + int64_t total_elements, + int D, int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) 
return; + + if (input[tid] == 0.0f) { + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int w = (int)(rem % W); + int h = (int)((rem / W) % H); + int d = (int)(rem / (W * H)); + int64_t idx_ptr = tid * 3; + indices[idx_ptr + 0] = (IndexType)d; + indices[idx_ptr + 1] = (IndexType)h; + indices[idx_ptr + 2] = (IndexType)w; + } else { + int64_t idx_ptr = tid * 3; + indices[idx_ptr + 0] = (IndexType)-1; + indices[idx_ptr + 1] = (IndexType)-1; + indices[idx_ptr + 2] = (IndexType)-1; + } +} + +// --- 3D JFA Step --- +template +__global__ void jfa_step_3d( + const IndexType* __restrict__ in_idx, + IndexType* __restrict__ out_idx, + int step, + int D, int H, int W, + int64_t total_pixels +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_pixels) return; + + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int64_t batch_offset = tid - rem; + int w = (int)(rem % W); + int h = (int)((rem / W) % H); + int d = (int)(rem / (W * H)); + + int best_z = -1, best_y = -1, best_x = -1; + float best_dist = INF_VAL; + + #pragma unroll + for (int dz = -1; dz <= 1; ++dz) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + int nz = d + dz * step; + int ny = h + dy * step; + int nx = w + dx * step; + if (nz >= 0 && nz < D && ny >= 0 && ny < H && nx >= 0 && nx < W) { + int64_t n_ptr = (batch_offset + (int64_t)nz * (H * W) + ny * W + nx) * 3; + int seed_z = (int)in_idx[n_ptr + 0]; + if (seed_z != -1) { + int seed_y = (int)in_idx[n_ptr + 1]; + int seed_x = (int)in_idx[n_ptr + 2]; + float dist = dist_sq_3d(d, h, w, seed_z, seed_y, seed_x); + if (dist < best_dist) { + best_dist = dist; + best_z = seed_z; + best_y = seed_y; + best_x = seed_x; + } + } + } + } + } + } + int64_t out_ptr = tid * 3; + out_idx[out_ptr + 0] = (IndexType)best_z; + out_idx[out_ptr + 1] = (IndexType)best_y; + out_idx[out_ptr + 2] = (IndexType)best_x; +} + +// --- 3D JFA Calc 
--- +template +__global__ void calc_dist_kernel_3d( + const IndexType* __restrict__ indices, + float* __restrict__ dist_out, + int64_t total_elements, + int D, int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + int64_t idx_ptr = tid * 3; + int seed_d = (int)indices[idx_ptr + 0]; + if (seed_d == -1) { dist_out[tid] = INF_VAL; return; } + int seed_h = (int)indices[idx_ptr + 1]; + int seed_w = (int)indices[idx_ptr + 2]; + + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)((rem / W) % H); + int cur_d = (int)(rem / (W * H)); + + dist_out[tid] = sqrtf(dist_sq_3d(cur_d, cur_h, cur_w, seed_d, seed_h, seed_w)); +} + +// ================================================================== +// PART 2: SEPARABLE N-DIM KERNELS (For 4D+ Spatial) +// ================================================================== + +// Core logic for 1D Scan (similar to JFA 1D) +__device__ void run_separable_scan_core( int N, int tid, const float* __restrict__ vals, @@ -37,16 +262,12 @@ __device__ void run_jfa_core( ) { // 1. Initialization for (int i = tid; i < N; i += blockDim.x) { - // Use a relative threshold to safely detect background - if (vals[i] >= INF_VAL * 0.9f) { - idx_curr[i] = -1; - } else { - idx_curr[i] = i; - } + if (vals[i] >= INF_VAL * 0.9f) idx_curr[i] = -1; + else idx_curr[i] = i; } __syncthreads(); - // 2. Iterative Propagation (Pointer Jumping: 1 -> 2 -> 4...) + // 2. 
Iterative Propagation int* idx_in = idx_curr; int* idx_out = idx_next; @@ -55,129 +276,93 @@ __device__ void run_jfa_core( int my_best_p = idx_in[i]; float min_cost = INF_VAL; - // Check self (current best) - if (my_best_p != -1) { - min_cost = compute_cost(i, my_best_p, vals[my_best_p]); - } + if (my_best_p != -1) min_cost = compute_cost(i, my_best_p, vals[my_best_p]); - // Check Left Neighbor int left = i - step; if (left >= 0) { int left_p = idx_in[left]; if (left_p != -1) { float c = compute_cost(i, left_p, vals[left_p]); - if (c < min_cost) { - min_cost = c; - my_best_p = left_p; - } + if (c < min_cost) { min_cost = c; my_best_p = left_p; } } } - // Check Right Neighbor int right = i + step; if (right < N) { int right_p = idx_in[right]; if (right_p != -1) { float c = compute_cost(i, right_p, vals[right_p]); - if (c < min_cost) { - min_cost = c; - my_best_p = right_p; - } + if (c < min_cost) { min_cost = c; my_best_p = right_p; } } } idx_out[i] = my_best_p; } - - // Swap Ping-Pong buffers - int* temp = idx_in; - idx_in = idx_out; - idx_out = temp; + int* temp = idx_in; idx_in = idx_out; idx_out = temp; __syncthreads(); } - // 3. Final Copy Back (if needed) + // 3. Final Copy Back if (idx_in != idx_curr) { - for (int i = tid; i < N; i += blockDim.x) { - idx_curr[i] = idx_next[i]; - } + for (int i = tid; i < N; i += blockDim.x) idx_curr[i] = idx_next[i]; __syncthreads(); } } -// ------------------------------------------------------------------ -// Kernel 1: Shared Memory JFA (Fast Path) -// ------------------------------------------------------------------ -// Note: We removed the template switch for NDim to reduce compile time. -// The performance impact is negligible for the copy loop. 
+// Separable Kernel: Shared Memory (Fast) template -__global__ void edt_kernel_shared( - const float* __restrict__ in_data, // Contiguous Input - const int32_t* __restrict__ in_indices, // Contiguous Input +__global__ void separable_kernel_shared( + const float* __restrict__ in_data, + const int32_t* __restrict__ in_indices, float* __restrict__ out_dist, int32_t* __restrict__ out_indices, int64_t L, int64_t total_elements, int coord_ndim ) { - // 1 Block processes 1 Row (L elements) int64_t row_idx = blockIdx.x; int64_t offset = row_idx * L; - if (offset >= total_elements) return; - // Shared memory layout extern __shared__ char s_buffer[]; float* s_vals = (float*)s_buffer; int* s_idx1 = (int*)(s_vals + L); int* s_idx2 = (int*)(s_idx1 + L); - // 1. Load Data (Coalesced Read due to .contiguous() input) for (int i = threadIdx.x; i < L; i += blockDim.x) { s_vals[i] = __ldg(&in_data[offset + i]); } __syncthreads(); - // 2. Run JFA Core - run_jfa_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); + run_separable_scan_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); - // 3. Write Back Results for (int q = threadIdx.x; q < L; q += blockDim.x) { int p = s_idx1[q]; float dist_val; - // Calculate final distance if (p != -1) { float dist_sq = sqr((float)q - (float)p) + s_vals[p]; dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; } else { - dist_val = IsFinal ? INF_VAL : INF_VAL; // Use large val instead of sqr(INF) + dist_val = IsFinal ? 
INF_VAL : INF_VAL; p = 0; } out_dist[offset + q] = dist_val; - // Propagate Indices - // out_indices shape is (TotalElements, coord_ndim) flattened - // Using runtime loop instead of template unrolling int64_t dst_base = (offset + q) * coord_ndim; - if (p != -1 && s_vals[p] < INF_VAL) { int64_t src_base = (offset + p) * coord_ndim; for (int d = 0; d < coord_ndim; ++d) { out_indices[dst_base + d] = in_indices[src_base + d]; } } else { - for (int d = 0; d < coord_ndim; ++d) { - out_indices[dst_base + d] = 0; - } + for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; } } } -// ------------------------------------------------------------------ -// Kernel 2: Global Memory JFA (Fallback Path) -// ------------------------------------------------------------------ +// Separable Kernel: Global Memory (Fallback) template -__global__ void edt_kernel_global( +__global__ void separable_kernel_global( const float* __restrict__ in_data, const int32_t* __restrict__ in_indices, float* __restrict__ out_dist, @@ -190,19 +375,16 @@ __global__ void edt_kernel_global( ) { int64_t row_idx = blockIdx.x; int64_t offset = row_idx * L; - if (offset >= total_elements) return; int* g_idx1 = global_buffer_1 + offset; int* g_idx2 = global_buffer_2 + offset; - // Core Logic operates on Global Memory pointers - run_jfa_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); + run_separable_scan_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); for (int q = threadIdx.x; q < L; q += blockDim.x) { int p = g_idx1[q]; float dist_val; - if (p != -1) { float val_p = in_data[offset + p]; float dist_sq = sqr((float)q - (float)p) + val_p; @@ -211,7 +393,6 @@ __global__ void edt_kernel_global( dist_val = IsFinal ? 
INF_VAL : INF_VAL; p = 0; } - out_dist[offset + q] = dist_val; int64_t dst_base = (offset + q) * coord_ndim; @@ -226,10 +407,8 @@ __global__ void edt_kernel_global( } } -// ------------------------------------------------------------------ -// Initialization Kernel -// ------------------------------------------------------------------ -__global__ void init_indices_kernel( +// Separable Init Kernel +__global__ void init_indices_separable_kernel( int32_t* indices, int64_t total_pixels, int NDim, @@ -239,54 +418,102 @@ __global__ void init_indices_kernel( if (idx >= total_pixels) return; int64_t temp = idx; - int32_t coords[8]; // Max 8 dims supported locally - - // Unravel index + int32_t coords[8]; for (int d = NDim - 1; d >= 0; --d) { int64_t dim_size = shape_ptr[d]; coords[d] = temp % dim_size; temp /= dim_size; } - int64_t out_ptr = idx * NDim; - for (int d = 0; d < NDim; ++d) { - indices[out_ptr + d] = coords[d]; - } + for (int d = 0; d < NDim; ++d) indices[out_ptr + d] = coords[d]; } -// ------------------------------------------------------------------ -// Host Function -// ------------------------------------------------------------------ +// ================================================================== +// PART 3: DISPATCH HELPERS +// ================================================================== -std::tuple distance_transform_cuda(torch::Tensor input) { - TORCH_CHECK(input.is_cuda(), "Input must be on CUDA device."); - TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32."); +// --- JFA Dispatches --- +std::tuple run_jfa_2d( + torch::Tensor input, int64_t H, int64_t W, int grid, int block, int64_t numel +) { + bool use_int16 = (H < 32767 && W < 32767); + auto index_opts = input.options().dtype(use_int16 ? torch::kInt16 : torch::kInt32); + + auto idx_shape = input.sizes().vec(); + idx_shape.push_back(2); + auto curr_idx = torch::empty(idx_shape, index_opts); + auto next_idx = torch::empty(idx_shape, index_opts); - // 1. 
Force Contiguous Input (Optimized Copy) - // This is crucial for coalesced memory access in init kernel. + if (use_int16) init_jfa_kernel_2d<<>>(input.data_ptr(), (int16_t*)curr_idx.data_ptr(), numel, H, W); + else init_jfa_kernel_2d<<>>(input.data_ptr(), (int32_t*)curr_idx.data_ptr(), numel, H, W); + + int max_dim = std::max((int)H, (int)W); + int step = 1; + while (step < max_dim) step *= 2; + step /= 2; + + while (step >= 1) { + if (use_int16) jfa_step_2d<<>>((int16_t*)curr_idx.data_ptr(), (int16_t*)next_idx.data_ptr(), step, H, W, numel); + else jfa_step_2d<<>>((int32_t*)curr_idx.data_ptr(), (int32_t*)next_idx.data_ptr(), step, H, W, numel); + std::swap(curr_idx, next_idx); + step /= 2; + } + auto final_dist = torch::empty_like(input); + if (use_int16) calc_dist_kernel_2d<<>>((int16_t*)curr_idx.data_ptr(), final_dist.data_ptr(), numel, H, W); + else calc_dist_kernel_2d<<>>((int32_t*)curr_idx.data_ptr(), final_dist.data_ptr(), numel, H, W); + + return std::make_tuple(final_dist, curr_idx); +} + +std::tuple run_jfa_3d( + torch::Tensor input, int64_t D, int64_t H, int64_t W, int grid, int block, int64_t numel +) { + bool use_int16 = (D < 32767 && H < 32767 && W < 32767); + auto index_opts = input.options().dtype(use_int16 ? 
torch::kInt16 : torch::kInt32); + auto idx_shape = input.sizes().vec(); + idx_shape.push_back(3); + auto curr_idx = torch::empty(idx_shape, index_opts); + auto next_idx = torch::empty(idx_shape, index_opts); + + if (use_int16) init_jfa_kernel_3d<<>>(input.data_ptr(), (int16_t*)curr_idx.data_ptr(), numel, D, H, W); + else init_jfa_kernel_3d<<>>(input.data_ptr(), (int32_t*)curr_idx.data_ptr(), numel, D, H, W); + + int max_dim = std::max({(int)D, (int)H, (int)W}); + int step = 1; + while (step < max_dim) step *= 2; + step /= 2; + + while (step >= 1) { + if (use_int16) jfa_step_3d<<>>((int16_t*)curr_idx.data_ptr(), (int16_t*)next_idx.data_ptr(), step, D, H, W, numel); + else jfa_step_3d<<>>((int32_t*)curr_idx.data_ptr(), (int32_t*)next_idx.data_ptr(), step, D, H, W, numel); + std::swap(curr_idx, next_idx); + step /= 2; + } + auto final_dist = torch::empty_like(input); + if (use_int16) calc_dist_kernel_3d<<>>((int16_t*)curr_idx.data_ptr(), final_dist.data_ptr(), numel, D, H, W); + else calc_dist_kernel_3d<<>>((int32_t*)curr_idx.data_ptr(), final_dist.data_ptr(), numel, D, H, W); + + return std::make_tuple(final_dist, curr_idx); +} + +// --- Separable N-Dim Dispatch --- +std::tuple run_separable_ndim(torch::Tensor input) { + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Separable N-Dim input must be float32."); input = input.contiguous(); const int ndim = input.dim(); - const int sample_ndim = ndim - 1; - TORCH_CHECK(sample_ndim > 0 && sample_ndim <= 8, "Dims must be between 2 and 9 (Batch + 8 Spatial)"); + const int sample_ndim = ndim - 1; // Assuming Dim 0 is Batch + TORCH_CHECK(sample_ndim > 0 && sample_ndim <= 8, "Unsupported dims for Separable EDT"); auto shape = input.sizes().vec(); int64_t num_pixels = input.numel(); - // Handle empty input - if (num_pixels == 0) { - auto index_shape = shape; - index_shape.push_back(sample_ndim); - return std::make_tuple(torch::empty_like(input), - torch::empty(index_shape, input.options().dtype(torch::kInt32))); - } - - // 
2. Init Distances + // 1. Init Distances auto current_dist = torch::where(input == 0, torch::tensor(0.0f, input.options()), torch::tensor(INF_VAL, input.options())); - // 3. Init Indices + // 2. Init Indices auto index_shape = shape; index_shape.push_back(sample_ndim); auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); @@ -294,32 +521,22 @@ std::tuple distance_transform_cuda(torch::Tensor i { std::vector spatial_shape(shape.begin() + 1, shape.end()); auto shape_tensor = torch::tensor(spatial_shape, torch::kInt64).to(input.device()); - int threads = 256; int blocks = (num_pixels + threads - 1) / threads; - init_indices_kernel<<>>( - current_idx.data_ptr(), - num_pixels, - sample_ndim, - shape_tensor.data_ptr() + init_indices_separable_kernel<<>>( + current_idx.data_ptr(), num_pixels, sample_ndim, shape_tensor.data_ptr() ); } - // Lazy buffers torch::Tensor global_buf1, global_buf2; - // 4. Dimensional Iteration + // 3. Dimensional Iteration (Skip Batch Dim 0) for (int d = 1; d < ndim; ++d) { bool is_final_pass = (d == ndim - 1); - // --- Step A: Transpose + Contiguous (The "Expensive" Copy) --- - // We accept this copy because it enables fully coalesced memory access in the kernel. - // Without this, the kernel bandwidth drops to <5%, which is much slower than the copy. 
auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); - // Prepare Output (Contiguous) - // Using empty() instead of empty_like() to ensure standard stride layout auto dist_out = torch::empty(dist_in.sizes(), dist_in.options()); auto idx_out = torch::empty(idx_in.sizes(), idx_in.options()); @@ -327,37 +544,35 @@ std::tuple distance_transform_cuda(torch::Tensor i int64_t total_slices = dist_in.numel() / L; int threads = std::min((int64_t)MAX_THREADS, L); - // --- Step B: Kernel Dispatch --- if (L <= SMEM_LIMIT_ELEMENTS) { size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); if (is_final_pass) { - edt_kernel_shared<<>>( + separable_kernel_shared<<>>( dist_in.data_ptr(), idx_in.data_ptr(), dist_out.data_ptr(), idx_out.data_ptr(), L, dist_in.numel(), sample_ndim ); } else { - edt_kernel_shared<<>>( + separable_kernel_shared<<>>( dist_in.data_ptr(), idx_in.data_ptr(), dist_out.data_ptr(), idx_out.data_ptr(), L, dist_in.numel(), sample_ndim ); } } else { - // Global Memory Fallback if (global_buf1.numel() < dist_in.numel()) { global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); } if (is_final_pass) { - edt_kernel_global<<>>( + separable_kernel_global<<>>( dist_in.data_ptr(), idx_in.data_ptr(), dist_out.data_ptr(), idx_out.data_ptr(), global_buf1.data_ptr(), global_buf2.data_ptr(), L, dist_in.numel(), sample_ndim ); } else { - edt_kernel_global<<>>( + separable_kernel_global<<>>( dist_in.data_ptr(), idx_in.data_ptr(), dist_out.data_ptr(), idx_out.data_ptr(), global_buf1.data_ptr(), global_buf2.data_ptr(), @@ -365,12 +580,81 @@ std::tuple distance_transform_cuda(torch::Tensor i ); } } - - // --- Step C: Logical Transpose --- - // This is just a metadata swap, no copy. The next loop's .contiguous() will handle the copy. 
current_dist = dist_out.transpose(d, ndim - 1); current_idx = idx_out.transpose(d, ndim - 1); } return std::make_tuple(current_dist, current_idx); +} + +// ================================================================== +// PART 4: MAIN ENTRY POINT (INTEGRATED) +// ================================================================== + +std::tuple distance_transform_cuda(torch::Tensor input) { + TORCH_CHECK(input.is_cuda(), "Input must be CUDA tensor"); + input = input.contiguous(); + + int64_t dims = input.dim(); + int64_t numel = input.numel(); + int block = BLOCK_SIZE; + int grid = (numel + block - 1) / block; + + // ------------------------------------------------------------------ + // CASE 1: High-Dimension (5D+) -> Use Separable N-Dim Algorithm + // Input: (Batch, D1, D2, D3...) -> Treated as N-Dim spatial + // ------------------------------------------------------------------ + if (dims >= 5) { + return run_separable_ndim(input); + } + + // ------------------------------------------------------------------ + // CASE 2: 4D Tensor -> (Batch, Dim1, H, W) + // ------------------------------------------------------------------ + else if (dims == 4) { + int64_t dim1 = input.size(1); + + // [Fast Path for Benchmark]: (Batch, 1, H, W) -> 2D JFA + if (dim1 == 1) { + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_2d(input, H, W, grid, block, numel); + } + // [Correct Path for Pytest]: (Batch, Depth, H, W) -> 3D JFA + else { + int64_t D = dim1; + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_3d(input, D, H, W, grid, block, numel); + } + } + + // ------------------------------------------------------------------ + // CASE 3: 3D Tensor -> (Batch, H, W) -> 2D JFA + // ------------------------------------------------------------------ + else if (dims == 3) { + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_2d(input, H, W, grid, block, numel); + } + + // 
------------------------------------------------------------------ + // CASE 4: 2D Tensor -> (Batch, Length) -> 1D JFA (via 2D) + // ------------------------------------------------------------------ + else if (dims == 2) { + int64_t H = 1; + int64_t W = input.size(-1); + auto result = run_jfa_2d(input, H, W, grid, block, numel); + + // Fix for 1D test: slice out the dummy Y coordinate + torch::Tensor dist = std::get<0>(result); + torch::Tensor idx_2d = std::get<1>(result); + auto idx_1d = idx_2d.slice(/*dim=*/-1, /*start=*/1, /*end=*/2).contiguous(); + return std::make_tuple(dist, idx_1d); + } + + else { + TORCH_CHECK(false, "Unsupported dimensions."); + return std::make_tuple(torch::Tensor(), torch::Tensor()); + } } \ No newline at end of file From 326e247f7fa3bc0875655db0a81fcfbaea451b44 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Tue, 23 Dec 2025 01:46:06 +0800 Subject: [PATCH 43/56] speed up 2D --- torchmorph/csrc/distance_transform_kernel.cu | 310 ++++++++++++++----- 1 file changed, 231 insertions(+), 79 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 812f1a8..64816c5 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -17,36 +17,52 @@ #define MAX_THREADS 1024 #define SMEM_LIMIT_ELEMENTS 4096 +// Configuration for 2D Optimized Block-JFA +#define JFA_BLOCK_DIM 32 // Tile size: 32x32 +#define JFA_FUSED_STEPS 4 // Fused steps: 1, 2, 4, 8 +#define JFA_MAX_OFFSET 8 // Max offset processed in shared memory (Step 8) +#define JFA_SMEM_DIM (JFA_BLOCK_DIM + 2 * JFA_MAX_OFFSET) // Shared mem size: 48x48 (includes halo) + // ------------------------------------------------------------------ // Device Helpers // ------------------------------------------------------------------ __device__ __forceinline__ float sqr(float x) { return x * x; } -// Helper for JFA 2D/3D +// Helper for JFA 2D calculation __device__ __forceinline__ float 
dist_sq_2d(int y1, int x1, int y2, int x2) { return sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); } +// Helper for JFA 3D calculation __device__ __forceinline__ float dist_sq_3d(int z1, int y1, int x1, int z2, int y2, int x2) { return sqr((float)(z1 - z2)) + sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); } -// Helper for Separable 1D +// Helper for Separable 1D cost calculation __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { if (p < 0 || val_p >= INF_VAL) return INF_VAL; return sqr((float)q - (float)p) + val_p; } +// Device Helpers for int2 (Vectorized Coordinate) +// seed.x represents Y, seed.y represents X +__device__ __forceinline__ float dist_sq_int2(int y, int x, int2 seed) { + if (seed.x == -1) return INF_VAL; + float dy = (float)(y - seed.x); + float dx = (float)(x - seed.y); + return dy*dy + dx*dx; +} + // ================================================================== -// PART 1: JFA KERNELS (Optimized for 2D & 3D) +// PART 1: JFA KERNELS (Optimized for 2D with Block-Shared Memory) // ================================================================== -// --- 2D JFA Init --- -template -__global__ void init_jfa_kernel_2d( +// --- 2D Initialization (Vectorized int2) --- +// Initializes the coordinate map. Pixels with value 0 become seeds. 
+__global__ void init_jfa_kernel_2d_opt( const float* __restrict__ input, - IndexType* __restrict__ indices, - int64_t total_elements, + int2* __restrict__ output, // Output treats (y,x) pair as a single int2 + int64_t total_elements, // Total pixels (B*H*W) int H, int W ) { int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -57,21 +73,116 @@ __global__ void init_jfa_kernel_2d( int64_t rem = tid % spatial_size; int w = (int)(rem % W); int h = (int)(rem / W); - int64_t idx_ptr = tid * 2; - indices[idx_ptr + 0] = (IndexType)h; - indices[idx_ptr + 1] = (IndexType)w; + // Store coordinates: .x=y (height), .y=x (width) + output[tid] = make_int2(h, w); } else { - int64_t idx_ptr = tid * 2; - indices[idx_ptr + 0] = (IndexType)-1; - indices[idx_ptr + 1] = (IndexType)-1; + output[tid] = make_int2(-1, -1); } } -// --- 2D JFA Step --- -template -__global__ void jfa_step_2d( - const IndexType* __restrict__ in_idx, - IndexType* __restrict__ out_idx, +// --- 2D Block-JFA Fused Step --- +// Innovation: Performs Steps 1, 2, 4, and 8 entirely within Shared Memory. +// Uses a "Halo" (Apron) region to avoid boundary checks during iteration. +__global__ void jfa_block_fused_kernel_2d( + const int2* __restrict__ in_idx, + int2* __restrict__ out_idx, + int H, int W, + int64_t num_images // Batch Size +) { + // Shared Memory: 48x48 int2 array (~18KB) + // Covers the 32x32 block plus an 8-pixel halo on all sides. 
+ __shared__ int2 smem[JFA_SMEM_DIM][JFA_SMEM_DIM]; + + int tx = threadIdx.x; // 0..31 + int ty = threadIdx.y; // 0..31 + + // Global Block Indices + int bx = blockIdx.x * blockDim.x; + int by = blockIdx.y * blockDim.y; + + int img_idx = blockIdx.z; + int64_t batch_offset = (int64_t)img_idx * (H * W); + + int gx = bx + tx; + int gy = by + ty; + + // --- Phase 1: Cooperative Load to Shared Memory (Tile + Halo) --- + + int smem_linear_size = JFA_SMEM_DIM * JFA_SMEM_DIM; + int total_threads = blockDim.x * blockDim.y; + int thread_linear_idx = ty * blockDim.x + tx; + + // Base coordinates for the top-left corner of the Halo region + int base_x = bx - JFA_MAX_OFFSET; + int base_y = by - JFA_MAX_OFFSET; + + // Loop to fill the entire shared memory buffer (larger than block size) + for (int i = thread_linear_idx; i < smem_linear_size; i += total_threads) { + int s_y = i / JFA_SMEM_DIM; + int s_x = i % JFA_SMEM_DIM; + + int global_y = base_y + s_y; + int global_x = base_x + s_x; + + int2 val = make_int2(-1, -1); + if (global_y >= 0 && global_y < H && global_x >= 0 && global_x < W) { + val = in_idx[batch_offset + global_y * W + global_x]; + } + smem[s_y][s_x] = val; + } + __syncthreads(); + + // --- Phase 2: Iterative JFA in Shared Memory --- + // Only process valid pixels within the image bounds + if (gx < W && gy < H) { + // Map thread to the center region of Shared Memory + int center_sy = ty + JFA_MAX_OFFSET; + int center_sx = tx + JFA_MAX_OFFSET; + + int2 best_seed = smem[center_sy][center_sx]; + float best_dist = dist_sq_int2(gy, gx, best_seed); + + int step = 1; + + // Unroll steps 1, 2, 4, 8 + #pragma unroll + for (int k = 0; k < JFA_FUSED_STEPS; ++k) { + + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dy == 0 && dx == 0) continue; + + // Access neighbor in SMEM directly. + // No boundary check needed because the Halo covers the max offset (8). 
+ int2 neighbor_seed = smem[center_sy + dy * step][center_sx + dx * step]; + + if (neighbor_seed.x != -1) { + float d = dist_sq_int2(gy, gx, neighbor_seed); + if (d < best_dist) { + best_dist = d; + best_seed = neighbor_seed; + } + } + } + } + __syncthreads(); + smem[center_sy][center_sx] = best_seed; + __syncthreads(); + step *= 2; + } + + // --- Phase 3: Write results back to Global Memory --- + out_idx[batch_offset + gy * W + gx] = best_seed; + } +} + +// --- 2D Global Step (Vectorized int2) --- +// Handles larger steps (16, 32, ...) that exceed Shared Memory capacity. +__global__ void jfa_step_global_2d_opt( + const int2* __restrict__ in_idx, + int2* __restrict__ out_idx, int step, int H, int W, int64_t total_pixels @@ -85,39 +196,36 @@ __global__ void jfa_step_2d( int w = (int)(rem % W); int h = (int)(rem / W); - int best_y = -1, best_x = -1; - float best_dist = INF_VAL; + int2 best_seed = in_idx[tid]; + float best_dist = dist_sq_int2(h, w, best_seed); #pragma unroll for (int dy = -1; dy <= 1; ++dy) { #pragma unroll for (int dx = -1; dx <= 1; ++dx) { + if (dx == 0 && dy == 0) continue; + int ny = h + dy * step; int nx = w + dx * step; + if (ny >= 0 && ny < H && nx >= 0 && nx < W) { - int64_t n_ptr = (batch_offset + ny * W + nx) * 2; - int seed_y = (int)in_idx[n_ptr + 0]; - if (seed_y != -1) { - int seed_x = (int)in_idx[n_ptr + 1]; - float d = dist_sq_2d(h, w, seed_y, seed_x); + int2 neighbor_seed = in_idx[batch_offset + ny * W + nx]; + if (neighbor_seed.x != -1) { + float d = dist_sq_int2(h, w, neighbor_seed); if (d < best_dist) { best_dist = d; - best_y = seed_y; - best_x = seed_x; + best_seed = neighbor_seed; } } } } } - int64_t out_ptr = tid * 2; - out_idx[out_ptr + 0] = (IndexType)best_y; - out_idx[out_ptr + 1] = (IndexType)best_x; + out_idx[tid] = best_seed; } -// --- 2D JFA Calc --- -template -__global__ void calc_dist_kernel_2d( - const IndexType* __restrict__ indices, +// --- 2D Final Distance Calculation --- +__global__ void calc_dist_kernel_2d_opt( + 
const int2* __restrict__ indices, float* __restrict__ dist_out, int64_t total_elements, int H, int W @@ -125,20 +233,19 @@ __global__ void calc_dist_kernel_2d( int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= total_elements) return; - int64_t idx_ptr = tid * 2; - int seed_h = (int)indices[idx_ptr + 0]; - if (seed_h == -1) { dist_out[tid] = INF_VAL; return; } - int seed_w = (int)indices[idx_ptr + 1]; - - int64_t spatial_size = (int64_t)H * W; - int64_t rem = tid % spatial_size; - int cur_w = (int)(rem % W); - int cur_h = (int)(rem / W); - - dist_out[tid] = sqrtf(dist_sq_2d(cur_h, cur_w, seed_h, seed_w)); + int2 s = indices[tid]; + if (s.x == -1) { + dist_out[tid] = INF_VAL; + } else { + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)(rem / W); + dist_out[tid] = sqrtf(dist_sq_int2(cur_h, cur_w, s)); + } } -// --- 3D JFA Init --- +// --- 3D JFA Initialization --- template __global__ void init_jfa_kernel_3d( const float* __restrict__ input, @@ -222,7 +329,7 @@ __global__ void jfa_step_3d( out_idx[out_ptr + 2] = (IndexType)best_x; } -// --- 3D JFA Calc --- +// --- 3D JFA Distance Calculation --- template __global__ void calc_dist_kernel_3d( const IndexType* __restrict__ indices, @@ -252,7 +359,7 @@ __global__ void calc_dist_kernel_3d( // PART 2: SEPARABLE N-DIM KERNELS (For 4D+ Spatial) // ================================================================== -// Core logic for 1D Scan (similar to JFA 1D) +// Core logic for 1D Voronoi scan __device__ void run_separable_scan_core( int N, int tid, @@ -267,7 +374,7 @@ __device__ void run_separable_scan_core( } __syncthreads(); - // 2. Iterative Propagation + // 2. 
Iterative Propagation (Logarithmic steps) int* idx_in = idx_curr; int* idx_out = idx_next; @@ -278,6 +385,7 @@ __device__ void run_separable_scan_core( if (my_best_p != -1) min_cost = compute_cost(i, my_best_p, vals[my_best_p]); + // Check Left Neighbor int left = i - step; if (left >= 0) { int left_p = idx_in[left]; @@ -287,6 +395,7 @@ __device__ void run_separable_scan_core( } } + // Check Right Neighbor int right = i + step; if (right < N) { int right_p = idx_in[right]; @@ -297,18 +406,19 @@ __device__ void run_separable_scan_core( } idx_out[i] = my_best_p; } + // Swap buffers int* temp = idx_in; idx_in = idx_out; idx_out = temp; __syncthreads(); } - // 3. Final Copy Back + // 3. Final Copy Back (ensure result is in idx_curr) if (idx_in != idx_curr) { for (int i = tid; i < N; i += blockDim.x) idx_curr[i] = idx_next[i]; __syncthreads(); } } -// Separable Kernel: Shared Memory (Fast) +// Separable Kernel: Optimized using Shared Memory template __global__ void separable_kernel_shared( const float* __restrict__ in_data, @@ -323,11 +433,13 @@ __global__ void separable_kernel_shared( int64_t offset = row_idx * L; if (offset >= total_elements) return; + // Dynamic Shared Memory allocation extern __shared__ char s_buffer[]; float* s_vals = (float*)s_buffer; int* s_idx1 = (int*)(s_vals + L); int* s_idx2 = (int*)(s_idx1 + L); + // Load data for (int i = threadIdx.x; i < L; i += blockDim.x) { s_vals[i] = __ldg(&in_data[offset + i]); } @@ -335,6 +447,7 @@ __global__ void separable_kernel_shared( run_separable_scan_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); + // Write back for (int q = threadIdx.x; q < L; q += blockDim.x) { int p = s_idx1[q]; float dist_val; @@ -360,7 +473,7 @@ __global__ void separable_kernel_shared( } } -// Separable Kernel: Global Memory (Fallback) +// Separable Kernel: Global Memory Fallback (when dim size > Shared Mem) template __global__ void separable_kernel_global( const float* __restrict__ in_data, @@ -402,12 +515,12 @@ __global__ void 
separable_kernel_global( out_indices[dst_base + d] = in_indices[src_base + d]; } } else { - for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; + for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; } } } -// Separable Init Kernel +// Separable Initialization __global__ void init_indices_separable_kernel( int32_t* indices, int64_t total_pixels, @@ -432,35 +545,73 @@ __global__ void init_indices_separable_kernel( // PART 3: DISPATCH HELPERS // ================================================================== -// --- JFA Dispatches --- +// --- JFA 2D Dispatch --- std::tuple run_jfa_2d( torch::Tensor input, int64_t H, int64_t W, int grid, int block, int64_t numel ) { - bool use_int16 = (H < 32767 && W < 32767); - auto index_opts = input.options().dtype(use_int16 ? torch::kInt16 : torch::kInt32); + // Force Int32 to enable int2 vectorized loads/stores + auto index_opts = input.options().dtype(torch::kInt32); + // Create Double Buffer indices: (Batch, H, W, 2) auto idx_shape = input.sizes().vec(); idx_shape.push_back(2); auto curr_idx = torch::empty(idx_shape, index_opts); auto next_idx = torch::empty(idx_shape, index_opts); - if (use_int16) init_jfa_kernel_2d<<>>(input.data_ptr(), (int16_t*)curr_idx.data_ptr(), numel, H, W); - else init_jfa_kernel_2d<<>>(input.data_ptr(), (int32_t*)curr_idx.data_ptr(), numel, H, W); - - int max_dim = std::max((int)H, (int)W); - int step = 1; - while (step < max_dim) step *= 2; - step /= 2; + // Cast int32 pointer to int2 pointer. + // Memory layout matches: consecutive pairs of (y, x) form int2. + int2* d_curr = (int2*)curr_idx.data_ptr(); + int2* d_next = (int2*)next_idx.data_ptr(); + + // 1. Initialization Kernel + // numel equals the number of int2 elements (pixels) + init_jfa_kernel_2d_opt<<>>( + input.data_ptr(), + d_curr, + numel, H, W + ); + + // 2. 
Block-JFA Fused Kernel (Optimized) + // Runs Steps 1, 2, 4, 8 inside Shared Memory + { + dim3 dimBlock(JFA_BLOCK_DIM, JFA_BLOCK_DIM); + int64_t batch_size = numel / (H * W); + dim3 dimGrid((W + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, + (H + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, + batch_size); + + jfa_block_fused_kernel_2d<<>>( + d_curr, + d_next, + H, W, batch_size + ); + std::swap(d_curr, d_next); // d_curr now holds result of Step 8 + std::swap(curr_idx, next_idx); // Keep Tensor pointers in sync + } - while (step >= 1) { - if (use_int16) jfa_step_2d<<>>((int16_t*)curr_idx.data_ptr(), (int16_t*)next_idx.data_ptr(), step, H, W, numel); - else jfa_step_2d<<>>((int32_t*)curr_idx.data_ptr(), (int32_t*)next_idx.data_ptr(), step, H, W, numel); + // 3. Global Loop (Steps 16, 32...) + int max_dim = std::max((int)H, (int)W); + int step = 16; + + while (step < max_dim) { + jfa_step_global_2d_opt<<>>( + d_curr, + d_next, + step, + H, W, numel + ); + std::swap(d_curr, d_next); std::swap(curr_idx, next_idx); - step /= 2; + step *= 2; } + + // 4. Final Distance Calculation auto final_dist = torch::empty_like(input); - if (use_int16) calc_dist_kernel_2d<<>>((int16_t*)curr_idx.data_ptr(), final_dist.data_ptr(), numel, H, W); - else calc_dist_kernel_2d<<>>((int32_t*)curr_idx.data_ptr(), final_dist.data_ptr(), numel, H, W); + calc_dist_kernel_2d_opt<<>>( + d_curr, + final_dist.data_ptr(), + numel, H, W + ); return std::make_tuple(final_dist, curr_idx); } @@ -510,8 +661,8 @@ std::tuple run_separable_ndim(torch::Tensor input) // 1. Init Distances auto current_dist = torch::where(input == 0, - torch::tensor(0.0f, input.options()), - torch::tensor(INF_VAL, input.options())); + torch::tensor(0.0f, input.options()), + torch::tensor(INF_VAL, input.options())); // 2. Init Indices auto index_shape = shape; @@ -530,7 +681,7 @@ std::tuple run_separable_ndim(torch::Tensor input) torch::Tensor global_buf1, global_buf2; - // 3. Dimensional Iteration (Skip Batch Dim 0) + // 3. 
Dimensional Iteration (Apply 1D transform along each spatial axis) for (int d = 1; d < ndim; ++d) { bool is_final_pass = (d == ndim - 1); @@ -544,6 +695,7 @@ std::tuple run_separable_ndim(torch::Tensor input) int64_t total_slices = dist_in.numel() / L; int threads = std::min((int64_t)MAX_THREADS, L); + // Choose between Shared Memory or Global Memory kernel based on dimension size if (L <= SMEM_LIMIT_ELEMENTS) { size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); if (is_final_pass) { @@ -614,13 +766,13 @@ std::tuple distance_transform_cuda(torch::Tensor i else if (dims == 4) { int64_t dim1 = input.size(1); - // [Fast Path for Benchmark]: (Batch, 1, H, W) -> 2D JFA + // [Fast Path]: (Batch, 1, H, W) -> Treat as 2D JFA if (dim1 == 1) { int64_t H = input.size(-2); int64_t W = input.size(-1); return run_jfa_2d(input, H, W, grid, block, numel); } - // [Correct Path for Pytest]: (Batch, Depth, H, W) -> 3D JFA + // [Standard Path]: (Batch, Depth, H, W) -> Use 3D JFA else { int64_t D = dim1; int64_t H = input.size(-2); @@ -630,7 +782,7 @@ std::tuple distance_transform_cuda(torch::Tensor i } // ------------------------------------------------------------------ - // CASE 3: 3D Tensor -> (Batch, H, W) -> 2D JFA + // CASE 3: 3D Tensor -> (Batch, H, W) -> Use 2D JFA // ------------------------------------------------------------------ else if (dims == 3) { int64_t H = input.size(-2); @@ -639,14 +791,14 @@ std::tuple distance_transform_cuda(torch::Tensor i } // ------------------------------------------------------------------ - // CASE 4: 2D Tensor -> (Batch, Length) -> 1D JFA (via 2D) + // CASE 4: 2D Tensor -> (Batch, Length) -> 1D JFA (via 2D wrapper) // ------------------------------------------------------------------ else if (dims == 2) { int64_t H = 1; int64_t W = input.size(-1); auto result = run_jfa_2d(input, H, W, grid, block, numel); - // Fix for 1D test: slice out the dummy Y coordinate + // Post-process for 1D: slice out the dummy Y coordinate torch::Tensor 
dist = std::get<0>(result); torch::Tensor idx_2d = std::get<1>(result); auto idx_1d = idx_2d.slice(/*dim=*/-1, /*start=*/1, /*end=*/2).contiguous(); From f91761a86366a0c155b96d4146e1a2c95196b0f7 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Wed, 24 Dec 2025 04:59:38 +0800 Subject: [PATCH 44/56] speed up 3D --- torchmorph/csrc/distance_transform_kernel.cu | 529 ++++++++++++------- 1 file changed, 331 insertions(+), 198 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 64816c5..09d4cbc 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -17,35 +17,41 @@ #define MAX_THREADS 1024 #define SMEM_LIMIT_ELEMENTS 4096 -// Configuration for 2D Optimized Block-JFA -#define JFA_BLOCK_DIM 32 // Tile size: 32x32 -#define JFA_FUSED_STEPS 4 // Fused steps: 1, 2, 4, 8 -#define JFA_MAX_OFFSET 8 // Max offset processed in shared memory (Step 8) -#define JFA_SMEM_DIM (JFA_BLOCK_DIM + 2 * JFA_MAX_OFFSET) // Shared mem size: 48x48 (includes halo) +#define JFA_BLOCK_DIM 32 +#define JFA_FUSED_STEPS 4 +#define JFA_MAX_OFFSET 8 +#define JFA_SMEM_DIM (JFA_BLOCK_DIM + 2 * JFA_MAX_OFFSET) + +// 3D Config +#define JFA_3D_BLOCK 8 +#define JFA_3D_HALO 1 // ------------------------------------------------------------------ // Device Helpers // ------------------------------------------------------------------ __device__ __forceinline__ float sqr(float x) { return x * x; } -// Helper for JFA 2D calculation +// Helper for JFA 2D/3D (Standard) __device__ __forceinline__ float dist_sq_2d(int y1, int x1, int y2, int x2) { return sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); } -// Helper for JFA 3D calculation -__device__ __forceinline__ float dist_sq_3d(int z1, int y1, int x1, int z2, int y2, int x2) { - return sqr((float)(z1 - z2)) + sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); +// Helper for SoA 3D (Z, Y, X separate) +__device__ __forceinline__ float 
dist_sq_3d_soa(int z1, int y1, int x1, int z2, int y2, int x2) { + if (z2 == -1) return INF_VAL; + float dz = (float)(z1 - z2); + float dy = (float)(y1 - y2); + float dx = (float)(x1 - x2); + return dz*dz + dy*dy + dx*dx; } -// Helper for Separable 1D cost calculation +// Helper for Separable 1D __device__ __forceinline__ float compute_cost(int q, int p, float val_p) { if (p < 0 || val_p >= INF_VAL) return INF_VAL; return sqr((float)q - (float)p) + val_p; } -// Device Helpers for int2 (Vectorized Coordinate) -// seed.x represents Y, seed.y represents X +// Device Helpers for int2 (2D Vectorized) __device__ __forceinline__ float dist_sq_int2(int y, int x, int2 seed) { if (seed.x == -1) return INF_VAL; float dy = (float)(y - seed.x); @@ -54,15 +60,13 @@ __device__ __forceinline__ float dist_sq_int2(int y, int x, int2 seed) { } // ================================================================== -// PART 1: JFA KERNELS (Optimized for 2D with Block-Shared Memory) +// PART 1: JFA KERNELS 2D (Vectorized int2 + Block Shared) // ================================================================== -// --- 2D Initialization (Vectorized int2) --- -// Initializes the coordinate map. Pixels with value 0 become seeds. __global__ void init_jfa_kernel_2d_opt( const float* __restrict__ input, - int2* __restrict__ output, // Output treats (y,x) pair as a single int2 - int64_t total_elements, // Total pixels (B*H*W) + int2* __restrict__ output, + int64_t total_elements, int H, int W ) { int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -73,57 +77,44 @@ __global__ void init_jfa_kernel_2d_opt( int64_t rem = tid % spatial_size; int w = (int)(rem % W); int h = (int)(rem / W); - // Store coordinates: .x=y (height), .y=x (width) output[tid] = make_int2(h, w); } else { output[tid] = make_int2(-1, -1); } } -// --- 2D Block-JFA Fused Step --- -// Innovation: Performs Steps 1, 2, 4, and 8 entirely within Shared Memory. 
-// Uses a "Halo" (Apron) region to avoid boundary checks during iteration. __global__ void jfa_block_fused_kernel_2d( const int2* __restrict__ in_idx, int2* __restrict__ out_idx, int H, int W, - int64_t num_images // Batch Size + int64_t num_images ) { - // Shared Memory: 48x48 int2 array (~18KB) - // Covers the 32x32 block plus an 8-pixel halo on all sides. __shared__ int2 smem[JFA_SMEM_DIM][JFA_SMEM_DIM]; - int tx = threadIdx.x; // 0..31 - int ty = threadIdx.y; // 0..31 + int tx = threadIdx.x; + int ty = threadIdx.y; - // Global Block Indices int bx = blockIdx.x * blockDim.x; int by = blockIdx.y * blockDim.y; - int img_idx = blockIdx.z; int64_t batch_offset = (int64_t)img_idx * (H * W); int gx = bx + tx; int gy = by + ty; - // --- Phase 1: Cooperative Load to Shared Memory (Tile + Halo) --- - + // Phase 1: load data to Shared Memory int smem_linear_size = JFA_SMEM_DIM * JFA_SMEM_DIM; int total_threads = blockDim.x * blockDim.y; int thread_linear_idx = ty * blockDim.x + tx; - // Base coordinates for the top-left corner of the Halo region int base_x = bx - JFA_MAX_OFFSET; int base_y = by - JFA_MAX_OFFSET; - // Loop to fill the entire shared memory buffer (larger than block size) for (int i = thread_linear_idx; i < smem_linear_size; i += total_threads) { int s_y = i / JFA_SMEM_DIM; int s_x = i % JFA_SMEM_DIM; - int global_y = base_y + s_y; int global_x = base_x + s_x; - int2 val = make_int2(-1, -1); if (global_y >= 0 && global_y < H && global_x >= 0 && global_x < W) { val = in_idx[batch_offset + global_y * W + global_x]; @@ -132,10 +123,8 @@ __global__ void jfa_block_fused_kernel_2d( } __syncthreads(); - // --- Phase 2: Iterative JFA in Shared Memory --- - // Only process valid pixels within the image bounds + // Phase 2: Iterate in Shared Memory if (gx < W && gy < H) { - // Map thread to the center region of Shared Memory int center_sy = ty + JFA_MAX_OFFSET; int center_sx = tx + JFA_MAX_OFFSET; @@ -143,21 +132,14 @@ __global__ void jfa_block_fused_kernel_2d( float 
best_dist = dist_sq_int2(gy, gx, best_seed); int step = 1; - - // Unroll steps 1, 2, 4, 8 #pragma unroll for (int k = 0; k < JFA_FUSED_STEPS; ++k) { - #pragma unroll for (int dy = -1; dy <= 1; ++dy) { #pragma unroll for (int dx = -1; dx <= 1; ++dx) { if (dy == 0 && dx == 0) continue; - - // Access neighbor in SMEM directly. - // No boundary check needed because the Halo covers the max offset (8). int2 neighbor_seed = smem[center_sy + dy * step][center_sx + dx * step]; - if (neighbor_seed.x != -1) { float d = dist_sq_int2(gy, gx, neighbor_seed); if (d < best_dist) { @@ -172,14 +154,10 @@ __global__ void jfa_block_fused_kernel_2d( __syncthreads(); step *= 2; } - - // --- Phase 3: Write results back to Global Memory --- out_idx[batch_offset + gy * W + gx] = best_seed; } } -// --- 2D Global Step (Vectorized int2) --- -// Handles larger steps (16, 32, ...) that exceed Shared Memory capacity. __global__ void jfa_step_global_2d_opt( const int2* __restrict__ in_idx, int2* __restrict__ out_idx, @@ -203,7 +181,7 @@ __global__ void jfa_step_global_2d_opt( for (int dy = -1; dy <= 1; ++dy) { #pragma unroll for (int dx = -1; dx <= 1; ++dx) { - if (dx == 0 && dy == 0) continue; + if (dx == 0 && dy == 0) continue; int ny = h + dy * step; int nx = w + dx * step; @@ -223,7 +201,6 @@ __global__ void jfa_step_global_2d_opt( out_idx[tid] = best_seed; } -// --- 2D Final Distance Calculation --- __global__ void calc_dist_kernel_2d_opt( const int2* __restrict__ indices, float* __restrict__ dist_out, @@ -245,12 +222,17 @@ __global__ void calc_dist_kernel_2d_opt( } } -// --- 3D JFA Initialization --- +// ================================================================== +// PART 2: JFA KERNELS 3D (Optimized SoA Layout) +// ================================================================== + template -__global__ void init_jfa_kernel_3d( +__global__ void init_jfa_kernel_3d_soa( const float* __restrict__ input, - IndexType* __restrict__ indices, - int64_t total_elements, + IndexType* 
__restrict__ indices_z, + IndexType* __restrict__ indices_y, + IndexType* __restrict__ indices_x, + int64_t total_elements, int D, int H, int W ) { int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -262,23 +244,150 @@ __global__ void init_jfa_kernel_3d( int w = (int)(rem % W); int h = (int)((rem / W) % H); int d = (int)(rem / (W * H)); - int64_t idx_ptr = tid * 3; - indices[idx_ptr + 0] = (IndexType)d; - indices[idx_ptr + 1] = (IndexType)h; - indices[idx_ptr + 2] = (IndexType)w; + + indices_z[tid] = (IndexType)d; + indices_y[tid] = (IndexType)h; + indices_x[tid] = (IndexType)w; } else { - int64_t idx_ptr = tid * 3; - indices[idx_ptr + 0] = (IndexType)-1; - indices[idx_ptr + 1] = (IndexType)-1; - indices[idx_ptr + 2] = (IndexType)-1; + indices_z[tid] = (IndexType)-1; + indices_y[tid] = (IndexType)-1; + indices_x[tid] = (IndexType)-1; } } -// --- 3D JFA Step --- template -__global__ void jfa_step_3d( - const IndexType* __restrict__ in_idx, - IndexType* __restrict__ out_idx, +__global__ void jfa_block_fused_kernel_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + IndexType* __restrict__ out_z, + IndexType* __restrict__ out_y, + IndexType* __restrict__ out_x, + int D, int H, int W, + int blocks_per_d +) { + const int BLOCK_DIM = 8; + const int HALO = 3; + const int SMEM_DIM = BLOCK_DIM + 2 * HALO; // 14 + const int SMEM_SIZE = SMEM_DIM * SMEM_DIM * SMEM_DIM; + + extern __shared__ char smem_raw[]; + IndexType* smem_z = (IndexType*)smem_raw; + IndexType* smem_y = smem_z + SMEM_SIZE; + IndexType* smem_x = smem_y + SMEM_SIZE; + + int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z; + + int b_z_total = blockIdx.z; + int batch_id = b_z_total / blocks_per_d; + int b_z_local = b_z_total % blocks_per_d; + + int bx = blockIdx.x * BLOCK_DIM; + int by = blockIdx.y * BLOCK_DIM; + int bz = b_z_local * BLOCK_DIM; + + int64_t spatial_offset = (int64_t)batch_id * (D * H * W); + + // Phase 1: 
Load to SoA Shared Memory + int tid = tz * 64 + ty * 8 + tx; + int base_x = bx - HALO; + int base_y = by - HALO; + int base_z = bz - HALO; + + for (int i = tid; i < SMEM_SIZE; i += 512) { + int temp = i; + int sx = temp % SMEM_DIM; temp /= SMEM_DIM; + int sy = temp % SMEM_DIM; + int sz = temp / SMEM_DIM; + + int gx = base_x + sx; + int gy = base_y + sy; + int gz = base_z + sz; + + IndexType val_z = -1, val_y = -1, val_x = -1; + if (gz >= 0 && gz < D && gy >= 0 && gy < H && gx >= 0 && gx < W) { + int64_t idx = spatial_offset + (int64_t)gz * (H * W) + gy * W + gx; + val_z = in_z[idx]; + val_y = in_y[idx]; + val_x = in_x[idx]; + } + smem_z[i] = val_z; + smem_y[i] = val_y; + smem_x[i] = val_x; + } + __syncthreads(); + + // Phase 2: Compute + int center_sz = tz + HALO; + int center_sy = ty + HALO; + int center_sx = tx + HALO; + int my_s_idx = (center_sz * SMEM_DIM + center_sy) * SMEM_DIM + center_sx; + + int best_z = (int)smem_z[my_s_idx]; + int best_y = (int)smem_y[my_s_idx]; + int best_x = (int)smem_x[my_s_idx]; + + int g_cz = bz + tz; + int g_cy = by + ty; + int g_cx = bx + tx; + + float best_dist = dist_sq_3d_soa(g_cz, g_cy, g_cx, best_z, best_y, best_x); + + int step = 1; + #pragma unroll + for (int k = 0; k < 2; ++k) { + #pragma unroll + for (int dz = -1; dz <= 1; ++dz) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dz == 0 && dy == 0 && dx == 0) continue; + + int nz = center_sz + dz * step; + int ny = center_sy + dy * step; + int nx = center_sx + dx * step; + int n_idx = (nz * SMEM_DIM + ny) * SMEM_DIM + nx; + + int sz_in = (int)smem_z[n_idx]; + if (sz_in != -1) { + int sy_in = (int)smem_y[n_idx]; + int sx_in = (int)smem_x[n_idx]; + float d = dist_sq_3d_soa(g_cz, g_cy, g_cx, sz_in, sy_in, sx_in); + if (d < best_dist) { + best_dist = d; + best_z = sz_in; + best_y = sy_in; + best_x = sx_in; + } + } + } + } + } + __syncthreads(); + smem_z[my_s_idx] = (IndexType)best_z; + smem_y[my_s_idx] = 
(IndexType)best_y; + smem_x[my_s_idx] = (IndexType)best_x; + __syncthreads(); + step *= 2; + } + + if (g_cz < D && g_cy < H && g_cx < W) { + int64_t out_idx_g = spatial_offset + (int64_t)g_cz * (H * W) + g_cy * W + g_cx; + out_z[out_idx_g] = (IndexType)best_z; + out_y[out_idx_g] = (IndexType)best_y; + out_x[out_idx_g] = (IndexType)best_x; + } +} + +template +__global__ void jfa_step_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + IndexType* __restrict__ out_z, + IndexType* __restrict__ out_y, + IndexType* __restrict__ out_x, int step, int D, int H, int W, int64_t total_pixels @@ -289,12 +398,15 @@ __global__ void jfa_step_3d( int64_t spatial_size = (int64_t)D * H * W; int64_t rem = tid % spatial_size; int64_t batch_offset = tid - rem; - int w = (int)(rem % W); - int h = (int)((rem / W) % H); - int d = (int)(rem / (W * H)); + int cur_w = (int)(rem % W); + int cur_h = (int)((rem / W) % H); + int cur_d = (int)(rem / (W * H)); - int best_z = -1, best_y = -1, best_x = -1; - float best_dist = INF_VAL; + int best_z = (int)in_z[tid]; + int best_y = (int)in_y[tid]; + int best_x = (int)in_x[tid]; + + float best_dist = dist_sq_3d_soa(cur_d, cur_h, cur_w, best_z, best_y, best_x); #pragma unroll for (int dz = -1; dz <= 1; ++dz) { @@ -302,37 +414,47 @@ __global__ void jfa_step_3d( for (int dy = -1; dy <= 1; ++dy) { #pragma unroll for (int dx = -1; dx <= 1; ++dx) { - int nz = d + dz * step; - int ny = h + dy * step; - int nx = w + dx * step; + if (dz == 0 && dy == 0 && dx == 0) continue; + + int nz = cur_d + dz * step; + int ny = cur_h + dy * step; + int nx = cur_w + dx * step; + if (nz >= 0 && nz < D && ny >= 0 && ny < H && nx >= 0 && nx < W) { - int64_t n_ptr = (batch_offset + (int64_t)nz * (H * W) + ny * W + nx) * 3; - int seed_z = (int)in_idx[n_ptr + 0]; + int64_t n_idx = batch_offset + (int64_t)nz * (H * W) + ny * W + nx; + + int seed_z = (int)in_z[n_idx]; if (seed_z != -1) { - int seed_y = 
(int)in_idx[n_ptr + 1]; - int seed_x = (int)in_idx[n_ptr + 2]; - float dist = dist_sq_3d(d, h, w, seed_z, seed_y, seed_x); - if (dist < best_dist) { - best_dist = dist; - best_z = seed_z; - best_y = seed_y; - best_x = seed_x; + float dz_val = (float)(cur_d - seed_z); + float dz_sq = dz_val * dz_val; + + if (dz_sq < best_dist) { + int seed_y = (int)in_y[n_idx]; + int seed_x = (int)in_x[n_idx]; + float dist = dz_sq + sqr((float)(cur_h - seed_y)) + sqr((float)(cur_w - seed_x)); + + if (dist < best_dist) { + best_dist = dist; + best_z = seed_z; + best_y = seed_y; + best_x = seed_x; + } } } } } } } - int64_t out_ptr = tid * 3; - out_idx[out_ptr + 0] = (IndexType)best_z; - out_idx[out_ptr + 1] = (IndexType)best_y; - out_idx[out_ptr + 2] = (IndexType)best_x; + out_z[tid] = (IndexType)best_z; + out_y[tid] = (IndexType)best_y; + out_x[tid] = (IndexType)best_x; } -// --- 3D JFA Distance Calculation --- template -__global__ void calc_dist_kernel_3d( - const IndexType* __restrict__ indices, +__global__ void calc_dist_kernel_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, float* __restrict__ dist_out, int64_t total_elements, int D, int H, int W @@ -340,26 +462,27 @@ __global__ void calc_dist_kernel_3d( int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= total_elements) return; - int64_t idx_ptr = tid * 3; - int seed_d = (int)indices[idx_ptr + 0]; - if (seed_d == -1) { dist_out[tid] = INF_VAL; return; } - int seed_h = (int)indices[idx_ptr + 1]; - int seed_w = (int)indices[idx_ptr + 2]; - - int64_t spatial_size = (int64_t)D * H * W; - int64_t rem = tid % spatial_size; - int cur_w = (int)(rem % W); - int cur_h = (int)((rem / W) % H); - int cur_d = (int)(rem / (W * H)); + int seed_d = (int)in_z[tid]; + if (seed_d == -1) { + dist_out[tid] = INF_VAL; + } else { + int seed_h = (int)in_y[tid]; + int seed_w = (int)in_x[tid]; - dist_out[tid] = sqrtf(dist_sq_3d(cur_d, cur_h, cur_w, seed_d, seed_h, 
seed_w)); + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)((rem / W) % H); + int cur_d = (int)(rem / (W * H)); + + dist_out[tid] = sqrtf(dist_sq_3d_soa(cur_d, cur_h, cur_w, seed_d, seed_h, seed_w)); + } } // ================================================================== -// PART 2: SEPARABLE N-DIM KERNELS (For 4D+ Spatial) +// PART 3: SEPARABLE N-DIM KERNELS // ================================================================== -// Core logic for 1D Voronoi scan __device__ void run_separable_scan_core( int N, int tid, @@ -367,14 +490,12 @@ __device__ void run_separable_scan_core( int* __restrict__ idx_curr, int* __restrict__ idx_next ) { - // 1. Initialization for (int i = tid; i < N; i += blockDim.x) { if (vals[i] >= INF_VAL * 0.9f) idx_curr[i] = -1; else idx_curr[i] = i; } __syncthreads(); - // 2. Iterative Propagation (Logarithmic steps) int* idx_in = idx_curr; int* idx_out = idx_next; @@ -385,7 +506,6 @@ __device__ void run_separable_scan_core( if (my_best_p != -1) min_cost = compute_cost(i, my_best_p, vals[my_best_p]); - // Check Left Neighbor int left = i - step; if (left >= 0) { int left_p = idx_in[left]; @@ -395,7 +515,6 @@ __device__ void run_separable_scan_core( } } - // Check Right Neighbor int right = i + step; if (right < N) { int right_p = idx_in[right]; @@ -406,19 +525,16 @@ __device__ void run_separable_scan_core( } idx_out[i] = my_best_p; } - // Swap buffers int* temp = idx_in; idx_in = idx_out; idx_out = temp; __syncthreads(); } - // 3. 
Final Copy Back (ensure result is in idx_curr) if (idx_in != idx_curr) { for (int i = tid; i < N; i += blockDim.x) idx_curr[i] = idx_next[i]; __syncthreads(); } } -// Separable Kernel: Optimized using Shared Memory template __global__ void separable_kernel_shared( const float* __restrict__ in_data, @@ -433,13 +549,11 @@ __global__ void separable_kernel_shared( int64_t offset = row_idx * L; if (offset >= total_elements) return; - // Dynamic Shared Memory allocation extern __shared__ char s_buffer[]; float* s_vals = (float*)s_buffer; int* s_idx1 = (int*)(s_vals + L); int* s_idx2 = (int*)(s_idx1 + L); - // Load data for (int i = threadIdx.x; i < L; i += blockDim.x) { s_vals[i] = __ldg(&in_data[offset + i]); } @@ -447,7 +561,6 @@ __global__ void separable_kernel_shared( run_separable_scan_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); - // Write back for (int q = threadIdx.x; q < L; q += blockDim.x) { int p = s_idx1[q]; float dist_val; @@ -473,7 +586,6 @@ __global__ void separable_kernel_shared( } } -// Separable Kernel: Global Memory Fallback (when dim size > Shared Mem) template __global__ void separable_kernel_global( const float* __restrict__ in_data, @@ -520,7 +632,6 @@ __global__ void separable_kernel_global( } } -// Separable Initialization __global__ void init_indices_separable_kernel( int32_t* indices, int64_t total_pixels, @@ -542,37 +653,25 @@ __global__ void init_indices_separable_kernel( } // ================================================================== -// PART 3: DISPATCH HELPERS +// PART 4: DISPATCH HELPERS // ================================================================== -// --- JFA 2D Dispatch --- std::tuple run_jfa_2d( torch::Tensor input, int64_t H, int64_t W, int grid, int block, int64_t numel ) { - // Force Int32 to enable int2 vectorized loads/stores auto index_opts = input.options().dtype(torch::kInt32); - - // Create Double Buffer indices: (Batch, H, W, 2) auto idx_shape = input.sizes().vec(); idx_shape.push_back(2); auto curr_idx = 
torch::empty(idx_shape, index_opts); auto next_idx = torch::empty(idx_shape, index_opts); - // Cast int32 pointer to int2 pointer. - // Memory layout matches: consecutive pairs of (y, x) form int2. int2* d_curr = (int2*)curr_idx.data_ptr(); int2* d_next = (int2*)next_idx.data_ptr(); - // 1. Initialization Kernel - // numel equals the number of int2 elements (pixels) init_jfa_kernel_2d_opt<<>>( - input.data_ptr(), - d_curr, - numel, H, W + input.data_ptr(), d_curr, numel, H, W ); - // 2. Block-JFA Fused Kernel (Optimized) - // Runs Steps 1, 2, 4, 8 inside Shared Memory { dim3 dimBlock(JFA_BLOCK_DIM, JFA_BLOCK_DIM); int64_t batch_size = numel / (H * W); @@ -580,91 +679,149 @@ std::tuple run_jfa_2d( (H + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, batch_size); - jfa_block_fused_kernel_2d<<>>( - d_curr, - d_next, - H, W, batch_size - ); - std::swap(d_curr, d_next); // d_curr now holds result of Step 8 - std::swap(curr_idx, next_idx); // Keep Tensor pointers in sync + jfa_block_fused_kernel_2d<<>>(d_curr, d_next, H, W, batch_size); + std::swap(d_curr, d_next); + std::swap(curr_idx, next_idx); } - // 3. Global Loop (Steps 16, 32...) int max_dim = std::max((int)H, (int)W); int step = 16; while (step < max_dim) { - jfa_step_global_2d_opt<<>>( - d_curr, - d_next, - step, - H, W, numel - ); + jfa_step_global_2d_opt<<>>(d_curr, d_next, step, H, W, numel); std::swap(d_curr, d_next); std::swap(curr_idx, next_idx); step *= 2; } - // 4. Final Distance Calculation auto final_dist = torch::empty_like(input); - calc_dist_kernel_2d_opt<<>>( - d_curr, - final_dist.data_ptr(), - numel, H, W - ); + calc_dist_kernel_2d_opt<<>>(d_curr, final_dist.data_ptr(), numel, H, W); return std::make_tuple(final_dist, curr_idx); } + std::tuple run_jfa_3d( torch::Tensor input, int64_t D, int64_t H, int64_t W, int grid, int block, int64_t numel ) { bool use_int16 = (D < 32767 && H < 32767 && W < 32767); auto index_opts = input.options().dtype(use_int16 ? 
torch::kInt16 : torch::kInt32); - auto idx_shape = input.sizes().vec(); - idx_shape.push_back(3); - auto curr_idx = torch::empty(idx_shape, index_opts); - auto next_idx = torch::empty(idx_shape, index_opts); + + int64_t batch = numel / (D * H * W); + + // (3, Batch, D, H, W) + auto curr_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); + auto next_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); + + void* d_curr = curr_idx_soa.data_ptr(); + void* d_next = next_idx_soa.data_ptr(); + int64_t plane_stride = numel; // B*D*H*W + + // 1. Init + if (use_int16) { + int16_t* ptr = (int16_t*)d_curr; + init_jfa_kernel_3d_soa<<>>( + input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W + ); + } else { + int32_t* ptr = (int32_t*)d_curr; + init_jfa_kernel_3d_soa<<>>( + input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W + ); + } - if (use_int16) init_jfa_kernel_3d<<>>(input.data_ptr(), (int16_t*)curr_idx.data_ptr(), numel, D, H, W); - else init_jfa_kernel_3d<<>>(input.data_ptr(), (int32_t*)curr_idx.data_ptr(), numel, D, H, W); + // 2. Fused Steps + int block_dim = 8; + int blocks_per_d = (D + block_dim - 1) / block_dim; + dim3 fused_block(block_dim, block_dim, block_dim); + dim3 fused_grid((W + block_dim - 1) / block_dim, (H + block_dim - 1) / block_dim, blocks_per_d * batch); + size_t smem_bytes = (14*14*14) * 3 * (use_int16 ? 2 : 4); + + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + int16_t* n = (int16_t*)d_next; + jfa_block_fused_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + D, H, W, blocks_per_d + ); + } else { + int32_t* c = (int32_t*)d_curr; + int32_t* n = (int32_t*)d_next; + jfa_block_fused_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + D, H, W, blocks_per_d + ); + } + std::swap(d_curr, d_next); + // 3. 
Global Steps int max_dim = std::max({(int)D, (int)H, (int)W}); - int step = 1; - while (step < max_dim) step *= 2; - step /= 2; - - while (step >= 1) { - if (use_int16) jfa_step_3d<<>>((int16_t*)curr_idx.data_ptr(), (int16_t*)next_idx.data_ptr(), step, D, H, W, numel); - else jfa_step_3d<<>>((int32_t*)curr_idx.data_ptr(), (int32_t*)next_idx.data_ptr(), step, D, H, W, numel); - std::swap(curr_idx, next_idx); - step /= 2; + int step = 4; + while (step < max_dim) { + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + int16_t* n = (int16_t*)d_next; + jfa_step_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + step, D, H, W, numel + ); + } else { + int32_t* c = (int32_t*)d_curr; + int32_t* n = (int32_t*)d_next; + jfa_step_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + step, D, H, W, numel + ); + } + std::swap(d_curr, d_next); + step *= 2; } + + // 4. Final Dist auto final_dist = torch::empty_like(input); - if (use_int16) calc_dist_kernel_3d<<>>((int16_t*)curr_idx.data_ptr(), final_dist.data_ptr(), numel, D, H, W); - else calc_dist_kernel_3d<<>>((int32_t*)curr_idx.data_ptr(), final_dist.data_ptr(), numel, D, H, W); + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + calc_dist_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + final_dist.data_ptr(), numel, D, H, W + ); + } else { + int32_t* c = (int32_t*)d_curr; + calc_dist_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + final_dist.data_ptr(), numel, D, H, W + ); + } - return std::make_tuple(final_dist, curr_idx); + // Permute result indices back to (Batch, D, H, W, 3) + torch::Tensor result_indices; + if (d_curr == curr_idx_soa.data_ptr()) result_indices = curr_idx_soa; + else result_indices = next_idx_soa; + + result_indices = result_indices.permute({1, 2, 3, 4, 0}).contiguous(); + + return std::make_tuple(final_dist, result_indices); } -// --- Separable N-Dim Dispatch --- 
std::tuple run_separable_ndim(torch::Tensor input) { TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Separable N-Dim input must be float32."); input = input.contiguous(); const int ndim = input.dim(); - const int sample_ndim = ndim - 1; // Assuming Dim 0 is Batch + const int sample_ndim = ndim - 1; TORCH_CHECK(sample_ndim > 0 && sample_ndim <= 8, "Unsupported dims for Separable EDT"); auto shape = input.sizes().vec(); int64_t num_pixels = input.numel(); - // 1. Init Distances auto current_dist = torch::where(input == 0, torch::tensor(0.0f, input.options()), torch::tensor(INF_VAL, input.options())); - // 2. Init Indices auto index_shape = shape; index_shape.push_back(sample_ndim); auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); @@ -681,7 +838,6 @@ std::tuple run_separable_ndim(torch::Tensor input) torch::Tensor global_buf1, global_buf2; - // 3. Dimensional Iteration (Apply 1D transform along each spatial axis) for (int d = 1; d < ndim; ++d) { bool is_final_pass = (d == ndim - 1); @@ -695,7 +851,6 @@ std::tuple run_separable_ndim(torch::Tensor input) int64_t total_slices = dist_in.numel() / L; int threads = std::min((int64_t)MAX_THREADS, L); - // Choose between Shared Memory or Global Memory kernel based on dimension size if (L <= SMEM_LIMIT_ELEMENTS) { size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); if (is_final_pass) { @@ -740,7 +895,7 @@ std::tuple run_separable_ndim(torch::Tensor input) } // ================================================================== -// PART 4: MAIN ENTRY POINT (INTEGRATED) +// PART 5: MAIN ENTRY POINT // ================================================================== std::tuple distance_transform_cuda(torch::Tensor input) { @@ -752,27 +907,16 @@ std::tuple distance_transform_cuda(torch::Tensor i int block = BLOCK_SIZE; int grid = (numel + block - 1) / block; - // ------------------------------------------------------------------ - // CASE 1: High-Dimension (5D+) -> Use Separable N-Dim 
Algorithm - // Input: (Batch, D1, D2, D3...) -> Treated as N-Dim spatial - // ------------------------------------------------------------------ if (dims >= 5) { return run_separable_ndim(input); } - - // ------------------------------------------------------------------ - // CASE 2: 4D Tensor -> (Batch, Dim1, H, W) - // ------------------------------------------------------------------ else if (dims == 4) { int64_t dim1 = input.size(1); - - // [Fast Path]: (Batch, 1, H, W) -> Treat as 2D JFA if (dim1 == 1) { int64_t H = input.size(-2); int64_t W = input.size(-1); return run_jfa_2d(input, H, W, grid, block, numel); } - // [Standard Path]: (Batch, Depth, H, W) -> Use 3D JFA else { int64_t D = dim1; int64_t H = input.size(-2); @@ -780,31 +924,20 @@ std::tuple distance_transform_cuda(torch::Tensor i return run_jfa_3d(input, D, H, W, grid, block, numel); } } - - // ------------------------------------------------------------------ - // CASE 3: 3D Tensor -> (Batch, H, W) -> Use 2D JFA - // ------------------------------------------------------------------ else if (dims == 3) { int64_t H = input.size(-2); int64_t W = input.size(-1); return run_jfa_2d(input, H, W, grid, block, numel); } - - // ------------------------------------------------------------------ - // CASE 4: 2D Tensor -> (Batch, Length) -> 1D JFA (via 2D wrapper) - // ------------------------------------------------------------------ else if (dims == 2) { int64_t H = 1; int64_t W = input.size(-1); auto result = run_jfa_2d(input, H, W, grid, block, numel); - - // Post-process for 1D: slice out the dummy Y coordinate torch::Tensor dist = std::get<0>(result); torch::Tensor idx_2d = std::get<1>(result); auto idx_1d = idx_2d.slice(/*dim=*/-1, /*start=*/1, /*end=*/2).contiguous(); return std::make_tuple(dist, idx_1d); } - else { TORCH_CHECK(false, "Unsupported dimensions."); return std::make_tuple(torch::Tensor(), torch::Tensor()); From 879eaa514490c83d86f946bed3b60b5da2671ac7 Mon Sep 17 00:00:00 2001 From: 
Yuhandeng Date: Sat, 3 Jan 2026 03:25:58 +0800 Subject: [PATCH 45/56] add 1D EDT test --- test/test_edt_1d.py | 154 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 test/test_edt_1d.py diff --git a/test/test_edt_1d.py b/test/test_edt_1d.py new file mode 100644 index 0000000..b81cd63 --- /dev/null +++ b/test/test_edt_1d.py @@ -0,0 +1,154 @@ +import unittest + +import torch + +import torchmorph + + +class Test1DEuclideanDistanceTransform(unittest.TestCase): + def setUp(self): + if not torch.cuda.is_available(): + self.skipTest("CUDA not available") + self.device = torch.device("cuda") + + def test_basic_features(self): + """Test with 32 elements and 3 feature points""" + print("\n=== Test 1: Basic Features (32 elements) ===") + input_tensor = torch.zeros(32, dtype=torch.float32, device=self.device) + input_tensor[0] = 1.0 + input_tensor[12] = 1.0 + input_tensor[31] = 1.0 + + dist, indices = torchmorph.distance_transform(input_tensor) + + # Check specific positions + self._check_position(dist, indices, 0, 0, 0) + self._check_position(dist, indices, 6, 0, 6) + self._check_position(dist, indices, 12, 12, 0) + self._check_position(dist, indices, 21, 12, 9) + self._check_position(dist, indices, 31, 31, 0) + + def test_multiple_features(self): + """Test with multiple feature points""" + print("\n=== Test 2: Multiple Features ===") + input_tensor = torch.zeros(32, dtype=torch.float32, device=self.device) + features = [0, 3, 7, 10, 14, 21, 31] + input_tensor[features] = 1.0 + + dist, indices = torchmorph.distance_transform(input_tensor) + + for pos in range(32): + self._verify_nearest(pos, dist, indices, features) + + def test_batch_processing(self): + """Test with 2D array (batch of 1D rows)""" + print("\n=== Test 3: Batch Processing (4x32) ===") + input_tensor = torch.zeros(4, 32, dtype=torch.float32, device=self.device) + input_tensor[0, [0, 15, 31]] = 1.0 + input_tensor[1, [5, 10, 20]] = 1.0 + input_tensor[2, [8, 24]] = 
1.0 + input_tensor[3, [16]] = 1.0 + + dist, indices = torchmorph.distance_transform(input_tensor) + self.assertEqual(dist.shape, (4, 32)) + + # Check row 0 + features_row0 = [0, 15, 31] + for pos in range(32): + self._verify_nearest(pos, dist[0], indices[0], features_row0) + + def test_boundary_conditions(self): + """Test empty and full feature arrays""" + print("\n=== Test 4: Boundary Conditions ===") + # No features + input_empty = torch.zeros(32, dtype=torch.float32, device=self.device) + dist_empty, idx_empty = torchmorph.distance_transform(input_empty) + self.assertTrue(torch.all(dist_empty > 1000)) # Should be large/inf + self.assertTrue(torch.all(idx_empty == -1)) + + # All features + input_full = torch.ones(32, dtype=torch.float32, device=self.device) + dist_full, idx_full = torchmorph.distance_transform(input_full) + self.assertTrue(torch.all(dist_full == 0)) + expected_idx = torch.arange(32, device=self.device, dtype=torch.int32).unsqueeze(-1) + self.assertTrue(torch.all(idx_full == expected_idx)) + + def test_large_array(self): + """Test large array to verify cross-tile propagation""" + print("\n=== Test 5: Large Array (1024 elements) ===") + input_tensor = torch.zeros(1024, dtype=torch.float32, device=self.device) + features = [0, 512, 1023] + input_tensor[features] = 1.0 + + dist, indices = torchmorph.distance_transform(input_tensor) + + test_positions = [0, 256, 512, 768, 1023] + for pos in test_positions: + self._verify_nearest(pos, dist, indices.squeeze(), features) + + def test_cross_tile_boundary(self): + """Test propagation across tile boundaries""" + print("\n=== Test 6: Cross-Tile Propagation ===") + # 768 elements (3 tiles of 256) + input_tensor = torch.zeros(768, dtype=torch.float32, device=self.device) + features = [100, 600] + input_tensor[features] = 1.0 + + dist, indices = torchmorph.distance_transform(input_tensor) + + # Check around boundaries (256, 512) + test_positions = [250, 255, 256, 260, 350, 500, 510, 512, 520] + for pos in 
test_positions: + self._verify_nearest(pos, dist, indices.squeeze(), features) + + def test_large_2d_batch(self): + """Test large 2D batch""" + print("\n=== Test 7: Large 2D Batch ===") + input_tensor = torch.zeros(3, 600, dtype=torch.float32, device=self.device) + rows_features = { + 0: [0, 299, 599], + 1: [150, 450], + 2: [300], + } + + for row, feats in rows_features.items(): + input_tensor[row, feats] = 1.0 + + dist, indices = torchmorph.distance_transform(input_tensor) + + # Verify specific points with dynamic calculation + test_cases = [ + (0, 150), + (0, 450), + (1, 300), + (2, 100), + (2, 500), + ] + + for row, pos in test_cases: + self._verify_nearest(pos, dist[row], indices[row].squeeze(), rows_features[row]) + + def _check_position(self, dist, indices, pos, expected_idx, expected_dist): + actual_dist = dist[pos].item() + actual_idx = indices[pos].item() if indices.ndim == 1 else indices[pos, 0].item() + + self.assertAlmostEqual(actual_dist, float(expected_dist), places=1) + self.assertEqual(actual_idx, expected_idx) + + def _verify_nearest(self, pos, dist, indices, features): + actual_dist = dist[pos].item() + nearest_idx = indices[pos].item() if indices.ndim == 1 else indices[pos].item() + + # Calculate ground truth dynamically + true_dists = [abs(pos - f) for f in features] + min_dist = min(true_dists) + candidates = [f for f, d in zip(features, true_dists) if d == min_dist] + + self.assertAlmostEqual( + actual_dist, float(min_dist), places=1, msg=f"Distance mismatch at {pos}" + ) + self.assertIn(nearest_idx, candidates, msg=f"Nearest index mismatch at {pos}") + + +if __name__ == "__main__": + unittest.main() From e4ff01382b831ba37fd5203b179e3795aa467d61 Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Sat, 3 Jan 2026 03:30:53 +0800 Subject: [PATCH 46/56] 1D EDT --- torchmorph/csrc/distance_transform_kernel.cu | 1176 +++++------------- 1 file changed, 293 insertions(+), 883 deletions(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu 
b/torchmorph/csrc/distance_transform_kernel.cu index 09d4cbc..59b1d99 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,945 +1,355 @@ #include -#include -#include -#include -#include -#include -#include #include -#include -#include - -// ------------------------------------------------------------------ -// Global Configuration -// ------------------------------------------------------------------ -#define BLOCK_SIZE 256 -#define INF_VAL 1e20f -#define MAX_THREADS 1024 -#define SMEM_LIMIT_ELEMENTS 4096 - -#define JFA_BLOCK_DIM 32 -#define JFA_FUSED_STEPS 4 -#define JFA_MAX_OFFSET 8 -#define JFA_SMEM_DIM (JFA_BLOCK_DIM + 2 * JFA_MAX_OFFSET) +#include -// 3D Config -#define JFA_3D_BLOCK 8 -#define JFA_3D_HALO 1 +// ============================================================================ +// 1D Euclidean Distance Transform - Optimized Warp-Level Parallel +// ============================================================================ +// +// Based on the paper's algorithm using: +// - __ballot_sync() for feature point voting +// - __shfl_sync() for warp-level communication (NO shared memory within warp) +// - Parallel reduction tree for cross-warp propagation +// - Time complexity: O(log32(n)) with O(n) total work +// +// ============================================================================ + +#define WARP_SIZE 32 +#define INF_VAL 1e9f -// ------------------------------------------------------------------ -// Device Helpers -// ------------------------------------------------------------------ __device__ __forceinline__ float sqr(float x) { return x * x; } -// Helper for JFA 2D/3D (Standard) -__device__ __forceinline__ float dist_sq_2d(int y1, int x1, int y2, int x2) { - return sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); -} - -// Helper for SoA 3D (Z, Y, X separate) -__device__ __forceinline__ float dist_sq_3d_soa(int z1, int y1, int x1, int z2, int y2, int x2) { - if (z2 == -1) return 
INF_VAL; - float dz = (float)(z1 - z2); - float dy = (float)(y1 - y2); - float dx = (float)(x1 - x2); - return dz*dz + dy*dy + dx*dx; -} - -// Helper for Separable 1D -__device__ __forceinline__ float compute_cost(int q, int p, float val_p) { - if (p < 0 || val_p >= INF_VAL) return INF_VAL; - return sqr((float)q - (float)p) + val_p; -} - -// Device Helpers for int2 (2D Vectorized) -__device__ __forceinline__ float dist_sq_int2(int y, int x, int2 seed) { - if (seed.x == -1) return INF_VAL; - float dy = (float)(y - seed.x); - float dx = (float)(x - seed.y); - return dy*dy + dx*dx; -} - -// ================================================================== -// PART 1: JFA KERNELS 2D (Vectorized int2 + Block Shared) -// ================================================================== - -__global__ void init_jfa_kernel_2d_opt( - const float* __restrict__ input, - int2* __restrict__ output, - int64_t total_elements, - int H, int W +// ============================================================================ +// Device Function: Find nearest feature to the LEFT using warp operations +// ============================================================================ +// +// Algorithm (as described in the paper, Figure 7a): +// 1. Each thread votes if it holds a feature point -> ballot() creates bitmask +// 2. Mask high (warpSize - lane - 1) bits with 0 +// 3. Use clz() to count leading zeros +// 4. Nearest thread lane = (warpSize - clz() - 1) +// 5. 
Use __shfl_sync() to get the feature index from that thread +// +// Returns: index of nearest feature to the left, or -1 if none exists +// ============================================================================ + +__device__ __forceinline__ int find_nearest_left_in_warp( + int lane, + int my_index, + unsigned int feature_mask ) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_elements) return; - - if (input[tid] == 0.0f) { - int64_t spatial_size = (int64_t)H * W; - int64_t rem = tid % spatial_size; - int w = (int)(rem % W); - int h = (int)(rem / W); - output[tid] = make_int2(h, w); - } else { - output[tid] = make_int2(-1, -1); + // Mask high bits: only keep features to the LEFT of current lane + unsigned int left_mask = feature_mask & ((1U << lane) - 1); + + // We must execute __shfl_sync for ALL threads in the warp + // Calculate nearest_lane if valid, otherwise use 0 (safe default) + int nearest_lane = 0; + if (left_mask != 0) { + nearest_lane = 31 - __clz(left_mask); } + + // Perform shuffle for ALL threads + int nearest_index = __shfl_sync(0xFFFFFFFF, my_index, nearest_lane); + + // Only return valid result if we actually found a feature + return (left_mask != 0) ? 
nearest_index : -1; } -__global__ void jfa_block_fused_kernel_2d( - const int2* __restrict__ in_idx, - int2* __restrict__ out_idx, - int H, int W, - int64_t num_images -) { - __shared__ int2 smem[JFA_SMEM_DIM][JFA_SMEM_DIM]; +// ============================================================================ +// Device Function: Find nearest feature to the RIGHT using warp operations +// ============================================================================ - int tx = threadIdx.x; - int ty = threadIdx.y; +__device__ __forceinline__ int find_nearest_right_in_warp( + int lane, + int my_index, + unsigned int feature_mask +) { + // Mask low bits: only keep features to the RIGHT of current lane + unsigned int right_mask = feature_mask & ~((1U << (lane + 1)) - 1); - int bx = blockIdx.x * blockDim.x; - int by = blockIdx.y * blockDim.y; - int img_idx = blockIdx.z; - int64_t batch_offset = (int64_t)img_idx * (H * W); - - int gx = bx + tx; - int gy = by + ty; - - // Phase 1: load data to Shared Memory - int smem_linear_size = JFA_SMEM_DIM * JFA_SMEM_DIM; - int total_threads = blockDim.x * blockDim.y; - int thread_linear_idx = ty * blockDim.x + tx; - - int base_x = bx - JFA_MAX_OFFSET; - int base_y = by - JFA_MAX_OFFSET; - - for (int i = thread_linear_idx; i < smem_linear_size; i += total_threads) { - int s_y = i / JFA_SMEM_DIM; - int s_x = i % JFA_SMEM_DIM; - int global_y = base_y + s_y; - int global_x = base_x + s_x; - int2 val = make_int2(-1, -1); - if (global_y >= 0 && global_y < H && global_x >= 0 && global_x < W) { - val = in_idx[batch_offset + global_y * W + global_x]; - } - smem[s_y][s_x] = val; + // Calculate nearest_lane if valid, otherwise use 0 (safe default) + int nearest_lane = 0; + if (right_mask != 0) { + nearest_lane = __ffs(right_mask) - 1; } - __syncthreads(); - - // Phase 2: Iterate in Shared Memory - if (gx < W && gy < H) { - int center_sy = ty + JFA_MAX_OFFSET; - int center_sx = tx + JFA_MAX_OFFSET; + + // Perform shuffle for ALL threads + int 
nearest_index = __shfl_sync(0xFFFFFFFF, my_index, nearest_lane); + + return (right_mask != 0) ? nearest_index : -1; +} - int2 best_seed = smem[center_sy][center_sx]; - float best_dist = dist_sq_int2(gy, gx, best_seed); +// ============================================================================ +// Warp Scan Helpers +// ============================================================================ - int step = 1; - #pragma unroll - for (int k = 0; k < JFA_FUSED_STEPS; ++k) { - #pragma unroll - for (int dy = -1; dy <= 1; ++dy) { - #pragma unroll - for (int dx = -1; dx <= 1; ++dx) { - if (dy == 0 && dx == 0) continue; - int2 neighbor_seed = smem[center_sy + dy * step][center_sx + dx * step]; - if (neighbor_seed.x != -1) { - float d = dist_sq_int2(gy, gx, neighbor_seed); - if (d < best_dist) { - best_dist = d; - best_seed = neighbor_seed; - } - } - } - } - __syncthreads(); - smem[center_sy][center_sx] = best_seed; - __syncthreads(); - step *= 2; +// Inclusive Max Scan for positive integers (returns max seen so far) +__device__ __forceinline__ int warp_scan_inclusive_max(int val, int width) { + // Hillis-Steele Scan (O(log N)) + #pragma unroll + for (int offset = 1; offset < 32; offset *= 2) { + int neighbor_val = __shfl_up_sync(0xFFFFFFFF, val, offset); + if (threadIdx.x % 32 >= offset) { + // Logic: take the max of current and neighbor + // Handle -1 (invalid) carefully: max behavior handles -1 naturally if features are >= 0 + if (neighbor_val > val) val = neighbor_val; } - out_idx[batch_offset + gy * W + gx] = best_seed; } + return val; } -__global__ void jfa_step_global_2d_opt( - const int2* __restrict__ in_idx, - int2* __restrict__ out_idx, - int step, - int H, int W, - int64_t total_pixels -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_pixels) return; - - int64_t spatial_size = (int64_t)H * W; - int64_t rem = tid % spatial_size; - int64_t batch_offset = tid - rem; - int w = (int)(rem % W); - int h = (int)(rem / W); - - int2 
best_seed = in_idx[tid]; - float best_dist = dist_sq_int2(h, w, best_seed); - +// Inclusive Min Scan (returns min seen so far from right) +// Note: We use __shfl_down_sync for suffix scan +__device__ __forceinline__ int warp_scan_suffix_min(int val, int width) { + // Suffix Scan (Right to Left) #pragma unroll - for (int dy = -1; dy <= 1; ++dy) { - #pragma unroll - for (int dx = -1; dx <= 1; ++dx) { - if (dx == 0 && dy == 0) continue; - - int ny = h + dy * step; - int nx = w + dx * step; - - if (ny >= 0 && ny < H && nx >= 0 && nx < W) { - int2 neighbor_seed = in_idx[batch_offset + ny * W + nx]; - if (neighbor_seed.x != -1) { - float d = dist_sq_int2(h, w, neighbor_seed); - if (d < best_dist) { - best_dist = d; - best_seed = neighbor_seed; - } - } + for (int offset = 1; offset < 32; offset *= 2) { + int neighbor_val = __shfl_down_sync(0xFFFFFFFF, val, offset); + // If we have a neighbor to the right + if ((threadIdx.x % 32) + offset < width) { + // Logic: take min. If current is -1 (invalid), take neighbor. + // If neighbor is -1, ignore it. 
+ if (val == -1) val = neighbor_val; + else if (neighbor_val != -1) { + if (neighbor_val < val) val = neighbor_val; } } } - out_idx[tid] = best_seed; -} - -__global__ void calc_dist_kernel_2d_opt( - const int2* __restrict__ indices, - float* __restrict__ dist_out, - int64_t total_elements, - int H, int W -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_elements) return; - - int2 s = indices[tid]; - if (s.x == -1) { - dist_out[tid] = INF_VAL; - } else { - int64_t spatial_size = (int64_t)H * W; - int64_t rem = tid % spatial_size; - int cur_w = (int)(rem % W); - int cur_h = (int)(rem / W); - dist_out[tid] = sqrtf(dist_sq_int2(cur_h, cur_w, s)); - } + return val; } -// ================================================================== -// PART 2: JFA KERNELS 3D (Optimized SoA Layout) -// ================================================================== - -template -__global__ void init_jfa_kernel_3d_soa( - const float* __restrict__ input, - IndexType* __restrict__ indices_z, - IndexType* __restrict__ indices_y, - IndexType* __restrict__ indices_x, - int64_t total_elements, - int D, int H, int W -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_elements) return; - if (input[tid] == 0.0f) { - int64_t spatial_size = (int64_t)D * H * W; - int64_t rem = tid % spatial_size; - int w = (int)(rem % W); - int h = (int)((rem / W) % H); - int d = (int)(rem / (W * H)); +// ============================================================================ +// Kernel: Optimized 1D EDT using Two-Level Tree reduction +// ============================================================================ - indices_z[tid] = (IndexType)d; - indices_y[tid] = (IndexType)h; - indices_x[tid] = (IndexType)w; - } else { - indices_z[tid] = (IndexType)-1; - indices_y[tid] = (IndexType)-1; - indices_x[tid] = (IndexType)-1; - } -} - -template -__global__ void jfa_block_fused_kernel_3d_soa( - const IndexType* __restrict__ in_z, - const IndexType* 
__restrict__ in_y, - const IndexType* __restrict__ in_x, - IndexType* __restrict__ out_z, - IndexType* __restrict__ out_y, - IndexType* __restrict__ out_x, - int D, int H, int W, - int blocks_per_d +__global__ void edt_1d_warp_optimized_kernel( + const float* __restrict__ d_input, + float* __restrict__ d_dist, + int32_t* __restrict__ d_indices, + int width, + int height ) { - const int BLOCK_DIM = 8; - const int HALO = 3; - const int SMEM_DIM = BLOCK_DIM + 2 * HALO; // 14 - const int SMEM_SIZE = SMEM_DIM * SMEM_DIM * SMEM_DIM; - - extern __shared__ char smem_raw[]; - IndexType* smem_z = (IndexType*)smem_raw; - IndexType* smem_y = smem_z + SMEM_SIZE; - IndexType* smem_x = smem_y + SMEM_SIZE; - - int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z; - - int b_z_total = blockIdx.z; - int batch_id = b_z_total / blocks_per_d; - int b_z_local = b_z_total % blocks_per_d; + int row = blockIdx.x; + if (row >= height) return; - int bx = blockIdx.x * BLOCK_DIM; - int by = blockIdx.y * BLOCK_DIM; - int bz = b_z_local * BLOCK_DIM; - - int64_t spatial_offset = (int64_t)batch_id * (D * H * W); - - // Phase 1: Load to SoA Shared Memory - int tid = tz * 64 + ty * 8 + tx; - int base_x = bx - HALO; - int base_y = by - HALO; - int base_z = bz - HALO; - - for (int i = tid; i < SMEM_SIZE; i += 512) { - int temp = i; - int sx = temp % SMEM_DIM; temp /= SMEM_DIM; - int sy = temp % SMEM_DIM; - int sz = temp / SMEM_DIM; - - int gx = base_x + sx; - int gy = base_y + sy; - int gz = base_z + sz; - - IndexType val_z = -1, val_y = -1, val_x = -1; - if (gz >= 0 && gz < D && gy >= 0 && gy < H && gx >= 0 && gx < W) { - int64_t idx = spatial_offset + (int64_t)gz * (H * W) + gy * W + gx; - val_z = in_z[idx]; - val_y = in_y[idx]; - val_x = in_x[idx]; + const float* row_input = d_input + row * width; + float* row_dist = d_dist + row * width; + int32_t* row_indices = d_indices + row * width; + + int tid = threadIdx.x; + int lane = tid % WARP_SIZE; + int warp_id = tid / WARP_SIZE; + int 
num_warps = blockDim.x / WARP_SIZE; + + // Shared memory for Inter-Warp Scan + // We use one buffer for the reduction result + __shared__ int s_warp_boundary[32]; + + // ======================================================================== + // PASS 1: Find nearest feature to the LEFT (Prefix Max Scan) + // ======================================================================== + + int global_left_feature = -1; + + for (int base = 0; base < width; base += blockDim.x) { + int i = base + tid; + bool is_valid = (i < width); + bool is_feature = is_valid && (row_input[i] > 0.5f); + + // 1. Warp-Level: Find local nearest + unsigned int feature_mask = __ballot_sync(0xFFFFFFFF, is_feature); + int my_index = is_feature ? i : -1; + int warp_left_feature = find_nearest_left_in_warp(lane, my_index, feature_mask); + + // 2. Prepare for Block-Level Scan: Write rightmost feature of this warp + int rightmost_lane = (feature_mask != 0) ? (31 - __clz(feature_mask)) : 0; + int rightmost_index = __shfl_sync(0xFFFFFFFF, my_index, rightmost_lane); + + if (lane == 0) { + s_warp_boundary[warp_id] = (feature_mask != 0) ? 
rightmost_index : -1; } - smem_z[i] = val_z; - smem_y[i] = val_y; - smem_x[i] = val_x; - } - __syncthreads(); - - // Phase 2: Compute - int center_sz = tz + HALO; - int center_sy = ty + HALO; - int center_sx = tx + HALO; - int my_s_idx = (center_sz * SMEM_DIM + center_sy) * SMEM_DIM + center_sx; - - int best_z = (int)smem_z[my_s_idx]; - int best_y = (int)smem_y[my_s_idx]; - int best_x = (int)smem_x[my_s_idx]; - - int g_cz = bz + tz; - int g_cy = by + ty; - int g_cx = bx + tx; - - float best_dist = dist_sq_3d_soa(g_cz, g_cy, g_cx, best_z, best_y, best_x); - - int step = 1; - #pragma unroll - for (int k = 0; k < 2; ++k) { - #pragma unroll - for (int dz = -1; dz <= 1; ++dz) { - #pragma unroll - for (int dy = -1; dy <= 1; ++dy) { - #pragma unroll - for (int dx = -1; dx <= 1; ++dx) { - if (dz == 0 && dy == 0 && dx == 0) continue; - - int nz = center_sz + dz * step; - int ny = center_sy + dy * step; - int nx = center_sx + dx * step; - int n_idx = (nz * SMEM_DIM + ny) * SMEM_DIM + nx; - - int sz_in = (int)smem_z[n_idx]; - if (sz_in != -1) { - int sy_in = (int)smem_y[n_idx]; - int sx_in = (int)smem_x[n_idx]; - float d = dist_sq_3d_soa(g_cz, g_cy, g_cx, sz_in, sy_in, sx_in); - if (d < best_dist) { - best_dist = d; - best_z = sz_in; - best_y = sy_in; - best_x = sx_in; - } - } - } + __syncthreads(); + + // 3. Block-Level: Warp 0 performs parallel prefix scan over warp boundaries + // This is the "Tree" part for inter-warp communication + if (warp_id == 0) { + // Load boundary from shared memory (only if valid warp) + int val = (lane < num_warps) ? s_warp_boundary[lane] : -1; + + // Perform inclusive max scan + int scan_res = warp_scan_inclusive_max(val, num_warps); + + // Write back inclusive scan result + if (lane < num_warps) { + s_warp_boundary[lane] = scan_res; } } __syncthreads(); - smem_z[my_s_idx] = (IndexType)best_z; - smem_y[my_s_idx] = (IndexType)best_y; - smem_x[my_s_idx] = (IndexType)best_x; + + // 4. 
Combine Results + if (is_valid) { + int left_feature = -1; + + if (is_feature) left_feature = i; + else if (warp_left_feature != -1) left_feature = warp_left_feature; + else { + // Look at the scan result from the PREVIOUS warp + if (warp_id > 0) { + left_feature = s_warp_boundary[warp_id - 1]; + } + + // If still -1, fallback to global history + if (left_feature == -1) left_feature = global_left_feature; + } + + // Store result + if (left_feature >= 0) { + row_dist[i] = sqr((float)(i - left_feature)); + row_indices[i] = left_feature; + } else { + row_dist[i] = INF_VAL; + row_indices[i] = -1; + } + } + + // 5. Update Global History + // The last warp's scan result contains the max index for the whole tile + int tile_max = s_warp_boundary[num_warps - 1]; + if (tile_max != -1) global_left_feature = tile_max; + __syncthreads(); - step *= 2; } - - if (g_cz < D && g_cy < H && g_cx < W) { - int64_t out_idx_g = spatial_offset + (int64_t)g_cz * (H * W) + g_cy * W + g_cx; - out_z[out_idx_g] = (IndexType)best_z; - out_y[out_idx_g] = (IndexType)best_y; - out_x[out_idx_g] = (IndexType)best_x; - } -} - -template -__global__ void jfa_step_3d_soa( - const IndexType* __restrict__ in_z, - const IndexType* __restrict__ in_y, - const IndexType* __restrict__ in_x, - IndexType* __restrict__ out_z, - IndexType* __restrict__ out_y, - IndexType* __restrict__ out_x, - int step, - int D, int H, int W, - int64_t total_pixels -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_pixels) return; - - int64_t spatial_size = (int64_t)D * H * W; - int64_t rem = tid % spatial_size; - int64_t batch_offset = tid - rem; - int cur_w = (int)(rem % W); - int cur_h = (int)((rem / W) % H); - int cur_d = (int)(rem / (W * H)); - - int best_z = (int)in_z[tid]; - int best_y = (int)in_y[tid]; - int best_x = (int)in_x[tid]; - float best_dist = dist_sq_3d_soa(cur_d, cur_h, cur_w, best_z, best_y, best_x); - - #pragma unroll - for (int dz = -1; dz <= 1; ++dz) { - #pragma unroll - for (int 
dy = -1; dy <= 1; ++dy) { - #pragma unroll - for (int dx = -1; dx <= 1; ++dx) { - if (dz == 0 && dy == 0 && dx == 0) continue; - - int nz = cur_d + dz * step; - int ny = cur_h + dy * step; - int nx = cur_w + dx * step; - - if (nz >= 0 && nz < D && ny >= 0 && ny < H && nx >= 0 && nx < W) { - int64_t n_idx = batch_offset + (int64_t)nz * (H * W) + ny * W + nx; - - int seed_z = (int)in_z[n_idx]; - if (seed_z != -1) { - float dz_val = (float)(cur_d - seed_z); - float dz_sq = dz_val * dz_val; - - if (dz_sq < best_dist) { - int seed_y = (int)in_y[n_idx]; - int seed_x = (int)in_x[n_idx]; - float dist = dz_sq + sqr((float)(cur_h - seed_y)) + sqr((float)(cur_w - seed_x)); - - if (dist < best_dist) { - best_dist = dist; - best_z = seed_z; - best_y = seed_y; - best_x = seed_x; - } - } - } - } + // ======================================================================== + // PASS 2: Find nearest feature to the RIGHT (Suffix Min Scan) + // ======================================================================== + + int global_right_feature = -1; + int num_tiles = (width + blockDim.x - 1) / blockDim.x; + + for (int tile = num_tiles - 1; tile >= 0; --tile) { + int base = tile * blockDim.x; + int i = base + tid; + bool is_valid = (i < width); + bool is_feature = is_valid && (row_input[i] > 0.5f); + + // 1. Warp-Level + unsigned int feature_mask = __ballot_sync(0xFFFFFFFF, is_feature); + int my_index = is_feature ? i : -1; + int warp_right_feature = find_nearest_right_in_warp(lane, my_index, feature_mask); + + // 2. Prepare: Write leftmost feature of this warp + int leftmost_lane = (feature_mask != 0) ? (__ffs(feature_mask) - 1) : 0; + int leftmost_index = __shfl_sync(0xFFFFFFFF, my_index, leftmost_lane); + + if (lane == 0) { + s_warp_boundary[warp_id] = (feature_mask != 0) ? leftmost_index : -1; + } + __syncthreads(); + + // 3. Block-Level: Warp 0 performs parallel suffix scan (Right-to-Left tree) + if (warp_id == 0) { + int val = (lane < num_warps) ? 
s_warp_boundary[lane] : -1; + + // Perform suffix min scan + int scan_res = warp_scan_suffix_min(val, num_warps); + + if (lane < num_warps) { + s_warp_boundary[lane] = scan_res; } } - } - out_z[tid] = (IndexType)best_z; - out_y[tid] = (IndexType)best_y; - out_x[tid] = (IndexType)best_x; -} - -template -__global__ void calc_dist_kernel_3d_soa( - const IndexType* __restrict__ in_z, - const IndexType* __restrict__ in_y, - const IndexType* __restrict__ in_x, - float* __restrict__ dist_out, - int64_t total_elements, - int D, int H, int W -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_elements) return; - - int seed_d = (int)in_z[tid]; - if (seed_d == -1) { - dist_out[tid] = INF_VAL; - } else { - int seed_h = (int)in_y[tid]; - int seed_w = (int)in_x[tid]; - - int64_t spatial_size = (int64_t)D * H * W; - int64_t rem = tid % spatial_size; - int cur_w = (int)(rem % W); - int cur_h = (int)((rem / W) % H); - int cur_d = (int)(rem / (W * H)); + __syncthreads(); - dist_out[tid] = sqrtf(dist_sq_3d_soa(cur_d, cur_h, cur_w, seed_d, seed_h, seed_w)); - } -} - -// ================================================================== -// PART 3: SEPARABLE N-DIM KERNELS -// ================================================================== - -__device__ void run_separable_scan_core( - int N, - int tid, - const float* __restrict__ vals, - int* __restrict__ idx_curr, - int* __restrict__ idx_next -) { - for (int i = tid; i < N; i += blockDim.x) { - if (vals[i] >= INF_VAL * 0.9f) idx_curr[i] = -1; - else idx_curr[i] = i; - } - __syncthreads(); - - int* idx_in = idx_curr; - int* idx_out = idx_next; - - for (int step = 1; step < N; step *= 2) { - for (int i = tid; i < N; i += blockDim.x) { - int my_best_p = idx_in[i]; - float min_cost = INF_VAL; - - if (my_best_p != -1) min_cost = compute_cost(i, my_best_p, vals[my_best_p]); - - int left = i - step; - if (left >= 0) { - int left_p = idx_in[left]; - if (left_p != -1) { - float c = compute_cost(i, left_p, 
vals[left_p]); - if (c < min_cost) { min_cost = c; my_best_p = left_p; } + // 4. Combine Results + if (is_valid) { + int right_feature = -1; + + if (is_feature) right_feature = i; + else if (warp_right_feature != -1) right_feature = warp_right_feature; + else { + // Look at scan result from NEXT warp + if (warp_id < num_warps - 1) { + right_feature = s_warp_boundary[warp_id + 1]; } + + if (right_feature == -1) right_feature = global_right_feature; } - - int right = i + step; - if (right < N) { - int right_p = idx_in[right]; - if (right_p != -1) { - float c = compute_cost(i, right_p, vals[right_p]); - if (c < min_cost) { min_cost = c; my_best_p = right_p; } + + // Update Min Distance + if (right_feature >= 0) { + float d = sqr((float)(right_feature - i)); + if (d < row_dist[i]) { + row_dist[i] = d; + row_indices[i] = right_feature; } } - idx_out[i] = my_best_p; } - int* temp = idx_in; idx_in = idx_out; idx_out = temp; - __syncthreads(); - } - - if (idx_in != idx_curr) { - for (int i = tid; i < N; i += blockDim.x) idx_curr[i] = idx_next[i]; + + // 5. 
Update Global History + // First warp's scan result contains min index for whole tile + int tile_min = s_warp_boundary[0]; + if (tile_min != -1) global_right_feature = tile_min; + __syncthreads(); } } -template -__global__ void separable_kernel_shared( - const float* __restrict__ in_data, - const int32_t* __restrict__ in_indices, - float* __restrict__ out_dist, - int32_t* __restrict__ out_indices, - int64_t L, - int64_t total_elements, - int coord_ndim -) { - int64_t row_idx = blockIdx.x; - int64_t offset = row_idx * L; - if (offset >= total_elements) return; - - extern __shared__ char s_buffer[]; - float* s_vals = (float*)s_buffer; - int* s_idx1 = (int*)(s_vals + L); - int* s_idx2 = (int*)(s_idx1 + L); - - for (int i = threadIdx.x; i < L; i += blockDim.x) { - s_vals[i] = __ldg(&in_data[offset + i]); - } - __syncthreads(); - - run_separable_scan_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); - - for (int q = threadIdx.x; q < L; q += blockDim.x) { - int p = s_idx1[q]; - float dist_val; - - if (p != -1) { - float dist_sq = sqr((float)q - (float)p) + s_vals[p]; - dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; - } else { - dist_val = IsFinal ? 
INF_VAL : INF_VAL; - p = 0; - } - out_dist[offset + q] = dist_val; - - int64_t dst_base = (offset + q) * coord_ndim; - if (p != -1 && s_vals[p] < INF_VAL) { - int64_t src_base = (offset + p) * coord_ndim; - for (int d = 0; d < coord_ndim; ++d) { - out_indices[dst_base + d] = in_indices[src_base + d]; - } - } else { - for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; - } - } -} - -template -__global__ void separable_kernel_global( - const float* __restrict__ in_data, - const int32_t* __restrict__ in_indices, - float* __restrict__ out_dist, - int32_t* __restrict__ out_indices, - int* __restrict__ global_buffer_1, - int* __restrict__ global_buffer_2, - int64_t L, - int64_t total_elements, - int coord_ndim -) { - int64_t row_idx = blockIdx.x; - int64_t offset = row_idx * L; - if (offset >= total_elements) return; +// ============================================================================ +// PyTorch Wrapper Function +// ============================================================================ - int* g_idx1 = global_buffer_1 + offset; - int* g_idx2 = global_buffer_2 + offset; +std::tuple distance_transform_cuda(torch::Tensor input) { + TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32"); + TORCH_CHECK(input.dim() >= 1, "Input must be at least 1D"); - run_separable_scan_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); - - for (int q = threadIdx.x; q < L; q += blockDim.x) { - int p = g_idx1[q]; - float dist_val; - if (p != -1) { - float val_p = in_data[offset + p]; - float dist_sq = sqr((float)q - (float)p) + val_p; - dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; - } else { - dist_val = IsFinal ? 
INF_VAL : INF_VAL; - p = 0; - } - out_dist[offset + q] = dist_val; - - int64_t dst_base = (offset + q) * coord_ndim; - if (p != -1 && in_data[offset + p] < INF_VAL) { - int64_t src_base = (offset + p) * coord_ndim; - for (int d = 0; d < coord_ndim; ++d) { - out_indices[dst_base + d] = in_indices[src_base + d]; - } - } else { - for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; - } - } -} - -__global__ void init_indices_separable_kernel( - int32_t* indices, - int64_t total_pixels, - int NDim, - const int64_t* __restrict__ shape_ptr -) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_pixels) return; - - int64_t temp = idx; - int32_t coords[8]; - for (int d = NDim - 1; d >= 0; --d) { - int64_t dim_size = shape_ptr[d]; - coords[d] = temp % dim_size; - temp /= dim_size; - } - int64_t out_ptr = idx * NDim; - for (int d = 0; d < NDim; ++d) indices[out_ptr + d] = coords[d]; -} - -// ================================================================== -// PART 4: DISPATCH HELPERS -// ================================================================== - -std::tuple run_jfa_2d( - torch::Tensor input, int64_t H, int64_t W, int grid, int block, int64_t numel -) { - auto index_opts = input.options().dtype(torch::kInt32); - auto idx_shape = input.sizes().vec(); - idx_shape.push_back(2); - auto curr_idx = torch::empty(idx_shape, index_opts); - auto next_idx = torch::empty(idx_shape, index_opts); + // Get dimensions + int64_t ndim = input.dim(); + int64_t width = input.size(-1); + int64_t height = 1; - int2* d_curr = (int2*)curr_idx.data_ptr(); - int2* d_next = (int2*)next_idx.data_ptr(); - - init_jfa_kernel_2d_opt<<>>( - input.data_ptr(), d_curr, numel, H, W - ); - - { - dim3 dimBlock(JFA_BLOCK_DIM, JFA_BLOCK_DIM); - int64_t batch_size = numel / (H * W); - dim3 dimGrid((W + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, - (H + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, - batch_size); - - jfa_block_fused_kernel_2d<<>>(d_curr, d_next, H, W, batch_size); - 
std::swap(d_curr, d_next); - std::swap(curr_idx, next_idx); - } - - int max_dim = std::max((int)H, (int)W); - int step = 16; - - while (step < max_dim) { - jfa_step_global_2d_opt<<>>(d_curr, d_next, step, H, W, numel); - std::swap(d_curr, d_next); - std::swap(curr_idx, next_idx); - step *= 2; + if (ndim >= 2) { + height = input.size(-2); } - auto final_dist = torch::empty_like(input); - calc_dist_kernel_2d_opt<<>>(d_curr, final_dist.data_ptr(), numel, H, W); - - return std::make_tuple(final_dist, curr_idx); -} - - -std::tuple run_jfa_3d( - torch::Tensor input, int64_t D, int64_t H, int64_t W, int grid, int block, int64_t numel -) { - bool use_int16 = (D < 32767 && H < 32767 && W < 32767); - auto index_opts = input.options().dtype(use_int16 ? torch::kInt16 : torch::kInt32); + int64_t batch_size = input.numel() / (width * height); - int64_t batch = numel / (D * H * W); + // Flatten to [batch * height, width] + auto input_flat = input.view({batch_size * height, width}); - // (3, Batch, D, H, W) - auto curr_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); - auto next_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); + // Create output tensors + auto dist_map = torch::empty_like(input); + auto dist_flat = dist_map.view({batch_size * height, width}); - void* d_curr = curr_idx_soa.data_ptr(); - void* d_next = next_idx_soa.data_ptr(); - int64_t plane_stride = numel; // B*D*H*W - - // 1. Init - if (use_int16) { - int16_t* ptr = (int16_t*)d_curr; - init_jfa_kernel_3d_soa<<>>( - input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W - ); - } else { - int32_t* ptr = (int32_t*)d_curr; - init_jfa_kernel_3d_soa<<>>( - input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W - ); - } - - // 2. 
Fused Steps - int block_dim = 8; - int blocks_per_d = (D + block_dim - 1) / block_dim; - dim3 fused_block(block_dim, block_dim, block_dim); - dim3 fused_grid((W + block_dim - 1) / block_dim, (H + block_dim - 1) / block_dim, blocks_per_d * batch); - size_t smem_bytes = (14*14*14) * 3 * (use_int16 ? 2 : 4); - - if (use_int16) { - int16_t* c = (int16_t*)d_curr; - int16_t* n = (int16_t*)d_next; - jfa_block_fused_kernel_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - n, n + plane_stride, n + 2 * plane_stride, - D, H, W, blocks_per_d - ); - } else { - int32_t* c = (int32_t*)d_curr; - int32_t* n = (int32_t*)d_next; - jfa_block_fused_kernel_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - n, n + plane_stride, n + 2 * plane_stride, - D, H, W, blocks_per_d - ); - } - std::swap(d_curr, d_next); - - // 3. Global Steps - int max_dim = std::max({(int)D, (int)H, (int)W}); - int step = 4; - while (step < max_dim) { - if (use_int16) { - int16_t* c = (int16_t*)d_curr; - int16_t* n = (int16_t*)d_next; - jfa_step_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - n, n + plane_stride, n + 2 * plane_stride, - step, D, H, W, numel - ); - } else { - int32_t* c = (int32_t*)d_curr; - int32_t* n = (int32_t*)d_next; - jfa_step_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - n, n + plane_stride, n + 2 * plane_stride, - step, D, H, W, numel - ); - } - std::swap(d_curr, d_next); - step *= 2; - } - - // 4. 
Final Dist - auto final_dist = torch::empty_like(input); - if (use_int16) { - int16_t* c = (int16_t*)d_curr; - calc_dist_kernel_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - final_dist.data_ptr(), numel, D, H, W - ); - } else { - int32_t* c = (int32_t*)d_curr; - calc_dist_kernel_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - final_dist.data_ptr(), numel, D, H, W - ); - } - - // Permute result indices back to (Batch, D, H, W, 3) - torch::Tensor result_indices; - if (d_curr == curr_idx_soa.data_ptr()) result_indices = curr_idx_soa; - else result_indices = next_idx_soa; + // Index map: same shape as input + last dimension for coordinate + auto idx_shape = input.sizes().vec(); + idx_shape.push_back(1); + auto idx_map = torch::empty(idx_shape, input.options().dtype(torch::kInt32)); + auto idx_flat = idx_map.view({batch_size * height, width, 1}); - result_indices = result_indices.permute({1, 2, 3, 4, 0}).contiguous(); - - return std::make_tuple(final_dist, result_indices); -} - -std::tuple run_separable_ndim(torch::Tensor input) { - TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Separable N-Dim input must be float32."); - input = input.contiguous(); + // Launch kernel + // Use 256 threads per block (8 warps) for good occupancy + int threads_per_block = 256; + int num_rows = batch_size * height; - const int ndim = input.dim(); - const int sample_ndim = ndim - 1; - TORCH_CHECK(sample_ndim > 0 && sample_ndim <= 8, "Unsupported dims for Separable EDT"); + dim3 block(threads_per_block); + dim3 grid(num_rows); - auto shape = input.sizes().vec(); - int64_t num_pixels = input.numel(); - - auto current_dist = torch::where(input == 0, - torch::tensor(0.0f, input.options()), - torch::tensor(INF_VAL, input.options())); + edt_1d_warp_optimized_kernel<<>>( + input_flat.data_ptr(), + dist_flat.data_ptr(), + idx_flat.data_ptr(), + width, + num_rows + ); - auto index_shape = shape; - index_shape.push_back(sample_ndim); - auto current_idx = 
torch::empty(index_shape, input.options().dtype(torch::kInt32)); + // Check for errors + cudaError_t err = cudaGetLastError(); + TORCH_CHECK(err == cudaSuccess, "CUDA kernel failed: ", cudaGetErrorString(err)); - { - std::vector spatial_shape(shape.begin() + 1, shape.end()); - auto shape_tensor = torch::tensor(spatial_shape, torch::kInt64).to(input.device()); - int threads = 256; - int blocks = (num_pixels + threads - 1) / threads; - init_indices_separable_kernel<<>>( - current_idx.data_ptr(), num_pixels, sample_ndim, shape_tensor.data_ptr() - ); - } - - torch::Tensor global_buf1, global_buf2; - - for (int d = 1; d < ndim; ++d) { - bool is_final_pass = (d == ndim - 1); - - auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); - auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); - - auto dist_out = torch::empty(dist_in.sizes(), dist_in.options()); - auto idx_out = torch::empty(idx_in.sizes(), idx_in.options()); - - int64_t L = dist_in.size(-1); - int64_t total_slices = dist_in.numel() / L; - int threads = std::min((int64_t)MAX_THREADS, L); - - if (L <= SMEM_LIMIT_ELEMENTS) { - size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); - if (is_final_pass) { - separable_kernel_shared<<>>( - dist_in.data_ptr(), idx_in.data_ptr(), - dist_out.data_ptr(), idx_out.data_ptr(), - L, dist_in.numel(), sample_ndim - ); - } else { - separable_kernel_shared<<>>( - dist_in.data_ptr(), idx_in.data_ptr(), - dist_out.data_ptr(), idx_out.data_ptr(), - L, dist_in.numel(), sample_ndim - ); - } - } else { - if (global_buf1.numel() < dist_in.numel()) { - global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); - global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); - } - if (is_final_pass) { - separable_kernel_global<<>>( - dist_in.data_ptr(), idx_in.data_ptr(), - dist_out.data_ptr(), idx_out.data_ptr(), - global_buf1.data_ptr(), 
global_buf2.data_ptr(), - L, dist_in.numel(), sample_ndim - ); - } else { - separable_kernel_global<<>>( - dist_in.data_ptr(), idx_in.data_ptr(), - dist_out.data_ptr(), idx_out.data_ptr(), - global_buf1.data_ptr(), global_buf2.data_ptr(), - L, dist_in.numel(), sample_ndim - ); - } - } - current_dist = dist_out.transpose(d, ndim - 1); - current_idx = idx_out.transpose(d, ndim - 1); - } - - return std::make_tuple(current_dist, current_idx); + // Take square root to get actual distance (not squared) + dist_map = torch::sqrt(dist_map); + + return std::make_tuple(dist_map, idx_map); } - -// ================================================================== -// PART 5: MAIN ENTRY POINT -// ================================================================== - -std::tuple distance_transform_cuda(torch::Tensor input) { - TORCH_CHECK(input.is_cuda(), "Input must be CUDA tensor"); - input = input.contiguous(); - - int64_t dims = input.dim(); - int64_t numel = input.numel(); - int block = BLOCK_SIZE; - int grid = (numel + block - 1) / block; - - if (dims >= 5) { - return run_separable_ndim(input); - } - else if (dims == 4) { - int64_t dim1 = input.size(1); - if (dim1 == 1) { - int64_t H = input.size(-2); - int64_t W = input.size(-1); - return run_jfa_2d(input, H, W, grid, block, numel); - } - else { - int64_t D = dim1; - int64_t H = input.size(-2); - int64_t W = input.size(-1); - return run_jfa_3d(input, D, H, W, grid, block, numel); - } - } - else if (dims == 3) { - int64_t H = input.size(-2); - int64_t W = input.size(-1); - return run_jfa_2d(input, H, W, grid, block, numel); - } - else if (dims == 2) { - int64_t H = 1; - int64_t W = input.size(-1); - auto result = run_jfa_2d(input, H, W, grid, block, numel); - torch::Tensor dist = std::get<0>(result); - torch::Tensor idx_2d = std::get<1>(result); - auto idx_1d = idx_2d.slice(/*dim=*/-1, /*start=*/1, /*end=*/2).contiguous(); - return std::make_tuple(dist, idx_1d); - } - else { - TORCH_CHECK(false, "Unsupported dimensions."); - 
return std::make_tuple(torch::Tensor(), torch::Tensor()); - } -} \ No newline at end of file From 8bf631871b0968fdcd6fb43f20465eba9f8f67ac Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Sat, 3 Jan 2026 03:32:50 +0800 Subject: [PATCH 47/56] make input tensor to be at least 1D --- torchmorph/distance_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchmorph/distance_transform.py b/torchmorph/distance_transform.py index 868e84a..7887c2a 100644 --- a/torchmorph/distance_transform.py +++ b/torchmorph/distance_transform.py @@ -7,7 +7,7 @@ def distance_transform(input: torch.Tensor) -> torch.Tensor: """Distance Transform in CUDA.""" if not input.is_cuda: raise ValueError("Input tensor must be on CUDA device.") - if input.ndim < 2 or input.numel() == 0: + if input.ndim < 1 or input.numel() == 0: raise ValueError(f"Invalid input dimension: {input.shape}.") # binarize input From 0f4c90085e9d9fa5f3b1c2958c0bafbb7c5312db Mon Sep 17 00:00:00 2001 From: Yuhandeng Date: Wed, 14 Jan 2026 02:30:13 +0800 Subject: [PATCH 48/56] restore code to JFA --- test/test_edt_1d.py | 154 --- torchmorph/csrc/distance_transform_kernel.cu | 1176 +++++++++++++----- torchmorph/distance_transform.py | 2 +- 3 files changed, 884 insertions(+), 448 deletions(-) delete mode 100644 test/test_edt_1d.py diff --git a/test/test_edt_1d.py b/test/test_edt_1d.py deleted file mode 100644 index b81cd63..0000000 --- a/test/test_edt_1d.py +++ /dev/null @@ -1,154 +0,0 @@ -import unittest - -import torch - -import torchmorph - - -class Test1DEuclideanDistanceTransform(unittest.TestCase): - def setUp(self): - if not torch.cuda.is_available(): - self.skipTest("CUDA not available") - self.device = torch.device("cuda") - - def test_basic_features(self): - """Test with 32 elements and 3 feature points""" - print("\n=== Test 1: Basic Features (32 elements) ===") - input_tensor = torch.zeros(32, dtype=torch.float32, device=self.device) - input_tensor[0] = 1.0 - input_tensor[12] = 1.0 - 
input_tensor[31] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - # Check specific positions - self._check_position(dist, indices, 0, 0, 0) - self._check_position(dist, indices, 6, 0, 6) - self._check_position(dist, indices, 12, 12, 0) - self._check_position(dist, indices, 21, 12, 9) - self._check_position(dist, indices, 31, 31, 0) - - def test_multiple_features(self): - """Test with multiple feature points""" - print("\n=== Test 2: Multiple Features ===") - input_tensor = torch.zeros(32, dtype=torch.float32, device=self.device) - features = [0, 3, 7, 10, 14, 21, 31] - input_tensor[features] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - for pos in range(32): - self._verify_nearest(pos, dist, indices, features) - - def test_batch_processing(self): - """Test with 2D array (batch of 1D rows)""" - print("\n=== Test 3: Batch Processing (4x32) ===") - input_tensor = torch.zeros(4, 32, dtype=torch.float32, device=self.device) - input_tensor[0, [0, 15, 31]] = 1.0 - input_tensor[1, [5, 10, 20]] = 1.0 - input_tensor[2, [8, 24]] = 1.0 - input_tensor[3, [16]] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - self.assertEqual(dist.shape, (4, 32)) - - # Check row 0 - features_row0 = [0, 15, 31] - for pos in range(32): - self._verify_nearest(pos, dist[0], indices[0], features_row0) - - def test_boundary_conditions(self): - """Test empty and full feature arrays""" - print("\n=== Test 4: Boundary Conditions ===") - # No features - input_empty = torch.zeros(32, dtype=torch.float32, device=self.device) - dist_empty, idx_empty = torchmorph.distance_transform(input_empty) - self.assertTrue(torch.all(dist_empty > 1000)) # Should be large/inf - self.assertTrue(torch.all(idx_empty == -1)) - - # All features - input_full = torch.ones(32, dtype=torch.float32, device=self.device) - dist_full, idx_full = torchmorph.distance_transform(input_full) - self.assertTrue(torch.all(dist_full == 0)) - expected_idx = 
torch.arange(32, device=self.device, dtype=torch.int32).unsqueeze(-1) - self.assertTrue(torch.all(idx_full == expected_idx)) - - def test_large_array(self): - """Test large array to verify cross-tile propagation""" - print("\n=== Test 5: Large Array (1024 elements) ===") - input_tensor = torch.zeros(1024, dtype=torch.float32, device=self.device) - features = [0, 512, 1023] - input_tensor[features] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - test_positions = [0, 256, 512, 768, 1023] - for pos in test_positions: - self._verify_nearest(pos, dist, indices.squeeze(), features) - - def test_cross_tile_boundary(self): - """Test propagation across tile boundaries""" - print("\n=== Test 6: Cross-Tile Propagation ===") - # 768 elements (3 tiles of 256) - input_tensor = torch.zeros(768, dtype=torch.float32, device=self.device) - features = [100, 600] - input_tensor[features] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - # Check around boundaries (256, 512) - test_positions = [250, 255, 256, 260, 350, 500, 510, 512, 520] - for pos in test_positions: - self._verify_nearest(pos, dist, indices.squeeze(), features) - - def test_large_2d_batch(self): - """Test large 2D batch""" - print("\n=== Test 7: Large 2D Batch ===") - input_tensor = torch.zeros(3, 600, dtype=torch.float32, device=self.device) - rows_features = { - 0: [0, 299, 599], - 1: [150, 450], - 2: [300], - } - - for row, feats in rows_features.items(): - input_tensor[row, feats] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - # Verify specific points with dynamic calculation - test_cases = [ - (0, 150), - (0, 450), - (1, 300), - (2, 100), - (2, 500), - ] - - for row, pos in test_cases: - self._verify_nearest(pos, dist[row], indices[row].squeeze(), rows_features[row]) - - def _check_position(self, dist, indices, pos, expected_idx, expected_dist): - actual_dist = dist[pos].item() - actual_idx = indices[pos].item() if indices.ndim == 1 else 
indices[pos, 0].item() - - self.assertAlmostEqual(actual_dist, float(expected_dist), places=1) - self.assertEqual(actual_idx, expected_idx) - - def _verify_nearest(self, pos, dist, indices, features): - actual_dist = dist[pos].item() - nearest_idx = indices[pos].item() if indices.ndim == 1 else indices[pos].item() - - # Calculate ground truth dynamically - true_dists = [abs(pos - f) for f in features] - min_dist = min(true_dists) - candidates = [f for f, d in zip(features, true_dists) if d == min_dist] - - self.assertAlmostEqual( - actual_dist, float(min_dist), places=1, msg=f"Distance mismatch at {pos}" - ) - self.assertIn(nearest_idx, candidates, msg=f"Nearest index mismatch at {pos}") - - -if __name__ == "__main__": - unittest.main() diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 59b1d99..09d4cbc 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,355 +1,945 @@ #include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include + +// ------------------------------------------------------------------ +// Global Configuration +// ------------------------------------------------------------------ +#define BLOCK_SIZE 256 +#define INF_VAL 1e20f +#define MAX_THREADS 1024 +#define SMEM_LIMIT_ELEMENTS 4096 + +#define JFA_BLOCK_DIM 32 +#define JFA_FUSED_STEPS 4 +#define JFA_MAX_OFFSET 8 +#define JFA_SMEM_DIM (JFA_BLOCK_DIM + 2 * JFA_MAX_OFFSET) -// ============================================================================ -// 1D Euclidean Distance Transform - Optimized Warp-Level Parallel -// ============================================================================ -// -// Based on the paper's algorithm using: -// - __ballot_sync() for feature point voting -// - __shfl_sync() for warp-level communication (NO shared memory within warp) -// - Parallel reduction tree for cross-warp propagation -// - Time 
complexity: O(log32(n)) with O(n) total work -// -// ============================================================================ - -#define WARP_SIZE 32 -#define INF_VAL 1e9f +// 3D Config +#define JFA_3D_BLOCK 8 +#define JFA_3D_HALO 1 +// ------------------------------------------------------------------ +// Device Helpers +// ------------------------------------------------------------------ __device__ __forceinline__ float sqr(float x) { return x * x; } -// ============================================================================ -// Device Function: Find nearest feature to the LEFT using warp operations -// ============================================================================ -// -// Algorithm (as described in the paper, Figure 7a): -// 1. Each thread votes if it holds a feature point -> ballot() creates bitmask -// 2. Mask high (warpSize - lane - 1) bits with 0 -// 3. Use clz() to count leading zeros -// 4. Nearest thread lane = (warpSize - clz() - 1) -// 5. Use __shfl_sync() to get the feature index from that thread -// -// Returns: index of nearest feature to the left, or -1 if none exists -// ============================================================================ - -__device__ __forceinline__ int find_nearest_left_in_warp( - int lane, - int my_index, - unsigned int feature_mask +// Helper for JFA 2D/3D (Standard) +__device__ __forceinline__ float dist_sq_2d(int y1, int x1, int y2, int x2) { + return sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); +} + +// Helper for SoA 3D (Z, Y, X separate) +__device__ __forceinline__ float dist_sq_3d_soa(int z1, int y1, int x1, int z2, int y2, int x2) { + if (z2 == -1) return INF_VAL; + float dz = (float)(z1 - z2); + float dy = (float)(y1 - y2); + float dx = (float)(x1 - x2); + return dz*dz + dy*dy + dx*dx; +} + +// Helper for Separable 1D +__device__ __forceinline__ float compute_cost(int q, int p, float val_p) { + if (p < 0 || val_p >= INF_VAL) return INF_VAL; + return sqr((float)q - (float)p) + val_p; +} 
+ +// Device Helpers for int2 (2D Vectorized) +__device__ __forceinline__ float dist_sq_int2(int y, int x, int2 seed) { + if (seed.x == -1) return INF_VAL; + float dy = (float)(y - seed.x); + float dx = (float)(x - seed.y); + return dy*dy + dx*dx; +} + +// ================================================================== +// PART 1: JFA KERNELS 2D (Vectorized int2 + Block Shared) +// ================================================================== + +__global__ void init_jfa_kernel_2d_opt( + const float* __restrict__ input, + int2* __restrict__ output, + int64_t total_elements, + int H, int W ) { - // Mask high bits: only keep features to the LEFT of current lane - unsigned int left_mask = feature_mask & ((1U << lane) - 1); - - // We must execute __shfl_sync for ALL threads in the warp - // Calculate nearest_lane if valid, otherwise use 0 (safe default) - int nearest_lane = 0; - if (left_mask != 0) { - nearest_lane = 31 - __clz(left_mask); + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + if (input[tid] == 0.0f) { + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int w = (int)(rem % W); + int h = (int)(rem / W); + output[tid] = make_int2(h, w); + } else { + output[tid] = make_int2(-1, -1); } - - // Perform shuffle for ALL threads - int nearest_index = __shfl_sync(0xFFFFFFFF, my_index, nearest_lane); - - // Only return valid result if we actually found a feature - return (left_mask != 0) ? 
nearest_index : -1; } -// ============================================================================ -// Device Function: Find nearest feature to the RIGHT using warp operations -// ============================================================================ - -__device__ __forceinline__ int find_nearest_right_in_warp( - int lane, - int my_index, - unsigned int feature_mask +__global__ void jfa_block_fused_kernel_2d( + const int2* __restrict__ in_idx, + int2* __restrict__ out_idx, + int H, int W, + int64_t num_images ) { - // Mask low bits: only keep features to the RIGHT of current lane - unsigned int right_mask = feature_mask & ~((1U << (lane + 1)) - 1); + __shared__ int2 smem[JFA_SMEM_DIM][JFA_SMEM_DIM]; + + int tx = threadIdx.x; + int ty = threadIdx.y; - // Calculate nearest_lane if valid, otherwise use 0 (safe default) - int nearest_lane = 0; - if (right_mask != 0) { - nearest_lane = __ffs(right_mask) - 1; + int bx = blockIdx.x * blockDim.x; + int by = blockIdx.y * blockDim.y; + int img_idx = blockIdx.z; + int64_t batch_offset = (int64_t)img_idx * (H * W); + + int gx = bx + tx; + int gy = by + ty; + + // Phase 1: load data to Shared Memory + int smem_linear_size = JFA_SMEM_DIM * JFA_SMEM_DIM; + int total_threads = blockDim.x * blockDim.y; + int thread_linear_idx = ty * blockDim.x + tx; + + int base_x = bx - JFA_MAX_OFFSET; + int base_y = by - JFA_MAX_OFFSET; + + for (int i = thread_linear_idx; i < smem_linear_size; i += total_threads) { + int s_y = i / JFA_SMEM_DIM; + int s_x = i % JFA_SMEM_DIM; + int global_y = base_y + s_y; + int global_x = base_x + s_x; + int2 val = make_int2(-1, -1); + if (global_y >= 0 && global_y < H && global_x >= 0 && global_x < W) { + val = in_idx[batch_offset + global_y * W + global_x]; + } + smem[s_y][s_x] = val; } - - // Perform shuffle for ALL threads - int nearest_index = __shfl_sync(0xFFFFFFFF, my_index, nearest_lane); - - return (right_mask != 0) ? 
nearest_index : -1; -} + __syncthreads(); -// ============================================================================ -// Warp Scan Helpers -// ============================================================================ + // Phase 2: Iterate in Shared Memory + if (gx < W && gy < H) { + int center_sy = ty + JFA_MAX_OFFSET; + int center_sx = tx + JFA_MAX_OFFSET; -// Inclusive Max Scan for positive integers (returns max seen so far) -__device__ __forceinline__ int warp_scan_inclusive_max(int val, int width) { - // Hillis-Steele Scan (O(log N)) - #pragma unroll - for (int offset = 1; offset < 32; offset *= 2) { - int neighbor_val = __shfl_up_sync(0xFFFFFFFF, val, offset); - if (threadIdx.x % 32 >= offset) { - // Logic: take the max of current and neighbor - // Handle -1 (invalid) carefully: max behavior handles -1 naturally if features are >= 0 - if (neighbor_val > val) val = neighbor_val; + int2 best_seed = smem[center_sy][center_sx]; + float best_dist = dist_sq_int2(gy, gx, best_seed); + + int step = 1; + #pragma unroll + for (int k = 0; k < JFA_FUSED_STEPS; ++k) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dy == 0 && dx == 0) continue; + int2 neighbor_seed = smem[center_sy + dy * step][center_sx + dx * step]; + if (neighbor_seed.x != -1) { + float d = dist_sq_int2(gy, gx, neighbor_seed); + if (d < best_dist) { + best_dist = d; + best_seed = neighbor_seed; + } + } + } + } + __syncthreads(); + smem[center_sy][center_sx] = best_seed; + __syncthreads(); + step *= 2; } + out_idx[batch_offset + gy * W + gx] = best_seed; } - return val; } -// Inclusive Min Scan (returns min seen so far from right) -// Note: We use __shfl_down_sync for suffix scan -__device__ __forceinline__ int warp_scan_suffix_min(int val, int width) { - // Suffix Scan (Right to Left) +__global__ void jfa_step_global_2d_opt( + const int2* __restrict__ in_idx, + int2* __restrict__ out_idx, + int step, + int H, int W, + int64_t 
total_pixels +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_pixels) return; + + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int64_t batch_offset = tid - rem; + int w = (int)(rem % W); + int h = (int)(rem / W); + + int2 best_seed = in_idx[tid]; + float best_dist = dist_sq_int2(h, w, best_seed); + #pragma unroll - for (int offset = 1; offset < 32; offset *= 2) { - int neighbor_val = __shfl_down_sync(0xFFFFFFFF, val, offset); - // If we have a neighbor to the right - if ((threadIdx.x % 32) + offset < width) { - // Logic: take min. If current is -1 (invalid), take neighbor. - // If neighbor is -1, ignore it. - if (val == -1) val = neighbor_val; - else if (neighbor_val != -1) { - if (neighbor_val < val) val = neighbor_val; + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dx == 0 && dy == 0) continue; + + int ny = h + dy * step; + int nx = w + dx * step; + + if (ny >= 0 && ny < H && nx >= 0 && nx < W) { + int2 neighbor_seed = in_idx[batch_offset + ny * W + nx]; + if (neighbor_seed.x != -1) { + float d = dist_sq_int2(h, w, neighbor_seed); + if (d < best_dist) { + best_dist = d; + best_seed = neighbor_seed; + } + } } } } - return val; + out_idx[tid] = best_seed; +} + +__global__ void calc_dist_kernel_2d_opt( + const int2* __restrict__ indices, + float* __restrict__ dist_out, + int64_t total_elements, + int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + int2 s = indices[tid]; + if (s.x == -1) { + dist_out[tid] = INF_VAL; + } else { + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)(rem / W); + dist_out[tid] = sqrtf(dist_sq_int2(cur_h, cur_w, s)); + } } +// ================================================================== +// PART 2: JFA KERNELS 3D (Optimized SoA Layout) +// 
================================================================== + +template +__global__ void init_jfa_kernel_3d_soa( + const float* __restrict__ input, + IndexType* __restrict__ indices_z, + IndexType* __restrict__ indices_y, + IndexType* __restrict__ indices_x, + int64_t total_elements, + int D, int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; -// ============================================================================ -// Kernel: Optimized 1D EDT using Two-Level Tree reduction -// ============================================================================ + if (input[tid] == 0.0f) { + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int w = (int)(rem % W); + int h = (int)((rem / W) % H); + int d = (int)(rem / (W * H)); -__global__ void edt_1d_warp_optimized_kernel( - const float* __restrict__ d_input, - float* __restrict__ d_dist, - int32_t* __restrict__ d_indices, - int width, - int height + indices_z[tid] = (IndexType)d; + indices_y[tid] = (IndexType)h; + indices_x[tid] = (IndexType)w; + } else { + indices_z[tid] = (IndexType)-1; + indices_y[tid] = (IndexType)-1; + indices_x[tid] = (IndexType)-1; + } +} + +template +__global__ void jfa_block_fused_kernel_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + IndexType* __restrict__ out_z, + IndexType* __restrict__ out_y, + IndexType* __restrict__ out_x, + int D, int H, int W, + int blocks_per_d ) { - int row = blockIdx.x; - if (row >= height) return; - - const float* row_input = d_input + row * width; - float* row_dist = d_dist + row * width; - int32_t* row_indices = d_indices + row * width; - - int tid = threadIdx.x; - int lane = tid % WARP_SIZE; - int warp_id = tid / WARP_SIZE; - int num_warps = blockDim.x / WARP_SIZE; - - // Shared memory for Inter-Warp Scan - // We use one buffer for the reduction result - __shared__ int 
s_warp_boundary[32]; - - // ======================================================================== - // PASS 1: Find nearest feature to the LEFT (Prefix Max Scan) - // ======================================================================== - - int global_left_feature = -1; + const int BLOCK_DIM = 8; + const int HALO = 3; + const int SMEM_DIM = BLOCK_DIM + 2 * HALO; // 14 + const int SMEM_SIZE = SMEM_DIM * SMEM_DIM * SMEM_DIM; + + extern __shared__ char smem_raw[]; + IndexType* smem_z = (IndexType*)smem_raw; + IndexType* smem_y = smem_z + SMEM_SIZE; + IndexType* smem_x = smem_y + SMEM_SIZE; + + int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z; + + int b_z_total = blockIdx.z; + int batch_id = b_z_total / blocks_per_d; + int b_z_local = b_z_total % blocks_per_d; - for (int base = 0; base < width; base += blockDim.x) { - int i = base + tid; - bool is_valid = (i < width); - bool is_feature = is_valid && (row_input[i] > 0.5f); - - // 1. Warp-Level: Find local nearest - unsigned int feature_mask = __ballot_sync(0xFFFFFFFF, is_feature); - int my_index = is_feature ? i : -1; - int warp_left_feature = find_nearest_left_in_warp(lane, my_index, feature_mask); - - // 2. Prepare for Block-Level Scan: Write rightmost feature of this warp - int rightmost_lane = (feature_mask != 0) ? (31 - __clz(feature_mask)) : 0; - int rightmost_index = __shfl_sync(0xFFFFFFFF, my_index, rightmost_lane); - - if (lane == 0) { - s_warp_boundary[warp_id] = (feature_mask != 0) ? rightmost_index : -1; - } - __syncthreads(); - - // 3. Block-Level: Warp 0 performs parallel prefix scan over warp boundaries - // This is the "Tree" part for inter-warp communication - if (warp_id == 0) { - // Load boundary from shared memory (only if valid warp) - int val = (lane < num_warps) ? 
s_warp_boundary[lane] : -1; - - // Perform inclusive max scan - int scan_res = warp_scan_inclusive_max(val, num_warps); - - // Write back inclusive scan result - if (lane < num_warps) { - s_warp_boundary[lane] = scan_res; - } + int bx = blockIdx.x * BLOCK_DIM; + int by = blockIdx.y * BLOCK_DIM; + int bz = b_z_local * BLOCK_DIM; + + int64_t spatial_offset = (int64_t)batch_id * (D * H * W); + + // Phase 1: Load to SoA Shared Memory + int tid = tz * 64 + ty * 8 + tx; + int base_x = bx - HALO; + int base_y = by - HALO; + int base_z = bz - HALO; + + for (int i = tid; i < SMEM_SIZE; i += 512) { + int temp = i; + int sx = temp % SMEM_DIM; temp /= SMEM_DIM; + int sy = temp % SMEM_DIM; + int sz = temp / SMEM_DIM; + + int gx = base_x + sx; + int gy = base_y + sy; + int gz = base_z + sz; + + IndexType val_z = -1, val_y = -1, val_x = -1; + if (gz >= 0 && gz < D && gy >= 0 && gy < H && gx >= 0 && gx < W) { + int64_t idx = spatial_offset + (int64_t)gz * (H * W) + gy * W + gx; + val_z = in_z[idx]; + val_y = in_y[idx]; + val_x = in_x[idx]; } - __syncthreads(); - - // 4. 
Combine Results - if (is_valid) { - int left_feature = -1; - - if (is_feature) left_feature = i; - else if (warp_left_feature != -1) left_feature = warp_left_feature; - else { - // Look at the scan result from the PREVIOUS warp - if (warp_id > 0) { - left_feature = s_warp_boundary[warp_id - 1]; + smem_z[i] = val_z; + smem_y[i] = val_y; + smem_x[i] = val_x; + } + __syncthreads(); + + // Phase 2: Compute + int center_sz = tz + HALO; + int center_sy = ty + HALO; + int center_sx = tx + HALO; + int my_s_idx = (center_sz * SMEM_DIM + center_sy) * SMEM_DIM + center_sx; + + int best_z = (int)smem_z[my_s_idx]; + int best_y = (int)smem_y[my_s_idx]; + int best_x = (int)smem_x[my_s_idx]; + + int g_cz = bz + tz; + int g_cy = by + ty; + int g_cx = bx + tx; + + float best_dist = dist_sq_3d_soa(g_cz, g_cy, g_cx, best_z, best_y, best_x); + + int step = 1; + #pragma unroll + for (int k = 0; k < 2; ++k) { + #pragma unroll + for (int dz = -1; dz <= 1; ++dz) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dz == 0 && dy == 0 && dx == 0) continue; + + int nz = center_sz + dz * step; + int ny = center_sy + dy * step; + int nx = center_sx + dx * step; + int n_idx = (nz * SMEM_DIM + ny) * SMEM_DIM + nx; + + int sz_in = (int)smem_z[n_idx]; + if (sz_in != -1) { + int sy_in = (int)smem_y[n_idx]; + int sx_in = (int)smem_x[n_idx]; + float d = dist_sq_3d_soa(g_cz, g_cy, g_cx, sz_in, sy_in, sx_in); + if (d < best_dist) { + best_dist = d; + best_z = sz_in; + best_y = sy_in; + best_x = sx_in; + } + } } - - // If still -1, fallback to global history - if (left_feature == -1) left_feature = global_left_feature; - } - - // Store result - if (left_feature >= 0) { - row_dist[i] = sqr((float)(i - left_feature)); - row_indices[i] = left_feature; - } else { - row_dist[i] = INF_VAL; - row_indices[i] = -1; } } - - // 5. 
Update Global History - // The last warp's scan result contains the max index for the whole tile - int tile_max = s_warp_boundary[num_warps - 1]; - if (tile_max != -1) global_left_feature = tile_max; - __syncthreads(); + smem_z[my_s_idx] = (IndexType)best_z; + smem_y[my_s_idx] = (IndexType)best_y; + smem_x[my_s_idx] = (IndexType)best_x; + __syncthreads(); + step *= 2; } + + if (g_cz < D && g_cy < H && g_cx < W) { + int64_t out_idx_g = spatial_offset + (int64_t)g_cz * (H * W) + g_cy * W + g_cx; + out_z[out_idx_g] = (IndexType)best_z; + out_y[out_idx_g] = (IndexType)best_y; + out_x[out_idx_g] = (IndexType)best_x; + } +} + +template +__global__ void jfa_step_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + IndexType* __restrict__ out_z, + IndexType* __restrict__ out_y, + IndexType* __restrict__ out_x, + int step, + int D, int H, int W, + int64_t total_pixels +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_pixels) return; + + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int64_t batch_offset = tid - rem; + int cur_w = (int)(rem % W); + int cur_h = (int)((rem / W) % H); + int cur_d = (int)(rem / (W * H)); + + int best_z = (int)in_z[tid]; + int best_y = (int)in_y[tid]; + int best_x = (int)in_x[tid]; - // ======================================================================== - // PASS 2: Find nearest feature to the RIGHT (Suffix Min Scan) - // ======================================================================== - - int global_right_feature = -1; - int num_tiles = (width + blockDim.x - 1) / blockDim.x; - - for (int tile = num_tiles - 1; tile >= 0; --tile) { - int base = tile * blockDim.x; - int i = base + tid; - bool is_valid = (i < width); - bool is_feature = is_valid && (row_input[i] > 0.5f); - - // 1. Warp-Level - unsigned int feature_mask = __ballot_sync(0xFFFFFFFF, is_feature); - int my_index = is_feature ? 
i : -1; - int warp_right_feature = find_nearest_right_in_warp(lane, my_index, feature_mask); - - // 2. Prepare: Write leftmost feature of this warp - int leftmost_lane = (feature_mask != 0) ? (__ffs(feature_mask) - 1) : 0; - int leftmost_index = __shfl_sync(0xFFFFFFFF, my_index, leftmost_lane); - - if (lane == 0) { - s_warp_boundary[warp_id] = (feature_mask != 0) ? leftmost_index : -1; - } - __syncthreads(); - - // 3. Block-Level: Warp 0 performs parallel suffix scan (Right-to-Left tree) - if (warp_id == 0) { - int val = (lane < num_warps) ? s_warp_boundary[lane] : -1; - - // Perform suffix min scan - int scan_res = warp_scan_suffix_min(val, num_warps); - - if (lane < num_warps) { - s_warp_boundary[lane] = scan_res; + float best_dist = dist_sq_3d_soa(cur_d, cur_h, cur_w, best_z, best_y, best_x); + + #pragma unroll + for (int dz = -1; dz <= 1; ++dz) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dz == 0 && dy == 0 && dx == 0) continue; + + int nz = cur_d + dz * step; + int ny = cur_h + dy * step; + int nx = cur_w + dx * step; + + if (nz >= 0 && nz < D && ny >= 0 && ny < H && nx >= 0 && nx < W) { + int64_t n_idx = batch_offset + (int64_t)nz * (H * W) + ny * W + nx; + + int seed_z = (int)in_z[n_idx]; + if (seed_z != -1) { + float dz_val = (float)(cur_d - seed_z); + float dz_sq = dz_val * dz_val; + + if (dz_sq < best_dist) { + int seed_y = (int)in_y[n_idx]; + int seed_x = (int)in_x[n_idx]; + float dist = dz_sq + sqr((float)(cur_h - seed_y)) + sqr((float)(cur_w - seed_x)); + + if (dist < best_dist) { + best_dist = dist; + best_z = seed_z; + best_y = seed_y; + best_x = seed_x; + } + } + } + } } } - __syncthreads(); + } + out_z[tid] = (IndexType)best_z; + out_y[tid] = (IndexType)best_y; + out_x[tid] = (IndexType)best_x; +} + +template +__global__ void calc_dist_kernel_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + float* 
__restrict__ dist_out, + int64_t total_elements, + int D, int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + int seed_d = (int)in_z[tid]; + if (seed_d == -1) { + dist_out[tid] = INF_VAL; + } else { + int seed_h = (int)in_y[tid]; + int seed_w = (int)in_x[tid]; + + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)((rem / W) % H); + int cur_d = (int)(rem / (W * H)); - // 4. Combine Results - if (is_valid) { - int right_feature = -1; - - if (is_feature) right_feature = i; - else if (warp_right_feature != -1) right_feature = warp_right_feature; - else { - // Look at scan result from NEXT warp - if (warp_id < num_warps - 1) { - right_feature = s_warp_boundary[warp_id + 1]; + dist_out[tid] = sqrtf(dist_sq_3d_soa(cur_d, cur_h, cur_w, seed_d, seed_h, seed_w)); + } +} + +// ================================================================== +// PART 3: SEPARABLE N-DIM KERNELS +// ================================================================== + +__device__ void run_separable_scan_core( + int N, + int tid, + const float* __restrict__ vals, + int* __restrict__ idx_curr, + int* __restrict__ idx_next +) { + for (int i = tid; i < N; i += blockDim.x) { + if (vals[i] >= INF_VAL * 0.9f) idx_curr[i] = -1; + else idx_curr[i] = i; + } + __syncthreads(); + + int* idx_in = idx_curr; + int* idx_out = idx_next; + + for (int step = 1; step < N; step *= 2) { + for (int i = tid; i < N; i += blockDim.x) { + int my_best_p = idx_in[i]; + float min_cost = INF_VAL; + + if (my_best_p != -1) min_cost = compute_cost(i, my_best_p, vals[my_best_p]); + + int left = i - step; + if (left >= 0) { + int left_p = idx_in[left]; + if (left_p != -1) { + float c = compute_cost(i, left_p, vals[left_p]); + if (c < min_cost) { min_cost = c; my_best_p = left_p; } } - - if (right_feature == -1) right_feature = global_right_feature; } - - // Update Min Distance - if 
(right_feature >= 0) { - float d = sqr((float)(right_feature - i)); - if (d < row_dist[i]) { - row_dist[i] = d; - row_indices[i] = right_feature; + + int right = i + step; + if (right < N) { + int right_p = idx_in[right]; + if (right_p != -1) { + float c = compute_cost(i, right_p, vals[right_p]); + if (c < min_cost) { min_cost = c; my_best_p = right_p; } } } + idx_out[i] = my_best_p; } - - // 5. Update Global History - // First warp's scan result contains min index for whole tile - int tile_min = s_warp_boundary[0]; - if (tile_min != -1) global_right_feature = tile_min; - + int* temp = idx_in; idx_in = idx_out; idx_out = temp; + __syncthreads(); + } + + if (idx_in != idx_curr) { + for (int i = tid; i < N; i += blockDim.x) idx_curr[i] = idx_next[i]; __syncthreads(); } } -// ============================================================================ -// PyTorch Wrapper Function -// ============================================================================ +template +__global__ void separable_kernel_shared( + const float* __restrict__ in_data, + const int32_t* __restrict__ in_indices, + float* __restrict__ out_dist, + int32_t* __restrict__ out_indices, + int64_t L, + int64_t total_elements, + int coord_ndim +) { + int64_t row_idx = blockIdx.x; + int64_t offset = row_idx * L; + if (offset >= total_elements) return; -std::tuple distance_transform_cuda(torch::Tensor input) { - TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); - TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32"); - TORCH_CHECK(input.dim() >= 1, "Input must be at least 1D"); - - // Get dimensions - int64_t ndim = input.dim(); - int64_t width = input.size(-1); - int64_t height = 1; + extern __shared__ char s_buffer[]; + float* s_vals = (float*)s_buffer; + int* s_idx1 = (int*)(s_vals + L); + int* s_idx2 = (int*)(s_idx1 + L); + + for (int i = threadIdx.x; i < L; i += blockDim.x) { + s_vals[i] = __ldg(&in_data[offset + i]); + } + __syncthreads(); + + 
run_separable_scan_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); + + for (int q = threadIdx.x; q < L; q += blockDim.x) { + int p = s_idx1[q]; + float dist_val; + + if (p != -1) { + float dist_sq = sqr((float)q - (float)p) + s_vals[p]; + dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; + } else { + dist_val = IsFinal ? INF_VAL : INF_VAL; + p = 0; + } + out_dist[offset + q] = dist_val; + + int64_t dst_base = (offset + q) * coord_ndim; + if (p != -1 && s_vals[p] < INF_VAL) { + int64_t src_base = (offset + p) * coord_ndim; + for (int d = 0; d < coord_ndim; ++d) { + out_indices[dst_base + d] = in_indices[src_base + d]; + } + } else { + for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; + } + } +} + +template +__global__ void separable_kernel_global( + const float* __restrict__ in_data, + const int32_t* __restrict__ in_indices, + float* __restrict__ out_dist, + int32_t* __restrict__ out_indices, + int* __restrict__ global_buffer_1, + int* __restrict__ global_buffer_2, + int64_t L, + int64_t total_elements, + int coord_ndim +) { + int64_t row_idx = blockIdx.x; + int64_t offset = row_idx * L; + if (offset >= total_elements) return; + + int* g_idx1 = global_buffer_1 + offset; + int* g_idx2 = global_buffer_2 + offset; - if (ndim >= 2) { - height = input.size(-2); + run_separable_scan_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); + + for (int q = threadIdx.x; q < L; q += blockDim.x) { + int p = g_idx1[q]; + float dist_val; + if (p != -1) { + float val_p = in_data[offset + p]; + float dist_sq = sqr((float)q - (float)p) + val_p; + dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; + } else { + dist_val = IsFinal ? 
INF_VAL : INF_VAL; + p = 0; + } + out_dist[offset + q] = dist_val; + + int64_t dst_base = (offset + q) * coord_ndim; + if (p != -1 && in_data[offset + p] < INF_VAL) { + int64_t src_base = (offset + p) * coord_ndim; + for (int d = 0; d < coord_ndim; ++d) { + out_indices[dst_base + d] = in_indices[src_base + d]; + } + } else { + for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; + } + } +} + +__global__ void init_indices_separable_kernel( + int32_t* indices, + int64_t total_pixels, + int NDim, + const int64_t* __restrict__ shape_ptr +) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_pixels) return; + + int64_t temp = idx; + int32_t coords[8]; + for (int d = NDim - 1; d >= 0; --d) { + int64_t dim_size = shape_ptr[d]; + coords[d] = temp % dim_size; + temp /= dim_size; } + int64_t out_ptr = idx * NDim; + for (int d = 0; d < NDim; ++d) indices[out_ptr + d] = coords[d]; +} + +// ================================================================== +// PART 4: DISPATCH HELPERS +// ================================================================== + +std::tuple run_jfa_2d( + torch::Tensor input, int64_t H, int64_t W, int grid, int block, int64_t numel +) { + auto index_opts = input.options().dtype(torch::kInt32); + auto idx_shape = input.sizes().vec(); + idx_shape.push_back(2); + auto curr_idx = torch::empty(idx_shape, index_opts); + auto next_idx = torch::empty(idx_shape, index_opts); - int64_t batch_size = input.numel() / (width * height); + int2* d_curr = (int2*)curr_idx.data_ptr(); + int2* d_next = (int2*)next_idx.data_ptr(); + + init_jfa_kernel_2d_opt<<>>( + input.data_ptr(), d_curr, numel, H, W + ); + + { + dim3 dimBlock(JFA_BLOCK_DIM, JFA_BLOCK_DIM); + int64_t batch_size = numel / (H * W); + dim3 dimGrid((W + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, + (H + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, + batch_size); + + jfa_block_fused_kernel_2d<<>>(d_curr, d_next, H, W, batch_size); + std::swap(d_curr, d_next); + std::swap(curr_idx, 
next_idx); + } + + int max_dim = std::max((int)H, (int)W); + int step = 16; + + while (step < max_dim) { + jfa_step_global_2d_opt<<>>(d_curr, d_next, step, H, W, numel); + std::swap(d_curr, d_next); + std::swap(curr_idx, next_idx); + step *= 2; + } - // Flatten to [batch * height, width] - auto input_flat = input.view({batch_size * height, width}); + auto final_dist = torch::empty_like(input); + calc_dist_kernel_2d_opt<<>>(d_curr, final_dist.data_ptr(), numel, H, W); + + return std::make_tuple(final_dist, curr_idx); +} + + +std::tuple run_jfa_3d( + torch::Tensor input, int64_t D, int64_t H, int64_t W, int grid, int block, int64_t numel +) { + bool use_int16 = (D < 32767 && H < 32767 && W < 32767); + auto index_opts = input.options().dtype(use_int16 ? torch::kInt16 : torch::kInt32); - // Create output tensors - auto dist_map = torch::empty_like(input); - auto dist_flat = dist_map.view({batch_size * height, width}); + int64_t batch = numel / (D * H * W); - // Index map: same shape as input + last dimension for coordinate - auto idx_shape = input.sizes().vec(); - idx_shape.push_back(1); - auto idx_map = torch::empty(idx_shape, input.options().dtype(torch::kInt32)); - auto idx_flat = idx_map.view({batch_size * height, width, 1}); + // (3, Batch, D, H, W) + auto curr_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); + auto next_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); - // Launch kernel - // Use 256 threads per block (8 warps) for good occupancy - int threads_per_block = 256; - int num_rows = batch_size * height; + void* d_curr = curr_idx_soa.data_ptr(); + void* d_next = next_idx_soa.data_ptr(); + int64_t plane_stride = numel; // B*D*H*W + + // 1. 
Init + if (use_int16) { + int16_t* ptr = (int16_t*)d_curr; + init_jfa_kernel_3d_soa<<>>( + input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W + ); + } else { + int32_t* ptr = (int32_t*)d_curr; + init_jfa_kernel_3d_soa<<>>( + input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W + ); + } + + // 2. Fused Steps + int block_dim = 8; + int blocks_per_d = (D + block_dim - 1) / block_dim; + dim3 fused_block(block_dim, block_dim, block_dim); + dim3 fused_grid((W + block_dim - 1) / block_dim, (H + block_dim - 1) / block_dim, blocks_per_d * batch); + size_t smem_bytes = (14*14*14) * 3 * (use_int16 ? 2 : 4); + + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + int16_t* n = (int16_t*)d_next; + jfa_block_fused_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + D, H, W, blocks_per_d + ); + } else { + int32_t* c = (int32_t*)d_curr; + int32_t* n = (int32_t*)d_next; + jfa_block_fused_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + D, H, W, blocks_per_d + ); + } + std::swap(d_curr, d_next); + + // 3. Global Steps + int max_dim = std::max({(int)D, (int)H, (int)W}); + int step = 4; + while (step < max_dim) { + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + int16_t* n = (int16_t*)d_next; + jfa_step_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + step, D, H, W, numel + ); + } else { + int32_t* c = (int32_t*)d_curr; + int32_t* n = (int32_t*)d_next; + jfa_step_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + step, D, H, W, numel + ); + } + std::swap(d_curr, d_next); + step *= 2; + } + + // 4. 
Final Dist + auto final_dist = torch::empty_like(input); + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + calc_dist_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + final_dist.data_ptr(), numel, D, H, W + ); + } else { + int32_t* c = (int32_t*)d_curr; + calc_dist_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + final_dist.data_ptr(), numel, D, H, W + ); + } + + // Permute result indices back to (Batch, D, H, W, 3) + torch::Tensor result_indices; + if (d_curr == curr_idx_soa.data_ptr()) result_indices = curr_idx_soa; + else result_indices = next_idx_soa; - dim3 block(threads_per_block); - dim3 grid(num_rows); + result_indices = result_indices.permute({1, 2, 3, 4, 0}).contiguous(); + + return std::make_tuple(final_dist, result_indices); +} + +std::tuple run_separable_ndim(torch::Tensor input) { + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Separable N-Dim input must be float32."); + input = input.contiguous(); - edt_1d_warp_optimized_kernel<<>>( - input_flat.data_ptr(), - dist_flat.data_ptr(), - idx_flat.data_ptr(), - width, - num_rows - ); + const int ndim = input.dim(); + const int sample_ndim = ndim - 1; + TORCH_CHECK(sample_ndim > 0 && sample_ndim <= 8, "Unsupported dims for Separable EDT"); - // Check for errors - cudaError_t err = cudaGetLastError(); - TORCH_CHECK(err == cudaSuccess, "CUDA kernel failed: ", cudaGetErrorString(err)); + auto shape = input.sizes().vec(); + int64_t num_pixels = input.numel(); + + auto current_dist = torch::where(input == 0, + torch::tensor(0.0f, input.options()), + torch::tensor(INF_VAL, input.options())); - // Take square root to get actual distance (not squared) - dist_map = torch::sqrt(dist_map); + auto index_shape = shape; + index_shape.push_back(sample_ndim); + auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - return std::make_tuple(dist_map, idx_map); + { + std::vector spatial_shape(shape.begin() + 1, shape.end()); + auto shape_tensor = 
torch::tensor(spatial_shape, torch::kInt64).to(input.device()); + int threads = 256; + int blocks = (num_pixels + threads - 1) / threads; + init_indices_separable_kernel<<>>( + current_idx.data_ptr(), num_pixels, sample_ndim, shape_tensor.data_ptr() + ); + } + + torch::Tensor global_buf1, global_buf2; + + for (int d = 1; d < ndim; ++d) { + bool is_final_pass = (d == ndim - 1); + + auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); + auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); + + auto dist_out = torch::empty(dist_in.sizes(), dist_in.options()); + auto idx_out = torch::empty(idx_in.sizes(), idx_in.options()); + + int64_t L = dist_in.size(-1); + int64_t total_slices = dist_in.numel() / L; + int threads = std::min((int64_t)MAX_THREADS, L); + + if (L <= SMEM_LIMIT_ELEMENTS) { + size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); + if (is_final_pass) { + separable_kernel_shared<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } else { + separable_kernel_shared<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } + } else { + if (global_buf1.numel() < dist_in.numel()) { + global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); + global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); + } + if (is_final_pass) { + separable_kernel_global<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + global_buf1.data_ptr(), global_buf2.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } else { + separable_kernel_global<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + global_buf1.data_ptr(), global_buf2.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } + } + current_dist = 
dist_out.transpose(d, ndim - 1); + current_idx = idx_out.transpose(d, ndim - 1); + } + + return std::make_tuple(current_dist, current_idx); } + +// ================================================================== +// PART 5: MAIN ENTRY POINT +// ================================================================== + +std::tuple distance_transform_cuda(torch::Tensor input) { + TORCH_CHECK(input.is_cuda(), "Input must be CUDA tensor"); + input = input.contiguous(); + + int64_t dims = input.dim(); + int64_t numel = input.numel(); + int block = BLOCK_SIZE; + int grid = (numel + block - 1) / block; + + if (dims >= 5) { + return run_separable_ndim(input); + } + else if (dims == 4) { + int64_t dim1 = input.size(1); + if (dim1 == 1) { + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_2d(input, H, W, grid, block, numel); + } + else { + int64_t D = dim1; + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_3d(input, D, H, W, grid, block, numel); + } + } + else if (dims == 3) { + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_2d(input, H, W, grid, block, numel); + } + else if (dims == 2) { + int64_t H = 1; + int64_t W = input.size(-1); + auto result = run_jfa_2d(input, H, W, grid, block, numel); + torch::Tensor dist = std::get<0>(result); + torch::Tensor idx_2d = std::get<1>(result); + auto idx_1d = idx_2d.slice(/*dim=*/-1, /*start=*/1, /*end=*/2).contiguous(); + return std::make_tuple(dist, idx_1d); + } + else { + TORCH_CHECK(false, "Unsupported dimensions."); + return std::make_tuple(torch::Tensor(), torch::Tensor()); + } +} \ No newline at end of file diff --git a/torchmorph/distance_transform.py b/torchmorph/distance_transform.py index 7887c2a..868e84a 100644 --- a/torchmorph/distance_transform.py +++ b/torchmorph/distance_transform.py @@ -7,7 +7,7 @@ def distance_transform(input: torch.Tensor) -> torch.Tensor: """Distance Transform in CUDA.""" if not input.is_cuda: raise ValueError("Input 
tensor must be on CUDA device.") - if input.ndim < 1 or input.numel() == 0: + if input.ndim < 2 or input.numel() == 0: raise ValueError(f"Invalid input dimension: {input.shape}.") # binarize input From 955e1857ab36926f9a3533967603e9b9c7f7c954 Mon Sep 17 00:00:00 2001 From: Yu Han Deng Date: Wed, 14 Jan 2026 03:14:29 +0800 Subject: [PATCH 49/56] Restore code to JFA --- test/test_edt_1d.py | 154 --- torchmorph/csrc/distance_transform_kernel.cu | 1176 +++++++++++++----- torchmorph/distance_transform.py | 2 +- 3 files changed, 884 insertions(+), 448 deletions(-) delete mode 100644 test/test_edt_1d.py diff --git a/test/test_edt_1d.py b/test/test_edt_1d.py deleted file mode 100644 index b81cd63..0000000 --- a/test/test_edt_1d.py +++ /dev/null @@ -1,154 +0,0 @@ -import unittest - -import torch - -import torchmorph - - -class Test1DEuclideanDistanceTransform(unittest.TestCase): - def setUp(self): - if not torch.cuda.is_available(): - self.skipTest("CUDA not available") - self.device = torch.device("cuda") - - def test_basic_features(self): - """Test with 32 elements and 3 feature points""" - print("\n=== Test 1: Basic Features (32 elements) ===") - input_tensor = torch.zeros(32, dtype=torch.float32, device=self.device) - input_tensor[0] = 1.0 - input_tensor[12] = 1.0 - input_tensor[31] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - # Check specific positions - self._check_position(dist, indices, 0, 0, 0) - self._check_position(dist, indices, 6, 0, 6) - self._check_position(dist, indices, 12, 12, 0) - self._check_position(dist, indices, 21, 12, 9) - self._check_position(dist, indices, 31, 31, 0) - - def test_multiple_features(self): - """Test with multiple feature points""" - print("\n=== Test 2: Multiple Features ===") - input_tensor = torch.zeros(32, dtype=torch.float32, device=self.device) - features = [0, 3, 7, 10, 14, 21, 31] - input_tensor[features] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - for pos in 
range(32): - self._verify_nearest(pos, dist, indices, features) - - def test_batch_processing(self): - """Test with 2D array (batch of 1D rows)""" - print("\n=== Test 3: Batch Processing (4x32) ===") - input_tensor = torch.zeros(4, 32, dtype=torch.float32, device=self.device) - input_tensor[0, [0, 15, 31]] = 1.0 - input_tensor[1, [5, 10, 20]] = 1.0 - input_tensor[2, [8, 24]] = 1.0 - input_tensor[3, [16]] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - self.assertEqual(dist.shape, (4, 32)) - - # Check row 0 - features_row0 = [0, 15, 31] - for pos in range(32): - self._verify_nearest(pos, dist[0], indices[0], features_row0) - - def test_boundary_conditions(self): - """Test empty and full feature arrays""" - print("\n=== Test 4: Boundary Conditions ===") - # No features - input_empty = torch.zeros(32, dtype=torch.float32, device=self.device) - dist_empty, idx_empty = torchmorph.distance_transform(input_empty) - self.assertTrue(torch.all(dist_empty > 1000)) # Should be large/inf - self.assertTrue(torch.all(idx_empty == -1)) - - # All features - input_full = torch.ones(32, dtype=torch.float32, device=self.device) - dist_full, idx_full = torchmorph.distance_transform(input_full) - self.assertTrue(torch.all(dist_full == 0)) - expected_idx = torch.arange(32, device=self.device, dtype=torch.int32).unsqueeze(-1) - self.assertTrue(torch.all(idx_full == expected_idx)) - - def test_large_array(self): - """Test large array to verify cross-tile propagation""" - print("\n=== Test 5: Large Array (1024 elements) ===") - input_tensor = torch.zeros(1024, dtype=torch.float32, device=self.device) - features = [0, 512, 1023] - input_tensor[features] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - test_positions = [0, 256, 512, 768, 1023] - for pos in test_positions: - self._verify_nearest(pos, dist, indices.squeeze(), features) - - def test_cross_tile_boundary(self): - """Test propagation across tile boundaries""" - print("\n=== Test 6: 
Cross-Tile Propagation ===") - # 768 elements (3 tiles of 256) - input_tensor = torch.zeros(768, dtype=torch.float32, device=self.device) - features = [100, 600] - input_tensor[features] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - # Check around boundaries (256, 512) - test_positions = [250, 255, 256, 260, 350, 500, 510, 512, 520] - for pos in test_positions: - self._verify_nearest(pos, dist, indices.squeeze(), features) - - def test_large_2d_batch(self): - """Test large 2D batch""" - print("\n=== Test 7: Large 2D Batch ===") - input_tensor = torch.zeros(3, 600, dtype=torch.float32, device=self.device) - rows_features = { - 0: [0, 299, 599], - 1: [150, 450], - 2: [300], - } - - for row, feats in rows_features.items(): - input_tensor[row, feats] = 1.0 - - dist, indices = torchmorph.distance_transform(input_tensor) - - # Verify specific points with dynamic calculation - test_cases = [ - (0, 150), - (0, 450), - (1, 300), - (2, 100), - (2, 500), - ] - - for row, pos in test_cases: - self._verify_nearest(pos, dist[row], indices[row].squeeze(), rows_features[row]) - - def _check_position(self, dist, indices, pos, expected_idx, expected_dist): - actual_dist = dist[pos].item() - actual_idx = indices[pos].item() if indices.ndim == 1 else indices[pos, 0].item() - - self.assertAlmostEqual(actual_dist, float(expected_dist), places=1) - self.assertEqual(actual_idx, expected_idx) - - def _verify_nearest(self, pos, dist, indices, features): - actual_dist = dist[pos].item() - nearest_idx = indices[pos].item() if indices.ndim == 1 else indices[pos].item() - - # Calculate ground truth dynamically - true_dists = [abs(pos - f) for f in features] - min_dist = min(true_dists) - candidates = [f for f, d in zip(features, true_dists) if d == min_dist] - - self.assertAlmostEqual( - actual_dist, float(min_dist), places=1, msg=f"Distance mismatch at {pos}" - ) - self.assertIn(nearest_idx, candidates, msg=f"Nearest index mismatch at {pos}") - - -if __name__ == 
"__main__": - unittest.main() diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 59b1d99..09d4cbc 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -1,355 +1,945 @@ #include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include + +// ------------------------------------------------------------------ +// Global Configuration +// ------------------------------------------------------------------ +#define BLOCK_SIZE 256 +#define INF_VAL 1e20f +#define MAX_THREADS 1024 +#define SMEM_LIMIT_ELEMENTS 4096 + +#define JFA_BLOCK_DIM 32 +#define JFA_FUSED_STEPS 4 +#define JFA_MAX_OFFSET 8 +#define JFA_SMEM_DIM (JFA_BLOCK_DIM + 2 * JFA_MAX_OFFSET) -// ============================================================================ -// 1D Euclidean Distance Transform - Optimized Warp-Level Parallel -// ============================================================================ -// -// Based on the paper's algorithm using: -// - __ballot_sync() for feature point voting -// - __shfl_sync() for warp-level communication (NO shared memory within warp) -// - Parallel reduction tree for cross-warp propagation -// - Time complexity: O(log32(n)) with O(n) total work -// -// ============================================================================ - -#define WARP_SIZE 32 -#define INF_VAL 1e9f +// 3D Config +#define JFA_3D_BLOCK 8 +#define JFA_3D_HALO 1 +// ------------------------------------------------------------------ +// Device Helpers +// ------------------------------------------------------------------ __device__ __forceinline__ float sqr(float x) { return x * x; } -// ============================================================================ -// Device Function: Find nearest feature to the LEFT using warp operations -// ============================================================================ -// -// Algorithm 
(as described in the paper, Figure 7a): -// 1. Each thread votes if it holds a feature point -> ballot() creates bitmask -// 2. Mask high (warpSize - lane - 1) bits with 0 -// 3. Use clz() to count leading zeros -// 4. Nearest thread lane = (warpSize - clz() - 1) -// 5. Use __shfl_sync() to get the feature index from that thread -// -// Returns: index of nearest feature to the left, or -1 if none exists -// ============================================================================ - -__device__ __forceinline__ int find_nearest_left_in_warp( - int lane, - int my_index, - unsigned int feature_mask +// Helper for JFA 2D/3D (Standard) +__device__ __forceinline__ float dist_sq_2d(int y1, int x1, int y2, int x2) { + return sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); +} + +// Helper for SoA 3D (Z, Y, X separate) +__device__ __forceinline__ float dist_sq_3d_soa(int z1, int y1, int x1, int z2, int y2, int x2) { + if (z2 == -1) return INF_VAL; + float dz = (float)(z1 - z2); + float dy = (float)(y1 - y2); + float dx = (float)(x1 - x2); + return dz*dz + dy*dy + dx*dx; +} + +// Helper for Separable 1D +__device__ __forceinline__ float compute_cost(int q, int p, float val_p) { + if (p < 0 || val_p >= INF_VAL) return INF_VAL; + return sqr((float)q - (float)p) + val_p; +} + +// Device Helpers for int2 (2D Vectorized) +__device__ __forceinline__ float dist_sq_int2(int y, int x, int2 seed) { + if (seed.x == -1) return INF_VAL; + float dy = (float)(y - seed.x); + float dx = (float)(x - seed.y); + return dy*dy + dx*dx; +} + +// ================================================================== +// PART 1: JFA KERNELS 2D (Vectorized int2 + Block Shared) +// ================================================================== + +__global__ void init_jfa_kernel_2d_opt( + const float* __restrict__ input, + int2* __restrict__ output, + int64_t total_elements, + int H, int W ) { - // Mask high bits: only keep features to the LEFT of current lane - unsigned int left_mask = feature_mask & 
((1U << lane) - 1); - - // We must execute __shfl_sync for ALL threads in the warp - // Calculate nearest_lane if valid, otherwise use 0 (safe default) - int nearest_lane = 0; - if (left_mask != 0) { - nearest_lane = 31 - __clz(left_mask); + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + if (input[tid] == 0.0f) { + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int w = (int)(rem % W); + int h = (int)(rem / W); + output[tid] = make_int2(h, w); + } else { + output[tid] = make_int2(-1, -1); } - - // Perform shuffle for ALL threads - int nearest_index = __shfl_sync(0xFFFFFFFF, my_index, nearest_lane); - - // Only return valid result if we actually found a feature - return (left_mask != 0) ? nearest_index : -1; } -// ============================================================================ -// Device Function: Find nearest feature to the RIGHT using warp operations -// ============================================================================ - -__device__ __forceinline__ int find_nearest_right_in_warp( - int lane, - int my_index, - unsigned int feature_mask +__global__ void jfa_block_fused_kernel_2d( + const int2* __restrict__ in_idx, + int2* __restrict__ out_idx, + int H, int W, + int64_t num_images ) { - // Mask low bits: only keep features to the RIGHT of current lane - unsigned int right_mask = feature_mask & ~((1U << (lane + 1)) - 1); + __shared__ int2 smem[JFA_SMEM_DIM][JFA_SMEM_DIM]; + + int tx = threadIdx.x; + int ty = threadIdx.y; - // Calculate nearest_lane if valid, otherwise use 0 (safe default) - int nearest_lane = 0; - if (right_mask != 0) { - nearest_lane = __ffs(right_mask) - 1; + int bx = blockIdx.x * blockDim.x; + int by = blockIdx.y * blockDim.y; + int img_idx = blockIdx.z; + int64_t batch_offset = (int64_t)img_idx * (H * W); + + int gx = bx + tx; + int gy = by + ty; + + // Phase 1: load data to Shared Memory + int smem_linear_size = JFA_SMEM_DIM * JFA_SMEM_DIM; + int 
total_threads = blockDim.x * blockDim.y; + int thread_linear_idx = ty * blockDim.x + tx; + + int base_x = bx - JFA_MAX_OFFSET; + int base_y = by - JFA_MAX_OFFSET; + + for (int i = thread_linear_idx; i < smem_linear_size; i += total_threads) { + int s_y = i / JFA_SMEM_DIM; + int s_x = i % JFA_SMEM_DIM; + int global_y = base_y + s_y; + int global_x = base_x + s_x; + int2 val = make_int2(-1, -1); + if (global_y >= 0 && global_y < H && global_x >= 0 && global_x < W) { + val = in_idx[batch_offset + global_y * W + global_x]; + } + smem[s_y][s_x] = val; } - - // Perform shuffle for ALL threads - int nearest_index = __shfl_sync(0xFFFFFFFF, my_index, nearest_lane); - - return (right_mask != 0) ? nearest_index : -1; -} + __syncthreads(); -// ============================================================================ -// Warp Scan Helpers -// ============================================================================ + // Phase 2: Iterate in Shared Memory + if (gx < W && gy < H) { + int center_sy = ty + JFA_MAX_OFFSET; + int center_sx = tx + JFA_MAX_OFFSET; -// Inclusive Max Scan for positive integers (returns max seen so far) -__device__ __forceinline__ int warp_scan_inclusive_max(int val, int width) { - // Hillis-Steele Scan (O(log N)) - #pragma unroll - for (int offset = 1; offset < 32; offset *= 2) { - int neighbor_val = __shfl_up_sync(0xFFFFFFFF, val, offset); - if (threadIdx.x % 32 >= offset) { - // Logic: take the max of current and neighbor - // Handle -1 (invalid) carefully: max behavior handles -1 naturally if features are >= 0 - if (neighbor_val > val) val = neighbor_val; + int2 best_seed = smem[center_sy][center_sx]; + float best_dist = dist_sq_int2(gy, gx, best_seed); + + int step = 1; + #pragma unroll + for (int k = 0; k < JFA_FUSED_STEPS; ++k) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dy == 0 && dx == 0) continue; + int2 neighbor_seed = smem[center_sy + dy * step][center_sx + dx * 
step]; + if (neighbor_seed.x != -1) { + float d = dist_sq_int2(gy, gx, neighbor_seed); + if (d < best_dist) { + best_dist = d; + best_seed = neighbor_seed; + } + } + } + } + __syncthreads(); + smem[center_sy][center_sx] = best_seed; + __syncthreads(); + step *= 2; } + out_idx[batch_offset + gy * W + gx] = best_seed; } - return val; } -// Inclusive Min Scan (returns min seen so far from right) -// Note: We use __shfl_down_sync for suffix scan -__device__ __forceinline__ int warp_scan_suffix_min(int val, int width) { - // Suffix Scan (Right to Left) +__global__ void jfa_step_global_2d_opt( + const int2* __restrict__ in_idx, + int2* __restrict__ out_idx, + int step, + int H, int W, + int64_t total_pixels +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_pixels) return; + + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int64_t batch_offset = tid - rem; + int w = (int)(rem % W); + int h = (int)(rem / W); + + int2 best_seed = in_idx[tid]; + float best_dist = dist_sq_int2(h, w, best_seed); + #pragma unroll - for (int offset = 1; offset < 32; offset *= 2) { - int neighbor_val = __shfl_down_sync(0xFFFFFFFF, val, offset); - // If we have a neighbor to the right - if ((threadIdx.x % 32) + offset < width) { - // Logic: take min. If current is -1 (invalid), take neighbor. - // If neighbor is -1, ignore it. 
- if (val == -1) val = neighbor_val; - else if (neighbor_val != -1) { - if (neighbor_val < val) val = neighbor_val; + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dx == 0 && dy == 0) continue; + + int ny = h + dy * step; + int nx = w + dx * step; + + if (ny >= 0 && ny < H && nx >= 0 && nx < W) { + int2 neighbor_seed = in_idx[batch_offset + ny * W + nx]; + if (neighbor_seed.x != -1) { + float d = dist_sq_int2(h, w, neighbor_seed); + if (d < best_dist) { + best_dist = d; + best_seed = neighbor_seed; + } + } } } } - return val; + out_idx[tid] = best_seed; +} + +__global__ void calc_dist_kernel_2d_opt( + const int2* __restrict__ indices, + float* __restrict__ dist_out, + int64_t total_elements, + int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + int2 s = indices[tid]; + if (s.x == -1) { + dist_out[tid] = INF_VAL; + } else { + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)(rem / W); + dist_out[tid] = sqrtf(dist_sq_int2(cur_h, cur_w, s)); + } } +// ================================================================== +// PART 2: JFA KERNELS 3D (Optimized SoA Layout) +// ================================================================== + +template +__global__ void init_jfa_kernel_3d_soa( + const float* __restrict__ input, + IndexType* __restrict__ indices_z, + IndexType* __restrict__ indices_y, + IndexType* __restrict__ indices_x, + int64_t total_elements, + int D, int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; -// ============================================================================ -// Kernel: Optimized 1D EDT using Two-Level Tree reduction -// ============================================================================ + if (input[tid] == 0.0f) { + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % 
spatial_size; + int w = (int)(rem % W); + int h = (int)((rem / W) % H); + int d = (int)(rem / (W * H)); -__global__ void edt_1d_warp_optimized_kernel( - const float* __restrict__ d_input, - float* __restrict__ d_dist, - int32_t* __restrict__ d_indices, - int width, - int height + indices_z[tid] = (IndexType)d; + indices_y[tid] = (IndexType)h; + indices_x[tid] = (IndexType)w; + } else { + indices_z[tid] = (IndexType)-1; + indices_y[tid] = (IndexType)-1; + indices_x[tid] = (IndexType)-1; + } +} + +template +__global__ void jfa_block_fused_kernel_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + IndexType* __restrict__ out_z, + IndexType* __restrict__ out_y, + IndexType* __restrict__ out_x, + int D, int H, int W, + int blocks_per_d ) { - int row = blockIdx.x; - if (row >= height) return; - - const float* row_input = d_input + row * width; - float* row_dist = d_dist + row * width; - int32_t* row_indices = d_indices + row * width; - - int tid = threadIdx.x; - int lane = tid % WARP_SIZE; - int warp_id = tid / WARP_SIZE; - int num_warps = blockDim.x / WARP_SIZE; - - // Shared memory for Inter-Warp Scan - // We use one buffer for the reduction result - __shared__ int s_warp_boundary[32]; - - // ======================================================================== - // PASS 1: Find nearest feature to the LEFT (Prefix Max Scan) - // ======================================================================== - - int global_left_feature = -1; + const int BLOCK_DIM = 8; + const int HALO = 3; + const int SMEM_DIM = BLOCK_DIM + 2 * HALO; // 14 + const int SMEM_SIZE = SMEM_DIM * SMEM_DIM * SMEM_DIM; + + extern __shared__ char smem_raw[]; + IndexType* smem_z = (IndexType*)smem_raw; + IndexType* smem_y = smem_z + SMEM_SIZE; + IndexType* smem_x = smem_y + SMEM_SIZE; + + int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z; + + int b_z_total = blockIdx.z; + int batch_id = b_z_total / blocks_per_d; + 
int b_z_local = b_z_total % blocks_per_d; - for (int base = 0; base < width; base += blockDim.x) { - int i = base + tid; - bool is_valid = (i < width); - bool is_feature = is_valid && (row_input[i] > 0.5f); - - // 1. Warp-Level: Find local nearest - unsigned int feature_mask = __ballot_sync(0xFFFFFFFF, is_feature); - int my_index = is_feature ? i : -1; - int warp_left_feature = find_nearest_left_in_warp(lane, my_index, feature_mask); - - // 2. Prepare for Block-Level Scan: Write rightmost feature of this warp - int rightmost_lane = (feature_mask != 0) ? (31 - __clz(feature_mask)) : 0; - int rightmost_index = __shfl_sync(0xFFFFFFFF, my_index, rightmost_lane); - - if (lane == 0) { - s_warp_boundary[warp_id] = (feature_mask != 0) ? rightmost_index : -1; - } - __syncthreads(); - - // 3. Block-Level: Warp 0 performs parallel prefix scan over warp boundaries - // This is the "Tree" part for inter-warp communication - if (warp_id == 0) { - // Load boundary from shared memory (only if valid warp) - int val = (lane < num_warps) ? 
s_warp_boundary[lane] : -1; - - // Perform inclusive max scan - int scan_res = warp_scan_inclusive_max(val, num_warps); - - // Write back inclusive scan result - if (lane < num_warps) { - s_warp_boundary[lane] = scan_res; - } + int bx = blockIdx.x * BLOCK_DIM; + int by = blockIdx.y * BLOCK_DIM; + int bz = b_z_local * BLOCK_DIM; + + int64_t spatial_offset = (int64_t)batch_id * (D * H * W); + + // Phase 1: Load to SoA Shared Memory + int tid = tz * 64 + ty * 8 + tx; + int base_x = bx - HALO; + int base_y = by - HALO; + int base_z = bz - HALO; + + for (int i = tid; i < SMEM_SIZE; i += 512) { + int temp = i; + int sx = temp % SMEM_DIM; temp /= SMEM_DIM; + int sy = temp % SMEM_DIM; + int sz = temp / SMEM_DIM; + + int gx = base_x + sx; + int gy = base_y + sy; + int gz = base_z + sz; + + IndexType val_z = -1, val_y = -1, val_x = -1; + if (gz >= 0 && gz < D && gy >= 0 && gy < H && gx >= 0 && gx < W) { + int64_t idx = spatial_offset + (int64_t)gz * (H * W) + gy * W + gx; + val_z = in_z[idx]; + val_y = in_y[idx]; + val_x = in_x[idx]; } - __syncthreads(); - - // 4. 
Combine Results - if (is_valid) { - int left_feature = -1; - - if (is_feature) left_feature = i; - else if (warp_left_feature != -1) left_feature = warp_left_feature; - else { - // Look at the scan result from the PREVIOUS warp - if (warp_id > 0) { - left_feature = s_warp_boundary[warp_id - 1]; + smem_z[i] = val_z; + smem_y[i] = val_y; + smem_x[i] = val_x; + } + __syncthreads(); + + // Phase 2: Compute + int center_sz = tz + HALO; + int center_sy = ty + HALO; + int center_sx = tx + HALO; + int my_s_idx = (center_sz * SMEM_DIM + center_sy) * SMEM_DIM + center_sx; + + int best_z = (int)smem_z[my_s_idx]; + int best_y = (int)smem_y[my_s_idx]; + int best_x = (int)smem_x[my_s_idx]; + + int g_cz = bz + tz; + int g_cy = by + ty; + int g_cx = bx + tx; + + float best_dist = dist_sq_3d_soa(g_cz, g_cy, g_cx, best_z, best_y, best_x); + + int step = 1; + #pragma unroll + for (int k = 0; k < 2; ++k) { + #pragma unroll + for (int dz = -1; dz <= 1; ++dz) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dz == 0 && dy == 0 && dx == 0) continue; + + int nz = center_sz + dz * step; + int ny = center_sy + dy * step; + int nx = center_sx + dx * step; + int n_idx = (nz * SMEM_DIM + ny) * SMEM_DIM + nx; + + int sz_in = (int)smem_z[n_idx]; + if (sz_in != -1) { + int sy_in = (int)smem_y[n_idx]; + int sx_in = (int)smem_x[n_idx]; + float d = dist_sq_3d_soa(g_cz, g_cy, g_cx, sz_in, sy_in, sx_in); + if (d < best_dist) { + best_dist = d; + best_z = sz_in; + best_y = sy_in; + best_x = sx_in; + } + } } - - // If still -1, fallback to global history - if (left_feature == -1) left_feature = global_left_feature; - } - - // Store result - if (left_feature >= 0) { - row_dist[i] = sqr((float)(i - left_feature)); - row_indices[i] = left_feature; - } else { - row_dist[i] = INF_VAL; - row_indices[i] = -1; } } - - // 5. 
Update Global History - // The last warp's scan result contains the max index for the whole tile - int tile_max = s_warp_boundary[num_warps - 1]; - if (tile_max != -1) global_left_feature = tile_max; - __syncthreads(); + smem_z[my_s_idx] = (IndexType)best_z; + smem_y[my_s_idx] = (IndexType)best_y; + smem_x[my_s_idx] = (IndexType)best_x; + __syncthreads(); + step *= 2; } + + if (g_cz < D && g_cy < H && g_cx < W) { + int64_t out_idx_g = spatial_offset + (int64_t)g_cz * (H * W) + g_cy * W + g_cx; + out_z[out_idx_g] = (IndexType)best_z; + out_y[out_idx_g] = (IndexType)best_y; + out_x[out_idx_g] = (IndexType)best_x; + } +} + +template +__global__ void jfa_step_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + IndexType* __restrict__ out_z, + IndexType* __restrict__ out_y, + IndexType* __restrict__ out_x, + int step, + int D, int H, int W, + int64_t total_pixels +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_pixels) return; + + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int64_t batch_offset = tid - rem; + int cur_w = (int)(rem % W); + int cur_h = (int)((rem / W) % H); + int cur_d = (int)(rem / (W * H)); + + int best_z = (int)in_z[tid]; + int best_y = (int)in_y[tid]; + int best_x = (int)in_x[tid]; - // ======================================================================== - // PASS 2: Find nearest feature to the RIGHT (Suffix Min Scan) - // ======================================================================== - - int global_right_feature = -1; - int num_tiles = (width + blockDim.x - 1) / blockDim.x; - - for (int tile = num_tiles - 1; tile >= 0; --tile) { - int base = tile * blockDim.x; - int i = base + tid; - bool is_valid = (i < width); - bool is_feature = is_valid && (row_input[i] > 0.5f); - - // 1. Warp-Level - unsigned int feature_mask = __ballot_sync(0xFFFFFFFF, is_feature); - int my_index = is_feature ? 
i : -1; - int warp_right_feature = find_nearest_right_in_warp(lane, my_index, feature_mask); - - // 2. Prepare: Write leftmost feature of this warp - int leftmost_lane = (feature_mask != 0) ? (__ffs(feature_mask) - 1) : 0; - int leftmost_index = __shfl_sync(0xFFFFFFFF, my_index, leftmost_lane); - - if (lane == 0) { - s_warp_boundary[warp_id] = (feature_mask != 0) ? leftmost_index : -1; - } - __syncthreads(); - - // 3. Block-Level: Warp 0 performs parallel suffix scan (Right-to-Left tree) - if (warp_id == 0) { - int val = (lane < num_warps) ? s_warp_boundary[lane] : -1; - - // Perform suffix min scan - int scan_res = warp_scan_suffix_min(val, num_warps); - - if (lane < num_warps) { - s_warp_boundary[lane] = scan_res; + float best_dist = dist_sq_3d_soa(cur_d, cur_h, cur_w, best_z, best_y, best_x); + + #pragma unroll + for (int dz = -1; dz <= 1; ++dz) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dz == 0 && dy == 0 && dx == 0) continue; + + int nz = cur_d + dz * step; + int ny = cur_h + dy * step; + int nx = cur_w + dx * step; + + if (nz >= 0 && nz < D && ny >= 0 && ny < H && nx >= 0 && nx < W) { + int64_t n_idx = batch_offset + (int64_t)nz * (H * W) + ny * W + nx; + + int seed_z = (int)in_z[n_idx]; + if (seed_z != -1) { + float dz_val = (float)(cur_d - seed_z); + float dz_sq = dz_val * dz_val; + + if (dz_sq < best_dist) { + int seed_y = (int)in_y[n_idx]; + int seed_x = (int)in_x[n_idx]; + float dist = dz_sq + sqr((float)(cur_h - seed_y)) + sqr((float)(cur_w - seed_x)); + + if (dist < best_dist) { + best_dist = dist; + best_z = seed_z; + best_y = seed_y; + best_x = seed_x; + } + } + } + } } } - __syncthreads(); + } + out_z[tid] = (IndexType)best_z; + out_y[tid] = (IndexType)best_y; + out_x[tid] = (IndexType)best_x; +} + +template +__global__ void calc_dist_kernel_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + float* 
__restrict__ dist_out, + int64_t total_elements, + int D, int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + int seed_d = (int)in_z[tid]; + if (seed_d == -1) { + dist_out[tid] = INF_VAL; + } else { + int seed_h = (int)in_y[tid]; + int seed_w = (int)in_x[tid]; + + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)((rem / W) % H); + int cur_d = (int)(rem / (W * H)); - // 4. Combine Results - if (is_valid) { - int right_feature = -1; - - if (is_feature) right_feature = i; - else if (warp_right_feature != -1) right_feature = warp_right_feature; - else { - // Look at scan result from NEXT warp - if (warp_id < num_warps - 1) { - right_feature = s_warp_boundary[warp_id + 1]; + dist_out[tid] = sqrtf(dist_sq_3d_soa(cur_d, cur_h, cur_w, seed_d, seed_h, seed_w)); + } +} + +// ================================================================== +// PART 3: SEPARABLE N-DIM KERNELS +// ================================================================== + +__device__ void run_separable_scan_core( + int N, + int tid, + const float* __restrict__ vals, + int* __restrict__ idx_curr, + int* __restrict__ idx_next +) { + for (int i = tid; i < N; i += blockDim.x) { + if (vals[i] >= INF_VAL * 0.9f) idx_curr[i] = -1; + else idx_curr[i] = i; + } + __syncthreads(); + + int* idx_in = idx_curr; + int* idx_out = idx_next; + + for (int step = 1; step < N; step *= 2) { + for (int i = tid; i < N; i += blockDim.x) { + int my_best_p = idx_in[i]; + float min_cost = INF_VAL; + + if (my_best_p != -1) min_cost = compute_cost(i, my_best_p, vals[my_best_p]); + + int left = i - step; + if (left >= 0) { + int left_p = idx_in[left]; + if (left_p != -1) { + float c = compute_cost(i, left_p, vals[left_p]); + if (c < min_cost) { min_cost = c; my_best_p = left_p; } } - - if (right_feature == -1) right_feature = global_right_feature; } - - // Update Min Distance - if 
(right_feature >= 0) { - float d = sqr((float)(right_feature - i)); - if (d < row_dist[i]) { - row_dist[i] = d; - row_indices[i] = right_feature; + + int right = i + step; + if (right < N) { + int right_p = idx_in[right]; + if (right_p != -1) { + float c = compute_cost(i, right_p, vals[right_p]); + if (c < min_cost) { min_cost = c; my_best_p = right_p; } } } + idx_out[i] = my_best_p; } - - // 5. Update Global History - // First warp's scan result contains min index for whole tile - int tile_min = s_warp_boundary[0]; - if (tile_min != -1) global_right_feature = tile_min; - + int* temp = idx_in; idx_in = idx_out; idx_out = temp; + __syncthreads(); + } + + if (idx_in != idx_curr) { + for (int i = tid; i < N; i += blockDim.x) idx_curr[i] = idx_next[i]; __syncthreads(); } } -// ============================================================================ -// PyTorch Wrapper Function -// ============================================================================ +template +__global__ void separable_kernel_shared( + const float* __restrict__ in_data, + const int32_t* __restrict__ in_indices, + float* __restrict__ out_dist, + int32_t* __restrict__ out_indices, + int64_t L, + int64_t total_elements, + int coord_ndim +) { + int64_t row_idx = blockIdx.x; + int64_t offset = row_idx * L; + if (offset >= total_elements) return; -std::tuple distance_transform_cuda(torch::Tensor input) { - TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); - TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32"); - TORCH_CHECK(input.dim() >= 1, "Input must be at least 1D"); - - // Get dimensions - int64_t ndim = input.dim(); - int64_t width = input.size(-1); - int64_t height = 1; + extern __shared__ char s_buffer[]; + float* s_vals = (float*)s_buffer; + int* s_idx1 = (int*)(s_vals + L); + int* s_idx2 = (int*)(s_idx1 + L); + + for (int i = threadIdx.x; i < L; i += blockDim.x) { + s_vals[i] = __ldg(&in_data[offset + i]); + } + __syncthreads(); + + 
run_separable_scan_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); + + for (int q = threadIdx.x; q < L; q += blockDim.x) { + int p = s_idx1[q]; + float dist_val; + + if (p != -1) { + float dist_sq = sqr((float)q - (float)p) + s_vals[p]; + dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; + } else { + dist_val = IsFinal ? INF_VAL : INF_VAL; + p = 0; + } + out_dist[offset + q] = dist_val; + + int64_t dst_base = (offset + q) * coord_ndim; + if (p != -1 && s_vals[p] < INF_VAL) { + int64_t src_base = (offset + p) * coord_ndim; + for (int d = 0; d < coord_ndim; ++d) { + out_indices[dst_base + d] = in_indices[src_base + d]; + } + } else { + for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; + } + } +} + +template +__global__ void separable_kernel_global( + const float* __restrict__ in_data, + const int32_t* __restrict__ in_indices, + float* __restrict__ out_dist, + int32_t* __restrict__ out_indices, + int* __restrict__ global_buffer_1, + int* __restrict__ global_buffer_2, + int64_t L, + int64_t total_elements, + int coord_ndim +) { + int64_t row_idx = blockIdx.x; + int64_t offset = row_idx * L; + if (offset >= total_elements) return; + + int* g_idx1 = global_buffer_1 + offset; + int* g_idx2 = global_buffer_2 + offset; - if (ndim >= 2) { - height = input.size(-2); + run_separable_scan_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); + + for (int q = threadIdx.x; q < L; q += blockDim.x) { + int p = g_idx1[q]; + float dist_val; + if (p != -1) { + float val_p = in_data[offset + p]; + float dist_sq = sqr((float)q - (float)p) + val_p; + dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; + } else { + dist_val = IsFinal ? 
INF_VAL : INF_VAL; + p = 0; + } + out_dist[offset + q] = dist_val; + + int64_t dst_base = (offset + q) * coord_ndim; + if (p != -1 && in_data[offset + p] < INF_VAL) { + int64_t src_base = (offset + p) * coord_ndim; + for (int d = 0; d < coord_ndim; ++d) { + out_indices[dst_base + d] = in_indices[src_base + d]; + } + } else { + for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; + } + } +} + +__global__ void init_indices_separable_kernel( + int32_t* indices, + int64_t total_pixels, + int NDim, + const int64_t* __restrict__ shape_ptr +) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_pixels) return; + + int64_t temp = idx; + int32_t coords[8]; + for (int d = NDim - 1; d >= 0; --d) { + int64_t dim_size = shape_ptr[d]; + coords[d] = temp % dim_size; + temp /= dim_size; } + int64_t out_ptr = idx * NDim; + for (int d = 0; d < NDim; ++d) indices[out_ptr + d] = coords[d]; +} + +// ================================================================== +// PART 4: DISPATCH HELPERS +// ================================================================== + +std::tuple run_jfa_2d( + torch::Tensor input, int64_t H, int64_t W, int grid, int block, int64_t numel +) { + auto index_opts = input.options().dtype(torch::kInt32); + auto idx_shape = input.sizes().vec(); + idx_shape.push_back(2); + auto curr_idx = torch::empty(idx_shape, index_opts); + auto next_idx = torch::empty(idx_shape, index_opts); - int64_t batch_size = input.numel() / (width * height); + int2* d_curr = (int2*)curr_idx.data_ptr(); + int2* d_next = (int2*)next_idx.data_ptr(); + + init_jfa_kernel_2d_opt<<>>( + input.data_ptr(), d_curr, numel, H, W + ); + + { + dim3 dimBlock(JFA_BLOCK_DIM, JFA_BLOCK_DIM); + int64_t batch_size = numel / (H * W); + dim3 dimGrid((W + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, + (H + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, + batch_size); + + jfa_block_fused_kernel_2d<<>>(d_curr, d_next, H, W, batch_size); + std::swap(d_curr, d_next); + std::swap(curr_idx, 
next_idx); + } + + int max_dim = std::max((int)H, (int)W); + int step = 16; + + while (step < max_dim) { + jfa_step_global_2d_opt<<>>(d_curr, d_next, step, H, W, numel); + std::swap(d_curr, d_next); + std::swap(curr_idx, next_idx); + step *= 2; + } - // Flatten to [batch * height, width] - auto input_flat = input.view({batch_size * height, width}); + auto final_dist = torch::empty_like(input); + calc_dist_kernel_2d_opt<<>>(d_curr, final_dist.data_ptr(), numel, H, W); + + return std::make_tuple(final_dist, curr_idx); +} + + +std::tuple run_jfa_3d( + torch::Tensor input, int64_t D, int64_t H, int64_t W, int grid, int block, int64_t numel +) { + bool use_int16 = (D < 32767 && H < 32767 && W < 32767); + auto index_opts = input.options().dtype(use_int16 ? torch::kInt16 : torch::kInt32); - // Create output tensors - auto dist_map = torch::empty_like(input); - auto dist_flat = dist_map.view({batch_size * height, width}); + int64_t batch = numel / (D * H * W); - // Index map: same shape as input + last dimension for coordinate - auto idx_shape = input.sizes().vec(); - idx_shape.push_back(1); - auto idx_map = torch::empty(idx_shape, input.options().dtype(torch::kInt32)); - auto idx_flat = idx_map.view({batch_size * height, width, 1}); + // (3, Batch, D, H, W) + auto curr_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); + auto next_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); - // Launch kernel - // Use 256 threads per block (8 warps) for good occupancy - int threads_per_block = 256; - int num_rows = batch_size * height; + void* d_curr = curr_idx_soa.data_ptr(); + void* d_next = next_idx_soa.data_ptr(); + int64_t plane_stride = numel; // B*D*H*W + + // 1. 
Init + if (use_int16) { + int16_t* ptr = (int16_t*)d_curr; + init_jfa_kernel_3d_soa<<>>( + input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W + ); + } else { + int32_t* ptr = (int32_t*)d_curr; + init_jfa_kernel_3d_soa<<>>( + input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W + ); + } + + // 2. Fused Steps + int block_dim = 8; + int blocks_per_d = (D + block_dim - 1) / block_dim; + dim3 fused_block(block_dim, block_dim, block_dim); + dim3 fused_grid((W + block_dim - 1) / block_dim, (H + block_dim - 1) / block_dim, blocks_per_d * batch); + size_t smem_bytes = (14*14*14) * 3 * (use_int16 ? 2 : 4); + + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + int16_t* n = (int16_t*)d_next; + jfa_block_fused_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + D, H, W, blocks_per_d + ); + } else { + int32_t* c = (int32_t*)d_curr; + int32_t* n = (int32_t*)d_next; + jfa_block_fused_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + D, H, W, blocks_per_d + ); + } + std::swap(d_curr, d_next); + + // 3. Global Steps + int max_dim = std::max({(int)D, (int)H, (int)W}); + int step = 4; + while (step < max_dim) { + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + int16_t* n = (int16_t*)d_next; + jfa_step_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + step, D, H, W, numel + ); + } else { + int32_t* c = (int32_t*)d_curr; + int32_t* n = (int32_t*)d_next; + jfa_step_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + step, D, H, W, numel + ); + } + std::swap(d_curr, d_next); + step *= 2; + } + + // 4. 
Final Dist + auto final_dist = torch::empty_like(input); + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + calc_dist_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + final_dist.data_ptr(), numel, D, H, W + ); + } else { + int32_t* c = (int32_t*)d_curr; + calc_dist_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + final_dist.data_ptr(), numel, D, H, W + ); + } + + // Permute result indices back to (Batch, D, H, W, 3) + torch::Tensor result_indices; + if (d_curr == curr_idx_soa.data_ptr()) result_indices = curr_idx_soa; + else result_indices = next_idx_soa; - dim3 block(threads_per_block); - dim3 grid(num_rows); + result_indices = result_indices.permute({1, 2, 3, 4, 0}).contiguous(); + + return std::make_tuple(final_dist, result_indices); +} + +std::tuple run_separable_ndim(torch::Tensor input) { + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Separable N-Dim input must be float32."); + input = input.contiguous(); - edt_1d_warp_optimized_kernel<<>>( - input_flat.data_ptr(), - dist_flat.data_ptr(), - idx_flat.data_ptr(), - width, - num_rows - ); + const int ndim = input.dim(); + const int sample_ndim = ndim - 1; + TORCH_CHECK(sample_ndim > 0 && sample_ndim <= 8, "Unsupported dims for Separable EDT"); - // Check for errors - cudaError_t err = cudaGetLastError(); - TORCH_CHECK(err == cudaSuccess, "CUDA kernel failed: ", cudaGetErrorString(err)); + auto shape = input.sizes().vec(); + int64_t num_pixels = input.numel(); + + auto current_dist = torch::where(input == 0, + torch::tensor(0.0f, input.options()), + torch::tensor(INF_VAL, input.options())); - // Take square root to get actual distance (not squared) - dist_map = torch::sqrt(dist_map); + auto index_shape = shape; + index_shape.push_back(sample_ndim); + auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - return std::make_tuple(dist_map, idx_map); + { + std::vector spatial_shape(shape.begin() + 1, shape.end()); + auto shape_tensor = 
torch::tensor(spatial_shape, torch::kInt64).to(input.device()); + int threads = 256; + int blocks = (num_pixels + threads - 1) / threads; + init_indices_separable_kernel<<>>( + current_idx.data_ptr(), num_pixels, sample_ndim, shape_tensor.data_ptr() + ); + } + + torch::Tensor global_buf1, global_buf2; + + for (int d = 1; d < ndim; ++d) { + bool is_final_pass = (d == ndim - 1); + + auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); + auto idx_in = current_idx.transpose(d, ndim - 1).contiguous(); + + auto dist_out = torch::empty(dist_in.sizes(), dist_in.options()); + auto idx_out = torch::empty(idx_in.sizes(), idx_in.options()); + + int64_t L = dist_in.size(-1); + int64_t total_slices = dist_in.numel() / L; + int threads = std::min((int64_t)MAX_THREADS, L); + + if (L <= SMEM_LIMIT_ELEMENTS) { + size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); + if (is_final_pass) { + separable_kernel_shared<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } else { + separable_kernel_shared<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } + } else { + if (global_buf1.numel() < dist_in.numel()) { + global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); + global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); + } + if (is_final_pass) { + separable_kernel_global<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + global_buf1.data_ptr(), global_buf2.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } else { + separable_kernel_global<<>>( + dist_in.data_ptr(), idx_in.data_ptr(), + dist_out.data_ptr(), idx_out.data_ptr(), + global_buf1.data_ptr(), global_buf2.data_ptr(), + L, dist_in.numel(), sample_ndim + ); + } + } + current_dist = 
dist_out.transpose(d, ndim - 1); + current_idx = idx_out.transpose(d, ndim - 1); + } + + return std::make_tuple(current_dist, current_idx); } + +// ================================================================== +// PART 5: MAIN ENTRY POINT +// ================================================================== + +std::tuple distance_transform_cuda(torch::Tensor input) { + TORCH_CHECK(input.is_cuda(), "Input must be CUDA tensor"); + input = input.contiguous(); + + int64_t dims = input.dim(); + int64_t numel = input.numel(); + int block = BLOCK_SIZE; + int grid = (numel + block - 1) / block; + + if (dims >= 5) { + return run_separable_ndim(input); + } + else if (dims == 4) { + int64_t dim1 = input.size(1); + if (dim1 == 1) { + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_2d(input, H, W, grid, block, numel); + } + else { + int64_t D = dim1; + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_3d(input, D, H, W, grid, block, numel); + } + } + else if (dims == 3) { + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_2d(input, H, W, grid, block, numel); + } + else if (dims == 2) { + int64_t H = 1; + int64_t W = input.size(-1); + auto result = run_jfa_2d(input, H, W, grid, block, numel); + torch::Tensor dist = std::get<0>(result); + torch::Tensor idx_2d = std::get<1>(result); + auto idx_1d = idx_2d.slice(/*dim=*/-1, /*start=*/1, /*end=*/2).contiguous(); + return std::make_tuple(dist, idx_1d); + } + else { + TORCH_CHECK(false, "Unsupported dimensions."); + return std::make_tuple(torch::Tensor(), torch::Tensor()); + } +} \ No newline at end of file diff --git a/torchmorph/distance_transform.py b/torchmorph/distance_transform.py index 7887c2a..868e84a 100644 --- a/torchmorph/distance_transform.py +++ b/torchmorph/distance_transform.py @@ -7,7 +7,7 @@ def distance_transform(input: torch.Tensor) -> torch.Tensor: """Distance Transform in CUDA.""" if not input.is_cuda: raise ValueError("Input 
tensor must be on CUDA device.") - if input.ndim < 1 or input.numel() == 0: + if input.ndim < 2 or input.numel() == 0: raise ValueError(f"Invalid input dimension: {input.shape}.") # binarize input From 4ce7da7d616c807ff15f8a5c051f637f610bce9d Mon Sep 17 00:00:00 2001 From: Yu Han Deng Date: Fri, 16 Jan 2026 14:55:45 +0800 Subject: [PATCH 50/56] test ping --- torchmorph/csrc/distance_transform_kernel.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu index 09d4cbc..817d637 100644 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ b/torchmorph/csrc/distance_transform_kernel.cu @@ -10,7 +10,7 @@ #include // ------------------------------------------------------------------ -// Global Configuration +// Global Configuration(test) // ------------------------------------------------------------------ #define BLOCK_SIZE 256 #define INF_VAL 1e20f From bb2bb7e9ee36d1b65e8c8c8ea5fbd0af864b0914 Mon Sep 17 00:00:00 2001 From: Yu Han Deng Date: Sat, 24 Jan 2026 22:48:39 +0800 Subject: [PATCH 51/56] Align scipy edt --- torchmorph/csrc/distance_transform_edt.cu | 470 ++++++++++++++++++++++ 1 file changed, 470 insertions(+) create mode 100644 torchmorph/csrc/distance_transform_edt.cu diff --git a/torchmorph/csrc/distance_transform_edt.cu b/torchmorph/csrc/distance_transform_edt.cu new file mode 100644 index 0000000..fee521c --- /dev/null +++ b/torchmorph/csrc/distance_transform_edt.cu @@ -0,0 +1,470 @@ +#include +#include +#include +#include +#include +#include + +// ============================================================================== +// Configuration +// ============================================================================== +#define INF_VAL 1e20f +#define MAX_THREADS 256 +#define SHARED_MEM_LIMIT 2048 // Max dimension size for shared memory path (48KB limit) + +// ============================================================================== +// 1D 
EDT kernel using GLOBAL memory (for large dimensions) +// ============================================================================== +__global__ void edt_1d_kernel_global( + const float* __restrict__ input, + float* __restrict__ output, + const int* __restrict__ input_idx, + int* __restrict__ output_idx, + float* __restrict__ g_v_val, + int* __restrict__ g_v_idx, + float* __restrict__ g_z, + int* __restrict__ g_k, + int64_t num_slices, + int64_t slice_len, + int64_t num_pixels, + int spatial_ndim, + int current_dim, + float spacing, + bool is_final, + bool compute_indices +) { + int64_t slice_idx = blockIdx.x; + if (slice_idx >= num_slices) return; + + int64_t base_offset = slice_idx * slice_len; + + float* v_val = g_v_val + base_offset; + int* v_idx = g_v_idx + base_offset; + float* z = g_z + slice_idx * (slice_len + 1); + int* k_ptr = g_k + slice_idx; + + int tid = threadIdx.x; + int num_threads = blockDim.x; + + // Load input values + for (int i = tid; i < slice_len; i += num_threads) { + v_val[i] = input[base_offset + i]; + } + __syncthreads(); + + // Build lower envelope (thread 0 only) + if (tid == 0) { + int k = -1; + + for (int q = 0; q < slice_len; q++) { + float fq = v_val[q]; + if (fq >= INF_VAL * 0.5f) continue; + + float q_pos = (float)q * spacing; + float q_pos_sq = q_pos * q_pos; + + while (k >= 0) { + int vk = v_idx[k]; + float vk_pos = (float)vk * spacing; + float fvk = v_val[vk]; + float s = ((fq + q_pos_sq) - (fvk + vk_pos * vk_pos)) / (2.0f * (q_pos - vk_pos)); + + if (s > z[k]) break; + k--; + } + + k++; + v_idx[k] = q; + + if (k == 0) { + z[0] = -INF_VAL; + } else { + int vk_prev = v_idx[k - 1]; + float vk_prev_pos = (float)vk_prev * spacing; + float fvk_prev = v_val[vk_prev]; + z[k] = ((fq + q_pos_sq) - (fvk_prev + vk_prev_pos * vk_prev_pos)) / + (2.0f * (q_pos - vk_prev_pos)); + } + z[k + 1] = INF_VAL; + } + *k_ptr = k; + } + __syncthreads(); + + int k = *k_ptr; + + // Parallel fill with binary search + for (int q = tid; q < slice_len; q 
+= num_threads) { + int64_t out_idx = base_offset + q; + + if (k < 0) { + output[out_idx] = INF_VAL; + if (compute_indices) { + for (int d = 0; d < spatial_ndim; d++) { + output_idx[d * num_pixels + out_idx] = 0; + } + } + } else { + float q_pos = (float)q * spacing; + + // Binary search + int lo = 0, hi = k; + while (lo < hi) { + int mid = (lo + hi + 1) / 2; + if (z[mid] <= q_pos) lo = mid; + else hi = mid - 1; + } + + int nearest = v_idx[lo]; + float nearest_pos = (float)nearest * spacing; + float diff = q_pos - nearest_pos; + float dist_sq = diff * diff + v_val[nearest]; + + output[out_idx] = is_final ? sqrtf(dist_sq) : dist_sq; + + if (compute_indices) { + int64_t src_idx = base_offset + nearest; + for (int d = 0; d < spatial_ndim; d++) { + if (d == current_dim) { + output_idx[d * num_pixels + out_idx] = nearest; + } else { + output_idx[d * num_pixels + out_idx] = input_idx[d * num_pixels + src_idx]; + } + } + } + } + } +} + +// ============================================================================== +// 1D Euclidean Distance Transform (Felzenszwalb & Huttenlocher) +// ============================================================================== +__global__ void edt_1d_kernel( + const float* __restrict__ input, + float* __restrict__ output, + const int* __restrict__ input_idx, + int* __restrict__ output_idx, + int64_t num_slices, + int64_t slice_len, + int64_t num_pixels, + int spatial_ndim, + int current_dim, + float spacing, + bool is_final, + bool compute_indices +) { + int64_t slice_idx = blockIdx.x; + if (slice_idx >= num_slices) return; + + int64_t base_offset = slice_idx * slice_len; + + extern __shared__ char shared_mem[]; + float* v_val = (float*)shared_mem; + int* v_idx = (int*)(v_val + slice_len); + float* z = (float*)(v_idx + slice_len); + + int tid = threadIdx.x; + int num_threads = blockDim.x; + + // Load input values into shared memory + for (int i = tid; i < slice_len; i += num_threads) { + v_val[i] = input[base_offset + i]; + } + 
__syncthreads(); + + // Build lower envelope (thread 0 only) + __shared__ int k_shared; + + if (tid == 0) { + int k = -1; + + for (int q = 0; q < slice_len; q++) { + float fq = v_val[q]; + if (fq >= INF_VAL * 0.5f) continue; + + float q_pos = (float)q * spacing; + float q_pos_sq = q_pos * q_pos; + + while (k >= 0) { + int vk = v_idx[k]; + float vk_pos = (float)vk * spacing; + float fvk = v_val[vk]; + float s = ((fq + q_pos_sq) - (fvk + vk_pos * vk_pos)) / (2.0f * (q_pos - vk_pos)); + + if (s > z[k]) break; + k--; + } + + k++; + v_idx[k] = q; + + if (k == 0) { + z[0] = -INF_VAL; + } else { + int vk_prev = v_idx[k - 1]; + float vk_prev_pos = (float)vk_prev * spacing; + float fvk_prev = v_val[vk_prev]; + z[k] = ((fq + q_pos_sq) - (fvk_prev + vk_prev_pos * vk_prev_pos)) / + (2.0f * (q_pos - vk_prev_pos)); + } + z[k + 1] = INF_VAL; + } + k_shared = k; + } + __syncthreads(); + + int k = k_shared; + + // Parallel fill + for (int q = tid; q < slice_len; q += num_threads) { + int64_t out_idx = base_offset + q; + + if (k < 0) { + output[out_idx] = INF_VAL; + if (compute_indices) { + for (int d = 0; d < spatial_ndim; d++) { + output_idx[d * num_pixels + out_idx] = 0; + } + } + } else { + float q_pos = (float)q * spacing; + + // Binary search + int lo = 0, hi = k; + while (lo < hi) { + int mid = (lo + hi + 1) / 2; + if (z[mid] <= q_pos) lo = mid; + else hi = mid - 1; + } + + int nearest = v_idx[lo]; + float nearest_pos = (float)nearest * spacing; + float diff = q_pos - nearest_pos; + float dist_sq = diff * diff + v_val[nearest]; + + output[out_idx] = is_final ? 
sqrtf(dist_sq) : dist_sq; + + if (compute_indices) { + int64_t src_idx = base_offset + nearest; + for (int d = 0; d < spatial_ndim; d++) { + if (d == current_dim) { + output_idx[d * num_pixels + out_idx] = nearest; + } else { + output_idx[d * num_pixels + out_idx] = input_idx[d * num_pixels + src_idx]; + } + } + } + } + } +} + +// ============================================================================== +// Initialization kernel: set up initial distances and indices +// ============================================================================== +__global__ void init_distance_kernel( + const float* __restrict__ input, + float* __restrict__ distance, + int* __restrict__ indices, + int64_t total_pixels, + int total_ndim, + int spatial_ndim, + const int64_t* __restrict__ shape, + bool compute_indices +) { + int64_t idx = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_pixels) return; + + // Set distance: 0 for background (input == 0), INF for foreground (input != 0) + float val = input[idx]; + distance[idx] = (val != 0.0f) ? 
INF_VAL : 0.0f; + + // Initialize indices to current coordinates + if (compute_indices) { + int64_t temp = idx; + int coords[8]; + + // Compute coordinates from linear index + for (int d = total_ndim - 1; d >= 0; d--) { + int64_t dim_size = shape[d]; + coords[d] = temp % dim_size; + temp /= dim_size; + } + + // Store spatial coordinates + int start_dim = total_ndim - spatial_ndim; + for (int s = 0; s < spatial_ndim; s++) { + indices[s * total_pixels + idx] = coords[start_dim + s]; + } + } +} + +// ============================================================================== +// Host function to run separable EDT +// ============================================================================== +std::tuple run_edt_separable( + torch::Tensor input, + const std::vector& sampling, + bool return_indices +) { + TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32"); + + input = input.contiguous(); + + int total_ndim = input.dim(); + int spatial_ndim = sampling.size(); + int start_dim = total_ndim - spatial_ndim; + + auto shape = input.sizes().vec(); + int64_t total_pixels = input.numel(); + + // Create output tensors + auto distance = torch::empty_like(input); + torch::Tensor indices; + if (return_indices) { + std::vector idx_shape = {spatial_ndim}; + for (auto s : shape) idx_shape.push_back(s); + indices = torch::empty(idx_shape, input.options().dtype(torch::kInt32)); + } + + // Copy shape to device + auto shape_tensor = torch::tensor(std::vector(shape.begin(), shape.end()), + torch::TensorOptions().dtype(torch::kInt64).device(input.device())); + + // Initialize distances and indices + int threads = 256; + int blocks = (total_pixels + threads - 1) / threads; + + init_distance_kernel<<>>( + input.data_ptr(), + distance.data_ptr(), + return_indices ? 
indices.data_ptr() : nullptr, + total_pixels, + total_ndim, + spatial_ndim, + shape_tensor.data_ptr(), + return_indices + ); + + // Global memory buffers (allocated lazily for large dimensions) + torch::Tensor g_v_val, g_v_idx, g_z, g_k; + + // Process each spatial dimension + for (int dim_idx = 0; dim_idx < spatial_ndim; dim_idx++) { + int actual_dim = start_dim + dim_idx; + bool is_final = (dim_idx == spatial_ndim - 1); + float spacing = sampling[dim_idx]; + + // Transpose to make current dimension last + auto dist_transposed = distance.transpose(actual_dim, total_ndim - 1).contiguous(); + auto dist_out = torch::empty_like(dist_transposed); + + torch::Tensor idx_transposed, idx_out; + if (return_indices) { + // Indices have an extra leading dimension + idx_transposed = indices.transpose(actual_dim + 1, total_ndim).contiguous(); + idx_out = torch::empty_like(idx_transposed); + } + + // Get dimensions after transpose + int64_t slice_len = dist_transposed.size(-1); + int64_t num_slices = dist_transposed.numel() / slice_len; + + int kernel_threads = min((int)slice_len, MAX_THREADS); + + // Choose between shared memory and global memory kernel + bool use_shared = (slice_len <= SHARED_MEM_LIMIT); + + if (use_shared) { + // Calculate shared memory size + size_t shared_mem_size = slice_len * sizeof(float) + // v_val + slice_len * sizeof(int) + // v_idx + (slice_len + 1) * sizeof(float); // z + + edt_1d_kernel<<>>( + dist_transposed.data_ptr(), + dist_out.data_ptr(), + return_indices ? idx_transposed.data_ptr() : nullptr, + return_indices ? 
idx_out.data_ptr() : nullptr, + num_slices, + slice_len, + dist_transposed.numel(), + spatial_ndim, + dim_idx, + spacing, + is_final, + return_indices + ); + } else { + // Allocate global memory buffers if needed + int64_t total_elements = dist_transposed.numel(); + if (!g_v_val.defined() || g_v_val.numel() < total_elements) { + g_v_val = torch::empty({total_elements}, dist_transposed.options()); + g_v_idx = torch::empty({total_elements}, dist_transposed.options().dtype(torch::kInt32)); + } + if (!g_z.defined() || g_z.numel() < num_slices * (slice_len + 1)) { + g_z = torch::empty({num_slices * (slice_len + 1)}, dist_transposed.options()); + } + if (!g_k.defined() || g_k.numel() < num_slices) { + g_k = torch::empty({num_slices}, dist_transposed.options().dtype(torch::kInt32)); + } + + edt_1d_kernel_global<<>>( + dist_transposed.data_ptr(), + dist_out.data_ptr(), + return_indices ? idx_transposed.data_ptr() : nullptr, + return_indices ? idx_out.data_ptr() : nullptr, + g_v_val.data_ptr(), + g_v_idx.data_ptr(), + g_z.data_ptr(), + g_k.data_ptr(), + num_slices, + slice_len, + dist_transposed.numel(), + spatial_ndim, + dim_idx, + spacing, + is_final, + return_indices + ); + } + + // Transpose back + distance = dist_out.transpose(actual_dim, total_ndim - 1); + if (return_indices) { + indices = idx_out.transpose(actual_dim + 1, total_ndim); + } + } + + return std::make_tuple(distance.contiguous(), return_indices ? 
indices.contiguous() : torch::Tensor()); +} + +// ============================================================================== +// Python binding entry point +// ============================================================================== +std::tuple distance_transform_edt_cuda( + torch::Tensor input, + std::vector sampling, + bool return_distances, + bool return_indices +) { + TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); + + int total_ndim = input.dim(); + + // Handle empty sampling (default to unit spacing for all spatial dimensions) + if (sampling.empty()) { + // Assume all dimensions are spatial if no sampling provided + // But typically input is (B, C, spatial...) so use total_ndim - 2 + int spatial_ndim = total_ndim >= 3 ? total_ndim - 2 : total_ndim; + sampling.resize(spatial_ndim, 1.0f); + } + + auto [distances, indices_result] = run_edt_separable(input, sampling, return_indices); + + if (!return_indices) { + indices_result = torch::Tensor(); + } + + return std::make_tuple(distances, indices_result); +} From ea02c5b50406688bace689f070ad3a2b4c745b57 Mon Sep 17 00:00:00 2001 From: Yu Han Deng Date: Sat, 24 Jan 2026 23:04:28 +0800 Subject: [PATCH 52/56] speed up 2D --- torchmorph/csrc/distance_transform_edt.cu | 413 ++++++++++++++++++++++ 1 file changed, 413 insertions(+) diff --git a/torchmorph/csrc/distance_transform_edt.cu b/torchmorph/csrc/distance_transform_edt.cu index fee521c..25c7701 100644 --- a/torchmorph/csrc/distance_transform_edt.cu +++ b/torchmorph/csrc/distance_transform_edt.cu @@ -11,6 +11,393 @@ #define INF_VAL 1e20f #define MAX_THREADS 256 #define SHARED_MEM_LIMIT 2048 // Max dimension size for shared memory path (48KB limit) +#define EDT_2D_MAX_DIM 4096 // Max dimension size for 2D optimized kernels + +// ============================================================================== +// 2D Optimized: Initialization kernel +// ============================================================================== +__global__ void 
init_distance_2d_kernel( + const float* __restrict__ input, + float* __restrict__ distance, + int* __restrict__ indices_y, + int* __restrict__ indices_x, + int height, + int width, + int64_t batch_stride, + bool compute_indices +) { + int64_t batch_idx = blockIdx.z; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int x = blockIdx.x * blockDim.x + threadIdx.x; + + if (y >= height || x >= width) return; + + int64_t idx = batch_idx * batch_stride + y * width + x; + + float val = input[idx]; + distance[idx] = (val != 0.0f) ? INF_VAL : 0.0f; + + if (compute_indices) { + indices_y[idx] = y; + indices_x[idx] = x; + } +} + +// ============================================================================== +// 2D Optimized: Row-wise EDT (X direction) - contiguous access +// Each block processes one row +// ============================================================================== +__global__ void edt_2d_rows_kernel( + const float* __restrict__ input, + float* __restrict__ output, + const int* __restrict__ input_idx_y, + const int* __restrict__ input_idx_x, + int* __restrict__ output_idx_y, + int* __restrict__ output_idx_x, + int height, + int width, + int64_t batch_stride, + float spacing, + bool compute_indices +) { + // blockIdx.x = batch_idx * height + row_idx + int64_t linear_idx = blockIdx.x; + int row_idx = linear_idx % height; + int64_t batch_idx = linear_idx / height; + + int64_t row_base = batch_idx * batch_stride + row_idx * width; + + extern __shared__ char shared_mem[]; + float* v_val = (float*)shared_mem; + int* v_idx = (int*)(v_val + width); + float* z = (float*)(v_idx + width); + + int tid = threadIdx.x; + int num_threads = blockDim.x; + + // Load row into shared memory (contiguous access - optimal) + for (int i = tid; i < width; i += num_threads) { + v_val[i] = input[row_base + i]; + } + __syncthreads(); + + // Build lower envelope (thread 0 only) + __shared__ int k_shared; + + if (tid == 0) { + int k = -1; + + for (int q = 0; q < width; q++) { + float 
fq = v_val[q]; + if (fq >= INF_VAL * 0.5f) continue; + + float q_pos = (float)q * spacing; + float q_pos_sq = q_pos * q_pos; + + while (k >= 0) { + int vk = v_idx[k]; + float vk_pos = (float)vk * spacing; + float fvk = v_val[vk]; + float s = ((fq + q_pos_sq) - (fvk + vk_pos * vk_pos)) / (2.0f * (q_pos - vk_pos)); + + if (s > z[k]) break; + k--; + } + + k++; + v_idx[k] = q; + + if (k == 0) { + z[0] = -INF_VAL; + } else { + int vk_prev = v_idx[k - 1]; + float vk_prev_pos = (float)vk_prev * spacing; + float fvk_prev = v_val[vk_prev]; + z[k] = ((fq + q_pos_sq) - (fvk_prev + vk_prev_pos * vk_prev_pos)) / + (2.0f * (q_pos - vk_prev_pos)); + } + z[k + 1] = INF_VAL; + } + k_shared = k; + } + __syncthreads(); + + int k = k_shared; + + // Parallel fill with binary search + for (int q = tid; q < width; q += num_threads) { + int64_t out_idx = row_base + q; + + if (k < 0) { + output[out_idx] = INF_VAL; + if (compute_indices) { + output_idx_y[out_idx] = row_idx; + output_idx_x[out_idx] = 0; + } + } else { + float q_pos = (float)q * spacing; + + // Binary search + int lo = 0, hi = k; + while (lo < hi) { + int mid = (lo + hi + 1) / 2; + if (z[mid] <= q_pos) lo = mid; + else hi = mid - 1; + } + + int nearest = v_idx[lo]; + float nearest_pos = (float)nearest * spacing; + float diff = q_pos - nearest_pos; + float dist_sq = diff * diff + v_val[nearest]; + + output[out_idx] = dist_sq; // Keep squared for next pass + + if (compute_indices) { + output_idx_y[out_idx] = row_idx; // Y unchanged in X pass + output_idx_x[out_idx] = nearest; + } + } + } +} + +// ============================================================================== +// 2D Optimized: Column-wise EDT (Y direction) - strided access with shared memory +// Each block processes one column +// ============================================================================== +__global__ void edt_2d_cols_kernel( + const float* __restrict__ input, + float* __restrict__ output, + const int* __restrict__ input_idx_y, + const int* 
__restrict__ input_idx_x, + int* __restrict__ output_idx_y, + int* __restrict__ output_idx_x, + int height, + int width, + int64_t batch_stride, + float spacing, + bool is_final, + bool compute_indices +) { + // blockIdx.x = batch_idx * width + col_idx + int64_t linear_idx = blockIdx.x; + int col_idx = linear_idx % width; + int64_t batch_idx = linear_idx / width; + + int64_t col_base = batch_idx * batch_stride + col_idx; + int stride = width; // Stride to next row + + extern __shared__ char shared_mem[]; + float* v_val = (float*)shared_mem; + int* v_idx = (int*)(v_val + height); + float* z = (float*)(v_idx + height); + int* src_x = (int*)(z + height + 1); // Store source X indices for index propagation + + int tid = threadIdx.x; + int num_threads = blockDim.x; + + // Load column into shared memory (strided access - but only once) + for (int i = tid; i < height; i += num_threads) { + v_val[i] = input[col_base + i * stride]; + if (compute_indices) { + src_x[i] = input_idx_x[col_base + i * stride]; + } + } + __syncthreads(); + + // Build lower envelope (thread 0 only) + __shared__ int k_shared; + + if (tid == 0) { + int k = -1; + + for (int q = 0; q < height; q++) { + float fq = v_val[q]; + if (fq >= INF_VAL * 0.5f) continue; + + float q_pos = (float)q * spacing; + float q_pos_sq = q_pos * q_pos; + + while (k >= 0) { + int vk = v_idx[k]; + float vk_pos = (float)vk * spacing; + float fvk = v_val[vk]; + float s = ((fq + q_pos_sq) - (fvk + vk_pos * vk_pos)) / (2.0f * (q_pos - vk_pos)); + + if (s > z[k]) break; + k--; + } + + k++; + v_idx[k] = q; + + if (k == 0) { + z[0] = -INF_VAL; + } else { + int vk_prev = v_idx[k - 1]; + float vk_prev_pos = (float)vk_prev * spacing; + float fvk_prev = v_val[vk_prev]; + z[k] = ((fq + q_pos_sq) - (fvk_prev + vk_prev_pos * vk_prev_pos)) / + (2.0f * (q_pos - vk_prev_pos)); + } + z[k + 1] = INF_VAL; + } + k_shared = k; + } + __syncthreads(); + + int k = k_shared; + + // Parallel fill with binary search + for (int q = tid; q < height; q += 
num_threads) { + int64_t out_idx = col_base + q * stride; + + if (k < 0) { + output[out_idx] = INF_VAL; + if (compute_indices) { + output_idx_y[out_idx] = 0; + output_idx_x[out_idx] = col_idx; + } + } else { + float q_pos = (float)q * spacing; + + // Binary search + int lo = 0, hi = k; + while (lo < hi) { + int mid = (lo + hi + 1) / 2; + if (z[mid] <= q_pos) lo = mid; + else hi = mid - 1; + } + + int nearest = v_idx[lo]; + float nearest_pos = (float)nearest * spacing; + float diff = q_pos - nearest_pos; + float dist_sq = diff * diff + v_val[nearest]; + + output[out_idx] = is_final ? sqrtf(dist_sq) : dist_sq; + + if (compute_indices) { + output_idx_y[out_idx] = nearest; + output_idx_x[out_idx] = src_x[nearest]; // Propagate X from source + } + } + } +} + +// ============================================================================== +// 2D Optimized: Host function +// ============================================================================== +std::tuple run_edt_2d_optimized( + torch::Tensor input, + float spacing_y, + float spacing_x, + bool return_indices +) { + TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); + TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32"); + + input = input.contiguous(); + + int total_ndim = input.dim(); + TORCH_CHECK(total_ndim >= 2, "Input must have at least 2 dimensions"); + + auto shape = input.sizes().vec(); + int height = shape[total_ndim - 2]; + int width = shape[total_ndim - 1]; + int64_t batch_stride = (int64_t)height * width; + int64_t batch_size = input.numel() / batch_stride; + int64_t total_pixels = input.numel(); + + // Check dimension limits for 2D optimized path + TORCH_CHECK(height <= EDT_2D_MAX_DIM && width <= EDT_2D_MAX_DIM, + "Dimensions too large for 2D optimized path"); + + // Create output tensors + auto distance = torch::empty_like(input); + auto temp = torch::empty_like(input); + + torch::Tensor indices_y, indices_x, temp_idx_y, temp_idx_x; + if (return_indices) { + 
indices_y = torch::empty_like(input, input.options().dtype(torch::kInt32)); + indices_x = torch::empty_like(input, input.options().dtype(torch::kInt32)); + temp_idx_y = torch::empty_like(indices_y); + temp_idx_x = torch::empty_like(indices_x); + } + + // Step 1: Initialize + { + dim3 block(16, 16); + dim3 grid((width + 15) / 16, (height + 15) / 16, batch_size); + + init_distance_2d_kernel<<>>( + input.data_ptr(), + distance.data_ptr(), + return_indices ? indices_y.data_ptr() : nullptr, + return_indices ? indices_x.data_ptr() : nullptr, + height, width, batch_stride, + return_indices + ); + } + + // Step 2: Row-wise EDT (X direction) + { + int64_t num_rows = batch_size * height; + int threads = min(width, MAX_THREADS); + size_t shared_mem_size = width * sizeof(float) + // v_val + width * sizeof(int) + // v_idx + (width + 1) * sizeof(float); // z + + edt_2d_rows_kernel<<>>( + distance.data_ptr(), + temp.data_ptr(), + return_indices ? indices_y.data_ptr() : nullptr, + return_indices ? indices_x.data_ptr() : nullptr, + return_indices ? temp_idx_y.data_ptr() : nullptr, + return_indices ? temp_idx_x.data_ptr() : nullptr, + height, width, batch_stride, + spacing_x, + return_indices + ); + } + + // Step 3: Column-wise EDT (Y direction) + { + int64_t num_cols = batch_size * width; + int threads = min(height, MAX_THREADS); + size_t shared_mem_size = height * sizeof(float) + // v_val + height * sizeof(int) + // v_idx + (height + 1) * sizeof(float); // z + if (return_indices) { + shared_mem_size += height * sizeof(int); // src_x + } + + edt_2d_cols_kernel<<>>( + temp.data_ptr(), + distance.data_ptr(), + return_indices ? temp_idx_y.data_ptr() : nullptr, + return_indices ? temp_idx_x.data_ptr() : nullptr, + return_indices ? indices_y.data_ptr() : nullptr, + return_indices ? indices_x.data_ptr() : nullptr, + height, width, batch_stride, + spacing_y, + true, // is_final + return_indices + ); + } + + // Combine indices into single tensor with shape [2, ...] 
+ torch::Tensor indices; + if (return_indices) { + std::vector idx_shape = {2}; + for (auto s : shape) idx_shape.push_back(s); + indices = torch::empty(idx_shape, input.options().dtype(torch::kInt32)); + + // Copy Y and X indices + indices.select(0, 0).copy_(indices_y); + indices.select(0, 1).copy_(indices_x); + } + + return std::make_tuple(distance, indices); +} // ============================================================================== // 1D EDT kernel using GLOBAL memory (for large dimensions) @@ -460,6 +847,32 @@ std::tuple distance_transform_edt_cuda( sampling.resize(spatial_ndim, 1.0f); } + int spatial_ndim = sampling.size(); + + // Use 2D optimized path when applicable + if (spatial_ndim == 2) { + auto shape = input.sizes().vec(); + int height = shape[total_ndim - 2]; + int width = shape[total_ndim - 1]; + + // Check if dimensions are within limits for 2D optimized path + if (height <= EDT_2D_MAX_DIM && width <= EDT_2D_MAX_DIM) { + float spacing_y = sampling[0]; + float spacing_x = sampling[1]; + + auto [distances, indices_result] = run_edt_2d_optimized( + input, spacing_y, spacing_x, return_indices + ); + + if (!return_indices) { + indices_result = torch::Tensor(); + } + + return std::make_tuple(distances, indices_result); + } + } + + // Fall back to general N-D implementation auto [distances, indices_result] = run_edt_separable(input, sampling, return_indices); if (!return_indices) { From 4fb5ee53bc978ca8e439decb00b2141773e10997 Mon Sep 17 00:00:00 2001 From: Yu Han Deng Date: Wed, 28 Jan 2026 17:53:39 +0800 Subject: [PATCH 53/56] add cdt function --- torchmorph/csrc/distance_transform_cdt.cu | 446 ++++++++++++++++++++++ 1 file changed, 446 insertions(+) create mode 100644 torchmorph/csrc/distance_transform_cdt.cu diff --git a/torchmorph/csrc/distance_transform_cdt.cu b/torchmorph/csrc/distance_transform_cdt.cu new file mode 100644 index 0000000..6de71dd --- /dev/null +++ b/torchmorph/csrc/distance_transform_cdt.cu @@ -0,0 +1,446 @@ +#include 
+#include +#include +#include +#include + +#define CDT_BLOCK_SIZE 256 +#define CDT_INF_VAL 1000000000 +#define MAX_NDIM 16 + +// ============================================================================ +// High-performance N-dimensional CDT using dimension-separable parallel scans +// For each dimension, we do forward and backward sweeps that can be parallelized +// across all other dimensions and batch elements. +// ============================================================================ + +// Initialize distance and indices +__global__ void cdt_init_kernel( + const float* __restrict__ input, + int32_t* __restrict__ dist, + int32_t* __restrict__ indices, // [spatial_ndim, total_elements] + int64_t total_elements, + int spatial_ndim, + int64_t spatial_elements, + const int64_t* __restrict__ spatial_strides, + bool compute_indices +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + if (input[tid] == 0.0f) { + dist[tid] = 0; + if (compute_indices) { + int64_t spatial_idx = tid % spatial_elements; + int64_t rem = spatial_idx; + for (int d = 0; d < spatial_ndim; d++) { + int64_t coord = rem / spatial_strides[d]; + rem = rem % spatial_strides[d]; + indices[d * total_elements + tid] = (int32_t)coord; + } + } + } else { + dist[tid] = CDT_INF_VAL; + if (compute_indices) { + for (int d = 0; d < spatial_ndim; d++) { + indices[d * total_elements + tid] = -1; + } + } + } +} + +// ============================================================================ +// Dimension-wise sweep kernels for chessboard metric +// Each thread handles one "line" along the scan dimension +// ============================================================================ + +// Forward sweep along dimension d (from 0 to size-1) +// For chessboard: check neighbor at offset -1 in dimension d, and diagonal neighbors +__global__ void cdt_sweep_forward_chessboard_kernel( + int32_t* __restrict__ dist, + int32_t* __restrict__ indices, + int64_t 
total_elements, + int64_t num_lines, // number of parallel lines + int64_t line_stride, // stride between elements in the same line + int64_t line_length, // number of elements in one line + int64_t batch_stride, // stride between batches + int64_t spatial_elements, + int spatial_ndim, + int scan_dim, + const int64_t* __restrict__ spatial_strides, + const int64_t* __restrict__ spatial_shape, + bool compute_indices +) { + int64_t line_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (line_idx >= num_lines) return; + + // Compute starting position for this line + int64_t batch_idx = line_idx / (spatial_elements / line_length); + int64_t within_batch = line_idx % (spatial_elements / line_length); + + // Convert within_batch to actual spatial offset (excluding scan dimension) + int64_t spatial_offset = 0; + int64_t rem = within_batch; + for (int d = 0; d < spatial_ndim; d++) { + if (d == scan_dim) continue; + int64_t dim_size = spatial_shape[d]; + int64_t coord = rem % dim_size; + rem /= dim_size; + spatial_offset += coord * spatial_strides[d]; + } + + int64_t base = batch_idx * spatial_elements + spatial_offset; + + // Forward sweep: i = 0 to line_length-1 + for (int64_t i = 1; i < line_length; i++) { + int64_t curr_idx = base + i * line_stride; + int32_t curr_dist = dist[curr_idx]; + + if (curr_dist == 0) continue; + + // Check previous element in this dimension + int64_t prev_idx = base + (i - 1) * line_stride; + int32_t prev_dist = dist[prev_idx]; + + if (prev_dist < CDT_INF_VAL) { + int32_t new_dist = prev_dist + 1; + if (new_dist < curr_dist) { + dist[curr_idx] = new_dist; + if (compute_indices) { + for (int d = 0; d < spatial_ndim; d++) { + indices[d * total_elements + curr_idx] = indices[d * total_elements + prev_idx]; + } + } + } + } + } +} + +// Backward sweep along dimension d (from size-1 to 0) +__global__ void cdt_sweep_backward_chessboard_kernel( + int32_t* __restrict__ dist, + int32_t* __restrict__ indices, + int64_t total_elements, + int64_t num_lines, 
+ int64_t line_stride, + int64_t line_length, + int64_t batch_stride, + int64_t spatial_elements, + int spatial_ndim, + int scan_dim, + const int64_t* __restrict__ spatial_strides, + const int64_t* __restrict__ spatial_shape, + bool compute_indices +) { + int64_t line_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (line_idx >= num_lines) return; + + int64_t batch_idx = line_idx / (spatial_elements / line_length); + int64_t within_batch = line_idx % (spatial_elements / line_length); + + int64_t spatial_offset = 0; + int64_t rem = within_batch; + for (int d = 0; d < spatial_ndim; d++) { + if (d == scan_dim) continue; + int64_t dim_size = spatial_shape[d]; + int64_t coord = rem % dim_size; + rem /= dim_size; + spatial_offset += coord * spatial_strides[d]; + } + + int64_t base = batch_idx * spatial_elements + spatial_offset; + + // Backward sweep: i = line_length-2 down to 0 + for (int64_t i = line_length - 2; i >= 0; i--) { + int64_t curr_idx = base + i * line_stride; + int32_t curr_dist = dist[curr_idx]; + + if (curr_dist == 0) continue; + + int64_t next_idx = base + (i + 1) * line_stride; + int32_t next_dist = dist[next_idx]; + + if (next_dist < CDT_INF_VAL) { + int32_t new_dist = next_dist + 1; + if (new_dist < curr_dist) { + dist[curr_idx] = new_dist; + if (compute_indices) { + for (int d = 0; d < spatial_ndim; d++) { + indices[d * total_elements + curr_idx] = indices[d * total_elements + next_idx]; + } + } + } + } + } +} + +// ============================================================================ +// Diagonal sweep kernels for chessboard metric (handles corner neighbors) +// ============================================================================ + +// Check all neighbors at distance 1 in chessboard metric +__global__ void cdt_diagonal_pass_kernel( + int32_t* __restrict__ dist, + int32_t* __restrict__ indices, + int64_t total_elements, + int64_t batch_size, + int64_t spatial_elements, + int spatial_ndim, + const int64_t* __restrict__ spatial_strides, 
+ const int64_t* __restrict__ spatial_shape, + const int32_t* __restrict__ offsets, + int num_offsets, + bool compute_indices, + bool forward // true for forward pass, false for backward +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + int32_t curr_dist = dist[tid]; + if (curr_dist == 0) return; + + int64_t batch_idx = tid / spatial_elements; + int64_t spatial_idx = tid % spatial_elements; + int64_t base = batch_idx * spatial_elements; + + // Compute current coordinates + int32_t coords[MAX_NDIM]; + int64_t rem = spatial_idx; + for (int d = 0; d < spatial_ndim; d++) { + coords[d] = (int32_t)(rem / spatial_strides[d]); + rem = rem % spatial_strides[d]; + } + + int32_t min_dist = curr_dist; + int best_neighbor = -1; + + for (int n = 0; n < num_offsets; n++) { + int64_t neighbor_spatial = spatial_idx + offsets[n]; + + // Check bounds and no wrap-around + if (neighbor_spatial < 0 || neighbor_spatial >= spatial_elements) continue; + + // Verify no wrap-around by checking coordinate differences + int64_t n_rem = neighbor_spatial; + bool valid = true; + for (int d = 0; d < spatial_ndim; d++) { + int32_t n_coord = (int32_t)(n_rem / spatial_strides[d]); + n_rem = n_rem % spatial_strides[d]; + int32_t diff = coords[d] - n_coord; + if (diff < -1 || diff > 1) { + valid = false; + break; + } + } + if (!valid) continue; + + int64_t neighbor_idx = base + neighbor_spatial; + int32_t neighbor_dist = dist[neighbor_idx]; + + if (neighbor_dist < CDT_INF_VAL) { + int32_t new_dist = neighbor_dist + 1; + if (new_dist < min_dist) { + min_dist = new_dist; + best_neighbor = n; + } + } + } + + if (min_dist < curr_dist) { + dist[tid] = min_dist; + if (compute_indices && best_neighbor >= 0) { + int64_t src_idx = base + spatial_idx + offsets[best_neighbor]; + for (int d = 0; d < spatial_ndim; d++) { + indices[d * total_elements + tid] = indices[d * total_elements + src_idx]; + } + } + } +} + +// 
============================================================================
+// Taxicab metric uses simpler dimension-separable sweeps (no diagonals)
+// ============================================================================
+
+std::tuple<torch::Tensor, torch::Tensor> distance_transform_cdt_cuda(
+    torch::Tensor input,
+    const std::string& metric,
+    bool return_distances,
+    bool return_indices
+) {
+    TORCH_CHECK(input.is_cuda(), "Input must be CUDA tensor");
+    TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Input must be float32");
+    TORCH_CHECK(metric == "chessboard" || metric == "taxicab",
+                "metric must be 'chessboard' or 'taxicab'");
+    TORCH_CHECK(return_distances || return_indices,
+                "At least one of return_distances or return_indices must be True");
+
+    input = input.contiguous();
+
+    bool is_taxicab = (metric == "taxicab");
+    int total_ndim = input.dim();
+
+    TORCH_CHECK(total_ndim >= 3, "Input must be (B, C, Spatial...) format with at least 3 dimensions");
+
+    auto shape_vec = input.sizes().vec();
+    int64_t batch_size = shape_vec[0] * shape_vec[1];
+    int spatial_ndim = total_ndim - 2;
+
+    TORCH_CHECK(spatial_ndim >= 1 && spatial_ndim <= MAX_NDIM,
+                "CDT supports 1D-" + std::to_string(MAX_NDIM) + "D spatial dimensions");
+
+    std::vector<int64_t> spatial_shape(spatial_ndim);
+    std::vector<int64_t> spatial_strides(spatial_ndim);
+
+    int64_t spatial_elements = 1;
+    for (int d = 0; d < spatial_ndim; d++) {
+        spatial_shape[d] = shape_vec[d + 2];
+        spatial_elements *= spatial_shape[d];
+    }
+
+    spatial_strides[spatial_ndim - 1] = 1;
+    for (int d = spatial_ndim - 2; d >= 0; d--) {
+        spatial_strides[d] = spatial_strides[d + 1] * spatial_shape[d + 1];
+    }
+
+    int64_t total_elements = input.numel();
+
+    // Allocate output tensors
+    auto dist = torch::empty({total_elements}, input.options().dtype(torch::kInt32));
+    torch::Tensor indices;
+    if (return_indices) {
+        indices = torch::empty({spatial_ndim, total_elements}, input.options().dtype(torch::kInt32));
+    }
+
+    // Copy shape/strides to device
+    auto 
spatial_shape_tensor = torch::tensor(spatial_shape, torch::TensorOptions().dtype(torch::kInt64).device(input.device()));
+    auto spatial_strides_tensor = torch::tensor(spatial_strides, torch::TensorOptions().dtype(torch::kInt64).device(input.device()));
+
+    // Initialize
+    int block = CDT_BLOCK_SIZE;
+    int grid = (total_elements + block - 1) / block;
+
+    cdt_init_kernel<<<grid, block>>>(
+        input.data_ptr<float>(),
+        dist.data_ptr<int32_t>(),
+        return_indices ? indices.data_ptr<int32_t>() : nullptr,
+        total_elements,
+        spatial_ndim,
+        spatial_elements,
+        spatial_strides_tensor.data_ptr<int64_t>(),
+        return_indices
+    );
+
+    // For each dimension, do forward and backward sweeps
+    for (int d = 0; d < spatial_ndim; d++) {
+        int64_t line_length = spatial_shape[d];
+        int64_t line_stride = spatial_strides[d];
+        int64_t num_lines = batch_size * (spatial_elements / line_length);
+
+        int sweep_block = CDT_BLOCK_SIZE;
+        int sweep_grid = (num_lines + sweep_block - 1) / sweep_block;
+
+        // Forward sweep
+        cdt_sweep_forward_chessboard_kernel<<<sweep_grid, sweep_block>>>(
+            dist.data_ptr<int32_t>(),
+            return_indices ? indices.data_ptr<int32_t>() : nullptr,
+            total_elements,
+            num_lines,
+            line_stride,
+            line_length,
+            spatial_elements,
+            spatial_elements,
+            spatial_ndim,
+            d,
+            spatial_strides_tensor.data_ptr<int64_t>(),
+            spatial_shape_tensor.data_ptr<int64_t>(),
+            return_indices
+        );
+
+        // Backward sweep
+        cdt_sweep_backward_chessboard_kernel<<<sweep_grid, sweep_block>>>(
+            dist.data_ptr<int32_t>(),
+            return_indices ? 
indices.data_ptr<int32_t>() : nullptr,
+            total_elements,
+            num_lines,
+            line_stride,
+            line_length,
+            spatial_elements,
+            spatial_elements,
+            spatial_ndim,
+            d,
+            spatial_strides_tensor.data_ptr<int64_t>(),
+            spatial_shape_tensor.data_ptr<int64_t>(),
+            return_indices
+        );
+    }
+
+    // For chessboard metric, we need additional diagonal passes
+    if (!is_taxicab && spatial_ndim >= 2) {
+        // Generate diagonal offsets
+        std::vector<int32_t> diagonal_offsets;
+
+        // All neighbors in 3^ndim hypercube except axis-aligned ones
+        int total_combos = 1;
+        for (int d = 0; d < spatial_ndim; d++) total_combos *= 3;
+
+        for (int i = 0; i < total_combos; i++) {
+            int temp = i;
+            int64_t offset = 0;
+            int non_zero_count = 0;
+
+            for (int d = 0; d < spatial_ndim; d++) {
+                int dir = (temp % 3) - 1;
+                temp /= 3;
+                if (dir != 0) non_zero_count++;
+                offset += dir * spatial_strides[d];
+            }
+
+            // Only include diagonal neighbors (more than one non-zero direction)
+            if (non_zero_count >= 2) {
+                diagonal_offsets.push_back((int32_t)offset);
+            }
+        }
+
+        if (!diagonal_offsets.empty()) {
+            auto offsets_tensor = torch::tensor(diagonal_offsets, torch::TensorOptions().dtype(torch::kInt32).device(input.device()));
+
+            // Multiple passes to propagate diagonal distances
+            // Need more passes for higher dimensions to ensure full propagation
+            int num_passes = spatial_ndim * 2; // Scale with dimensions
+            for (int pass = 0; pass < num_passes; pass++) {
+                cdt_diagonal_pass_kernel<<<grid, block>>>(
+                    dist.data_ptr<int32_t>(),
+                    return_indices ? 
indices.data_ptr<int32_t>() : nullptr,
+                    total_elements,
+                    batch_size,
+                    spatial_elements,
+                    spatial_ndim,
+                    spatial_strides_tensor.data_ptr<int64_t>(),
+                    spatial_shape_tensor.data_ptr<int64_t>(),
+                    offsets_tensor.data_ptr<int32_t>(),
+                    diagonal_offsets.size(),
+                    return_indices,
+                    pass % 2 == 0
+                );
+            }
+        }
+    }
+
+    // Prepare output
+    torch::Tensor result_dist;
+    torch::Tensor result_indices;
+
+    if (return_distances) {
+        result_dist = dist.to(torch::kFloat32).view(input.sizes());
+    }
+
+    if (return_indices) {
+        std::vector<int64_t> idx_shape = {spatial_ndim};
+        for (int d = 0; d < total_ndim; d++) {
+            idx_shape.push_back(shape_vec[d]);
+        }
+        result_indices = indices.view(idx_shape);
+    }
+
+    return std::make_tuple(result_dist, result_indices);
+}

From 1d3546b9cf81b5e73387f64be42a6b11b5d02f17 Mon Sep 17 00:00:00 2001
From: Yu Han Deng
Date: Fri, 30 Jan 2026 15:47:07 +0800
Subject: [PATCH 54/56] Feat: Implement Chamfer Distance (CDT) & Resolve merge conflicts

---
 benchmark/distance_transform_cdt.py | 89 ++
 ...transform.py => distance_transform_edt.py} | 8 +-
 test/test_distance_transform.py | 182 ----
 test/test_distance_transform_cdt.py | 500 +++++++++
 test/test_distance_transform_edt.py | 396 ++++++++
 torchmorph/__init__.py | 18 +-
 torchmorph/csrc/distance_transform_edt.cu | 21 +-
 torchmorph/csrc/distance_transform_kernel.cu | 945 ------------------
 torchmorph/csrc/torchmorph.cpp | 34 +-
 torchmorph/distance_transform.py | 266 ++++-
 10 files changed, 1308 insertions(+), 1151 deletions(-)
 create mode 100644 benchmark/distance_transform_cdt.py
 rename benchmark/{distance_transform.py => distance_transform_edt.py} (90%)
 delete mode 100644 test/test_distance_transform.py
 create mode 100644 test/test_distance_transform_cdt.py
 create mode 100644 test/test_distance_transform_edt.py
 delete mode 100644 torchmorph/csrc/distance_transform_kernel.cu

diff --git a/benchmark/distance_transform_cdt.py b/benchmark/distance_transform_cdt.py
new file mode 100644
index 0000000..4e56135
--- /dev/null
+++ 
b/benchmark/distance_transform_cdt.py @@ -0,0 +1,89 @@ +import scipy.ndimage as ndi # noqa: F401 +import torch +import torch.utils.benchmark as benchmark +from prettytable import PrettyTable + +import torchmorph as tm # noqa: F401 + +sizes = [64, 128, 256, 512, 1024] +batches = [1, 4, 8, 16] +dtype = torch.float32 +device = "cuda" +MIN_RUN = 1.0 # seconds per measurement + +torch.set_num_threads(torch.get_num_threads()) + +for metric in ["chessboard", "taxicab"]: + print(f"\n{'='*60}") + print(f" CDT Benchmark - Metric: {metric}") + print(f"{'='*60}") + + for B in batches: + table = PrettyTable() + table.field_names = [ + "Size", + "SciPy (ms/img)", + "Torch 1× (ms/img)", + "Torch batch (ms/img)", + "Speedup 1×", + "Speedup batch", + ] + for c in table.field_names: + table.align[c] = "r" + + for s in sizes: + # Inputs: (B, C, H, W) format - C=1 for single channel + x = (torch.randn(B, 1, s, s, device=device) > 0).to(dtype) + # For scipy, we need (H, W) arrays + x_np_list = [x[i, 0].detach().cpu().numpy() for i in range(B)] + # For torch single image processing: each is (1, 1, H, W) + x_imgs = [x[i : i + 1] for i in range(B)] + + # SciPy (CPU, one-by-one) + stmt_scipy = ( + f"out = [ndi.distance_transform_cdt(arr, metric='{metric}') for arr in x_np_list]" + ) + t_scipy = benchmark.Timer( + stmt=stmt_scipy, + setup="from __main__ import x_np_list, ndi", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + scipy_per_img_ms = (t_scipy.median * 1e3) / B + + # Torch (CUDA, one-by-one) + stmt_torch1 = f""" +for xi in x_imgs: + tm.distance_transform_cdt(xi, metric='{metric}') +""" + t_torch1 = benchmark.Timer( + stmt=stmt_torch1, + setup="from __main__ import x_imgs, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + torch1_per_img_ms = (t_torch1.median * 1e3) / B + + # Torch (CUDA, batched) + t_batch = benchmark.Timer( + stmt=f"tm.distance_transform_cdt(x, metric='{metric}')", + setup="from __main__ 
import x, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + torchB_per_img_ms = (t_batch.median * 1e3) / B + + # Speedups + speed1 = scipy_per_img_ms / torch1_per_img_ms + speedB = scipy_per_img_ms / torchB_per_img_ms + + table.add_row( + [ + s, + f"{scipy_per_img_ms:.3f}", + f"{torch1_per_img_ms:.3f}", + f"{torchB_per_img_ms:.3f}", + f"{speed1:.1f}×", + f"{speedB:.1f}×", + ] + ) + + print(f"\n=== Metric: {metric}, Batch Size: {B} ===") + print(table) diff --git a/benchmark/distance_transform.py b/benchmark/distance_transform_edt.py similarity index 90% rename from benchmark/distance_transform.py rename to benchmark/distance_transform_edt.py index 9bfe27e..3bb388d 100644 --- a/benchmark/distance_transform.py +++ b/benchmark/distance_transform_edt.py @@ -27,9 +27,11 @@ table.align[c] = "r" for s in sizes: - # Inputs + # Inputs: (B, C, H, W) format - C=1 for single channel x = (torch.randn(B, 1, s, s, device=device) > 0).to(dtype) + # For scipy, we need (H, W) arrays x_np_list = [x[i, 0].detach().cpu().numpy() for i in range(B)] + # For torch single image processing: each is (1, 1, H, W) x_imgs = [x[i : i + 1] for i in range(B)] # SciPy (CPU, one-by-one) @@ -44,7 +46,7 @@ # Torch (CUDA, one-by-one) stmt_torch1 = """ for xi in x_imgs: - tm.distance_transform(xi) + tm.distance_transform_edt(xi) """ t_torch1 = benchmark.Timer( stmt=stmt_torch1, @@ -55,7 +57,7 @@ # Torch (CUDA, batched) t_batch = benchmark.Timer( - stmt="tm.distance_transform(x)", + stmt="tm.distance_transform_edt(x)", setup="from __main__ import x, tm", num_threads=torch.get_num_threads(), ).blocked_autorange(min_run_time=MIN_RUN) diff --git a/test/test_distance_transform.py b/test/test_distance_transform.py deleted file mode 100644 index 5855bf3..0000000 --- a/test/test_distance_transform.py +++ /dev/null @@ -1,182 +0,0 @@ -import numpy as np # noqa: F401 -import pytest -import torch -from scipy.ndimage import distance_transform_edt as scipy_edt # noqa: F401 - 
-import torchmorph as tm # noqa: F401 - - -# ====================================================================== -# Helper functions -# ====================================================================== -def batch_scipy_edt_with_indices( - batch_numpy: np.ndarray, -) -> tuple[np.ndarray, np.ndarray]: - """Compute SciPy EDT and indices for a batch of arrays.""" - dist_results: list[np.ndarray] = [] - indices_results: list[np.ndarray] = [] - - # Ensure batch_numpy has at least shape (Batch, ...) - # If the input is (H, W), it is already converted to (1, H, W) outside. - if batch_numpy.ndim == 1: - batch_numpy = batch_numpy[np.newaxis, ...] - - for sample in batch_numpy: - dist, indices = scipy_edt( - sample, - return_indices=True, - return_distances=True, - ) - dist_results.append(dist) - indices_results.append(indices) - - output_dist = np.stack(dist_results, axis=0) - output_indices = np.stack(indices_results, axis=0) - output_indices = np.moveaxis(output_indices, 1, -1) - - return output_dist, output_indices - - -# ====================================================================== -# Test data -# ====================================================================== -case_batch_1d = np.array( - [[1, 1, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0]], - dtype=np.float32, -) - -case_batch_2d = np.array( - [ - [[0.0, 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]], - [[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], - ], - dtype=np.float32, -) - -# This is a single 2D image with shape (4, 4) -case_single_2d = np.array( - [ - [0, 1, 0, 1], - [1, 0, 1, 0], - [0, 1, 0, 1], - [1, 0, 1, 0], - ], - dtype=np.float32, -) -case_explicit_batch_one = case_single_2d[np.newaxis, ...] 
- -_case_3d_s1 = np.ones((4, 5, 6), dtype=np.float32) -_case_3d_s1[1, 1, 1] = 0.0 -_case_3d_s1[2, 3, 4] = 0.0 - -_case_3d_s2 = np.ones((4, 5, 6), dtype=np.float32) -_case_3d_s2[0, 0, 0] = 0.0 - -case_batch_3d = np.stack([_case_3d_s1, _case_3d_s2], axis=0) - -case_dim_one = np.ones((2, 5, 1), dtype=np.float32) -case_dim_one[0, 2, 0] = 0.0 -case_dim_one[1, 4, 0] = 0.0 - -# 4D spatial case -_case_4d_s1 = np.ones((3, 3, 3, 3), dtype=np.float32) -_case_4d_s1[0, 0, 0, 0] = 0.0 - -_case_4d_s2 = np.ones((3, 3, 3, 3), dtype=np.float32) -_case_4d_s2[1, 1, 1, 1] = 0.0 - -case_batch_4d_spatial = np.stack([_case_4d_s1, _case_4d_s2], axis=0) - -# 5D spatial case -case_batch_5d_spatial = np.ones((1, 2, 2, 2, 2, 2), dtype=np.float32) -case_batch_5d_spatial[0, 0, 0, 0, 0, 0] = 0.0 -case_batch_5d_spatial[0, 1, 1, 1, 1, 1] = 0.0 - - -# ====================================================================== -# Test logic -# ====================================================================== -@pytest.mark.parametrize( - "input_numpy, has_batch_dim", - [ - pytest.param(case_batch_1d, True, id="1D_Batch"), - pytest.param(case_batch_2d, True, id="2D_Batch"), - pytest.param(case_single_2d, False, id="2D_Single_NoBatch"), - pytest.param( - case_explicit_batch_one, - True, - id="2D_Single_ExplicitBatch", - ), - pytest.param(case_batch_3d, True, id="3D_Batch"), - pytest.param(case_dim_one, True, id="2D_UnitDim_Batch"), - pytest.param(case_batch_4d_spatial, True, id="4D_Spatial_Batch"), - pytest.param(case_batch_5d_spatial, True, id="5D_Spatial_Batch"), - ], -) -def test_distance_transform_and_indices( - input_numpy: np.ndarray, - has_batch_dim: bool, - request: pytest.FixtureRequest, -) -> None: - if not torch.cuda.is_available(): - pytest.skip("CUDA not available") - - # 1. Prepare NumPy data - x_numpy_contiguous = np.ascontiguousarray(input_numpy) - - # 2. Prepare SciPy input. 
- # If this is a single sample (has_batch_dim=False), manually add a - # batch dimension so SciPy treats it as one image instead of N 1D - # signals. - if not has_batch_dim: - scipy_input = x_numpy_contiguous[np.newaxis, ...] - else: - scipy_input = x_numpy_contiguous - - # 3. Prepare CUDA input. - # If has_batch_dim=False, the input is (H, W) and we want 2D EDT. - # The C++ API assumes the first dimension is batch, so we must - # unsqueeze(0) to get shape (1, H, W). Otherwise, it will be - # interpreted as (Batch=H, Length=W) and run 1D EDT. - x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() - if not has_batch_dim: - x_cuda = x_cuda.unsqueeze(0) - - print(f"\n\n--- Running test: {request.node.callspec.id} ---") - print(f"CUDA input shape: {x_cuda.shape}") - - # 4. Run CUDA EDT - dist_cuda, idx_cuda = tm.distance_transform(x_cuda.clone()) - - # 5. Run SciPy (ground truth) - dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(scipy_input) - dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() - - # 6. Validate distances - print( - f"CUDA distance shape: {dist_cuda.shape}, " f"reference shape: {dist_ref.shape}", - ) - assert ( - dist_cuda.shape == dist_ref.shape - ), f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" - torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) - print(">> Distance validation passed.") - - # 7. 
Validate indices - # idx_cuda: (B, H, W, D) - spatial_shape = x_cuda.shape[1:] - coords = [torch.arange(s, device="cuda") for s in spatial_shape] - grid = torch.stack(torch.meshgrid(*coords, indexing="ij"), dim=-1) - grid = grid.unsqueeze(0) # (1, H, W, D) - - diff = grid.float() - idx_cuda.float() - dist_sq_calculated = torch.sum(diff * diff, dim=-1) - dist_sq_output = dist_cuda * dist_cuda - - torch.testing.assert_close( - dist_sq_calculated, - dist_sq_output, - atol=1e-3, - rtol=1e-3, - ) - print(">> Index validation passed.") diff --git a/test/test_distance_transform_cdt.py b/test/test_distance_transform_cdt.py new file mode 100644 index 0000000..b35248f --- /dev/null +++ b/test/test_distance_transform_cdt.py @@ -0,0 +1,500 @@ +import numpy as np +import pytest +import torch +from scipy.ndimage import distance_transform_cdt as scipy_cdt + +import torchmorph as tm + + +# ====================================================================== +# Helper functions +# ====================================================================== +def batch_scipy_cdt( + batch_numpy: np.ndarray, + metric: str = "chessboard", + return_indices: bool = False, + spatial_ndim: int = 2, +) -> tuple[np.ndarray, np.ndarray | None]: + """Compute SciPy CDT for a batch of arrays. + + Args: + batch_numpy: Input array with shape (batch..., spatial...) + metric: 'chessboard' or 'taxicab' + return_indices: Whether to return indices + spatial_ndim: Number of spatial dimensions (1, 2 or 3) + """ + original_shape = batch_numpy.shape + spatial_shape = original_shape[-spatial_ndim:] + batch_shape = original_shape[:-spatial_ndim] + + if len(batch_shape) > 0: + batch_size = int(np.prod(batch_shape)) + flat_input = batch_numpy.reshape(batch_size, *spatial_shape) + else: + batch_size = 1 + flat_input = batch_numpy[np.newaxis, ...] 
+ + dist_results: list[np.ndarray] = [] + indices_results: list[np.ndarray] = [] + + for sample in flat_input: + if return_indices: + dist, indices = scipy_cdt( + sample, + metric=metric, + return_distances=True, + return_indices=True, + ) + dist_results.append(dist) + indices_results.append(indices) + else: + dist = scipy_cdt(sample, metric=metric) + dist_results.append(dist) + + output_dist = np.stack(dist_results, axis=0) + + # Reshape back + if len(batch_shape) > 0: + output_dist = output_dist.reshape(*batch_shape, *spatial_shape) + else: + output_dist = output_dist[0] + + if return_indices: + output_indices = np.stack(indices_results, axis=0) + if len(batch_shape) > 0: + output_indices = output_indices.reshape(*batch_shape, spatial_ndim, *spatial_shape) + else: + output_indices = output_indices[0] + return output_dist, output_indices + + return output_dist, None + + +# ====================================================================== +# Test data: (B, C, Spatial...) format +# ====================================================================== +# 1D spatial: (B=2, C=1, W=9) +case_1d = np.array( + [[[0, 1, 1, 1, 1, 0, 1, 1, 0]], [[1, 1, 0, 1, 1, 1, 1, 0, 1]]], + dtype=np.float32, +) + +# 2D spatial: (B=1, C=1, H=5, W=6) +case_2d_simple = np.array( + [ + [ + [ + [0, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [0, 1, 1, 1, 1, 0], + ] + ] + ], + dtype=np.float32, +) + +# 2D spatial batch: (B=2, C=1, H=4, W=5) +case_2d_batch = np.array( + [ + [[[0, 1, 1, 1, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [0, 1, 1, 1, 0]]], + [[[1, 1, 0, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 0, 1, 1]]], + ], + dtype=np.float32, +) + +# 2D checkerboard: (B=1, C=1, H=4, W=4) +case_checkerboard = np.array( + [ + [ + [ + [0, 1, 0, 1], + [1, 0, 1, 0], + [0, 1, 0, 1], + [1, 0, 1, 0], + ] + ] + ], + dtype=np.float32, +) + +# 3D spatial: (B=1, C=1, D=5, H=5, W=5) +_case_3d_simple = np.zeros((1, 1, 5, 5, 5), dtype=np.float32) +_case_3d_simple[0, 
0, 1:4, 1:4, 1:4] = 1 # 3x3x3 cube of foreground +case_3d_simple = _case_3d_simple + +# 3D sphere: (B=1, C=1, D=7, H=7, W=7) +_case_3d_sphere = np.zeros((1, 1, 7, 7, 7), dtype=np.float32) +for z in range(7): + for y in range(7): + for x in range(7): + if (z - 3) ** 2 + (y - 3) ** 2 + (x - 3) ** 2 <= 4: + _case_3d_sphere[0, 0, z, y, x] = 1 +case_3d_sphere = _case_3d_sphere + +# 3D batch: (B=2, C=1, D=4, H=5, W=6) +_case_3d_batch_s1 = np.ones((1, 4, 5, 6), dtype=np.float32) +_case_3d_batch_s1[0, 1, 1, 1] = 0.0 +_case_3d_batch_s1[0, 2, 3, 4] = 0.0 + +_case_3d_batch_s2 = np.ones((1, 4, 5, 6), dtype=np.float32) +_case_3d_batch_s2[0, 0, 0, 0] = 0.0 + +case_3d_batch = np.stack( + [_case_3d_batch_s1, _case_3d_batch_s2], axis=0 +) # (B=2, C=1, D=4, H=5, W=6) + + +# ====================================================================== +# Test basic CDT functionality with BCHW format +# ====================================================================== +@pytest.mark.parametrize( + "input_numpy, spatial_ndim, metric", + [ + pytest.param(case_1d, 1, "chessboard", id="1D_B2C1_chessboard"), + pytest.param(case_1d, 1, "taxicab", id="1D_B2C1_taxicab"), + pytest.param(case_2d_simple, 2, "chessboard", id="2D_B1C1_chessboard"), + pytest.param(case_2d_simple, 2, "taxicab", id="2D_B1C1_taxicab"), + pytest.param(case_2d_batch, 2, "chessboard", id="2D_B2C1_chessboard"), + pytest.param(case_2d_batch, 2, "taxicab", id="2D_B2C1_taxicab"), + pytest.param(case_checkerboard, 2, "chessboard", id="2D_checkerboard_chessboard"), + pytest.param(case_checkerboard, 2, "taxicab", id="2D_checkerboard_taxicab"), + pytest.param(case_3d_simple, 3, "chessboard", id="3D_B1C1_simple_chessboard"), + pytest.param(case_3d_simple, 3, "taxicab", id="3D_B1C1_simple_taxicab"), + pytest.param(case_3d_sphere, 3, "chessboard", id="3D_B1C1_sphere_chessboard"), + pytest.param(case_3d_sphere, 3, "taxicab", id="3D_B1C1_sphere_taxicab"), + pytest.param(case_3d_batch, 3, "chessboard", id="3D_B2C1_batch_chessboard"), + 
pytest.param(case_3d_batch, 3, "taxicab", id="3D_B2C1_batch_taxicab"), + ], +) +def test_cdt_basic( + input_numpy: np.ndarray, + spatial_ndim: int, + metric: str, + request: pytest.FixtureRequest, +) -> None: + """Test CDT distance computation against scipy with BCHW format.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + x_numpy_contiguous = np.ascontiguousarray(input_numpy) + x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() + + print(f"\n\n--- Running test: {request.node.callspec.id} ---") + print(f"CUDA input shape: {x_cuda.shape}, spatial_ndim: {spatial_ndim}, metric: {metric}") + + # Run torchmorph CDT + dist_cuda = tm.distance_transform_cdt(x_cuda, metric=metric) + + # Run scipy CDT (ground truth) + dist_scipy, _ = batch_scipy_cdt(x_numpy_contiguous, metric=metric, spatial_ndim=spatial_ndim) + dist_ref = torch.from_numpy(dist_scipy).to(torch.float32).cuda() + + print(f"CUDA distance shape: {dist_cuda.shape}, reference shape: {dist_ref.shape}") + assert ( + dist_cuda.shape == dist_ref.shape + ), f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" + + # Debug: print actual values for small tensors + if dist_cuda.numel() <= 50: + print(f"Input:\n{x_cuda.cpu().numpy()}") + print(f"CUDA result:\n{dist_cuda.cpu().numpy()}") + print(f"SciPy reference:\n{dist_ref.cpu().numpy()}") + + torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-5, rtol=1e-5) + print(">> Distance validation passed.") + + +# ====================================================================== +# Test metric aliases +# ====================================================================== +@pytest.mark.parametrize( + "alias, canonical", + [ + pytest.param("cityblock", "taxicab", id="cityblock"), + pytest.param("manhattan", "taxicab", id="manhattan"), + ], +) +def test_metric_aliases(alias: str, canonical: str) -> None: + """Test that metric aliases produce same results.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + 
x_cuda = torch.from_numpy(case_2d_simple).cuda() + + dist_alias = tm.distance_transform_cdt(x_cuda, metric=alias) + dist_canonical = tm.distance_transform_cdt(x_cuda, metric=canonical) + + torch.testing.assert_close(dist_alias, dist_canonical, atol=1e-5, rtol=1e-5) + print(f">> Alias '{alias}' == '{canonical}' validation passed.") + + +# ====================================================================== +# Test return flags with BCHW format +# ====================================================================== +def test_return_flags() -> None: + """Test return_distances and return_indices flags with BCHW format.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + # (B=1, C=1, H=5, W=6) + x = torch.from_numpy(case_2d_simple).cuda() + + # Only distances (default) + result = tm.distance_transform_cdt(x, return_distances=True, return_indices=False) + assert isinstance(result, torch.Tensor), "Should return single tensor" + assert result.shape == x.shape + + # Only indices - spatial_ndim=2 for BCHW + result = tm.distance_transform_cdt(x, return_distances=False, return_indices=True) + assert isinstance(result, torch.Tensor), "Should return single tensor" + assert result.shape == (2, *x.shape) # (spatial_ndim, B, C, H, W) + + # Both + dist, idx = tm.distance_transform_cdt(x, return_distances=True, return_indices=True) + assert dist.shape == x.shape + assert idx.shape == (2, *x.shape) + + print(">> Return flags test passed.") + + +# ====================================================================== +# Test pre-allocated output tensors (scipy convention) with BCHW format +# ====================================================================== +def test_preallocated_output() -> None: + """Test pre-allocated output tensors with scipy-style return convention.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + # (B=1, C=1, H=5, W=6) + x = torch.from_numpy(case_2d_simple).cuda() + + # Pre-allocate distances tensor 
+ dist_out = torch.empty_like(x) + result = tm.distance_transform_cdt(x, distances=dist_out) + + # Should return None (scipy convention) + assert result is None, "Should return None when distances tensor is provided" + + # But dist_out should be filled + dist_ref, _ = batch_scipy_cdt(case_2d_simple, metric="chessboard", spatial_ndim=2) + dist_ref_tensor = torch.from_numpy(dist_ref).to(torch.float32).cuda() + torch.testing.assert_close(dist_out, dist_ref_tensor, atol=1e-5, rtol=1e-5) + + print(">> Pre-allocated output test passed.") + + +# ====================================================================== +# Test indices correctness with BCHW format +# ====================================================================== +def test_indices_correctness() -> None: + """Test that indices point to correct nearest background pixel with BCHW format.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + # (B=1, C=1, H=5, W=6) + x = torch.from_numpy(case_2d_simple).cuda() + + dist, idx = tm.distance_transform_cdt(x, metric="chessboard", return_indices=True) + + # For each foreground pixel, verify the index points to a background pixel + # idx shape: (spatial_ndim=2, B=1, C=1, H=5, W=6) + B, C, H, W = x.shape + x_np = x.cpu().numpy() + idx_np = idx.cpu().numpy() # (2, B, C, H, W) + dist_np = dist.cpu().numpy() + + for b in range(B): + for c in range(C): + for y in range(H): + for x_coord in range(W): + if x_np[b, c, y, x_coord] != 0: # Foreground + idx_y = idx_np[0, b, c, y, x_coord] + idx_x = idx_np[1, b, c, y, x_coord] + # The pointed pixel should be background + assert ( + x_np[b, c, idx_y, idx_x] == 0 + ), f"Index ({idx_y}, {idx_x}) should point to background" + # Chessboard distance should match + expected_dist = max(abs(y - idx_y), abs(x_coord - idx_x)) + assert ( + dist_np[b, c, y, x_coord] == expected_dist + ), f"Distance mismatch at ({b}, {c}, {y}, {x_coord})" + + print(">> Indices correctness test passed.") + + +# 
====================================================================== +# Test indices correctness - 3D with BCHW format +# ====================================================================== +def test_indices_correctness_3d() -> None: + """Test that 3D indices point to correct nearest background pixel with BCDHW format.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + # (B=1, C=1, D=5, H=5, W=5) + x = torch.from_numpy(case_3d_simple).cuda() + + dist, idx = tm.distance_transform_cdt(x, metric="chessboard", return_indices=True) + + # For each foreground pixel, verify the index points to a background pixel + # idx shape: (spatial_ndim=3, B=1, C=1, D=5, H=5, W=5) + B, C, D, H, W = x.shape + x_np = x.cpu().numpy() + idx_np = idx.cpu().numpy() # (3, B, C, D, H, W) + dist_np = dist.cpu().numpy() + + for b in range(B): + for c in range(C): + for z in range(D): + for y in range(H): + for x_coord in range(W): + if x_np[b, c, z, y, x_coord] != 0: # Foreground + idx_z = idx_np[0, b, c, z, y, x_coord] + idx_y = idx_np[1, b, c, z, y, x_coord] + idx_x = idx_np[2, b, c, z, y, x_coord] + # The pointed pixel should be background + assert ( + x_np[b, c, idx_z, idx_y, idx_x] == 0 + ), f"Index ({idx_z}, {idx_y}, {idx_x}) should point to background" + # Chessboard distance should match + expected_dist = max( + abs(z - idx_z), abs(y - idx_y), abs(x_coord - idx_x) + ) + assert ( + dist_np[b, c, z, y, x_coord] == expected_dist + ), f"Distance mismatch at ({b}, {c}, {z}, {y}, {x_coord})" + + print(">> 3D Indices correctness test passed.") + + +# ====================================================================== +# Test invalid inputs +# ====================================================================== +def test_invalid_metric() -> None: + """Test that invalid metric raises error.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + x = torch.from_numpy(case_2d_simple).cuda() + + with pytest.raises(ValueError, match="metric 
must be"): + tm.distance_transform_cdt(x, metric="invalid") + + +def test_cpu_input_error() -> None: + """Test that CPU input raises error.""" + x = torch.from_numpy(case_2d_simple) # CPU tensor + + with pytest.raises(ValueError, match="CUDA"): + tm.distance_transform_cdt(x) + + +# ====================================================================== +# Test with random data - BCHW format +# ====================================================================== +@pytest.mark.parametrize( + "shape, spatial_ndim, metric", + [ + # 1D spatial: (B, C, W) + pytest.param((2, 1, 32), 1, "chessboard", id="1D_B2C1_32_chessboard"), + pytest.param((2, 1, 32), 1, "taxicab", id="1D_B2C1_32_taxicab"), + pytest.param((4, 2, 64), 1, "chessboard", id="1D_B4C2_64_chessboard"), + # 2D spatial: (B, C, H, W) + pytest.param((1, 1, 32, 32), 2, "chessboard", id="2D_B1C1_32x32_chessboard"), + pytest.param((1, 1, 32, 32), 2, "taxicab", id="2D_B1C1_32x32_taxicab"), + pytest.param((2, 1, 32, 32), 2, "chessboard", id="2D_B2C1_32x32_chessboard"), + pytest.param((2, 1, 32, 32), 2, "taxicab", id="2D_B2C1_32x32_taxicab"), + pytest.param((2, 3, 64, 48), 2, "chessboard", id="2D_B2C3_64x48_chessboard"), + # 3D spatial: (B, C, D, H, W) + pytest.param((1, 1, 8, 8, 8), 3, "chessboard", id="3D_B1C1_8x8x8_chessboard"), + pytest.param((1, 1, 8, 8, 8), 3, "taxicab", id="3D_B1C1_8x8x8_taxicab"), + pytest.param((2, 1, 16, 16, 16), 3, "chessboard", id="3D_B2C1_16x16x16_chessboard"), + pytest.param((2, 2, 12, 10, 8), 3, "taxicab", id="3D_B2C2_12x10x8_taxicab"), + ], +) +def test_random_data(shape: tuple, spatial_ndim: int, metric: str) -> None: + """Test CDT with random binary data in BCHW format.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + np.random.seed(42) + input_numpy = (np.random.rand(*shape) > 0.3).astype(np.float32) + + x_cuda = torch.from_numpy(input_numpy).cuda() + + # Run torchmorph CDT + dist_cuda = tm.distance_transform_cdt(x_cuda, metric=metric) + + # Run scipy 
CDT + dist_scipy, _ = batch_scipy_cdt(input_numpy, metric=metric, spatial_ndim=spatial_ndim) + dist_ref = torch.from_numpy(dist_scipy).to(torch.float32).cuda() + + torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-5, rtol=1e-5) + print(f">> Random data test ({shape}, spatial_ndim={spatial_ndim}, {metric}) passed.") + + +# ====================================================================== +# Test indices validation with BCHW format +# ====================================================================== +@pytest.mark.parametrize( + "input_numpy, spatial_ndim, metric", + [ + pytest.param(case_1d, 1, "chessboard", id="1D_indices_chessboard"), + pytest.param(case_1d, 1, "taxicab", id="1D_indices_taxicab"), + pytest.param(case_2d_batch, 2, "chessboard", id="2D_batch_indices_chessboard"), + pytest.param(case_3d_batch, 3, "chessboard", id="3D_batch_indices_chessboard"), + ], +) +def test_indices_validation( + input_numpy: np.ndarray, + spatial_ndim: int, + metric: str, + request: pytest.FixtureRequest, +) -> None: + """Test that indices correctly point to nearest background pixels in BCHW format.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + x_numpy_contiguous = np.ascontiguousarray(input_numpy) + x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() + + print(f"\n\n--- Running test: {request.node.callspec.id} ---") + print(f"CUDA input shape: {x_cuda.shape}, spatial_ndim: {spatial_ndim}") + + # Run torchmorph CDT with indices + dist_cuda, idx_cuda = tm.distance_transform_cdt(x_cuda, metric=metric, return_indices=True) + + # Validate indices shape: (spatial_ndim, *input_shape) + expected_idx_shape = (spatial_ndim, *x_cuda.shape) + assert ( + idx_cuda.shape == expected_idx_shape + ), f"Index shape mismatch: {idx_cuda.shape} vs {expected_idx_shape}" + + # Validate that indices point to background pixels and distance matches + spatial_shape = x_cuda.shape[-spatial_ndim:] + batch_shape = x_cuda.shape[:-spatial_ndim] + + # Create 
coordinate grid for spatial dimensions + coords = [torch.arange(s, device="cuda") for s in spatial_shape] + grid = torch.stack( + torch.meshgrid(*coords, indexing="ij"), dim=0 + ) # (spatial_ndim, *spatial_shape) + + # Expand grid for batch dimensions + for _ in batch_shape: + grid = grid.unsqueeze(1) + grid = grid.expand(spatial_ndim, *batch_shape, *spatial_shape) + + # Calculate distance from indices based on metric + diff = grid.float() - idx_cuda.float() + if metric in ("chessboard",): + # Chessboard: max of absolute differences + dist_calculated = torch.max(torch.abs(diff), dim=0).values + else: + # Taxicab: sum of absolute differences + dist_calculated = torch.sum(torch.abs(diff), dim=0) + + torch.testing.assert_close(dist_calculated, dist_cuda, atol=1e-5, rtol=1e-5) + print(">> Index validation passed.") diff --git a/test/test_distance_transform_edt.py b/test/test_distance_transform_edt.py new file mode 100644 index 0000000..cace4c8 --- /dev/null +++ b/test/test_distance_transform_edt.py @@ -0,0 +1,396 @@ +import numpy as np # noqa: F401 +import pytest +import torch +from scipy.ndimage import distance_transform_edt as scipy_edt # noqa: F401 + +import torchmorph as tm # noqa: F401 + + +# ====================================================================== +# Helper functions +# ====================================================================== +def batch_scipy_edt_with_indices( + batch_numpy: np.ndarray, + spatial_ndim: int, +) -> tuple[np.ndarray, np.ndarray]: + """Compute SciPy EDT and indices for a batch of arrays. 
+ + Args: + batch_numpy: Input array with shape (batch..., *spatial_shape) + spatial_ndim: Number of spatial dimensions + """ + dist_results: list[np.ndarray] = [] + indices_results: list[np.ndarray] = [] + + # Compute batch shape + batch_shape = batch_numpy.shape[:-spatial_ndim] if spatial_ndim > 0 else () + spatial_shape = batch_numpy.shape[-spatial_ndim:] if spatial_ndim > 0 else batch_numpy.shape + + # Flatten batch dimensions + if len(batch_shape) > 0: + batch_size = int(np.prod(batch_shape)) + flat_input = batch_numpy.reshape(batch_size, *spatial_shape) + else: + batch_size = 1 + flat_input = batch_numpy[np.newaxis, ...] + + for sample in flat_input: + dist, indices = scipy_edt( + sample, + return_indices=True, + return_distances=True, + ) + dist_results.append(dist) + indices_results.append(indices) + + output_dist = np.stack(dist_results, axis=0) + output_indices = np.stack(indices_results, axis=0) + + # Reshape back to original batch shape + if len(batch_shape) > 0: + output_dist = output_dist.reshape(*batch_shape, *spatial_shape) + output_indices = output_indices.reshape(*batch_shape, spatial_ndim, *spatial_shape) + else: + output_dist = output_dist[0] + output_indices = output_indices[0] + + return output_dist, output_indices + + +# ====================================================================== +# Test data: (B, C, Spatial...) 
format +# ====================================================================== +# 1D spatial: (B=2, C=1, W=6) +case_1d = np.array( + [[[1, 1, 0, 1, 0, 1]], [[0, 1, 1, 1, 1, 0]]], + dtype=np.float32, +) + +# 2D spatial: (B=2, C=1, H=3, W=4) +case_2d = np.array( + [ + [[[0.0, 1, 1, 1], [0, 0, 1, 1], [0, 1, 1, 0]]], + [[[0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]]], + ], + dtype=np.float32, +) + +# 2D spatial single batch: (B=1, C=1, H=4, W=4) +case_2d_single = np.array( + [ + [ + [ + [0, 1, 0, 1], + [1, 0, 1, 0], + [0, 1, 0, 1], + [1, 0, 1, 0], + ] + ] + ], + dtype=np.float32, +) + +# 3D spatial: (B=2, C=1, D=4, H=5, W=6) +_case_3d_s1 = np.ones((1, 4, 5, 6), dtype=np.float32) +_case_3d_s1[0, 1, 1, 1] = 0.0 +_case_3d_s1[0, 2, 3, 4] = 0.0 + +_case_3d_s2 = np.ones((1, 4, 5, 6), dtype=np.float32) +_case_3d_s2[0, 0, 0, 0] = 0.0 + +case_3d = np.stack([_case_3d_s1, _case_3d_s2], axis=0) # (B=2, C=1, D=4, H=5, W=6) + +# 2D with unit dimension: (B=2, C=1, H=5, W=1) +case_2d_unit = np.ones((2, 1, 5, 1), dtype=np.float32) +case_2d_unit[0, 0, 2, 0] = 0.0 +case_2d_unit[1, 0, 4, 0] = 0.0 + + +# ====================================================================== +# Test logic +# ====================================================================== +@pytest.mark.parametrize( + "input_numpy, spatial_ndim", + [ + pytest.param(case_1d, 1, id="1D_B2C1"), + pytest.param(case_2d, 2, id="2D_B2C1"), + pytest.param(case_2d_single, 2, id="2D_B1C1"), + pytest.param(case_3d, 3, id="3D_B2C1"), + pytest.param(case_2d_unit, 2, id="2D_UnitDim_B2C1"), + ], +) +def test_distance_transform_and_indices( + input_numpy: np.ndarray, + spatial_ndim: int, + request: pytest.FixtureRequest, +) -> None: + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + # 1. 
Prepare data + x_numpy_contiguous = np.ascontiguousarray(input_numpy) + x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() + + print(f"\n\n--- Running test: {request.node.callspec.id} ---") + print(f"CUDA input shape: {x_cuda.shape}, spatial_ndim: {spatial_ndim}") + + # 2. Create sampling list to specify spatial dimensions + sampling = [1.0] * spatial_ndim + + # 3. Run CUDA EDT + dist_cuda, idx_cuda = tm.distance_transform( + x_cuda.clone(), sampling=sampling, return_indices=True + ) + + # 4. Run SciPy (ground truth) + dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_indices(x_numpy_contiguous, spatial_ndim) + dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() + + # 5. Validate distances + print( + f"CUDA distance shape: {dist_cuda.shape}, reference shape: {dist_ref.shape}", + ) + assert ( + dist_cuda.shape == dist_ref.shape + ), f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" + + # Debug: print actual values for small tensors + if dist_cuda.numel() <= 30: + print(f"Input:\n{x_cuda.cpu().numpy()}") + print(f"CUDA result:\n{dist_cuda.cpu().numpy()}") + print(f"SciPy reference:\n{dist_ref.cpu().numpy()}") + + torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) + print(">> Distance validation passed.") + + # 6. 
Validate indices + # idx_cuda shape: (spatial_ndim, *input_shape) + # We need to verify that the indices point to the correct nearest background pixel + spatial_shape = x_cuda.shape[-spatial_ndim:] + batch_shape = x_cuda.shape[:-spatial_ndim] + + # Create coordinate grid for spatial dimensions + coords = [torch.arange(s, device="cuda") for s in spatial_shape] + grid = torch.stack( + torch.meshgrid(*coords, indexing="ij"), dim=0 + ) # (spatial_ndim, *spatial_shape) + + # Expand grid for batch dimensions + for _ in batch_shape: + grid = grid.unsqueeze(1) # (spatial_ndim, 1, ..., *spatial_shape) + grid = grid.expand( + spatial_ndim, *batch_shape, *spatial_shape + ) # (spatial_ndim, *batch_shape, *spatial_shape) + + # Calculate distance from indices + diff = grid.float() - idx_cuda.float() + dist_sq_calculated = torch.sum(diff * diff, dim=0) + dist_sq_output = dist_cuda * dist_cuda + + torch.testing.assert_close( + dist_sq_calculated, + dist_sq_output, + atol=1e-3, + rtol=1e-3, + ) + print(">> Index validation passed.") + + +# ====================================================================== +# Helper for sampling tests +# ====================================================================== +def batch_scipy_edt_with_sampling( + batch_numpy: np.ndarray, + spatial_ndim: int, + sampling: list[float], +) -> tuple[np.ndarray, np.ndarray]: + """Compute SciPy EDT with sampling for a batch of arrays. 
+ + Args: + batch_numpy: Input array with shape (batch..., *spatial_shape) + spatial_ndim: Number of spatial dimensions + sampling: Spacing for each spatial dimension + """ + dist_results: list[np.ndarray] = [] + indices_results: list[np.ndarray] = [] + + batch_shape = batch_numpy.shape[:-spatial_ndim] if spatial_ndim > 0 else () + spatial_shape = batch_numpy.shape[-spatial_ndim:] if spatial_ndim > 0 else batch_numpy.shape + + if len(batch_shape) > 0: + batch_size = int(np.prod(batch_shape)) + flat_input = batch_numpy.reshape(batch_size, *spatial_shape) + else: + batch_size = 1 + flat_input = batch_numpy[np.newaxis, ...] + + for sample in flat_input: + dist, indices = scipy_edt( + sample, + sampling=sampling, + return_indices=True, + return_distances=True, + ) + dist_results.append(dist) + indices_results.append(indices) + + output_dist = np.stack(dist_results, axis=0) + output_indices = np.stack(indices_results, axis=0) + + if len(batch_shape) > 0: + output_dist = output_dist.reshape(*batch_shape, *spatial_shape) + output_indices = output_indices.reshape(*batch_shape, spatial_ndim, *spatial_shape) + else: + output_dist = output_dist[0] + output_indices = output_indices[0] + + return output_dist, output_indices + + +# ====================================================================== +# Test sampling functionality +# ====================================================================== +@pytest.mark.parametrize( + "input_numpy, spatial_ndim, sampling", + [ + # 2D with non-uniform sampling + pytest.param(case_2d_single, 2, [0.5, 1.0], id="2D_Sampling_0.5_1.0"), + pytest.param(case_2d_single, 2, [2.0, 0.5], id="2D_Sampling_2.0_0.5"), + pytest.param(case_2d_single, 2, [0.25, 0.25], id="2D_Sampling_0.25_0.25"), + # 2D batch with sampling + pytest.param(case_2d, 2, [1.5, 0.75], id="2D_Batch_Sampling"), + # 3D with sampling + pytest.param(case_3d, 3, [1.0, 2.0, 0.5], id="3D_Batch_Sampling"), + # 1D with sampling + pytest.param(case_1d, 1, [0.5], 
id="1D_Batch_Sampling"), + # Test single-element list broadcast + pytest.param(case_2d_single, 2, [0.5], id="2D_SingleElementList_Broadcast"), + pytest.param(case_3d, 3, [2.0], id="3D_SingleElementList_Broadcast"), + ], +) +def test_distance_transform_with_sampling( + input_numpy: np.ndarray, + spatial_ndim: int, + sampling: list[float], + request: pytest.FixtureRequest, +) -> None: + """Test EDT with non-unit sampling (pixel spacing).""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + x_numpy_contiguous = np.ascontiguousarray(input_numpy) + x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() + + print(f"\n\n--- Running test: {request.node.callspec.id} ---") + print(f"CUDA input shape: {x_cuda.shape}, spatial_ndim: {spatial_ndim}, sampling: {sampling}") + + # Run CUDA EDT with sampling + dist_cuda, idx_cuda = tm.distance_transform_edt( + x_cuda.clone(), sampling=sampling, return_indices=True + ) + + # Expand single-element list for SciPy (it doesn't support broadcast) + scipy_sampling = sampling if len(sampling) == spatial_ndim else sampling * spatial_ndim + + # Run SciPy with sampling (ground truth) + dist_ref_numpy, idx_ref_numpy = batch_scipy_edt_with_sampling( + x_numpy_contiguous, spatial_ndim, scipy_sampling + ) + dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() + + # Validate distances + print(f"CUDA distance shape: {dist_cuda.shape}, reference shape: {dist_ref.shape}") + assert ( + dist_cuda.shape == dist_ref.shape + ), f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" + torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) + print(">> Distance validation with sampling passed.") + + # Validate indices shape + expected_idx_shape = (spatial_ndim, *x_cuda.shape) + assert ( + idx_cuda.shape == expected_idx_shape + ), f"Index shape mismatch: {idx_cuda.shape} vs {expected_idx_shape}" + print(">> Index shape validation passed.") + + # Validate indices correctness using sampling + spatial_shape 
= x_cuda.shape[-spatial_ndim:] + batch_shape = x_cuda.shape[:-spatial_ndim] + + coords = [torch.arange(s, device="cuda") for s in spatial_shape] + grid = torch.stack(torch.meshgrid(*coords, indexing="ij"), dim=0) + + for _ in batch_shape: + grid = grid.unsqueeze(1) + grid = grid.expand(spatial_ndim, *batch_shape, *spatial_shape) + + # Calculate distance with sampling (use expanded sampling for validation) + sampling_expanded = sampling if len(sampling) == spatial_ndim else sampling * spatial_ndim + sampling_tensor = torch.tensor(sampling_expanded, device="cuda", dtype=torch.float32) + for _ in range(len(batch_shape) + len(spatial_shape)): + sampling_tensor = sampling_tensor.unsqueeze(-1) + sampling_tensor = sampling_tensor.expand(spatial_ndim, *batch_shape, *spatial_shape) + + diff = (grid.float() - idx_cuda.float()) * sampling_tensor + dist_sq_calculated = torch.sum(diff * diff, dim=0) + dist_sq_output = dist_cuda * dist_cuda + + torch.testing.assert_close(dist_sq_calculated, dist_sq_output, atol=1e-3, rtol=1e-3) + print(">> Index validation with sampling passed.") + + +# ====================================================================== +# Test return_distances and return_indices flags +# ====================================================================== +def test_return_flags() -> None: + """Test return_distances and return_indices flags.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + # (B=1, C=1, H=2, W=3) + x = torch.tensor([[[[1, 1, 0], [1, 0, 0]]]], dtype=torch.float32).cuda() + + # Only distances + result = tm.distance_transform_edt(x, return_distances=True, return_indices=False) + assert isinstance( + result, torch.Tensor + ), "Should return single tensor when only distances requested" + assert result.shape == x.shape + + # Only indices + result = tm.distance_transform_edt(x, return_distances=False, return_indices=True) + assert isinstance( + result, torch.Tensor + ), "Should return single tensor when only indices 
requested" + assert result.shape == (2, *x.shape) # (spatial_ndim, B, C, H, W) + + # Both + dist, idx = tm.distance_transform_edt(x, return_distances=True, return_indices=True) + assert dist.shape == x.shape + assert idx.shape == (2, *x.shape) + + print(">> Return flags test passed.") + + +# ====================================================================== +# Test single float sampling +# ====================================================================== +def test_single_float_sampling() -> None: + """Test that a single float sampling value applies to all dimensions.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + # Use case_2d_single which is (B=1, C=1, H=4, W=4) format + x_numpy = case_2d_single + x_cuda = torch.from_numpy(x_numpy).cuda() + + # Single float should apply to all spatial dimensions + dist_cuda = tm.distance_transform_edt(x_cuda, sampling=0.5) + + # Compare with scipy using [0.5, 0.5] - use batch helper for BCHW format + spatial_ndim = 2 + dist_ref_numpy, _ = batch_scipy_edt_with_sampling(x_numpy, spatial_ndim, [0.5, 0.5]) + dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() + + torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) + print(">> Single float sampling test passed.") diff --git a/torchmorph/__init__.py b/torchmorph/__init__.py index 1f31247..ed972e1 100644 --- a/torchmorph/__init__.py +++ b/torchmorph/__init__.py @@ -1,4 +1,18 @@ from .add import add -from .distance_transform import distance_transform +from .binary_morphology import binary_fill_holes, binary_propagation +from .distance_transform import ( + distance_transform, + distance_transform_bf, + distance_transform_cdt, + distance_transform_edt, +) -__all__ = ["add", "distance_transform"] +__all__ = [ + "add", + "distance_transform", + "distance_transform_edt", + "distance_transform_bf", + "distance_transform_cdt", + "binary_fill_holes", + "binary_propagation", +] diff --git 
a/torchmorph/csrc/distance_transform_edt.cu b/torchmorph/csrc/distance_transform_edt.cu index 25c7701..225a976 100644 --- a/torchmorph/csrc/distance_transform_edt.cu +++ b/torchmorph/csrc/distance_transform_edt.cu @@ -11,7 +11,6 @@ #define INF_VAL 1e20f #define MAX_THREADS 256 #define SHARED_MEM_LIMIT 2048 // Max dimension size for shared memory path (48KB limit) -#define EDT_2D_MAX_DIM 4096 // Max dimension size for 2D optimized kernels // ============================================================================== // 2D Optimized: Initialization kernel @@ -284,7 +283,7 @@ __global__ void edt_2d_cols_kernel( } // ============================================================================== -// 2D Optimized: Host function +// 2D Optimized: Host function (shared memory only, for dimensions <= 2048) // ============================================================================== std::tuple run_edt_2d_optimized( torch::Tensor input, @@ -305,11 +304,10 @@ std::tuple run_edt_2d_optimized( int width = shape[total_ndim - 1]; int64_t batch_stride = (int64_t)height * width; int64_t batch_size = input.numel() / batch_stride; - int64_t total_pixels = input.numel(); - // Check dimension limits for 2D optimized path - TORCH_CHECK(height <= EDT_2D_MAX_DIM && width <= EDT_2D_MAX_DIM, - "Dimensions too large for 2D optimized path"); + // This function should only be called when both dimensions fit in shared memory + TORCH_CHECK(height <= SHARED_MEM_LIMIT && width <= SHARED_MEM_LIMIT, + "Dimensions too large for 2D optimized path, use general N-D version"); // Create output tensors auto distance = torch::empty_like(input); @@ -338,7 +336,7 @@ std::tuple run_edt_2d_optimized( ); } - // Step 2: Row-wise EDT (X direction) + // Step 2: Row-wise EDT (X direction) - shared memory { int64_t num_rows = batch_size * height; int threads = min(width, MAX_THREADS); @@ -359,7 +357,7 @@ std::tuple run_edt_2d_optimized( ); } - // Step 3: Column-wise EDT (Y direction) + // Step 3: Column-wise 
EDT (Y direction) - shared memory { int64_t num_cols = batch_size * width; int threads = min(height, MAX_THREADS); @@ -849,14 +847,15 @@ std::tuple distance_transform_edt_cuda( int spatial_ndim = sampling.size(); - // Use 2D optimized path when applicable + // Use 2D optimized path only when both dimensions fit in shared memory + // For larger dimensions, the N-D general version with transpose is faster if (spatial_ndim == 2) { auto shape = input.sizes().vec(); int height = shape[total_ndim - 2]; int width = shape[total_ndim - 1]; - // Check if dimensions are within limits for 2D optimized path - if (height <= EDT_2D_MAX_DIM && width <= EDT_2D_MAX_DIM) { + // Only use 2D optimized path when shared memory can be used for both directions + if (height <= SHARED_MEM_LIMIT && width <= SHARED_MEM_LIMIT) { float spacing_y = sampling[0]; float spacing_x = sampling[1]; diff --git a/torchmorph/csrc/distance_transform_kernel.cu b/torchmorph/csrc/distance_transform_kernel.cu deleted file mode 100644 index 817d637..0000000 --- a/torchmorph/csrc/distance_transform_kernel.cu +++ /dev/null @@ -1,945 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ------------------------------------------------------------------ -// Global Configuration(test) -// ------------------------------------------------------------------ -#define BLOCK_SIZE 256 -#define INF_VAL 1e20f -#define MAX_THREADS 1024 -#define SMEM_LIMIT_ELEMENTS 4096 - -#define JFA_BLOCK_DIM 32 -#define JFA_FUSED_STEPS 4 -#define JFA_MAX_OFFSET 8 -#define JFA_SMEM_DIM (JFA_BLOCK_DIM + 2 * JFA_MAX_OFFSET) - -// 3D Config -#define JFA_3D_BLOCK 8 -#define JFA_3D_HALO 1 - -// ------------------------------------------------------------------ -// Device Helpers -// ------------------------------------------------------------------ -__device__ __forceinline__ float sqr(float x) { return x * x; } - -// Helper for JFA 2D/3D (Standard) -__device__ __forceinline__ float 
dist_sq_2d(int y1, int x1, int y2, int x2) { - return sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); -} - -// Helper for SoA 3D (Z, Y, X separate) -__device__ __forceinline__ float dist_sq_3d_soa(int z1, int y1, int x1, int z2, int y2, int x2) { - if (z2 == -1) return INF_VAL; - float dz = (float)(z1 - z2); - float dy = (float)(y1 - y2); - float dx = (float)(x1 - x2); - return dz*dz + dy*dy + dx*dx; -} - -// Helper for Separable 1D -__device__ __forceinline__ float compute_cost(int q, int p, float val_p) { - if (p < 0 || val_p >= INF_VAL) return INF_VAL; - return sqr((float)q - (float)p) + val_p; -} - -// Device Helpers for int2 (2D Vectorized) -__device__ __forceinline__ float dist_sq_int2(int y, int x, int2 seed) { - if (seed.x == -1) return INF_VAL; - float dy = (float)(y - seed.x); - float dx = (float)(x - seed.y); - return dy*dy + dx*dx; -} - -// ================================================================== -// PART 1: JFA KERNELS 2D (Vectorized int2 + Block Shared) -// ================================================================== - -__global__ void init_jfa_kernel_2d_opt( - const float* __restrict__ input, - int2* __restrict__ output, - int64_t total_elements, - int H, int W -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_elements) return; - - if (input[tid] == 0.0f) { - int64_t spatial_size = (int64_t)H * W; - int64_t rem = tid % spatial_size; - int w = (int)(rem % W); - int h = (int)(rem / W); - output[tid] = make_int2(h, w); - } else { - output[tid] = make_int2(-1, -1); - } -} - -__global__ void jfa_block_fused_kernel_2d( - const int2* __restrict__ in_idx, - int2* __restrict__ out_idx, - int H, int W, - int64_t num_images -) { - __shared__ int2 smem[JFA_SMEM_DIM][JFA_SMEM_DIM]; - - int tx = threadIdx.x; - int ty = threadIdx.y; - - int bx = blockIdx.x * blockDim.x; - int by = blockIdx.y * blockDim.y; - int img_idx = blockIdx.z; - int64_t batch_offset = (int64_t)img_idx * (H * W); - - int gx = bx + tx; - int gy = by + 
ty; - - // Phase 1: load data to Shared Memory - int smem_linear_size = JFA_SMEM_DIM * JFA_SMEM_DIM; - int total_threads = blockDim.x * blockDim.y; - int thread_linear_idx = ty * blockDim.x + tx; - - int base_x = bx - JFA_MAX_OFFSET; - int base_y = by - JFA_MAX_OFFSET; - - for (int i = thread_linear_idx; i < smem_linear_size; i += total_threads) { - int s_y = i / JFA_SMEM_DIM; - int s_x = i % JFA_SMEM_DIM; - int global_y = base_y + s_y; - int global_x = base_x + s_x; - int2 val = make_int2(-1, -1); - if (global_y >= 0 && global_y < H && global_x >= 0 && global_x < W) { - val = in_idx[batch_offset + global_y * W + global_x]; - } - smem[s_y][s_x] = val; - } - __syncthreads(); - - // Phase 2: Iterate in Shared Memory - if (gx < W && gy < H) { - int center_sy = ty + JFA_MAX_OFFSET; - int center_sx = tx + JFA_MAX_OFFSET; - - int2 best_seed = smem[center_sy][center_sx]; - float best_dist = dist_sq_int2(gy, gx, best_seed); - - int step = 1; - #pragma unroll - for (int k = 0; k < JFA_FUSED_STEPS; ++k) { - #pragma unroll - for (int dy = -1; dy <= 1; ++dy) { - #pragma unroll - for (int dx = -1; dx <= 1; ++dx) { - if (dy == 0 && dx == 0) continue; - int2 neighbor_seed = smem[center_sy + dy * step][center_sx + dx * step]; - if (neighbor_seed.x != -1) { - float d = dist_sq_int2(gy, gx, neighbor_seed); - if (d < best_dist) { - best_dist = d; - best_seed = neighbor_seed; - } - } - } - } - __syncthreads(); - smem[center_sy][center_sx] = best_seed; - __syncthreads(); - step *= 2; - } - out_idx[batch_offset + gy * W + gx] = best_seed; - } -} - -__global__ void jfa_step_global_2d_opt( - const int2* __restrict__ in_idx, - int2* __restrict__ out_idx, - int step, - int H, int W, - int64_t total_pixels -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_pixels) return; - - int64_t spatial_size = (int64_t)H * W; - int64_t rem = tid % spatial_size; - int64_t batch_offset = tid - rem; - int w = (int)(rem % W); - int h = (int)(rem / W); - - int2 best_seed = 
in_idx[tid]; - float best_dist = dist_sq_int2(h, w, best_seed); - - #pragma unroll - for (int dy = -1; dy <= 1; ++dy) { - #pragma unroll - for (int dx = -1; dx <= 1; ++dx) { - if (dx == 0 && dy == 0) continue; - - int ny = h + dy * step; - int nx = w + dx * step; - - if (ny >= 0 && ny < H && nx >= 0 && nx < W) { - int2 neighbor_seed = in_idx[batch_offset + ny * W + nx]; - if (neighbor_seed.x != -1) { - float d = dist_sq_int2(h, w, neighbor_seed); - if (d < best_dist) { - best_dist = d; - best_seed = neighbor_seed; - } - } - } - } - } - out_idx[tid] = best_seed; -} - -__global__ void calc_dist_kernel_2d_opt( - const int2* __restrict__ indices, - float* __restrict__ dist_out, - int64_t total_elements, - int H, int W -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_elements) return; - - int2 s = indices[tid]; - if (s.x == -1) { - dist_out[tid] = INF_VAL; - } else { - int64_t spatial_size = (int64_t)H * W; - int64_t rem = tid % spatial_size; - int cur_w = (int)(rem % W); - int cur_h = (int)(rem / W); - dist_out[tid] = sqrtf(dist_sq_int2(cur_h, cur_w, s)); - } -} - -// ================================================================== -// PART 2: JFA KERNELS 3D (Optimized SoA Layout) -// ================================================================== - -template -__global__ void init_jfa_kernel_3d_soa( - const float* __restrict__ input, - IndexType* __restrict__ indices_z, - IndexType* __restrict__ indices_y, - IndexType* __restrict__ indices_x, - int64_t total_elements, - int D, int H, int W -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_elements) return; - - if (input[tid] == 0.0f) { - int64_t spatial_size = (int64_t)D * H * W; - int64_t rem = tid % spatial_size; - int w = (int)(rem % W); - int h = (int)((rem / W) % H); - int d = (int)(rem / (W * H)); - - indices_z[tid] = (IndexType)d; - indices_y[tid] = (IndexType)h; - indices_x[tid] = (IndexType)w; - } else { - indices_z[tid] = (IndexType)-1; - 
indices_y[tid] = (IndexType)-1; - indices_x[tid] = (IndexType)-1; - } -} - -template -__global__ void jfa_block_fused_kernel_3d_soa( - const IndexType* __restrict__ in_z, - const IndexType* __restrict__ in_y, - const IndexType* __restrict__ in_x, - IndexType* __restrict__ out_z, - IndexType* __restrict__ out_y, - IndexType* __restrict__ out_x, - int D, int H, int W, - int blocks_per_d -) { - const int BLOCK_DIM = 8; - const int HALO = 3; - const int SMEM_DIM = BLOCK_DIM + 2 * HALO; // 14 - const int SMEM_SIZE = SMEM_DIM * SMEM_DIM * SMEM_DIM; - - extern __shared__ char smem_raw[]; - IndexType* smem_z = (IndexType*)smem_raw; - IndexType* smem_y = smem_z + SMEM_SIZE; - IndexType* smem_x = smem_y + SMEM_SIZE; - - int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z; - - int b_z_total = blockIdx.z; - int batch_id = b_z_total / blocks_per_d; - int b_z_local = b_z_total % blocks_per_d; - - int bx = blockIdx.x * BLOCK_DIM; - int by = blockIdx.y * BLOCK_DIM; - int bz = b_z_local * BLOCK_DIM; - - int64_t spatial_offset = (int64_t)batch_id * (D * H * W); - - // Phase 1: Load to SoA Shared Memory - int tid = tz * 64 + ty * 8 + tx; - int base_x = bx - HALO; - int base_y = by - HALO; - int base_z = bz - HALO; - - for (int i = tid; i < SMEM_SIZE; i += 512) { - int temp = i; - int sx = temp % SMEM_DIM; temp /= SMEM_DIM; - int sy = temp % SMEM_DIM; - int sz = temp / SMEM_DIM; - - int gx = base_x + sx; - int gy = base_y + sy; - int gz = base_z + sz; - - IndexType val_z = -1, val_y = -1, val_x = -1; - if (gz >= 0 && gz < D && gy >= 0 && gy < H && gx >= 0 && gx < W) { - int64_t idx = spatial_offset + (int64_t)gz * (H * W) + gy * W + gx; - val_z = in_z[idx]; - val_y = in_y[idx]; - val_x = in_x[idx]; - } - smem_z[i] = val_z; - smem_y[i] = val_y; - smem_x[i] = val_x; - } - __syncthreads(); - - // Phase 2: Compute - int center_sz = tz + HALO; - int center_sy = ty + HALO; - int center_sx = tx + HALO; - int my_s_idx = (center_sz * SMEM_DIM + center_sy) * SMEM_DIM + center_sx; - 
- int best_z = (int)smem_z[my_s_idx]; - int best_y = (int)smem_y[my_s_idx]; - int best_x = (int)smem_x[my_s_idx]; - - int g_cz = bz + tz; - int g_cy = by + ty; - int g_cx = bx + tx; - - float best_dist = dist_sq_3d_soa(g_cz, g_cy, g_cx, best_z, best_y, best_x); - - int step = 1; - #pragma unroll - for (int k = 0; k < 2; ++k) { - #pragma unroll - for (int dz = -1; dz <= 1; ++dz) { - #pragma unroll - for (int dy = -1; dy <= 1; ++dy) { - #pragma unroll - for (int dx = -1; dx <= 1; ++dx) { - if (dz == 0 && dy == 0 && dx == 0) continue; - - int nz = center_sz + dz * step; - int ny = center_sy + dy * step; - int nx = center_sx + dx * step; - int n_idx = (nz * SMEM_DIM + ny) * SMEM_DIM + nx; - - int sz_in = (int)smem_z[n_idx]; - if (sz_in != -1) { - int sy_in = (int)smem_y[n_idx]; - int sx_in = (int)smem_x[n_idx]; - float d = dist_sq_3d_soa(g_cz, g_cy, g_cx, sz_in, sy_in, sx_in); - if (d < best_dist) { - best_dist = d; - best_z = sz_in; - best_y = sy_in; - best_x = sx_in; - } - } - } - } - } - __syncthreads(); - smem_z[my_s_idx] = (IndexType)best_z; - smem_y[my_s_idx] = (IndexType)best_y; - smem_x[my_s_idx] = (IndexType)best_x; - __syncthreads(); - step *= 2; - } - - if (g_cz < D && g_cy < H && g_cx < W) { - int64_t out_idx_g = spatial_offset + (int64_t)g_cz * (H * W) + g_cy * W + g_cx; - out_z[out_idx_g] = (IndexType)best_z; - out_y[out_idx_g] = (IndexType)best_y; - out_x[out_idx_g] = (IndexType)best_x; - } -} - -template -__global__ void jfa_step_3d_soa( - const IndexType* __restrict__ in_z, - const IndexType* __restrict__ in_y, - const IndexType* __restrict__ in_x, - IndexType* __restrict__ out_z, - IndexType* __restrict__ out_y, - IndexType* __restrict__ out_x, - int step, - int D, int H, int W, - int64_t total_pixels -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_pixels) return; - - int64_t spatial_size = (int64_t)D * H * W; - int64_t rem = tid % spatial_size; - int64_t batch_offset = tid - rem; - int cur_w = (int)(rem % W); - int 
cur_h = (int)((rem / W) % H); - int cur_d = (int)(rem / (W * H)); - - int best_z = (int)in_z[tid]; - int best_y = (int)in_y[tid]; - int best_x = (int)in_x[tid]; - - float best_dist = dist_sq_3d_soa(cur_d, cur_h, cur_w, best_z, best_y, best_x); - - #pragma unroll - for (int dz = -1; dz <= 1; ++dz) { - #pragma unroll - for (int dy = -1; dy <= 1; ++dy) { - #pragma unroll - for (int dx = -1; dx <= 1; ++dx) { - if (dz == 0 && dy == 0 && dx == 0) continue; - - int nz = cur_d + dz * step; - int ny = cur_h + dy * step; - int nx = cur_w + dx * step; - - if (nz >= 0 && nz < D && ny >= 0 && ny < H && nx >= 0 && nx < W) { - int64_t n_idx = batch_offset + (int64_t)nz * (H * W) + ny * W + nx; - - int seed_z = (int)in_z[n_idx]; - if (seed_z != -1) { - float dz_val = (float)(cur_d - seed_z); - float dz_sq = dz_val * dz_val; - - if (dz_sq < best_dist) { - int seed_y = (int)in_y[n_idx]; - int seed_x = (int)in_x[n_idx]; - float dist = dz_sq + sqr((float)(cur_h - seed_y)) + sqr((float)(cur_w - seed_x)); - - if (dist < best_dist) { - best_dist = dist; - best_z = seed_z; - best_y = seed_y; - best_x = seed_x; - } - } - } - } - } - } - } - out_z[tid] = (IndexType)best_z; - out_y[tid] = (IndexType)best_y; - out_x[tid] = (IndexType)best_x; -} - -template -__global__ void calc_dist_kernel_3d_soa( - const IndexType* __restrict__ in_z, - const IndexType* __restrict__ in_y, - const IndexType* __restrict__ in_x, - float* __restrict__ dist_out, - int64_t total_elements, - int D, int H, int W -) { - int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= total_elements) return; - - int seed_d = (int)in_z[tid]; - if (seed_d == -1) { - dist_out[tid] = INF_VAL; - } else { - int seed_h = (int)in_y[tid]; - int seed_w = (int)in_x[tid]; - - int64_t spatial_size = (int64_t)D * H * W; - int64_t rem = tid % spatial_size; - int cur_w = (int)(rem % W); - int cur_h = (int)((rem / W) % H); - int cur_d = (int)(rem / (W * H)); - - dist_out[tid] = sqrtf(dist_sq_3d_soa(cur_d, cur_h, cur_w, seed_d, seed_h, 
seed_w)); - } -} - -// ================================================================== -// PART 3: SEPARABLE N-DIM KERNELS -// ================================================================== - -__device__ void run_separable_scan_core( - int N, - int tid, - const float* __restrict__ vals, - int* __restrict__ idx_curr, - int* __restrict__ idx_next -) { - for (int i = tid; i < N; i += blockDim.x) { - if (vals[i] >= INF_VAL * 0.9f) idx_curr[i] = -1; - else idx_curr[i] = i; - } - __syncthreads(); - - int* idx_in = idx_curr; - int* idx_out = idx_next; - - for (int step = 1; step < N; step *= 2) { - for (int i = tid; i < N; i += blockDim.x) { - int my_best_p = idx_in[i]; - float min_cost = INF_VAL; - - if (my_best_p != -1) min_cost = compute_cost(i, my_best_p, vals[my_best_p]); - - int left = i - step; - if (left >= 0) { - int left_p = idx_in[left]; - if (left_p != -1) { - float c = compute_cost(i, left_p, vals[left_p]); - if (c < min_cost) { min_cost = c; my_best_p = left_p; } - } - } - - int right = i + step; - if (right < N) { - int right_p = idx_in[right]; - if (right_p != -1) { - float c = compute_cost(i, right_p, vals[right_p]); - if (c < min_cost) { min_cost = c; my_best_p = right_p; } - } - } - idx_out[i] = my_best_p; - } - int* temp = idx_in; idx_in = idx_out; idx_out = temp; - __syncthreads(); - } - - if (idx_in != idx_curr) { - for (int i = tid; i < N; i += blockDim.x) idx_curr[i] = idx_next[i]; - __syncthreads(); - } -} - -template -__global__ void separable_kernel_shared( - const float* __restrict__ in_data, - const int32_t* __restrict__ in_indices, - float* __restrict__ out_dist, - int32_t* __restrict__ out_indices, - int64_t L, - int64_t total_elements, - int coord_ndim -) { - int64_t row_idx = blockIdx.x; - int64_t offset = row_idx * L; - if (offset >= total_elements) return; - - extern __shared__ char s_buffer[]; - float* s_vals = (float*)s_buffer; - int* s_idx1 = (int*)(s_vals + L); - int* s_idx2 = (int*)(s_idx1 + L); - - for (int i = threadIdx.x; 
i < L; i += blockDim.x) { - s_vals[i] = __ldg(&in_data[offset + i]); - } - __syncthreads(); - - run_separable_scan_core(L, threadIdx.x, s_vals, s_idx1, s_idx2); - - for (int q = threadIdx.x; q < L; q += blockDim.x) { - int p = s_idx1[q]; - float dist_val; - - if (p != -1) { - float dist_sq = sqr((float)q - (float)p) + s_vals[p]; - dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; - } else { - dist_val = IsFinal ? INF_VAL : INF_VAL; - p = 0; - } - out_dist[offset + q] = dist_val; - - int64_t dst_base = (offset + q) * coord_ndim; - if (p != -1 && s_vals[p] < INF_VAL) { - int64_t src_base = (offset + p) * coord_ndim; - for (int d = 0; d < coord_ndim; ++d) { - out_indices[dst_base + d] = in_indices[src_base + d]; - } - } else { - for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; - } - } -} - -template -__global__ void separable_kernel_global( - const float* __restrict__ in_data, - const int32_t* __restrict__ in_indices, - float* __restrict__ out_dist, - int32_t* __restrict__ out_indices, - int* __restrict__ global_buffer_1, - int* __restrict__ global_buffer_2, - int64_t L, - int64_t total_elements, - int coord_ndim -) { - int64_t row_idx = blockIdx.x; - int64_t offset = row_idx * L; - if (offset >= total_elements) return; - - int* g_idx1 = global_buffer_1 + offset; - int* g_idx2 = global_buffer_2 + offset; - - run_separable_scan_core(L, threadIdx.x, in_data + offset, g_idx1, g_idx2); - - for (int q = threadIdx.x; q < L; q += blockDim.x) { - int p = g_idx1[q]; - float dist_val; - if (p != -1) { - float val_p = in_data[offset + p]; - float dist_sq = sqr((float)q - (float)p) + val_p; - dist_val = IsFinal ? sqrtf(dist_sq) : dist_sq; - } else { - dist_val = IsFinal ? 
INF_VAL : INF_VAL; - p = 0; - } - out_dist[offset + q] = dist_val; - - int64_t dst_base = (offset + q) * coord_ndim; - if (p != -1 && in_data[offset + p] < INF_VAL) { - int64_t src_base = (offset + p) * coord_ndim; - for (int d = 0; d < coord_ndim; ++d) { - out_indices[dst_base + d] = in_indices[src_base + d]; - } - } else { - for (int d = 0; d < coord_ndim; ++d) out_indices[dst_base + d] = 0; - } - } -} - -__global__ void init_indices_separable_kernel( - int32_t* indices, - int64_t total_pixels, - int NDim, - const int64_t* __restrict__ shape_ptr -) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_pixels) return; - - int64_t temp = idx; - int32_t coords[8]; - for (int d = NDim - 1; d >= 0; --d) { - int64_t dim_size = shape_ptr[d]; - coords[d] = temp % dim_size; - temp /= dim_size; - } - int64_t out_ptr = idx * NDim; - for (int d = 0; d < NDim; ++d) indices[out_ptr + d] = coords[d]; -} - -// ================================================================== -// PART 4: DISPATCH HELPERS -// ================================================================== - -std::tuple run_jfa_2d( - torch::Tensor input, int64_t H, int64_t W, int grid, int block, int64_t numel -) { - auto index_opts = input.options().dtype(torch::kInt32); - auto idx_shape = input.sizes().vec(); - idx_shape.push_back(2); - auto curr_idx = torch::empty(idx_shape, index_opts); - auto next_idx = torch::empty(idx_shape, index_opts); - - int2* d_curr = (int2*)curr_idx.data_ptr(); - int2* d_next = (int2*)next_idx.data_ptr(); - - init_jfa_kernel_2d_opt<<>>( - input.data_ptr(), d_curr, numel, H, W - ); - - { - dim3 dimBlock(JFA_BLOCK_DIM, JFA_BLOCK_DIM); - int64_t batch_size = numel / (H * W); - dim3 dimGrid((W + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, - (H + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, - batch_size); - - jfa_block_fused_kernel_2d<<>>(d_curr, d_next, H, W, batch_size); - std::swap(d_curr, d_next); - std::swap(curr_idx, next_idx); - } - - int max_dim = std::max((int)H, (int)W); 
- int step = 16; - - while (step < max_dim) { - jfa_step_global_2d_opt<<>>(d_curr, d_next, step, H, W, numel); - std::swap(d_curr, d_next); - std::swap(curr_idx, next_idx); - step *= 2; - } - - auto final_dist = torch::empty_like(input); - calc_dist_kernel_2d_opt<<>>(d_curr, final_dist.data_ptr(), numel, H, W); - - return std::make_tuple(final_dist, curr_idx); -} - - -std::tuple run_jfa_3d( - torch::Tensor input, int64_t D, int64_t H, int64_t W, int grid, int block, int64_t numel -) { - bool use_int16 = (D < 32767 && H < 32767 && W < 32767); - auto index_opts = input.options().dtype(use_int16 ? torch::kInt16 : torch::kInt32); - - int64_t batch = numel / (D * H * W); - - // (3, Batch, D, H, W) - auto curr_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); - auto next_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); - - void* d_curr = curr_idx_soa.data_ptr(); - void* d_next = next_idx_soa.data_ptr(); - int64_t plane_stride = numel; // B*D*H*W - - // 1. Init - if (use_int16) { - int16_t* ptr = (int16_t*)d_curr; - init_jfa_kernel_3d_soa<<>>( - input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W - ); - } else { - int32_t* ptr = (int32_t*)d_curr; - init_jfa_kernel_3d_soa<<>>( - input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W - ); - } - - // 2. Fused Steps - int block_dim = 8; - int blocks_per_d = (D + block_dim - 1) / block_dim; - dim3 fused_block(block_dim, block_dim, block_dim); - dim3 fused_grid((W + block_dim - 1) / block_dim, (H + block_dim - 1) / block_dim, blocks_per_d * batch); - size_t smem_bytes = (14*14*14) * 3 * (use_int16 ? 
2 : 4); - - if (use_int16) { - int16_t* c = (int16_t*)d_curr; - int16_t* n = (int16_t*)d_next; - jfa_block_fused_kernel_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - n, n + plane_stride, n + 2 * plane_stride, - D, H, W, blocks_per_d - ); - } else { - int32_t* c = (int32_t*)d_curr; - int32_t* n = (int32_t*)d_next; - jfa_block_fused_kernel_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - n, n + plane_stride, n + 2 * plane_stride, - D, H, W, blocks_per_d - ); - } - std::swap(d_curr, d_next); - - // 3. Global Steps - int max_dim = std::max({(int)D, (int)H, (int)W}); - int step = 4; - while (step < max_dim) { - if (use_int16) { - int16_t* c = (int16_t*)d_curr; - int16_t* n = (int16_t*)d_next; - jfa_step_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - n, n + plane_stride, n + 2 * plane_stride, - step, D, H, W, numel - ); - } else { - int32_t* c = (int32_t*)d_curr; - int32_t* n = (int32_t*)d_next; - jfa_step_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - n, n + plane_stride, n + 2 * plane_stride, - step, D, H, W, numel - ); - } - std::swap(d_curr, d_next); - step *= 2; - } - - // 4. 
Final Dist - auto final_dist = torch::empty_like(input); - if (use_int16) { - int16_t* c = (int16_t*)d_curr; - calc_dist_kernel_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - final_dist.data_ptr(), numel, D, H, W - ); - } else { - int32_t* c = (int32_t*)d_curr; - calc_dist_kernel_3d_soa<<>>( - c, c + plane_stride, c + 2 * plane_stride, - final_dist.data_ptr(), numel, D, H, W - ); - } - - // Permute result indices back to (Batch, D, H, W, 3) - torch::Tensor result_indices; - if (d_curr == curr_idx_soa.data_ptr()) result_indices = curr_idx_soa; - else result_indices = next_idx_soa; - - result_indices = result_indices.permute({1, 2, 3, 4, 0}).contiguous(); - - return std::make_tuple(final_dist, result_indices); -} - -std::tuple run_separable_ndim(torch::Tensor input) { - TORCH_CHECK(input.scalar_type() == torch::kFloat32, "Separable N-Dim input must be float32."); - input = input.contiguous(); - - const int ndim = input.dim(); - const int sample_ndim = ndim - 1; - TORCH_CHECK(sample_ndim > 0 && sample_ndim <= 8, "Unsupported dims for Separable EDT"); - - auto shape = input.sizes().vec(); - int64_t num_pixels = input.numel(); - - auto current_dist = torch::where(input == 0, - torch::tensor(0.0f, input.options()), - torch::tensor(INF_VAL, input.options())); - - auto index_shape = shape; - index_shape.push_back(sample_ndim); - auto current_idx = torch::empty(index_shape, input.options().dtype(torch::kInt32)); - - { - std::vector spatial_shape(shape.begin() + 1, shape.end()); - auto shape_tensor = torch::tensor(spatial_shape, torch::kInt64).to(input.device()); - int threads = 256; - int blocks = (num_pixels + threads - 1) / threads; - init_indices_separable_kernel<<>>( - current_idx.data_ptr(), num_pixels, sample_ndim, shape_tensor.data_ptr() - ); - } - - torch::Tensor global_buf1, global_buf2; - - for (int d = 1; d < ndim; ++d) { - bool is_final_pass = (d == ndim - 1); - - auto dist_in = current_dist.transpose(d, ndim - 1).contiguous(); - auto idx_in = 
current_idx.transpose(d, ndim - 1).contiguous(); - - auto dist_out = torch::empty(dist_in.sizes(), dist_in.options()); - auto idx_out = torch::empty(idx_in.sizes(), idx_in.options()); - - int64_t L = dist_in.size(-1); - int64_t total_slices = dist_in.numel() / L; - int threads = std::min((int64_t)MAX_THREADS, L); - - if (L <= SMEM_LIMIT_ELEMENTS) { - size_t smem_size = L * (sizeof(float) + 2 * sizeof(int)); - if (is_final_pass) { - separable_kernel_shared<<>>( - dist_in.data_ptr(), idx_in.data_ptr(), - dist_out.data_ptr(), idx_out.data_ptr(), - L, dist_in.numel(), sample_ndim - ); - } else { - separable_kernel_shared<<>>( - dist_in.data_ptr(), idx_in.data_ptr(), - dist_out.data_ptr(), idx_out.data_ptr(), - L, dist_in.numel(), sample_ndim - ); - } - } else { - if (global_buf1.numel() < dist_in.numel()) { - global_buf1 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); - global_buf2 = torch::empty({dist_in.numel()}, torch::TensorOptions().dtype(torch::kInt32).device(input.device())); - } - if (is_final_pass) { - separable_kernel_global<<>>( - dist_in.data_ptr(), idx_in.data_ptr(), - dist_out.data_ptr(), idx_out.data_ptr(), - global_buf1.data_ptr(), global_buf2.data_ptr(), - L, dist_in.numel(), sample_ndim - ); - } else { - separable_kernel_global<<>>( - dist_in.data_ptr(), idx_in.data_ptr(), - dist_out.data_ptr(), idx_out.data_ptr(), - global_buf1.data_ptr(), global_buf2.data_ptr(), - L, dist_in.numel(), sample_ndim - ); - } - } - current_dist = dist_out.transpose(d, ndim - 1); - current_idx = idx_out.transpose(d, ndim - 1); - } - - return std::make_tuple(current_dist, current_idx); -} - -// ================================================================== -// PART 5: MAIN ENTRY POINT -// ================================================================== - -std::tuple distance_transform_cuda(torch::Tensor input) { - TORCH_CHECK(input.is_cuda(), "Input must be CUDA tensor"); - input = input.contiguous(); - - int64_t 
dims = input.dim(); - int64_t numel = input.numel(); - int block = BLOCK_SIZE; - int grid = (numel + block - 1) / block; - - if (dims >= 5) { - return run_separable_ndim(input); - } - else if (dims == 4) { - int64_t dim1 = input.size(1); - if (dim1 == 1) { - int64_t H = input.size(-2); - int64_t W = input.size(-1); - return run_jfa_2d(input, H, W, grid, block, numel); - } - else { - int64_t D = dim1; - int64_t H = input.size(-2); - int64_t W = input.size(-1); - return run_jfa_3d(input, D, H, W, grid, block, numel); - } - } - else if (dims == 3) { - int64_t H = input.size(-2); - int64_t W = input.size(-1); - return run_jfa_2d(input, H, W, grid, block, numel); - } - else if (dims == 2) { - int64_t H = 1; - int64_t W = input.size(-1); - auto result = run_jfa_2d(input, H, W, grid, block, numel); - torch::Tensor dist = std::get<0>(result); - torch::Tensor idx_2d = std::get<1>(result); - auto idx_1d = idx_2d.slice(/*dim=*/-1, /*start=*/1, /*end=*/2).contiguous(); - return std::make_tuple(dist, idx_1d); - } - else { - TORCH_CHECK(false, "Unsupported dimensions."); - return std::make_tuple(torch::Tensor(), torch::Tensor()); - } -} \ No newline at end of file diff --git a/torchmorph/csrc/torchmorph.cpp b/torchmorph/csrc/torchmorph.cpp index c79970c..15d8cd2 100644 --- a/torchmorph/csrc/torchmorph.cpp +++ b/torchmorph/csrc/torchmorph.cpp @@ -2,9 +2,37 @@ // Declare CUDA implementations torch::Tensor add_cuda(torch::Tensor input, float scalar); -std::tuple distance_transform_cuda(torch::Tensor input); + +// Distance Transform functions +std::tuple distance_transform_edt_cuda( + torch::Tensor input, + std::vector sampling, + bool return_distances, + bool return_indices +); + +std::tuple distance_transform_cdt_cuda( + torch::Tensor input, + const std::string& metric, + bool return_distances, + bool return_indices +); + + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("add_cuda", &add_cuda, "Add tensor with scalar"); - m.def("distance_transform_cuda", &distance_transform_cuda, 
"Distance transform"); -} + + // Distance Transform + m.def("distance_transform_edt_cuda", &distance_transform_edt_cuda, + "Exact Euclidean Distance Transform (Felzenszwalb algorithm)", + py::arg("input"), + py::arg("sampling"), + py::arg("return_distances") = true, + py::arg("return_indices") = false); + m.def("distance_transform_cdt_cuda", &distance_transform_cdt_cuda, + "Chamfer Distance Transform", + py::arg("input"), + py::arg("metric") = "chessboard", + py::arg("return_distances") = true, + py::arg("return_indices") = false); diff --git a/torchmorph/distance_transform.py b/torchmorph/distance_transform.py index 868e84a..3e3aa37 100644 --- a/torchmorph/distance_transform.py +++ b/torchmorph/distance_transform.py @@ -1,16 +1,272 @@ +from typing import Optional, Sequence, Tuple, Union + import torch from torchmorph import _C -def distance_transform(input: torch.Tensor) -> torch.Tensor: - """Distance Transform in CUDA.""" +def distance_transform_edt( + input: torch.Tensor, + sampling: Optional[Union[float, Sequence[float]]] = None, + return_distances: bool = True, + return_indices: bool = False, + distances: Optional[torch.Tensor] = None, + indices: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor], None]: + """Exact Euclidean Distance Transform (EDT) using Felzenszwalb algorithm. + + Args: + input: Binary input tensor (0 = background, non-zero = foreground). + Must be in (B, C, Spatial...) format where Spatial can be 1D, 2D, or 3D. + For single images, use unsqueeze to add batch and channel dims. + sampling: Spacing of elements along each spatial dimension. If a single + number, the spacing is uniform in all spatial dimensions. If a + sequence, it must match the number of spatial dimensions. + Default is None (unit spacing for all spatial dimensions). + return_distances: Whether to calculate the distance transform. + Default is True. 
+ return_indices: Whether to calculate the feature transform (indices + of closest background element). Default is False. + distances: Optional output tensor for distances. If provided, must have + the same shape as input. If None and return_distances is True, + a new tensor will be created and returned. + indices: Optional output tensor for indices. If provided, must have shape + (spatial_ndim, ...) where ... matches input shape. If None and + return_indices is True, a new tensor will be created and returned. + + Returns: + Depending on return_distances, return_indices, and whether output tensors + are provided: + - Returns distance tensor only when return_distances=True and distances=None + - Returns indices tensor only when return_indices=True and indices=None + - Returns tuple of (distances, indices) when both conditions above are met + - Returns None if output tensors are provided for all requested outputs + + Example: + >>> import torchmorph as tm + >>> # 2D image: (B, C, H, W) + >>> x = torch.zeros(1, 1, 64, 64, device='cuda') + >>> x[0, 0, 10:20, 10:20] = 1 + >>> dist = tm.distance_transform_edt(x) + >>> dist, indices = tm.distance_transform_edt(x, return_indices=True) + >>> dist = tm.distance_transform_edt(x, sampling=[0.5, 1.0]) + >>> # Using pre-allocated output tensors + >>> dist_out = torch.empty_like(x) + >>> tm.distance_transform_edt(x, distances=dist_out) # Returns None, fills dist_out + >>> # 3D volume: (B, C, D, H, W) + >>> x_3d = torch.zeros(2, 1, 32, 64, 64, device='cuda') + >>> dist_3d = tm.distance_transform_edt(x_3d, sampling=[2.0, 1.0, 1.0]) + """ + if not input.is_cuda: + raise ValueError("Input tensor must be on CUDA device.") + if input.ndim < 3: + raise ValueError( + f"Input must be (B, C, Spatial) format with at least 3 dimensions, got {input.shape}. " + "For single images, use unsqueeze to add batch and channel dims." 
+ ) + if input.numel() == 0: + raise ValueError(f"Invalid input: empty tensor with shape {input.shape}.") + + # Validate pre-allocated output tensors + if distances is not None: + if distances.shape != input.shape: + raise ValueError( + f"distances shape {distances.shape} must match input shape {input.shape}" + ) + if not distances.is_cuda: + raise ValueError("distances tensor must be on CUDA device.") + return_distances = True + + if indices is not None: + if not indices.is_cuda: + raise ValueError("indices tensor must be on CUDA device.") + return_indices = True + + if not return_distances and not return_indices: + raise ValueError( + "At least one of return_distances or return_indices must be True, " + "or output tensors must be provided." + ) + + input = input.float().contiguous() + total_ndim = input.ndim + spatial_ndim = total_ndim - 2 # Exclude B and C dimensions + + # Process sampling parameter for spatial dimensions only + if sampling is None: + # Unit spacing for all spatial dimensions + sampling_list = [1.0] * spatial_ndim + elif isinstance(sampling, (int, float)): + # Single value: same spacing for all spatial dimensions + sampling_list = [float(sampling)] * spatial_ndim + else: + # Sequence: convert to list + sampling_list = [float(s) for s in sampling] + if len(sampling_list) == 1: + # Single element list: broadcast to all spatial dimensions + sampling_list = sampling_list * spatial_ndim + elif len(sampling_list) != spatial_ndim: + raise ValueError( + f"sampling has {len(sampling_list)} but input has {spatial_ndim} spatial dims" + f"(input shape: {input.shape}, format: (B, C, Spatial...))" + ) + + # Call CUDA kernel - it handles batch dimensions based on sampling size + raw_distances, raw_indices = _C.distance_transform_edt_cuda( + input, sampling_list, return_distances, return_indices + ) + + # Copy to pre-allocated tensors if provided + if distances is not None and raw_distances is not None: + distances.copy_(raw_distances) + + if indices is not 
None and raw_indices is not None: + indices.copy_(raw_indices) + + # Return based on scipy convention: + # Only return tensors that were NOT provided by the user + return_dist_tensor = return_distances and distances is None + return_idx_tensor = return_indices and indices is None + + if return_dist_tensor and return_idx_tensor: + return raw_distances, raw_indices + elif return_dist_tensor: + return raw_distances + elif return_idx_tensor: + return raw_indices + else: + return None + + +def distance_transform_cdt( + input: torch.Tensor, + metric: str = "chessboard", + return_distances: bool = True, + return_indices: bool = False, + distances: Optional[torch.Tensor] = None, + indices: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor], None]: + """Chamfer Distance Transform (CDT). + + Calculates the distance transform of the input using a chamfer metric. + The input is treated as a binary image where non-zero values are foreground + and zero values are background. Distances are computed from each foreground + pixel to the nearest background pixel. + + Args: + input: Binary input tensor (0 = background, non-zero = foreground). + Must be in (B, C, H, W) or (B, C, D, H, W) format for batch processing, + or (H, W) / (D, H, W) for single images. + metric: Distance metric to use: + - "chessboard": L-infinity norm (default). Also known as Chebyshev distance. + - "taxicab": L1 norm. Also known as Manhattan or city-block distance. + - "cityblock": Alias for "taxicab". + - "manhattan": Alias for "taxicab". + return_distances: Whether to calculate the distance transform. Default is True. + return_indices: Whether to calculate the feature transform (indices of closest + background element). Default is False. + distances: Optional output tensor for distances. If provided, must have + the same shape as input. If None and return_distances is True, + a new tensor will be created. + indices: Optional output tensor for indices. 
If provided, must have shape + (..., ndim) where ... matches input shape. If None and return_indices + is True, a new tensor will be created. + + Returns: + Depending on return_distances, return_indices, and whether output tensors + are provided: + - Returns distance tensor only when return_distances=True and distances=None + - Returns indices tensor only when return_indices=True and indices=None + - Returns tuple of (distances, indices) when both conditions above are met + - Returns None if output tensors are provided for all requested outputs + + Example: + >>> import torchmorph as tm + >>> # 2D image with batch: (B, C, H, W) + >>> x = torch.zeros(1, 1, 64, 64, device='cuda') + >>> x[0, 0, 10:20, 10:20] = 1 + >>> dist = tm.distance_transform_cdt(x) # chessboard by default + >>> dist = tm.distance_transform_cdt(x, metric='taxicab') + >>> dist, indices = tm.distance_transform_cdt(x, return_indices=True) + >>> # Using pre-allocated output tensors + >>> dist_out = torch.empty_like(x) + >>> tm.distance_transform_cdt(x, distances=dist_out) # Returns None, fills dist_out + """ if not input.is_cuda: raise ValueError("Input tensor must be on CUDA device.") if input.ndim < 2 or input.numel() == 0: raise ValueError(f"Invalid input dimension: {input.shape}.") - # binarize input - input[input != 0] = 1 + # Normalize metric aliases + if metric in ("cityblock", "manhattan"): + metric = "taxicab" + + if metric not in ("chessboard", "taxicab"): + raise ValueError("metric must be 'chessboard', 'taxicab', 'cityblock', or 'manhattan'.") + if not return_distances and not return_indices: + if distances is None and indices is None: + raise ValueError( + "At least one of return_distances or return_indices must be True, " + "or output tensors must be provided." 
+ ) + + input = input.float().contiguous() + + # Validate pre-allocated output tensors + if distances is not None: + if distances.shape != input.shape: + raise ValueError( + f"distances shape {distances.shape} must match input shape {input.shape}" + ) + if not distances.is_cuda: + raise ValueError("distances tensor must be on CUDA device.") + return_distances = True + + if indices is not None: + if not indices.is_cuda: + raise ValueError("indices tensor must be on CUDA device.") + return_indices = True + + # Call CUDA kernel + raw_distances, raw_indices = _C.distance_transform_cdt_cuda( + input, metric, return_distances, return_indices + ) + + # Copy to pre-allocated tensors if provided + if distances is not None and raw_distances is not None: + distances.copy_(raw_distances) + + if indices is not None and raw_indices is not None: + indices.copy_(raw_indices) + + # Return based on scipy convention: + # Only return tensors that were NOT provided by the user + return_dist_tensor = return_distances and distances is None + return_idx_tensor = return_indices and indices is None + + if return_dist_tensor and return_idx_tensor: + return raw_distances, raw_indices + elif return_dist_tensor: + return raw_distances + elif return_idx_tensor: + return raw_indices + else: + return None + + +# Backward compatibility alias +def distance_transform( + input: torch.Tensor, + sampling: Optional[Union[float, Sequence[float]]] = None, + return_distances: bool = True, + return_indices: bool = False, + distances: Optional[torch.Tensor] = None, + indices: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor], None]: + """Distance Transform (alias for distance_transform_edt). - return _C.distance_transform_cuda(input) + See distance_transform_edt for full documentation. 
+ """ + return distance_transform_edt( + input, sampling, return_distances, return_indices, distances, indices + ) From 6a16d6bcac828ac7e00c96c41b71ed5da8e6144d Mon Sep 17 00:00:00 2001 From: Yu Han Deng Date: Mon, 2 Feb 2026 22:25:57 +0800 Subject: [PATCH 55/56] add edt 3D benchmark --- benchmark/distance_transform_edt_3D.py | 112 +++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 benchmark/distance_transform_edt_3D.py diff --git a/benchmark/distance_transform_edt_3D.py b/benchmark/distance_transform_edt_3D.py new file mode 100644 index 0000000..49a5c31 --- /dev/null +++ b/benchmark/distance_transform_edt_3D.py @@ -0,0 +1,112 @@ +import scipy.ndimage as ndi # noqa: F401 +import torch +import torch.utils.benchmark as benchmark +from prettytable import PrettyTable + +import torchmorph as tm # noqa: F401 + +# 3D benchmark configurations +sizes = [32, 64, 128, 256] # D=H=W +batches = [1, 2, 4, 8] +dtype = torch.float32 +device = "cuda" +MIN_RUN = 1.0 # seconds per measurement + +torch.set_num_threads(torch.get_num_threads()) + +for B in batches: + table = PrettyTable() + table.field_names = [ + "Size (D×H×W)", + "SciPy (ms/vol)", + "Exact 1× (ms/vol)", + "Exact batch (ms/vol)", + "JFA 1× (ms/vol)", + "JFA batch (ms/vol)", + "Speedup Exact", + "Speedup JFA", + ] + for c in table.field_names: + table.align[c] = "r" + + for s in sizes: + # Skip large sizes with large batches to avoid OOM + if s >= 256 and B >= 4: + table.add_row([f"{s}³", "OOM", "OOM", "OOM", "OOM", "OOM", "-", "-"]) + continue + + # Inputs: (B, D, H, W) format for 3D - no channel dimension for JFA 3D + x = (torch.randn(B, s, s, s, device=device) > 0).to(dtype) + # For scipy, we need (D, H, W) arrays + x_np_list = [x[i].detach().cpu().numpy() for i in range(B)] + # For torch single volume processing: each is (1, D, H, W) + x_vols = [x[i : i + 1] for i in range(B)] + + # SciPy (CPU, one-by-one) + stmt_scipy = "out = [ndi.distance_transform_edt(arr) for arr in x_np_list]" + 
t_scipy = benchmark.Timer( + stmt=stmt_scipy, + setup="from __main__ import x_np_list, ndi", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + scipy_per_vol_ms = (t_scipy.median * 1e3) / B + + # Torch Exact (CUDA, one-by-one) + stmt_exact1 = """ +for xi in x_vols: + tm.distance_transform_edt(xi, algorithm="exact") +""" + t_exact1 = benchmark.Timer( + stmt=stmt_exact1, + setup="from __main__ import x_vols, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + exact1_per_vol_ms = (t_exact1.median * 1e3) / B + + # Torch Exact (CUDA, batched) + t_exact_batch = benchmark.Timer( + stmt='tm.distance_transform_edt(x, algorithm="exact")', + setup="from __main__ import x, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + exactB_per_vol_ms = (t_exact_batch.median * 1e3) / B + + # Torch JFA (CUDA, one-by-one) + stmt_jfa1 = """ +for xi in x_vols: + tm.distance_transform_edt(xi, algorithm="jfa") +""" + t_jfa1 = benchmark.Timer( + stmt=stmt_jfa1, + setup="from __main__ import x_vols, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + jfa1_per_vol_ms = (t_jfa1.median * 1e3) / B + + # Torch JFA (CUDA, batched) + t_jfa_batch = benchmark.Timer( + stmt='tm.distance_transform_edt(x, algorithm="jfa")', + setup="from __main__ import x, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + jfaB_per_vol_ms = (t_jfa_batch.median * 1e3) / B + + # Speedups (batch mode vs scipy) + speed_exact = scipy_per_vol_ms / exactB_per_vol_ms + speed_jfa = scipy_per_vol_ms / jfaB_per_vol_ms + + table.add_row( + [ + f"{s}³", + f"{scipy_per_vol_ms:.3f}", + f"{exact1_per_vol_ms:.3f}", + f"{exactB_per_vol_ms:.3f}", + f"{jfa1_per_vol_ms:.3f}", + f"{jfaB_per_vol_ms:.3f}", + f"{speed_exact:.1f}×", + f"{speed_jfa:.1f}×", + ] + ) + + print(f"\n=== 3D EDT Benchmark | Batch Size: {B} ===") + print(table) From 
2d67351d0f7a85cd9d72faba8c0eab90be7b6d22 Mon Sep 17 00:00:00 2001 From: Yu Han Deng Date: Mon, 2 Feb 2026 22:55:05 +0800 Subject: [PATCH 56/56] add algorithm parameter for edt --- benchmark/distance_transform_edt.py | 66 +- test/test_distance_transform_edt.py | 114 +++- torchmorph/__init__.py | 11 +- torchmorph/csrc/distance_transform_edt.cu | 720 +++++++++++++++++++++- torchmorph/csrc/torchmorph.cpp | 8 +- torchmorph/distance_transform.py | 16 +- 6 files changed, 890 insertions(+), 45 deletions(-) diff --git a/benchmark/distance_transform_edt.py b/benchmark/distance_transform_edt.py index 3bb388d..f1f7318 100644 --- a/benchmark/distance_transform_edt.py +++ b/benchmark/distance_transform_edt.py @@ -18,10 +18,12 @@ table.field_names = [ "Size", "SciPy (ms/img)", - "Torch 1× (ms/img)", - "Torch batch (ms/img)", - "Speedup 1×", - "Speedup batch", + "Exact 1× (ms/img)", + "Exact batch (ms/img)", + "JFA 1× (ms/img)", + "JFA batch (ms/img)", + "Speedup Exact", + "Speedup JFA", ] for c in table.field_names: table.align[c] = "r" @@ -43,38 +45,60 @@ ).blocked_autorange(min_run_time=MIN_RUN) scipy_per_img_ms = (t_scipy.median * 1e3) / B - # Torch (CUDA, one-by-one) - stmt_torch1 = """ + # Torch Exact (CUDA, one-by-one) + stmt_exact1 = """ for xi in x_imgs: - tm.distance_transform_edt(xi) + tm.distance_transform_edt(xi, algorithm="exact") """ - t_torch1 = benchmark.Timer( - stmt=stmt_torch1, + t_exact1 = benchmark.Timer( + stmt=stmt_exact1, setup="from __main__ import x_imgs, tm", num_threads=torch.get_num_threads(), ).blocked_autorange(min_run_time=MIN_RUN) - torch1_per_img_ms = (t_torch1.median * 1e3) / B + exact1_per_img_ms = (t_exact1.median * 1e3) / B - # Torch (CUDA, batched) - t_batch = benchmark.Timer( - stmt="tm.distance_transform_edt(x)", + # Torch Exact (CUDA, batched) + t_exact_batch = benchmark.Timer( + stmt='tm.distance_transform_edt(x, algorithm="exact")', setup="from __main__ import x, tm", num_threads=torch.get_num_threads(), 
).blocked_autorange(min_run_time=MIN_RUN) - torchB_per_img_ms = (t_batch.median * 1e3) / B + exactB_per_img_ms = (t_exact_batch.median * 1e3) / B - # Speedups - speed1 = scipy_per_img_ms / torch1_per_img_ms - speedB = scipy_per_img_ms / torchB_per_img_ms + # Torch JFA (CUDA, one-by-one) + stmt_jfa1 = """ +for xi in x_imgs: + tm.distance_transform_edt(xi, algorithm="jfa") +""" + t_jfa1 = benchmark.Timer( + stmt=stmt_jfa1, + setup="from __main__ import x_imgs, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + jfa1_per_img_ms = (t_jfa1.median * 1e3) / B + + # Torch JFA (CUDA, batched) + t_jfa_batch = benchmark.Timer( + stmt='tm.distance_transform_edt(x, algorithm="jfa")', + setup="from __main__ import x, tm", + num_threads=torch.get_num_threads(), + ).blocked_autorange(min_run_time=MIN_RUN) + jfaB_per_img_ms = (t_jfa_batch.median * 1e3) / B + + # Speedups (batch mode vs scipy) + speed_exact = scipy_per_img_ms / exactB_per_img_ms + speed_jfa = scipy_per_img_ms / jfaB_per_img_ms table.add_row( [ s, f"{scipy_per_img_ms:.3f}", - f"{torch1_per_img_ms:.3f}", - f"{torchB_per_img_ms:.3f}", - f"{speed1:.1f}×", - f"{speedB:.1f}×", + f"{exact1_per_img_ms:.3f}", + f"{exactB_per_img_ms:.3f}", + f"{jfa1_per_img_ms:.3f}", + f"{jfaB_per_img_ms:.3f}", + f"{speed_exact:.1f}×", + f"{speed_jfa:.1f}×", ] ) diff --git a/test/test_distance_transform_edt.py b/test/test_distance_transform_edt.py index cace4c8..20b993a 100644 --- a/test/test_distance_transform_edt.py +++ b/test/test_distance_transform_edt.py @@ -160,7 +160,7 @@ def test_distance_transform_and_indices( print(f"CUDA result:\n{dist_cuda.cpu().numpy()}") print(f"SciPy reference:\n{dist_ref.cpu().numpy()}") - torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-5, rtol=1e-5) print(">> Distance validation passed.") # 6. 
Validate indices @@ -190,8 +190,8 @@ def test_distance_transform_and_indices( torch.testing.assert_close( dist_sq_calculated, dist_sq_output, - atol=1e-3, - rtol=1e-3, + atol=1e-5, + rtol=1e-5, ) print(">> Index validation passed.") @@ -303,7 +303,7 @@ def test_distance_transform_with_sampling( assert ( dist_cuda.shape == dist_ref.shape ), f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" - torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-5, rtol=1e-5) print(">> Distance validation with sampling passed.") # Validate indices shape @@ -335,7 +335,7 @@ def test_distance_transform_with_sampling( dist_sq_calculated = torch.sum(diff * diff, dim=0) dist_sq_output = dist_cuda * dist_cuda - torch.testing.assert_close(dist_sq_calculated, dist_sq_output, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(dist_sq_calculated, dist_sq_output, atol=1e-5, rtol=1e-5) print(">> Index validation with sampling passed.") @@ -392,5 +392,107 @@ def test_single_float_sampling() -> None: dist_ref_numpy, _ = batch_scipy_edt_with_sampling(x_numpy, spatial_ndim, [0.5, 0.5]) dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() - torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(dist_cuda, dist_ref, atol=1e-5, rtol=1e-5) print(">> Single float sampling test passed.") + + +# ====================================================================== +# Test algorithm parameter (JFA vs Exact) +# ====================================================================== +@pytest.mark.parametrize( + "input_numpy, spatial_ndim, algorithm", + [ + # 2D tests with different algorithms + pytest.param(case_2d, 2, "exact", id="2D_exact"), + pytest.param(case_2d, 2, "jfa", id="2D_jfa"), + pytest.param(case_2d, 2, "auto", id="2D_auto"), + pytest.param(case_2d_single, 2, "exact", id="2D_single_exact"), + pytest.param(case_2d_single, 2, "jfa", id="2D_single_jfa"), + 
pytest.param(case_2d_single, 2, "auto", id="2D_single_auto"), + # 3D tests with different algorithms + pytest.param(case_3d, 3, "exact", id="3D_exact"), + pytest.param(case_3d, 3, "jfa", id="3D_jfa"), + pytest.param(case_3d, 3, "auto", id="3D_auto"), + ], +) +def test_distance_transform_algorithm( + input_numpy: np.ndarray, + spatial_ndim: int, + algorithm: str, + request: pytest.FixtureRequest, +) -> None: + """Test EDT with different algorithm options (exact, jfa, auto).""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + x_numpy_contiguous = np.ascontiguousarray(input_numpy) + x_cuda = torch.from_numpy(x_numpy_contiguous).cuda() + + print(f"\n\n--- Running test: {request.node.callspec.id} ---") + print(f"CUDA input shape: {x_cuda.shape}, spatial_ndim: {spatial_ndim}, algorithm: {algorithm}") + + # Run CUDA EDT with specified algorithm + dist_cuda = tm.distance_transform_edt(x_cuda.clone(), algorithm=algorithm) + + # Run SciPy (ground truth) + dist_ref_numpy, _ = batch_scipy_edt_with_indices(x_numpy_contiguous, spatial_ndim) + dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() + + # Validate distances + print(f"CUDA distance shape: {dist_cuda.shape}, reference shape: {dist_ref.shape}") + assert ( + dist_cuda.shape == dist_ref.shape + ), f"Shape mismatch: {dist_cuda.shape} vs {dist_ref.shape}" + + torch.testing.assert_close(dist_cuda, dist_ref, rtol=1e-5, atol=1e-5) + + print(f">> Algorithm '{algorithm}' validation passed.") + + +def test_algorithm_fallback_with_sampling() -> None: + """Test that JFA falls back to exact when sampling is provided.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + x_numpy = case_2d_single + x_cuda = torch.from_numpy(x_numpy).cuda() + + # With non-unit sampling, JFA should fall back to exact algorithm + # Both should give same result + dist_jfa = tm.distance_transform_edt(x_cuda.clone(), sampling=[0.5, 1.0], algorithm="jfa") + dist_exact = 
tm.distance_transform_edt(x_cuda.clone(), sampling=[0.5, 1.0], algorithm="exact") + + # Compare with scipy + spatial_ndim = 2 + dist_ref_numpy, _ = batch_scipy_edt_with_sampling(x_numpy, spatial_ndim, [0.5, 1.0]) + dist_ref = torch.from_numpy(dist_ref_numpy).to(torch.float32).cuda() + + torch.testing.assert_close(dist_jfa, dist_ref, atol=1e-5, rtol=1e-5) + torch.testing.assert_close(dist_exact, dist_ref, atol=1e-5, rtol=1e-5) + + print(">> Algorithm fallback with sampling test passed.") + + +def test_jfa_vs_exact_consistency() -> None: + """Test that JFA and exact produce similar results for unit sampling.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + # Create a larger random test case + torch.manual_seed(42) + x = (torch.randn(2, 1, 64, 64, device="cuda") > 0).float() + + dist_exact = tm.distance_transform_edt(x, algorithm="exact") + dist_jfa = tm.distance_transform_edt(x, algorithm="jfa") + + # JFA should be very close to exact for most pixels + # Allow for small differences due to JFA's approximate nature + diff = torch.abs(dist_exact - dist_jfa) + max_diff = diff.max().item() + mean_diff = diff.mean().item() + + print(f"JFA vs Exact - Max diff: {max_diff:.6f}, Mean diff: {mean_diff:.6f}") + + # Most pixels should be exact or very close + assert mean_diff < 0.1, f"Mean difference too large: {mean_diff}" + print(">> JFA vs Exact consistency test passed.") diff --git a/torchmorph/__init__.py b/torchmorph/__init__.py index ed972e1..b853b19 100644 --- a/torchmorph/__init__.py +++ b/torchmorph/__init__.py @@ -1,18 +1,9 @@ from .add import add -from .binary_morphology import binary_fill_holes, binary_propagation -from .distance_transform import ( - distance_transform, - distance_transform_bf, - distance_transform_cdt, - distance_transform_edt, -) +from .distance_transform import distance_transform, distance_transform_cdt, distance_transform_edt __all__ = [ "add", "distance_transform", "distance_transform_edt", - 
"distance_transform_bf", "distance_transform_cdt", - "binary_fill_holes", - "binary_propagation", ] diff --git a/torchmorph/csrc/distance_transform_edt.cu b/torchmorph/csrc/distance_transform_edt.cu index 225a976..d021c62 100644 --- a/torchmorph/csrc/distance_transform_edt.cu +++ b/torchmorph/csrc/distance_transform_edt.cu @@ -4,6 +4,10 @@ #include #include #include +#include +#include +#include +#include // ============================================================================== // Configuration @@ -12,6 +16,463 @@ #define MAX_THREADS 256 #define SHARED_MEM_LIMIT 2048 // Max dimension size for shared memory path (48KB limit) +// JFA Configuration +#define BLOCK_SIZE 256 +#define SMEM_LIMIT_ELEMENTS 4096 +#define JFA_BLOCK_DIM 32 +#define JFA_FUSED_STEPS 4 +#define JFA_MAX_OFFSET 8 +#define JFA_SMEM_DIM (JFA_BLOCK_DIM + 2 * JFA_MAX_OFFSET) +#define JFA_3D_BLOCK 8 +#define JFA_3D_HALO 1 + +// ============================================================================== +// JFA Device Helpers +// ============================================================================== +__device__ __forceinline__ float sqr(float x) { return x * x; } + +__device__ __forceinline__ float dist_sq_2d(int y1, int x1, int y2, int x2) { + return sqr((float)(y1 - y2)) + sqr((float)(x1 - x2)); +} + +__device__ __forceinline__ float dist_sq_3d_soa(int z1, int y1, int x1, int z2, int y2, int x2) { + if (z2 == -1) return INF_VAL; + float dz = (float)(z1 - z2); + float dy = (float)(y1 - y2); + float dx = (float)(x1 - x2); + return dz*dz + dy*dy + dx*dx; +} + +__device__ __forceinline__ float compute_cost(int q, int p, float val_p) { + if (p < 0 || val_p >= INF_VAL) return INF_VAL; + return sqr((float)q - (float)p) + val_p; +} + +__device__ __forceinline__ float dist_sq_int2(int y, int x, int2 seed) { + if (seed.x == -1) return INF_VAL; + float dy = (float)(y - seed.x); + float dx = (float)(x - seed.y); + return dy*dy + dx*dx; +} + +// 
============================================================================== +// JFA 2D Kernels (Vectorized int2 + Block Shared) +// ============================================================================== +__global__ void init_jfa_kernel_2d_opt( + const float* __restrict__ input, + int2* __restrict__ output, + int64_t total_elements, + int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + if (input[tid] == 0.0f) { + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int w = (int)(rem % W); + int h = (int)(rem / W); + output[tid] = make_int2(h, w); + } else { + output[tid] = make_int2(-1, -1); + } +} + +__global__ void jfa_block_fused_kernel_2d( + const int2* __restrict__ in_idx, + int2* __restrict__ out_idx, + int H, int W, + int64_t num_images +) { + __shared__ int2 smem[JFA_SMEM_DIM][JFA_SMEM_DIM]; + + int tx = threadIdx.x; + int ty = threadIdx.y; + + int bx = blockIdx.x * blockDim.x; + int by = blockIdx.y * blockDim.y; + int img_idx = blockIdx.z; + int64_t batch_offset = (int64_t)img_idx * (H * W); + + int gx = bx + tx; + int gy = by + ty; + + // Phase 1: load data to Shared Memory + int smem_linear_size = JFA_SMEM_DIM * JFA_SMEM_DIM; + int total_threads = blockDim.x * blockDim.y; + int thread_linear_idx = ty * blockDim.x + tx; + + int base_x = bx - JFA_MAX_OFFSET; + int base_y = by - JFA_MAX_OFFSET; + + for (int i = thread_linear_idx; i < smem_linear_size; i += total_threads) { + int s_y = i / JFA_SMEM_DIM; + int s_x = i % JFA_SMEM_DIM; + int global_y = base_y + s_y; + int global_x = base_x + s_x; + int2 val = make_int2(-1, -1); + if (global_y >= 0 && global_y < H && global_x >= 0 && global_x < W) { + val = in_idx[batch_offset + global_y * W + global_x]; + } + smem[s_y][s_x] = val; + } + __syncthreads(); + + // Phase 2: Iterate in Shared Memory + if (gx < W && gy < H) { + int center_sy = ty + JFA_MAX_OFFSET; + int center_sx = tx + JFA_MAX_OFFSET; + + int2 best_seed = 
smem[center_sy][center_sx]; + float best_dist = dist_sq_int2(gy, gx, best_seed); + + int step = 1; + #pragma unroll + for (int k = 0; k < JFA_FUSED_STEPS; ++k) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dy == 0 && dx == 0) continue; + int2 neighbor_seed = smem[center_sy + dy * step][center_sx + dx * step]; + if (neighbor_seed.x != -1) { + float d = dist_sq_int2(gy, gx, neighbor_seed); + if (d < best_dist) { + best_dist = d; + best_seed = neighbor_seed; + } + } + } + } + __syncthreads(); + smem[center_sy][center_sx] = best_seed; + __syncthreads(); + step *= 2; + } + out_idx[batch_offset + gy * W + gx] = best_seed; + } +} + +__global__ void jfa_step_global_2d_opt( + const int2* __restrict__ in_idx, + int2* __restrict__ out_idx, + int step, + int H, int W, + int64_t total_pixels +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_pixels) return; + + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int64_t batch_offset = tid - rem; + int w = (int)(rem % W); + int h = (int)(rem / W); + + int2 best_seed = in_idx[tid]; + float best_dist = dist_sq_int2(h, w, best_seed); + + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dx == 0 && dy == 0) continue; + + int ny = h + dy * step; + int nx = w + dx * step; + + if (ny >= 0 && ny < H && nx >= 0 && nx < W) { + int2 neighbor_seed = in_idx[batch_offset + ny * W + nx]; + if (neighbor_seed.x != -1) { + float d = dist_sq_int2(h, w, neighbor_seed); + if (d < best_dist) { + best_dist = d; + best_seed = neighbor_seed; + } + } + } + } + } + out_idx[tid] = best_seed; +} + +__global__ void calc_dist_kernel_2d_opt( + const int2* __restrict__ indices, + float* __restrict__ dist_out, + int64_t total_elements, + int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + int2 s = indices[tid]; + if 
(s.x == -1) { + dist_out[tid] = INF_VAL; + } else { + int64_t spatial_size = (int64_t)H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)(rem / W); + dist_out[tid] = sqrtf(dist_sq_int2(cur_h, cur_w, s)); + } +} + +// ============================================================================== +// JFA 3D Kernels (Optimized SoA Layout) +// ============================================================================== +template +__global__ void init_jfa_kernel_3d_soa( + const float* __restrict__ input, + IndexType* __restrict__ indices_z, + IndexType* __restrict__ indices_y, + IndexType* __restrict__ indices_x, + int64_t total_elements, + int D, int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + if (input[tid] == 0.0f) { + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int w = (int)(rem % W); + int h = (int)((rem / W) % H); + int d = (int)(rem / (W * H)); + + indices_z[tid] = (IndexType)d; + indices_y[tid] = (IndexType)h; + indices_x[tid] = (IndexType)w; + } else { + indices_z[tid] = (IndexType)-1; + indices_y[tid] = (IndexType)-1; + indices_x[tid] = (IndexType)-1; + } +} + +template +__global__ void jfa_block_fused_kernel_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + IndexType* __restrict__ out_z, + IndexType* __restrict__ out_y, + IndexType* __restrict__ out_x, + int D, int H, int W, + int blocks_per_d +) { + const int BLOCK_DIM = 8; + const int HALO = 3; + const int SMEM_DIM = BLOCK_DIM + 2 * HALO; // 14 + const int SMEM_SIZE = SMEM_DIM * SMEM_DIM * SMEM_DIM; + + extern __shared__ char smem_raw[]; + IndexType* smem_z = (IndexType*)smem_raw; + IndexType* smem_y = smem_z + SMEM_SIZE; + IndexType* smem_x = smem_y + SMEM_SIZE; + + int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z; + + int b_z_total = blockIdx.z; + int batch_id = b_z_total / 
blocks_per_d; + int b_z_local = b_z_total % blocks_per_d; + + int bx = blockIdx.x * BLOCK_DIM; + int by = blockIdx.y * BLOCK_DIM; + int bz = b_z_local * BLOCK_DIM; + + int64_t spatial_offset = (int64_t)batch_id * (D * H * W); + + // Phase 1: Load to SoA Shared Memory + int tid = tz * 64 + ty * 8 + tx; + int base_x = bx - HALO; + int base_y = by - HALO; + int base_z = bz - HALO; + + for (int i = tid; i < SMEM_SIZE; i += 512) { + int temp = i; + int sx = temp % SMEM_DIM; temp /= SMEM_DIM; + int sy = temp % SMEM_DIM; + int sz = temp / SMEM_DIM; + + int gx = base_x + sx; + int gy = base_y + sy; + int gz = base_z + sz; + + IndexType val_z = -1, val_y = -1, val_x = -1; + if (gz >= 0 && gz < D && gy >= 0 && gy < H && gx >= 0 && gx < W) { + int64_t idx = spatial_offset + (int64_t)gz * (H * W) + gy * W + gx; + val_z = in_z[idx]; + val_y = in_y[idx]; + val_x = in_x[idx]; + } + smem_z[i] = val_z; + smem_y[i] = val_y; + smem_x[i] = val_x; + } + __syncthreads(); + + // Phase 2: Compute + int center_sz = tz + HALO; + int center_sy = ty + HALO; + int center_sx = tx + HALO; + int my_s_idx = (center_sz * SMEM_DIM + center_sy) * SMEM_DIM + center_sx; + + int best_z = (int)smem_z[my_s_idx]; + int best_y = (int)smem_y[my_s_idx]; + int best_x = (int)smem_x[my_s_idx]; + + int g_cz = bz + tz; + int g_cy = by + ty; + int g_cx = bx + tx; + + float best_dist = dist_sq_3d_soa(g_cz, g_cy, g_cx, best_z, best_y, best_x); + + int step = 1; + #pragma unroll + for (int k = 0; k < 2; ++k) { + #pragma unroll + for (int dz = -1; dz <= 1; ++dz) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dz == 0 && dy == 0 && dx == 0) continue; + + int nz = center_sz + dz * step; + int ny = center_sy + dy * step; + int nx = center_sx + dx * step; + int n_idx = (nz * SMEM_DIM + ny) * SMEM_DIM + nx; + + int sz_in = (int)smem_z[n_idx]; + if (sz_in != -1) { + int sy_in = (int)smem_y[n_idx]; + int sx_in = (int)smem_x[n_idx]; + float d = 
dist_sq_3d_soa(g_cz, g_cy, g_cx, sz_in, sy_in, sx_in); + if (d < best_dist) { + best_dist = d; + best_z = sz_in; + best_y = sy_in; + best_x = sx_in; + } + } + } + } + } + __syncthreads(); + smem_z[my_s_idx] = (IndexType)best_z; + smem_y[my_s_idx] = (IndexType)best_y; + smem_x[my_s_idx] = (IndexType)best_x; + __syncthreads(); + step *= 2; + } + + if (g_cz < D && g_cy < H && g_cx < W) { + int64_t out_idx_g = spatial_offset + (int64_t)g_cz * (H * W) + g_cy * W + g_cx; + out_z[out_idx_g] = (IndexType)best_z; + out_y[out_idx_g] = (IndexType)best_y; + out_x[out_idx_g] = (IndexType)best_x; + } +} + +template +__global__ void jfa_step_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + IndexType* __restrict__ out_z, + IndexType* __restrict__ out_y, + IndexType* __restrict__ out_x, + int step, + int D, int H, int W, + int64_t total_pixels +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_pixels) return; + + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int64_t batch_offset = tid - rem; + int cur_w = (int)(rem % W); + int cur_h = (int)((rem / W) % H); + int cur_d = (int)(rem / (W * H)); + + int best_z = (int)in_z[tid]; + int best_y = (int)in_y[tid]; + int best_x = (int)in_x[tid]; + + float best_dist = dist_sq_3d_soa(cur_d, cur_h, cur_w, best_z, best_y, best_x); + + #pragma unroll + for (int dz = -1; dz <= 1; ++dz) { + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) { + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) { + if (dz == 0 && dy == 0 && dx == 0) continue; + + int nz = cur_d + dz * step; + int ny = cur_h + dy * step; + int nx = cur_w + dx * step; + + if (nz >= 0 && nz < D && ny >= 0 && ny < H && nx >= 0 && nx < W) { + int64_t n_idx = batch_offset + (int64_t)nz * (H * W) + ny * W + nx; + + int seed_z = (int)in_z[n_idx]; + if (seed_z != -1) { + float dz_val = (float)(cur_d - seed_z); + float dz_sq = dz_val * dz_val; + + if (dz_sq < 
best_dist) { + int seed_y = (int)in_y[n_idx]; + int seed_x = (int)in_x[n_idx]; + float dist = dz_sq + sqr((float)(cur_h - seed_y)) + sqr((float)(cur_w - seed_x)); + + if (dist < best_dist) { + best_dist = dist; + best_z = seed_z; + best_y = seed_y; + best_x = seed_x; + } + } + } + } + } + } + } + out_z[tid] = (IndexType)best_z; + out_y[tid] = (IndexType)best_y; + out_x[tid] = (IndexType)best_x; +} + +template +__global__ void calc_dist_kernel_3d_soa( + const IndexType* __restrict__ in_z, + const IndexType* __restrict__ in_y, + const IndexType* __restrict__ in_x, + float* __restrict__ dist_out, + int64_t total_elements, + int D, int H, int W +) { + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= total_elements) return; + + int seed_d = (int)in_z[tid]; + if (seed_d == -1) { + dist_out[tid] = INF_VAL; + } else { + int seed_h = (int)in_y[tid]; + int seed_w = (int)in_x[tid]; + + int64_t spatial_size = (int64_t)D * H * W; + int64_t rem = tid % spatial_size; + int cur_w = (int)(rem % W); + int cur_h = (int)((rem / W) % H); + int cur_d = (int)(rem / (W * H)); + + dist_out[tid] = sqrtf(dist_sq_3d_soa(cur_d, cur_h, cur_w, seed_d, seed_h, seed_w)); + } +} + // ============================================================================== // 2D Optimized: Initialization kernel // ============================================================================== @@ -666,7 +1127,7 @@ __global__ void init_distance_kernel( // Initialize indices to current coordinates if (compute_indices) { int64_t temp = idx; - int coords[8]; + int coords[16]; // Support up to 16D // Compute coordinates from linear index for (int d = total_ndim - 1; d >= 0; d--) { @@ -824,14 +1285,221 @@ std::tuple run_edt_separable( return std::make_tuple(distance.contiguous(), return_indices ? 
indices.contiguous() : torch::Tensor()); } +// ============================================================================== +// JFA Dispatch Helpers +// ============================================================================== +std::tuple run_jfa_2d( + torch::Tensor input, int64_t H, int64_t W, int grid, int block, int64_t numel +) { + auto index_opts = input.options().dtype(torch::kInt32); + auto idx_shape = input.sizes().vec(); + idx_shape.push_back(2); + auto curr_idx = torch::empty(idx_shape, index_opts); + auto next_idx = torch::empty(idx_shape, index_opts); + + int2* d_curr = (int2*)curr_idx.data_ptr(); + int2* d_next = (int2*)next_idx.data_ptr(); + + init_jfa_kernel_2d_opt<<>>( + input.data_ptr(), d_curr, numel, H, W + ); + + { + dim3 dimBlock(JFA_BLOCK_DIM, JFA_BLOCK_DIM); + int64_t batch_size = numel / (H * W); + dim3 dimGrid((W + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, + (H + JFA_BLOCK_DIM - 1) / JFA_BLOCK_DIM, + batch_size); + + jfa_block_fused_kernel_2d<<>>(d_curr, d_next, H, W, batch_size); + std::swap(d_curr, d_next); + std::swap(curr_idx, next_idx); + } + + int max_dim = std::max((int)H, (int)W); + int step = 16; + + while (step < max_dim) { + jfa_step_global_2d_opt<<>>(d_curr, d_next, step, H, W, numel); + std::swap(d_curr, d_next); + std::swap(curr_idx, next_idx); + step *= 2; + } + + auto final_dist = torch::empty_like(input); + calc_dist_kernel_2d_opt<<>>(d_curr, final_dist.data_ptr(), numel, H, W); + + return std::make_tuple(final_dist, curr_idx); +} + +std::tuple run_jfa_3d( + torch::Tensor input, int64_t D, int64_t H, int64_t W, int grid, int block, int64_t numel +) { + bool use_int16 = (D < 32767 && H < 32767 && W < 32767); + auto index_opts = input.options().dtype(use_int16 ? 
torch::kInt16 : torch::kInt32); + + int64_t batch = numel / (D * H * W); + + // (3, Batch, D, H, W) + auto curr_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); + auto next_idx_soa = torch::empty({3, batch, D, H, W}, index_opts); + + void* d_curr = curr_idx_soa.data_ptr(); + void* d_next = next_idx_soa.data_ptr(); + int64_t plane_stride = numel; // B*D*H*W + + // 1. Init + if (use_int16) { + int16_t* ptr = (int16_t*)d_curr; + init_jfa_kernel_3d_soa<<>>( + input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W + ); + } else { + int32_t* ptr = (int32_t*)d_curr; + init_jfa_kernel_3d_soa<<>>( + input.data_ptr(), ptr, ptr + plane_stride, ptr + 2 * plane_stride, numel, D, H, W + ); + } + + // 2. Fused Steps + int block_dim = 8; + int blocks_per_d = (D + block_dim - 1) / block_dim; + dim3 fused_block(block_dim, block_dim, block_dim); + dim3 fused_grid((W + block_dim - 1) / block_dim, (H + block_dim - 1) / block_dim, blocks_per_d * batch); + size_t smem_bytes = (14*14*14) * 3 * (use_int16 ? 2 : 4); + + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + int16_t* n = (int16_t*)d_next; + jfa_block_fused_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + D, H, W, blocks_per_d + ); + } else { + int32_t* c = (int32_t*)d_curr; + int32_t* n = (int32_t*)d_next; + jfa_block_fused_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + D, H, W, blocks_per_d + ); + } + std::swap(d_curr, d_next); + + // 3. 
Global Steps + int max_dim = std::max({(int)D, (int)H, (int)W}); + int step = 4; + while (step < max_dim) { + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + int16_t* n = (int16_t*)d_next; + jfa_step_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + step, D, H, W, numel + ); + } else { + int32_t* c = (int32_t*)d_curr; + int32_t* n = (int32_t*)d_next; + jfa_step_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + n, n + plane_stride, n + 2 * plane_stride, + step, D, H, W, numel + ); + } + std::swap(d_curr, d_next); + step *= 2; + } + + // 4. Final Dist + auto final_dist = torch::empty_like(input); + if (use_int16) { + int16_t* c = (int16_t*)d_curr; + calc_dist_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + final_dist.data_ptr(), numel, D, H, W + ); + } else { + int32_t* c = (int32_t*)d_curr; + calc_dist_kernel_3d_soa<<>>( + c, c + plane_stride, c + 2 * plane_stride, + final_dist.data_ptr(), numel, D, H, W + ); + } + + // Permute result indices back to (Batch, D, H, W, 3) + torch::Tensor result_indices; + if (d_curr == curr_idx_soa.data_ptr()) result_indices = curr_idx_soa; + else result_indices = next_idx_soa; + + result_indices = result_indices.permute({1, 2, 3, 4, 0}).contiguous(); + + return std::make_tuple(final_dist, result_indices); +} + +// ============================================================================== +// JFA Main Entry Point +// ============================================================================== +std::tuple distance_transform_cuda(torch::Tensor input) { + TORCH_CHECK(input.is_cuda(), "Input must be CUDA tensor"); + input = input.contiguous(); + + int64_t dims = input.dim(); + int64_t numel = input.numel(); + int block = BLOCK_SIZE; + int grid = (numel + block - 1) / block; + + if (dims >= 5) { + // For 4D+ spatial, fall back to separable algorithm + int spatial_ndim = dims - 1; + std::vector sampling(spatial_ndim, 1.0f); + return 
run_edt_separable(input, sampling, true); + } + else if (dims == 4) { + int64_t dim1 = input.size(1); + if (dim1 == 1) { + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_2d(input, H, W, grid, block, numel); + } + else { + int64_t D = dim1; + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_3d(input, D, H, W, grid, block, numel); + } + } + else if (dims == 3) { + int64_t H = input.size(-2); + int64_t W = input.size(-1); + return run_jfa_2d(input, H, W, grid, block, numel); + } + else if (dims == 2) { + int64_t H = 1; + int64_t W = input.size(-1); + auto result = run_jfa_2d(input, H, W, grid, block, numel); + torch::Tensor dist = std::get<0>(result); + torch::Tensor idx_2d = std::get<1>(result); + auto idx_1d = idx_2d.slice(/*dim=*/-1, /*start=*/1, /*end=*/2).contiguous(); + return std::make_tuple(dist, idx_1d); + } + else { + TORCH_CHECK(false, "Unsupported dimensions."); + return std::make_tuple(torch::Tensor(), torch::Tensor()); + } +} + // ============================================================================== // Python binding entry point // ============================================================================== + std::tuple distance_transform_edt_cuda( torch::Tensor input, std::vector sampling, bool return_distances, - bool return_indices + bool return_indices, + const std::string& algorithm ) { TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); @@ -847,6 +1515,54 @@ std::tuple distance_transform_edt_cuda( int spatial_ndim = sampling.size(); + // Check if we can use JFA algorithm + bool can_use_jfa = true; + + // JFA doesn't support non-unit sampling + for (float s : sampling) { + if (std::abs(s - 1.0f) > 1e-6f) { + can_use_jfa = false; + break; + } + } + + // JFA only supports 2D and 3D (spatial dimensions) + if (spatial_ndim > 3) { + can_use_jfa = false; + } + + // Determine which algorithm to use + bool use_jfa = false; + if (algorithm == "jfa") { + if (can_use_jfa) { + use_jfa = 
true; + } else { + // Fall back to exact with warning (or we can throw) + // For now, silently fall back to exact + use_jfa = false; + } + } else if (algorithm == "exact") { + use_jfa = false; + } else if (algorithm == "auto") { + // Auto mode: use JFA only for 2D with unit sampling + // For 3D, exact algorithm performs better in practice + use_jfa = can_use_jfa && (spatial_ndim == 2); + } else { + TORCH_CHECK(false, "algorithm must be 'exact', 'jfa', or 'auto', got: ", algorithm); + } + + if (use_jfa) { + // Use JFA algorithm + auto [distances, indices_result] = distance_transform_cuda(input); + + if (!return_indices) { + indices_result = torch::Tensor(); + } + + return std::make_tuple(distances, indices_result); + } + + // Use exact (Felzenszwalb) algorithm // Use 2D optimized path only when both dimensions fit in shared memory // For larger dimensions, the N-D general version with transpose is faster if (spatial_ndim == 2) { diff --git a/torchmorph/csrc/torchmorph.cpp b/torchmorph/csrc/torchmorph.cpp index 15d8cd2..58cf6ed 100644 --- a/torchmorph/csrc/torchmorph.cpp +++ b/torchmorph/csrc/torchmorph.cpp @@ -8,7 +8,8 @@ std::tuple distance_transform_edt_cuda( torch::Tensor input, std::vector sampling, bool return_distances, - bool return_indices + bool return_indices, + const std::string& algorithm ); std::tuple distance_transform_cdt_cuda( @@ -19,7 +20,6 @@ std::tuple distance_transform_cdt_cuda( ); - PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("add_cuda", &add_cuda, "Add tensor with scalar"); @@ -29,10 +29,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::arg("input"), py::arg("sampling"), py::arg("return_distances") = true, - py::arg("return_indices") = false); + py::arg("return_indices") = false, + py::arg("algorithm") = "exact"); m.def("distance_transform_cdt_cuda", &distance_transform_cdt_cuda, "Chamfer Distance Transform", py::arg("input"), py::arg("metric") = "chessboard", py::arg("return_distances") = true, py::arg("return_indices") = false); +} 
diff --git a/torchmorph/distance_transform.py b/torchmorph/distance_transform.py index 3e3aa37..c3356e2 100644 --- a/torchmorph/distance_transform.py +++ b/torchmorph/distance_transform.py @@ -12,6 +12,7 @@ def distance_transform_edt( return_indices: bool = False, distances: Optional[torch.Tensor] = None, indices: Optional[torch.Tensor] = None, + algorithm: str = "exact", ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor], None]: """Exact Euclidean Distance Transform (EDT) using Felzenszwalb algorithm. @@ -23,6 +24,7 @@ def distance_transform_edt( number, the spacing is uniform in all spatial dimensions. If a sequence, it must match the number of spatial dimensions. Default is None (unit spacing for all spatial dimensions). + Note: When sampling is not unit spacing, only "exact" algorithm is used. return_distances: Whether to calculate the distance transform. Default is True. return_indices: Whether to calculate the feature transform (indices @@ -33,6 +35,12 @@ def distance_transform_edt( indices: Optional output tensor for indices. If provided, must have shape (spatial_ndim, ...) where ... matches input shape. If None and return_indices is True, a new tensor will be created and returned. + algorithm: Algorithm to use for distance transform. Options: + - "exact": Use Felzenszwalb's exact algorithm (default). + - "jfa": Use Jump Flooding Algorithm (fast but approximate). + Only available for 2D/3D with unit sampling. + - "auto": Automatically choose based on input (uses JFA when + applicable, otherwise exact). 
    Returns:
        Depending on return_distances, return_indices, and whether output tensors
@@ -50,6 +58,8 @@ def distance_transform_edt(
         >>> dist = tm.distance_transform_edt(x)
         >>> dist, indices = tm.distance_transform_edt(x, return_indices=True)
         >>> dist = tm.distance_transform_edt(x, sampling=[0.5, 1.0])
+        >>> # Using JFA algorithm (faster for large images)
+        >>> dist = tm.distance_transform_edt(x, algorithm="jfa")
         >>> # Using pre-allocated output tensors
         >>> dist_out = torch.empty_like(x)
         >>> tm.distance_transform_edt(x, distances=dist_out)  # Returns None, fills dist_out
@@ -61,7 +71,7 @@
         raise ValueError("Input tensor must be on CUDA device.")
     if input.ndim < 3:
         raise ValueError(
-            f"Input must be (B, C, Spatial) format with at least 3 dimensions, got {input.shape}. "
+            f"Input must be (B, C, Spatial...) format with at least 3 dimensions, got {input.shape}. "
             "For single images, use unsqueeze to add batch and channel dims."
         )
     if input.numel() == 0:
@@ -107,13 +117,13 @@ def distance_transform_edt(
             sampling_list = sampling_list * spatial_ndim
         elif len(sampling_list) != spatial_ndim:
             raise ValueError(
-                f"sampling has {len(sampling_list)} but input has {spatial_ndim} spatial dims"
+                f"sampling has {len(sampling_list)} elements but input has {spatial_ndim} spatial dimensions "
                 f"(input shape: {input.shape}, format: (B, C, Spatial...))"
             )

     # Call CUDA kernel - it handles batch dimensions based on sampling size
     raw_distances, raw_indices = _C.distance_transform_edt_cuda(
-        input, sampling_list, return_distances, return_indices
+        input, sampling_list, return_distances, return_indices, algorithm
     )

     # Copy to pre-allocated tensors if provided