diff --git a/projects/PTv3/Dockerfile b/projects/PTv3/Dockerfile index 9a7e0e3ff..9de653ade 100644 --- a/projects/PTv3/Dockerfile +++ b/projects/PTv3/Dockerfile @@ -4,15 +4,9 @@ FROM ${AWML_BASE_IMAGE} ENV FLASH_ATTN_CUDA_ARCHS="120" ENV PYTHONPATH=/workspace/projects:/workspace/projects/PTv3 -RUN python3 -m pip --no-cache-dir install \ - addict \ - open3d \ - flash-attn --no-build-isolation \ - regex \ - sharedarray \ - spconv-cu120 \ - tensorboardx \ - termcolor \ - torch-scatter -f https://data.pyg.org/whl/torch-2.8.0+cu129.html \ - tqdm \ - yapf==0.40.1 +RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ + && . /root/.local/bin/env \ + && uv pip install --system --python /opt/conda/bin/python \ + --upgrade pip setuptools wheel \ + && uv pip install --system --python /opt/conda/bin/python \ + --no-build-isolation flash-attn /workspace/projects/PTv3 diff --git a/projects/PTv3/configs/semseg-litept-small-v1m1-nuscenes.py b/projects/PTv3/configs/semseg-litept-small-v1m1-nuscenes.py new file mode 100644 index 000000000..3b346fd7d --- /dev/null +++ b/projects/PTv3/configs/semseg-litept-small-v1m1-nuscenes.py @@ -0,0 +1,245 @@ +_base_ = ["./_base_/default_runtime.py"] + +save_path = "work_dirs/litept" + +# misc custom setting +batch_size = 12 # bs: total bs in all gpus +num_worker = 24 +mix_prob = 0.8 +empty_cache = False +enable_amp = True + +# dataset settings +dataset_type = "NuScenesDataset" +data_root = "data/nuscenes" +ignore_index = -1 +info_paths_train = ["info/nuscenes_infos_10sweeps_train.pkl"] +info_paths_val = ["info/nuscenes_infos_10sweeps_val.pkl"] +info_paths_test = ["info/nuscenes_infos_10sweeps_test.pkl"] +class_names = [ + "barrier", + "bicycle", + "bus", + "car", + "construction_vehicle", + "motorcycle", + "pedestrian", + "traffic_cone", + "trailer", + "truck", + "driveable_surface", + "other_flat", + "sidewalk", + "terrain", + "manmade", + "vegetation", +] +class_mapping = { + 0: 0, + 1: 1, + 2: 2, + 3: 3, + 4: 4, + 5: 5, + 6: 6, + 7: 7, + 8: 8, + 9: 9, + 10: 10, + 11: 11, + 12: 12, + 13: 13, + 14: 14, + 15: 15, +} +num_classes = 16 + +# model settings +model = dict( + type="DefaultSegmentorV2", + num_classes=num_classes, + backbone_out_channels=72, + backbone=dict( + type="LitePT", + in_channels=4, + order=["z", "z-trans", "hilbert", "hilbert-trans"], + stride=(2, 2, 2, 2), + enc_depths=(2, 2, 2, 6, 2), + enc_channels=(36, 72, 144, 252, 504), + enc_num_head=(2, 4, 8, 14, 28), + enc_patch_size=(1024, 1024, 1024, 1024, 1024), + enc_conv=(True, True, True, False, False), + enc_attn=(False, False, False, True, True), + enc_rope_freq=(100.0, 100.0, 100.0, 100.0, 100.0), + dec_depths=(0, 0, 0, 0), + dec_channels=(72, 72, 144, 252), + dec_num_head=(4, 4, 8, 14), + dec_patch_size=(1024, 1024, 1024, 1024), + dec_conv=(False, False, False, False), + dec_attn=(False, False, False, False), + dec_rope_freq=(100.0, 100.0, 100.0, 100.0), + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + drop_path=0.3, + shuffle_orders=True, + pre_norm=True, + enc_mode=False, + ), + criteria=[ + dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=ignore_index), + dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=ignore_index), + ], +) + +# scheduler settings +epoch = 50 +eval_epoch = 50 +optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005) +scheduler = dict( + type="OneCycleLR", + max_lr=[0.002, 0.0002], + pct_start=0.04, + anneal_strategy="cos", + div_factor=10.0, + final_div_factor=100.0, +) +param_dicts = 
[dict(keyword="block", lr=0.0002)] + +# dataset settings +data = dict( + num_classes=num_classes, + ignore_index=ignore_index, + train=dict( + type=dataset_type, + split="train", + data_root=data_root, + info_paths=info_paths_train, + transform=[ + # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2), + # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75), + dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5), + # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5), + # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5), + dict(type="RandomScale", scale=[0.9, 1.1]), + # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]), + dict(type="RandomFlip", p=0.5), + dict(type="RandomJitter", sigma=0.005, clip=0.02), + # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]), + dict( + type="GridSample", + grid_size=0.05, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + ), + # dict(type="SphereCrop", point_max=1000000, mode="random"), + # dict(type="CenterShift", apply_z=False), + dict(type="ToTensor"), + dict(type="Update", keys_dict={"grid_size": 0.05}), + dict( + type="Collect", + keys=("coord", "grid_coord", "segment", "grid_size"), + feat_keys=("coord", "strength"), + ), + ], + test_mode=False, + ignore_index=ignore_index, + class_mapping=class_mapping, + ), + val=dict( + type=dataset_type, + split="val", + data_root=data_root, + info_paths=info_paths_val, + transform=[ + dict(type="Copy", keys_dict={"segment": "origin_segment"}), + # dict(type="PointClip", point_cloud_range=(-51.2, -51.2, -4, 51.2, 51.2, 2.4)), + dict( + type="GridSample", + grid_size=0.05, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + return_inverse=True, + ), + # dict(type="SphereCrop", point_max=1000000, mode='center'), + dict(type="ToTensor"), + dict( + type="Collect", + keys=("coord", "grid_coord", "segment", "origin_segment", "inverse"), + feat_keys=("coord", "strength"), + ), + ], + test_mode=False, + ignore_index=ignore_index, + class_mapping=class_mapping, + ), + test=dict( + type=dataset_type, + split="val", + data_root=data_root, + info_paths=info_paths_test, + transform=[ + dict(type="Copy", keys_dict={"segment": "origin_segment"}), + dict( + type="GridSample", + grid_size=0.025, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_inverse=True, + ), + ], + test_mode=True, + test_cfg=dict( + voxelize=dict( + type="GridSample", + grid_size=0.05, + hash_type="fnv", + mode="test", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + ), + crop=None, + post_transform=[ + dict(type="ToTensor"), + dict( + type="Collect", + keys=("coord", "grid_coord", "index"), + feat_keys=("coord", "strength"), + ), + ], + aug_transform=[ + [dict(type="RandomScale", scale=[0.9, 0.9])], + [dict(type="RandomScale", scale=[0.95, 0.95])], + [dict(type="RandomScale", scale=[1, 1])], + [dict(type="RandomScale", scale=[1.05, 1.05])], + [dict(type="RandomScale", scale=[1.1, 1.1])], + [ + dict(type="RandomScale", scale=[0.9, 0.9]), + dict(type="RandomFlip", p=1), + ], + [ + dict(type="RandomScale", scale=[0.95, 0.95]), + dict(type="RandomFlip", p=1), + ], + [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)], + [ + dict(type="RandomScale", scale=[1.05, 1.05]), + dict(type="RandomFlip", p=1), + ], + [ + dict(type="RandomScale", scale=[1.1, 
1.1]), + dict(type="RandomFlip", p=1), + ], + ], + ), + ignore_index=ignore_index, + class_mapping=class_mapping, + ), +) diff --git a/projects/PTv3/configs/semseg-litept-small-v1m1-t4dataset.py b/projects/PTv3/configs/semseg-litept-small-v1m1-t4dataset.py new file mode 100644 index 000000000..9d84c9089 --- /dev/null +++ b/projects/PTv3/configs/semseg-litept-small-v1m1-t4dataset.py @@ -0,0 +1,252 @@ +_base_ = [ + "../../../autoware_ml/configs/detection3d/default_runtime.py", + "./_base_/default_runtime.py", + "../../../autoware_ml/configs/segmentation3d/dataset/t4dataset/j6gen2_base.py", +] + +save_path = "work_dirs/litept" + +# misc custom setting +batch_size = 12 # bs: total bs in all gpus +num_worker = 24 +mix_prob = 0.8 +empty_cache = False +enable_amp = True + +# (min_x, min_y, min_z, max_x, max_y, max_z) +point_cloud_range = [-102.4, -102.4, -2.8, 102.4, 102.4, 10.0] +grid_size = 0.1 # original is 0.05 + +# dataset settings +dataset_type = "T4Dataset" +data_root = "data/t4dataset" +ignore_index = -1 +info_paths_train = ["info/lidarseg/t4dataset_j6gen2_lidarseg_infos_train.pkl"] +info_paths_val = ["info/lidarseg/t4dataset_j6gen2_lidarseg_infos_val.pkl"] +info_paths_test = ["info/lidarseg/t4dataset_j6gen2_lidarseg_infos_test.pkl"] +class_mapping = { + "drivable_surface": 0, + "other_flat_surface": 1, + "sidewalk": 2, + "manmade": 3, + "vegetation": 4, + "car": 5, + "bus": 6, + "emergency_vehicle": 7, + "train": 8, + "truck": 9, + "tractor_unit": 10, + "semi_trailer": 11, + "construction_vehicle": 12, + "forklift": 13, + "kart": 14, + "motorcycle": 15, + "bicycle": 16, + "pedestrian": 17, + "personal_mobility": 18, + "animal": 19, + "pushable_pullable": 20, + "traffic_cone": 21, + "stroller": 22, + "debris": 23, + "other_stuff": 24, + "noise": 25, + "ghost_point": 25, + "out_of_sync": ignore_index, + "unpainted": ignore_index, +} +num_classes = 26 + +# model settings +model = dict( + type="DefaultSegmentorV2", + num_classes=num_classes, + backbone_out_channels=72, + backbone=dict( + type="LitePT", + in_channels=4, + order=["z", "z-trans", "hilbert", "hilbert-trans"], + stride=(2, 2, 2, 2), + enc_depths=(2, 2, 2, 6, 2), + enc_channels=(36, 72, 144, 252, 504), + enc_num_head=(2, 4, 8, 14, 28), + enc_patch_size=(1024, 1024, 1024, 1024, 1024), + enc_conv=(True, True, True, False, False), + enc_attn=(False, False, False, True, True), + enc_rope_freq=(100.0, 100.0, 100.0, 100.0, 100.0), + dec_depths=(0, 0, 0, 0), + dec_channels=(72, 72, 144, 252), + dec_num_head=(4, 4, 8, 14), + dec_patch_size=(1024, 1024, 1024, 1024), + dec_conv=(False, False, False, False), + dec_attn=(False, False, False, False), + dec_rope_freq=(100.0, 100.0, 100.0, 100.0), + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + drop_path=0.3, + shuffle_orders=True, + pre_norm=True, + enc_mode=False, + ), + criteria=[ + dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=ignore_index), + dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=ignore_index), + ], +) + +# scheduler settings +epoch = 50 +eval_epoch = 50 +optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005) +scheduler = dict( + type="OneCycleLR", + max_lr=[0.002, 0.0002], + pct_start=0.04, + anneal_strategy="cos", + div_factor=10.0, + final_div_factor=100.0, +) +param_dicts = [dict(keyword="block", lr=0.0002)] + +# dataset settings +data = dict( + num_classes=num_classes, + ignore_index=ignore_index, + train=dict( + type=dataset_type, + split="train", + data_root=data_root, + 
info_paths=info_paths_train, + transform=[ + # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2), + # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75), + dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5), + # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5), + # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5), + dict(type="RandomScale", scale=[0.9, 1.1]), + dict( + type="PointClip", + point_cloud_range=point_cloud_range, + ), + # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]), + dict(type="RandomFlip", p=0.5), + dict(type="RandomJitter", sigma=0.005, clip=0.02), + # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]), + dict( + type="GridSample", + grid_size=grid_size, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + ), + dict(type="SphereCrop", point_max=128000, mode="random"), + # dict(type="CenterShift", apply_z=False), + dict(type="ToTensor"), + dict(type="Update", keys_dict={"grid_size": grid_size}), + dict( + type="Collect", + keys=("coord", "grid_coord", "segment", "grid_size"), + feat_keys=("coord", "strength"), + ), + ], + test_mode=False, + ignore_index=ignore_index, + class_mapping=class_mapping, + ), + val=dict( + type=dataset_type, + split="val", + data_root=data_root, + info_paths=info_paths_val, + transform=[ + dict(type="Copy", keys_dict={"segment": "origin_segment"}), + dict(type="PointClip", point_cloud_range=point_cloud_range), + dict( + type="GridSample", + grid_size=grid_size, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + return_inverse=True, + ), + # dict(type="SphereCrop", point_max=1000000, mode='center'), + dict(type="ToTensor"), + dict( + type="Collect", + keys=("coord", "grid_coord", "segment", "origin_segment", "inverse"), + feat_keys=("coord", "strength"), + ), + ], + test_mode=False, + ignore_index=ignore_index, + class_mapping=class_mapping, + ), + test=dict( + type=dataset_type, + split="val", + data_root=data_root, + info_paths=info_paths_test, + transform=[ + dict(type="Copy", keys_dict={"segment": "origin_segment"}), + dict( + type="GridSample", + grid_size=grid_size, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_inverse=True, + ), + ], + test_mode=True, + test_cfg=dict( + voxelize=dict( + type="GridSample", + grid_size=grid_size, + hash_type="fnv", + mode="test", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + ), + crop=None, + post_transform=[ + dict(type="ToTensor"), + dict( + type="Collect", + keys=("coord", "grid_coord", "index"), + feat_keys=("coord", "strength"), + ), + ], + aug_transform=[ + [dict(type="RandomScale", scale=[0.9, 0.9])], + [dict(type="RandomScale", scale=[0.95, 0.95])], + [dict(type="RandomScale", scale=[1, 1])], + [dict(type="RandomScale", scale=[1.05, 1.05])], + [dict(type="RandomScale", scale=[1.1, 1.1])], + [ + dict(type="RandomScale", scale=[0.9, 0.9]), + dict(type="RandomFlip", p=1), + ], + [ + dict(type="RandomScale", scale=[0.95, 0.95]), + dict(type="RandomFlip", p=1), + ], + [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)], + [ + dict(type="RandomScale", scale=[1.05, 1.05]), + dict(type="RandomFlip", p=1), + ], + [ + dict(type="RandomScale", scale=[1.1, 1.1]), + dict(type="RandomFlip", p=1), + ], + ], + ), + ignore_index=ignore_index, + class_mapping=class_mapping, + ), +) diff --git 
a/projects/PTv3/datasets/transform.py b/projects/PTv3/datasets/transform.py index fb911adc9..9de0aae73 100644 --- a/projects/PTv3/datasets/transform.py +++ b/projects/PTv3/datasets/transform.py @@ -64,6 +64,19 @@ def __call__(self, data_dict): return data_dict +@TRANSFORMS.register_module() +class Update(object): + def __init__(self, keys_dict=None): + if keys_dict is None: + keys_dict = dict() + self.keys_dict = keys_dict + + def __call__(self, data_dict): + for key, value in self.keys_dict.items(): + data_dict[key] = value + return data_dict + + @TRANSFORMS.register_module() class ToTensor(object): def __call__(self, data): diff --git a/projects/PTv3/libs/pointgroup_ops/functions/__init__.py b/projects/PTv3/libs/pointgroup_ops/functions/__init__.py new file mode 100644 index 000000000..a9e748288 --- /dev/null +++ b/projects/PTv3/libs/pointgroup_ops/functions/__init__.py @@ -0,0 +1 @@ +from .functions import Clustering, ballquery_batch_p, bfs_cluster diff --git a/projects/PTv3/libs/pointgroup_ops/functions/functions.py b/projects/PTv3/libs/pointgroup_ops/functions/functions.py new file mode 100644 index 000000000..8236abaf5 --- /dev/null +++ b/projects/PTv3/libs/pointgroup_ops/functions/functions.py @@ -0,0 +1,168 @@ +import pointgroup_ops_cuda +import torch +from torch.autograd import Function + + +class BallQueryBatchP(Function): + @staticmethod + def forward(ctx, coords, batch_idxs, batch_offsets, radius, meanActive): + """ + :param ctx: + :param coords: (n, 3) float + :param batch_idxs: (n) int + :param batch_offsets: (B+1) int + :param radius: float + :param meanActive: int + :return: idx (nActive), int + :return: start_len (n, 2), int + """ + + n = coords.size(0) + + assert coords.is_contiguous() and coords.is_cuda + assert batch_idxs.is_contiguous() and batch_idxs.is_cuda + assert batch_offsets.is_contiguous() and batch_offsets.is_cuda + + while True: + idx = torch.cuda.IntTensor(n * meanActive).zero_() + start_len = torch.cuda.IntTensor(n, 2).zero_() + nActive = pointgroup_ops_cuda.ballquery_batch_p( + coords, batch_idxs, batch_offsets, idx, start_len, n, meanActive, radius + ) + if nActive <= n * meanActive: + break + meanActive = int(nActive // n + 1) + idx = idx[:nActive] + + return idx, start_len + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None, None + + +ballquery_batch_p = BallQueryBatchP.apply + + +class Clustering: + def __init__( + self, + ignored_labels, + class_mapping, + thresh=0.03, + closed_points=300, + min_points=50, + propose_points=100, + score_func=torch.max, + ) -> None: + self.ignored_labels = ignored_labels + self.thresh = thresh + self.closed_points = closed_points + self.min_points = min_points + self.class_mapping = class_mapping + self.propose_points = propose_points + self.score_func = score_func + + def cluster(self, vertices, scores): + labels = torch.max(scores, 1)[1] # (N) long, cuda + proposals_idx, proposals_offset = self.cluster_(vertices, labels) + + ## debug + # import ipdb; ipdb.set_trace() + # colors = np.array(create_color_palette())[labels.cpu()] + # write_triangle_mesh(vertices, colors, None, 'semantics.ply') + + # scatter + proposals_pred = torch.zeros( + (proposals_offset.shape[0] - 1, vertices.shape[0]), dtype=torch.int + ) # (nProposal, N), int, cuda + proposals_pred[proposals_idx[:, 0].long(), proposals_idx[:, 1].long()] = 1 + labels = labels[proposals_idx[:, 1][proposals_offset[:-1].long()].long()] + + proposals_pointnum = proposals_pred.sum(1) + npoint_mask = proposals_pointnum > self.propose_points + + 
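+        # keep only proposals with more than propose_points member points; smaller fragments and their labels are dropped together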
proposals_pred = proposals_pred[npoint_mask] + labels = labels[npoint_mask] + return proposals_pred, labels + + def cluster_(self, vertices, labels): + """ + :param batch_idxs: (N), int, cuda + :labels: 0-19 + """ + batch_idxs = torch.zeros_like(labels) + + mask_non_ignored = torch.ones_like(labels).bool() + for ignored_label in self.ignored_labels: + mask_non_ignored = mask_non_ignored & (self.class_mapping[labels] != ignored_label) + object_idxs = mask_non_ignored.nonzero().view(-1) + + vertices_ = vertices[object_idxs].float() + labels_ = labels[object_idxs].int() + + if vertices_.numel() == 0: + return torch.zeros((0, 2)).int(), torch.zeros(1).int() + + batch_idxs_ = batch_idxs[object_idxs].int() + batch_offsets_ = torch.FloatTensor([0, object_idxs.shape[0]]).int().cuda() + + idx, start_len = ballquery_batch_p(vertices_, batch_idxs_, batch_offsets_, self.thresh, self.closed_points) + proposals_idx, proposals_offset = bfs_cluster(labels_.cpu(), idx.cpu(), start_len.cpu(), self.min_points) + proposals_idx[:, 1] = object_idxs[proposals_idx[:, 1].long()].int() + + return proposals_idx, proposals_offset + + def get_instances(self, vertices, scores): + proposals_pred, labels = self.cluster(vertices, scores) + instances = {} + for proposal_id in range(len(proposals_pred)): + clusters_i = proposals_pred[proposal_id] + score = scores[clusters_i.bool(), labels[proposal_id]] + score = self.score_func(score) + instances[proposal_id] = {} + instances[proposal_id]["conf"] = score.cpu().numpy() + instances[proposal_id]["label_id"] = self.class_mapping.cpu()[labels[proposal_id]] + instances[proposal_id]["pred_mask"] = clusters_i.cpu().numpy() + return instances + + +class BFSCluster(Function): + @staticmethod + def forward(ctx, semantic_label, ball_query_idxs, start_len, threshold): + """ + :param ctx: + :param semantic_label: (N), int + :param ball_query_idxs: (nActive), int + :param start_len: (N, 2), int + :return: cluster_idxs: int (sumNPoint, 2), dim 0 for cluster_id, dim 1 for corresponding point idxs in N + :return: cluster_offsets: int (nCluster + 1) + """ + + N = start_len.size(0) + + assert semantic_label.is_contiguous() + assert ball_query_idxs.is_contiguous() + assert start_len.is_contiguous() + + cluster_idxs = semantic_label.new() + cluster_offsets = semantic_label.new() + + pointgroup_ops_cuda.bfs_cluster( + semantic_label, + ball_query_idxs, + start_len, + cluster_idxs, + cluster_offsets, + N, + threshold, + ) + + return cluster_idxs, cluster_offsets + + @staticmethod + def backward(ctx, a=None): + return None + + +bfs_cluster = BFSCluster.apply diff --git a/projects/PTv3/libs/pointgroup_ops/pyproject.toml b/projects/PTv3/libs/pointgroup_ops/pyproject.toml new file mode 100644 index 000000000..74b97ba86 --- /dev/null +++ b/projects/PTv3/libs/pointgroup_ops/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "pointgroup-ops" +version = "0.1.0" +requires-python = "==3.10.*" +dependencies = [] + +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["."] +exclude = [] diff --git a/projects/PTv3/libs/pointgroup_ops/setup.py b/projects/PTv3/libs/pointgroup_ops/setup.py new file mode 100644 index 000000000..f6bcef376 --- /dev/null +++ b/projects/PTv3/libs/pointgroup_ops/setup.py @@ -0,0 +1,58 @@ +import os +from distutils.sysconfig import get_config_vars +from sys import argv + +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +(opt,) = get_config_vars("OPT") 
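+# strip "-Wstrict-prototypes" from the inherited build flags: it is a C-only option and g++ warns on it when compiling the C++/CUDA extension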
+os.environ["OPT"] = " ".join(flag for flag in opt.split() if flag != "-Wstrict-prototypes")
+
+
+def _argparse(pattern, argv, is_flag=True, is_list=False):
+    if is_flag:
+        found = pattern in argv
+        if found:
+            argv.remove(pattern)
+        return found, argv
+    else:
+        arr = [arg for arg in argv if pattern == arg.split("=")[0]]
+        if is_list:
+            if len(arr) == 0:  # not found
+                return False, argv
+            else:
+                assert "=" in arr[0], f"{arr[0]} requires a value."
+                argv.remove(arr[0])
+                val = arr[0].split("=")[1]
+                if "," in val:
+                    return val.split(","), argv
+                else:
+                    return [val], argv
+        else:
+            if len(arr) == 0:  # not found
+                return False, argv
+            else:
+                assert "=" in arr[0], f"{arr[0]} requires a value."
+                argv.remove(arr[0])
+                return arr[0].split("=")[1], argv
+
+
+INCLUDE_DIRS, argv = _argparse("--include_dirs", argv, False, is_list=True)
+include_dirs = []
+if not (INCLUDE_DIRS is False):
+    include_dirs += INCLUDE_DIRS
+
+setup(
+    name="pointgroup_ops",
+    packages=["pointgroup_ops"],
+    package_dir={"pointgroup_ops": "functions"},
+    ext_modules=[
+        CUDAExtension(
+            name="pointgroup_ops_cuda",
+            sources=["src/bfs_cluster.cpp", "src/bfs_cluster_kernel.cu"],
+            extra_compile_args={"cxx": ["-g"], "nvcc": ["-O2"]},
+        )
+    ],
+    include_dirs=[*include_dirs],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster.cpp b/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster.cpp
new file mode 100644
index 000000000..d0298aae5
--- /dev/null
+++ b/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster.cpp
@@ -0,0 +1,145 @@
+/*
+Ball Query with BatchIdx & Clustering Algorithm
+Written by Li Jiang
+All Rights Reserved 2020.
+*/
+
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cassert>
+#include <cstdint>
+#include <queue>
+#include <vector>
+
+int ballquery_batch_p_cuda(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, cudaStream_t stream);
+
+
+using Int = int32_t;
+class ConnectedComponent{
+public:
+    std::vector<Int> pt_idxs {};
+
+    ConnectedComponent(){};
+    void addPoint(Int pt_idx)
+    {
+        pt_idxs.push_back(pt_idx);
+    }
+};
+using ConnectedComponents = std::vector<ConnectedComponent>;
+
+/* ================================== ballquery_batch_p ================================== */
+// input xyz: (n, 3) float
+// input batch_idxs: (n) int
+// input batch_offsets: (B+1) int, batch_offsets[-1]
+// output idx: (n * meanActive) dim 0 for number of points in the ball, idx in n
+// output start_len: (n, 2), int
+int ballquery_batch_p(at::Tensor xyz_tensor, at::Tensor batch_idxs_tensor, at::Tensor batch_offsets_tensor, at::Tensor idx_tensor, at::Tensor start_len_tensor, int n, int meanActive, float radius){
+    const float *xyz = xyz_tensor.data_ptr<float>();
+    const int *batch_idxs = batch_idxs_tensor.data_ptr<int>();
+    const int *batch_offsets = batch_offsets_tensor.data_ptr<int>();
+    int *idx = idx_tensor.data_ptr<int>();
+    int *start_len = start_len_tensor.data_ptr<int>();
+
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    int cumsum = ballquery_batch_p_cuda(n, meanActive, radius, xyz, batch_idxs, batch_offsets, idx, start_len, stream);
+    return cumsum;
+}
+
+/* ================================== bfs_cluster ================================== */
+ConnectedComponent find_cc(Int idx, int *semantic_label, Int *ball_query_idxs, int *start_len, int *visited){
+    ConnectedComponent cc;
+    cc.addPoint(idx);
+    visited[idx] = 1;
+
+    std::queue<Int> Q;
+    assert(Q.empty());
+    Q.push(idx);
+
+    while(!Q.empty()){
+        Int cur = Q.front(); Q.pop();
+        int start = start_len[cur * 2];
+        int len = start_len[cur * 2 + 1];
+        int label_cur = semantic_label[cur];
+        for(Int i = start; i < start + len; i++){
+            Int idx_i = ball_query_idxs[i];
+            if(semantic_label[idx_i] != label_cur) continue;
+            if(visited[idx_i] == 1) continue;
+
+            cc.addPoint(idx_i);
+            visited[idx_i] = 1;
+
+            Q.push(idx_i);
+        }
+    }
+    return cc;
+}
+
+//input: semantic_label, int, N
+//input: ball_query_idxs, Int, (nActive)
+//input: start_len, int, (N, 2)
+//output: clusters, CCs
+int get_clusters(int *semantic_label, Int *ball_query_idxs, int *start_len, const Int nPoint, int threshold, ConnectedComponents &clusters){
+    // heap-allocated visited flags; a stack array of nPoint ints is a non-standard VLA and risks overflow for large clouds
+    std::vector<int> visited(nPoint, 0);
+
+    int sumNPoint = 0;
+    for(Int i = 0; i < nPoint; i++){
+        if(visited[i] == 0){
+            ConnectedComponent CC = find_cc(i, semantic_label, ball_query_idxs, start_len, visited.data());
+            if((int)CC.pt_idxs.size() >= threshold){
+                clusters.push_back(CC);
+                sumNPoint += (int)CC.pt_idxs.size();
+            }
+        }
+    }
+
+    return sumNPoint;
+}
+
+void fill_cluster_idxs_(ConnectedComponents &CCs, int *cluster_idxs, int *cluster_offsets){
+    for(int i = 0; i < (int)CCs.size(); i++){
+        cluster_offsets[i + 1] = cluster_offsets[i] + (int)CCs[i].pt_idxs.size();
+        for(int j = 0; j < (int)CCs[i].pt_idxs.size(); j++){
+            int idx = CCs[i].pt_idxs[j];
+            cluster_idxs[(cluster_offsets[i] + j) * 2 + 0] = i;
+            cluster_idxs[(cluster_offsets[i] + j) * 2 + 1] = idx;
+        }
+    }
+}
+
+//input: semantic_label, int, N
+//input: ball_query_idxs, int, (nActive)
+//input: start_len, int, (N, 2)
+//output: cluster_idxs, int (sumNPoint, 2), dim 0 for cluster_id, dim 1 for corresponding point idxs in N
+//output: cluster_offsets, int (nCluster + 1)
+void bfs_cluster(at::Tensor semantic_label_tensor, at::Tensor ball_query_idxs_tensor, at::Tensor start_len_tensor,
+at::Tensor cluster_idxs_tensor, at::Tensor cluster_offsets_tensor, const int N, int threshold){
+    int *semantic_label = semantic_label_tensor.data_ptr<int>();
+    Int *ball_query_idxs = ball_query_idxs_tensor.data_ptr<int>();
+    int *start_len = start_len_tensor.data_ptr<int>();
+
+    ConnectedComponents CCs;
+    int sumNPoint = get_clusters(semantic_label, ball_query_idxs, start_len, N, threshold, CCs);
+
+    int nCluster = (int)CCs.size();
+    cluster_idxs_tensor.resize_({sumNPoint, 2});
+    cluster_offsets_tensor.resize_({nCluster + 1});
+    cluster_idxs_tensor.zero_();
+    cluster_offsets_tensor.zero_();
+
+    int *cluster_idxs = cluster_idxs_tensor.data_ptr<int>();
+    int *cluster_offsets = cluster_offsets_tensor.data_ptr<int>();
+
+    fill_cluster_idxs_(CCs, cluster_idxs, cluster_offsets);
+}
+
+//------------------------------------API------------------------------------------
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
+
+    m.def("ballquery_batch_p", &ballquery_batch_p, "ballquery_batch_p");
+    m.def("bfs_cluster", &bfs_cluster, "bfs_cluster");
+
+}
diff --git a/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster_kernel.cu b/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster_kernel.cu
new file mode 100644
index 000000000..99a31842d
--- /dev/null
+++ b/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster_kernel.cu
@@ -0,0 +1,91 @@
+/*
+Ball Query with BatchIdx
+Written by Li Jiang
+All Rights Reserved 2020.
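+The kernel below gathers, for each point, its in-radius neighbors within the same batch and packs them into the shared idx buffer through an atomicAdd running counter.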
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#define TOTAL_THREADS 1024
+#define THREADS_PER_BLOCK 512
+#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+
+
+/* ================================== ballquery_batch_p ================================== */
+__global__ void ballquery_batch_p_cuda_(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, int *cumsum) {
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pt_idx >= n) return;
+
+    start_len += (pt_idx * 2);
+    int idx_temp[1000];
+
+    float radius2 = radius * radius;
+    float o_x = xyz[pt_idx * 3 + 0];
+    float o_y = xyz[pt_idx * 3 + 1];
+    float o_z = xyz[pt_idx * 3 + 2];
+
+    int batch_idx = batch_idxs[pt_idx];
+    int start = batch_offsets[batch_idx];
+    int end = batch_offsets[batch_idx + 1];
+
+    int cnt = 0;
+    for(int k = start; k < end; k++){
+        float x = xyz[k * 3 + 0];
+        float y = xyz[k * 3 + 1];
+        float z = xyz[k * 3 + 2];
+        float d2 = (o_x - x) * (o_x - x) + (o_y - y) * (o_y - y) + (o_z - z) * (o_z - z);
+        if(d2 < radius2){
+            if(cnt < 1000){
+                idx_temp[cnt] = k;
+            }
+            else{
+                break;
+            }
+            ++cnt;
+        }
+    }
+
+    start_len[0] = atomicAdd(cumsum, cnt);
+    start_len[1] = cnt;
+
+    int thre = n * meanActive;
+    if(start_len[0] >= thre) return;
+
+    idx += start_len[0];
+    if(start_len[0] + cnt >= thre) cnt = thre - start_len[0];
+
+    for(int k = 0; k < cnt; k++){
+        idx[k] = idx_temp[k];
+    }
+}
+
+
+int ballquery_batch_p_cuda(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, cudaStream_t stream) {
+    // param xyz: (n, 3)
+    // param batch_idxs: (n)
+    // param batch_offsets: (B + 1)
+    // output idx: (n * meanActive) dim 0 for number of points in the ball, idx in n
+    // output start_len: (n, 2), int
+
+    cudaError_t err;
+
+    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+
+    int cumsum = 0;
+    int* p_cumsum;
+    cudaMalloc((void**)&p_cumsum, sizeof(int));
+    cudaMemcpy(p_cumsum, &cumsum, sizeof(int), cudaMemcpyHostToDevice);
+
+    ballquery_batch_p_cuda_<<<blocks, threads, 0, stream>>>(n, meanActive, radius, xyz, batch_idxs, batch_offsets, idx, start_len, p_cumsum);
+
+    err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+        exit(-1);
+    }
+
+    cudaMemcpy(&cumsum, p_cumsum, sizeof(int), cudaMemcpyDeviceToHost);
+    cudaFree(p_cumsum);  // free the temporary device counter
+    return cumsum;
+}
diff --git a/projects/PTv3/libs/pointops/__init__.py b/projects/PTv3/libs/pointops/__init__.py
new file mode 100644
index 000000000..c8f754883
--- /dev/null
+++ b/projects/PTv3/libs/pointops/__init__.py
@@ -0,0 +1 @@
+from .functions import *
diff --git a/projects/PTv3/libs/pointops/functions/__init__.py b/projects/PTv3/libs/pointops/functions/__init__.py
new file mode 100644
index 000000000..1c20807f6
--- /dev/null
+++ b/projects/PTv3/libs/pointops/functions/__init__.py
@@ -0,0 +1,14 @@
+from .aggregation import aggregation
+from .attention import attention_fusion_step, attention_relation_step
+from .grouping import grouping, grouping2
+from .interpolation import interpolation, interpolation2
+from .query import ball_query, knn_query, random_ball_query
+from .sampling import farthest_point_sampling
+from .subtraction import subtraction
+from .utils import (
+    ball_query_and_group,
+    batch2offset,
+    knn_query_and_group,
+    offset2batch,
+    query_and_group,
+)
diff --git a/projects/PTv3/libs/pointops/functions/aggregation.py b/projects/PTv3/libs/pointops/functions/aggregation.py
new file mode 100644
index
000000000..8e7865b38 --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/aggregation.py @@ -0,0 +1,50 @@ +import torch +from pointops._C import aggregation_backward_cuda, aggregation_forward_cuda +from torch.autograd import Function + + +class Aggregation(Function): + @staticmethod + def forward(ctx, input, position, weight, idx): + """ + input: input: (n, c), position: (n, nsample, c), weight : (n, nsample, c'), idx: (n, nsample) + output: (n, c) + """ + assert input.is_contiguous() and position.is_contiguous() and weight.is_contiguous() + n, nsample, c = position.shape + w_c = weight.shape[-1] + output = torch.cuda.FloatTensor(n, c).zero_() + aggregation_forward_cuda(n, nsample, c, w_c, input, position, weight, idx, output) + ctx.save_for_backward(input, position, weight, idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, c) + output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight : (n, nsample, c') + """ + input, position, weight, idx = ctx.saved_tensors + n, nsample, c = position.shape + w_c = weight.shape[-1] + grad_input = torch.cuda.FloatTensor(n, c).zero_() + grad_position = torch.cuda.FloatTensor(n, nsample, c).zero_() + grad_weight = torch.cuda.FloatTensor(n, nsample, w_c).zero_() + aggregation_backward_cuda( + n, + nsample, + c, + w_c, + input, + position, + weight, + idx, + grad_output, + grad_input, + grad_position, + grad_weight, + ) + return grad_input, grad_position, grad_weight, None + + +aggregation = Aggregation.apply diff --git a/projects/PTv3/libs/pointops/functions/attention.py b/projects/PTv3/libs/pointops/functions/attention.py new file mode 100644 index 000000000..e063aefde --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/attention.py @@ -0,0 +1,117 @@ +import torch +from pointops._C import ( + attention_fusion_step_backward_cuda, + attention_fusion_step_forward_cuda, + attention_relation_step_backward_cuda, + attention_relation_step_forward_cuda, +) +from torch.autograd import Function + + +class AttentionRelationStep(Function): + @staticmethod + def forward(ctx, query, key, weight, index_target, index_refer): + """ + input - query: (n, g, c), key: (n, g, c), weight: (c) 1_c for scatter attention, + index_target: (m), index_refer: (m) + output - relation: (M, g) + """ + + assert ( + query.is_contiguous() + and key.is_contiguous() + and index_target.is_contiguous() + and index_refer.is_contiguous() + and weight.is_contiguous() + ) + + assert index_target.shape[0] == index_refer.shape[0] + + _, g, c = query.shape + m = index_target.shape[0] + output = torch.cuda.FloatTensor(m, g).zero_() + attention_relation_step_forward_cuda( + m, g, c, query, key, weight, index_target.int(), index_refer.int(), output + ) + ctx.save_for_backward(query, key, weight, index_target, index_refer) + return output + + @staticmethod + def backward(ctx, grad_output): + query, key, weight, index_target, index_refer = ctx.saved_tensors + n, g, c = query.shape + m = index_target.shape[0] + grad_query = torch.cuda.FloatTensor(n, g, c).zero_() + grad_key = torch.cuda.FloatTensor(n, g, c).zero_() + grad_weight = torch.cuda.FloatTensor(c).zero_() + attention_relation_step_backward_cuda( + m, + g, + c, + query, + grad_query, + key, + grad_key, + weight, + grad_weight, + index_target.int(), + index_refer.int(), + grad_output, + ) + return grad_query, grad_key, None, None, None + + +class AttentionFusionStep(Function): + @staticmethod + def forward(ctx, weight, value, index_target, index_refer): + """ + input - weight: 
(m, g), value: (n, g, c) + index_target: (m), index_value: (m) + output - output: (n, g, c) + """ + + assert ( + weight.is_contiguous() + and value.is_contiguous() + and index_target.is_contiguous() + and index_refer.is_contiguous() + and weight.is_contiguous() + ) + + assert index_target.shape[0] == index_refer.shape[0] + + n, g, c = value.shape + m = index_refer.shape[0] + output = torch.cuda.FloatTensor(n, g, c).zero_() + attention_fusion_step_forward_cuda(m, g, c, weight, value, index_target.int(), index_refer.int(), output) + ctx.save_for_backward(weight, value, index_target, index_refer) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: (n, g, c) + output: grad_weight: (m, g), grad_value: (n, g, c), none, none + """ + weight, value, index_target, index_refer = ctx.saved_tensors + n, g, c = value.shape + m = index_target.shape[0] + grad_weight = torch.cuda.FloatTensor(m, g).zero_() + grad_value = torch.cuda.FloatTensor(n, g, c).zero_() + attention_fusion_step_backward_cuda( + m, + g, + c, + weight, + grad_weight, + value, + grad_value, + index_target.int(), + index_refer.int(), + grad_output, + ) + return grad_weight, grad_value, None, None + + +attention_relation_step = AttentionRelationStep.apply +attention_fusion_step = AttentionFusionStep.apply diff --git a/projects/PTv3/libs/pointops/functions/grouping.py b/projects/PTv3/libs/pointops/functions/grouping.py new file mode 100644 index 000000000..5de89e443 --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/grouping.py @@ -0,0 +1,54 @@ +import torch +from pointops._C import grouping_backward_cuda, grouping_forward_cuda +from torch.autograd import Function + + +class Grouping(Function): + @staticmethod + def forward(ctx, input, idx): + """ + input: input: (n, c), idx : (m, nsample) + output: (m, nsample, c) + """ + assert input.is_contiguous() and idx.is_contiguous() + m, nsample, n, c = idx.shape[0], idx.shape[1], input.shape[0], input.shape[1] + output = torch.cuda.FloatTensor(m, nsample, c) + grouping_forward_cuda(m, nsample, c, input, idx, output) + ctx.n = n + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (m, c, nsample) + output: (n, c), None + """ + n = ctx.n + (idx,) = ctx.saved_tensors + m, nsample, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(n, c).zero_() + grouping_backward_cuda(m, nsample, c, grad_output, idx, grad_input) + return grad_input, None + + +def grouping(idx, feat, xyz, new_xyz=None, with_xyz=False): + if new_xyz is None: + new_xyz = xyz + assert xyz.is_contiguous() and feat.is_contiguous() + m, nsample, c = idx.shape[0], idx.shape[1], feat.shape[1] + xyz = torch.cat([xyz, torch.zeros([1, 3]).to(xyz.device)], dim=0) + feat = torch.cat([feat, torch.zeros([1, c]).to(feat.device)], dim=0) + grouped_feat = feat[idx.view(-1).long(), :].view(m, nsample, c) # (m, num_sample, c) + + if with_xyz: + assert new_xyz.is_contiguous() + mask = torch.sign(idx + 1) + grouped_xyz = xyz[idx.view(-1).long(), :].view(m, nsample, 3) - new_xyz.unsqueeze(1) # (m, num_sample, 3) + grouped_xyz = torch.einsum("n s c, n s -> n s c", grouped_xyz, mask) # (m, num_sample, 3) + return torch.cat((grouped_xyz, grouped_feat), -1) + else: + return grouped_feat + + +grouping2 = Grouping.apply diff --git a/projects/PTv3/libs/pointops/functions/interpolation.py b/projects/PTv3/libs/pointops/functions/interpolation.py new file mode 100644 index 000000000..9ba975ac0 --- /dev/null +++ 
b/projects/PTv3/libs/pointops/functions/interpolation.py @@ -0,0 +1,59 @@ +import torch +from pointops._C import interpolation_backward_cuda, interpolation_forward_cuda +from torch.autograd import Function + +from .query import knn_query + + +def interpolation(xyz, new_xyz, feat, offset, new_offset, k=3): + """ + input: coords: (m, 3), new_xyz: (n, 3), color: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + idx, dist = knn_query(k, xyz, offset, new_xyz, new_offset) # (n, 3), (n, 3) + dist_recip = 1.0 / (dist + 1e-8) # (n, 3) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, 3) + + new_feat = torch.cuda.FloatTensor(new_xyz.shape[0], feat.shape[1]).zero_() + for i in range(k): + new_feat += feat[idx[:, i].long(), :] * weight[:, i].unsqueeze(-1) + return new_feat + + +class Interpolation(Function): + @staticmethod + def forward(ctx, xyz, new_xyz, input, offset, new_offset, k=3): + """ + input: coords: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and input.is_contiguous() + idx, dist = knn_query(k, xyz, offset, new_xyz, new_offset) # (n, k), (n, k) + dist_recip = 1.0 / (dist + 1e-8) # (n, k) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, k) + + n, c, m = new_xyz.shape[0], input.shape[1], input.shape[0] + output = torch.cuda.FloatTensor(n, c).zero_() + interpolation_forward_cuda(n, c, k, input, idx, weight, output) + ctx.m, ctx.k = m, k + ctx.save_for_backward(idx, weight) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: coords: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + m, k = ctx.m, ctx.k + idx, weight = ctx.saved_tensors + n, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(m, c).zero_() + interpolation_backward_cuda(n, c, k, grad_output, idx, weight, grad_input) + return None, None, grad_input, None, None, None + + +interpolation2 = Interpolation.apply diff --git a/projects/PTv3/libs/pointops/functions/query.py b/projects/PTv3/libs/pointops/functions/query.py new file mode 100644 index 000000000..46577591d --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/query.py @@ -0,0 +1,104 @@ +import torch +from pointops._C import ball_query_cuda, knn_query_cuda, random_ball_query_cuda +from torch.autograd import Function + + +class KNNQuery(Function): + @staticmethod + def forward(ctx, nsample, xyz, offset, new_xyz=None, new_offset=None): + """ + input: coords: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) + output: idx: (m, nsample) -1 is placeholder, dist2: (m, nsample) + """ + if new_xyz is None or new_offset is None: + new_xyz = xyz + new_offset = offset + assert xyz.is_contiguous() and new_xyz.is_contiguous() + m = new_xyz.shape[0] + idx = torch.cuda.IntTensor(m, nsample).zero_() + dist2 = torch.cuda.FloatTensor(m, nsample).zero_() + knn_query_cuda(m, nsample, xyz, new_xyz, offset.int(), new_offset.int(), idx, dist2) + return idx, torch.sqrt(dist2) + + +class RandomBallQuery(Function): + """Random Ball Query. + + Find nearby points in spherical space. 
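+    Candidate neighbors are scanned in a per-batch random permutation (the order tensor built in forward), so the result is effectively a random subset when more than nsample points lie within the radius.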
+ """ + + @staticmethod + def forward(ctx, nsample, max_radius, min_radius, xyz, offset, new_xyz=None, new_offset=None): + """ + input: coords: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) + output: idx: (m, nsample), dist2: (m, nsample) + """ + if new_xyz is None or new_offset is None: + new_xyz = xyz + new_offset = offset + assert xyz.is_contiguous() and new_xyz.is_contiguous() + assert min_radius < max_radius + + m = new_xyz.shape[0] + order = [] + for k in range(offset.shape[0]): + s_k, e_k = (0, offset[0]) if k == 0 else (offset[k - 1], offset[k]) + order.append(torch.randperm(e_k - s_k, dtype=torch.int32, device=offset.device) + s_k) + order = torch.cat(order, dim=0) + idx = torch.cuda.IntTensor(m, nsample).zero_() + dist2 = torch.cuda.FloatTensor(m, nsample).zero_() + random_ball_query_cuda( + m, + nsample, + min_radius, + max_radius, + order, + xyz, + new_xyz, + offset.int(), + new_offset.int(), + idx, + dist2, + ) + return idx, torch.sqrt(dist2) + + +class BallQuery(Function): + """Ball Query. + + Find nearby points in spherical space. + """ + + @staticmethod + def forward(ctx, nsample, max_radius, min_radius, xyz, offset, new_xyz=None, new_offset=None): + """ + input: coords: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) + output: idx: (m, nsample), dist2: (m, nsample) + """ + if new_xyz is None or new_offset is None: + new_xyz = xyz + new_offset = offset + assert xyz.is_contiguous() and new_xyz.is_contiguous() + assert min_radius < max_radius + + m = new_xyz.shape[0] + idx = torch.cuda.IntTensor(m, nsample).zero_() + dist2 = torch.cuda.FloatTensor(m, nsample).zero_() + ball_query_cuda( + m, + nsample, + min_radius, + max_radius, + xyz, + new_xyz, + offset.int(), + new_offset.int(), + idx, + dist2, + ) + return idx, torch.sqrt(dist2) + + +knn_query = KNNQuery.apply +ball_query = BallQuery.apply +random_ball_query = RandomBallQuery.apply diff --git a/projects/PTv3/libs/pointops/functions/sampling.py b/projects/PTv3/libs/pointops/functions/sampling.py new file mode 100644 index 000000000..0ee83b51d --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/sampling.py @@ -0,0 +1,24 @@ +import torch +from pointops._C import farthest_point_sampling_cuda +from torch.autograd import Function + + +class FarthestPointSampling(Function): + @staticmethod + def forward(ctx, xyz, offset, new_offset): + """ + input: coords: (n, 3), offset: (b), new_offset: (b) + output: idx: (m) + """ + assert xyz.is_contiguous() + n, b, n_max = xyz.shape[0], offset.shape[0], offset[0] + for i in range(1, b): + n_max = max(offset[i] - offset[i - 1], n_max) + idx = torch.cuda.IntTensor(new_offset[b - 1].item()).zero_() + tmp = torch.cuda.FloatTensor(n).fill_(1e10) + farthest_point_sampling_cuda(b, n_max, xyz, offset.int(), new_offset.int(), tmp, idx) + del tmp + return idx + + +farthest_point_sampling = FarthestPointSampling.apply diff --git a/projects/PTv3/libs/pointops/functions/subtraction.py b/projects/PTv3/libs/pointops/functions/subtraction.py new file mode 100644 index 000000000..1633552c5 --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/subtraction.py @@ -0,0 +1,35 @@ +import torch +from pointops._C import subtraction_backward_cuda, subtraction_forward_cuda +from torch.autograd import Function + + +class Subtraction(Function): + @staticmethod + def forward(ctx, input1, input2, idx): + """ + input: input1: (n, c), input2: (n, c), idx: (n, nsample) + output: (n, nsample, c) + """ + assert input1.is_contiguous() and input2.is_contiguous() + n, c = input1.shape + nsample = 
idx.shape[-1] + output = torch.cuda.FloatTensor(n, nsample, c).zero_() + subtraction_forward_cuda(n, nsample, c, input1, input2, idx, output) + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, nsample, c) + output: grad_input1: (n, c), grad_input2: (n, c) + """ + (idx,) = ctx.saved_tensors + n, nsample, c = grad_output.shape + grad_input1 = torch.cuda.FloatTensor(n, c).zero_() + grad_input2 = torch.cuda.FloatTensor(n, c).zero_() + subtraction_backward_cuda(n, nsample, c, idx, grad_output, grad_input1, grad_input2) + return grad_input1, grad_input2, None + + +subtraction = Subtraction.apply diff --git a/projects/PTv3/libs/pointops/functions/utils.py b/projects/PTv3/libs/pointops/functions/utils.py new file mode 100644 index 000000000..0c51202cd --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/utils.py @@ -0,0 +1,113 @@ +import torch +from pointops import ball_query, grouping, knn_query + + +def knn_query_and_group( + feat, + xyz, + offset=None, + new_xyz=None, + new_offset=None, + idx=None, + nsample=None, + with_xyz=False, +): + if idx is None: + assert nsample is not None + idx, _ = knn_query(nsample, xyz, offset, new_xyz, new_offset) + return grouping(idx, feat, xyz, new_xyz, with_xyz), idx + + +def ball_query_and_group( + feat, + xyz, + offset=None, + new_xyz=None, + new_offset=None, + idx=None, + max_radio=None, + min_radio=0, + nsample=None, + with_xyz=False, +): + if idx is None: + assert nsample is not None and offset is not None + assert max_radio is not None and min_radio is not None + idx, _ = ball_query(nsample, max_radio, min_radio, xyz, offset, new_xyz, new_offset) + return grouping(idx, feat, xyz, new_xyz, with_xyz), idx + + +def query_and_group( + nsample, + xyz, + new_xyz, + feat, + idx, + offset, + new_offset, + dilation=0, + with_feat=True, + with_xyz=True, +): + """ + input: coords: (n, 3), new_xyz: (m, 3), color: (n, c), idx: (m, nsample), offset: (b), new_offset: (b) + output: new_feat: (m, nsample, c+3), grouped_idx: (m, nsample) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + if new_xyz is None: + new_xyz = xyz + + if idx is None: + num_samples_total = 1 + (nsample - 1) * (dilation + 1) + # num points in a batch might < num_samples_total => [n1, n2, ..., nk, ns, ns, ns, ...] 
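+        # when a batch holds fewer points than nsample * (dilation + 1), a smaller "soft" dilation is derived below so the gathered indices stay inside that batch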
+        idx_no_dilation, _ = knn_query(num_samples_total, xyz, offset, new_xyz, new_offset)  # (m, nsample * (d + 1))
+        idx = []
+        batch_end = offset.tolist()
+        batch_start = [0] + batch_end[:-1]
+        new_batch_end = new_offset.tolist()
+        new_batch_start = [0] + new_batch_end[:-1]
+        for i in range(offset.shape[0]):
+            if batch_end[i] - batch_start[i] < num_samples_total:
+                soft_dilation = (batch_end[i] - batch_start[i] - 1) / (nsample - 1) - 1
+            else:
+                soft_dilation = dilation
+            idx.append(
+                idx_no_dilation[
+                    new_batch_start[i] : new_batch_end[i],
+                    [int((soft_dilation + 1) * i) for i in range(nsample)],
+                ]
+            )
+        idx = torch.cat(idx, dim=0)
+
+    if not with_feat:
+        return idx
+
+    n, m, c = xyz.shape[0], new_xyz.shape[0], feat.shape[1]
+    grouped_xyz = xyz[idx.view(-1).long(), :].view(m, nsample, 3)  # (m, nsample, 3)
+    # grouped_xyz = grouping(coords, idx)  # (m, nsample, 3)
+    grouped_xyz -= new_xyz.unsqueeze(1)  # (m, nsample, 3)
+    grouped_feat = feat[idx.view(-1).long(), :].view(m, nsample, c)  # (m, nsample, c)
+    # grouped_feat = grouping(color, idx)  # (m, nsample, c)
+
+    if with_xyz:
+        return torch.cat((grouped_xyz, grouped_feat), -1), idx  # (m, nsample, 3+c)
+    else:
+        return grouped_feat, idx
+
+
+def offset2batch(offset):
+    return (
+        torch.cat(
+            [
+                (torch.tensor([i] * (o - offset[i - 1])) if i > 0 else torch.tensor([i] * o))
+                for i, o in enumerate(offset)
+            ],
+            dim=0,
+        )
+        .long()
+        .to(offset.device)
+    )
+
+
+def batch2offset(batch):
+    return torch.cumsum(batch.bincount(), dim=0).int()
diff --git a/projects/PTv3/libs/pointops/pyproject.toml b/projects/PTv3/libs/pointops/pyproject.toml
new file mode 100644
index 000000000..7dafe1f34
--- /dev/null
+++ b/projects/PTv3/libs/pointops/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "pointops"
+version = "0.1.0"
+requires-python = "==3.10.*"
+dependencies = []
+
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["."]
+exclude = []
diff --git a/projects/PTv3/libs/pointops/setup.py b/projects/PTv3/libs/pointops/setup.py
new file mode 100644
index 000000000..bdd4e4cef
--- /dev/null
+++ b/projects/PTv3/libs/pointops/setup.py
@@ -0,0 +1,32 @@
+import os
+from distutils.sysconfig import get_config_vars
+
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+(opt,) = get_config_vars("OPT")
+os.environ["OPT"] = " ".join(flag for flag in opt.split() if flag != "-Wstrict-prototypes")
+
+src = "src"
+sources = [
+    os.path.join(root, file)
+    for root, dirs, files in os.walk(src)
+    for file in files
+    if file.endswith(".cpp") or file.endswith(".cu")
+]
+
+setup(
+    name="pointops",
+    version="1.0",
+    install_requires=["torch", "numpy"],
+    packages=["pointops"],
+    package_dir={"pointops": "functions"},
+    ext_modules=[
+        CUDAExtension(
+            name="pointops._C",
+            sources=sources,
+            extra_compile_args={"cxx": ["-g"], "nvcc": ["-O2"]},
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/projects/PTv3/libs/pointops/src/__init__.py b/projects/PTv3/libs/pointops/src/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda.cpp b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda.cpp
new file mode 100644
index 000000000..491b6f416
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda.cpp
@@ -0,0 +1,28 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+#include "aggregation_cuda_kernel.h"
+
+
+void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
+{
+    const float *input = input_tensor.data_ptr<float>();
+    const float *position = position_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    aggregation_forward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, output);
+}
+
+void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor)
+{
+    const float *input = input_tensor.data_ptr<float>();
+    const float *position = position_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    float *grad_input = grad_input_tensor.data_ptr<float>();
+    float *grad_position = grad_position_tensor.data_ptr<float>();
+    float *grad_weight = grad_weight_tensor.data_ptr<float>();
+    aggregation_backward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight);
+}
diff --git a/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.cu
new file mode 100644
index 000000000..dfeecb509
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.cu
@@ -0,0 +1,53 @@
+#include "../cuda_utils.h"
+#include "aggregation_cuda_kernel.h"
+
+
+__global__ void aggregation_forward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) {
+    // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    const int c_idx = index % c;
+    const int n_idx = index / c;
+    const int w_c_idx = c_idx % w_c;
+    for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++)
+    {
+        int idx_idx = n_idx * nsample + nsample_idx;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx;
+        int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx;
+        output[index] += (input[input_idx] + position[position_idx]) * weight[weight_idx];
+    }
+}
+
+__global__ void aggregation_backward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) {
+    // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    const int c_idx = index % c;
+    const int n_idx = index / c;
+    const int w_c_idx = c_idx % w_c;
+    for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++)
+    {
+        int idx_idx = n_idx * nsample + nsample_idx;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx;
+        int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx;
+        atomicAdd(grad_input + input_idx, grad_output[index] * weight[weight_idx]);
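+        // grad_position is written exactly once per (n_idx, nsample_idx, c_idx) slot by this thread, so no atomicAdd is needed here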
+        grad_position[position_idx] = grad_output[index] * weight[weight_idx];
+        atomicAdd(grad_weight + weight_idx, grad_output[index] * (input[input_idx] + position[position_idx]));
+    }
+}
+
+void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) {
+    // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    aggregation_forward_cuda_kernel<<<blocks, threads, 0>>>(n, nsample, c, w_c, input, position, weight, idx, output);
+}
+
+void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) {
+    // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    aggregation_backward_cuda_kernel<<<blocks, threads, 0>>>(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight);
+}
diff --git a/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.h b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.h
new file mode 100644
index 000000000..5211a96aa
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.h
@@ -0,0 +1,20 @@
+#ifndef _AGGREGATION_CUDA_KERNEL
+#define _AGGREGATION_CUDA_KERNEL
+#include <ATen/ATen.h>
+#include <torch/serialize/tensor.h>
+#include <vector>
+
+void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
+void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output);
+void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp b/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp
new file mode 100644
index 000000000..79b90c7eb
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp
@@ -0,0 +1,76 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+#include "attention_cuda_kernel.h"
+
+
+void attention_relation_step_forward_cuda(int m, int g, int c,
+                                          at::Tensor query_tensor, at::Tensor key_tensor, at::Tensor weight_tensor,
+                                          at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+                                          at::Tensor output_tensor)
+{
+    const float *query = query_tensor.data_ptr<float>();
+    const float *key = key_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
diff --git a/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp b/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp
new file mode 100644
index 000000000..79b90c7eb
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp
@@ -0,0 +1,76 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "attention_cuda_kernel.h"
+
+
+void attention_relation_step_forward_cuda(int m, int g, int c,
+    at::Tensor query_tensor, at::Tensor key_tensor, at::Tensor weight_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor output_tensor)
+{
+    const float *query = query_tensor.data_ptr<float>();
+    const float *key = key_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    attention_relation_step_forward_cuda_launcher(m, g, c, query, key, weight, index_target, index_refer, output);
+}
+
+void attention_relation_step_backward_cuda(int m, int g, int c,
+    at::Tensor query_tensor, at::Tensor grad_query_tensor,
+    at::Tensor key_tensor, at::Tensor grad_key_tensor,
+    at::Tensor weight_tensor, at::Tensor grad_weight_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor grad_output_tensor)
+{
+    const float *query = query_tensor.data_ptr<float>();
+    float *grad_query = grad_query_tensor.data_ptr<float>();
+    const float *key = key_tensor.data_ptr<float>();
+    float *grad_key = grad_key_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *grad_weight = grad_weight_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    attention_relation_step_backward_cuda_launcher(m, g, c,
+        query, grad_query,
+        key, grad_key,
+        weight, grad_weight,
+        index_target, index_refer, grad_output);
+}
+
+
+void attention_fusion_step_forward_cuda(int m, int g, int c,
+    at::Tensor weight_tensor, at::Tensor value_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor output_tensor)
+{
+    const float *weight = weight_tensor.data_ptr<float>();
+    const float *value = value_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    attention_fusion_step_forward_cuda_launcher(m, g, c, weight, value, index_target, index_refer, output);
+}
+
+
+void attention_fusion_step_backward_cuda(int m, int g, int c,
+    at::Tensor weight_tensor, at::Tensor grad_weight_tensor,
+    at::Tensor value_tensor, at::Tensor grad_value_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor grad_output_tensor)
+{
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *grad_weight = grad_weight_tensor.data_ptr<float>();
+    const float *value = value_tensor.data_ptr<float>();
+    float *grad_value = grad_value_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    attention_fusion_step_backward_cuda_launcher(m, g, c,
+        weight, grad_weight,
+        value, grad_value,
+        index_target, index_refer, grad_output);
+}
diff --git a/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.cu
new file mode 100644
index 000000000..747db4d37
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.cu
@@ -0,0 +1,147 @@
+#include "../cuda_utils.h"
+#include "attention_cuda_kernel.h"
+
+
+/*
+Kernels
+*/
+
+__global__ void attention_relation_step_forward_cuda_kernel(int m, int g, int c,
+    const float *query, const float *key, const float *weight,
+    const int *index_target, const int *index_refer,
+    float *output)
+{
+    int r_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int g_idx = blockIdx.y;
+    int c_idx = blockIdx.z;
+
+    if (r_idx >= m || g_idx >= g || c_idx >= c) return;
+    int q_idx = index_target[r_idx] * g * c + g_idx * c + c_idx;
+    int k_idx = index_refer[r_idx] * g * c + g_idx * c + c_idx;
+
+    float r = query[q_idx] * key[k_idx] * weight[c_idx];
+    atomicAdd(output + r_idx * g + g_idx, r);
+}
+
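+// Backward of the relation step: the three atomicAdds distribute grad_output
+// over query, key and the shared per-channel weight; grad_weight in particular
+// accumulates contributions from all m relations, hence the atomics.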
+__global__ void attention_relation_step_backward_cuda_kernel(int m, int g, int c,
+    const float *query, float *grad_query,
+    const float *key, float *grad_key,
+    const float *weight, float *grad_weight,
+    const int *index_target, const int *index_refer,
+    const float *grad_output)
+{
+    int r_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int g_idx = blockIdx.y;
+    int c_idx = blockIdx.z;
+
+    if (r_idx >= m || g_idx >= g || c_idx >= c) return;
+
+    int q_idx = index_target[r_idx] * g * c + g_idx * c + c_idx;
+    int k_idx = index_refer[r_idx] * g * c + g_idx * c + c_idx;
+    int o_idx = r_idx * g + g_idx;
+    float grad_r = grad_output[o_idx];
+    atomicAdd(grad_query + q_idx, grad_r * key[k_idx] * weight[c_idx]);
+    atomicAdd(grad_key + k_idx, grad_r * query[q_idx] * weight[c_idx]);
+    atomicAdd(grad_weight + c_idx, grad_r * key[k_idx] * query[q_idx]);
+}
+
+
+__global__ void attention_fusion_step_forward_cuda_kernel(int m, int g, int c,
+    const float *weight, const float *value,
+    const int *index_target, const int *index_refer,
+    float *output)
+{
+    int r_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int g_idx = blockIdx.y;
+    int c_idx = blockIdx.z;
+
+    if (r_idx >= m || g_idx >= g || c_idx >= c) return;
+
+    int o_idx = index_target[r_idx] * g * c + g_idx * c + c_idx;
+    int v_idx = index_refer[r_idx] * g * c + g_idx * c + c_idx;
+
+    float f = weight[r_idx * g + g_idx] * value[v_idx];
+    atomicAdd(output + o_idx, f);
+}
+
+
+__global__ void attention_fusion_step_backward_cuda_kernel(int m, int g, int c,
+    const float *weight, float *grad_weight,
+    const float *value, float *grad_value,
+    const int *index_target, const int *index_refer,
+    const float *grad_output)
+{
+    int r_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int g_idx = blockIdx.y;
+    int c_idx = blockIdx.z;
+
+    if (r_idx >= m || g_idx >= g || c_idx >= c) return;
+
+    int o_idx = index_target[r_idx] * g * c + g_idx * c + c_idx;
+    int v_idx = index_refer[r_idx] * g * c + g_idx * c + c_idx;
+    int w_idx = r_idx * g + g_idx;
+    float grad = grad_output[o_idx];
+    atomicAdd(grad_weight + w_idx, grad * value[v_idx]);
+    atomicAdd(grad_value + v_idx, grad * weight[w_idx]);
+}
+
+/*
+Launchers
+*/
+
+
+void attention_relation_step_forward_cuda_launcher(int m, int g, int c,
+    const float *query, const float *key, const float *weight,
+    const int *index_target, const int *index_refer,
+    float *output)
+{
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), g, c);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_relation_step_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, g, c, query, key, weight,
+        index_target, index_refer, output);
+}
+
+void attention_relation_step_backward_cuda_launcher(int m, int g, int c,
+    const float *query, float *grad_query,
+    const float *key, float *grad_key,
+    const float *weight, float *grad_weight,
+    const int *index_target, const int *index_refer,
+    const float *grad_output)
+{
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), g, c);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_relation_step_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, g, c,
+        query, grad_query,
+        key, grad_key,
+        weight, grad_weight,
+        index_target, index_refer,
+        grad_output);
+}
+
+
+void attention_fusion_step_forward_cuda_launcher(int m, int g, int c,
+    const float *weight, const float *value,
+    const int *index_target, const int *index_refer,
+    float *output)
+{
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), g, c);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_fusion_step_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, g, c, weight, value,
+        index_target, index_refer, output);
+}
+
+
+void attention_fusion_step_backward_cuda_launcher(int m, int g, int c,
+    const float *weight, float *grad_weight,
+    const float *value, float *grad_value,
+    const int *index_target, const int *index_refer,
+    const float *grad_output)
+{
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), g, c);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_fusion_step_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, g, c,
+        weight, grad_weight,
+        value, grad_value,
+        index_target, index_refer,
+        grad_output);
+}
diff --git a/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.h b/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.h
new file mode 100644
index 000000000..fec965c04
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.h
@@ -0,0 +1,54 @@
+#ifndef _ATTENTION_CUDA_KERNEL
+#define _ATTENTION_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void attention_relation_step_forward_cuda(int m, int g, int c,
+    at::Tensor query_tensor, at::Tensor key_tensor, at::Tensor weight_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor output_tensor);
+void attention_relation_step_backward_cuda(int m, int g, int c,
+    at::Tensor query_tensor, at::Tensor grad_query_tensor,
+    at::Tensor key_tensor, at::Tensor grad_key_tensor,
+    at::Tensor weight_tensor, at::Tensor grad_weight_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor grad_output_tensor);
+void attention_fusion_step_forward_cuda(int m, int g, int c,
+    at::Tensor weight_tensor, at::Tensor value_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor output_tensor);
+void attention_fusion_step_backward_cuda(int m, int g, int c,
+    at::Tensor weight_tensor, at::Tensor grad_weight_tensor,
+    at::Tensor value_tensor, at::Tensor grad_value_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor grad_output_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void attention_relation_step_forward_cuda_launcher(int m, int g, int c,
+    const float *query, const float *key, const float *weight,
+    const int *index_target, const int *index_refer,
+    float *output);
+void attention_relation_step_backward_cuda_launcher(int m, int g, int c,
+    const float *query, float *grad_query,
+    const float *key, float *grad_key,
+    const float *weight, float *grad_weight,
+    const int *index_target, const int *index_refer,
+    const float *grad_output);
+void attention_fusion_step_forward_cuda_launcher(int m, int g, int c,
+    const float *weight, const float *value,
+    const int *index_target, const int *index_refer,
+    float *output);
+void attention_fusion_step_backward_cuda_launcher(int m, int g, int c,
+    const float *weight, float *grad_weight,
+    const float *value, float *grad_value,
+    const int *index_target, const int *index_refer,
+    const float *grad_output);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
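Taken together, the two steps implement a gathered dot-product attention over explicit (target, refer) index pairs. A rough PyTorch equivalent, purely illustrative — the softmax over each target's relation set happens outside these kernels, so `normalize_per_target` below is a hypothetical helper, and long-typed index tensors are assumed:

    # relation step: r[i, g] = sum_c q[tgt[i], g, c] * k[ref[i], g, c] * w[c]
    r = (q[tgt] * k[ref] * w.view(1, 1, -1)).sum(-1)      # (m, g)
    a = normalize_per_target(r, tgt)                      # grouped softmax (not shown)
    # fusion step: out[tgt[i], g, :] += a[i, g] * v[ref[i], g, :]
    out = torch.zeros_like(v).index_add_(0, tgt, a.unsqueeze(-1) * v[ref])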
diff --git a/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda.cpp b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda.cpp
new file mode 100644
index 000000000..04cd5ff9e
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda.cpp
@@ -0,0 +1,20 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "ball_query_cuda_kernel.h"
+
+
+void ball_query_cuda(int m, int nsample,
+    float min_radius, float max_radius,
+    at::Tensor xyz_tensor, at::Tensor new_xyz_tensor,
+    at::Tensor offset_tensor, at::Tensor new_offset_tensor,
+    at::Tensor idx_tensor, at::Tensor dist2_tensor)
+{
+    const float *xyz = xyz_tensor.data_ptr<float>();
+    const float *new_xyz = new_xyz_tensor.data_ptr<float>();
+    const int *offset = offset_tensor.data_ptr<int>();
+    const int *new_offset = new_offset_tensor.data_ptr<int>();
+    int *idx = idx_tensor.data_ptr<int>();
+    float *dist2 = dist2_tensor.data_ptr<float>();
+    ball_query_cuda_launcher(m, nsample, min_radius, max_radius, xyz, new_xyz, offset, new_offset, idx, dist2);
+}
diff --git a/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.cu
new file mode 100644
index 000000000..7b3d95a98
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.cu
@@ -0,0 +1,190 @@
+#include "../cuda_utils.h"
+#include "ball_query_cuda_kernel.h"
+
+
+namespace ball_query_utils{
+
+template <typename DType>
+__device__ void swap(DType *x, DType *y)
+{
+    DType tmp = *x;
+    *x = *y;
+    *y = tmp;
+}
+
+__device__ void reheap(float *dist, int *idx, int k)
+{
+    int root = 0;
+    int child = root * 2 + 1;
+    while (child < k)
+    {
+        if(child + 1 < k && dist[child+1] > dist[child])
+            child++;
+        if(dist[root] > dist[child])
+            return;
+        swap(&dist[root], &dist[child]);
+        swap(&idx[root], &idx[child]);
+        root = child;
+        child = root * 2 + 1;
+    }
+}
+
+
+__device__ void heap_sort(float *dist, int *idx, int k)
+{
+    int i;
+    for (i = k - 1; i > 0; i--)
+    {
+        swap(&dist[0], &dist[i]);
+        swap(&idx[0], &idx[i]);
+        reheap(dist, idx, i);
+    }
+}
+
+__device__ int get_bt_idx(int idx, const int *offset)
+{
+    int i = 0;
+    while (1)
+    {
+        if (idx < offset[i])
+            break;
+        else
+            i++;
+    }
+    return i;
+}
+} // namespace ball_query_utils
+
+__global__ void ball_query_cuda_kernel(int m, int nsample,
+    float min_radius, float max_radius,
+    const float *__restrict__ xyz, const float *__restrict__ new_xyz,
+    const int *__restrict__ offset, const int *__restrict__ new_offset,
+    int *__restrict__ idx, float *__restrict__ dist2) {
+    // input: xyz (n, 3) new_xyz (m, 3)
+    // output: idx (m, nsample) dist (m, nsample)
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pt_idx >= m) return;
+
+    new_xyz += pt_idx * 3;
+    idx += pt_idx * nsample;
+    dist2 += pt_idx * nsample;
+
+    int bt_idx = ball_query_utils::get_bt_idx(pt_idx, new_offset);
+    int start;
+    if (bt_idx == 0)
+        start = 0;
+    else
+        start = offset[bt_idx - 1];
+    int end = offset[bt_idx];
+
+    float max_radius2 = max_radius * max_radius;
+    float min_radius2 = min_radius * min_radius;
+    float new_x = new_xyz[0];
+    float new_y = new_xyz[1];
+    float new_z = new_xyz[2];
+
+    float candi_dist[2048];
+    int candi_idx[2048];
+    int candi_num = 0;
+
+    for(int i = start; i < end; i++){
+        float x = xyz[i * 3 + 0];
+        float y = xyz[i * 3 + 1];
+        float z = xyz[i * 3 + 2];
+        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+
+        if (d2 <= 1e-5 || (d2 >= min_radius2 && d2 < max_radius2)){
+            // TODO: Check d2 <= 1e-5
+            candi_dist[candi_num] = d2;
+            candi_idx[candi_num] = i;
+            candi_num += 1;
+        }
+    }
+    ball_query_utils::heap_sort(candi_dist, candi_idx, candi_num);
+    if(candi_num <= nsample){
+        for(int i = 0; i < candi_num; i++){
+            idx[i] = candi_idx[i];
+            dist2[i] = candi_dist[i];
+        }
+        for(int i = candi_num; i < nsample; i++){
+            idx[i] = -1;
+            dist2[i] = 1e10;
+        }
+    }
+    else{
+        float sep = static_cast<float>(candi_num) / nsample;
+        for(int i = 0; i < nsample; i++)
+        {
+            int index = static_cast<int>(sep * i);
+            idx[i] = candi_idx[index];
+            dist2[i] = candi_dist[index];
+        }
+    }
+}
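+// NOTE: candi_dist / candi_idx above are fixed 2048-entry stack arrays. A query
+// ball containing more than 2048 candidates would overflow them, so callers are
+// assumed to keep per-query candidate counts under that cap.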
+
+/* Random Sample Mode Ball Query */
+
+// __global__ void ball_query_cuda_kernel(int m, int nsample,
+//     float min_radius, float max_radius,
+//     const float *__restrict__ xyz, const float *__restrict__ new_xyz,
+//     const int *__restrict__ offset, const int *__restrict__ new_offset,
+//     int *__restrict__ idx, float *__restrict__ dist2) {
+//     // input: xyz (n, 3) new_xyz (m, 3)
+//     // output: idx (m, nsample) dist (m, nsample)
+//     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (pt_idx >= m) return;
+//
+//     new_xyz += pt_idx * 3;
+//     idx += pt_idx * nsample;
+//     dist2 += pt_idx * nsample;
+//
+//     int bt_idx = ball_query_utils::get_bt_idx(pt_idx, new_offset);
+//     int start;
+//     if (bt_idx == 0)
+//         start = 0;
+//     else
+//         start = offset[bt_idx - 1];
+//     int end = offset[bt_idx];
+//
+//     float max_radius2 = max_radius * max_radius;
+//     float min_radius2 = min_radius * min_radius;
+//     float new_x = new_xyz[0];
+//     float new_y = new_xyz[1];
+//     float new_z = new_xyz[2];
+//
+//     int cnt = 0;
+//     for(int i = start; i < end; i++){
+//         float x = xyz[i * 3 + 0];
+//         float y = xyz[i * 3 + 1];
+//         float z = xyz[i * 3 + 2];
+//         float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+//
+//         if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
+//             if (cnt == 0) {
+//                 for (int l = 0; l < nsample; ++l) {
+//                     idx[l] = i;
+//                     dist2[l] = d2;
+//                 }
+//             }
+//             idx[cnt] = i;
+//             ++cnt;
+//             if (cnt >= nsample) break;
+//         }
+//     }
+// }
+
+
+void ball_query_cuda_launcher(int m, int nsample,
+    float min_radius, float max_radius,
+    const float *xyz, const float *new_xyz,
+    const int *offset, const int *new_offset,
+    int *idx, float *dist2) {
+    // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample)
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    ball_query_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample,
+        min_radius, max_radius,
+        xyz, new_xyz,
+        offset, new_offset,
+        idx, dist2);
+}
diff --git a/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.h b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.h
new file mode 100644
index 000000000..03007a285
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.h
@@ -0,0 +1,26 @@
+#ifndef _BALL_QUERY_CUDA_KERNEL
+#define _BALL_QUERY_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void ball_query_cuda(int m, int nsample,
+    float min_radius, float max_radius,
+    at::Tensor xyz_tensor, at::Tensor new_xyz_tensor,
+    at::Tensor offset_tensor, at::Tensor new_offset_tensor,
+    at::Tensor idx_tensor, at::Tensor dist2_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ball_query_cuda_launcher(int m, int nsample,
+    float min_radius, float max_radius,
+    const float *xyz, const float *new_xyz,
+    const int *offset, const int *new_offset,
+    int *idx, float *dist2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/cuda_utils.h b/projects/PTv3/libs/pointops/src/cuda_utils.h
new file mode 100644
index 000000000..bbfe7a06b
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/cuda_utils.h
@@ -0,0 +1,23 @@
+#ifndef _CUDA_UTILS_H
+#define _CUDA_UTILS_H
+
+#include <cmath>
+#include <algorithm>
+
+#define TOTAL_THREADS 1024
+#define THREADS_PER_BLOCK 512
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+inline int opt_n_threads(int work_size) {
+    const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
+    return std::max(std::min(1 << pow_2, TOTAL_THREADS), 1);
+}
+
+inline dim3 opt_block_config(int x, int y) {
+    const int x_threads = opt_n_threads(x);
+    const int y_threads = std::max(std::min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
+    dim3 block_config(x_threads, y_threads, 1);
+    return block_config;
+}
+
+#endif
diff --git a/projects/PTv3/libs/pointops/src/grouping/grouping_cuda.cpp b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda.cpp
new file mode 100644
index 000000000..6f7990ada
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda.cpp
@@ -0,0 +1,21 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "grouping_cuda_kernel.h"
+
+
+void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
+{
+    const float *input = input_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    grouping_forward_cuda_launcher(m, nsample, c, input, idx, output);
+}
+
+void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor)
+{
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    float *grad_input = grad_input_tensor.data_ptr<float>();
+    grouping_backward_cuda_launcher(m, nsample, c, grad_output, idx, grad_input);
+}
diff --git a/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.cu
new file mode 100644
index 000000000..a5fbd78fa
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.cu
@@ -0,0 +1,40 @@
+#include "../cuda_utils.h"
+#include "grouping_cuda_kernel.h"
+
+
+__global__ void grouping_forward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ input, const int *__restrict__ idx, float *__restrict__ output) {
+    // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= m * nsample * c) return;
+    const int c_idx = index % c;
+    const int nsample_idx = (index / c) % nsample;
+    const int m_idx = index / nsample / c;
+    const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx;
+    output[index] = input[input_idx];
+}
+
+__global__ void grouping_backward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ grad_output, const int *__restrict__ idx, float *__restrict__ grad_input) {
+    // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= m * nsample * c) return;
+    const int c_idx = index % c;
+    const int nsample_idx = (index / c) % nsample;
+    const int m_idx = index / nsample / c;
+    const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx;
+    atomicAdd(grad_input + input_idx, grad_output[index]);
+}
+
+void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output) {
+    // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c)
+    dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    grouping_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample, c, input, idx, output);
+}
+
+void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input)
+{
+    // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c)
+    dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    grouping_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample, c, grad_output, idx, grad_input);
+}
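A pure-PyTorch reference for the grouping pair above — a gather in the forward pass and a scatter-add in the backward pass. Illustrative sketch only, assuming long-typed indices:

    out = input[idx.long()]                            # (m, nsample, c)
    grad_input = torch.zeros_like(input).index_add_(   # (n, c)
        0, idx.reshape(-1).long(), grad_out.reshape(-1, c)
    )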
diff --git a/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.h b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.h
new file mode 100644
index 000000000..3db4aaa9f
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.h
@@ -0,0 +1,20 @@
+#ifndef _GROUPING_CUDA_KERNEL
+#define _GROUPING_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
+void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output);
+void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda.cpp b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda.cpp
new file mode 100644
index 000000000..f2c1b0078
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda.cpp
@@ -0,0 +1,23 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "interpolation_cuda_kernel.h"
+
+
+void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor)
+{
+    const float *input = input_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *output = output_tensor.data_ptr<float>();
+    interpolation_forward_cuda_launcher(n, c, k, input, idx, weight, output);
+}
+
+void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor)
+{
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *grad_input = grad_input_tensor.data_ptr<float>();
+    interpolation_backward_cuda_launcher(n, c, k, grad_output, idx, weight, grad_input);
+}
diff --git a/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.cu
new file mode 100644
index 000000000..f560d8c92
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.cu
@@ -0,0 +1,47 @@
+#include "../cuda_utils.h"
+#include "interpolation_cuda_kernel.h"
+
+
+__global__ void interpolation_forward_cuda_kernel(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output)
+{
+    // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    int c_idx = index % c;
+    int n_idx = index / c;
+    for (int i = 0; i < k; i++)
+    {
+        int idx_idx = n_idx * k + i;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        output[index] += input[input_idx] * weight[idx_idx];
+    }
+}
+
+__global__ void interpolation_backward_cuda_kernel(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input)
+{
+    // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    int c_idx = index % c;
+    int n_idx = index / c;
+    for (int i = 0; i < k; i++)
+    {
+        int idx_idx = n_idx * k + i;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        atomicAdd(grad_input + input_idx, grad_output[index] * weight[idx_idx]);
+    }
+}
+
+void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) {
+    // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    interpolation_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, c, k, input, idx, weight, output);
+}
+
+void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) {
+    // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    interpolation_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, c, k, grad_output, idx, weight, grad_input);
+}
diff --git a/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.h b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.h
new file mode 100644
index 000000000..309e5dd0a
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.h
@@ -0,0 +1,20 @@
+#ifndef _INTERPOLATION_CUDA_KERNEL
+#define _INTERPOLATION_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor);
+void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output);
+void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda.cpp b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda.cpp
new file mode 100644
index 000000000..bbe841ce0
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda.cpp
@@ -0,0 +1,16 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "knn_query_cuda_kernel.h"
+
+
+void knn_query_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
+{
+    const float *xyz = xyz_tensor.data_ptr<float>();
+    const float *new_xyz = new_xyz_tensor.data_ptr<float>();
+    const int *offset = offset_tensor.data_ptr<int>();
+    const int *new_offset = new_offset_tensor.data_ptr<int>();
+    int *idx = idx_tensor.data_ptr<int>();
+    float *dist2 = dist2_tensor.data_ptr<float>();
+    knn_query_cuda_launcher(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
+}
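For intuition, the kernel in the next file is an offset-batched k-nearest-neighbour search with a per-thread max-heap. Ignoring the batching, it matches this dense PyTorch sketch (illustrative only, and quadratic in memory):

    d2 = torch.cdist(new_xyz, xyz).pow(2)                # (m, n) squared distances
    dist2, idx = d2.topk(nsample, dim=1, largest=False)  # ascending, like heap_sort's output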
diff --git a/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.cu
new file mode 100644
index 000000000..297740237
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.cu
@@ -0,0 +1,112 @@
+#include "../cuda_utils.h"
+#include "knn_query_cuda_kernel.h"
+
+
+namespace knn_query_utils{
+
+template <typename DType>
+__device__ void swap(DType *x, DType *y)
+{
+    DType tmp = *x;
+    *x = *y;
+    *y = tmp;
+}
+
+__device__ void reheap(float *dist, int *idx, int k)
+{
+    int root = 0;
+    int child = root * 2 + 1;
+    while (child < k)
+    {
+        if(child + 1 < k && dist[child+1] > dist[child])
+            child++;
+        if(dist[root] > dist[child])
+            return;
+        swap(&dist[root], &dist[child]);
+        swap(&idx[root], &idx[child]);
+        root = child;
+        child = root * 2 + 1;
+    }
+}
+
+
+__device__ void heap_sort(float *dist, int *idx, int k)
+{
+    int i;
+    for (i = k - 1; i > 0; i--)
+    {
+        swap(&dist[0], &dist[i]);
+        swap(&idx[0], &idx[i]);
+        reheap(dist, idx, i);
+    }
+}
+
+
+__device__ int get_bt_idx(int idx, const int *offset)
+{
+    int i = 0;
+    while (1)
+    {
+        if (idx < offset[i])
+            break;
+        else
+            i++;
+    }
+    return i;
+}
+} // namespace knn_query_utils
+
+
+__global__ void knn_query_cuda_kernel(int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, const int *__restrict__ offset, const int *__restrict__ new_offset, int *__restrict__ idx, float *__restrict__ dist2) {
+    // input: xyz (n, 3) new_xyz (m, 3)
+    // output: idx (m, nsample) dist2 (m, nsample)
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pt_idx >= m) return;
+
+    new_xyz += pt_idx * 3;
+    idx += pt_idx * nsample;
+    dist2 += pt_idx * nsample;
+
+    int bt_idx = knn_query_utils::get_bt_idx(pt_idx, new_offset);
+    int start;
+    if (bt_idx == 0)
+        start = 0;
+    else
+        start = offset[bt_idx - 1];
+    int end = offset[bt_idx];
+
+    float new_x = new_xyz[0];
+    float new_y = new_xyz[1];
+    float new_z = new_xyz[2];
+
+    float best_dist[128];
+    int best_idx[128];
+    for(int i = 0; i < nsample; i++){
+        best_dist[i] = 1e10;
+        best_idx[i] = -1;
+    }
+    for(int i = start; i < end; i++){
+        float x = xyz[i * 3 + 0];
+        float y = xyz[i * 3 + 1];
+        float z = xyz[i * 3 + 2];
+        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+        if (d2 < best_dist[0]){
+            best_dist[0] = d2;
+            best_idx[0] = i;
+            knn_query_utils::reheap(best_dist, best_idx, nsample);
+        }
+    }
+    knn_query_utils::heap_sort(best_dist, best_idx, nsample);
+    for(int i = 0; i < nsample; i++){
+        idx[i] = best_idx[i];
+        dist2[i] = best_dist[i];
+    }
+}
+
+
+void knn_query_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2) {
+    // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample)
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    knn_query_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
+}
diff --git a/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.h b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.h
new file mode 100644
index 000000000..c07c1cb46
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.h
@@ -0,0 +1,18 @@
+#ifndef _KNN_QUERY_CUDA_KERNEL
+#define _KNN_QUERY_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void knn_query_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void knn_query_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
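A sketch of how the ops bound in the next file are reached from Python once the extension is built. The module name comes from the extension's build setup, which is not shown here, so treat `pointops_cuda` as an assumed name; shapes follow the kernel comments:

    import torch
    import pointops_cuda  # assumed extension name

    idx = torch.zeros(m, nsample, dtype=torch.int32, device="cuda")
    dist2 = torch.zeros(m, nsample, device="cuda")
    pointops_cuda.knn_query_cuda(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2)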
"sampling/sampling_cuda_kernel.h" +#include "grouping/grouping_cuda_kernel.h" +#include "interpolation/interpolation_cuda_kernel.h" +#include "aggregation/aggregation_cuda_kernel.h" +#include "subtraction/subtraction_cuda_kernel.h" +#include "attention/attention_cuda_kernel.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("knn_query_cuda", &knn_query_cuda, "knn_query_cuda"); + m.def("ball_query_cuda", &ball_query_cuda, "ball_query_cuda"); + m.def("random_ball_query_cuda", &random_ball_query_cuda, "random_ball_query_cuda"); + m.def("farthest_point_sampling_cuda", &farthest_point_sampling_cuda, "farthest_point_sampling_cuda"); + m.def("grouping_forward_cuda", &grouping_forward_cuda, "grouping_forward_cuda"); + m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda"); + m.def("interpolation_forward_cuda", &interpolation_forward_cuda, "interpolation_forward_cuda"); + m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda"); + m.def("subtraction_forward_cuda", &subtraction_forward_cuda, "subtraction_forward_cuda"); + m.def("subtraction_backward_cuda", &subtraction_backward_cuda, "subtraction_backward_cuda"); + m.def("aggregation_forward_cuda", &aggregation_forward_cuda, "aggregation_forward_cuda"); + m.def("aggregation_backward_cuda", &aggregation_backward_cuda, "aggregation_backward_cuda"); + m.def("attention_relation_step_forward_cuda", &attention_relation_step_forward_cuda, "attention_relation_step_forward_cuda"); + m.def("attention_relation_step_backward_cuda", &attention_relation_step_backward_cuda, "attention_relation_step_backward_cuda"); + m.def("attention_fusion_step_forward_cuda", &attention_fusion_step_forward_cuda, "attention_fusion_step_forward_cuda"); + m.def("attention_fusion_step_backward_cuda", &attention_fusion_step_backward_cuda, "attention_fusion_step_backward_cuda"); +} diff --git a/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda.cpp b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda.cpp new file mode 100644 index 000000000..c2618c94b --- /dev/null +++ b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda.cpp @@ -0,0 +1,21 @@ +#include +#include +#include +#include "random_ball_query_cuda_kernel.h" + + +void random_ball_query_cuda(int m, int nsample, + float min_radius, float max_radius, at::Tensor order_tensor, + at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, + at::Tensor offset_tensor, at::Tensor new_offset_tensor, + at::Tensor idx_tensor, at::Tensor dist2_tensor) +{ + const int *order = order_tensor.data_ptr(); + const float *xyz = xyz_tensor.data_ptr(); + const float *new_xyz = new_xyz_tensor.data_ptr(); + const int *offset = offset_tensor.data_ptr(); + const int *new_offset = new_offset_tensor.data_ptr(); + int *idx = idx_tensor.data_ptr(); + float *dist2 = dist2_tensor.data_ptr(); + random_ball_query_cuda_launcher(m, nsample, min_radius, max_radius, order, xyz, new_xyz, offset, new_offset, idx, dist2); +} diff --git a/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.cu new file mode 100644 index 000000000..bfafb0f8b --- /dev/null +++ b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.cu @@ -0,0 +1,123 @@ +#include "../cuda_utils.h" +#include "random_ball_query_cuda_kernel.h" + + +namespace random_ball_query_utils{ + +template +__device__ void swap(DType *x, DType *y) +{ + 
+    DType tmp = *x;
+    *x = *y;
+    *y = tmp;
+}
+
+__device__ void reheap(float *dist, int *idx, int k)
+{
+    int root = 0;
+    int child = root * 2 + 1;
+    while (child < k)
+    {
+        if(child + 1 < k && dist[child+1] > dist[child])
+            child++;
+        if(dist[root] > dist[child])
+            return;
+        swap(&dist[root], &dist[child]);
+        swap(&idx[root], &idx[child]);
+        root = child;
+        child = root * 2 + 1;
+    }
+}
+
+
+__device__ void heap_sort(float *dist, int *idx, int k)
+{
+    int i;
+    for (i = k - 1; i > 0; i--)
+    {
+        swap(&dist[0], &dist[i]);
+        swap(&idx[0], &idx[i]);
+        reheap(dist, idx, i);
+    }
+}
+
+__device__ int get_bt_idx(int idx, const int *offset)
+{
+    int i = 0;
+    while (1)
+    {
+        if (idx < offset[i])
+            break;
+        else
+            i++;
+    }
+    return i;
+}
+} // namespace random_ball_query_utils
+
+__global__ void random_ball_query_cuda_kernel(int m, int nsample,
+    float min_radius, float max_radius, const int *__restrict__ order,
+    const float *__restrict__ xyz, const float *__restrict__ new_xyz,
+    const int *__restrict__ offset, const int *__restrict__ new_offset,
+    int *__restrict__ idx, float *__restrict__ dist2) {
+    // input: xyz (n, 3) new_xyz (m, 3)
+    // output: idx (m, nsample) dist (m, nsample)
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pt_idx >= m) return;
+
+    new_xyz += pt_idx * 3;
+    idx += pt_idx * nsample;
+    dist2 += pt_idx * nsample;
+
+    int bt_idx = random_ball_query_utils::get_bt_idx(pt_idx, new_offset);
+    int start;
+    if (bt_idx == 0)
+        start = 0;
+    else
+        start = offset[bt_idx - 1];
+    int end = offset[bt_idx];
+
+    float max_radius2 = max_radius * max_radius;
+    float min_radius2 = min_radius * min_radius;
+    float new_x = new_xyz[0];
+    float new_y = new_xyz[1];
+    float new_z = new_xyz[2];
+
+    int cnt = 0;
+
+    for(int i = start; i < end; i++){
+        float x = xyz[order[i] * 3 + 0];
+        float y = xyz[order[i] * 3 + 1];
+        float z = xyz[order[i] * 3 + 2];
+        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+
+        if (d2 <= 1e-5 || (d2 >= min_radius2 && d2 < max_radius2)){
+            dist2[cnt] = d2;
+            idx[cnt] = order[i];
+            cnt += 1;
+            if (cnt >= nsample) break;
+        }
+    }
+
+    if (cnt < nsample) {
+        for (int i = cnt; i < nsample; i++){
+            idx[i] = -1;
+            dist2[i] = 1e10;
+        }
+    }
+}
+
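+// Unlike the exhaustive ball query, the kernel above scans points through the
+// caller-supplied `order` permutation and keeps the first nsample hits, so a
+// shuffled `order` yields an approximately uniform sample of each ball without
+// sorting by distance.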
+void random_ball_query_cuda_launcher(int m, int nsample,
+    float min_radius, float max_radius, const int *order,
+    const float *xyz, const float *new_xyz,
+    const int *offset, const int *new_offset,
+    int *idx, float *dist2) {
+    // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample)
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    random_ball_query_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample,
+        min_radius, max_radius, order,
+        xyz, new_xyz,
+        offset, new_offset,
+        idx, dist2);
+}
diff --git a/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.h b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.h
new file mode 100644
index 000000000..d3e35be21
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.h
@@ -0,0 +1,26 @@
+#ifndef _RANDOM_BALL_QUERY_CUDA_KERNEL
+#define _RANDOM_BALL_QUERY_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void random_ball_query_cuda(int m, int nsample,
+    float min_radius, float max_radius, at::Tensor order_tensor,
+    at::Tensor xyz_tensor, at::Tensor new_xyz_tensor,
+    at::Tensor offset_tensor, at::Tensor new_offset_tensor,
+    at::Tensor idx_tensor, at::Tensor dist2_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void random_ball_query_cuda_launcher(int m, int nsample,
+    float min_radius, float max_radius, const int *order,
+    const float *xyz, const float *new_xyz,
+    const int *offset, const int *new_offset,
+    int *idx, float *dist2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/sampling/sampling_cuda.cpp b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda.cpp
new file mode 100644
index 000000000..7dc8094c3
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda.cpp
@@ -0,0 +1,15 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "sampling_cuda_kernel.h"
+
+
+void farthest_point_sampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor)
+{
+    const float *xyz = xyz_tensor.data_ptr<float>();
+    const int *offset = offset_tensor.data_ptr<int>();
+    const int *new_offset = new_offset_tensor.data_ptr<int>();
+    float *tmp = tmp_tensor.data_ptr<float>();
+    int *idx = idx_tensor.data_ptr<int>();
+    farthest_point_sampling_cuda_launcher(b, n, xyz, offset, new_offset, tmp, idx);
+}
diff --git a/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.cu
new file mode 100644
index 000000000..fb240c2d0
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.cu
@@ -0,0 +1,171 @@
+#include "../cuda_utils.h"
+#include "sampling_cuda_kernel.h"
+
+
+__device__ void __update(float *dists, int *dists_i, int idx1, int idx2) {
+    const float v1 = dists[idx1], v2 = dists[idx2];
+    const int i1 = dists_i[idx1], i2 = dists_i[idx2];
+    dists[idx1] = max(v1, v2);
+    dists_i[idx1] = v2 > v1 ? i2 : i1;
+}
+
+// input xyz: (n, 3), tmp: (b, n_max)
+// output: idx (m)
+template <unsigned int block_size>
+__global__ void farthest_point_sampling_cuda_kernel(const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx)
+{
+    __shared__ float dists[block_size];
+    __shared__ int dists_i[block_size];
+
+    int bid = blockIdx.x;
+    int start_n, end_n, start_m, end_m, old;
+    if (bid == 0) {
+        start_n = 0;
+        end_n = offset[0];
+        start_m = 0;
+        end_m = new_offset[0];
+        old = 0;
+    }
+    else {
+        start_n = offset[bid - 1];
+        end_n = offset[bid];
+        start_m = new_offset[bid - 1];
+        end_m = new_offset[bid];
+        old = offset[bid - 1];
+    }
+
+    const int stride = block_size;
+    int tid = threadIdx.x;
+    if (tid == 0) idx[start_m] = start_n;
+
+    __syncthreads();
+    for (int j = start_m + 1; j < end_m; j++)
+    {
+        int besti = start_n;
+        float best = -1;
+        float x1 = xyz[old * 3 + 0];
+        float y1 = xyz[old * 3 + 1];
+        float z1 = xyz[old * 3 + 2];
+        for (int k = start_n + tid; k < end_n; k += stride)
+        {
+            float x2 = xyz[k * 3 + 0];
+            float y2 = xyz[k * 3 + 1];
+            float z2 = xyz[k * 3 + 2];
+            float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
+            float d2 = min(d, tmp[k]);
+            tmp[k] = d2;
+            besti = d2 > best ? k : besti;
+            best = d2 > best ? d2 : best;
+        }
+        dists[tid] = best;
+        dists_i[tid] = besti;
+        __syncthreads();
+
+        if (block_size >= 1024) {
+            if (tid < 512) {
+                __update(dists, dists_i, tid, tid + 512);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 512) {
+            if (tid < 256) {
+                __update(dists, dists_i, tid, tid + 256);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 256) {
+            if (tid < 128) {
+                __update(dists, dists_i, tid, tid + 128);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 128) {
+            if (tid < 64) {
+                __update(dists, dists_i, tid, tid + 64);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 64) {
+            if (tid < 32) {
+                __update(dists, dists_i, tid, tid + 32);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 32) {
+            if (tid < 16) {
+                __update(dists, dists_i, tid, tid + 16);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 16) {
+            if (tid < 8) {
+                __update(dists, dists_i, tid, tid + 8);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 8) {
+            if (tid < 4) {
+                __update(dists, dists_i, tid, tid + 4);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 4) {
+            if (tid < 2) {
+                __update(dists, dists_i, tid, tid + 2);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 2) {
+            if (tid < 1) {
+                __update(dists, dists_i, tid, tid + 1);
+            }
+            __syncthreads();
+        }
+
+        old = dists_i[0];
+        if (tid == 0)
+            idx[j] = old;
+    }
+}
+
+void farthest_point_sampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx)
+{
+    unsigned int n_threads = opt_n_threads(n);
+    switch (n_threads) {
+        case 1024:
+            farthest_point_sampling_cuda_kernel<1024><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 512:
+            farthest_point_sampling_cuda_kernel<512><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 256:
+            farthest_point_sampling_cuda_kernel<256><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 128:
+            farthest_point_sampling_cuda_kernel<128><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 64:
+            farthest_point_sampling_cuda_kernel<64><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 32:
+            farthest_point_sampling_cuda_kernel<32><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 16:
+            farthest_point_sampling_cuda_kernel<16><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 8:
+            farthest_point_sampling_cuda_kernel<8><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 4:
+            farthest_point_sampling_cuda_kernel<4><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 2:
+            farthest_point_sampling_cuda_kernel<2><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 1:
+            farthest_point_sampling_cuda_kernel<1><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        default:
+            farthest_point_sampling_cuda_kernel<512><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+    }
+}
diff --git a/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.h b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.h
new file mode 100644
index 000000000..f0e076073
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.h
@@ -0,0 +1,18 @@
+#ifndef _SAMPLING_CUDA_KERNEL
+#define _SAMPLING_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void farthest_point_sampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void farthest_point_sampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
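One caller-side detail worth noting for the FPS kernel above: `tmp` holds the running minimum squared distance from each point to the already-selected set and is only ever min()'d into, so the Python wrapper is assumed to pre-fill it with a large value, e.g.:

    tmp = torch.full((n,), 1e10, device="cuda")  # running min-distance buffer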
diff --git a/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda.cpp b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda.cpp
new file mode 100644
index 000000000..b343857a1
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda.cpp
@@ -0,0 +1,23 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "subtraction_cuda_kernel.h"
+
+
+void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
+{
+    const float *input1 = input1_tensor.data_ptr<float>();
+    const float *input2 = input2_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    subtraction_forward_cuda_launcher(n, nsample, c, input1, input2, idx, output);
+}
+
+void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor)
+{
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    float *grad_input1 = grad_input1_tensor.data_ptr<float>();
+    float *grad_input2 = grad_input2_tensor.data_ptr<float>();
+    subtraction_backward_cuda_launcher(n, nsample, c, idx, grad_output, grad_input1, grad_input2);
+}
diff --git a/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.cu
new file mode 100644
index 000000000..0848a455d
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.cu
@@ -0,0 +1,44 @@
+#include "../cuda_utils.h"
+#include "subtraction_cuda_kernel.h"
+
+
+__global__ void subtraction_forward_cuda_kernel(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) {
+    // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * nsample * c) return;
+    const int c_idx = index % c;
+    const int nsample_idx = (index / c) % nsample;
+    const int n_idx = index / nsample / c;
+    const int idx_idx = n_idx * nsample + nsample_idx;
+    const int input1_idx = n_idx * c + c_idx;
+    const int input2_idx = idx[idx_idx] * c + c_idx;
+    output[index] = input1[input1_idx] - input2[input2_idx];
+}
+
+__global__ void subtraction_backward_cuda_kernel(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) {
+    // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * nsample * c) return;
+    const int c_idx = index % c;
+    const int nsample_idx = (index / c) % nsample;
+    const int n_idx = index / nsample / c;
+    const int idx_idx = n_idx * nsample + nsample_idx;
+    const int input1_idx = n_idx * c + c_idx;
+    const int input2_idx = idx[idx_idx] * c + c_idx;
+    atomicAdd(grad_input1 + input1_idx, grad_output[index]);
+    atomicAdd(grad_input2 + input2_idx, -grad_output[index]);
+}
+
+void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) {
+    // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c)
+    dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    subtraction_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, nsample, c, input1, input2, idx, output);
+}
+
+void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) {
+    // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c)
+    dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    subtraction_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, nsample, c, idx, grad_output, grad_input1, grad_input2);
+}
diff --git a/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.h b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.h
new file mode 100644
index 000000000..856133d97
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.h
@@ -0,0 +1,20 @@
+#ifndef _SUBTRACTION_CUDA_KERNEL
+#define _SUBTRACTION_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
+void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output);
+void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointrope/__init__.py b/projects/PTv3/libs/pointrope/__init__.py
new file mode 100644
index 000000000..c8f754883
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/__init__.py
@@ -0,0 +1 @@
+from .functions import *
diff --git a/projects/PTv3/libs/pointrope/functions/__init__.py b/projects/PTv3/libs/pointrope/functions/__init__.py
new file mode 100644
index 000000000..bfe3ef321
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/functions/__init__.py
@@ -0,0 +1 @@
+from .pointrope import PointROPE
diff --git a/projects/PTv3/libs/pointrope/functions/pointrope.py b/projects/PTv3/libs/pointrope/functions/pointrope.py
new file mode 100644
index 000000000..7065098a5
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/functions/pointrope.py
@@ -0,0 +1,116 @@
+import torch
+from torch.autograd import Function
+from torch.onnx.symbolic_helper import _get_tensor_sizes
+
+try:
+    import pointrope_cuda as _kernels  # run `python setup.py install`
+except ModuleNotFoundError:
+    from . import pointrope_cuda as _kernels  # run `python setup.py build_ext --inplace`
+
+
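+# A rotation by angle theta is orthogonal, so its inverse is the rotation by
+# -theta. backward() below relies on this: applying the same kernel with -F0
+# rotates the incoming gradient by the negated angles, which is exactly the
+# transpose of the forward rotation applied to grad.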
+class PointROPE_func(Function):
+    @staticmethod
+    def symbolic(g, tokens, positions, base, F0=1.0):
+        output = g.op(
+            "litept::PointRoPE",
+            tokens,
+            positions,
+            base_f=float(base),
+            f0_f=float(F0),
+            outputs=1,
+        )
+        token_shape = _get_tensor_sizes(tokens)
+        # PointRoPE is in-place in forward() and returns a tensor with the same shape as `tokens`.
+        # Preserve the full [B, N, H, D] shape for ONNX shape inference when available.
+        if token_shape is not None and hasattr(output.type(), "with_sizes"):
+            output_type = tokens.type().with_sizes(token_shape)
+            output.setType(output_type)
+
+        return output
+
+    @staticmethod
+    def forward(ctx, tokens, positions, base, F0=1.0):
+        ctx.save_for_backward(positions)
+        ctx.saved_base = base
+        ctx.saved_F0 = F0
+        # tokens = tokens.clone()  # uncomment this if inplace doesn't work
+        _kernels.pointrope(tokens, positions, base, F0)
+        ctx.mark_dirty(tokens)
+        return tokens
+
+    @staticmethod
+    def backward(ctx, grad_res):
+        positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
+        _kernels.pointrope(grad_res, positions, base, -F0)
+        ctx.mark_dirty(grad_res)
+        return grad_res, None, None, None
+
+
+# class PointROPE(torch.nn.Module):
+#     def __init__(self, freq=100.0, F0=1.0):
+#         super().__init__()
+#         self.base = freq
+#         self.F0 = F0
+
+#     def forward(self, tokens, positions):
+#         PointROPE_func.apply(tokens.transpose(1, 2), positions, self.base, self.F0)
+#         return tokens
+
+
+class PointROPE(torch.nn.Module):
+    def __init__(self, freq=100.0, F0=1.0):
+        super().__init__()
+        self.base = freq
+        self.F0 = F0
+
+    def forward(self, tokens: torch.Tensor, positions: torch.Tensor):
+        """Pure-PyTorch 3D RoPE: rotate `tokens` ([B, H, N, D], D % 6 == 0) by
+        per-axis angles derived from `positions` ([B, N, 3] or [B*N, 3]).
+        Returns a tensor of the same shape as `tokens`."""
+        tokens = tokens.transpose(1, 2)
+        assert tokens.dim() == 4, tokens.shape
+        B, N, H, D = tokens.shape
+        assert D % 6 == 0
+        Q = D // 6
+
+        # pos -> [B, N, 3]
+        if positions.dim() == 2:
+            assert positions.shape[0] == B * N and positions.shape[1] == 3
+            pos_bn3 = positions.view(B, N, 3)
+        else:
+            assert positions.shape == (B, N, 3), f"{positions.shape=} vs {tokens.shape=}"
+            pos_bn3 = positions
+
+        # inv_freq: [Q]
+        # inv_freq[q] = F0 / base^(q/Q)
+        q = torch.arange(Q, device=tokens.device, dtype=tokens.dtype)
+        inv_freq = torch.tensor(float(self.F0), device=tokens.device, dtype=tokens.dtype) / (
+            torch.tensor(float(self.base), device=tokens.device, dtype=tokens.dtype) ** (q / float(Q))
+        )
+
+        # freq: [B, N, 3, Q]
+        pos_f = pos_bn3.to(dtype=tokens.dtype)
+        freq = pos_f.unsqueeze(-1) * inv_freq.view(1, 1, 1, Q)
+
+        cos = torch.cos(freq)  # [B, N, 3, Q]
+        sin = torch.sin(freq)  # [B, N, 3, Q]
+
+        # split tokens into six Q-sized blocks: each [B, N, H, Q]
+        ux, vx, uy, vy, uz, vz = tokens.split(Q, dim=-1)
+
+        # broadcast each [B, N, 1, Q] across heads, per axis
+        cx = cos[:, :, 0, :].unsqueeze(2)
+        sx = sin[:, :, 0, :].unsqueeze(2)
+        cy = cos[:, :, 1, :].unsqueeze(2)
+        sy = sin[:, :, 1, :].unsqueeze(2)
+        cz = cos[:, :, 2, :].unsqueeze(2)
+        sz = sin[:, :, 2, :].unsqueeze(2)
+
+        ru_x = ux * cx - vx * sx
+        rv_x = vx * cx + ux * sx
+
+        ru_y = uy * cy - vy * sy
+        rv_y = vy * cy + uy * sy
+
+        ru_z = uz * cz - vz * sz
+        rv_z = vz * cz + uz * sz
+
+        return torch.cat([ru_x, rv_x, ru_y, rv_y, ru_z, rv_z], dim=-1).transpose(1, 2)
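A quick shape check for the pure-PyTorch path above, with hypothetical sizes (any D divisible by 6 works):

    rope = PointROPE(freq=100.0)
    tok = torch.randn(2, 4, 128, 36)          # [B, H, N, D], D % 6 == 0
    pos = torch.randint(0, 64, (2, 128, 3))   # integer grid coordinates, [B, N, 3]
    out = rope(tok, pos)                      # same shape as tok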
diff --git a/projects/PTv3/libs/pointrope/pyproject.toml b/projects/PTv3/libs/pointrope/pyproject.toml
new file mode 100644
index 000000000..233cba7ca
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "pointrope"
+version = "0.1.0"
+requires-python = "==3.10.*"
+dependencies = []
+
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["."]
+exclude = []
diff --git a/projects/PTv3/libs/pointrope/setup.py b/projects/PTv3/libs/pointrope/setup.py
new file mode 100644
index 000000000..0c8adc396
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/setup.py
@@ -0,0 +1,35 @@
+from setuptools import setup
+from torch import cuda
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+# compile for all possible CUDA architectures
+# all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split()
+# alternatively, you can list cuda archs that you want, eg:
+# check https://developer.nvidia.com/cuda-gpus to find your arch
+# fmt: off
+all_cuda_archs = [
+    "-gencode", "arch=compute_90,code=sm_90",
+    "-gencode", "arch=compute_75,code=sm_75",
+    "-gencode", "arch=compute_80,code=sm_80",
+    "-gencode", "arch=compute_86,code=sm_86",
+    "-gencode", "arch=compute_89,code=sm_89",
+]
+# fmt: on
+
+setup(
+    name="pointrope",
+    ext_modules=[
+        CUDAExtension(
+            name="pointrope_cuda",
+            sources=[
+                "src/pointrope.cpp",
+                "src/kernels.cu",
+            ],
+            extra_compile_args=dict(
+                nvcc=["-O3", "--ptxas-options=-v", "--use_fast_math"] + all_cuda_archs,
+                cxx=["-O3"],
+            ),
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/projects/PTv3/libs/pointrope/src/kernels.cu b/projects/PTv3/libs/pointrope/src/kernels.cu
new file mode 100644
index 000000000..1372465c2
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/src/kernels.cu
@@ -0,0 +1,94 @@
+
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+#define CHECK_CUDA(tensor) {\
+    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
+    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
+void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error));}
+
+
+template <typename scalar_t>
+__global__ void pointrope_cuda_kernel(
+    torch::PackedTensorAccessor32<scalar_t, 4, torch::RestrictPtrTraits> tokens,
+    const int64_t* __restrict__ pos,
+    const float base,
+    const float fwd)
+{
+    // tokens shape = (B, N, H, D)
+    const int N = tokens.size(1);
+    const int H = tokens.size(2);
+    const int D = tokens.size(3);
+
+    // each block updates a single token, for all heads
+    // each thread takes care of a single output
+    extern __shared__ float shared[];
+    float* shared_inv_freq = shared + D;
+
+    const int b = blockIdx.x / N;
+    const int n = blockIdx.x % N;
+
+    const int Q = D / 6;  // e.g. D = 18, Q = 3
+    // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..4Q : 4Q..5Q : 5Q..D]
+    //              u_X    v_X     u_Y      v_Y      u_Z      v_Z
+
+    // shared memory: first, compute inv_freq
+    if (threadIdx.x < Q)
+        shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
+    __syncthreads();
+
+    // range of threadIdx.x is [0, 1, ..., D-1] (17 in the D = 18 example)
+
+    // start of the X, Y or Z part
+    const int X = threadIdx.x * 3 / D;
+    const int m = (X*D/3) + (threadIdx.x % Q);  // index of the u component for this axis
+
+    // grab the cos,sin
+    const float freq = pos[blockIdx.x*3+X] * shared_inv_freq[threadIdx.x % Q];
+    const float cos = cosf(freq);
+    const float sin = sinf(freq);
+
+
+    for (int h = 0; h < H; h++)
+    {
+        // load all the token for this head in shared memory
+        shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
+        __syncthreads();
+
+        const float u = shared[m];
+        const float v = shared[m+Q];
+
+        // write output
+        if ((threadIdx.x % (D/3)) < Q)
+            tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
+        else
+            tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
+
+        // make sure every thread has finished reading `shared` before the
+        // next head's load overwrites it
+        __syncthreads();
+    }
+}
+
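+// Host-side launcher. Each block handles one token across all H heads; the
+// dynamic shared buffer stores one head's D channels plus the D/6 inverse
+// frequencies (see SHARED_MEM below).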
diff --git a/projects/PTv3/libs/pointrope/src/kernels.cu b/projects/PTv3/libs/pointrope/src/kernels.cu
new file mode 100644
index 000000000..1372465c2
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/src/kernels.cu
@@ -0,0 +1,94 @@
+
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+#define CHECK_CUDA(tensor) {\
+    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
+    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
+void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error));}
+
+
+template <typename scalar_t>
+__global__ void pointrope_cuda_kernel(
+    torch::PackedTensorAccessor32<scalar_t, 4, torch::RestrictPtrTraits> tokens,
+    const int64_t* __restrict__ pos,
+    const float base,
+    const float fwd)
+{
+    // tokens shape = (B, N, H, D)
+    const int N = tokens.size(1);
+    const int H = tokens.size(2);
+    const int D = tokens.size(3);
+
+    // each block updates a single token, for all heads
+    // each thread takes care of a single output
+    extern __shared__ float shared[];
+    float* shared_inv_freq = shared + D;
+
+    const int b = blockIdx.x / N;
+    const int n = blockIdx.x % N;
+
+    const int Q = D / 6;  // e.g. D = 18, Q = 3
+    // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..4Q : 4Q..5Q : 5Q..D]
+    //              u_X    v_X     u_Y      v_Y      u_Z      v_Z
+
+    // shared memory: first, compute inv_freq
+    if (threadIdx.x < Q)
+        shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x / float(Q));
+    __syncthreads();
+
+    // threadIdx.x ranges over [0, 1, ..., D-1] (e.g. 17 for D = 18)
+
+    // which axis (X, Y or Z) this thread handles
+    const int X = threadIdx.x * 3 / D;
+    const int m = (X * D / 3) + (threadIdx.x % Q);  // index of u_X, u_Y or u_Z
+
+    // grab the cos,sin
+    const float freq = pos[blockIdx.x * 3 + X] * shared_inv_freq[threadIdx.x % Q];
+    const float cos = cosf(freq);
+    const float sin = sinf(freq);
+
+    for (int h = 0; h < H; h++)
+    {
+        // load the whole token for this head into shared memory
+        shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
+        __syncthreads();
+
+        const float u = shared[m];
+        const float v = shared[m + Q];
+
+        // write output
+        if ((threadIdx.x % (D / 3)) < Q)
+            tokens[b][n][h][threadIdx.x] = u * cos - v * sin;
+        else
+            tokens[b][n][h][threadIdx.x] = v * cos + u * sin;
+        // editor's fix: barrier so no thread reloads shared for the next head
+        // while another thread is still reading this head's values
+        __syncthreads();
+    }
+}
+
+void pointrope_cuda(torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd)
+{
+    const int B = tokens.size(0);  // batch size
+    const int N = tokens.size(1);  // sequence length
+    const int H = tokens.size(2);  // number of heads
+    const int D = tokens.size(3);  // dimension per head
+
+    TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
+    TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
+    TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 3, "bad pos.shape");
+    TORCH_CHECK(D % 6 == 0, "token dim must be multiple of 6");
+
+    // one block per token, one thread per channel
+    const int THREADS_PER_BLOCK = D;
+    const int N_BLOCKS = B * N;  // each block takes care of H*D values
+    const int SHARED_MEM = sizeof(float) * (D + D / 6);
+
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::kHalf, at::kBFloat16, tokens.scalar_type(), "pointrope_cuda", ([&] {
+        pointrope_cuda_kernel<scalar_t><<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>>(
+            tokens.packed_accessor32<scalar_t, 4, torch::RestrictPtrTraits>(),
+            pos.data_ptr<int64_t>(),
+            base, fwd);
+    }));
+}
diff --git a/projects/PTv3/libs/pointrope/src/pointrope.cpp b/projects/PTv3/libs/pointrope/src/pointrope.cpp
new file mode 100644
index 000000000..1b620b0f5
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/src/pointrope.cpp
@@ -0,0 +1,66 @@
+
+
+#include <torch/extension.h>
+
+// forward declaration
+void pointrope_cuda(torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd);
+
+void pointrope_cpu(torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd)
+{
+    const int B = tokens.size(0);
+    const int N = tokens.size(1);
+    const int H = tokens.size(2);      // number of heads, e.g. 2
+    const int D = tokens.size(3) / 6;  // if dimension per head is 18, then D = 3
+
+    auto tok = tokens.accessor<float, 4>();
+    auto pos = positions.accessor<int64_t, 3>();
+
+    for (int b = 0; b < B; b++) {
+        for (int x = 0; x < 3; x++) {  // x, then y, then z (3d)
+            for (int n = 0; n < N; n++) {
+
+                // grab the token position along this axis
+                const int p = pos[b][n][x];
+
+                for (int h = 0; h < H; h++) {
+                    for (int d = 0; d < D; d++) {
+                        // grab the two values
+                        float u = tok[b][n][h][d + 0 + x * 2 * D];
+                        float v = tok[b][n][h][d + D + x * 2 * D];
+
+                        // grab the cos,sin
+                        const float freq = fwd * p / powf(base, d / float(D));
+                        float c = cosf(freq);
+                        float s = sinf(freq);
+
+                        // write the result
+                        tok[b][n][h][d + 0 + x * 2 * D] = u * c - v * s;
+                        tok[b][n][h][d + D + x * 2 * D] = v * c + u * s;
+                    }
+                }
+            }
+        }
+    }
+}
+
+void pointrope(torch::Tensor tokens,
+               const torch::Tensor positions,
+               const float base,
+               const float fwd)
+{
+    TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
+    TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
+    TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
+    TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
+    TORCH_CHECK(positions.size(2) == 3, "positions.shape[2] must be equal to 3");
+    TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device");
+
+    if (tokens.is_cuda())
+        pointrope_cuda(tokens, positions, base, fwd);
+    else
+        pointrope_cpu(tokens, positions, base, fwd);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("pointrope", &pointrope, "PointROPE forward/backward");
+}
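+
+// Editor's note (illustrative): the CPU path uses accessor<float, 4>, so it only
+// handles float32 tensors; the CUDA path in kernels.cu dispatches over float,
+// half and bfloat16 via AT_DISPATCH_FLOATING_TYPES_AND2.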
diff --git a/projects/PTv3/models/__init__.py b/projects/PTv3/models/__init__.py
index 6dd1f71b2..a569efe8d 100644
--- a/projects/PTv3/models/__init__.py
+++ b/projects/PTv3/models/__init__.py
@@ -1,5 +1,6 @@
 from .builder import build_model
 from .default import *
+from .litept import *
 
 # Pretraining
 from .point_prompt_training import *
diff --git a/projects/PTv3/models/litept/__init__.py b/projects/PTv3/models/litept/__init__.py
new file mode 100644
index 000000000..56c8ac6bf
--- /dev/null
+++ b/projects/PTv3/models/litept/__init__.py
@@ -0,0 +1 @@
+from .litept import *  # noqa
diff --git a/projects/PTv3/models/litept/litept.py b/projects/PTv3/models/litept/litept.py
new file mode 100644
index 000000000..6a1fadac2
--- /dev/null
+++ b/projects/PTv3/models/litept/litept.py
@@ -0,0 +1,598 @@
+from functools import partial
+
+import flash_attn
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch_scatter
+from addict import Dict
+from libs.pointrope import PointROPE
+from models.builder import MODELS
+from models.modules import MLP, Embedding, PointModule, PointSequential
+from models.scatter.functional import argsort, segment_csr, unique
+from models.utils.structure import Point
+from timm.layers import DropPath
+
+
+class PointROPEAttention(PointModule):
+    def __init__(
+        self,
+        channels,
+        num_heads,
+        patch_size,
+        rope_freq,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        order_index=0,
+    ):
+        super().__init__()
+        assert channels % num_heads == 0
+        self.channels = channels
+        self.num_heads = num_heads
+        self.scale = qk_scale or (channels // num_heads) ** -0.5
+        self.order_index = order_index
+
+        self.patch_size = patch_size
+        self.attn_drop = attn_drop
+
+        self.qkv = torch.nn.Linear(channels, channels * 3, bias=qkv_bias)
+        self.proj = torch.nn.Linear(channels, channels)
+        self.proj_drop = torch.nn.Dropout(proj_drop)
+
+        # pointrope
+        self.rope = PointROPE(freq=rope_freq)
+
+    def forward(self, point: Point):
+        H = self.num_heads
+        K = self.patch_size
+        C = self.channels
+
+        pad, unpad, cu_seqlens = point.get_padding_and_inverse(self.patch_size)
+
+        order = point.serialized_order[self.order_index][pad]
+        inverse = unpad[point.serialized_inverse[self.order_index]]
+
+        # pad and reshape feat and batch for the serialized point patches
+        qkv = self.qkv(point.feat)[order]  # [N, C]
+
+        # apply pointrope
+        pos = point.grid_coord[order]  # [N, 3]
+        pos = pos.reshape(-1, 3).unsqueeze(0)
+
+        q, k, v = qkv.half().chunk(3, dim=-1)
+        q = q.reshape(-1, H, C // H).transpose(0, 1)[None]  # [1, H, N, head_dim]
+        k = k.reshape(-1, H, C // H).transpose(0, 1)[None]  # [1, H, N, head_dim]
+
+        # workaround to keep the pointrope cuda kernel in float32
+        q = self.rope(q.float(), pos).to(q.dtype)  # [1, H, N, head_dim]
+        k = self.rope(k.float(), pos).to(k.dtype)  # [1, H, N, head_dim]
+
+        # assemble input for flash attention
+        qkv_rotated = torch.stack(
+            [
+                q.squeeze(0).transpose(0, 1),
+                k.squeeze(0).transpose(0, 1),
+                v.reshape(-1, H, C // H),
+            ],
+            dim=1,
+        )  # [N, 3, H, head_dim]
+
+        if torch.onnx.is_in_onnx_export():
+            assert (qkv_rotated.shape[0] % K) == 0
+            # encode and reshape qkv: (N', K, 3, H, C') => (3, N', H, K, C')
+            q, k, v = qkv_rotated.reshape(-1, K, 3, H, C // H).permute(2, 0, 3, 1, 4).unbind(dim=0)
+            # attn
+            attn = (q * self.scale) @ k.transpose(-2, -1)  # (N', H, K, K)
+            attn = F.softmax(attn, dim=-1)
+            feat = (attn @ v).transpose(1, 2).reshape(-1, C)
+        else:
+            feat = flash_attn.flash_attn_varlen_qkvpacked_func(
+                qkv_rotated,
+                cu_seqlens,
+                max_seqlen=self.patch_size,
+                dropout_p=self.attn_drop if self.training else 0,
+                softmax_scale=self.scale,
+            ).reshape(-1, C)
+
+        feat = feat.to(qkv.dtype)
+        feat = feat[inverse]
+
+        # output projection
+        feat = self.proj(feat)
+        feat = self.proj_drop(feat)
+        point.feat = feat
+        return point
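+
+# Editor's note (illustrative): flash_attn_varlen_qkvpacked_func consumes qkv
+# packed as [total_tokens, 3, num_heads, head_dim] plus int32 cu_seqlens patch
+# boundaries, which is why q and k are rotated as [1, H, N, head_dim] above and
+# then re-stacked with v before the call.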
+
+
+class Block(PointModule):
+    def __init__(
+        self,
+        channels,
+        num_heads,
+        patch_size=48,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        drop_path=0.0,
+        norm_layer=nn.LayerNorm,
+        act_layer=nn.GELU,
+        pre_norm=True,
+        order_index=0,
+        cpe_indice_key=None,
+        enable_conv=True,
+        enable_attn=True,
+        rope_freq=100.0,
+        export_mode=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.pre_norm = pre_norm
+        self.export_mode = export_mode
+
+        self.enable_conv = enable_conv
+        self.enable_attn = enable_attn
+
+        if self.enable_conv:
+            if export_mode:
+                from SparseConvolution.sparse_conv import SubMConv3d
+            else:
+                from spconv.pytorch import SubMConv3d
+
+            self.conv = PointSequential(
+                SubMConv3d(
+                    channels,
+                    channels,
+                    kernel_size=3,
+                    bias=True,
+                    indice_key=cpe_indice_key,
+                ),
+                nn.Linear(channels, channels),
+                norm_layer(channels),
+            )
+        else:
+            self.norm0 = PointSequential(
+                norm_layer(channels),
+            )
+
+        if self.enable_attn:
+            self.norm1 = PointSequential(norm_layer(channels))
+            self.attn = PointROPEAttention(
+                channels=channels,
+                patch_size=patch_size,
+                rope_freq=rope_freq,
+                num_heads=num_heads,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                attn_drop=attn_drop,
+                proj_drop=proj_drop,
+                order_index=order_index,
+            )
+        self.norm2 = PointSequential(norm_layer(channels))
+        self.mlp = PointSequential(
+            MLP(
+                in_channels=channels,
+                hidden_channels=int(channels * mlp_ratio),
+                out_channels=channels,
+                act_layer=act_layer,
+                drop=proj_drop,
+            )
+        )
+        self.drop_path = PointSequential(DropPath(drop_path) if drop_path > 0.0 else nn.Identity())
+
+    def forward(self, point: Point):
+        if self.enable_conv:
+            shortcut = point.feat
+            point = self.conv(point)
+            point.feat = shortcut + point.feat
+        else:
+            point = self.norm0(point)
+
+        if self.enable_attn:
+            shortcut = point.feat
+            if self.pre_norm:
+                point = self.norm1(point)
+            point = self.drop_path(self.attn(point))
+            point.feat = shortcut + point.feat
+            if not self.pre_norm:
+                point = self.norm1(point)
+
+        shortcut = point.feat
+        if self.pre_norm:
+            point = self.norm2(point)
+        point = self.drop_path(self.mlp(point))
+        point.feat = shortcut + point.feat
+        if not self.pre_norm:
+            point = self.norm2(point)
+
+        point.sparse_conv_feat = point.sparse_conv_feat.replace_feature(point.feat)
+        return point
+
+
+class GridPooling(PointModule):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        stride=2,
+        norm_layer=None,
+        act_layer=None,
+        reduce="max",
+        shuffle_orders=True,
+        traceable=True,  # record parent and cluster
+        re_serialization=False,
+        serialization_order="z",
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.stride = stride
+        assert reduce in ["sum", "mean", "min", "max"]
+        self.reduce = reduce
+        self.shuffle_orders = shuffle_orders
+        self.traceable = traceable
+
+        self.proj = nn.Linear(in_channels, out_channels)
+        # editor's fix: default to None so the checks in forward never hit an
+        # undefined attribute when no norm/act layer is configured
+        self.norm = PointSequential(norm_layer(out_channels)) if norm_layer is not None else None
+        self.act = PointSequential(act_layer()) if act_layer is not None else None
+
+        self.re_serialization = re_serialization
+        self.serialization_order = serialization_order
+
+    def forward(self, point: Point):
+        if "grid_coord" in point.keys():
+            grid_coord = point.grid_coord
+        elif {"coord", "grid_size"}.issubset(point.keys()):
+            grid_coord = torch.div(
+                point.coord - point.coord.min(0)[0],
+                point.grid_size,
+                rounding_mode="trunc",
+            ).int()
+        else:
+            raise AssertionError("[grid_coord] or [coord, grid_size] should be included in the Point")
+        grid_coord = torch.div(grid_coord, self.stride, rounding_mode="trunc")
+
+        # Pack batch id and coordinates into a single 1D key for clustering.
+        # NOTE(original): grid_coord = torch.bitwise_or(grid_coord, point.batch.view(-1, 1) << 48)
+        grid_coord_i = grid_coord.to(torch.int64)
+        gx = grid_coord_i[:, 0]
+        gy = grid_coord_i[:, 1]
+        gz = grid_coord_i[:, 2]
+        b = point.batch.to(torch.int64)
+        packed = gx + gy * (1 << 16) + gz * (1 << 32) + b * (1 << 48)
+
+        if torch.onnx.is_in_onnx_export():
+            unique_keys, cluster, counts, _ = unique(packed)
+            # Sort points by cluster id (inverse_indices) to make them contiguous per cluster.
+            indices = argsort(cluster)
+        else:
+            unique_keys, cluster, counts = torch.unique(
+                packed,
+                sorted=True,
+                return_inverse=True,
+                return_counts=True,
+            )
+            # indices of points sorted by cluster, for torch_scatter.segment_csr
+            _, indices = torch.sort(cluster)
+
+        # Unpack to (M, 3) grid coords (drop the batch component)
+        # NOTE(original): grid_coord = torch.bitwise_and(grid_coord, ((1 << 48) - 1))
+        key_wo_batch = torch.remainder(unique_keys, (1 << 48))
+        gx = torch.remainder(key_wo_batch, (1 << 16))
+        gy = torch.remainder(torch.div(key_wo_batch, (1 << 16), rounding_mode="trunc"), (1 << 16))
+        gz = torch.remainder(torch.div(key_wo_batch, (1 << 32), rounding_mode="trunc"), (1 << 16))
+        grid_coord = torch.stack([gx, gy, gz], dim=1).to(grid_coord_i.dtype)
+
+        # index pointer for the sorted points, for torch_scatter.segment_csr
+        idx_ptr = torch.cat([counts.new_zeros(1), torch.cumsum(counts, dim=0)])
+        # head index of each cluster, used to reduce attrs e.g. code, batch
+        head_indices = indices[idx_ptr[:-1]]
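+
+        # Editor's note (illustrative): with the stride already applied, a point
+        # at grid (x=3, y=5, z=2) in batch 1 packs to
+        #   3 + 5 * 2**16 + 2 * 2**32 + 1 * 2**48 = 281483566972931
+        # and the mod/div unpacking above recovers (3, 5, 2); this assumes every
+        # grid coordinate fits in 16 bits.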
+
+        if not torch.onnx.is_in_onnx_export():
+            scatter_feat = torch_scatter.segment_csr(self.proj(point.feat)[indices], idx_ptr, reduce=self.reduce)
+            scatter_coord = torch_scatter.segment_csr(point.coord[indices], idx_ptr, reduce="mean")
+        else:
+            scatter_feat = segment_csr(self.proj(point.feat)[indices], idx_ptr, self.reduce)
+            scatter_coord = segment_csr(point.coord[indices], idx_ptr, "mean")
+
+        point_dict = Dict(
+            feat=scatter_feat,
+            coord=scatter_coord,
+            grid_coord=grid_coord,
+            batch=point.batch[head_indices],
+        )
+
+        if "origin_coord" in point.keys():
+            if not torch.onnx.is_in_onnx_export():
+                point_dict["origin_coord"] = torch_scatter.segment_csr(
+                    point.origin_coord[indices], idx_ptr, reduce="mean"
+                )
+            else:
+                point_dict["origin_coord"] = segment_csr(point.origin_coord[indices], idx_ptr, "mean")
+        if "condition" in point.keys():
+            point_dict["condition"] = point.condition
+        if "context" in point.keys():
+            point_dict["context"] = point.context
+        if "name" in point.keys():
+            point_dict["name"] = point.name
+        if "split" in point.keys():
+            point_dict["split"] = point.split
+        if "color" in point.keys():
+            if not torch.onnx.is_in_onnx_export():
+                point_dict["color"] = torch_scatter.segment_csr(point.color[indices], idx_ptr, reduce="mean")
+            else:
+                point_dict["color"] = segment_csr(point.color[indices], idx_ptr, "mean")
+        if "grid_size" in point.keys():
+            point_dict["grid_size"] = point.grid_size * self.stride
+        if "mask" in point.keys():
+            if not torch.onnx.is_in_onnx_export():
+                point_dict["mask"] = (
+                    torch_scatter.segment_csr(point.mask[indices].float(), idx_ptr, reduce="mean") > 0.5
+                )
+            else:
+                point_dict["mask"] = segment_csr(point.mask[indices].float(), idx_ptr, "mean") > 0.5
+
+        if self.traceable:
+            point_dict["pooling_inverse"] = cluster
+            point_dict["pooling_parent"] = point
+        point = Point(point_dict)
+        if self.norm is not None:
+            point = self.norm(point)
+        if self.act is not None:
+            point = self.act(point)
+
+        if self.re_serialization:
+            point.serialization(order=self.serialization_order, shuffle_orders=self.shuffle_orders)
+        point.sparsify()
+        return point
+
+
+class GridUnpooling(PointModule):
+    def __init__(
+        self,
+        in_channels,
+        skip_channels,
+        out_channels,
+        norm_layer=None,
+        act_layer=None,
+        traceable=False,  # record parent and cluster
+    ):
+        super().__init__()
+        self.proj = PointSequential(nn.Linear(in_channels, out_channels))
+        self.proj_skip = PointSequential(nn.Linear(skip_channels, out_channels))
+
+        if norm_layer is not None:
+            self.proj.add(norm_layer(out_channels))
+            self.proj_skip.add(norm_layer(out_channels))
+
+        if act_layer is not None:
+            self.proj.add(act_layer())
+            self.proj_skip.add(act_layer())
+
+        self.traceable = traceable
+
+    def forward(self, point):
+        assert "pooling_parent" in point.keys()
+        assert "pooling_inverse" in point.keys()
+        parent = point.pop("pooling_parent")
+        inverse = point.pooling_inverse
+        feat = point.feat
+
+        parent = self.proj_skip(parent)
+        parent.feat = parent.feat + self.proj(point).feat[inverse]
+        parent.sparse_conv_feat = parent.sparse_conv_feat.replace_feature(parent.feat)
+
+        if self.traceable:
+            point.feat = feat
+            parent["unpooling_parent"] = point
+            parent["unpooling_inverse"] = inverse
+        return parent
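+
+# Editor's note (illustrative): GridUnpooling broadcasts coarse features back to
+# the finer level with a gather, proj(point).feat[pooling_inverse], and adds them
+# to the projected skip features of the parent Point recorded during pooling.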
+
+
+@MODELS.register_module("LitePT")
+class LitePT(PointModule):
+    def __init__(
+        self,
+        in_channels=4,
+        order=("z", "z-trans", "hilbert", "hilbert-trans"),
+        stride=(2, 2, 2, 2),
+        enc_depths=(2, 2, 2, 6, 2),
+        enc_channels=(36, 72, 144, 252, 504),
+        enc_num_head=(2, 4, 8, 14, 28),
+        enc_patch_size=(1024, 1024, 1024, 1024, 1024),
+        enc_conv=(True, True, True, False, False),
+        enc_attn=(False, False, False, True, True),
+        enc_rope_freq=(100.0, 100.0, 100.0, 100.0, 100.0),
+        dec_depths=(0, 0, 0, 0),
+        dec_channels=(72, 72, 144, 252),
+        dec_num_head=(4, 4, 8, 14),
+        dec_patch_size=(1024, 1024, 1024, 1024),
+        dec_conv=(False, False, False, False),
+        dec_attn=(False, False, False, False),
+        dec_rope_freq=(100.0, 100.0, 100.0, 100.0),
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        drop_path=0.3,
+        pre_norm=True,
+        shuffle_orders=True,
+        enc_mode=False,
+        export_mode=False,
+    ):
+        super().__init__()
+        self.num_stages = len(enc_depths)
+        self.order = [order] if isinstance(order, str) else order
+        self.enc_mode = enc_mode
+        self.shuffle_orders = shuffle_orders
+        self.export_mode = export_mode
+
+        self.enc_conv = enc_conv
+        self.enc_attn = enc_attn
+        self.dec_conv = dec_conv
+        self.dec_attn = dec_attn
+
+        assert self.num_stages == len(stride) + 1
+        assert self.num_stages == len(enc_depths)
+        assert self.num_stages == len(enc_channels)
+        assert self.num_stages == len(enc_num_head)
+        assert self.num_stages == len(enc_patch_size)
+        assert self.enc_mode or self.num_stages == len(dec_depths) + 1
+        assert self.enc_mode or self.num_stages == len(dec_channels) + 1
+        assert self.enc_mode or self.num_stages == len(dec_num_head) + 1
+        assert self.enc_mode or self.num_stages == len(dec_patch_size) + 1
+
+        # norm layers
+        bn_layer = partial(nn.BatchNorm1d, eps=1e-3, momentum=0.01)
+        ln_layer = nn.LayerNorm
+
+        # activation layers
+        act_layer = nn.GELU
+
+        self.embedding = Embedding(
+            in_channels=in_channels,
+            embed_channels=enc_channels[0],
+            norm_layer=bn_layer,
+            act_layer=act_layer,
+            export_mode=self.export_mode,
+        )
+
+        # encoder
+        enc_drop_path = [x.item() for x in torch.linspace(0, drop_path, sum(enc_depths))]
+        self.enc = PointSequential()
+        for s in range(self.num_stages):
+            enc_drop_path_ = enc_drop_path[sum(enc_depths[:s]) : sum(enc_depths[: s + 1])]
+            enc = PointSequential()
+            if s > 0:
+                enc.add(
+                    GridPooling(
+                        in_channels=enc_channels[s - 1],
+                        out_channels=enc_channels[s],
+                        stride=stride[s - 1],
+                        norm_layer=bn_layer,
+                        act_layer=act_layer,
+                        re_serialization=enc_attn[s],
+                        serialization_order=self.order,
+                    ),
+                    name="down",
+                )
+            for i in range(enc_depths[s]):
+                enc.add(
+                    Block(
+                        channels=enc_channels[s],
+                        num_heads=enc_num_head[s],
+                        patch_size=enc_patch_size[s],
+                        mlp_ratio=mlp_ratio,
+                        qkv_bias=qkv_bias,
+                        qk_scale=qk_scale,
+                        attn_drop=attn_drop,
+                        proj_drop=proj_drop,
+                        drop_path=enc_drop_path_[i],
+                        norm_layer=ln_layer,
+                        act_layer=act_layer,
+                        pre_norm=pre_norm,
+                        order_index=i % len(self.order),
+                        cpe_indice_key=f"stage{s}",
+                        enable_conv=enc_conv[s],
+                        enable_attn=enc_attn[s],
+                        rope_freq=enc_rope_freq[s],
+                        export_mode=self.export_mode,
+                    ),
+                    name=f"block{i}",
+                )
+            if len(enc) != 0:
+                self.enc.add(module=enc, name=f"enc{s}")
+
+        # decoder
+        if not self.enc_mode:
+            dec_drop_path = [x.item() for x in torch.linspace(0, drop_path, sum(dec_depths))]
+            self.dec = PointSequential()
+            dec_channels = list(dec_channels) + [enc_channels[-1]]
+            for s in reversed(range(self.num_stages - 1)):
+                dec_drop_path_ = dec_drop_path[sum(dec_depths[:s]) : sum(dec_depths[: s + 1])]
+                dec_drop_path_.reverse()
+                dec = PointSequential()
+                dec.add(
+                    GridUnpooling(
+                        in_channels=dec_channels[s + 1],
+                        skip_channels=enc_channels[s],
+                        out_channels=dec_channels[s],
+                        norm_layer=bn_layer,
+                        act_layer=act_layer,
+                    ),
+                    name="up",
+                )
+                for i in range(dec_depths[s]):
+                    dec.add(
+                        Block(
+                            channels=dec_channels[s],
+                            num_heads=dec_num_head[s],
+                            patch_size=dec_patch_size[s],
+                            mlp_ratio=mlp_ratio,
+                            qkv_bias=qkv_bias,
+                            qk_scale=qk_scale,
+                            attn_drop=attn_drop,
+                            proj_drop=proj_drop,
+                            drop_path=dec_drop_path_[i],
+                            norm_layer=ln_layer,
+                            act_layer=act_layer,
+                            pre_norm=pre_norm,
+                            order_index=i % len(self.order),
+                            cpe_indice_key=f"stage{s}",
+                            enable_conv=dec_conv[s],
+                            enable_attn=dec_attn[s],
+                            rope_freq=dec_rope_freq[s],
+                            export_mode=self.export_mode,
+                        ),
+                        name=f"block{i}",
+                    )
+                self.dec.add(module=dec, name=f"dec{s}")
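+
+    # Editor's note (illustrative): with the defaults above this builds 5 encoder
+    # stages at channels (36, 72, 144, 252, 504), downsampling by 2 between
+    # stages; the first three stages use sparse-conv blocks, the last two use
+    # PointROPE attention, and the decoder only unpools (all dec_depths are 0).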
+
+    def forward(self, data_dict):
+        """
+        data_dict is the batched input point cloud; it should contain at least:
+        1. feat [N, input_dim]: input features of the point cloud
+        2. grid_coord [N, 3]: voxelized coordinates after grid sampling
+           and/or
+           coord [N, 3]: original coordinates
+           grid_size: grid size used for grid sampling
+        3. offset [batch_size]: separators of the point clouds in the batched data
+           and/or
+           batch [N]: batch index of each point
+        """
+        point = Point(data_dict)
+        if self.enc_attn[0]:
+            point.serialization(order=self.order, shuffle_orders=self.shuffle_orders)
+        point.sparsify()
+
+        point = self.embedding(point)
+        point = self.enc(point)
+
+        if not self.enc_mode:
+            point = self.dec(point)
+
+        return point
+
+    def export_forward(self, data_dict):
+        point = Point(data_dict)
+        if self.enc_attn[0]:
+            point["serialized_depth"] = data_dict["serialized_depth"]
+            point["serialized_code"] = data_dict["serialized_code"]
+            point["serialized_order"] = data_dict["serialized_order"]
+            point["serialized_inverse"] = data_dict["serialized_inverse"]
+            point["sparse_shape"] = data_dict["sparse_shape"]
+        point.sparsify()
+
+        point = self.embedding(point)
+        point = self.enc(point)
+
+        if not self.enc_mode:
+            point = self.dec(point)
+
+        return point
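+
+# Editor's sketch (illustrative): a minimal data_dict for LitePT.forward,
+# assuming a single cloud of N points with 4 input features; names follow the
+# docstring above, values are made up.
+#
+#   data_dict = {
+#       "feat": torch.randn(N, 4),
+#       "grid_coord": grid_coord,       # [N, 3] integer voxel coordinates
+#       "grid_size": 0.05,
+#       "offset": torch.tensor([N]),
+#   }
+#   point = LitePT(in_channels=4)(data_dict)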
diff --git a/projects/PTv3/models/modules.py b/projects/PTv3/models/modules.py
index 851376e9f..3d51a38b9 100644
--- a/projects/PTv3/models/modules.py
+++ b/projects/PTv3/models/modules.py
@@ -80,3 +80,67 @@ def forward(self, input):
             else:
                 input = module(input)
         return input
+
+
+class MLP(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels=None,
+        out_channels=None,
+        act_layer=nn.GELU,
+        drop=0.0,
+    ):
+        super().__init__()
+        out_channels = out_channels or in_channels
+        hidden_channels = hidden_channels or in_channels
+        self.fc1 = nn.Linear(in_channels, hidden_channels)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_channels, out_channels)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Embedding(PointModule):
+    def __init__(
+        self,
+        in_channels,
+        embed_channels,
+        norm_layer=None,
+        act_layer=None,
+        export_mode=False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.embed_channels = embed_channels
+
+        if export_mode:
+            from SparseConvolution.sparse_conv import SubMConv3d
+        else:
+            from spconv.pytorch import SubMConv3d
+
+        self.stem = PointSequential(
+            conv=SubMConv3d(
+                in_channels,
+                embed_channels,
+                kernel_size=5,
+                padding=1,
+                bias=False,
+                indice_key="stem",
+            )
+        )
+        if norm_layer is not None:
+            self.stem.add(norm_layer(embed_channels), name="norm")
+        if act_layer is not None:
+            self.stem.add(act_layer(), name="act")
+
+    def forward(self, point: Point):
+        point = self.stem(point)
+        return point
diff --git a/projects/PTv3/models/point_transformer_v3/point_transformer_v3m1_base.py b/projects/PTv3/models/point_transformer_v3/point_transformer_v3m1_base.py
index e91fb76c6..acb43eff5 100644
--- a/projects/PTv3/models/point_transformer_v3/point_transformer_v3m1_base.py
+++ b/projects/PTv3/models/point_transformer_v3/point_transformer_v3m1_base.py
@@ -19,7 +19,7 @@
     flash_attn = None
 
 from models.builder import MODELS
-from models.modules import PointModule, PointSequential
+from models.modules import MLP, Embedding, PointModule, PointSequential
 from models.point_prompt_training import PDNorm
 from models.scatter.functional import argsort, segment_csr, unique
 from models.utils.misc import offset2bincount
@@ -143,94 +143,6 @@ def get_rel_pos(self, point, order):
         point[rel_pos_key] = grid_coord.unsqueeze(2) - grid_coord.unsqueeze(1)
         return point[rel_pos_key]
 
-    @torch.no_grad()
-    def get_padding_and_inverse(self, point):
-        pad_key = "pad"
-        unpad_key = "unpad"
-        cu_seqlens_key = "cu_seqlens_key"
-        if pad_key not in point.keys() or unpad_key not in point.keys() or cu_seqlens_key not in point.keys():
-            offset = point.offset
-            bincount = offset2bincount(offset)
-            bincount_pad = (
-                torch.maximum(
-                    torch.div(
-                        bincount + self.patch_size - 1,
-                        self.patch_size,
-                        rounding_mode="trunc",
-                    ),
-                    torch.tensor(1, device=bincount.device),
-                )
-                * self.patch_size
-            )
-            # only pad point when num of points larger than patch_size
-            mask_pad = bincount > self.patch_size
-            bincount_pad = (1 - mask_pad.int()) * bincount + mask_pad.int() * bincount_pad
-
-            if not self.export_mode:
-                _offset = nn.functional.pad(offset, (1, 0))
-                _offset_pad = nn.functional.pad(torch.cumsum(bincount_pad, dim=0), (1, 0))
-
-                pad = torch.arange(_offset_pad[-1], device=offset.device)
-                unpad = torch.arange(_offset[-1], device=offset.device)
-                cu_seqlens = []
-                for i in range(len(offset)):
-                    unpad[_offset[i] : _offset[i + 1]] += _offset_pad[i] - _offset[i]
-                    if bincount[i] != bincount_pad[i]:
-                        pad[
-                            _offset_pad[i + 1] - self.patch_size + (bincount[i] % self.patch_size) : _offset_pad[i + 1]
-                        ] = pad[
-                            _offset_pad[i + 1]
-                            - 2 * self.patch_size
-                            + (bincount[i] % self.patch_size) : _offset_pad[i + 1]
-                            - self.patch_size
-                        ]
-                    pad[_offset_pad[i] : _offset_pad[i + 1]] -= _offset_pad[i] - _offset[i]
-                    cu_seqlens.append(
-                        torch.arange(
-                            _offset_pad[i],
-                            _offset_pad[i + 1],
-                            step=self.patch_size,
-                            dtype=torch.int32,
-                            device=offset.device,
-                        )
-                    )
-                point[pad_key] = pad
-                point[unpad_key] = unpad
-                point[cu_seqlens_key] = nn.functional.pad(torch.concat(cu_seqlens), (0, 1), value=_offset_pad[-1])
-            else:
-                # NOTE(knzo25): needed due to tensorrt reasons
-                assert len(offset) == 1
-
-                # pad_orig = pad
-                # unpad_orig = unpad
-
-                pad = torch.arange(bincount_pad[0], device=offset.device)
-                unpad = torch.arange(offset[0], device=offset.device)
-                cu_seqlens = []
-
-                pad[bincount_pad[0] - self.patch_size + (bincount[0] % self.patch_size) : bincount_pad[0]] = pad[
-                    bincount_pad[0]
-                    - 2 * self.patch_size
-                    + (bincount[0] % self.patch_size) : bincount_pad[0]
-                    - self.patch_size
-                ]
-
-                cu_seqlens.append(
-                    torch.arange(
-                        0,
-                        bincount_pad[0],
-                        step=self.patch_size,
-                        dtype=torch.int32,
-                        device=offset.device,
-                    )
-                )
-
-                point[pad_key] = pad
-                point[unpad_key] = unpad
-                point[cu_seqlens_key] = nn.functional.pad(torch.concat(cu_seqlens), (0, 1), value=bincount_pad[0])
-
-        return point[pad_key], point[unpad_key], point[cu_seqlens_key]
-
     def forward(self, point):
         if not self.enable_flash:
             assert offset2bincount(point.offset).min() >= self.patch_size_max  # NOTE(knzo25): assumed for deployment
@@ -240,7 +152,7 @@ def forward(self, point):
         K = self.patch_size
         C = self.channels
 
-        pad, unpad, cu_seqlens = self.get_padding_and_inverse(point)
+        pad, unpad, cu_seqlens = point.get_padding_and_inverse(self.patch_size)
 
         order = point.serialized_order[self.order_index][pad]
         inverse = unpad[point.serialized_inverse[self.order_index]]
@@ -281,32 +193,6 @@ def forward(self, point):
         return point
 
 
-class MLP(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        hidden_channels=None,
-        out_channels=None,
-        act_layer=nn.GELU,
-        drop=0.0,
-    ):
-        super().__init__()
-        out_channels = out_channels or in_channels
-        hidden_channels = hidden_channels or in_channels
-        self.fc1 = nn.Linear(in_channels, hidden_channels)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_channels, out_channels)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
 class Block(PointModule):
     def __init__(
         self,
@@ -566,44 +452,6 @@ def forward(self, point):
         return parent
 
 
-class Embedding(PointModule):
-    def __init__(
-        self,
-        in_channels,
-        embed_channels,
-        norm_layer=None,
-        act_layer=None,
-        export_mode=False,
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.embed_channels = embed_channels
-
-        if export_mode:
-            from SparseConvolution.sparse_conv import SubMConv3d
-        else:
-            from spconv.pytorch import SubMConv3d
-
-        self.stem = PointSequential(
-            conv=SubMConv3d(
-                in_channels,
-                embed_channels,
-                kernel_size=5,
-                padding=1,
-                bias=False,
-                indice_key="stem",
-            )
-        )
-        if norm_layer is not None:
-            self.stem.add(norm_layer(embed_channels), name="norm")
-        if act_layer is not None:
-            self.stem.add(act_layer(), name="act")
-
-    def forward(self, point: Point):
-        point = self.stem(point)
-        return point
-
-
 @MODELS.register_module("PT-v3m1")
 class PointTransformerV3(PointModule):
diff --git a/projects/PTv3/models/utils/structure.py b/projects/PTv3/models/utils/structure.py
index df335b9a5..f40f88e1d 100644
--- a/projects/PTv3/models/utils/structure.py
+++ b/projects/PTv3/models/utils/structure.py
@@ -1,8 +1,10 @@
 import spconv.pytorch as spconv
 import torch
 from addict import Dict
-from models.utils import batch2offset, offset2batch
+from models.scatter import argsort
+from models.utils import batch2offset, offset2batch, offset2bincount
 from models.utils.serialization import decode, encode
+from torch import nn
 
 
 def bit_length_tensor(x: torch.Tensor) -> torch.Tensor:
@@ -87,7 +89,10 @@ def serialization(self, order="z", depth=None, shuffle_orders=False):
         # OrderN ([n])] (k, n)
         code = [encode(self.grid_coord, self.batch, depth, order=order_) for order_ in order]
         code = torch.stack(code)
-        order = torch.argsort(code)
+        if torch.onnx.is_in_onnx_export():
+            order = torch.stack([argsort(code_i) for code_i in code], dim=0)
+        else:
+            order = torch.argsort(code)
         inverse = torch.zeros_like(order).scatter_(
             dim=1,
             index=order,
@@ -137,3 +142,80 @@ def sparsify(self, pad=96):
         )
         self["sparse_shape"] = sparse_shape
         self["sparse_conv_feat"] = sparse_conv_feat
+
+    @torch.no_grad()
+    def get_padding_and_inverse(self, patch_size) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        pad_key = "pad"
+        unpad_key = "unpad"
+        cu_seqlens_key = "cu_seqlens_key"
+        if pad_key not in self.keys() or unpad_key not in self.keys() or cu_seqlens_key not in self.keys():
+            offset = self.offset
+            bincount = offset2bincount(offset)
+            bincount_pad = (
+                torch.div(
+                    bincount + patch_size - 1,
+                    patch_size,
+                    rounding_mode="trunc",
+                )
+                * patch_size
+            )
+            # only pad when the number of points is larger than patch_size
+            mask_pad = bincount > patch_size
+            bincount_pad = ~mask_pad * bincount + mask_pad * bincount_pad
+
+            if not torch.onnx.is_in_onnx_export():
+                _offset = nn.functional.pad(offset, (1, 0))
+                _offset_pad = nn.functional.pad(torch.cumsum(bincount_pad, dim=0), (1, 0))
+                pad = torch.arange(_offset_pad[-1], device=offset.device)
+                unpad = torch.arange(_offset[-1], device=offset.device)
+                cu_seqlens = []
+                for i in range(len(offset)):
+                    unpad[_offset[i] : _offset[i + 1]] += _offset_pad[i] - _offset[i]
+                    if bincount[i] != bincount_pad[i]:
+                        pad[_offset_pad[i + 1] - patch_size + (bincount[i] % patch_size) : _offset_pad[i + 1]] = pad[
+                            _offset_pad[i + 1]
+                            - 2 * patch_size
+                            + (bincount[i] % patch_size) : _offset_pad[i + 1]
+                            - patch_size
+                        ]
+                    pad[_offset_pad[i] : _offset_pad[i + 1]] -= _offset_pad[i] - _offset[i]
+                    cu_seqlens.append(
+                        torch.arange(
+                            _offset_pad[i],
+                            _offset_pad[i + 1],
+                            step=patch_size,
+                            dtype=torch.int32,
+                            device=offset.device,
+                        )
+                    )
+                self[pad_key] = pad
+                self[unpad_key] = unpad
+                self[cu_seqlens_key] = nn.functional.pad(torch.concat(cu_seqlens), (0, 1), value=_offset_pad[-1])
+            else:
+                # NOTE: needed due to tensorrt reasons
+                assert len(offset) == 1
+
+                pad = torch.arange(bincount_pad[0], device=offset.device)
+                unpad = torch.arange(offset[0], device=offset.device)
+                cu_seqlens = []
+
+                pad[bincount_pad[0] - patch_size + (bincount[0] % patch_size) : bincount_pad[0]] = pad[
+                    bincount_pad[0] - 2 * patch_size + (bincount[0] % patch_size) : bincount_pad[0] - patch_size
+                ]
+
+                cu_seqlens.append(
+                    torch.arange(
+                        0,
+                        bincount_pad[0],
+                        step=patch_size,
+                        dtype=torch.int32,
+                        device=offset.device,
+                    )
+                )
+
+                self[pad_key] = pad
+                self[unpad_key] = unpad
+                self[cu_seqlens_key] = nn.functional.pad(torch.concat(cu_seqlens), (0, 1), value=bincount_pad[0])
+
+        return self[pad_key], self[unpad_key], self[cu_seqlens_key]
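+
+    # Editor's note (illustrative): for patch_size=1024 and a cloud of 2500
+    # points, bincount_pad becomes 3072; the 572 padded slots re-read indices
+    # from the previous full patch so that flash attention sees equal-length
+    # patches, and `unpad` maps attention outputs back to the original order.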
diff --git a/projects/PTv3/pyproject.toml b/projects/PTv3/pyproject.toml
new file mode 100644
index 000000000..3831aba25
--- /dev/null
+++ b/projects/PTv3/pyproject.toml
@@ -0,0 +1,92 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "ptv3"
+version = "0.1.0"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "addict>=2.4.0",
+    "colorhash>=2.1.0",
+    "einops>=0.8.1",
+    "flash-attn",
+    "h5py>=3.15.1",
+    "plyfile>=1.1.3",
+    "pointrope",
+    "pyyaml>=6.0.3",
+    "scipy>=1.15.3",
+    "sharedarray>=3.2.4",
+    "spconv-cu126>=2.3.8",
+    "tensorboard>=2.20.0",
+    "tensorboardx>=2.6.4",
+    "termcolor>=3.3.0",
+    "timm>=1.0.22",
+    "torch==2.8.0",
+    "torchvision==0.23.0",
+    "torchmetrics>=1.8.2",
+    "torch-cluster>=1.6.3",
+    "torch-geometric>=2.7.0",
+    "torch-scatter>=2.1.2",
+    "torch-sparse>=0.6.18",
+    "wandb>=0.23.1",
+    "yapf==0.40.1",
+    "nuscenes-devkit>=1.2.0",
+    "open3d>=0.19.0",
+    "onnx>=1.20.0",
+]
+
+[project.optional-dependencies]
+insseg = ["pointgroup-ops", "pointops"]  # for instance segmentation
+
+[tool.uv]
+find-links = [
+    "https://data.pyg.org/whl/torch-2.8.0+cu129.html",  # for torch-* libraries
+]
+no-build-isolation-package = [
+    "flash-attn",
+    "torch-cluster",
+    "torch-geometric",
+    "torch-scatter",
+    "torch-sparse",
+    "pointops",
+    "pointrope",
+    "pointgroup-ops",
+]
+
+[tool.uv.extra-build-dependencies]
+pointrope = [{ requirement = "torch", match-runtime = true }]
+torch-cluster = [
+    { requirement = "setuptools", match-runtime = false },
+    { requirement = "torch", match-runtime = true },
+]
+
+[tool.uv.sources]
+torch = [{ index = "pytorch-cu129", marker = "sys_platform == 'linux'" }]
+torchvision = [{ index = "pytorch-cu129", marker = "sys_platform == 'linux'" }]
+flash-attn = { url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl" }
+pointrope = { workspace = true }
+pointops = { workspace = true }
+pointgroup-ops = { workspace = true }
+
+[[tool.uv.index]]
+name = "pytorch-cu129"
+url = "https://download.pytorch.org/whl/cu129"
+explicit = true
+
+[tool.uv.workspace]
+members = ["libs/pointgroup_ops", "libs/pointops", "libs/pointrope"]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = [
+    "datasets",
+    "datasets.*",
+    "engines",
+    "engines.*",
+    "models",
+    "models.*",
+    "utils",
+    "utils.*",
+]
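+
+# Editor's note (illustrative): with this workspace layout, running `uv sync`
+# from projects/PTv3 should resolve torch from the cu129 index and build the
+# no-build-isolation packages (flash-attn, pointrope, torch-scatter, ...)
+# against it; exact behavior depends on the installed uv version.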