diff --git a/projects/PTv3/Dockerfile b/projects/PTv3/Dockerfile index 9a7e0e3ff..9de653ade 100644 --- a/projects/PTv3/Dockerfile +++ b/projects/PTv3/Dockerfile @@ -4,15 +4,9 @@ FROM ${AWML_BASE_IMAGE} ENV FLASH_ATTN_CUDA_ARCHS="120" ENV PYTHONPATH=/workspace/projects:/workspace/projects/PTv3 -RUN python3 -m pip --no-cache-dir install \ - addict \ - open3d \ - flash-attn --no-build-isolation \ - regex \ - sharedarray \ - spconv-cu120 \ - tensorboardx \ - termcolor \ - torch-scatter -f https://data.pyg.org/whl/torch-2.8.0+cu129.html \ - tqdm \ - yapf==0.40.1 +RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ + && . /root/.local/bin/env \ + && uv pip install --system --python /opt/conda/bin/python \ + --upgrade pip setuptools wheel \ + && uv pip install --system --python /opt/conda/bin/python \ + --no-build-isolation flash-attn /workspace/projects/PTv3 diff --git a/projects/PTv3/configs/semseg-litept-small-v1m1-nuscenes.py b/projects/PTv3/configs/semseg-litept-small-v1m1-nuscenes.py new file mode 100644 index 000000000..3b346fd7d --- /dev/null +++ b/projects/PTv3/configs/semseg-litept-small-v1m1-nuscenes.py @@ -0,0 +1,245 @@ +_base_ = ["./_base_/default_runtime.py"] + +save_path = "work_dirs/litept" + +# misc custom setting +batch_size = 12 # bs: total bs in all gpus +num_worker = 24 +mix_prob = 0.8 +empty_cache = False +enable_amp = True + +# dataset settings +dataset_type = "NuScenesDataset" +data_root = "data/nuscenes" +ignore_index = -1 +info_paths_train = ["info/nuscenes_infos_10sweeps_train.pkl"] +info_paths_val = ["info/nuscenes_infos_10sweeps_val.pkl"] +info_paths_test = ["info/nuscenes_infos_10sweeps_test.pkl"] +class_names = [ + "barrier", + "bicycle", + "bus", + "car", + "construction_vehicle", + "motorcycle", + "pedestrian", + "traffic_cone", + "trailer", + "truck", + "driveable_surface", + "other_flat", + "sidewalk", + "terrain", + "manmade", + "vegetation", +] +class_mapping = { + 0: 0, + 1: 1, + 2: 2, + 3: 3, + 4: 4, + 5: 5, + 6: 6, + 7: 7, + 8: 8, + 9: 9, + 10: 10, + 11: 11, + 12: 12, + 13: 13, + 14: 14, + 15: 15, +} +num_classes = 16 + +# model settings +model = dict( + type="DefaultSegmentorV2", + num_classes=num_classes, + backbone_out_channels=72, + backbone=dict( + type="LitePT", + in_channels=4, + order=["z", "z-trans", "hilbert", "hilbert-trans"], + stride=(2, 2, 2, 2), + enc_depths=(2, 2, 2, 6, 2), + enc_channels=(36, 72, 144, 252, 504), + enc_num_head=(2, 4, 8, 14, 28), + enc_patch_size=(1024, 1024, 1024, 1024, 1024), + enc_conv=(True, True, True, False, False), + enc_attn=(False, False, False, True, True), + enc_rope_freq=(100.0, 100.0, 100.0, 100.0, 100.0), + dec_depths=(0, 0, 0, 0), + dec_channels=(72, 72, 144, 252), + dec_num_head=(4, 4, 8, 14), + dec_patch_size=(1024, 1024, 1024, 1024), + dec_conv=(False, False, False, False), + dec_attn=(False, False, False, False), + dec_rope_freq=(100.0, 100.0, 100.0, 100.0), + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + drop_path=0.3, + shuffle_orders=True, + pre_norm=True, + enc_mode=False, + ), + criteria=[ + dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=ignore_index), + dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=ignore_index), + ], +) + +# scheduler settings +epoch = 50 +eval_epoch = 50 +optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005) +scheduler = dict( + type="OneCycleLR", + max_lr=[0.002, 0.0002], + pct_start=0.04, + anneal_strategy="cos", + div_factor=10.0, + final_div_factor=100.0, +) +param_dicts = 
[dict(keyword="block", lr=0.0002)] + +# dataset settings +data = dict( + num_classes=num_classes, + ignore_index=ignore_index, + train=dict( + type=dataset_type, + split="train", + data_root=data_root, + info_paths=info_paths_train, + transform=[ + # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2), + # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75), + dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5), + # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5), + # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5), + dict(type="RandomScale", scale=[0.9, 1.1]), + # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]), + dict(type="RandomFlip", p=0.5), + dict(type="RandomJitter", sigma=0.005, clip=0.02), + # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]), + dict( + type="GridSample", + grid_size=0.05, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + ), + # dict(type="SphereCrop", point_max=1000000, mode="random"), + # dict(type="CenterShift", apply_z=False), + dict(type="ToTensor"), + dict(type="Update", keys_dict={"grid_size": 0.05}), + dict( + type="Collect", + keys=("coord", "grid_coord", "segment", "grid_size"), + feat_keys=("coord", "strength"), + ), + ], + test_mode=False, + ignore_index=ignore_index, + class_mapping=class_mapping, + ), + val=dict( + type=dataset_type, + split="val", + data_root=data_root, + info_paths=info_paths_val, + transform=[ + dict(type="Copy", keys_dict={"segment": "origin_segment"}), + # dict(type="PointClip", point_cloud_range=(-51.2, -51.2, -4, 51.2, 51.2, 2.4)), + dict( + type="GridSample", + grid_size=0.05, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + return_inverse=True, + ), + # dict(type="SphereCrop", point_max=1000000, mode='center'), + dict(type="ToTensor"), + dict( + type="Collect", + keys=("coord", "grid_coord", "segment", "origin_segment", "inverse"), + feat_keys=("coord", "strength"), + ), + ], + test_mode=False, + ignore_index=ignore_index, + class_mapping=class_mapping, + ), + test=dict( + type=dataset_type, + split="val", + data_root=data_root, + info_paths=info_paths_test, + transform=[ + dict(type="Copy", keys_dict={"segment": "origin_segment"}), + dict( + type="GridSample", + grid_size=0.025, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_inverse=True, + ), + ], + test_mode=True, + test_cfg=dict( + voxelize=dict( + type="GridSample", + grid_size=0.05, + hash_type="fnv", + mode="test", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + ), + crop=None, + post_transform=[ + dict(type="ToTensor"), + dict( + type="Collect", + keys=("coord", "grid_coord", "index"), + feat_keys=("coord", "strength"), + ), + ], + aug_transform=[ + [dict(type="RandomScale", scale=[0.9, 0.9])], + [dict(type="RandomScale", scale=[0.95, 0.95])], + [dict(type="RandomScale", scale=[1, 1])], + [dict(type="RandomScale", scale=[1.05, 1.05])], + [dict(type="RandomScale", scale=[1.1, 1.1])], + [ + dict(type="RandomScale", scale=[0.9, 0.9]), + dict(type="RandomFlip", p=1), + ], + [ + dict(type="RandomScale", scale=[0.95, 0.95]), + dict(type="RandomFlip", p=1), + ], + [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)], + [ + dict(type="RandomScale", scale=[1.05, 1.05]), + dict(type="RandomFlip", p=1), + ], + [ + dict(type="RandomScale", scale=[1.1, 
1.1]), + dict(type="RandomFlip", p=1), + ], + ], + ), + ignore_index=ignore_index, + class_mapping=class_mapping, + ), +) diff --git a/projects/PTv3/configs/semseg-litept-small-v1m1-t4dataset.py b/projects/PTv3/configs/semseg-litept-small-v1m1-t4dataset.py new file mode 100644 index 000000000..9d84c9089 --- /dev/null +++ b/projects/PTv3/configs/semseg-litept-small-v1m1-t4dataset.py @@ -0,0 +1,252 @@ +_base_ = [ + "../../../autoware_ml/configs/detection3d/default_runtime.py", + "./_base_/default_runtime.py", + "../../../autoware_ml/configs/segmentation3d/dataset/t4dataset/j6gen2_base.py", +] + +save_path = "work_dirs/litept" + +# misc custom setting +batch_size = 12 # bs: total bs in all gpus +num_worker = 24 +mix_prob = 0.8 +empty_cache = False +enable_amp = True + +# (min_x, min_y, min_z, max_x, max_y, max_z) +point_cloud_range = [-102.4, -102.4, -2.8, 102.4, 102.4, 10.0] +grid_size = 0.1 # original is 0.05 + +# dataset settings +dataset_type = "T4Dataset" +data_root = "data/t4dataset" +ignore_index = -1 +info_paths_train = ["info/lidarseg/t4dataset_j6gen2_lidarseg_infos_train.pkl"] +info_paths_val = ["info/lidarseg/t4dataset_j6gen2_lidarseg_infos_val.pkl"] +info_paths_test = ["info/lidarseg/t4dataset_j6gen2_lidarseg_infos_test.pkl"] +class_mapping = { + "drivable_surface": 0, + "other_flat_surface": 1, + "sidewalk": 2, + "manmade": 3, + "vegetation": 4, + "car": 5, + "bus": 6, + "emergency_vehicle": 7, + "train": 8, + "truck": 9, + "tractor_unit": 10, + "semi_trailer": 11, + "construction_vehicle": 12, + "forklift": 13, + "kart": 14, + "motorcycle": 15, + "bicycle": 16, + "pedestrian": 17, + "personal_mobility": 18, + "animal": 19, + "pushable_pullable": 20, + "traffic_cone": 21, + "stroller": 22, + "debris": 23, + "other_stuff": 24, + "noise": 25, + "ghost_point": 25, + "out_of_sync": ignore_index, + "unpainted": ignore_index, +} +num_classes = 26 + +# model settings +model = dict( + type="DefaultSegmentorV2", + num_classes=num_classes, + backbone_out_channels=72, + backbone=dict( + type="LitePT", + in_channels=4, + order=["z", "z-trans", "hilbert", "hilbert-trans"], + stride=(2, 2, 2, 2), + enc_depths=(2, 2, 2, 6, 2), + enc_channels=(36, 72, 144, 252, 504), + enc_num_head=(2, 4, 8, 14, 28), + enc_patch_size=(1024, 1024, 1024, 1024, 1024), + enc_conv=(True, True, True, False, False), + enc_attn=(False, False, False, True, True), + enc_rope_freq=(100.0, 100.0, 100.0, 100.0, 100.0), + dec_depths=(0, 0, 0, 0), + dec_channels=(72, 72, 144, 252), + dec_num_head=(4, 4, 8, 14), + dec_patch_size=(1024, 1024, 1024, 1024), + dec_conv=(False, False, False, False), + dec_attn=(False, False, False, False), + dec_rope_freq=(100.0, 100.0, 100.0, 100.0), + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + drop_path=0.3, + shuffle_orders=True, + pre_norm=True, + enc_mode=False, + ), + criteria=[ + dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=ignore_index), + dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=ignore_index), + ], +) + +# scheduler settings +epoch = 50 +eval_epoch = 50 +optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005) +scheduler = dict( + type="OneCycleLR", + max_lr=[0.002, 0.0002], + pct_start=0.04, + anneal_strategy="cos", + div_factor=10.0, + final_div_factor=100.0, +) +param_dicts = [dict(keyword="block", lr=0.0002)] + +# dataset settings +data = dict( + num_classes=num_classes, + ignore_index=ignore_index, + train=dict( + type=dataset_type, + split="train", + data_root=data_root, + 
info_paths=info_paths_train, + transform=[ + # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2), + # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75), + dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5), + # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="x", p=0.5), + # dict(type="RandomRotate", angle=[-1/6, 1/6], axis="y", p=0.5), + dict(type="RandomScale", scale=[0.9, 1.1]), + dict( + type="PointClip", + point_cloud_range=point_cloud_range, + ), + # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]), + dict(type="RandomFlip", p=0.5), + dict(type="RandomJitter", sigma=0.005, clip=0.02), + # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]), + dict( + type="GridSample", + grid_size=grid_size, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + ), + dict(type="SphereCrop", point_max=128000, mode="random"), + # dict(type="CenterShift", apply_z=False), + dict(type="ToTensor"), + dict(type="Update", keys_dict={"grid_size": grid_size}), + dict( + type="Collect", + keys=("coord", "grid_coord", "segment", "grid_size"), + feat_keys=("coord", "strength"), + ), + ], + test_mode=False, + ignore_index=ignore_index, + class_mapping=class_mapping, + ), + val=dict( + type=dataset_type, + split="val", + data_root=data_root, + info_paths=info_paths_val, + transform=[ + dict(type="Copy", keys_dict={"segment": "origin_segment"}), + dict(type="PointClip", point_cloud_range=point_cloud_range), + dict( + type="GridSample", + grid_size=grid_size, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + return_inverse=True, + ), + # dict(type="SphereCrop", point_max=1000000, mode='center'), + dict(type="ToTensor"), + dict( + type="Collect", + keys=("coord", "grid_coord", "segment", "origin_segment", "inverse"), + feat_keys=("coord", "strength"), + ), + ], + test_mode=False, + ignore_index=ignore_index, + class_mapping=class_mapping, + ), + test=dict( + type=dataset_type, + split="val", + data_root=data_root, + info_paths=info_paths_test, + transform=[ + dict(type="Copy", keys_dict={"segment": "origin_segment"}), + dict( + type="GridSample", + grid_size=grid_size, + hash_type="fnv", + mode="train", + keys=("coord", "strength", "segment"), + return_inverse=True, + ), + ], + test_mode=True, + test_cfg=dict( + voxelize=dict( + type="GridSample", + grid_size=grid_size, + hash_type="fnv", + mode="test", + keys=("coord", "strength", "segment"), + return_grid_coord=True, + ), + crop=None, + post_transform=[ + dict(type="ToTensor"), + dict( + type="Collect", + keys=("coord", "grid_coord", "index"), + feat_keys=("coord", "strength"), + ), + ], + aug_transform=[ + [dict(type="RandomScale", scale=[0.9, 0.9])], + [dict(type="RandomScale", scale=[0.95, 0.95])], + [dict(type="RandomScale", scale=[1, 1])], + [dict(type="RandomScale", scale=[1.05, 1.05])], + [dict(type="RandomScale", scale=[1.1, 1.1])], + [ + dict(type="RandomScale", scale=[0.9, 0.9]), + dict(type="RandomFlip", p=1), + ], + [ + dict(type="RandomScale", scale=[0.95, 0.95]), + dict(type="RandomFlip", p=1), + ], + [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)], + [ + dict(type="RandomScale", scale=[1.05, 1.05]), + dict(type="RandomFlip", p=1), + ], + [ + dict(type="RandomScale", scale=[1.1, 1.1]), + dict(type="RandomFlip", p=1), + ], + ], + ), + ignore_index=ignore_index, + class_mapping=class_mapping, + ), +) diff --git 
a/projects/PTv3/datasets/transform.py b/projects/PTv3/datasets/transform.py index fb911adc9..9de0aae73 100644 --- a/projects/PTv3/datasets/transform.py +++ b/projects/PTv3/datasets/transform.py @@ -64,6 +64,19 @@ def __call__(self, data_dict): return data_dict +@TRANSFORMS.register_module() +class Update(object): + def __init__(self, keys_dict=None): + if keys_dict is None: + keys_dict = dict() + self.keys_dict = keys_dict + + def __call__(self, data_dict): + for key, value in self.keys_dict.items(): + data_dict[key] = value + return data_dict + + @TRANSFORMS.register_module() class ToTensor(object): def __call__(self, data): diff --git a/projects/PTv3/libs/pointgroup_ops/functions/__init__.py b/projects/PTv3/libs/pointgroup_ops/functions/__init__.py new file mode 100644 index 000000000..a9e748288 --- /dev/null +++ b/projects/PTv3/libs/pointgroup_ops/functions/__init__.py @@ -0,0 +1 @@ +from .functions import Clustering, ballquery_batch_p, bfs_cluster diff --git a/projects/PTv3/libs/pointgroup_ops/functions/functions.py b/projects/PTv3/libs/pointgroup_ops/functions/functions.py new file mode 100644 index 000000000..8236abaf5 --- /dev/null +++ b/projects/PTv3/libs/pointgroup_ops/functions/functions.py @@ -0,0 +1,168 @@ +import pointgroup_ops_cuda +import torch +from torch.autograd import Function + + +class BallQueryBatchP(Function): + @staticmethod + def forward(ctx, coords, batch_idxs, batch_offsets, radius, meanActive): + """ + :param ctx: + :param coords: (n, 3) float + :param batch_idxs: (n) int + :param batch_offsets: (B+1) int + :param radius: float + :param meanActive: int + :return: idx (nActive), int + :return: start_len (n, 2), int + """ + + n = coords.size(0) + + assert coords.is_contiguous() and coords.is_cuda + assert batch_idxs.is_contiguous() and batch_idxs.is_cuda + assert batch_offsets.is_contiguous() and batch_offsets.is_cuda + + while True: + idx = torch.cuda.IntTensor(n * meanActive).zero_() + start_len = torch.cuda.IntTensor(n, 2).zero_() + nActive = pointgroup_ops_cuda.ballquery_batch_p( + coords, batch_idxs, batch_offsets, idx, start_len, n, meanActive, radius + ) + if nActive <= n * meanActive: + break + meanActive = int(nActive // n + 1) + idx = idx[:nActive] + + return idx, start_len + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None, None + + +ballquery_batch_p = BallQueryBatchP.apply + + +class Clustering: + def __init__( + self, + ignored_labels, + class_mapping, + thresh=0.03, + closed_points=300, + min_points=50, + propose_points=100, + score_func=torch.max, + ) -> None: + self.ignored_labels = ignored_labels + self.thresh = thresh + self.closed_points = closed_points + self.min_points = min_points + self.class_mapping = class_mapping + self.propose_points = propose_points + self.score_func = score_func + + def cluster(self, vertices, scores): + labels = torch.max(scores, 1)[1] # (N) long, cuda + proposals_idx, proposals_offset = self.cluster_(vertices, labels) + + ## debug + # import ipdb; ipdb.set_trace() + # colors = np.array(create_color_palette())[labels.cpu()] + # write_triangle_mesh(vertices, colors, None, 'semantics.ply') + + # scatter + proposals_pred = torch.zeros( + (proposals_offset.shape[0] - 1, vertices.shape[0]), dtype=torch.int + ) # (nProposal, N), int, cuda + proposals_pred[proposals_idx[:, 0].long(), proposals_idx[:, 1].long()] = 1 + labels = labels[proposals_idx[:, 1][proposals_offset[:-1].long()].long()] + + proposals_pointnum = proposals_pred.sum(1) + npoint_mask = proposals_pointnum > self.propose_points + + 
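+        # keep only proposals with more than propose_points member points; smaller fragments and their labels are dropped together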
proposals_pred = proposals_pred[npoint_mask] + labels = labels[npoint_mask] + return proposals_pred, labels + + def cluster_(self, vertices, labels): + """ + :param batch_idxs: (N), int, cuda + :labels: 0-19 + """ + batch_idxs = torch.zeros_like(labels) + + mask_non_ignored = torch.ones_like(labels).bool() + for ignored_label in self.ignored_labels: + mask_non_ignored = mask_non_ignored & (self.class_mapping[labels] != ignored_label) + object_idxs = mask_non_ignored.nonzero().view(-1) + + vertices_ = vertices[object_idxs].float() + labels_ = labels[object_idxs].int() + + if vertices_.numel() == 0: + return torch.zeros((0, 2)).int(), torch.zeros(1).int() + + batch_idxs_ = batch_idxs[object_idxs].int() + batch_offsets_ = torch.FloatTensor([0, object_idxs.shape[0]]).int().cuda() + + idx, start_len = ballquery_batch_p(vertices_, batch_idxs_, batch_offsets_, self.thresh, self.closed_points) + proposals_idx, proposals_offset = bfs_cluster(labels_.cpu(), idx.cpu(), start_len.cpu(), self.min_points) + proposals_idx[:, 1] = object_idxs[proposals_idx[:, 1].long()].int() + + return proposals_idx, proposals_offset + + def get_instances(self, vertices, scores): + proposals_pred, labels = self.cluster(vertices, scores) + instances = {} + for proposal_id in range(len(proposals_pred)): + clusters_i = proposals_pred[proposal_id] + score = scores[clusters_i.bool(), labels[proposal_id]] + score = self.score_func(score) + instances[proposal_id] = {} + instances[proposal_id]["conf"] = score.cpu().numpy() + instances[proposal_id]["label_id"] = self.class_mapping.cpu()[labels[proposal_id]] + instances[proposal_id]["pred_mask"] = clusters_i.cpu().numpy() + return instances + + +class BFSCluster(Function): + @staticmethod + def forward(ctx, semantic_label, ball_query_idxs, start_len, threshold): + """ + :param ctx: + :param semantic_label: (N), int + :param ball_query_idxs: (nActive), int + :param start_len: (N, 2), int + :return: cluster_idxs: int (sumNPoint, 2), dim 0 for cluster_id, dim 1 for corresponding point idxs in N + :return: cluster_offsets: int (nCluster + 1) + """ + + N = start_len.size(0) + + assert semantic_label.is_contiguous() + assert ball_query_idxs.is_contiguous() + assert start_len.is_contiguous() + + cluster_idxs = semantic_label.new() + cluster_offsets = semantic_label.new() + + pointgroup_ops_cuda.bfs_cluster( + semantic_label, + ball_query_idxs, + start_len, + cluster_idxs, + cluster_offsets, + N, + threshold, + ) + + return cluster_idxs, cluster_offsets + + @staticmethod + def backward(ctx, a=None): + return None + + +bfs_cluster = BFSCluster.apply diff --git a/projects/PTv3/libs/pointgroup_ops/pyproject.toml b/projects/PTv3/libs/pointgroup_ops/pyproject.toml new file mode 100644 index 000000000..74b97ba86 --- /dev/null +++ b/projects/PTv3/libs/pointgroup_ops/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "pointgroup-ops" +version = "0.1.0" +requires-python = "==3.10.*" +dependencies = [] + +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["."] +exclude = [] diff --git a/projects/PTv3/libs/pointgroup_ops/setup.py b/projects/PTv3/libs/pointgroup_ops/setup.py new file mode 100644 index 000000000..f6bcef376 --- /dev/null +++ b/projects/PTv3/libs/pointgroup_ops/setup.py @@ -0,0 +1,58 @@ +import os +from distutils.sysconfig import get_config_vars +from sys import argv + +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +(opt,) = get_config_vars("OPT") 
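+# strip "-Wstrict-prototypes" from the inherited build flags: it is a C-only option and g++ warns on it when compiling the C++/CUDA extension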
+os.environ["OPT"] = " ".join(flag for flag in opt.split() if flag != "-Wstrict-prototypes")
+
+
+def _argparse(pattern, argv, is_flag=True, is_list=False):
+    if is_flag:
+        found = pattern in argv
+        if found:
+            argv.remove(pattern)
+        return found, argv
+    else:
+        arr = [arg for arg in argv if pattern == arg.split("=")[0]]
+        if is_list:
+            if len(arr) == 0:  # not found
+                return False, argv
+            else:
+                assert "=" in arr[0], f"{arr[0]} requires a value."
+                argv.remove(arr[0])
+                val = arr[0].split("=")[1]
+                if "," in val:
+                    return val.split(","), argv
+                else:
+                    return [val], argv
+        else:
+            if len(arr) == 0:  # not found
+                return False, argv
+            else:
+                assert "=" in arr[0], f"{arr[0]} requires a value."
+                argv.remove(arr[0])
+                return arr[0].split("=")[1], argv
+
+
+INCLUDE_DIRS, argv = _argparse("--include_dirs", argv, False, is_list=True)
+include_dirs = []
+if not (INCLUDE_DIRS is False):
+    include_dirs += INCLUDE_DIRS
+
+setup(
+    name="pointgroup_ops",
+    packages=["pointgroup_ops"],
+    package_dir={"pointgroup_ops": "functions"},
+    ext_modules=[
+        CUDAExtension(
+            name="pointgroup_ops_cuda",
+            sources=["src/bfs_cluster.cpp", "src/bfs_cluster_kernel.cu"],
+            extra_compile_args={"cxx": ["-g"], "nvcc": ["-O2"]},
+        )
+    ],
+    include_dirs=[*include_dirs],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster.cpp b/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster.cpp
new file mode 100644
index 000000000..d0298aae5
--- /dev/null
+++ b/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster.cpp
@@ -0,0 +1,145 @@
+/*
+Ball Query with BatchIdx & Clustering Algorithm
+Written by Li Jiang
+All Rights Reserved 2020.
+*/
+
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cassert>
+#include <cstdint>
+#include <queue>
+#include <vector>
+
+int ballquery_batch_p_cuda(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, cudaStream_t stream);
+
+
+using Int = int32_t;
+class ConnectedComponent{
+public:
+    std::vector<Int> pt_idxs {};
+
+    ConnectedComponent(){};
+    void addPoint(Int pt_idx)
+    {
+        pt_idxs.push_back(pt_idx);
+    }
+};
+using ConnectedComponents = std::vector<ConnectedComponent>;
+
+/* ================================== ballquery_batch_p ================================== */
+// input xyz: (n, 3) float
+// input batch_idxs: (n) int
+// input batch_offsets: (B+1) int, batch_offsets[-1]
+// output idx: (n * meanActive) dim 0 for number of points in the ball, idx in n
+// output start_len: (n, 2), int
+int ballquery_batch_p(at::Tensor xyz_tensor, at::Tensor batch_idxs_tensor, at::Tensor batch_offsets_tensor, at::Tensor idx_tensor, at::Tensor start_len_tensor, int n, int meanActive, float radius){
+    const float *xyz = xyz_tensor.data_ptr<float>();
+    const int *batch_idxs = batch_idxs_tensor.data_ptr<int>();
+    const int *batch_offsets = batch_offsets_tensor.data_ptr<int>();
+    int *idx = idx_tensor.data_ptr<int>();
+    int *start_len = start_len_tensor.data_ptr<int>();
+
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    int cumsum = ballquery_batch_p_cuda(n, meanActive, radius, xyz, batch_idxs, batch_offsets, idx, start_len, stream);
+    return cumsum;
+}
+
+/* ================================== bfs_cluster ================================== */
+ConnectedComponent find_cc(Int idx, int *semantic_label, Int *ball_query_idxs, int *start_len, int *visited){
+    ConnectedComponent cc;
+    cc.addPoint(idx);
+    visited[idx] = 1;
+
+    std::queue<Int> Q;
+    assert(Q.empty());
+    Q.push(idx);
+
+    while(!Q.empty()){
+        Int cur = Q.front(); Q.pop();
+        int start = start_len[cur * 2];
+        int len = start_len[cur * 2 + 1];
+        int label_cur = semantic_label[cur];
+        for(Int i = start; i < start + len; i++){
+            Int idx_i = ball_query_idxs[i];
+            if(semantic_label[idx_i] != label_cur) continue;
+            if(visited[idx_i] == 1) continue;
+
+            cc.addPoint(idx_i);
+            visited[idx_i] = 1;
+
+            Q.push(idx_i);
+        }
+    }
+    return cc;
+}
+
+//input: semantic_label, int, N
+//input: ball_query_idxs, Int, (nActive)
+//input: start_len, int, (N, 2)
+//output: clusters, CCs
+int get_clusters(int *semantic_label, Int *ball_query_idxs, int *start_len, const Int nPoint, int threshold, ConnectedComponents &clusters){
+    // heap-allocated visited flags; a stack array of nPoint ints is a non-standard VLA and risks overflow for large clouds
+    std::vector<int> visited(nPoint, 0);
+
+    int sumNPoint = 0;
+    for(Int i = 0; i < nPoint; i++){
+        if(visited[i] == 0){
+            ConnectedComponent CC = find_cc(i, semantic_label, ball_query_idxs, start_len, visited.data());
+            if((int)CC.pt_idxs.size() >= threshold){
+                clusters.push_back(CC);
+                sumNPoint += (int)CC.pt_idxs.size();
+            }
+        }
+    }
+
+    return sumNPoint;
+}
+
+void fill_cluster_idxs_(ConnectedComponents &CCs, int *cluster_idxs, int *cluster_offsets){
+    for(int i = 0; i < (int)CCs.size(); i++){
+        cluster_offsets[i + 1] = cluster_offsets[i] + (int)CCs[i].pt_idxs.size();
+        for(int j = 0; j < (int)CCs[i].pt_idxs.size(); j++){
+            int idx = CCs[i].pt_idxs[j];
+            cluster_idxs[(cluster_offsets[i] + j) * 2 + 0] = i;
+            cluster_idxs[(cluster_offsets[i] + j) * 2 + 1] = idx;
+        }
+    }
+}
+
+//input: semantic_label, int, N
+//input: ball_query_idxs, int, (nActive)
+//input: start_len, int, (N, 2)
+//output: cluster_idxs, int (sumNPoint, 2), dim 0 for cluster_id, dim 1 for corresponding point idxs in N
+//output: cluster_offsets, int (nCluster + 1)
+void bfs_cluster(at::Tensor semantic_label_tensor, at::Tensor ball_query_idxs_tensor, at::Tensor start_len_tensor,
+at::Tensor cluster_idxs_tensor, at::Tensor cluster_offsets_tensor, const int N, int threshold){
+    int *semantic_label = semantic_label_tensor.data_ptr<int>();
+    Int *ball_query_idxs = ball_query_idxs_tensor.data_ptr<int>();
+    int *start_len = start_len_tensor.data_ptr<int>();
+
+    ConnectedComponents CCs;
+    int sumNPoint = get_clusters(semantic_label, ball_query_idxs, start_len, N, threshold, CCs);
+
+    int nCluster = (int)CCs.size();
+    cluster_idxs_tensor.resize_({sumNPoint, 2});
+    cluster_offsets_tensor.resize_({nCluster + 1});
+    cluster_idxs_tensor.zero_();
+    cluster_offsets_tensor.zero_();
+
+    int *cluster_idxs = cluster_idxs_tensor.data_ptr<int>();
+    int *cluster_offsets = cluster_offsets_tensor.data_ptr<int>();
+
+    fill_cluster_idxs_(CCs, cluster_idxs, cluster_offsets);
+}
+
+//------------------------------------API------------------------------------------
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
+
+    m.def("ballquery_batch_p", &ballquery_batch_p, "ballquery_batch_p");
+    m.def("bfs_cluster", &bfs_cluster, "bfs_cluster");
+
+}
diff --git a/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster_kernel.cu b/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster_kernel.cu
new file mode 100644
index 000000000..99a31842d
--- /dev/null
+++ b/projects/PTv3/libs/pointgroup_ops/src/bfs_cluster_kernel.cu
@@ -0,0 +1,91 @@
+/*
+Ball Query with BatchIdx
+Written by Li Jiang
+All Rights Reserved 2020.
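+The kernel below gathers, for each point, its in-radius neighbors within the same batch and packs them into the shared idx buffer through an atomicAdd running counter.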
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#define TOTAL_THREADS 1024
+#define THREADS_PER_BLOCK 512
+#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+
+
+/* ================================== ballquery_batch_p ================================== */
+__global__ void ballquery_batch_p_cuda_(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, int *cumsum) {
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pt_idx >= n) return;
+
+    start_len += (pt_idx * 2);
+    int idx_temp[1000];
+
+    float radius2 = radius * radius;
+    float o_x = xyz[pt_idx * 3 + 0];
+    float o_y = xyz[pt_idx * 3 + 1];
+    float o_z = xyz[pt_idx * 3 + 2];
+
+    int batch_idx = batch_idxs[pt_idx];
+    int start = batch_offsets[batch_idx];
+    int end = batch_offsets[batch_idx + 1];
+
+    int cnt = 0;
+    for(int k = start; k < end; k++){
+        float x = xyz[k * 3 + 0];
+        float y = xyz[k * 3 + 1];
+        float z = xyz[k * 3 + 2];
+        float d2 = (o_x - x) * (o_x - x) + (o_y - y) * (o_y - y) + (o_z - z) * (o_z - z);
+        if(d2 < radius2){
+            if(cnt < 1000){
+                idx_temp[cnt] = k;
+            }
+            else{
+                break;
+            }
+            ++cnt;
+        }
+    }
+
+    start_len[0] = atomicAdd(cumsum, cnt);
+    start_len[1] = cnt;
+
+    int thre = n * meanActive;
+    if(start_len[0] >= thre) return;
+
+    idx += start_len[0];
+    if(start_len[0] + cnt >= thre) cnt = thre - start_len[0];
+
+    for(int k = 0; k < cnt; k++){
+        idx[k] = idx_temp[k];
+    }
+}
+
+
+int ballquery_batch_p_cuda(int n, int meanActive, float radius, const float *xyz, const int *batch_idxs, const int *batch_offsets, int *idx, int *start_len, cudaStream_t stream) {
+    // param xyz: (n, 3)
+    // param batch_idxs: (n)
+    // param batch_offsets: (B + 1)
+    // output idx: (n * meanActive) dim 0 for number of points in the ball, idx in n
+    // output start_len: (n, 2), int
+
+    cudaError_t err;
+
+    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+
+    int cumsum = 0;
+    int* p_cumsum;
+    cudaMalloc((void**)&p_cumsum, sizeof(int));
+    cudaMemcpy(p_cumsum, &cumsum, sizeof(int), cudaMemcpyHostToDevice);
+
+    ballquery_batch_p_cuda_<<<blocks, threads, 0, stream>>>(n, meanActive, radius, xyz, batch_idxs, batch_offsets, idx, start_len, p_cumsum);
+
+    err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+        exit(-1);
+    }
+
+    cudaMemcpy(&cumsum, p_cumsum, sizeof(int), cudaMemcpyDeviceToHost);
+    cudaFree(p_cumsum);  // free the temporary device counter
+    return cumsum;
+}
diff --git a/projects/PTv3/libs/pointops/__init__.py b/projects/PTv3/libs/pointops/__init__.py
new file mode 100644
index 000000000..c8f754883
--- /dev/null
+++ b/projects/PTv3/libs/pointops/__init__.py
@@ -0,0 +1 @@
+from .functions import *
diff --git a/projects/PTv3/libs/pointops/functions/__init__.py b/projects/PTv3/libs/pointops/functions/__init__.py
new file mode 100644
index 000000000..1c20807f6
--- /dev/null
+++ b/projects/PTv3/libs/pointops/functions/__init__.py
@@ -0,0 +1,14 @@
+from .aggregation import aggregation
+from .attention import attention_fusion_step, attention_relation_step
+from .grouping import grouping, grouping2
+from .interpolation import interpolation, interpolation2
+from .query import ball_query, knn_query, random_ball_query
+from .sampling import farthest_point_sampling
+from .subtraction import subtraction
+from .utils import (
+    ball_query_and_group,
+    batch2offset,
+    knn_query_and_group,
+    offset2batch,
+    query_and_group,
+)
diff --git a/projects/PTv3/libs/pointops/functions/aggregation.py b/projects/PTv3/libs/pointops/functions/aggregation.py
new file mode 100644
index
000000000..8e7865b38 --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/aggregation.py @@ -0,0 +1,50 @@ +import torch +from pointops._C import aggregation_backward_cuda, aggregation_forward_cuda +from torch.autograd import Function + + +class Aggregation(Function): + @staticmethod + def forward(ctx, input, position, weight, idx): + """ + input: input: (n, c), position: (n, nsample, c), weight : (n, nsample, c'), idx: (n, nsample) + output: (n, c) + """ + assert input.is_contiguous() and position.is_contiguous() and weight.is_contiguous() + n, nsample, c = position.shape + w_c = weight.shape[-1] + output = torch.cuda.FloatTensor(n, c).zero_() + aggregation_forward_cuda(n, nsample, c, w_c, input, position, weight, idx, output) + ctx.save_for_backward(input, position, weight, idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, c) + output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight : (n, nsample, c') + """ + input, position, weight, idx = ctx.saved_tensors + n, nsample, c = position.shape + w_c = weight.shape[-1] + grad_input = torch.cuda.FloatTensor(n, c).zero_() + grad_position = torch.cuda.FloatTensor(n, nsample, c).zero_() + grad_weight = torch.cuda.FloatTensor(n, nsample, w_c).zero_() + aggregation_backward_cuda( + n, + nsample, + c, + w_c, + input, + position, + weight, + idx, + grad_output, + grad_input, + grad_position, + grad_weight, + ) + return grad_input, grad_position, grad_weight, None + + +aggregation = Aggregation.apply diff --git a/projects/PTv3/libs/pointops/functions/attention.py b/projects/PTv3/libs/pointops/functions/attention.py new file mode 100644 index 000000000..e063aefde --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/attention.py @@ -0,0 +1,117 @@ +import torch +from pointops._C import ( + attention_fusion_step_backward_cuda, + attention_fusion_step_forward_cuda, + attention_relation_step_backward_cuda, + attention_relation_step_forward_cuda, +) +from torch.autograd import Function + + +class AttentionRelationStep(Function): + @staticmethod + def forward(ctx, query, key, weight, index_target, index_refer): + """ + input - query: (n, g, c), key: (n, g, c), weight: (c) 1_c for scatter attention, + index_target: (m), index_refer: (m) + output - relation: (M, g) + """ + + assert ( + query.is_contiguous() + and key.is_contiguous() + and index_target.is_contiguous() + and index_refer.is_contiguous() + and weight.is_contiguous() + ) + + assert index_target.shape[0] == index_refer.shape[0] + + _, g, c = query.shape + m = index_target.shape[0] + output = torch.cuda.FloatTensor(m, g).zero_() + attention_relation_step_forward_cuda( + m, g, c, query, key, weight, index_target.int(), index_refer.int(), output + ) + ctx.save_for_backward(query, key, weight, index_target, index_refer) + return output + + @staticmethod + def backward(ctx, grad_output): + query, key, weight, index_target, index_refer = ctx.saved_tensors + n, g, c = query.shape + m = index_target.shape[0] + grad_query = torch.cuda.FloatTensor(n, g, c).zero_() + grad_key = torch.cuda.FloatTensor(n, g, c).zero_() + grad_weight = torch.cuda.FloatTensor(c).zero_() + attention_relation_step_backward_cuda( + m, + g, + c, + query, + grad_query, + key, + grad_key, + weight, + grad_weight, + index_target.int(), + index_refer.int(), + grad_output, + ) + return grad_query, grad_key, None, None, None + + +class AttentionFusionStep(Function): + @staticmethod + def forward(ctx, weight, value, index_target, index_refer): + """ + input - weight: 
(m, g), value: (n, g, c) + index_target: (m), index_value: (m) + output - output: (n, g, c) + """ + + assert ( + weight.is_contiguous() + and value.is_contiguous() + and index_target.is_contiguous() + and index_refer.is_contiguous() + and weight.is_contiguous() + ) + + assert index_target.shape[0] == index_refer.shape[0] + + n, g, c = value.shape + m = index_refer.shape[0] + output = torch.cuda.FloatTensor(n, g, c).zero_() + attention_fusion_step_forward_cuda(m, g, c, weight, value, index_target.int(), index_refer.int(), output) + ctx.save_for_backward(weight, value, index_target, index_refer) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: (n, g, c) + output: grad_weight: (m, g), grad_value: (n, g, c), none, none + """ + weight, value, index_target, index_refer = ctx.saved_tensors + n, g, c = value.shape + m = index_target.shape[0] + grad_weight = torch.cuda.FloatTensor(m, g).zero_() + grad_value = torch.cuda.FloatTensor(n, g, c).zero_() + attention_fusion_step_backward_cuda( + m, + g, + c, + weight, + grad_weight, + value, + grad_value, + index_target.int(), + index_refer.int(), + grad_output, + ) + return grad_weight, grad_value, None, None + + +attention_relation_step = AttentionRelationStep.apply +attention_fusion_step = AttentionFusionStep.apply diff --git a/projects/PTv3/libs/pointops/functions/grouping.py b/projects/PTv3/libs/pointops/functions/grouping.py new file mode 100644 index 000000000..5de89e443 --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/grouping.py @@ -0,0 +1,54 @@ +import torch +from pointops._C import grouping_backward_cuda, grouping_forward_cuda +from torch.autograd import Function + + +class Grouping(Function): + @staticmethod + def forward(ctx, input, idx): + """ + input: input: (n, c), idx : (m, nsample) + output: (m, nsample, c) + """ + assert input.is_contiguous() and idx.is_contiguous() + m, nsample, n, c = idx.shape[0], idx.shape[1], input.shape[0], input.shape[1] + output = torch.cuda.FloatTensor(m, nsample, c) + grouping_forward_cuda(m, nsample, c, input, idx, output) + ctx.n = n + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (m, c, nsample) + output: (n, c), None + """ + n = ctx.n + (idx,) = ctx.saved_tensors + m, nsample, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(n, c).zero_() + grouping_backward_cuda(m, nsample, c, grad_output, idx, grad_input) + return grad_input, None + + +def grouping(idx, feat, xyz, new_xyz=None, with_xyz=False): + if new_xyz is None: + new_xyz = xyz + assert xyz.is_contiguous() and feat.is_contiguous() + m, nsample, c = idx.shape[0], idx.shape[1], feat.shape[1] + xyz = torch.cat([xyz, torch.zeros([1, 3]).to(xyz.device)], dim=0) + feat = torch.cat([feat, torch.zeros([1, c]).to(feat.device)], dim=0) + grouped_feat = feat[idx.view(-1).long(), :].view(m, nsample, c) # (m, num_sample, c) + + if with_xyz: + assert new_xyz.is_contiguous() + mask = torch.sign(idx + 1) + grouped_xyz = xyz[idx.view(-1).long(), :].view(m, nsample, 3) - new_xyz.unsqueeze(1) # (m, num_sample, 3) + grouped_xyz = torch.einsum("n s c, n s -> n s c", grouped_xyz, mask) # (m, num_sample, 3) + return torch.cat((grouped_xyz, grouped_feat), -1) + else: + return grouped_feat + + +grouping2 = Grouping.apply diff --git a/projects/PTv3/libs/pointops/functions/interpolation.py b/projects/PTv3/libs/pointops/functions/interpolation.py new file mode 100644 index 000000000..9ba975ac0 --- /dev/null +++ 
b/projects/PTv3/libs/pointops/functions/interpolation.py @@ -0,0 +1,59 @@ +import torch +from pointops._C import interpolation_backward_cuda, interpolation_forward_cuda +from torch.autograd import Function + +from .query import knn_query + + +def interpolation(xyz, new_xyz, feat, offset, new_offset, k=3): + """ + input: coords: (m, 3), new_xyz: (n, 3), color: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + idx, dist = knn_query(k, xyz, offset, new_xyz, new_offset) # (n, 3), (n, 3) + dist_recip = 1.0 / (dist + 1e-8) # (n, 3) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, 3) + + new_feat = torch.cuda.FloatTensor(new_xyz.shape[0], feat.shape[1]).zero_() + for i in range(k): + new_feat += feat[idx[:, i].long(), :] * weight[:, i].unsqueeze(-1) + return new_feat + + +class Interpolation(Function): + @staticmethod + def forward(ctx, xyz, new_xyz, input, offset, new_offset, k=3): + """ + input: coords: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and input.is_contiguous() + idx, dist = knn_query(k, xyz, offset, new_xyz, new_offset) # (n, k), (n, k) + dist_recip = 1.0 / (dist + 1e-8) # (n, k) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, k) + + n, c, m = new_xyz.shape[0], input.shape[1], input.shape[0] + output = torch.cuda.FloatTensor(n, c).zero_() + interpolation_forward_cuda(n, c, k, input, idx, weight, output) + ctx.m, ctx.k = m, k + ctx.save_for_backward(idx, weight) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: coords: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + m, k = ctx.m, ctx.k + idx, weight = ctx.saved_tensors + n, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(m, c).zero_() + interpolation_backward_cuda(n, c, k, grad_output, idx, weight, grad_input) + return None, None, grad_input, None, None, None + + +interpolation2 = Interpolation.apply diff --git a/projects/PTv3/libs/pointops/functions/query.py b/projects/PTv3/libs/pointops/functions/query.py new file mode 100644 index 000000000..46577591d --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/query.py @@ -0,0 +1,104 @@ +import torch +from pointops._C import ball_query_cuda, knn_query_cuda, random_ball_query_cuda +from torch.autograd import Function + + +class KNNQuery(Function): + @staticmethod + def forward(ctx, nsample, xyz, offset, new_xyz=None, new_offset=None): + """ + input: coords: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) + output: idx: (m, nsample) -1 is placeholder, dist2: (m, nsample) + """ + if new_xyz is None or new_offset is None: + new_xyz = xyz + new_offset = offset + assert xyz.is_contiguous() and new_xyz.is_contiguous() + m = new_xyz.shape[0] + idx = torch.cuda.IntTensor(m, nsample).zero_() + dist2 = torch.cuda.FloatTensor(m, nsample).zero_() + knn_query_cuda(m, nsample, xyz, new_xyz, offset.int(), new_offset.int(), idx, dist2) + return idx, torch.sqrt(dist2) + + +class RandomBallQuery(Function): + """Random Ball Query. + + Find nearby points in spherical space. 
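+    Candidate neighbors are scanned in a per-batch random permutation (the order tensor built in forward), so the result is effectively a random subset when more than nsample points lie within the radius.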
+ """ + + @staticmethod + def forward(ctx, nsample, max_radius, min_radius, xyz, offset, new_xyz=None, new_offset=None): + """ + input: coords: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) + output: idx: (m, nsample), dist2: (m, nsample) + """ + if new_xyz is None or new_offset is None: + new_xyz = xyz + new_offset = offset + assert xyz.is_contiguous() and new_xyz.is_contiguous() + assert min_radius < max_radius + + m = new_xyz.shape[0] + order = [] + for k in range(offset.shape[0]): + s_k, e_k = (0, offset[0]) if k == 0 else (offset[k - 1], offset[k]) + order.append(torch.randperm(e_k - s_k, dtype=torch.int32, device=offset.device) + s_k) + order = torch.cat(order, dim=0) + idx = torch.cuda.IntTensor(m, nsample).zero_() + dist2 = torch.cuda.FloatTensor(m, nsample).zero_() + random_ball_query_cuda( + m, + nsample, + min_radius, + max_radius, + order, + xyz, + new_xyz, + offset.int(), + new_offset.int(), + idx, + dist2, + ) + return idx, torch.sqrt(dist2) + + +class BallQuery(Function): + """Ball Query. + + Find nearby points in spherical space. + """ + + @staticmethod + def forward(ctx, nsample, max_radius, min_radius, xyz, offset, new_xyz=None, new_offset=None): + """ + input: coords: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) + output: idx: (m, nsample), dist2: (m, nsample) + """ + if new_xyz is None or new_offset is None: + new_xyz = xyz + new_offset = offset + assert xyz.is_contiguous() and new_xyz.is_contiguous() + assert min_radius < max_radius + + m = new_xyz.shape[0] + idx = torch.cuda.IntTensor(m, nsample).zero_() + dist2 = torch.cuda.FloatTensor(m, nsample).zero_() + ball_query_cuda( + m, + nsample, + min_radius, + max_radius, + xyz, + new_xyz, + offset.int(), + new_offset.int(), + idx, + dist2, + ) + return idx, torch.sqrt(dist2) + + +knn_query = KNNQuery.apply +ball_query = BallQuery.apply +random_ball_query = RandomBallQuery.apply diff --git a/projects/PTv3/libs/pointops/functions/sampling.py b/projects/PTv3/libs/pointops/functions/sampling.py new file mode 100644 index 000000000..0ee83b51d --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/sampling.py @@ -0,0 +1,24 @@ +import torch +from pointops._C import farthest_point_sampling_cuda +from torch.autograd import Function + + +class FarthestPointSampling(Function): + @staticmethod + def forward(ctx, xyz, offset, new_offset): + """ + input: coords: (n, 3), offset: (b), new_offset: (b) + output: idx: (m) + """ + assert xyz.is_contiguous() + n, b, n_max = xyz.shape[0], offset.shape[0], offset[0] + for i in range(1, b): + n_max = max(offset[i] - offset[i - 1], n_max) + idx = torch.cuda.IntTensor(new_offset[b - 1].item()).zero_() + tmp = torch.cuda.FloatTensor(n).fill_(1e10) + farthest_point_sampling_cuda(b, n_max, xyz, offset.int(), new_offset.int(), tmp, idx) + del tmp + return idx + + +farthest_point_sampling = FarthestPointSampling.apply diff --git a/projects/PTv3/libs/pointops/functions/subtraction.py b/projects/PTv3/libs/pointops/functions/subtraction.py new file mode 100644 index 000000000..1633552c5 --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/subtraction.py @@ -0,0 +1,35 @@ +import torch +from pointops._C import subtraction_backward_cuda, subtraction_forward_cuda +from torch.autograd import Function + + +class Subtraction(Function): + @staticmethod + def forward(ctx, input1, input2, idx): + """ + input: input1: (n, c), input2: (n, c), idx: (n, nsample) + output: (n, nsample, c) + """ + assert input1.is_contiguous() and input2.is_contiguous() + n, c = input1.shape + nsample = 
idx.shape[-1] + output = torch.cuda.FloatTensor(n, nsample, c).zero_() + subtraction_forward_cuda(n, nsample, c, input1, input2, idx, output) + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, nsample, c) + output: grad_input1: (n, c), grad_input2: (n, c) + """ + (idx,) = ctx.saved_tensors + n, nsample, c = grad_output.shape + grad_input1 = torch.cuda.FloatTensor(n, c).zero_() + grad_input2 = torch.cuda.FloatTensor(n, c).zero_() + subtraction_backward_cuda(n, nsample, c, idx, grad_output, grad_input1, grad_input2) + return grad_input1, grad_input2, None + + +subtraction = Subtraction.apply diff --git a/projects/PTv3/libs/pointops/functions/utils.py b/projects/PTv3/libs/pointops/functions/utils.py new file mode 100644 index 000000000..0c51202cd --- /dev/null +++ b/projects/PTv3/libs/pointops/functions/utils.py @@ -0,0 +1,113 @@ +import torch +from pointops import ball_query, grouping, knn_query + + +def knn_query_and_group( + feat, + xyz, + offset=None, + new_xyz=None, + new_offset=None, + idx=None, + nsample=None, + with_xyz=False, +): + if idx is None: + assert nsample is not None + idx, _ = knn_query(nsample, xyz, offset, new_xyz, new_offset) + return grouping(idx, feat, xyz, new_xyz, with_xyz), idx + + +def ball_query_and_group( + feat, + xyz, + offset=None, + new_xyz=None, + new_offset=None, + idx=None, + max_radio=None, + min_radio=0, + nsample=None, + with_xyz=False, +): + if idx is None: + assert nsample is not None and offset is not None + assert max_radio is not None and min_radio is not None + idx, _ = ball_query(nsample, max_radio, min_radio, xyz, offset, new_xyz, new_offset) + return grouping(idx, feat, xyz, new_xyz, with_xyz), idx + + +def query_and_group( + nsample, + xyz, + new_xyz, + feat, + idx, + offset, + new_offset, + dilation=0, + with_feat=True, + with_xyz=True, +): + """ + input: coords: (n, 3), new_xyz: (m, 3), color: (n, c), idx: (m, nsample), offset: (b), new_offset: (b) + output: new_feat: (m, nsample, c+3), grouped_idx: (m, nsample) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + if new_xyz is None: + new_xyz = xyz + + if idx is None: + num_samples_total = 1 + (nsample - 1) * (dilation + 1) + # num points in a batch might < num_samples_total => [n1, n2, ..., nk, ns, ns, ns, ...] 
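+        # when a batch holds fewer points than nsample * (dilation + 1), a smaller "soft" dilation is derived below so the gathered indices stay inside that batch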
+        idx_no_dilation, _ = knn_query(num_samples_total, xyz, offset, new_xyz, new_offset)  # (m, nsample * (d + 1))
+        idx = []
+        batch_end = offset.tolist()
+        batch_start = [0] + batch_end[:-1]
+        new_batch_end = new_offset.tolist()
+        new_batch_start = [0] + new_batch_end[:-1]
+        for i in range(offset.shape[0]):
+            if batch_end[i] - batch_start[i] < num_samples_total:
+                soft_dilation = (batch_end[i] - batch_start[i] - 1) / (nsample - 1) - 1
+            else:
+                soft_dilation = dilation
+            idx.append(
+                idx_no_dilation[
+                    new_batch_start[i] : new_batch_end[i],
+                    [int((soft_dilation + 1) * i) for i in range(nsample)],
+                ]
+            )
+        idx = torch.cat(idx, dim=0)
+
+    if not with_feat:
+        return idx
+
+    n, m, c = xyz.shape[0], new_xyz.shape[0], feat.shape[1]
+    grouped_xyz = xyz[idx.view(-1).long(), :].view(m, nsample, 3)  # (m, nsample, 3)
+    # grouped_xyz = grouping(coords, idx)  # (m, nsample, 3)
+    grouped_xyz -= new_xyz.unsqueeze(1)  # (m, nsample, 3)
+    grouped_feat = feat[idx.view(-1).long(), :].view(m, nsample, c)  # (m, nsample, c)
+    # grouped_feat = grouping(color, idx)  # (m, nsample, c)
+
+    if with_xyz:
+        return torch.cat((grouped_xyz, grouped_feat), -1), idx  # (m, nsample, 3+c)
+    else:
+        return grouped_feat, idx
+
+
+def offset2batch(offset):
+    return (
+        torch.cat(
+            [
+                (torch.tensor([i] * (o - offset[i - 1])) if i > 0 else torch.tensor([i] * o))
+                for i, o in enumerate(offset)
+            ],
+            dim=0,
+        )
+        .long()
+        .to(offset.device)
+    )
+
+
+def batch2offset(batch):
+    return torch.cumsum(batch.bincount(), dim=0).int()
diff --git a/projects/PTv3/libs/pointops/pyproject.toml b/projects/PTv3/libs/pointops/pyproject.toml
new file mode 100644
index 000000000..7dafe1f34
--- /dev/null
+++ b/projects/PTv3/libs/pointops/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "pointops"
+version = "0.1.0"
+requires-python = "==3.10.*"
+dependencies = []
+
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["."]
+exclude = []
diff --git a/projects/PTv3/libs/pointops/setup.py b/projects/PTv3/libs/pointops/setup.py
new file mode 100644
index 000000000..bdd4e4cef
--- /dev/null
+++ b/projects/PTv3/libs/pointops/setup.py
@@ -0,0 +1,32 @@
+import os
+from distutils.sysconfig import get_config_vars
+
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+(opt,) = get_config_vars("OPT")
+os.environ["OPT"] = " ".join(flag for flag in opt.split() if flag != "-Wstrict-prototypes")
+
+src = "src"
+sources = [
+    os.path.join(root, file)
+    for root, dirs, files in os.walk(src)
+    for file in files
+    if file.endswith(".cpp") or file.endswith(".cu")
+]
+
+setup(
+    name="pointops",
+    version="1.0",
+    install_requires=["torch", "numpy"],
+    packages=["pointops"],
+    package_dir={"pointops": "functions"},
+    ext_modules=[
+        CUDAExtension(
+            name="pointops._C",
+            sources=sources,
+            extra_compile_args={"cxx": ["-g"], "nvcc": ["-O2"]},
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/projects/PTv3/libs/pointops/src/__init__.py b/projects/PTv3/libs/pointops/src/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda.cpp b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda.cpp
new file mode 100644
index 000000000..491b6f416
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda.cpp
@@ -0,0 +1,28 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+#include "aggregation_cuda_kernel.h"
+
+
+void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
+{
+    const float *input = input_tensor.data_ptr<float>();
+    const float *position = position_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    aggregation_forward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, output);
+}
+
+void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor)
+{
+    const float *input = input_tensor.data_ptr<float>();
+    const float *position = position_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    float *grad_input = grad_input_tensor.data_ptr<float>();
+    float *grad_position = grad_position_tensor.data_ptr<float>();
+    float *grad_weight = grad_weight_tensor.data_ptr<float>();
+    aggregation_backward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight);
+}
diff --git a/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.cu
new file mode 100644
index 000000000..dfeecb509
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.cu
@@ -0,0 +1,53 @@
+#include "../cuda_utils.h"
+#include "aggregation_cuda_kernel.h"
+
+
+__global__ void aggregation_forward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) {
+    // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    const int c_idx = index % c;
+    const int n_idx = index / c;
+    const int w_c_idx = c_idx % w_c;
+    for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++)
+    {
+        int idx_idx = n_idx * nsample + nsample_idx;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx;
+        int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx;
+        output[index] += (input[input_idx] + position[position_idx]) * weight[weight_idx];
+    }
+}
+
+__global__ void aggregation_backward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) {
+    // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    const int c_idx = index % c;
+    const int n_idx = index / c;
+    const int w_c_idx = c_idx % w_c;
+    for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++)
+    {
+        int idx_idx = n_idx * nsample + nsample_idx;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx;
+        int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx;
+        atomicAdd(grad_input + input_idx, grad_output[index] * weight[weight_idx]);
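+        // grad_position is written exactly once per (n_idx, nsample_idx, c_idx) slot by this thread, so no atomicAdd is needed here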
+        grad_position[position_idx] = grad_output[index] * weight[weight_idx];
+        atomicAdd(grad_weight + weight_idx, grad_output[index] * (input[input_idx] + position[position_idx]));
+    }
+}
+
+void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) {
+    // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    aggregation_forward_cuda_kernel<<<blocks, threads, 0>>>(n, nsample, c, w_c, input, position, weight, idx, output);
+}
+
+void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) {
+    // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    aggregation_backward_cuda_kernel<<<blocks, threads, 0>>>(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight);
+}
diff --git a/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.h b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.h
new file mode 100644
index 000000000..5211a96aa
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/aggregation/aggregation_cuda_kernel.h
@@ -0,0 +1,20 @@
+#ifndef _AGGREGATION_CUDA_KERNEL
+#define _AGGREGATION_CUDA_KERNEL
+#include <ATen/ATen.h>
+#include <torch/serialize/tensor.h>
+#include <vector>
+
+void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
+void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output);
+void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp b/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp
new file mode 100644
index 000000000..79b90c7eb
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp
@@ -0,0 +1,76 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+#include "attention_cuda_kernel.h"
+
+
+void attention_relation_step_forward_cuda(int m, int g, int c,
+                                          at::Tensor query_tensor, at::Tensor key_tensor, at::Tensor weight_tensor,
+                                          at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+                                          at::Tensor output_tensor)
+{
+    const float *query = query_tensor.data_ptr<float>();
+    const float *key = key_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
diff --git a/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp b/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp
new file mode 100644
index 000000000..79b90c7eb
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/attention/attention_cuda.cpp
@@ -0,0 +1,76 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "attention_cuda_kernel.h"
+
+
+void attention_relation_step_forward_cuda(int m, int g, int c,
+    at::Tensor query_tensor, at::Tensor key_tensor, at::Tensor weight_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor output_tensor)
+{
+    const float *query = query_tensor.data_ptr<float>();
+    const float *key = key_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    attention_relation_step_forward_cuda_launcher(m, g, c, query, key, weight, index_target, index_refer, output);
+}
+
+void attention_relation_step_backward_cuda(int m, int g, int c,
+    at::Tensor query_tensor, at::Tensor grad_query_tensor,
+    at::Tensor key_tensor, at::Tensor grad_key_tensor,
+    at::Tensor weight_tensor, at::Tensor grad_weight_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor grad_output_tensor)
+{
+    const float *query = query_tensor.data_ptr<float>();
+    float *grad_query = grad_query_tensor.data_ptr<float>();
+    const float *key = key_tensor.data_ptr<float>();
+    float *grad_key = grad_key_tensor.data_ptr<float>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *grad_weight = grad_weight_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    attention_relation_step_backward_cuda_launcher(m, g, c,
+        query, grad_query,
+        key, grad_key,
+        weight, grad_weight,
+        index_target, index_refer, grad_output);
+}
+
+
+void attention_fusion_step_forward_cuda(int m, int g, int c,
+    at::Tensor weight_tensor, at::Tensor value_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor output_tensor)
+{
+    const float *weight = weight_tensor.data_ptr<float>();
+    const float *value = value_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    attention_fusion_step_forward_cuda_launcher(m, g, c, weight, value, index_target, index_refer, output);
+}
+
+
+void attention_fusion_step_backward_cuda(int m, int g, int c,
+    at::Tensor weight_tensor, at::Tensor grad_weight_tensor,
+    at::Tensor value_tensor, at::Tensor grad_value_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor grad_output_tensor)
+{
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *grad_weight = grad_weight_tensor.data_ptr<float>();
+    const float *value = value_tensor.data_ptr<float>();
+    float *grad_value = grad_value_tensor.data_ptr<float>();
+    const int *index_target = index_target_tensor.data_ptr<int>();
+    const int *index_refer = index_refer_tensor.data_ptr<int>();
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    attention_fusion_step_backward_cuda_launcher(m, g, c,
+        weight, grad_weight,
+        value, grad_value,
+        index_target, index_refer, grad_output);
+}
diff --git a/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.cu
new file mode 100644
index 000000000..747db4d37
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.cu
@@ -0,0 +1,147 @@
+#include "../cuda_utils.h"
+#include "attention_cuda_kernel.h"
+
+
+/*
+Kernels
+*/
+
+__global__ void attention_relation_step_forward_cuda_kernel(int m, int g, int c,
+    const float *query, const float *key, const float *weight,
+    const int *index_target, const int *index_refer,
+    float *output)
+{
+    int r_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int g_idx = blockIdx.y;
+    int c_idx = blockIdx.z;
+
+    if (r_idx >= m || g_idx >= g || c_idx >= c) return;
+    int q_idx = index_target[r_idx] * g * c + g_idx * c + c_idx;
+    int k_idx = index_refer[r_idx] * g * c + g_idx * c + c_idx;
+
+    float r = query[q_idx] * key[k_idx] * weight[c_idx];
+    atomicAdd(output + r_idx * g + g_idx, r);
+}
+
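+// Backward of the relation step: the three atomicAdds distribute grad_output
+// over query, key and the shared per-channel weight; grad_weight in particular
+// accumulates contributions from all m relations, hence the atomics.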
+__global__ void attention_relation_step_backward_cuda_kernel(int m, int g, int c,
+    const float *query, float *grad_query,
+    const float *key, float *grad_key,
+    const float *weight, float *grad_weight,
+    const int *index_target, const int *index_refer,
+    const float *grad_output)
+{
+    int r_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int g_idx = blockIdx.y;
+    int c_idx = blockIdx.z;
+
+    if (r_idx >= m || g_idx >= g || c_idx >= c) return;
+
+    int q_idx = index_target[r_idx] * g * c + g_idx * c + c_idx;
+    int k_idx = index_refer[r_idx] * g * c + g_idx * c + c_idx;
+    int o_idx = r_idx * g + g_idx;
+    float grad_r = grad_output[o_idx];
+    atomicAdd(grad_query + q_idx, grad_r * key[k_idx] * weight[c_idx]);
+    atomicAdd(grad_key + k_idx, grad_r * query[q_idx] * weight[c_idx]);
+    atomicAdd(grad_weight + c_idx, grad_r * key[k_idx] * query[q_idx]);
+}
+
+
+__global__ void attention_fusion_step_forward_cuda_kernel(int m, int g, int c,
+    const float *weight, const float *value,
+    const int *index_target, const int *index_refer,
+    float *output)
+{
+    int r_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int g_idx = blockIdx.y;
+    int c_idx = blockIdx.z;
+
+    if (r_idx >= m || g_idx >= g || c_idx >= c) return;
+
+    int o_idx = index_target[r_idx] * g * c + g_idx * c + c_idx;
+    int v_idx = index_refer[r_idx] * g * c + g_idx * c + c_idx;
+
+    float f = weight[r_idx * g + g_idx] * value[v_idx];
+    atomicAdd(output + o_idx, f);
+}
+
+
+__global__ void attention_fusion_step_backward_cuda_kernel(int m, int g, int c,
+    const float *weight, float *grad_weight,
+    const float *value, float *grad_value,
+    const int *index_target, const int *index_refer,
+    const float *grad_output)
+{
+    int r_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int g_idx = blockIdx.y;
+    int c_idx = blockIdx.z;
+
+    if (r_idx >= m || g_idx >= g || c_idx >= c) return;
+
+    int o_idx = index_target[r_idx] * g * c + g_idx * c + c_idx;
+    int v_idx = index_refer[r_idx] * g * c + g_idx * c + c_idx;
+    int w_idx = r_idx * g + g_idx;
+    float grad = grad_output[o_idx];
+    atomicAdd(grad_weight + w_idx, grad * value[v_idx]);
+    atomicAdd(grad_value + v_idx, grad * weight[w_idx]);
+}
+
+/*
+Launchers
+*/
+
+
+void attention_relation_step_forward_cuda_launcher(int m, int g, int c,
+    const float *query, const float *key, const float *weight,
+    const int *index_target, const int *index_refer,
+    float *output)
+{
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), g, c);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_relation_step_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, g, c, query, key, weight,
+        index_target, index_refer, output);
+}
+
+void attention_relation_step_backward_cuda_launcher(int m, int g, int c,
+    const float *query, float *grad_query,
+    const float *key, float *grad_key,
+    const float *weight, float *grad_weight,
+    const int *index_target, const int *index_refer,
+    const float *grad_output)
+{
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), g, c);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_relation_step_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, g, c,
+        query, grad_query,
+        key, grad_key,
+        weight, grad_weight,
+        index_target, index_refer,
+        grad_output);
+}
+
+
+void attention_fusion_step_forward_cuda_launcher(int m, int g, int c,
+    const float *weight, const float *value,
+    const int *index_target, const int *index_refer,
+    float *output)
+{
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), g, c);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_fusion_step_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, g, c, weight, value,
+        index_target, index_refer, output);
+}
+
+
+void attention_fusion_step_backward_cuda_launcher(int m, int g, int c,
+    const float *weight, float *grad_weight,
+    const float *value, float *grad_value,
+    const int *index_target, const int *index_refer,
+    const float *grad_output)
+{
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), g, c);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_fusion_step_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, g, c,
+        weight, grad_weight,
+        value, grad_value,
+        index_target, index_refer,
+        grad_output);
+}
diff --git a/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.h b/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.h
new file mode 100644
index 000000000..fec965c04
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/attention/attention_cuda_kernel.h
@@ -0,0 +1,54 @@
+#ifndef _ATTENTION_CUDA_KERNEL
+#define _ATTENTION_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void attention_relation_step_forward_cuda(int m, int g, int c,
+    at::Tensor query_tensor, at::Tensor key_tensor, at::Tensor weight_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor output_tensor);
+void attention_relation_step_backward_cuda(int m, int g, int c,
+    at::Tensor query_tensor, at::Tensor grad_query_tensor,
+    at::Tensor key_tensor, at::Tensor grad_key_tensor,
+    at::Tensor weight_tensor, at::Tensor grad_weight_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor grad_output_tensor);
+void attention_fusion_step_forward_cuda(int m, int g, int c,
+    at::Tensor weight_tensor, at::Tensor value_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor output_tensor);
+void attention_fusion_step_backward_cuda(int m, int g, int c,
+    at::Tensor weight_tensor, at::Tensor grad_weight_tensor,
+    at::Tensor value_tensor, at::Tensor grad_value_tensor,
+    at::Tensor index_target_tensor, at::Tensor index_refer_tensor,
+    at::Tensor grad_output_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void attention_relation_step_forward_cuda_launcher(int m, int g, int c,
+    const float *query, const float *key, const float *weight,
+    const int *index_target, const int *index_refer,
+    float *output);
+void attention_relation_step_backward_cuda_launcher(int m, int g, int c,
+    const float *query, float *grad_query,
+    const float *key, float *grad_key,
+    const float *weight, float *grad_weight,
+    const int *index_target, const int *index_refer,
+    const float *grad_output);
+void attention_fusion_step_forward_cuda_launcher(int m, int g, int c,
+    const float *weight, const float *value,
+    const int *index_target, const int *index_refer,
+    float *output);
+void attention_fusion_step_backward_cuda_launcher(int m, int g, int c,
+    const float *weight, float *grad_weight,
+    const float *value, float *grad_value,
+    const int *index_target, const int *index_refer,
+    const float *grad_output);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
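Taken together, the two steps implement a gathered dot-product attention over explicit (target, refer) index pairs. A rough PyTorch equivalent, purely illustrative — the softmax over each target's relation set happens outside these kernels, so `normalize_per_target` below is a hypothetical helper, and long-typed index tensors are assumed:

    # relation step: r[i, g] = sum_c q[tgt[i], g, c] * k[ref[i], g, c] * w[c]
    r = (q[tgt] * k[ref] * w.view(1, 1, -1)).sum(-1)      # (m, g)
    a = normalize_per_target(r, tgt)                      # grouped softmax (not shown)
    # fusion step: out[tgt[i], g, :] += a[i, g] * v[ref[i], g, :]
    out = torch.zeros_like(v).index_add_(0, tgt, a.unsqueeze(-1) * v[ref])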
diff --git a/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda.cpp b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda.cpp
new file mode 100644
index 000000000..04cd5ff9e
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda.cpp
@@ -0,0 +1,20 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "ball_query_cuda_kernel.h"
+
+
+void ball_query_cuda(int m, int nsample,
+    float min_radius, float max_radius,
+    at::Tensor xyz_tensor, at::Tensor new_xyz_tensor,
+    at::Tensor offset_tensor, at::Tensor new_offset_tensor,
+    at::Tensor idx_tensor, at::Tensor dist2_tensor)
+{
+    const float *xyz = xyz_tensor.data_ptr<float>();
+    const float *new_xyz = new_xyz_tensor.data_ptr<float>();
+    const int *offset = offset_tensor.data_ptr<int>();
+    const int *new_offset = new_offset_tensor.data_ptr<int>();
+    int *idx = idx_tensor.data_ptr<int>();
+    float *dist2 = dist2_tensor.data_ptr<float>();
+    ball_query_cuda_launcher(m, nsample, min_radius, max_radius, xyz, new_xyz, offset, new_offset, idx, dist2);
+}
diff --git a/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.cu
new file mode 100644
index 000000000..7b3d95a98
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.cu
@@ -0,0 +1,190 @@
+#include "../cuda_utils.h"
+#include "ball_query_cuda_kernel.h"
+
+
+namespace ball_query_utils{
+
+template <typename DType>
+__device__ void swap(DType *x, DType *y)
+{
+    DType tmp = *x;
+    *x = *y;
+    *y = tmp;
+}
+
+__device__ void reheap(float *dist, int *idx, int k)
+{
+    int root = 0;
+    int child = root * 2 + 1;
+    while (child < k)
+    {
+        if(child + 1 < k && dist[child+1] > dist[child])
+            child++;
+        if(dist[root] > dist[child])
+            return;
+        swap(&dist[root], &dist[child]);
+        swap(&idx[root], &idx[child]);
+        root = child;
+        child = root * 2 + 1;
+    }
+}
+
+
+__device__ void heap_sort(float *dist, int *idx, int k)
+{
+    int i;
+    for (i = k - 1; i > 0; i--)
+    {
+        swap(&dist[0], &dist[i]);
+        swap(&idx[0], &idx[i]);
+        reheap(dist, idx, i);
+    }
+}
+
+__device__ int get_bt_idx(int idx, const int *offset)
+{
+    int i = 0;
+    while (1)
+    {
+        if (idx < offset[i])
+            break;
+        else
+            i++;
+    }
+    return i;
+}
+} // namespace ball_query_utils
+
+__global__ void ball_query_cuda_kernel(int m, int nsample,
+    float min_radius, float max_radius,
+    const float *__restrict__ xyz, const float *__restrict__ new_xyz,
+    const int *__restrict__ offset, const int *__restrict__ new_offset,
+    int *__restrict__ idx, float *__restrict__ dist2) {
+    // input: xyz (n, 3) new_xyz (m, 3)
+    // output: idx (m, nsample) dist (m, nsample)
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pt_idx >= m) return;
+
+    new_xyz += pt_idx * 3;
+    idx += pt_idx * nsample;
+    dist2 += pt_idx * nsample;
+
+    int bt_idx = ball_query_utils::get_bt_idx(pt_idx, new_offset);
+    int start;
+    if (bt_idx == 0)
+        start = 0;
+    else
+        start = offset[bt_idx - 1];
+    int end = offset[bt_idx];
+
+    float max_radius2 = max_radius * max_radius;
+    float min_radius2 = min_radius * min_radius;
+    float new_x = new_xyz[0];
+    float new_y = new_xyz[1];
+    float new_z = new_xyz[2];
+
+    float candi_dist[2048];
+    int candi_idx[2048];
+    int candi_num = 0;
+
+    for(int i = start; i < end; i++){
+        float x = xyz[i * 3 + 0];
+        float y = xyz[i * 3 + 1];
+        float z = xyz[i * 3 + 2];
+        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+
+        if (d2 <= 1e-5 || (d2 >= min_radius2 && d2 < max_radius2)){
+            // TODO: Check d2 <= 1e-5
+            candi_dist[candi_num] = d2;
+            candi_idx[candi_num] = i;
+            candi_num += 1;
+        }
+    }
+    ball_query_utils::heap_sort(candi_dist, candi_idx, candi_num);
+    if(candi_num <= nsample){
+        for(int i = 0; i < candi_num; i++){
+            idx[i] = candi_idx[i];
+            dist2[i] = candi_dist[i];
+        }
+        for(int i = candi_num; i < nsample; i++){
+            idx[i] = -1;
+            dist2[i] = 1e10;
+        }
+    }
+    else{
+        float sep = static_cast<float>(candi_num) / nsample;
+        for(int i = 0; i < nsample; i++)
+        {
+            int index = static_cast<int>(sep * i);
+            idx[i] = candi_idx[index];
+            dist2[i] = candi_dist[index];
+        }
+    }
+}
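+// NOTE: candi_dist / candi_idx above are fixed 2048-entry stack arrays. A query
+// ball containing more than 2048 candidates would overflow them, so callers are
+// assumed to keep per-query candidate counts under that cap.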
+
+/* Random Sample Mode Ball Query */
+
+// __global__ void ball_query_cuda_kernel(int m, int nsample,
+//     float min_radius, float max_radius,
+//     const float *__restrict__ xyz, const float *__restrict__ new_xyz,
+//     const int *__restrict__ offset, const int *__restrict__ new_offset,
+//     int *__restrict__ idx, float *__restrict__ dist2) {
+//     // input: xyz (n, 3) new_xyz (m, 3)
+//     // output: idx (m, nsample) dist (m, nsample)
+//     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (pt_idx >= m) return;
+//
+//     new_xyz += pt_idx * 3;
+//     idx += pt_idx * nsample;
+//     dist2 += pt_idx * nsample;
+//
+//     int bt_idx = ball_query_utils::get_bt_idx(pt_idx, new_offset);
+//     int start;
+//     if (bt_idx == 0)
+//         start = 0;
+//     else
+//         start = offset[bt_idx - 1];
+//     int end = offset[bt_idx];
+//
+//     float max_radius2 = max_radius * max_radius;
+//     float min_radius2 = min_radius * min_radius;
+//     float new_x = new_xyz[0];
+//     float new_y = new_xyz[1];
+//     float new_z = new_xyz[2];
+//
+//     int cnt = 0;
+//     for(int i = start; i < end; i++){
+//         float x = xyz[i * 3 + 0];
+//         float y = xyz[i * 3 + 1];
+//         float z = xyz[i * 3 + 2];
+//         float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+//
+//         if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
+//             if (cnt == 0) {
+//                 for (int l = 0; l < nsample; ++l) {
+//                     idx[l] = i;
+//                     dist2[l] = d2;
+//                 }
+//             }
+//             idx[cnt] = i;
+//             ++cnt;
+//             if (cnt >= nsample) break;
+//         }
+//     }
+// }
+
+
+void ball_query_cuda_launcher(int m, int nsample,
+    float min_radius, float max_radius,
+    const float *xyz, const float *new_xyz,
+    const int *offset, const int *new_offset,
+    int *idx, float *dist2) {
+    // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample)
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    ball_query_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample,
+        min_radius, max_radius,
+        xyz, new_xyz,
+        offset, new_offset,
+        idx, dist2);
+}
diff --git a/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.h b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.h
new file mode 100644
index 000000000..03007a285
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/ball_query/ball_query_cuda_kernel.h
@@ -0,0 +1,26 @@
+#ifndef _BALL_QUERY_CUDA_KERNEL
+#define _BALL_QUERY_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void ball_query_cuda(int m, int nsample,
+    float min_radius, float max_radius,
+    at::Tensor xyz_tensor, at::Tensor new_xyz_tensor,
+    at::Tensor offset_tensor, at::Tensor new_offset_tensor,
+    at::Tensor idx_tensor, at::Tensor dist2_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ball_query_cuda_launcher(int m, int nsample,
+    float min_radius, float max_radius,
+    const float *xyz, const float *new_xyz,
+    const int *offset, const int *new_offset,
+    int *idx, float *dist2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/cuda_utils.h b/projects/PTv3/libs/pointops/src/cuda_utils.h
new file mode 100644
index 000000000..bbfe7a06b
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/cuda_utils.h
@@ -0,0 +1,23 @@
+#ifndef _CUDA_UTILS_H
+#define _CUDA_UTILS_H
+
+#include <cmath>
+#include <algorithm>
+
+#define TOTAL_THREADS 1024
+#define THREADS_PER_BLOCK 512
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+inline int opt_n_threads(int work_size) {
+    const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
+    return std::max(std::min(1 << pow_2, TOTAL_THREADS), 1);
+}
+
+inline dim3 opt_block_config(int x, int y) {
+    const int x_threads = opt_n_threads(x);
+    const int y_threads = std::max(std::min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
+    dim3 block_config(x_threads, y_threads, 1);
+    return block_config;
+}
+
+#endif
diff --git a/projects/PTv3/libs/pointops/src/grouping/grouping_cuda.cpp b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda.cpp
new file mode 100644
index 000000000..6f7990ada
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda.cpp
@@ -0,0 +1,21 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "grouping_cuda_kernel.h"
+
+
+void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
+{
+    const float *input = input_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    grouping_forward_cuda_launcher(m, nsample, c, input, idx, output);
+}
+
+void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor)
+{
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    float *grad_input = grad_input_tensor.data_ptr<float>();
+    grouping_backward_cuda_launcher(m, nsample, c, grad_output, idx, grad_input);
+}
diff --git a/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.cu
new file mode 100644
index 000000000..a5fbd78fa
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.cu
@@ -0,0 +1,40 @@
+#include "../cuda_utils.h"
+#include "grouping_cuda_kernel.h"
+
+
+__global__ void grouping_forward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ input, const int *__restrict__ idx, float *__restrict__ output) {
+    // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= m * nsample * c) return;
+    const int c_idx = index % c;
+    const int nsample_idx = (index / c) % nsample;
+    const int m_idx = index / nsample / c;
+    const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx;
+    output[index] = input[input_idx];
+}
+
+__global__ void grouping_backward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ grad_output, const int *__restrict__ idx, float *__restrict__ grad_input) {
+    // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= m * nsample * c) return;
+    const int c_idx = index % c;
+    const int nsample_idx = (index / c) % nsample;
+    const int m_idx = index / nsample / c;
+    const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx;
+    atomicAdd(grad_input + input_idx, grad_output[index]);
+}
+
+void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output) {
+    // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c)
+    dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    grouping_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample, c, input, idx, output);
+}
+
+void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input)
+{
+    // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c)
+    dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    grouping_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample, c, grad_output, idx, grad_input);
+}
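A pure-PyTorch reference for the grouping pair above — a gather in the forward pass and a scatter-add in the backward pass. Illustrative sketch only, assuming long-typed indices:

    out = input[idx.long()]                            # (m, nsample, c)
    grad_input = torch.zeros_like(input).index_add_(   # (n, c)
        0, idx.reshape(-1).long(), grad_out.reshape(-1, c)
    )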
diff --git a/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.h b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.h
new file mode 100644
index 000000000..3db4aaa9f
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/grouping/grouping_cuda_kernel.h
@@ -0,0 +1,20 @@
+#ifndef _GROUPING_CUDA_KERNEL
+#define _GROUPING_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
+void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output);
+void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda.cpp b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda.cpp
new file mode 100644
index 000000000..f2c1b0078
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda.cpp
@@ -0,0 +1,23 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "interpolation_cuda_kernel.h"
+
+
+void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor)
+{
+    const float *input = input_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *output = output_tensor.data_ptr<float>();
+    interpolation_forward_cuda_launcher(n, c, k, input, idx, weight, output);
+}
+
+void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor)
+{
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *grad_input = grad_input_tensor.data_ptr<float>();
+    interpolation_backward_cuda_launcher(n, c, k, grad_output, idx, weight, grad_input);
+}
diff --git a/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.cu
new file mode 100644
index 000000000..f560d8c92
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.cu
@@ -0,0 +1,47 @@
+#include "../cuda_utils.h"
+#include "interpolation_cuda_kernel.h"
+
+
+__global__ void interpolation_forward_cuda_kernel(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output)
+{
+    // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    int c_idx = index % c;
+    int n_idx = index / c;
+    for (int i = 0; i < k; i++)
+    {
+        int idx_idx = n_idx * k + i;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        output[index] += input[input_idx] * weight[idx_idx];
+    }
+}
+
+__global__ void interpolation_backward_cuda_kernel(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input)
+{
+    // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    int c_idx = index % c;
+    int n_idx = index / c;
+    for (int i = 0; i < k; i++)
+    {
+        int idx_idx = n_idx * k + i;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        atomicAdd(grad_input + input_idx, grad_output[index] * weight[idx_idx]);
+    }
+}
+
+void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) {
+    // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    interpolation_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, c, k, input, idx, weight, output);
+}
+
+void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) {
+    // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    interpolation_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, c, k, grad_output, idx, weight, grad_input);
+}
diff --git a/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.h b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.h
new file mode 100644
index 000000000..309e5dd0a
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/interpolation/interpolation_cuda_kernel.h
@@ -0,0 +1,20 @@
+#ifndef _INTERPOLATION_CUDA_KERNEL
+#define _INTERPOLATION_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor);
+void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output);
+void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda.cpp b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda.cpp
new file mode 100644
index 000000000..bbe841ce0
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda.cpp
@@ -0,0 +1,16 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "knn_query_cuda_kernel.h"
+
+
+void knn_query_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
+{
+    const float *xyz = xyz_tensor.data_ptr<float>();
+    const float *new_xyz = new_xyz_tensor.data_ptr<float>();
+    const int *offset = offset_tensor.data_ptr<int>();
+    const int *new_offset = new_offset_tensor.data_ptr<int>();
+    int *idx = idx_tensor.data_ptr<int>();
+    float *dist2 = dist2_tensor.data_ptr<float>();
+    knn_query_cuda_launcher(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
+}
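For intuition, the kernel in the next file is an offset-batched k-nearest-neighbour search with a per-thread max-heap. Ignoring the batching, it matches this dense PyTorch sketch (illustrative only, and quadratic in memory):

    d2 = torch.cdist(new_xyz, xyz).pow(2)                # (m, n) squared distances
    dist2, idx = d2.topk(nsample, dim=1, largest=False)  # ascending, like heap_sort's output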
diff --git a/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.cu
new file mode 100644
index 000000000..297740237
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.cu
@@ -0,0 +1,112 @@
+#include "../cuda_utils.h"
+#include "knn_query_cuda_kernel.h"
+
+
+namespace knn_query_utils{
+
+template <typename DType>
+__device__ void swap(DType *x, DType *y)
+{
+    DType tmp = *x;
+    *x = *y;
+    *y = tmp;
+}
+
+__device__ void reheap(float *dist, int *idx, int k)
+{
+    int root = 0;
+    int child = root * 2 + 1;
+    while (child < k)
+    {
+        if(child + 1 < k && dist[child+1] > dist[child])
+            child++;
+        if(dist[root] > dist[child])
+            return;
+        swap(&dist[root], &dist[child]);
+        swap(&idx[root], &idx[child]);
+        root = child;
+        child = root * 2 + 1;
+    }
+}
+
+
+__device__ void heap_sort(float *dist, int *idx, int k)
+{
+    int i;
+    for (i = k - 1; i > 0; i--)
+    {
+        swap(&dist[0], &dist[i]);
+        swap(&idx[0], &idx[i]);
+        reheap(dist, idx, i);
+    }
+}
+
+
+__device__ int get_bt_idx(int idx, const int *offset)
+{
+    int i = 0;
+    while (1)
+    {
+        if (idx < offset[i])
+            break;
+        else
+            i++;
+    }
+    return i;
+}
+} // namespace knn_query_utils
+
+
+__global__ void knn_query_cuda_kernel(int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, const int *__restrict__ offset, const int *__restrict__ new_offset, int *__restrict__ idx, float *__restrict__ dist2) {
+    // input: xyz (n, 3) new_xyz (m, 3)
+    // output: idx (m, nsample) dist2 (m, nsample)
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pt_idx >= m) return;
+
+    new_xyz += pt_idx * 3;
+    idx += pt_idx * nsample;
+    dist2 += pt_idx * nsample;
+
+    int bt_idx = knn_query_utils::get_bt_idx(pt_idx, new_offset);
+    int start;
+    if (bt_idx == 0)
+        start = 0;
+    else
+        start = offset[bt_idx - 1];
+    int end = offset[bt_idx];
+
+    float new_x = new_xyz[0];
+    float new_y = new_xyz[1];
+    float new_z = new_xyz[2];
+
+    float best_dist[128];
+    int best_idx[128];
+    for(int i = 0; i < nsample; i++){
+        best_dist[i] = 1e10;
+        best_idx[i] = -1;
+    }
+    for(int i = start; i < end; i++){
+        float x = xyz[i * 3 + 0];
+        float y = xyz[i * 3 + 1];
+        float z = xyz[i * 3 + 2];
+        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+        if (d2 < best_dist[0]){
+            best_dist[0] = d2;
+            best_idx[0] = i;
+            knn_query_utils::reheap(best_dist, best_idx, nsample);
+        }
+    }
+    knn_query_utils::heap_sort(best_dist, best_idx, nsample);
+    for(int i = 0; i < nsample; i++){
+        idx[i] = best_idx[i];
+        dist2[i] = best_dist[i];
+    }
+}
+
+
+void knn_query_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2) {
+    // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample)
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    knn_query_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
+}
diff --git a/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.h b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.h
new file mode 100644
index 000000000..c07c1cb46
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/knn_query/knn_query_cuda_kernel.h
@@ -0,0 +1,18 @@
+#ifndef _KNN_QUERY_CUDA_KERNEL
+#define _KNN_QUERY_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void knn_query_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void knn_query_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
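A sketch of how the ops bound in the next file are reached from Python once the extension is built. The module name comes from the extension's build setup, which is not shown here, so treat `pointops_cuda` as an assumed name; shapes follow the kernel comments:

    import torch
    import pointops_cuda  # assumed extension name

    idx = torch.zeros(m, nsample, dtype=torch.int32, device="cuda")
    dist2 = torch.zeros(m, nsample, device="cuda")
    pointops_cuda.knn_query_cuda(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2)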
"sampling/sampling_cuda_kernel.h" +#include "grouping/grouping_cuda_kernel.h" +#include "interpolation/interpolation_cuda_kernel.h" +#include "aggregation/aggregation_cuda_kernel.h" +#include "subtraction/subtraction_cuda_kernel.h" +#include "attention/attention_cuda_kernel.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("knn_query_cuda", &knn_query_cuda, "knn_query_cuda"); + m.def("ball_query_cuda", &ball_query_cuda, "ball_query_cuda"); + m.def("random_ball_query_cuda", &random_ball_query_cuda, "random_ball_query_cuda"); + m.def("farthest_point_sampling_cuda", &farthest_point_sampling_cuda, "farthest_point_sampling_cuda"); + m.def("grouping_forward_cuda", &grouping_forward_cuda, "grouping_forward_cuda"); + m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda"); + m.def("interpolation_forward_cuda", &interpolation_forward_cuda, "interpolation_forward_cuda"); + m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda"); + m.def("subtraction_forward_cuda", &subtraction_forward_cuda, "subtraction_forward_cuda"); + m.def("subtraction_backward_cuda", &subtraction_backward_cuda, "subtraction_backward_cuda"); + m.def("aggregation_forward_cuda", &aggregation_forward_cuda, "aggregation_forward_cuda"); + m.def("aggregation_backward_cuda", &aggregation_backward_cuda, "aggregation_backward_cuda"); + m.def("attention_relation_step_forward_cuda", &attention_relation_step_forward_cuda, "attention_relation_step_forward_cuda"); + m.def("attention_relation_step_backward_cuda", &attention_relation_step_backward_cuda, "attention_relation_step_backward_cuda"); + m.def("attention_fusion_step_forward_cuda", &attention_fusion_step_forward_cuda, "attention_fusion_step_forward_cuda"); + m.def("attention_fusion_step_backward_cuda", &attention_fusion_step_backward_cuda, "attention_fusion_step_backward_cuda"); +} diff --git a/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda.cpp b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda.cpp new file mode 100644 index 000000000..c2618c94b --- /dev/null +++ b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda.cpp @@ -0,0 +1,21 @@ +#include +#include +#include +#include "random_ball_query_cuda_kernel.h" + + +void random_ball_query_cuda(int m, int nsample, + float min_radius, float max_radius, at::Tensor order_tensor, + at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, + at::Tensor offset_tensor, at::Tensor new_offset_tensor, + at::Tensor idx_tensor, at::Tensor dist2_tensor) +{ + const int *order = order_tensor.data_ptr(); + const float *xyz = xyz_tensor.data_ptr(); + const float *new_xyz = new_xyz_tensor.data_ptr(); + const int *offset = offset_tensor.data_ptr(); + const int *new_offset = new_offset_tensor.data_ptr(); + int *idx = idx_tensor.data_ptr(); + float *dist2 = dist2_tensor.data_ptr(); + random_ball_query_cuda_launcher(m, nsample, min_radius, max_radius, order, xyz, new_xyz, offset, new_offset, idx, dist2); +} diff --git a/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.cu new file mode 100644 index 000000000..bfafb0f8b --- /dev/null +++ b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.cu @@ -0,0 +1,123 @@ +#include "../cuda_utils.h" +#include "random_ball_query_cuda_kernel.h" + + +namespace random_ball_query_utils{ + +template +__device__ void swap(DType *x, DType *y) +{ + 
+    DType tmp = *x;
+    *x = *y;
+    *y = tmp;
+}
+
+__device__ void reheap(float *dist, int *idx, int k)
+{
+    int root = 0;
+    int child = root * 2 + 1;
+    while (child < k)
+    {
+        if(child + 1 < k && dist[child+1] > dist[child])
+            child++;
+        if(dist[root] > dist[child])
+            return;
+        swap(&dist[root], &dist[child]);
+        swap(&idx[root], &idx[child]);
+        root = child;
+        child = root * 2 + 1;
+    }
+}
+
+
+__device__ void heap_sort(float *dist, int *idx, int k)
+{
+    int i;
+    for (i = k - 1; i > 0; i--)
+    {
+        swap(&dist[0], &dist[i]);
+        swap(&idx[0], &idx[i]);
+        reheap(dist, idx, i);
+    }
+}
+
+__device__ int get_bt_idx(int idx, const int *offset)
+{
+    int i = 0;
+    while (1)
+    {
+        if (idx < offset[i])
+            break;
+        else
+            i++;
+    }
+    return i;
+}
+} // namespace random_ball_query_utils
+
+__global__ void random_ball_query_cuda_kernel(int m, int nsample,
+    float min_radius, float max_radius, const int *__restrict__ order,
+    const float *__restrict__ xyz, const float *__restrict__ new_xyz,
+    const int *__restrict__ offset, const int *__restrict__ new_offset,
+    int *__restrict__ idx, float *__restrict__ dist2) {
+    // input: xyz (n, 3) new_xyz (m, 3)
+    // output: idx (m, nsample) dist (m, nsample)
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pt_idx >= m) return;
+
+    new_xyz += pt_idx * 3;
+    idx += pt_idx * nsample;
+    dist2 += pt_idx * nsample;
+
+    int bt_idx = random_ball_query_utils::get_bt_idx(pt_idx, new_offset);
+    int start;
+    if (bt_idx == 0)
+        start = 0;
+    else
+        start = offset[bt_idx - 1];
+    int end = offset[bt_idx];
+
+    float max_radius2 = max_radius * max_radius;
+    float min_radius2 = min_radius * min_radius;
+    float new_x = new_xyz[0];
+    float new_y = new_xyz[1];
+    float new_z = new_xyz[2];
+
+    int cnt = 0;
+
+    for(int i = start; i < end; i++){
+        float x = xyz[order[i] * 3 + 0];
+        float y = xyz[order[i] * 3 + 1];
+        float z = xyz[order[i] * 3 + 2];
+        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+
+        if (d2 <= 1e-5 || (d2 >= min_radius2 && d2 < max_radius2)){
+            dist2[cnt] = d2;
+            idx[cnt] = order[i];
+            cnt += 1;
+            if (cnt >= nsample) break;
+        }
+    }
+
+    if (cnt < nsample) {
+        for (int i = cnt; i < nsample; i++){
+            idx[i] = -1;
+            dist2[i] = 1e10;
+        }
+    }
+}
+
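+// Unlike the exhaustive ball query, the kernel above scans points through the
+// caller-supplied `order` permutation and keeps the first nsample hits, so a
+// shuffled `order` yields an approximately uniform sample of each ball without
+// sorting by distance.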
+void random_ball_query_cuda_launcher(int m, int nsample,
+    float min_radius, float max_radius, const int *order,
+    const float *xyz, const float *new_xyz,
+    const int *offset, const int *new_offset,
+    int *idx, float *dist2) {
+    // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample)
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    random_ball_query_cuda_kernel<<<blocks, threads, 0, 0>>>(m, nsample,
+        min_radius, max_radius, order,
+        xyz, new_xyz,
+        offset, new_offset,
+        idx, dist2);
+}
diff --git a/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.h b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.h
new file mode 100644
index 000000000..d3e35be21
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/random_ball_query/random_ball_query_cuda_kernel.h
@@ -0,0 +1,26 @@
+#ifndef _RANDOM_BALL_QUERY_CUDA_KERNEL
+#define _RANDOM_BALL_QUERY_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void random_ball_query_cuda(int m, int nsample,
+    float min_radius, float max_radius, at::Tensor order_tensor,
+    at::Tensor xyz_tensor, at::Tensor new_xyz_tensor,
+    at::Tensor offset_tensor, at::Tensor new_offset_tensor,
+    at::Tensor idx_tensor, at::Tensor dist2_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void random_ball_query_cuda_launcher(int m, int nsample,
+    float min_radius, float max_radius, const int *order,
+    const float *xyz, const float *new_xyz,
+    const int *offset, const int *new_offset,
+    int *idx, float *dist2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointops/src/sampling/sampling_cuda.cpp b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda.cpp
new file mode 100644
index 000000000..7dc8094c3
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda.cpp
@@ -0,0 +1,15 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "sampling_cuda_kernel.h"
+
+
+void farthest_point_sampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor)
+{
+    const float *xyz = xyz_tensor.data_ptr<float>();
+    const int *offset = offset_tensor.data_ptr<int>();
+    const int *new_offset = new_offset_tensor.data_ptr<int>();
+    float *tmp = tmp_tensor.data_ptr<float>();
+    int *idx = idx_tensor.data_ptr<int>();
+    farthest_point_sampling_cuda_launcher(b, n, xyz, offset, new_offset, tmp, idx);
+}
diff --git a/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.cu
new file mode 100644
index 000000000..fb240c2d0
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.cu
@@ -0,0 +1,171 @@
+#include "../cuda_utils.h"
+#include "sampling_cuda_kernel.h"
+
+
+__device__ void __update(float *dists, int *dists_i, int idx1, int idx2) {
+    const float v1 = dists[idx1], v2 = dists[idx2];
+    const int i1 = dists_i[idx1], i2 = dists_i[idx2];
+    dists[idx1] = max(v1, v2);
+    dists_i[idx1] = v2 > v1 ? i2 : i1;
+}
+
+// input xyz: (n, 3), tmp: (b, n_max)
+// output: idx (m)
+template <unsigned int block_size>
+__global__ void farthest_point_sampling_cuda_kernel(const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx)
+{
+    __shared__ float dists[block_size];
+    __shared__ int dists_i[block_size];
+
+    int bid = blockIdx.x;
+    int start_n, end_n, start_m, end_m, old;
+    if (bid == 0) {
+        start_n = 0;
+        end_n = offset[0];
+        start_m = 0;
+        end_m = new_offset[0];
+        old = 0;
+    }
+    else {
+        start_n = offset[bid - 1];
+        end_n = offset[bid];
+        start_m = new_offset[bid - 1];
+        end_m = new_offset[bid];
+        old = offset[bid - 1];
+    }
+
+    const int stride = block_size;
+    int tid = threadIdx.x;
+    if (tid == 0) idx[start_m] = start_n;
+
+    __syncthreads();
+    for (int j = start_m + 1; j < end_m; j++)
+    {
+        int besti = start_n;
+        float best = -1;
+        float x1 = xyz[old * 3 + 0];
+        float y1 = xyz[old * 3 + 1];
+        float z1 = xyz[old * 3 + 2];
+        for (int k = start_n + tid; k < end_n; k += stride)
+        {
+            float x2 = xyz[k * 3 + 0];
+            float y2 = xyz[k * 3 + 1];
+            float z2 = xyz[k * 3 + 2];
+            float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
+            float d2 = min(d, tmp[k]);
+            tmp[k] = d2;
+            besti = d2 > best ? k : besti;
+            best = d2 > best ? d2 : best;
+        }
+        dists[tid] = best;
+        dists_i[tid] = besti;
+        __syncthreads();
+
+        if (block_size >= 1024) {
+            if (tid < 512) {
+                __update(dists, dists_i, tid, tid + 512);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 512) {
+            if (tid < 256) {
+                __update(dists, dists_i, tid, tid + 256);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 256) {
+            if (tid < 128) {
+                __update(dists, dists_i, tid, tid + 128);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 128) {
+            if (tid < 64) {
+                __update(dists, dists_i, tid, tid + 64);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 64) {
+            if (tid < 32) {
+                __update(dists, dists_i, tid, tid + 32);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 32) {
+            if (tid < 16) {
+                __update(dists, dists_i, tid, tid + 16);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 16) {
+            if (tid < 8) {
+                __update(dists, dists_i, tid, tid + 8);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 8) {
+            if (tid < 4) {
+                __update(dists, dists_i, tid, tid + 4);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 4) {
+            if (tid < 2) {
+                __update(dists, dists_i, tid, tid + 2);
+            }
+            __syncthreads();
+        }
+        if (block_size >= 2) {
+            if (tid < 1) {
+                __update(dists, dists_i, tid, tid + 1);
+            }
+            __syncthreads();
+        }
+
+        old = dists_i[0];
+        if (tid == 0)
+            idx[j] = old;
+    }
+}
+
+void farthest_point_sampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx)
+{
+    unsigned int n_threads = opt_n_threads(n);
+    switch (n_threads) {
+        case 1024:
+            farthest_point_sampling_cuda_kernel<1024><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 512:
+            farthest_point_sampling_cuda_kernel<512><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 256:
+            farthest_point_sampling_cuda_kernel<256><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 128:
+            farthest_point_sampling_cuda_kernel<128><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 64:
+            farthest_point_sampling_cuda_kernel<64><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 32:
+            farthest_point_sampling_cuda_kernel<32><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 16:
+            farthest_point_sampling_cuda_kernel<16><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 8:
+            farthest_point_sampling_cuda_kernel<8><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 4:
+            farthest_point_sampling_cuda_kernel<4><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 2:
+            farthest_point_sampling_cuda_kernel<2><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        case 1:
+            farthest_point_sampling_cuda_kernel<1><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+            break;
+        default:
+            farthest_point_sampling_cuda_kernel<512><<<b, n_threads, 0, 0>>>(xyz, offset, new_offset, tmp, idx);
+    }
+}
diff --git a/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.h b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.h
new file mode 100644
index 000000000..f0e076073
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/sampling/sampling_cuda_kernel.h
@@ -0,0 +1,18 @@
+#ifndef _SAMPLING_CUDA_KERNEL
+#define _SAMPLING_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void farthest_point_sampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void farthest_point_sampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
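One caller-side detail worth noting for the FPS kernel above: `tmp` holds the running minimum squared distance from each point to the already-selected set and is only ever min()'d into, so the Python wrapper is assumed to pre-fill it with a large value, e.g.:

    tmp = torch.full((n,), 1e10, device="cuda")  # running min-distance buffer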
diff --git a/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda.cpp b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda.cpp
new file mode 100644
index 000000000..b343857a1
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda.cpp
@@ -0,0 +1,23 @@
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "subtraction_cuda_kernel.h"
+
+
+void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
+{
+    const float *input1 = input1_tensor.data_ptr<float>();
+    const float *input2 = input2_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    subtraction_forward_cuda_launcher(n, nsample, c, input1, input2, idx, output);
+}
+
+void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor)
+{
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    float *grad_input1 = grad_input1_tensor.data_ptr<float>();
+    float *grad_input2 = grad_input2_tensor.data_ptr<float>();
+    subtraction_backward_cuda_launcher(n, nsample, c, idx, grad_output, grad_input1, grad_input2);
+}
diff --git a/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.cu b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.cu
new file mode 100644
index 000000000..0848a455d
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.cu
@@ -0,0 +1,44 @@
+#include "../cuda_utils.h"
+#include "subtraction_cuda_kernel.h"
+
+
+__global__ void subtraction_forward_cuda_kernel(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) {
+    // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * nsample * c) return;
+    const int c_idx = index % c;
+    const int nsample_idx = (index / c) % nsample;
+    const int n_idx = index / nsample / c;
+    const int idx_idx = n_idx * nsample + nsample_idx;
+    const int input1_idx = n_idx * c + c_idx;
+    const int input2_idx = idx[idx_idx] * c + c_idx;
+    output[index] = input1[input1_idx] - input2[input2_idx];
+}
+
+__global__ void subtraction_backward_cuda_kernel(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) {
+    // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * nsample * c) return;
+    const int c_idx = index % c;
+    const int nsample_idx = (index / c) % nsample;
+    const int n_idx = index / nsample / c;
+    const int idx_idx = n_idx * nsample + nsample_idx;
+    const int input1_idx = n_idx * c + c_idx;
+    const int input2_idx = idx[idx_idx] * c + c_idx;
+    atomicAdd(grad_input1 + input1_idx, grad_output[index]);
+    atomicAdd(grad_input2 + input2_idx, -grad_output[index]);
+}
+
+void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) {
+    // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c)
+    dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    subtraction_forward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, nsample, c, input1, input2, idx, output);
+}
+
+void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) {
+    // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c)
+    dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    subtraction_backward_cuda_kernel<<<blocks, threads, 0, 0>>>(n, nsample, c, idx, grad_output, grad_input1, grad_input2);
+}
diff --git a/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.h b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.h
new file mode 100644
index 000000000..856133d97
--- /dev/null
+++ b/projects/PTv3/libs/pointops/src/subtraction/subtraction_cuda_kernel.h
@@ -0,0 +1,20 @@
+#ifndef _SUBTRACTION_CUDA_KERNEL
+#define _SUBTRACTION_CUDA_KERNEL
+#include <vector>
+#include <torch/serialize/tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+
+void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
+void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output);
+void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/projects/PTv3/libs/pointrope/__init__.py b/projects/PTv3/libs/pointrope/__init__.py
new file mode 100644
index 000000000..c8f754883
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/__init__.py
@@ -0,0 +1 @@
+from .functions import *
diff --git a/projects/PTv3/libs/pointrope/functions/__init__.py b/projects/PTv3/libs/pointrope/functions/__init__.py
new file mode 100644
index 000000000..bfe3ef321
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/functions/__init__.py
@@ -0,0 +1 @@
+from .pointrope import PointROPE
diff --git a/projects/PTv3/libs/pointrope/functions/pointrope.py b/projects/PTv3/libs/pointrope/functions/pointrope.py
new file mode 100644
index 000000000..7065098a5
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/functions/pointrope.py
@@ -0,0 +1,116 @@
+import torch
+from torch.autograd import Function
+from torch.onnx.symbolic_helper import _get_tensor_sizes
+
+try:
+    import pointrope_cuda as _kernels  # run `python setup.py install`
+except ModuleNotFoundError:
+    from . import pointrope_cuda as _kernels  # run `python setup.py build_ext --inplace`
+
+
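+# A rotation by angle theta is orthogonal, so its inverse is the rotation by
+# -theta. backward() below relies on this: applying the same kernel with -F0
+# rotates the incoming gradient by the negated angles, which is exactly the
+# transpose of the forward rotation applied to grad.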
+class PointROPE_func(Function):
+    @staticmethod
+    def symbolic(g, tokens, positions, base, F0=1.0):
+        output = g.op(
+            "litept::PointRoPE",
+            tokens,
+            positions,
+            base_f=float(base),
+            f0_f=float(F0),
+            outputs=1,
+        )
+        token_shape = _get_tensor_sizes(tokens)
+        # PointRoPE is in-place in forward() and returns a tensor with the same shape as `tokens`.
+        # Preserve the full [B, N, H, D] shape for ONNX shape inference when available.
+        if token_shape is not None and hasattr(output.type(), "with_sizes"):
+            output_type = tokens.type().with_sizes(token_shape)
+            output.setType(output_type)
+
+        return output
+
+    @staticmethod
+    def forward(ctx, tokens, positions, base, F0=1.0):
+        ctx.save_for_backward(positions)
+        ctx.saved_base = base
+        ctx.saved_F0 = F0
+        # tokens = tokens.clone()  # uncomment this if inplace doesn't work
+        _kernels.pointrope(tokens, positions, base, F0)
+        ctx.mark_dirty(tokens)
+        return tokens
+
+    @staticmethod
+    def backward(ctx, grad_res):
+        positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
+        _kernels.pointrope(grad_res, positions, base, -F0)
+        ctx.mark_dirty(grad_res)
+        return grad_res, None, None, None
+
+
+# class PointROPE(torch.nn.Module):
+#     def __init__(self, freq=100.0, F0=1.0):
+#         super().__init__()
+#         self.base = freq
+#         self.F0 = F0
+
+#     def forward(self, tokens, positions):
+#         PointROPE_func.apply(tokens.transpose(1, 2), positions, self.base, self.F0)
+#         return tokens
+
+
+class PointROPE(torch.nn.Module):
+    def __init__(self, freq=100.0, F0=1.0):
+        super().__init__()
+        self.base = freq
+        self.F0 = F0
+
+    def forward(self, tokens: torch.Tensor, positions: torch.Tensor):
+        """Pure-PyTorch 3D RoPE: rotate `tokens` ([B, H, N, D], D % 6 == 0) by
+        per-axis angles derived from `positions` ([B, N, 3] or [B*N, 3]).
+        Returns a tensor of the same shape as `tokens`."""
+        tokens = tokens.transpose(1, 2)
+        assert tokens.dim() == 4, tokens.shape
+        B, N, H, D = tokens.shape
+        assert D % 6 == 0
+        Q = D // 6
+
+        # pos -> [B, N, 3]
+        if positions.dim() == 2:
+            assert positions.shape[0] == B * N and positions.shape[1] == 3
+            pos_bn3 = positions.view(B, N, 3)
+        else:
+            assert positions.shape == (B, N, 3), f"{positions.shape=} vs {tokens.shape=}"
+            pos_bn3 = positions
+
+        # inv_freq: [Q]
+        # inv_freq[q] = F0 / base^(q/Q)
+        q = torch.arange(Q, device=tokens.device, dtype=tokens.dtype)
+        inv_freq = torch.tensor(float(self.F0), device=tokens.device, dtype=tokens.dtype) / (
+            torch.tensor(float(self.base), device=tokens.device, dtype=tokens.dtype) ** (q / float(Q))
+        )
+
+        # freq: [B, N, 3, Q]
+        pos_f = pos_bn3.to(dtype=tokens.dtype)
+        freq = pos_f.unsqueeze(-1) * inv_freq.view(1, 1, 1, Q)
+
+        cos = torch.cos(freq)  # [B, N, 3, Q]
+        sin = torch.sin(freq)  # [B, N, 3, Q]
+
+        # split tokens into six Q-sized blocks: each [B, N, H, Q]
+        ux, vx, uy, vy, uz, vz = tokens.split(Q, dim=-1)
+
+        # broadcast each [B, N, 1, Q] across heads, per axis
+        cx = cos[:, :, 0, :].unsqueeze(2)
+        sx = sin[:, :, 0, :].unsqueeze(2)
+        cy = cos[:, :, 1, :].unsqueeze(2)
+        sy = sin[:, :, 1, :].unsqueeze(2)
+        cz = cos[:, :, 2, :].unsqueeze(2)
+        sz = sin[:, :, 2, :].unsqueeze(2)
+
+        ru_x = ux * cx - vx * sx
+        rv_x = vx * cx + ux * sx
+
+        ru_y = uy * cy - vy * sy
+        rv_y = vy * cy + uy * sy
+
+        ru_z = uz * cz - vz * sz
+        rv_z = vz * cz + uz * sz
+
+        return torch.cat([ru_x, rv_x, ru_y, rv_y, ru_z, rv_z], dim=-1).transpose(1, 2)
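A quick shape check for the pure-PyTorch path above, with hypothetical sizes (any D divisible by 6 works):

    rope = PointROPE(freq=100.0)
    tok = torch.randn(2, 4, 128, 36)          # [B, H, N, D], D % 6 == 0
    pos = torch.randint(0, 64, (2, 128, 3))   # integer grid coordinates, [B, N, 3]
    out = rope(tok, pos)                      # same shape as tok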
diff --git a/projects/PTv3/libs/pointrope/pyproject.toml b/projects/PTv3/libs/pointrope/pyproject.toml
new file mode 100644
index 000000000..233cba7ca
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "pointrope"
+version = "0.1.0"
+requires-python = "==3.10.*"
+dependencies = []
+
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["."]
+exclude = []
diff --git a/projects/PTv3/libs/pointrope/setup.py b/projects/PTv3/libs/pointrope/setup.py
new file mode 100644
index 000000000..0c8adc396
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/setup.py
@@ -0,0 +1,35 @@
+from setuptools import setup
+from torch import cuda
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+# compile for all possible CUDA architectures
+# all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split()
+# alternatively, you can list cuda archs that you want, eg:
+# check https://developer.nvidia.com/cuda-gpus to find your arch
+# fmt: off
+all_cuda_archs = [
+    "-gencode", "arch=compute_90,code=sm_90",
+    "-gencode", "arch=compute_75,code=sm_75",
+    "-gencode", "arch=compute_80,code=sm_80",
+    "-gencode", "arch=compute_86,code=sm_86",
+    "-gencode", "arch=compute_89,code=sm_89",
+]
+# fmt: on
+
+setup(
+    name="pointrope",
+    ext_modules=[
+        CUDAExtension(
+            name="pointrope_cuda",
+            sources=[
+                "src/pointrope.cpp",
+                "src/kernels.cu",
+            ],
+            extra_compile_args=dict(
+                nvcc=["-O3", "--ptxas-options=-v", "--use_fast_math"] + all_cuda_archs,
+                cxx=["-O3"],
+            ),
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/projects/PTv3/libs/pointrope/src/kernels.cu b/projects/PTv3/libs/pointrope/src/kernels.cu
new file mode 100644
index 000000000..1372465c2
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/src/kernels.cu
@@ -0,0 +1,94 @@
+
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+#define CHECK_CUDA(tensor) {\
+    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
+    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
+void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error));}
+
+
+template <typename scalar_t>
+__global__ void pointrope_cuda_kernel(
+    torch::PackedTensorAccessor32<scalar_t, 4, torch::RestrictPtrTraits> tokens,
+    const int64_t* __restrict__ pos,
+    const float base,
+    const float fwd)
+{
+    // tokens shape = (B, N, H, D)
+    const int N = tokens.size(1);
+    const int H = tokens.size(2);
+    const int D = tokens.size(3);
+
+    // each block updates a single token, for all heads
+    // each thread takes care of a single output
+    extern __shared__ float shared[];
+    float* shared_inv_freq = shared + D;
+
+    const int b = blockIdx.x / N;
+    const int n = blockIdx.x % N;
+
+    const int Q = D / 6;  // e.g. D = 18, Q = 3
+    // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..4Q : 4Q..5Q : 5Q..D]
+    //              u_X    v_X     u_Y      v_Y      u_Z      v_Z
+
+    // shared memory: first, compute inv_freq
+    if (threadIdx.x < Q)
+        shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
+    __syncthreads();
+
+    // range of threadIdx.x is [0, 1, ..., D-1] (17 in the D = 18 example)
+
+    // start of the X, Y or Z part
+    const int X = threadIdx.x * 3 / D;
+    const int m = (X*D/3) + (threadIdx.x % Q);  // index of the u component for this axis
+
+    // grab the cos,sin
+    const float freq = pos[blockIdx.x*3+X] * shared_inv_freq[threadIdx.x % Q];
+    const float cos = cosf(freq);
+    const float sin = sinf(freq);
+
+
+    for (int h = 0; h < H; h++)
+    {
+        // load all the token for this head in shared memory
+        shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
+        __syncthreads();
+
+        const float u = shared[m];
+        const float v = shared[m+Q];
+
+        // write output
+        if ((threadIdx.x % (D/3)) < Q)
+            tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
+        else
+            tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
+
+        // make sure every thread has finished reading `shared` before the
+        // next head's load overwrites it
+        __syncthreads();
+    }
+}
+
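+// Host-side launcher. Each block handles one token across all H heads; the
+// dynamic shared buffer stores one head's D channels plus the D/6 inverse
+// frequencies (see SHARED_MEM below).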
diff --git a/projects/PTv3/libs/pointrope/src/kernels.cu b/projects/PTv3/libs/pointrope/src/kernels.cu
new file mode 100644
index 000000000..1372465c2
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/src/kernels.cu
@@ -0,0 +1,94 @@
+
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+#define CHECK_CUDA(tensor) {\
+    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
+    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
+void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error));}
+
+
+template <typename scalar_t>
+__global__ void pointrope_cuda_kernel(
+    torch::PackedTensorAccessor32<scalar_t, 4, torch::RestrictPtrTraits> tokens,
+    const int64_t* __restrict__ pos,
+    const float base,
+    const float fwd)
+{
+    // tokens shape = (B, N, H, D)
+    const int N = tokens.size(1);
+    const int H = tokens.size(2);
+    const int D = tokens.size(3);
+
+    // each block updates a single token, for all heads
+    // each thread takes care of a single output
+    extern __shared__ float shared[];
+    float* shared_inv_freq = shared + D;
+
+    const int b = blockIdx.x / N;
+    const int n = blockIdx.x % N;
+
+    const int Q = D / 6;  // e.g. D = 18, Q = 3
+    // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..4Q : 4Q..5Q : 5Q..D]
+    //              u_X    v_X     u_Y      v_Y      u_Z      v_Z
+
+    // shared memory: first, compute inv_freq
+    if (threadIdx.x < Q)
+        shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x / float(Q));
+    __syncthreads();
+
+    // threadIdx.x ranges over [0, 1, ..., D-1] (e.g. 17 for D = 18)
+
+    // which axis (X, Y or Z) this thread handles
+    const int X = threadIdx.x * 3 / D;
+    const int m = (X * D / 3) + (threadIdx.x % Q);  // index of u_X, u_Y or u_Z
+
+    // grab the cos,sin
+    const float freq = pos[blockIdx.x * 3 + X] * shared_inv_freq[threadIdx.x % Q];
+    const float cos = cosf(freq);
+    const float sin = sinf(freq);
+
+    for (int h = 0; h < H; h++)
+    {
+        // load the whole token for this head into shared memory
+        shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
+        __syncthreads();
+
+        const float u = shared[m];
+        const float v = shared[m + Q];
+
+        // write output
+        if ((threadIdx.x % (D / 3)) < Q)
+            tokens[b][n][h][threadIdx.x] = u * cos - v * sin;
+        else
+            tokens[b][n][h][threadIdx.x] = v * cos + u * sin;
+        // editor's fix: barrier so no thread reloads shared for the next head
+        // while another thread is still reading this head's values
+        __syncthreads();
+    }
+}
+
+void pointrope_cuda(torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd)
+{
+    const int B = tokens.size(0);  // batch size
+    const int N = tokens.size(1);  // sequence length
+    const int H = tokens.size(2);  // number of heads
+    const int D = tokens.size(3);  // dimension per head
+
+    TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
+    TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
+    TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 3, "bad pos.shape");
+    TORCH_CHECK(D % 6 == 0, "token dim must be multiple of 6");
+
+    // one block per token, one thread per channel
+    const int THREADS_PER_BLOCK = D;
+    const int N_BLOCKS = B * N;  // each block takes care of H*D values
+    const int SHARED_MEM = sizeof(float) * (D + D / 6);
+
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::kHalf, at::kBFloat16, tokens.scalar_type(), "pointrope_cuda", ([&] {
+        pointrope_cuda_kernel<scalar_t><<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>>(
+            tokens.packed_accessor32<scalar_t, 4, torch::RestrictPtrTraits>(),
+            pos.data_ptr<int64_t>(),
+            base, fwd);
+    }));
+}
diff --git a/projects/PTv3/libs/pointrope/src/pointrope.cpp b/projects/PTv3/libs/pointrope/src/pointrope.cpp
new file mode 100644
index 000000000..1b620b0f5
--- /dev/null
+++ b/projects/PTv3/libs/pointrope/src/pointrope.cpp
@@ -0,0 +1,66 @@
+
+
+#include <torch/extension.h>
+
+// forward declaration
+void pointrope_cuda(torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd);
+
+void pointrope_cpu(torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd)
+{
+    const int B = tokens.size(0);
+    const int N = tokens.size(1);
+    const int H = tokens.size(2);      // number of heads, e.g. 2
+    const int D = tokens.size(3) / 6;  // if dimension per head is 18, then D = 3
+
+    auto tok = tokens.accessor<float, 4>();
+    auto pos = positions.accessor<int64_t, 3>();
+
+    for (int b = 0; b < B; b++) {
+        for (int x = 0; x < 3; x++) {  // x, then y, then z (3d)
+            for (int n = 0; n < N; n++) {
+
+                // grab the token position along this axis
+                const int p = pos[b][n][x];
+
+                for (int h = 0; h < H; h++) {
+                    for (int d = 0; d < D; d++) {
+                        // grab the two values
+                        float u = tok[b][n][h][d + 0 + x * 2 * D];
+                        float v = tok[b][n][h][d + D + x * 2 * D];
+
+                        // grab the cos,sin
+                        const float freq = fwd * p / powf(base, d / float(D));
+                        float c = cosf(freq);
+                        float s = sinf(freq);
+
+                        // write the result
+                        tok[b][n][h][d + 0 + x * 2 * D] = u * c - v * s;
+                        tok[b][n][h][d + D + x * 2 * D] = v * c + u * s;
+                    }
+                }
+            }
+        }
+    }
+}
+
+void pointrope(torch::Tensor tokens,
+               const torch::Tensor positions,
+               const float base,
+               const float fwd)
+{
+    TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
+    TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
+    TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
+    TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
+    TORCH_CHECK(positions.size(2) == 3, "positions.shape[2] must be equal to 3");
+    TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device");
+
+    if (tokens.is_cuda())
+        pointrope_cuda(tokens, positions, base, fwd);
+    else
+        pointrope_cpu(tokens, positions, base, fwd);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("pointrope", &pointrope, "PointROPE forward/backward");
+}
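+
+// Editor's note (illustrative): the CPU path uses accessor<float, 4>, so it only
+// handles float32 tensors; the CUDA path in kernels.cu dispatches over float,
+// half and bfloat16 via AT_DISPATCH_FLOATING_TYPES_AND2.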
diff --git a/projects/PTv3/models/__init__.py b/projects/PTv3/models/__init__.py
index 6dd1f71b2..a569efe8d 100644
--- a/projects/PTv3/models/__init__.py
+++ b/projects/PTv3/models/__init__.py
@@ -1,5 +1,6 @@
 from .builder import build_model
 from .default import *
+from .litept import *
 
 # Pretraining
 from .point_prompt_training import *
diff --git a/projects/PTv3/models/litept/__init__.py b/projects/PTv3/models/litept/__init__.py
new file mode 100644
index 000000000..56c8ac6bf
--- /dev/null
+++ b/projects/PTv3/models/litept/__init__.py
@@ -0,0 +1 @@
+from .litept import *  # noqa
diff --git a/projects/PTv3/models/litept/litept.py b/projects/PTv3/models/litept/litept.py
new file mode 100644
index 000000000..6a1fadac2
--- /dev/null
+++ b/projects/PTv3/models/litept/litept.py
@@ -0,0 +1,598 @@
+from functools import partial
+
+import flash_attn
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch_scatter
+from addict import Dict
+from libs.pointrope import PointROPE
+from models.builder import MODELS
+from models.modules import MLP, Embedding, PointModule, PointSequential
+from models.scatter.functional import argsort, segment_csr, unique
+from models.utils.structure import Point
+from timm.layers import DropPath
+
+
+class PointROPEAttention(PointModule):
+    def __init__(
+        self,
+        channels,
+        num_heads,
+        patch_size,
+        rope_freq,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        order_index=0,
+    ):
+        super().__init__()
+        assert channels % num_heads == 0
+        self.channels = channels
+        self.num_heads = num_heads
+        self.scale = qk_scale or (channels // num_heads) ** -0.5
+        self.order_index = order_index
+
+        self.patch_size = patch_size
+        self.attn_drop = attn_drop
+
+        self.qkv = torch.nn.Linear(channels, channels * 3, bias=qkv_bias)
+        self.proj = torch.nn.Linear(channels, channels)
+        self.proj_drop = torch.nn.Dropout(proj_drop)
+
+        # pointrope
+        self.rope = PointROPE(freq=rope_freq)
+
+    def forward(self, point: Point):
+        H = self.num_heads
+        K = self.patch_size
+        C = self.channels
+
+        pad, unpad, cu_seqlens = point.get_padding_and_inverse(self.patch_size)
+
+        order = point.serialized_order[self.order_index][pad]
+        inverse = unpad[point.serialized_inverse[self.order_index]]
+
+        # pad and reshape feat and batch for the serialized point patches
+        qkv = self.qkv(point.feat)[order]  # [N, C]
+
+        # apply pointrope
+        pos = point.grid_coord[order]  # [N, 3]
+        pos = pos.reshape(-1, 3).unsqueeze(0)
+
+        q, k, v = qkv.half().chunk(3, dim=-1)
+        q = q.reshape(-1, H, C // H).transpose(0, 1)[None]  # [1, H, N, head_dim]
+        k = k.reshape(-1, H, C // H).transpose(0, 1)[None]  # [1, H, N, head_dim]
+
+        # workaround to keep the pointrope cuda kernel in float32
+        q = self.rope(q.float(), pos).to(q.dtype)  # [1, H, N, head_dim]
+        k = self.rope(k.float(), pos).to(k.dtype)  # [1, H, N, head_dim]
+
+        # assemble input for flash attention
+        qkv_rotated = torch.stack(
+            [
+                q.squeeze(0).transpose(0, 1),
+                k.squeeze(0).transpose(0, 1),
+                v.reshape(-1, H, C // H),
+            ],
+            dim=1,
+        )  # [N, 3, H, head_dim]
+
+        if torch.onnx.is_in_onnx_export():
+            assert (qkv_rotated.shape[0] % K) == 0
+            # encode and reshape qkv: (N', K, 3, H, C') => (3, N', H, K, C')
+            q, k, v = qkv_rotated.reshape(-1, K, 3, H, C // H).permute(2, 0, 3, 1, 4).unbind(dim=0)
+            # attn
+            attn = (q * self.scale) @ k.transpose(-2, -1)  # (N', H, K, K)
+            attn = F.softmax(attn, dim=-1)
+            feat = (attn @ v).transpose(1, 2).reshape(-1, C)
+        else:
+            feat = flash_attn.flash_attn_varlen_qkvpacked_func(
+                qkv_rotated,
+                cu_seqlens,
+                max_seqlen=self.patch_size,
+                dropout_p=self.attn_drop if self.training else 0,
+                softmax_scale=self.scale,
+            ).reshape(-1, C)
+
+        feat = feat.to(qkv.dtype)
+        feat = feat[inverse]
+
+        # output projection
+        feat = self.proj(feat)
+        feat = self.proj_drop(feat)
+        point.feat = feat
+        return point
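+
+# Editor's note (illustrative): flash_attn_varlen_qkvpacked_func consumes qkv
+# packed as [total_tokens, 3, num_heads, head_dim] plus int32 cu_seqlens patch
+# boundaries, which is why q and k are rotated as [1, H, N, head_dim] above and
+# then re-stacked with v before the call.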
+
+
+class Block(PointModule):
+    def __init__(
+        self,
+        channels,
+        num_heads,
+        patch_size=48,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        drop_path=0.0,
+        norm_layer=nn.LayerNorm,
+        act_layer=nn.GELU,
+        pre_norm=True,
+        order_index=0,
+        cpe_indice_key=None,
+        enable_conv=True,
+        enable_attn=True,
+        rope_freq=100.0,
+        export_mode=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.pre_norm = pre_norm
+        self.export_mode = export_mode
+
+        self.enable_conv = enable_conv
+        self.enable_attn = enable_attn
+
+        if self.enable_conv:
+            if export_mode:
+                from SparseConvolution.sparse_conv import SubMConv3d
+            else:
+                from spconv.pytorch import SubMConv3d
+
+            self.conv = PointSequential(
+                SubMConv3d(
+                    channels,
+                    channels,
+                    kernel_size=3,
+                    bias=True,
+                    indice_key=cpe_indice_key,
+                ),
+                nn.Linear(channels, channels),
+                norm_layer(channels),
+            )
+        else:
+            self.norm0 = PointSequential(
+                norm_layer(channels),
+            )
+
+        if self.enable_attn:
+            self.norm1 = PointSequential(norm_layer(channels))
+            self.attn = PointROPEAttention(
+                channels=channels,
+                patch_size=patch_size,
+                rope_freq=rope_freq,
+                num_heads=num_heads,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                attn_drop=attn_drop,
+                proj_drop=proj_drop,
+                order_index=order_index,
+            )
+        self.norm2 = PointSequential(norm_layer(channels))
+        self.mlp = PointSequential(
+            MLP(
+                in_channels=channels,
+                hidden_channels=int(channels * mlp_ratio),
+                out_channels=channels,
+                act_layer=act_layer,
+                drop=proj_drop,
+            )
+        )
+        self.drop_path = PointSequential(DropPath(drop_path) if drop_path > 0.0 else nn.Identity())
+
+    def forward(self, point: Point):
+        if self.enable_conv:
+            shortcut = point.feat
+            point = self.conv(point)
+            point.feat = shortcut + point.feat
+        else:
+            point = self.norm0(point)
+
+        if self.enable_attn:
+            shortcut = point.feat
+            if self.pre_norm:
+                point = self.norm1(point)
+            point = self.drop_path(self.attn(point))
+            point.feat = shortcut + point.feat
+            if not self.pre_norm:
+                point = self.norm1(point)
+
+        shortcut = point.feat
+        if self.pre_norm:
+            point = self.norm2(point)
+        point = self.drop_path(self.mlp(point))
+        point.feat = shortcut + point.feat
+        if not self.pre_norm:
+            point = self.norm2(point)
+
+        point.sparse_conv_feat = point.sparse_conv_feat.replace_feature(point.feat)
+        return point
+
+
+class GridPooling(PointModule):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        stride=2,
+        norm_layer=None,
+        act_layer=None,
+        reduce="max",
+        shuffle_orders=True,
+        traceable=True,  # record parent and cluster
+        re_serialization=False,
+        serialization_order="z",
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.stride = stride
+        assert reduce in ["sum", "mean", "min", "max"]
+        self.reduce = reduce
+        self.shuffle_orders = shuffle_orders
+        self.traceable = traceable
+
+        self.proj = nn.Linear(in_channels, out_channels)
+        # editor's fix: default to None so the checks in forward never hit an
+        # undefined attribute when no norm/act layer is configured
+        self.norm = PointSequential(norm_layer(out_channels)) if norm_layer is not None else None
+        self.act = PointSequential(act_layer()) if act_layer is not None else None
+
+        self.re_serialization = re_serialization
+        self.serialization_order = serialization_order
+
+    def forward(self, point: Point):
+        if "grid_coord" in point.keys():
+            grid_coord = point.grid_coord
+        elif {"coord", "grid_size"}.issubset(point.keys()):
+            grid_coord = torch.div(
+                point.coord - point.coord.min(0)[0],
+                point.grid_size,
+                rounding_mode="trunc",
+            ).int()
+        else:
+            raise AssertionError("[grid_coord] or [coord, grid_size] should be included in the Point")
+        grid_coord = torch.div(grid_coord, self.stride, rounding_mode="trunc")
+
+        # Pack batch id and coordinates into a single 1D key for clustering.
+        # NOTE(original): grid_coord = torch.bitwise_or(grid_coord, point.batch.view(-1, 1) << 48)
+        grid_coord_i = grid_coord.to(torch.int64)
+        gx = grid_coord_i[:, 0]
+        gy = grid_coord_i[:, 1]
+        gz = grid_coord_i[:, 2]
+        b = point.batch.to(torch.int64)
+        packed = gx + gy * (1 << 16) + gz * (1 << 32) + b * (1 << 48)
+
+        if torch.onnx.is_in_onnx_export():
+            unique_keys, cluster, counts, _ = unique(packed)
+            # Sort points by cluster id (inverse_indices) to make them contiguous per cluster.
+            indices = argsort(cluster)
+        else:
+            unique_keys, cluster, counts = torch.unique(
+                packed,
+                sorted=True,
+                return_inverse=True,
+                return_counts=True,
+            )
+            # indices of points sorted by cluster, for torch_scatter.segment_csr
+            _, indices = torch.sort(cluster)
+
+        # Unpack to (M, 3) grid coords (drop the batch component)
+        # NOTE(original): grid_coord = torch.bitwise_and(grid_coord, ((1 << 48) - 1))
+        key_wo_batch = torch.remainder(unique_keys, (1 << 48))
+        gx = torch.remainder(key_wo_batch, (1 << 16))
+        gy = torch.remainder(torch.div(key_wo_batch, (1 << 16), rounding_mode="trunc"), (1 << 16))
+        gz = torch.remainder(torch.div(key_wo_batch, (1 << 32), rounding_mode="trunc"), (1 << 16))
+        grid_coord = torch.stack([gx, gy, gz], dim=1).to(grid_coord_i.dtype)
+
+        # index pointer for the sorted points, for torch_scatter.segment_csr
+        idx_ptr = torch.cat([counts.new_zeros(1), torch.cumsum(counts, dim=0)])
+        # head index of each cluster, used to reduce attrs e.g. code, batch
+        head_indices = indices[idx_ptr[:-1]]
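+
+        # Editor's note (illustrative): with the stride already applied, a point
+        # at grid (x=3, y=5, z=2) in batch 1 packs to
+        #   3 + 5 * 2**16 + 2 * 2**32 + 1 * 2**48 = 281483566972931
+        # and the mod/div unpacking above recovers (3, 5, 2); this assumes every
+        # grid coordinate fits in 16 bits.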
+
+        if not torch.onnx.is_in_onnx_export():
+            scatter_feat = torch_scatter.segment_csr(self.proj(point.feat)[indices], idx_ptr, reduce=self.reduce)
+            scatter_coord = torch_scatter.segment_csr(point.coord[indices], idx_ptr, reduce="mean")
+        else:
+            scatter_feat = segment_csr(self.proj(point.feat)[indices], idx_ptr, self.reduce)
+            scatter_coord = segment_csr(point.coord[indices], idx_ptr, "mean")
+
+        point_dict = Dict(
+            feat=scatter_feat,
+            coord=scatter_coord,
+            grid_coord=grid_coord,
+            batch=point.batch[head_indices],
+        )
+
+        if "origin_coord" in point.keys():
+            if not torch.onnx.is_in_onnx_export():
+                point_dict["origin_coord"] = torch_scatter.segment_csr(
+                    point.origin_coord[indices], idx_ptr, reduce="mean"
+                )
+            else:
+                point_dict["origin_coord"] = segment_csr(point.origin_coord[indices], idx_ptr, "mean")
+        if "condition" in point.keys():
+            point_dict["condition"] = point.condition
+        if "context" in point.keys():
+            point_dict["context"] = point.context
+        if "name" in point.keys():
+            point_dict["name"] = point.name
+        if "split" in point.keys():
+            point_dict["split"] = point.split
+        if "color" in point.keys():
+            if not torch.onnx.is_in_onnx_export():
+                point_dict["color"] = torch_scatter.segment_csr(point.color[indices], idx_ptr, reduce="mean")
+            else:
+                point_dict["color"] = segment_csr(point.color[indices], idx_ptr, "mean")
+        if "grid_size" in point.keys():
+            point_dict["grid_size"] = point.grid_size * self.stride
+        if "mask" in point.keys():
+            if not torch.onnx.is_in_onnx_export():
+                point_dict["mask"] = (
+                    torch_scatter.segment_csr(point.mask[indices].float(), idx_ptr, reduce="mean") > 0.5
+                )
+            else:
+                point_dict["mask"] = segment_csr(point.mask[indices].float(), idx_ptr, "mean") > 0.5
+
+        if self.traceable:
+            point_dict["pooling_inverse"] = cluster
+            point_dict["pooling_parent"] = point
+        point = Point(point_dict)
+        if self.norm is not None:
+            point = self.norm(point)
+        if self.act is not None:
+            point = self.act(point)
+
+        if self.re_serialization:
+            point.serialization(order=self.serialization_order, shuffle_orders=self.shuffle_orders)
+        point.sparsify()
+        return point
+
+
+class GridUnpooling(PointModule):
+    def __init__(
+        self,
+        in_channels,
+        skip_channels,
+        out_channels,
+        norm_layer=None,
+        act_layer=None,
+        traceable=False,  # record parent and cluster
+    ):
+        super().__init__()
+        self.proj = PointSequential(nn.Linear(in_channels, out_channels))
+        self.proj_skip = PointSequential(nn.Linear(skip_channels, out_channels))
+
+        if norm_layer is not None:
+            self.proj.add(norm_layer(out_channels))
+            self.proj_skip.add(norm_layer(out_channels))
+
+        if act_layer is not None:
+            self.proj.add(act_layer())
+            self.proj_skip.add(act_layer())
+
+        self.traceable = traceable
+
+    def forward(self, point):
+        assert "pooling_parent" in point.keys()
+        assert "pooling_inverse" in point.keys()
+        parent = point.pop("pooling_parent")
+        inverse = point.pooling_inverse
+        feat = point.feat
+
+        parent = self.proj_skip(parent)
+        parent.feat = parent.feat + self.proj(point).feat[inverse]
+        parent.sparse_conv_feat = parent.sparse_conv_feat.replace_feature(parent.feat)
+
+        if self.traceable:
+            point.feat = feat
+            parent["unpooling_parent"] = point
+            parent["unpooling_inverse"] = inverse
+        return parent
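+
+# Editor's note (illustrative): GridUnpooling broadcasts coarse features back to
+# the finer level with a gather, proj(point).feat[pooling_inverse], and adds them
+# to the projected skip features of the parent Point recorded during pooling.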
+
+
+@MODELS.register_module("LitePT")
+class LitePT(PointModule):
+    def __init__(
+        self,
+        in_channels=4,
+        order=("z", "z-trans", "hilbert", "hilbert-trans"),
+        stride=(2, 2, 2, 2),
+        enc_depths=(2, 2, 2, 6, 2),
+        enc_channels=(36, 72, 144, 252, 504),
+        enc_num_head=(2, 4, 8, 14, 28),
+        enc_patch_size=(1024, 1024, 1024, 1024, 1024),
+        enc_conv=(True, True, True, False, False),
+        enc_attn=(False, False, False, True, True),
+        enc_rope_freq=(100.0, 100.0, 100.0, 100.0, 100.0),
+        dec_depths=(0, 0, 0, 0),
+        dec_channels=(72, 72, 144, 252),
+        dec_num_head=(4, 4, 8, 14),
+        dec_patch_size=(1024, 1024, 1024, 1024),
+        dec_conv=(False, False, False, False),
+        dec_attn=(False, False, False, False),
+        dec_rope_freq=(100.0, 100.0, 100.0, 100.0),
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        drop_path=0.3,
+        pre_norm=True,
+        shuffle_orders=True,
+        enc_mode=False,
+        export_mode=False,
+    ):
+        super().__init__()
+        self.num_stages = len(enc_depths)
+        self.order = [order] if isinstance(order, str) else order
+        self.enc_mode = enc_mode
+        self.shuffle_orders = shuffle_orders
+        self.export_mode = export_mode
+
+        self.enc_conv = enc_conv
+        self.enc_attn = enc_attn
+        self.dec_conv = dec_conv
+        self.dec_attn = dec_attn
+
+        assert self.num_stages == len(stride) + 1
+        assert self.num_stages == len(enc_depths)
+        assert self.num_stages == len(enc_channels)
+        assert self.num_stages == len(enc_num_head)
+        assert self.num_stages == len(enc_patch_size)
+        assert self.enc_mode or self.num_stages == len(dec_depths) + 1
+        assert self.enc_mode or self.num_stages == len(dec_channels) + 1
+        assert self.enc_mode or self.num_stages == len(dec_num_head) + 1
+        assert self.enc_mode or self.num_stages == len(dec_patch_size) + 1
+
+        # norm layers
+        bn_layer = partial(nn.BatchNorm1d, eps=1e-3, momentum=0.01)
+        ln_layer = nn.LayerNorm
+
+        # activation layers
+        act_layer = nn.GELU
+
+        self.embedding = Embedding(
+            in_channels=in_channels,
+            embed_channels=enc_channels[0],
+            norm_layer=bn_layer,
+            act_layer=act_layer,
+            export_mode=self.export_mode,
+        )
+
+        # encoder
+        enc_drop_path = [x.item() for x in torch.linspace(0, drop_path, sum(enc_depths))]
+        self.enc = PointSequential()
+        for s in range(self.num_stages):
+            enc_drop_path_ = enc_drop_path[sum(enc_depths[:s]) : sum(enc_depths[: s + 1])]
+            enc = PointSequential()
+            if s > 0:
+                enc.add(
+                    GridPooling(
+                        in_channels=enc_channels[s - 1],
+                        out_channels=enc_channels[s],
+                        stride=stride[s - 1],
+                        norm_layer=bn_layer,
+                        act_layer=act_layer,
+                        re_serialization=enc_attn[s],
+                        serialization_order=self.order,
+                    ),
+                    name="down",
+                )
+            for i in range(enc_depths[s]):
+                enc.add(
+                    Block(
+                        channels=enc_channels[s],
+                        num_heads=enc_num_head[s],
+                        patch_size=enc_patch_size[s],
+                        mlp_ratio=mlp_ratio,
+                        qkv_bias=qkv_bias,
+                        qk_scale=qk_scale,
+                        attn_drop=attn_drop,
+                        proj_drop=proj_drop,
+                        drop_path=enc_drop_path_[i],
+                        norm_layer=ln_layer,
+                        act_layer=act_layer,
+                        pre_norm=pre_norm,
+                        order_index=i % len(self.order),
+                        cpe_indice_key=f"stage{s}",
+                        enable_conv=enc_conv[s],
+                        enable_attn=enc_attn[s],
+                        rope_freq=enc_rope_freq[s],
+                        export_mode=self.export_mode,
+                    ),
+                    name=f"block{i}",
+                )
+            if len(enc) != 0:
+                self.enc.add(module=enc, name=f"enc{s}")
+
+        # decoder
+        if not self.enc_mode:
+            dec_drop_path = [x.item() for x in torch.linspace(0, drop_path, sum(dec_depths))]
+            self.dec = PointSequential()
+            dec_channels = list(dec_channels) + [enc_channels[-1]]
+            for s in reversed(range(self.num_stages - 1)):
+                dec_drop_path_ = dec_drop_path[sum(dec_depths[:s]) : sum(dec_depths[: s + 1])]
+                dec_drop_path_.reverse()
+                dec = PointSequential()
+                dec.add(
+                    GridUnpooling(
+                        in_channels=dec_channels[s + 1],
+                        skip_channels=enc_channels[s],
+                        out_channels=dec_channels[s],
+                        norm_layer=bn_layer,
+                        act_layer=act_layer,
+                    ),
+                    name="up",
+                )
+                for i in range(dec_depths[s]):
+                    dec.add(
+                        Block(
+                            channels=dec_channels[s],
+                            num_heads=dec_num_head[s],
+                            patch_size=dec_patch_size[s],
+                            mlp_ratio=mlp_ratio,
+                            qkv_bias=qkv_bias,
+                            qk_scale=qk_scale,
+                            attn_drop=attn_drop,
+                            proj_drop=proj_drop,
+                            drop_path=dec_drop_path_[i],
+                            norm_layer=ln_layer,
+                            act_layer=act_layer,
+                            pre_norm=pre_norm,
+                            order_index=i % len(self.order),
+                            cpe_indice_key=f"stage{s}",
+                            enable_conv=dec_conv[s],
+                            enable_attn=dec_attn[s],
+                            rope_freq=dec_rope_freq[s],
+                            export_mode=self.export_mode,
+                        ),
+                        name=f"block{i}",
+                    )
+                self.dec.add(module=dec, name=f"dec{s}")
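+
+    # Editor's note (illustrative): with the defaults above this builds 5 encoder
+    # stages at channels (36, 72, 144, 252, 504), downsampling by 2 between
+    # stages; the first three stages use sparse-conv blocks, the last two use
+    # PointROPE attention, and the decoder only unpools (all dec_depths are 0).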
+
+    def forward(self, data_dict):
+        """
+        data_dict is the batched input point cloud; it should contain at least:
+        1. feat [N, input_dim]: input features of the point cloud
+        2. grid_coord [N, 3]: voxelized coordinates after grid sampling
+           and/or
+           coord [N, 3]: original coordinates
+           grid_size: grid size used for grid sampling
+        3. offset [batch_size]: separators of the point clouds in the batched data
+           and/or
+           batch [N]: batch index of each point
+        """
+        point = Point(data_dict)
+        if self.enc_attn[0]:
+            point.serialization(order=self.order, shuffle_orders=self.shuffle_orders)
+        point.sparsify()
+
+        point = self.embedding(point)
+        point = self.enc(point)
+
+        if not self.enc_mode:
+            point = self.dec(point)
+
+        return point
+
+    def export_forward(self, data_dict):
+        point = Point(data_dict)
+        if self.enc_attn[0]:
+            point["serialized_depth"] = data_dict["serialized_depth"]
+            point["serialized_code"] = data_dict["serialized_code"]
+            point["serialized_order"] = data_dict["serialized_order"]
+            point["serialized_inverse"] = data_dict["serialized_inverse"]
+            point["sparse_shape"] = data_dict["sparse_shape"]
+        point.sparsify()
+
+        point = self.embedding(point)
+        point = self.enc(point)
+
+        if not self.enc_mode:
+            point = self.dec(point)
+
+        return point
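+
+# Editor's sketch (illustrative): a minimal data_dict for LitePT.forward,
+# assuming a single cloud of N points with 4 input features; names follow the
+# docstring above, values are made up.
+#
+#   data_dict = {
+#       "feat": torch.randn(N, 4),
+#       "grid_coord": grid_coord,       # [N, 3] integer voxel coordinates
+#       "grid_size": 0.05,
+#       "offset": torch.tensor([N]),
+#   }
+#   point = LitePT(in_channels=4)(data_dict)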
diff --git a/projects/PTv3/models/modules.py b/projects/PTv3/models/modules.py
index 851376e9f..3d51a38b9 100644
--- a/projects/PTv3/models/modules.py
+++ b/projects/PTv3/models/modules.py
@@ -80,3 +80,67 @@ def forward(self, input):
             else:
                 input = module(input)
         return input
+
+
+class MLP(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels=None,
+        out_channels=None,
+        act_layer=nn.GELU,
+        drop=0.0,
+    ):
+        super().__init__()
+        out_channels = out_channels or in_channels
+        hidden_channels = hidden_channels or in_channels
+        self.fc1 = nn.Linear(in_channels, hidden_channels)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_channels, out_channels)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Embedding(PointModule):
+    def __init__(
+        self,
+        in_channels,
+        embed_channels,
+        norm_layer=None,
+        act_layer=None,
+        export_mode=False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.embed_channels = embed_channels
+
+        if export_mode:
+            from SparseConvolution.sparse_conv import SubMConv3d
+        else:
+            from spconv.pytorch import SubMConv3d
+
+        self.stem = PointSequential(
+            conv=SubMConv3d(
+                in_channels,
+                embed_channels,
+                kernel_size=5,
+                padding=1,
+                bias=False,
+                indice_key="stem",
+            )
+        )
+        if norm_layer is not None:
+            self.stem.add(norm_layer(embed_channels), name="norm")
+        if act_layer is not None:
+            self.stem.add(act_layer(), name="act")
+
+    def forward(self, point: Point):
+        point = self.stem(point)
+        return point
diff --git a/projects/PTv3/models/point_transformer_v3/point_transformer_v3m1_base.py b/projects/PTv3/models/point_transformer_v3/point_transformer_v3m1_base.py
index e91fb76c6..acb43eff5 100644
--- a/projects/PTv3/models/point_transformer_v3/point_transformer_v3m1_base.py
+++ b/projects/PTv3/models/point_transformer_v3/point_transformer_v3m1_base.py
@@ -19,7 +19,7 @@
     flash_attn = None
 
 from models.builder import MODELS
-from models.modules import PointModule, PointSequential
+from models.modules import MLP, Embedding, PointModule, PointSequential
 from models.point_prompt_training import PDNorm
 from models.scatter.functional import argsort, segment_csr, unique
 from models.utils.misc import offset2bincount
@@ -143,94 +143,6 @@ def get_rel_pos(self, point, order):
         point[rel_pos_key] = grid_coord.unsqueeze(2) - grid_coord.unsqueeze(1)
         return point[rel_pos_key]
 
-    @torch.no_grad()
-    def get_padding_and_inverse(self, point):
-        pad_key = "pad"
-        unpad_key = "unpad"
-        cu_seqlens_key = "cu_seqlens_key"
-        if pad_key not in point.keys() or unpad_key not in point.keys() or cu_seqlens_key not in point.keys():
-            offset = point.offset
-            bincount = offset2bincount(offset)
-            bincount_pad = (
-                torch.maximum(
-                    torch.div(
-                        bincount + self.patch_size - 1,
-                        self.patch_size,
-                        rounding_mode="trunc",
-                    ),
-                    torch.tensor(1, device=bincount.device),
-                )
-                * self.patch_size
-            )
-            # only pad point when num of points larger than patch_size
-            mask_pad = bincount > self.patch_size
-            bincount_pad = (1 - mask_pad.int()) * bincount + mask_pad.int() * bincount_pad
-
-            if not self.export_mode:
-                _offset = nn.functional.pad(offset, (1, 0))
-                _offset_pad = nn.functional.pad(torch.cumsum(bincount_pad, dim=0), (1, 0))
-
-                pad = torch.arange(_offset_pad[-1], device=offset.device)
-                unpad = torch.arange(_offset[-1], device=offset.device)
-                cu_seqlens = []
-                for i in range(len(offset)):
-                    unpad[_offset[i] : _offset[i + 1]] += _offset_pad[i] - _offset[i]
-                    if bincount[i] != bincount_pad[i]:
-                        pad[
-                            _offset_pad[i + 1] - self.patch_size + (bincount[i] % self.patch_size) : _offset_pad[i + 1]
-                        ] = pad[
-                            _offset_pad[i + 1]
-                            - 2 * self.patch_size
-                            + (bincount[i] % self.patch_size) : _offset_pad[i + 1]
-                            - self.patch_size
-                        ]
-                    pad[_offset_pad[i] : _offset_pad[i + 1]] -= _offset_pad[i] - _offset[i]
-                    cu_seqlens.append(
-                        torch.arange(
-                            _offset_pad[i],
-                            _offset_pad[i + 1],
-                            step=self.patch_size,
-                            dtype=torch.int32,
-                            device=offset.device,
-                        )
-                    )
-                point[pad_key] = pad
-                point[unpad_key] = unpad
-                point[cu_seqlens_key] = nn.functional.pad(torch.concat(cu_seqlens), (0, 1), value=_offset_pad[-1])
-            else:
-                # NOTE(knzo25): needed due to tensorrt reasons
-                assert len(offset) == 1
-
-                # pad_orig = pad
-                # unpad_orig = unpad
-
-                pad = torch.arange(bincount_pad[0], device=offset.device)
-                unpad = torch.arange(offset[0], device=offset.device)
-                cu_seqlens = []
-
-                pad[bincount_pad[0] - self.patch_size + (bincount[0] % self.patch_size) : bincount_pad[0]] = pad[
-                    bincount_pad[0]
-                    - 2 * self.patch_size
-                    + (bincount[0] % self.patch_size) : bincount_pad[0]
-                    - self.patch_size
-                ]
-
-                cu_seqlens.append(
-                    torch.arange(
-                        0,
-                        bincount_pad[0],
-                        step=self.patch_size,
-                        dtype=torch.int32,
-                        device=offset.device,
-                    )
-                )
-
-                point[pad_key] = pad
-                point[unpad_key] = unpad
-                point[cu_seqlens_key] = nn.functional.pad(torch.concat(cu_seqlens), (0, 1), value=bincount_pad[0])
-
-        return point[pad_key], point[unpad_key], point[cu_seqlens_key]
-
     def forward(self, point):
         if not self.enable_flash:
             assert offset2bincount(point.offset).min() >= self.patch_size_max  # NOTE(knzo25): assumed for deployment
@@ -240,7 +152,7 @@ def forward(self, point):
         K = self.patch_size
         C = self.channels
 
-        pad, unpad, cu_seqlens = self.get_padding_and_inverse(point)
+        pad, unpad, cu_seqlens = point.get_padding_and_inverse(self.patch_size)
 
         order = point.serialized_order[self.order_index][pad]
         inverse = unpad[point.serialized_inverse[self.order_index]]
@@ -281,32 +193,6 @@ def forward(self, point):
         return point
 
 
-class MLP(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        hidden_channels=None,
-        out_channels=None,
-        act_layer=nn.GELU,
-        drop=0.0,
-    ):
-        super().__init__()
-        out_channels = out_channels or in_channels
-        hidden_channels = hidden_channels or in_channels
-        self.fc1 = nn.Linear(in_channels, hidden_channels)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_channels, out_channels)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
 class Block(PointModule):
     def __init__(
         self,
@@ -566,44 +452,6 @@ def forward(self, point):
         return parent
 
 
-class Embedding(PointModule):
-    def __init__(
-        self,
-        in_channels,
-        embed_channels,
-        norm_layer=None,
-        act_layer=None,
-        export_mode=False,
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.embed_channels = embed_channels
-
-        if export_mode:
-            from SparseConvolution.sparse_conv import SubMConv3d
-        else:
-            from spconv.pytorch import SubMConv3d
-
-        self.stem = PointSequential(
-            conv=SubMConv3d(
-                in_channels,
-                embed_channels,
-                kernel_size=5,
-                padding=1,
-                bias=False,
-                indice_key="stem",
-            )
-        )
-        if norm_layer is not None:
-            self.stem.add(norm_layer(embed_channels), name="norm")
-        if act_layer is not None:
-            self.stem.add(act_layer(), name="act")
-
-    def forward(self, point: Point):
-        point = self.stem(point)
-        return point
-
-
 @MODELS.register_module("PT-v3m1")
 class PointTransformerV3(PointModule):
diff --git a/projects/PTv3/models/utils/structure.py b/projects/PTv3/models/utils/structure.py
index df335b9a5..f40f88e1d 100644
--- a/projects/PTv3/models/utils/structure.py
+++ b/projects/PTv3/models/utils/structure.py
@@ -1,8 +1,10 @@
 import spconv.pytorch as spconv
 import torch
 from addict import Dict
-from models.utils import batch2offset, offset2batch
+from models.scatter import argsort
+from models.utils import batch2offset, offset2batch, offset2bincount
 from models.utils.serialization import decode, encode
+from torch import nn
 
 
 def bit_length_tensor(x: torch.Tensor) -> torch.Tensor:
@@ -87,7 +89,10 @@ def serialization(self, order="z", depth=None, shuffle_orders=False):
         # OrderN ([n])] (k, n)
         code = [encode(self.grid_coord, self.batch, depth, order=order_) for order_ in order]
         code = torch.stack(code)
-        order = torch.argsort(code)
+        if torch.onnx.is_in_onnx_export():
+            order = torch.stack([argsort(code_i) for code_i in code], dim=0)
+        else:
+            order = torch.argsort(code)
         inverse = torch.zeros_like(order).scatter_(
             dim=1,
             index=order,
@@ -137,3 +142,80 @@ def sparsify(self, pad=96):
         )
         self["sparse_shape"] = sparse_shape
         self["sparse_conv_feat"] = sparse_conv_feat
+
+    @torch.no_grad()
+    def get_padding_and_inverse(self, patch_size) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        pad_key = "pad"
+        unpad_key = "unpad"
+        cu_seqlens_key = "cu_seqlens_key"
+        if pad_key not in self.keys() or unpad_key not in self.keys() or cu_seqlens_key not in self.keys():
+            offset = self.offset
+            bincount = offset2bincount(offset)
+            bincount_pad = (
+                torch.div(
+                    bincount + patch_size - 1,
+                    patch_size,
+                    rounding_mode="trunc",
+                )
+                * patch_size
+            )
+            # only pad when the number of points is larger than patch_size
+            mask_pad = bincount > patch_size
+            bincount_pad = ~mask_pad * bincount + mask_pad * bincount_pad
+
+            if not torch.onnx.is_in_onnx_export():
+                _offset = nn.functional.pad(offset, (1, 0))
+                _offset_pad = nn.functional.pad(torch.cumsum(bincount_pad, dim=0), (1, 0))
+                pad = torch.arange(_offset_pad[-1], device=offset.device)
+                unpad = torch.arange(_offset[-1], device=offset.device)
+                cu_seqlens = []
+                for i in range(len(offset)):
+                    unpad[_offset[i] : _offset[i + 1]] += _offset_pad[i] - _offset[i]
+                    if bincount[i] != bincount_pad[i]:
+                        pad[_offset_pad[i + 1] - patch_size + (bincount[i] % patch_size) : _offset_pad[i + 1]] = pad[
+                            _offset_pad[i + 1]
+                            - 2 * patch_size
+                            + (bincount[i] % patch_size) : _offset_pad[i + 1]
+                            - patch_size
+                        ]
+                    pad[_offset_pad[i] : _offset_pad[i + 1]] -= _offset_pad[i] - _offset[i]
+                    cu_seqlens.append(
+                        torch.arange(
+                            _offset_pad[i],
+                            _offset_pad[i + 1],
+                            step=patch_size,
+                            dtype=torch.int32,
+                            device=offset.device,
+                        )
+                    )
+                self[pad_key] = pad
+                self[unpad_key] = unpad
+                self[cu_seqlens_key] = nn.functional.pad(torch.concat(cu_seqlens), (0, 1), value=_offset_pad[-1])
+            else:
+                # NOTE: needed due to tensorrt reasons
+                assert len(offset) == 1
+
+                pad = torch.arange(bincount_pad[0], device=offset.device)
+                unpad = torch.arange(offset[0], device=offset.device)
+                cu_seqlens = []
+
+                pad[bincount_pad[0] - patch_size + (bincount[0] % patch_size) : bincount_pad[0]] = pad[
+                    bincount_pad[0] - 2 * patch_size + (bincount[0] % patch_size) : bincount_pad[0] - patch_size
+                ]
+
+                cu_seqlens.append(
+                    torch.arange(
+                        0,
+                        bincount_pad[0],
+                        step=patch_size,
+                        dtype=torch.int32,
+                        device=offset.device,
+                    )
+                )
+
+                self[pad_key] = pad
+                self[unpad_key] = unpad
+                self[cu_seqlens_key] = nn.functional.pad(torch.concat(cu_seqlens), (0, 1), value=bincount_pad[0])
+
+        return self[pad_key], self[unpad_key], self[cu_seqlens_key]
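+
+    # Editor's note (illustrative): for patch_size=1024 and a cloud of 2500
+    # points, bincount_pad becomes 3072; the 572 padded slots re-read indices
+    # from the previous full patch so that flash attention sees equal-length
+    # patches, and `unpad` maps attention outputs back to the original order.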
diff --git a/projects/PTv3/pyproject.toml b/projects/PTv3/pyproject.toml
new file mode 100644
index 000000000..3831aba25
--- /dev/null
+++ b/projects/PTv3/pyproject.toml
@@ -0,0 +1,92 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "ptv3"
+version = "0.1.0"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "addict>=2.4.0",
+    "colorhash>=2.1.0",
+    "einops>=0.8.1",
+    "flash-attn",
+    "h5py>=3.15.1",
+    "plyfile>=1.1.3",
+    "pointrope",
+    "pyyaml>=6.0.3",
+    "scipy>=1.15.3",
+    "sharedarray>=3.2.4",
+    "spconv-cu126>=2.3.8",
+    "tensorboard>=2.20.0",
+    "tensorboardx>=2.6.4",
+    "termcolor>=3.3.0",
+    "timm>=1.0.22",
+    "torch==2.8.0",
+    "torchvision==0.23.0",
+    "torchmetrics>=1.8.2",
+    "torch-cluster>=1.6.3",
+    "torch-geometric>=2.7.0",
+    "torch-scatter>=2.1.2",
+    "torch-sparse>=0.6.18",
+    "wandb>=0.23.1",
+    "yapf==0.40.1",
+    "nuscenes-devkit>=1.2.0",
+    "open3d>=0.19.0",
+    "onnx>=1.20.0",
+]
+
+[project.optional-dependencies]
+insseg = ["pointgroup-ops", "pointops"]  # for instance segmentation
+
+[tool.uv]
+find-links = [
+    "https://data.pyg.org/whl/torch-2.8.0+cu129.html",  # for torch-* libraries
+]
+no-build-isolation-package = [
+    "flash-attn",
+    "torch-cluster",
+    "torch-geometric",
+    "torch-scatter",
+    "torch-sparse",
+    "pointops",
+    "pointrope",
+    "pointgroup-ops",
+]
+
+[tool.uv.extra-build-dependencies]
+pointrope = [{ requirement = "torch", match-runtime = true }]
+torch-cluster = [
+    { requirement = "setuptools", match-runtime = false },
+    { requirement = "torch", match-runtime = true },
+]
+
+[tool.uv.sources]
+torch = [{ index = "pytorch-cu129", marker = "sys_platform == 'linux'" }]
+torchvision = [{ index = "pytorch-cu129", marker = "sys_platform == 'linux'" }]
+flash-attn = { url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl" }
+pointrope = { workspace = true }
+pointops = { workspace = true }
+pointgroup-ops = { workspace = true }
+
+[[tool.uv.index]]
+name = "pytorch-cu129"
+url = "https://download.pytorch.org/whl/cu129"
+explicit = true
+
+[tool.uv.workspace]
+members = ["libs/pointgroup_ops", "libs/pointops", "libs/pointrope"]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = [
+    "datasets",
+    "datasets.*",
+    "engines",
+    "engines.*",
+    "models",
+    "models.*",
+    "utils",
+    "utils.*",
+]
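+
+# Editor's note (illustrative): with this workspace layout, running `uv sync`
+# from projects/PTv3 should resolve torch from the cu129 index and build the
+# no-build-isolation packages (flash-attn, pointrope, torch-scatter, ...)
+# against it; exact behavior depends on the installed uv version.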