From 08a2a99a7ed444be57759ac03681637574b65bec Mon Sep 17 00:00:00 2001
From: tzhong518
Date: Tue, 23 Dec 2025 14:35:59 +0900
Subject: [PATCH 1/2] add: yolox_multitask model

Signed-off-by: tzhong518
---
 ...opt-elan-semseg_960x960_300e_cityscapes.py | 267 ++++++++++
 ...-opt-elan-semseg_960x960_300e_t4dataset.py | 372 ++++++++++++++
 .../yolox/models/heads/__init__.py            |   3 +
 .../yolox/models/heads/seg_head.py            | 122 +++++
 .../yolox/models/layers/network_blocks.py     | 466 ++++++++++++++++++
 .../yolox/models/yolox_multitask.py           | 189 +++++++
 projects/YOLOX_opt_elan/yolox/transforms.py   |  37 ++
 7 files changed, 1456 insertions(+)
 create mode 100644 projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py
 create mode 100644 projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/models/heads/__init__.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/transforms.py

diff --git a/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py b/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py
new file mode 100644
index 000000000..05af852f9
--- /dev/null
+++ b/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py
@@ -0,0 +1,267 @@
+_base_ = [
+    "../../../../../autoware_ml/configs/detection2d/default_runtime.py",
+    "../../../../../autoware_ml/configs/detection2d/schedules/schedule_1x.py",
+]
+
+custom_imports = dict(
+    imports=[
+        "projects.YOLOX_opt_elan.yolox",
+        "autoware_ml.detection2d.metrics",
+        "autoware_ml.detection2d.datasets",
+        "projects.YOLOX_opt_elan.yolox.models",
+        "projects.YOLOX_opt_elan.yolox.models.yolox_multitask",
+        "projects.YOLOX_opt_elan.yolox.transforms",
+        "mmseg.evaluation.metrics", # import the segmentation evaluation metrics
+    ],
+    allow_failed_imports=False,
+)
+
+# parameter settings
+# IMG_SCALE = (960, 960)
+IMG_SCALE = (1024, 512)
+max_epochs = 300
+num_last_epochs = 15
+resume_from = None
+interval = 1
+batch_size = 16
+activation = "ReLU6"
+num_workers = 4
+base_lr = 0.001
+
+
+classes = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle')
+palette = [
+    (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70),
+    (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)
+]
+
+seg_classes = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle')
+seg_palette = [(128, 64, 128), (244, 35, 232), (70, 70, 70), (102, 102, 156), (190, 153, 153), (153, 153, 153), (250, 170, 30), (220, 220, 0), (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)]
+
+# metainfo = dict(classes=classes, palette=palette)
+metainfo = dict(classes=seg_classes, palette=seg_palette)
+
+model = dict(
+    type="YOLOXMultiTask",
+    data_preprocessor=dict(
+        type="DetDataPreprocessor",
+        pad_size_divisor=32,
+        # batch_augments=[
+        #     dict(
+        #         type="BatchSyncRandomResize",
+        # 
random_size_range=(480, 800), + # size_divisor=32, + # interval=10, + # ) + # ], + ), + backbone=dict( + type="ELANDarknet", + deepen_factor=2, + widen_factor=1, + out_indices=(2, 3, 4), + act_cfg=dict(type=activation), + ), + neck=dict( + type="YOLOXPAFPN_ELAN", + in_channels=[128, 256, 512], + out_channels=128, + num_elan_blocks=2, + act_cfg=dict(type=activation), + ), + bbox_head=dict( + type="YOLOXHead", + num_classes=8, + in_channels=128, + feat_channels=128, + act_cfg=dict(type=activation), + loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), + loss_bbox=dict(type='IoULoss', loss_weight=0.0), + loss_obj=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), + loss_l1=dict(type='L1Loss', loss_weight=0.0), + ), + mask_head=dict( + type="YOLOXSegHead", + in_channels=[128, 128, 128], + feat_channels=128, + num_classes=19, + act_cfg=dict(type=activation), + loss=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0), + ), + train_cfg=dict(assigner=dict(type="SimOTAAssigner", center_radius=2.5)), + test_cfg=dict(score_thr=0.01, nms=dict(type="nms", iou_threshold=0.65)), +) + +dataset_type = 'CocoDataset' +data_root = 'data/cityscapes/' +backend_args = None + +train_pipeline = [ + # dict(type="Mosaic", img_scale=IMG_SCALE, pad_val=114.0), + # dict(type="MixUp", img_scale=IMG_SCALE, ratio_range=(0.8, 1.6), pad_val=114.0), + dict(type="YOLOXHSVRandomAug"), + dict(type="RandomFlip", prob=0.5), + dict(type="Resize", scale=IMG_SCALE, keep_ratio=False), + dict( + type="Pad", + pad_to_square=False, + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0), seg=255), + ), + dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type="PackDetInputs"), +] + +test_pipeline = [ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="FixCityscapesPath", data_root=data_root, split='val'), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + dict(type="Resize", scale=IMG_SCALE, keep_ratio=False), + dict(type="Pad", pad_to_square=False, size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0), seg=255)), + dict( + type="PackDetInputs", + meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor"), + ), +] + +train_dataset = dict( + type="MultiImageMixDataset", + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='leftImg8bit/train', + seg_map_path='gtFine/train' + ), + ann_file='annotations/instancesonly_filtered_gtFine_train.json', + pipeline=[ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="FixCityscapesPath", data_root=data_root, split='train'), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + ], + filter_cfg=dict(filter_empty_gt=False, min_size=8), + backend_args=backend_args, + metainfo=metainfo, + ), + pipeline=train_pipeline, +) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + dataset=train_dataset, +) + +val_dataloader = dict( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + drop_last=False, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='leftImg8bit/val', + seg_map_path='gtFine/val' + ), + ann_file='annotations/instancesonly_filtered_gtFine_val.json', + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args, + metainfo=metainfo, + ), +) +test_dataloader = 
val_dataloader + +val_evaluator = [ + dict(type='mmseg.IoUMetric', ignore_index=255, iou_metrics=['mIoU'], prefix="seg", classes=seg_classes) +] +test_evaluator = val_evaluator + +train_cfg = dict(max_epochs=max_epochs, val_interval=interval) + +optimizer = dict( + type="OptimWrapper", + optimizer=dict(type="SGD", lr=base_lr, momentum=0.9, weight_decay=5e-4, nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), +) + +if max_epochs > 5: + param_scheduler = [ + dict( + type="mmdet.QuadraticWarmupLR", + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True, + ), + dict( + type="CosineAnnealingLR", + eta_min=base_lr * 0.05, + begin=5, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True, + ), + dict( + type="ConstantLR", + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ), + ] +else: + param_scheduler = [] + +log_config = dict( + interval=1, + hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")], +) + +default_hooks = dict( + checkpoint=dict( + interval=interval, + max_keep_ckpts=3, + save_best='seg/mIoU', + rule='greater' + ), + visualization=dict( + type='DetVisualizationHook', + draw=False, + interval=50, + show=False, + wait_time=2, + test_out_dir='vis_data' + ), +) + +custom_hooks = [ + dict(type="YOLOXModeSwitchHook", num_last_epochs=num_last_epochs, priority=48), + dict(type="SyncNormHook", priority=48), + dict( + type="EMAHook", + ema_type="ExpMomentumEMA", + momentum=0.0001, + update_buffers=True, + priority=4, + ), +] + +auto_scale_lr = dict(base_batch_size=batch_size) + +vis_backends = [ + dict(type="LocalVisBackend"), + dict(type="TensorboardVisBackend"), +] + +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], + name='visualizer', + alpha=0.5, +) diff --git a/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py b/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py new file mode 100644 index 000000000..d8f5bee0a --- /dev/null +++ b/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py @@ -0,0 +1,372 @@ +_base_ = [ + "../../../../../autoware_ml/configs/detection2d/default_runtime.py", + "../../../../../autoware_ml/configs/detection2d/schedules/schedule_1x.py", + "../../../../../autoware_ml/configs/detection2d/dataset/t4dataset/comlops.py", +] + +custom_imports = dict( + imports=[ + "projects.YOLOX_opt_elan.yolox", + "autoware_ml.detection2d.metrics", + "autoware_ml.detection2d.datasets", + "projects.YOLOX_opt_elan.yolox.models", + "projects.YOLOX_opt_elan.yolox.models.yolox_multitask", + "projects.YOLOX_opt_elan.yolox.transforms", + ], + allow_failed_imports=False, +) + +IMG_SCALE = (960, 960) + +# parameter settings +img_scale = (960, 960) +max_epochs = 300 +num_last_epochs = 15 +resume_from = None +interval = 1 +batch_size = 12 +activation = "ReLU6" +num_workers = 4 + +base_lr = 0.001 + +# model settings +model = dict( + type="YOLOXMultiTask", + data_preprocessor=dict( + type="DetDataPreprocessor", + pad_size_divisor=32, + batch_augments=[ + dict( + type="BatchSyncRandomResize", + random_size_range=(480, 800), + size_divisor=32, + interval=10, + ) + ], + ), + backbone=dict( + type="ELANDarknet", + deepen_factor=2, + widen_factor=1, + 
out_indices=(2, 3, 4), + act_cfg=dict(type=activation), + ), + neck=dict( + type="YOLOXPAFPN_ELAN", + in_channels=[128, 256, 512], + out_channels=128, + num_elan_blocks=2, + act_cfg=dict(type=activation), + ), + bbox_head=dict( + type="YOLOXHead", + num_classes=40, + in_channels=128, + feat_channels=128, + act_cfg=dict(type=activation), + loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), + loss_bbox=dict(type='IoULoss', loss_weight=0.0), + loss_obj=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), + loss_l1=dict(type='L1Loss', loss_weight=0.0), + ), + mask_head=dict( + type="YOLOXSegHead", + in_channels=[128, 128, 128], + feat_channels=128, + num_classes=40, + act_cfg=dict(type=activation), + loss=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0), + ), + train_cfg=dict(assigner=dict(type="SimOTAAssigner", center_radius=2.5)), + test_cfg=dict(score_thr=0.01, nms=dict(type="nms", iou_threshold=0.65)), +) + +data_root = "" +anno_file_root = "./data/comlops/semseg/" +dataset_type = "T4Dataset" + +backend_args = None + +# pipeline +train_pipeline = [ + # dict(type="Mosaic", img_scale=IMG_SCALE, pad_val=114.0), + # dict( + # type="RandomAffine", + # scaling_ratio_range=(0.1, 2), + # border=(-IMG_SCALE[0] // 2, -IMG_SCALE[1] // 2), + # ), + # dict(type="MixUp", img_scale=IMG_SCALE, ratio_range=(0.8, 1.6), pad_val=114.0), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + dict(type="YOLOXHSVRandomAug"), + dict(type="RandomFlip", prob=0.5), + dict(type="Resize", scale=IMG_SCALE, keep_ratio=False), + dict( + type="Pad", + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0), seg=255), + ), + dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type="PackDetInputs"), +] + +classes = ( + "animal", + "bicycle", + "building", + "bus", + "car", + "cone", + "construction", + "crosswalk", + "dashed_lane_marking", + "deceleration_line", + "gate", + "guide_post", + "laneline_dash_white", + "laneline_dash_yellow", + "laneline_solid_green", + "laneline_solid_red", + "laneline_solid_white", + "laneline_solid_yellow", + "marking_arrow", + "marking_character", + "marking_other", + "motorcycle", + "other_obstacle", + "other_pedestrian", + "other_vehicle", + "parking_lot", + "pedestrian", + "pole", + "road", + "road_debris", + "sidewalk", + "sky", + "stopline", + "striped_road_marking", + "traffic_light", + "traffic_sign", + "train", + "truck", + # "unknown", + "vegetation/terrain", + "wall/fence", +) + +palette = [ + (150, 120, 90), # 0: animal + (119, 11, 32), # 1: bicycle + (70, 70, 70), # 2: building + (0, 60, 100), # 3: bus + (0, 0, 142), # 4: car + (250, 170, 30), # 5: cone + (230, 150, 140), # 6: construction + (140, 140, 200), # 7: crosswalk + (255, 255, 255), # 8: dashed_lane_marking + (200, 200, 200), # 9: deceleration_line + (190, 153, 153), # 10: gate + (250, 170, 30), # 11: guide_post + (255, 255, 255), # 12: laneline_dash_white + (255, 255, 0), # 13: laneline_dash_yellow + (0, 255, 0), # 14: laneline_solid_green + (255, 0, 0), # 15: laneline_solid_red + (255, 255, 255), # 16: laneline_solid_white + (255, 215, 0), # 17: laneline_solid_yellow + (0, 255, 255), # 18: marking_arrow + (200, 0, 200), # 19: marking_character + (150, 0, 150), # 20: marking_other + (0, 0, 230), # 21: motorcycle + (80, 80, 80), # 22: other_obstacle + (250, 170, 160), # 23: other_pedestrian + (100, 80, 200), # 24: other_vehicle + (180, 165, 180), # 25: parking_lot + (220, 20, 60), # 26: pedestrian + (153, 153, 153), # 27: 
pole + (128, 64, 128), # 28: road + (110, 110, 110), # 29: road_debris + (244, 35, 232), # 30: sidewalk + (70, 130, 180), # 31: sky + (220, 220, 220), # 32: stopline + (160, 150, 180), # 33: striped_road_marking + (250, 170, 30), # 34: traffic_light + (220, 220, 0), # 35: traffic_sign + (0, 80, 100), # 36: train + (0, 0, 70), # 37: truck + (107, 142, 35), # 38: vegetation/terrain + (102, 102, 156), # 39: wall/fence +] +metainfo = dict(classes=classes, palette=palette) + +train_dataset = dict( + type="MultiImageMixDataset", + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=anno_file_root + "comlops_infos_train_cleaned.json", + pipeline=[ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + ], + filter_cfg=dict(filter_empty_gt=False, min_size=8), + backend_args=backend_args, + metainfo=metainfo, + ), + pipeline=train_pipeline, +) + +test_pipeline = [ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + dict(type="Resize", scale=img_scale, keep_ratio=False), + dict(type="Pad", pad_to_square=True, pad_val=dict(img=(114.0, 114.0, 114.0), seg=255)), + dict( + type="PackDetInputs", + meta_keys=( + "img_id", + "img_path", + "ori_shape", + "img_shape", + "scale_factor", + ), + ), +] + +train_dataloader = dict( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + dataset=train_dataset, +) + +val_dataloader = dict( + batch_size=batch_size, + num_workers=16, + persistent_workers=True, + drop_last=False, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=anno_file_root + "comlops_infos_val_cleaned.json", + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args, + metainfo=metainfo, + indices=2000, + ), +) + +test_dataloader = val_dataloader + +val_evaluator = [ + dict(type="VOCMetric", metric="mAP", prefix="det"), + dict(type='mmseg.IoUMetric', ignore_index=255, iou_metrics=['mIoU'], prefix="seg") +] + +test_evaluator = val_evaluator + +# train_cfg = dict(max_epochs=max_epochs, val_interval=interval) +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=200000, + val_interval=1000 +) + +# optimizer +optimizer = dict( + type="OptimWrapper", + optimizer=dict(type="SGD", lr=base_lr, momentum=0.9, weight_decay=5e-4, nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), +) + +# learning rate scheduler +if max_epochs > 5: + param_scheduler = [ + dict( + type="mmdet.QuadraticWarmupLR", + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True, + ), + dict( + type="CosineAnnealingLR", + eta_min=base_lr * 0.05, + begin=5, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True, + ), + dict( + type="ConstantLR", + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ), + ] +else: + param_scheduler = [] + +# logging +log_config = dict( + interval=1, + hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")], +) + +# default_hooks = dict( +# checkpoint=dict(interval=interval, max_keep_ckpts=3), +# ) +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=1000, + by_epoch=False, + max_keep_ckpts=5, + save_best='seg/mIoU', + rule='greater' + ), + logger=dict( + type='LoggerHook', + interval=50 + ), 
+ visualization=dict( + type='DetVisualizationHook', + draw=False, + interval=100, + show=False, + wait_time=2, + test_out_dir='vis_data' + ), +) + +custom_hooks = [ + dict(type="YOLOXModeSwitchHook", num_last_epochs=num_last_epochs, priority=48), + dict(type="SyncNormHook", priority=48), + dict( + type="EMAHook", + ema_type="ExpMomentumEMA", + momentum=0.0001, + update_buffers=True, + priority=4, + ), +] + +auto_scale_lr = dict(base_batch_size=batch_size) + + +vis_backends = [ + dict(type="LocalVisBackend"), + dict(type="TensorboardVisBackend"), +] + +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], + name='visualizer', + alpha=0.3, +) diff --git a/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py b/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py new file mode 100644 index 000000000..db9c396aa --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py @@ -0,0 +1,3 @@ +from .seg_head import YOLOXSegHead + +__all__ = ("YOLOXSegHead",) \ No newline at end of file diff --git a/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py b/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py new file mode 100644 index 000000000..ea5652d4f --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py @@ -0,0 +1,122 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Dict, List, Tuple, Union + +from torch import Tensor + +from mmdet.models.seg_heads.base_semantic_head import BaseSemanticHead +from ..layers.network_blocks import BaseConv, CSPLayer, DWConv + +from mmengine.registry import MODELS + +def get_activation(name="ReLU6"): + if name.lower() == "relu6": + return nn.ReLU6(inplace=True) + elif name.lower() == "relu": + return nn.ReLU(inplace=True) + elif name.lower() == "silu": + return nn.SiLU(inplace=True) + elif name.lower() == "lrelu": + return nn.LeakyReLU(0.1, inplace=True) + else: + raise AttributeError(f"Unsupported act type {name}") + +@MODELS.register_module() +class YOLOXSegHead(nn.Module): + def __init__(self, in_channels, num_classes, feat_channels=None, act_cfg=dict(type="ReLU6"), width=1.0, depthwise=False, train_cfg=None, + test_cfg=None, **kwargs): + super().__init__() + self.num_classes = num_classes + self.width = width + # self.stem_channels = feat_channels if feat_channels is not None else int(64 * width) + self.stem_channels = sum(in_channels) + + act_type = act_cfg.get("type", "ReLU6") + self.act_fn = get_activation(act_type) + + self.train_cfg = train_cfg + + Conv = DWConv if depthwise else BaseConv + + # mask head layers + self.conv1 = Conv(self.stem_channels, self.stem_channels, 3, 1, act=act_type) + self.conv2 = Conv(self.stem_channels, self.stem_channels, 3, 1, act=act_type) + self.up1 = nn.Upsample(scale_factor=2, mode="nearest") + self.conv3 = Conv(self.stem_channels, self.stem_channels // 2, 3, 1, act=act_type) + self.up2 = nn.Upsample(scale_factor=2, mode="nearest") + self.conv4 = Conv(self.stem_channels // 2, self.stem_channels // 2, 3, 1, act=act_type) + self.up3 = nn.Upsample(scale_factor=2, mode="nearest") + self.out_conv = nn.Conv2d(self.stem_channels // 2, num_classes, kernel_size=1, stride=1, padding=0) + + def forward(self, feats): + """ + Args: + feats (list[Tensor] or Tensor): features from backbone+neck + Returns: + seg_pred (Tensor): [B, num_classes, H, W] + """ + if isinstance(feats, (list, tuple)): + target_size = feats[0].shape[2:] + up_feats = [F.interpolate(f, size=target_size, 
mode='bilinear', align_corners=False) for f in feats] + x = torch.cat(up_feats, dim=1) # [B, sum(C_i), H, W] + else: + x = feats + + x = self.conv1(x) + x = self.conv2(x) + x = self.up1(x) + x = self.conv3(x) + x = self.up2(x) + x = self.conv4(x) + x = self.up3(x) + seg_pred = self.out_conv(x) + return seg_pred + + def loss(self, seg_pred, gt_masks): + """ + Args: + seg_pred: [B, C, H, W] + gt_masks: [B, H, W] long + Returns: + dict: {'loss_mask': ...} + """ + return dict(loss_mask=F.cross_entropy(seg_pred, gt_masks.long(), ignore_index=255)) + + def predict(self, + x: Union[Tensor, Tuple[Tensor]], + batch_data_samples, + rescale: bool = False) -> List[Tensor]: + + batch_img_metas = [ + data_sample.metainfo for data_sample in batch_data_samples + ] + seg_preds = self.forward(x) + + input_shape = batch_img_metas[0]['batch_input_shape'] + seg_preds = F.interpolate( + seg_preds, + size=input_shape, + mode='bilinear', + align_corners=False) + + result_list = [] + for i in range(len(batch_img_metas)): + img_meta = batch_img_metas[i] + h, w = img_meta['img_shape'] + + seg_pred = seg_preds[i][:, :h, :w] + + if rescale: + ori_h, ori_w = img_meta['ori_shape'] + seg_pred = F.interpolate( + seg_pred.unsqueeze(0), + size=(ori_h, ori_w), + mode='bilinear', + align_corners=False).squeeze(0) + + seg_pred = seg_pred.argmax(dim=0).to(torch.int64) + + result_list.append(seg_pred) + + return result_list diff --git a/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py b/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py new file mode 100644 index 000000000..1aab7efcb --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import torch +import torch.nn as nn +from torch.autograd import Function + +class SiLU(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + +class PactFix(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x, alpha=4.0): + y = torch.clamp(x, min = 0, max = alpha) + return y + +class Pact(Function): + @staticmethod + def forward(ctx, x, alpha, k): + ctx.save_for_backward(x, alpha) + # y_1 = 0.5 * ( torch.abs(x).detach() - torch.abs(x - alpha).detach() + alpha.item() ) + y = torch.clamp(x, min = 0, max = alpha.item()) + scale = (2**k - 1) / alpha + y_q = torch.round( y * scale) / scale + return y_q + + @staticmethod + def backward(ctx, dLdy_q): + # Backward function, I borrowed code from + # https://github.com/obilaniu/GradOverride/blob/master/functional.py + # We get dL / dy_q as a gradient + x, alpha, = ctx.saved_tensors + # Weight gradient is only valid when [0, alpha] + # Actual gradient for alpha, + # By applying Chain Rule, we get dL / dy_q * dy_q / dy * dy / dalpha + # dL / dy_q = argument, dy_q / dy * dy / dalpha = 0, 1 with x value range + lower_bound = x < 0 + upper_bound = x > alpha + # x_range = 1.0-lower_bound-upper_bound + x_range = ~(lower_bound|upper_bound) + grad_alpha = torch.sum(dLdy_q * torch.ge(x, alpha).float()).view(-1) + return dLdy_q * x_range.float(), grad_alpha, None + +def get_activation(name="silu", inplace=True): + #name = 'relu' + name = name.lower() + if name == "silu": + module = nn.SiLU(inplace=inplace) + elif name == "relu": + module = nn.ReLU(inplace=inplace) + elif name == "lrelu": + module = nn.LeakyReLU(0.1, inplace=inplace) + elif name == "pact": + module = Pact.apply + elif name == "pactfix": + module = PactFix() + elif name == "relu6": + module = nn.ReLU6() + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + +K = 2 +#fisrt = True +class BaseConv(nn.Module): + """A Conv2d -> Batchnorm -> silu/leaky relu block""" + + def __init__( + self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" + ): + super().__init__() + # same padding + pad = (ksize - 1) // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=pad, + groups=groups, + bias=bias, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = get_activation(act, inplace=True) + self.act_name = act + if (self.act_name == "pact") : + self.alpha = nn.Parameter(torch.tensor(20.)) + + + def forward(self, x): + if (self.act_name == "pact") : + return self.act(self.bn(self.conv(x)), self.alpha, 2) + else: + #print(self.conv) + return self.act(self.bn(self.conv(x))) + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class DWConv(nn.Module): + """Depthwise Conv + Conv""" + + def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): + super().__init__() + self.dconv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + act=act, + ) + self.pconv = BaseConv( + in_channels, out_channels, ksize=1, stride=1, groups=1, act=act + ) + + def forward(self, x): + x = self.dconv(x) + return self.pconv(x) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + kernel=3 + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + 
self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, kernel, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class BottleneckV8(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + kernel=3, + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv(in_channels, hidden_channels, kernel, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, kernel, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + + +class Bottleneck_EFF(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + kernel=3, + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv(in_channels, hidden_channels, kernel, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, 5, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class ResLayer(nn.Module): + "Residual layer with `in_channels` inputs." + + def __init__(self, in_channels: int): + super().__init__() + mid_channels = in_channels // 2 + self.layer1 = BaseConv( + in_channels, mid_channels, ksize=1, stride=1, act="lrelu" + ) + self.layer2 = BaseConv( + mid_channels, in_channels, ksize=3, stride=1, act="lrelu" + ) + + def forward(self, x): + out = self.layer2(self.layer1(x)) + return x + out + + +class SPPBottleneck(nn.Module): + """Spatial pyramid pooling layer used in YOLOv3-SPP""" + + def __init__( + self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" + ): + super().__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) + self.m = nn.ModuleList( + [ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ] + ) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) + + def forward(self, x): + x = self.conv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.conv2(x) + return x + + +class CSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + elan=False, + kernel=3, + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. 
+        """
+        # ch_in, ch_out, number, shortcut, groups, expansion
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+        self.elan = elan
+
+        if (self.elan == True) :
+            self.conv3 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act)
+            module_list = [
+                BottleneckV8(
+                    hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel
+                )
+                for _ in range(n)
+            ]
+        else:
+            self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act)
+            module_list = [
+                Bottleneck(
+                    hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel
+                )
+                for _ in range(n)
+            ]
+        self.m = nn.Sequential(*module_list)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_2 = self.conv2(x)
+        el = []
+        if (self.elan == True) :
+            x = x_1
+            for m in self.m:
+                x = m(x)
+                el.append(x)
+            x = torch.cat([x_2] + [m for m in el], dim=1)
+        else :
+            x_1 = self.m(x_1)
+            x = torch.cat((x_1, x_2), dim=1)
+        return self.conv3(x)
+
+class CSPLayer_EFF(nn.Module):
+    """CSP bottleneck variant (EFF): CSPLayer with 3x3 stem convolutions and Bottleneck_EFF blocks"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=1,
+        shortcut=True,
+        #expansion=0.5,
+        expansion=1.0,
+        depthwise=False,
+        act="silu",
+        kernel=3,
+        elan=False,
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of Bottlenecks. Default value: 1.
+        """
+        # ch_in, ch_out, number, shortcut, groups, expansion
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(in_channels, hidden_channels, 3, stride=1, act=act)
+        self.conv2 = BaseConv(in_channels, hidden_channels, 3, stride=1, act=act)
+        self.elan = elan
+
+        if (self.elan == True) :
+            self.conv3 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act)
+            module_list = [
+                BottleneckV8(
+                    hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act
+                )
+                for _ in range(n)
+            ]
+        else :
+            self.conv3 = BaseConv(2 * hidden_channels, out_channels, 3, stride=1, act=act)
+
+            module_list = [
+                Bottleneck_EFF(
+                    hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel
+                )
+                for _ in range(n)
+            ]
+        self.m = nn.Sequential(*module_list)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_2 = self.conv2(x)
+        el = []
+        if (self.elan == True) :
+            x = x_1
+            for m in self.m:
+                x = m(x)
+                el.append(x)
+            x = torch.cat([x_2] + [m for m in el], dim=1)
+        else :
+            x_1 = self.m(x_1)
+            x = torch.cat((x_1, x_2), dim=1)
+        return self.conv3(x)
+
+
+class ELAN(nn.Module):
+    """ELAN aggregation block: concatenates intermediate conv outputs with a parallel branch and fuses them with a 1x1 conv"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=3,
+        shortcut=True,
+        #expansion=0.5,
+        expansion=1.0,
+        depthwise=False,
+        act="silu",
+        kernel=3,
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of stacked convolutions. Default value: 3. 
+ """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv(hidden_channels, out_channels, 3, stride=1, act=act) + self.conv_c = BaseConv(in_channels, hidden_channels, kernel, stride=1, act=act) + self.conv2 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act) + + + module_list = [ + self.conv_c + for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + el = [] + for m in self.m: + x = m(x) + el.append(x) + x = torch.cat([x_1] + [m for m in el], dim=1) + return self.conv2(x) + + + +class Focus(nn.Module): + """Focus width and height information into channel space.""" + + def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): + super().__init__() + self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) + + + +class SimpleStem(nn.Module): + """Simple Stem for Acceleration on Embedded Devices""" + + def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): + super().__init__() + #self.conv1 = BaseConv(in_channels, out_channels, ksize, stride, act=act) + #self.down1 = BaseConv(out_channels, out_channels, ksize, 2, act=act) + self.down1 = BaseConv(in_channels, out_channels, ksize, 2, act=act) + self.conv2 = BaseConv(out_channels, out_channels, ksize, stride, act=act) + #self.down2 = BaseConv(out_channels, out_channels, ksize, 2, act=act) + + def forward(self, x): + #x = self.conv1(x) + x = self.down1(x) + x = self.conv2(x) + #x = self.down2(x) + return x + diff --git a/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py b/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py new file mode 100644 index 000000000..5e7ac8746 --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py @@ -0,0 +1,189 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor + +from mmengine.model import BaseModule +from mmdet.registry import MODELS +from mmdet.models import BaseDetector +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmdet.structures import DetDataSample +from mmengine.structures import PixelData, InstanceData +from mmdet.structures import SampleList + +from .heads import YOLOXSegHead + +from mmengine.logging import print_log + +@MODELS.register_module() +class YOLOXMultiTask(BaseDetector): + """ + YOLOX MultiTask detector + Supports bbox + mask heads. 
+ """ + + def __init__(self, + backbone, + neck, + bbox_head, + mask_head=None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor=None, + init_cfg=None, + **kwargs): + super().__init__(init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + self.neck = MODELS.build(neck) if neck is not None else None + if bbox_head is not None: + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = MODELS.build(bbox_head) + if mask_head is not None: + mask_head.update(train_cfg=train_cfg) + mask_head.update(test_cfg=test_cfg) + self.mask_head = MODELS.build(mask_head) if mask_head is not None else None + self.data_preprocessor = MODELS.build(data_preprocessor) if data_preprocessor else None + + def extract_feat(self, inputs): + x = self.backbone(inputs) + if self.neck is not None: + x = self.neck(x) + return x + + def _forward(self, imgs, **kwargs): + return self.forward(imgs, **kwargs) + + def forward_train(self, imgs, gt_bboxes, gt_labels, gt_masks=None, **kwargs): + feats = self.extract_feat(imgs) + losses = dict() + losses.update(self.bbox_head.loss(feats[-1], gt_bboxes, gt_labels)) + if self.mask_head is not None and gt_masks is not None: + mask_pred = self.mask_head(feats) + losses.update(self.mask_head.loss(mask_pred, gt_masks)) + return losses + + def forward_test(self, imgs, **kwargs): + feats = self.extract_feat(imgs) + bbox_results = self.bbox_head(feats[-1]) + mask_results = None + if self.mask_head is not None: + mask_results = self.mask_head(feats) + return dict(bboxes=bbox_results, masks=mask_results) + + def forward(self, inputs, data_samples=None, mode='tensor'): + """Forward function with training and testing mode.""" + feats = self.extract_feat(inputs) + + if mode == 'tensor': + return self.bbox_head(feats) + elif mode == 'loss': + s = self.loss(feats, data_samples) + return s + elif mode == 'predict': + pred_instances = self.predict(inputs, data_samples) + + for pred, data_sample in zip(pred_instances, data_samples): + pred.gt_instances = data_sample.gt_instances + if hasattr(data_sample, 'gt_sem_seg'): + pred.gt_sem_seg = data_sample.gt_sem_seg + + return pred_instances + else: + raise ValueError(f"Invalid mode {mode}") + + def loss(self, feats, data_samples): + loss = dict() + # bbox head forward + cls_scores, bbox_preds, objectnesses = self.bbox_head(feats) + batch_gt_instances = [d.gt_instances for d in data_samples] + batch_img_metas = [d.metainfo for d in data_samples] + + loss.update( + self.bbox_head.loss_by_feat( + cls_scores, bbox_preds, objectnesses, batch_gt_instances, batch_img_metas + ) + ) + + # mask head + if self.mask_head is not None: + seg_pred = self.mask_head(feats) + target_size = data_samples[0].gt_sem_seg.sem_seg.shape[-2:] + if seg_pred.shape[-2:] != target_size: + seg_pred = F.interpolate(seg_pred, size=target_size, mode='bilinear', align_corners=False) + + gt_masks_tensor = [] + gt_masks = torch.stack([d.gt_sem_seg.sem_seg.squeeze(0) for d in data_samples], dim=0) # (B, H, W) + gt_masks = gt_masks.to(seg_pred.device) + + mask_loss_dict = self.mask_head.loss(seg_pred, gt_masks) + for k, v in mask_loss_dict.items(): + if torch.is_tensor(v): + loss[k] = v + else: + raise TypeError(f"mask loss '{k}' is not a tensor") + + return loss + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + + x = self.extract_feat(batch_inputs) + + if self.with_bbox: + bbox_results_list = self.bbox_head.predict(x, 
batch_data_samples, rescale=True) + else: + bbox_results_list = [InstanceData() for _ in batch_data_samples] + + seg_results_list = None + if self.with_mask: + seg_results_list = self.mask_head.predict(x, batch_data_samples, rescale=True) + + results = [] + for i, data_sample in enumerate(batch_data_samples): + data_sample.pred_instances = bbox_results_list[i] + + if seg_results_list is not None: + pixel_data = PixelData() + pixel_data.data = seg_results_list[i] + pixel_data.sem_seg = seg_results_list[i] + data_sample.pred_sem_seg = pixel_data + + img_h, img_w = data_sample.metainfo['img_shape'] + ori_h, ori_w = data_sample.metainfo['ori_shape'] + + if hasattr(data_sample, 'gt_instances'): + + scale_factor = data_sample.metainfo['scale_factor'] # (w_scale, h_scale) + + scale_factor_bbox = [scale_factor[0], scale_factor[1], scale_factor[0], scale_factor[1]] + scale_tensor = data_sample.gt_instances.bboxes.new_tensor(scale_factor_bbox) + + data_sample.gt_instances.bboxes = data_sample.gt_instances.bboxes / scale_tensor + + if hasattr(data_sample, 'gt_sem_seg') and data_sample.gt_sem_seg is not None: + gt_sem_seg_data = data_sample.gt_sem_seg.sem_seg # [H_pad, W_pad] + + gt_valid = gt_sem_seg_data[..., :img_h, :img_w] + + if gt_valid.shape[-2:] != (ori_h, ori_w): + gt_resized = F.interpolate( + gt_valid.unsqueeze(0).float(), # [1, 1, h, w] + size=(ori_h, ori_w), + mode='nearest' + ).squeeze(0).long() + + new_gt_pixel_data = PixelData() + new_gt_pixel_data.sem_seg = gt_resized + new_gt_pixel_data.data = gt_resized + data_sample.gt_sem_seg = new_gt_pixel_data + elif 'data' not in data_sample.gt_sem_seg: + data_sample.gt_sem_seg.data = data_sample.gt_sem_seg.sem_seg + + results.append(data_sample) + + return results diff --git a/projects/YOLOX_opt_elan/yolox/transforms.py b/projects/YOLOX_opt_elan/yolox/transforms.py new file mode 100644 index 000000000..939c57102 --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/transforms.py @@ -0,0 +1,37 @@ +import torch +import torch.nn.functional as F +from mmdet.registry import TRANSFORMS +from mmcv.transforms import BaseTransform + +import os.path as osp + +@TRANSFORMS.register_module() +class ResizeSegMask: + def __init__(self, size): + self.size = size # (H_out, W_out) + + def __call__(self, results): + if 'gt_seg_map' in results and results['gt_seg_map'] is not None: + seg = results['gt_seg_map'] # numpy array (H, W) + seg = torch.from_numpy(seg).unsqueeze(0).unsqueeze(0).float() # (1,1,H,W) + seg = F.interpolate(seg, size=self.size, mode='nearest') + results['gt_seg_map'] = seg.squeeze(0).squeeze(0).long().numpy() # back to (H_out,W_out) + return results + +@TRANSFORMS.register_module() +class FixCityscapesPath(BaseTransform): + def __init__(self, data_root, split='train'): + self.data_root = data_root + self.split = split + + def transform(self, results): + img_path = results['img_path'] + filename = osp.basename(img_path) + + seg_filename = filename.replace('_leftImg8bit.png', '_gtFine_labelTrainIds.png') + city = filename.split('_')[0] + seg_path = osp.join(self.data_root, 'gtFine', self.split, city, seg_filename) + + results['seg_map_path'] = seg_path + + return results From 645bf2af90363f8d499d4b31f8e5550a53097b97 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Dec 2025 05:39:32 +0000 Subject: [PATCH 2/2] ci(pre-commit): autofix --- ...opt-elan-semseg_960x960_300e_cityscapes.py | 107 +++++++----- ...-opt-elan-semseg_960x960_300e_t4dataset.py | 110 +++++------- 
.../yolox/models/heads/__init__.py | 2 +- .../yolox/models/heads/seg_head.py | 58 +++---- .../yolox/models/layers/network_blocks.py | 159 +++++++----------- .../yolox/models/yolox_multitask.py | 109 ++++++------ projects/YOLOX_opt_elan/yolox/transforms.py | 32 ++-- 7 files changed, 274 insertions(+), 303 deletions(-) diff --git a/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py b/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py index 05af852f9..05eac75af 100644 --- a/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py +++ b/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py @@ -11,7 +11,7 @@ "projects.YOLOX_opt_elan.yolox.models", "projects.YOLOX_opt_elan.yolox.models.yolox_multitask", "projects.YOLOX_opt_elan.yolox.transforms", - "mmseg.evaluation.metrics", # 引入分割评估指标 + "mmseg.evaluation.metrics", # 引入分割评估指标 ], allow_failed_imports=False, ) @@ -29,14 +29,51 @@ base_lr = 0.001 -classes = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle') -palette = [ - (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), - (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32) -] +classes = ("person", "rider", "car", "truck", "bus", "train", "motorcycle", "bicycle") +palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)] -seg_classes = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle') -seg_palette = [(128, 64, 128), (244, 35, 232), (70, 70, 70), (102, 102, 156), (190, 153, 153), (153, 153, 153), (250, 170, 30), (220, 220, 0), (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)] +seg_classes = ( + "road", + "sidewalk", + "building", + "wall", + "fence", + "pole", + "traffic light", + "traffic sign", + "vegetation", + "terrain", + "sky", + "person", + "rider", + "car", + "truck", + "bus", + "train", + "motorcycle", + "bicycle", +) +seg_palette = [ + (128, 64, 128), + (244, 35, 232), + (70, 70, 70), + (102, 102, 156), + (190, 153, 153), + (153, 153, 153), + (250, 170, 30), + (220, 220, 0), + (107, 142, 35), + (152, 251, 152), + (70, 130, 180), + (220, 20, 60), + (255, 0, 0), + (0, 0, 142), + (0, 0, 70), + (0, 60, 100), + (0, 80, 100), + (0, 0, 230), + (119, 11, 32), +] # metainfo = dict(classes=classes, palette=palette) metainfo = dict(classes=seg_classes, palette=seg_palette) @@ -58,7 +95,7 @@ backbone=dict( type="ELANDarknet", deepen_factor=2, - widen_factor=1, + widen_factor=1, out_indices=(2, 3, 4), act_cfg=dict(type=activation), ), @@ -75,10 +112,10 @@ in_channels=128, feat_channels=128, act_cfg=dict(type=activation), - loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), - loss_bbox=dict(type='IoULoss', loss_weight=0.0), - loss_obj=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), - loss_l1=dict(type='L1Loss', loss_weight=0.0), + loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=0.0), + loss_bbox=dict(type="IoULoss", loss_weight=0.0), + loss_obj=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=0.0), + 
loss_l1=dict(type="L1Loss", loss_weight=0.0), ), mask_head=dict( type="YOLOXSegHead", @@ -92,8 +129,8 @@ test_cfg=dict(score_thr=0.01, nms=dict(type="nms", iou_threshold=0.65)), ) -dataset_type = 'CocoDataset' -data_root = 'data/cityscapes/' +dataset_type = "CocoDataset" +data_root = "data/cityscapes/" backend_args = None train_pipeline = [ @@ -114,7 +151,7 @@ test_pipeline = [ dict(type="LoadImageFromFile", backend_args=backend_args), - dict(type="FixCityscapesPath", data_root=data_root, split='val'), + dict(type="FixCityscapesPath", data_root=data_root, split="val"), dict(type="LoadAnnotations", with_bbox=True, with_seg=True), dict(type="Resize", scale=IMG_SCALE, keep_ratio=False), dict(type="Pad", pad_to_square=False, size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0), seg=255)), @@ -129,14 +166,11 @@ dataset=dict( type=dataset_type, data_root=data_root, - data_prefix=dict( - img='leftImg8bit/train', - seg_map_path='gtFine/train' - ), - ann_file='annotations/instancesonly_filtered_gtFine_train.json', + data_prefix=dict(img="leftImg8bit/train", seg_map_path="gtFine/train"), + ann_file="annotations/instancesonly_filtered_gtFine_train.json", pipeline=[ dict(type="LoadImageFromFile", backend_args=backend_args), - dict(type="FixCityscapesPath", data_root=data_root, split='train'), + dict(type="FixCityscapesPath", data_root=data_root, split="train"), dict(type="LoadAnnotations", with_bbox=True, with_seg=True), ], filter_cfg=dict(filter_empty_gt=False, min_size=8), @@ -163,11 +197,8 @@ dataset=dict( type=dataset_type, data_root=data_root, - data_prefix=dict( - img='leftImg8bit/val', - seg_map_path='gtFine/val' - ), - ann_file='annotations/instancesonly_filtered_gtFine_val.json', + data_prefix=dict(img="leftImg8bit/val", seg_map_path="gtFine/val"), + ann_file="annotations/instancesonly_filtered_gtFine_val.json", test_mode=True, pipeline=test_pipeline, backend_args=backend_args, @@ -177,7 +208,7 @@ test_dataloader = val_dataloader val_evaluator = [ - dict(type='mmseg.IoUMetric', ignore_index=255, iou_metrics=['mIoU'], prefix="seg", classes=seg_classes) + dict(type="mmseg.IoUMetric", ignore_index=255, iou_metrics=["mIoU"], prefix="seg", classes=seg_classes) ] test_evaluator = val_evaluator @@ -224,19 +255,9 @@ ) default_hooks = dict( - checkpoint=dict( - interval=interval, - max_keep_ckpts=3, - save_best='seg/mIoU', - rule='greater' - ), + checkpoint=dict(interval=interval, max_keep_ckpts=3, save_best="seg/mIoU", rule="greater"), visualization=dict( - type='DetVisualizationHook', - draw=False, - interval=50, - show=False, - wait_time=2, - test_out_dir='vis_data' + type="DetVisualizationHook", draw=False, interval=50, show=False, wait_time=2, test_out_dir="vis_data" ), ) @@ -260,8 +281,8 @@ ] visualizer = dict( - type='DetLocalVisualizer', - vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], - name='visualizer', + type="DetLocalVisualizer", + vis_backends=[dict(type="LocalVisBackend"), dict(type="TensorboardVisBackend")], + name="visualizer", alpha=0.5, ) diff --git a/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py b/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py index d8f5bee0a..dc78b0080 100644 --- a/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py +++ 
b/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py @@ -65,10 +65,10 @@ in_channels=128, feat_channels=128, act_cfg=dict(type=activation), - loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), - loss_bbox=dict(type='IoULoss', loss_weight=0.0), - loss_obj=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), - loss_l1=dict(type='L1Loss', loss_weight=0.0), + loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=0.0), + loss_bbox=dict(type="IoULoss", loss_weight=0.0), + loss_obj=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=0.0), + loss_l1=dict(type="L1Loss", loss_weight=0.0), ), mask_head=dict( type="YOLOXSegHead", @@ -155,45 +155,45 @@ ) palette = [ - (150, 120, 90), # 0: animal - (119, 11, 32), # 1: bicycle - (70, 70, 70), # 2: building - (0, 60, 100), # 3: bus - (0, 0, 142), # 4: car - (250, 170, 30), # 5: cone - (230, 150, 140), # 6: construction - (140, 140, 200), # 7: crosswalk + (150, 120, 90), # 0: animal + (119, 11, 32), # 1: bicycle + (70, 70, 70), # 2: building + (0, 60, 100), # 3: bus + (0, 0, 142), # 4: car + (250, 170, 30), # 5: cone + (230, 150, 140), # 6: construction + (140, 140, 200), # 7: crosswalk (255, 255, 255), # 8: dashed_lane_marking - (200, 200, 200), # 9: deceleration_line - (190, 153, 153), # 10: gate - (250, 170, 30), # 11: guide_post + (200, 200, 200), # 9: deceleration_line + (190, 153, 153), # 10: gate + (250, 170, 30), # 11: guide_post (255, 255, 255), # 12: laneline_dash_white - (255, 255, 0), # 13: laneline_dash_yellow - (0, 255, 0), # 14: laneline_solid_green - (255, 0, 0), # 15: laneline_solid_red + (255, 255, 0), # 13: laneline_dash_yellow + (0, 255, 0), # 14: laneline_solid_green + (255, 0, 0), # 15: laneline_solid_red (255, 255, 255), # 16: laneline_solid_white - (255, 215, 0), # 17: laneline_solid_yellow - (0, 255, 255), # 18: marking_arrow - (200, 0, 200), # 19: marking_character - (150, 0, 150), # 20: marking_other - (0, 0, 230), # 21: motorcycle - (80, 80, 80), # 22: other_obstacle - (250, 170, 160), # 23: other_pedestrian - (100, 80, 200), # 24: other_vehicle + (255, 215, 0), # 17: laneline_solid_yellow + (0, 255, 255), # 18: marking_arrow + (200, 0, 200), # 19: marking_character + (150, 0, 150), # 20: marking_other + (0, 0, 230), # 21: motorcycle + (80, 80, 80), # 22: other_obstacle + (250, 170, 160), # 23: other_pedestrian + (100, 80, 200), # 24: other_vehicle (180, 165, 180), # 25: parking_lot - (220, 20, 60), # 26: pedestrian - (153, 153, 153), # 27: pole - (128, 64, 128), # 28: road + (220, 20, 60), # 26: pedestrian + (153, 153, 153), # 27: pole + (128, 64, 128), # 28: road (110, 110, 110), # 29: road_debris - (244, 35, 232), # 30: sidewalk - (70, 130, 180), # 31: sky - (220, 220, 220), # 32: stopline + (244, 35, 232), # 30: sidewalk + (70, 130, 180), # 31: sky + (220, 220, 220), # 32: stopline (160, 150, 180), # 33: striped_road_marking - (250, 170, 30), # 34: traffic_light - (220, 220, 0), # 35: traffic_sign - (0, 80, 100), # 36: train - (0, 0, 70), # 37: truck - (107, 142, 35), # 38: vegetation/terrain + (250, 170, 30), # 34: traffic_light + (220, 220, 0), # 35: traffic_sign + (0, 80, 100), # 36: train + (0, 0, 70), # 37: truck + (107, 142, 35), # 38: vegetation/terrain (102, 102, 156), # 39: wall/fence ] metainfo = dict(classes=classes, palette=palette) @@ -254,7 +254,7 @@ pipeline=test_pipeline, backend_args=backend_args, metainfo=metainfo, - indices=2000, + indices=2000, ), ) @@ -262,18 +262,13 @@ 
val_evaluator = [ dict(type="VOCMetric", metric="mAP", prefix="det"), - dict(type='mmseg.IoUMetric', ignore_index=255, iou_metrics=['mIoU'], prefix="seg") + dict(type="mmseg.IoUMetric", ignore_index=255, iou_metrics=["mIoU"], prefix="seg"), ] test_evaluator = val_evaluator # train_cfg = dict(max_epochs=max_epochs, val_interval=interval) -train_cfg = dict( - _delete_=True, - type='IterBasedTrainLoop', - max_iters=200000, - val_interval=1000 -) +train_cfg = dict(_delete_=True, type="IterBasedTrainLoop", max_iters=200000, val_interval=1000) # optimizer optimizer = dict( @@ -323,24 +318,11 @@ # ) default_hooks = dict( checkpoint=dict( - type='CheckpointHook', - interval=1000, - by_epoch=False, - max_keep_ckpts=5, - save_best='seg/mIoU', - rule='greater' - ), - logger=dict( - type='LoggerHook', - interval=50 + type="CheckpointHook", interval=1000, by_epoch=False, max_keep_ckpts=5, save_best="seg/mIoU", rule="greater" ), + logger=dict(type="LoggerHook", interval=50), visualization=dict( - type='DetVisualizationHook', - draw=False, - interval=100, - show=False, - wait_time=2, - test_out_dir='vis_data' + type="DetVisualizationHook", draw=False, interval=100, show=False, wait_time=2, test_out_dir="vis_data" ), ) @@ -365,8 +347,8 @@ ] visualizer = dict( - type='DetLocalVisualizer', - vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], - name='visualizer', + type="DetLocalVisualizer", + vis_backends=[dict(type="LocalVisBackend"), dict(type="TensorboardVisBackend")], + name="visualizer", alpha=0.3, ) diff --git a/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py b/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py index db9c396aa..598422de0 100644 --- a/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py +++ b/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py @@ -1,3 +1,3 @@ from .seg_head import YOLOXSegHead -__all__ = ("YOLOXSegHead",) \ No newline at end of file +__all__ = ("YOLOXSegHead",) diff --git a/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py b/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py index ea5652d4f..866fc727d 100644 --- a/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py +++ b/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py @@ -1,14 +1,14 @@ +from typing import Dict, List, Tuple, Union + import torch import torch.nn as nn import torch.nn.functional as F -from typing import Dict, List, Tuple, Union - +from mmdet.models.seg_heads.base_semantic_head import BaseSemanticHead +from mmengine.registry import MODELS from torch import Tensor -from mmdet.models.seg_heads.base_semantic_head import BaseSemanticHead from ..layers.network_blocks import BaseConv, CSPLayer, DWConv -from mmengine.registry import MODELS def get_activation(name="ReLU6"): if name.lower() == "relu6": @@ -22,10 +22,21 @@ def get_activation(name="ReLU6"): else: raise AttributeError(f"Unsupported act type {name}") + @MODELS.register_module() class YOLOXSegHead(nn.Module): - def __init__(self, in_channels, num_classes, feat_channels=None, act_cfg=dict(type="ReLU6"), width=1.0, depthwise=False, train_cfg=None, - test_cfg=None, **kwargs): + def __init__( + self, + in_channels, + num_classes, + feat_channels=None, + act_cfg=dict(type="ReLU6"), + width=1.0, + depthwise=False, + train_cfg=None, + test_cfg=None, + **kwargs, + ): super().__init__() self.num_classes = num_classes self.width = width @@ -36,7 +47,7 @@ def __init__(self, in_channels, num_classes, feat_channels=None, act_cfg=dict(ty self.act_fn = get_activation(act_type) self.train_cfg = train_cfg 
- + Conv = DWConv if depthwise else BaseConv # mask head layers @@ -58,7 +69,7 @@ def forward(self, feats): """ if isinstance(feats, (list, tuple)): target_size = feats[0].shape[2:] - up_feats = [F.interpolate(f, size=target_size, mode='bilinear', align_corners=False) for f in feats] + up_feats = [F.interpolate(f, size=target_size, mode="bilinear", align_corners=False) for f in feats] x = torch.cat(up_feats, dim=1) # [B, sum(C_i), H, W] else: x = feats @@ -83,40 +94,29 @@ def loss(self, seg_pred, gt_masks): """ return dict(loss_mask=F.cross_entropy(seg_pred, gt_masks.long(), ignore_index=255)) - def predict(self, - x: Union[Tensor, Tuple[Tensor]], - batch_data_samples, - rescale: bool = False) -> List[Tensor]: - - batch_img_metas = [ - data_sample.metainfo for data_sample in batch_data_samples - ] + def predict(self, x: Union[Tensor, Tuple[Tensor]], batch_data_samples, rescale: bool = False) -> List[Tensor]: + + batch_img_metas = [data_sample.metainfo for data_sample in batch_data_samples] seg_preds = self.forward(x) - input_shape = batch_img_metas[0]['batch_input_shape'] - seg_preds = F.interpolate( - seg_preds, - size=input_shape, - mode='bilinear', - align_corners=False) + input_shape = batch_img_metas[0]["batch_input_shape"] + seg_preds = F.interpolate(seg_preds, size=input_shape, mode="bilinear", align_corners=False) result_list = [] for i in range(len(batch_img_metas)): img_meta = batch_img_metas[i] - h, w = img_meta['img_shape'] + h, w = img_meta["img_shape"] seg_pred = seg_preds[i][:, :h, :w] if rescale: - ori_h, ori_w = img_meta['ori_shape'] + ori_h, ori_w = img_meta["ori_shape"] seg_pred = F.interpolate( - seg_pred.unsqueeze(0), - size=(ori_h, ori_w), - mode='bilinear', - align_corners=False).squeeze(0) + seg_pred.unsqueeze(0), size=(ori_h, ori_w), mode="bilinear", align_corners=False + ).squeeze(0) seg_pred = seg_pred.argmax(dim=0).to(torch.int64) - + result_list.append(seg_pred) return result_list diff --git a/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py b/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py index 1aab7efcb..6edebdaae 100644 --- a/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py +++ b/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py @@ -6,6 +6,7 @@ import torch.nn as nn from torch.autograd import Function + class SiLU(nn.Module): """export-friendly version of nn.SiLU()""" @@ -13,22 +14,24 @@ class SiLU(nn.Module): def forward(x): return x * torch.sigmoid(x) + class PactFix(nn.Module): """export-friendly version of nn.SiLU()""" @staticmethod def forward(x, alpha=4.0): - y = torch.clamp(x, min = 0, max = alpha) + y = torch.clamp(x, min=0, max=alpha) return y + class Pact(Function): - @staticmethod + @staticmethod def forward(ctx, x, alpha, k): ctx.save_for_backward(x, alpha) # y_1 = 0.5 * ( torch.abs(x).detach() - torch.abs(x - alpha).detach() + alpha.item() ) - y = torch.clamp(x, min = 0, max = alpha.item()) + y = torch.clamp(x, min=0, max=alpha.item()) scale = (2**k - 1) / alpha - y_q = torch.round( y * scale) / scale + y_q = torch.round(y * scale) / scale return y_q @staticmethod @@ -36,20 +39,24 @@ def backward(ctx, dLdy_q): # Backward function, I borrowed code from # https://github.com/obilaniu/GradOverride/blob/master/functional.py # We get dL / dy_q as a gradient - x, alpha, = ctx.saved_tensors + ( + x, + alpha, + ) = ctx.saved_tensors # Weight gradient is only valid when [0, alpha] # Actual gradient for alpha, # By applying Chain Rule, we get dL / dy_q * dy_q / dy * dy / dalpha # dL / dy_q = argument, 
dy_q / dy * dy / dalpha = 0, 1 with x value range - lower_bound = x < 0 - upper_bound = x > alpha + lower_bound = x < 0 + upper_bound = x > alpha # x_range = 1.0-lower_bound-upper_bound - x_range = ~(lower_bound|upper_bound) + x_range = ~(lower_bound | upper_bound) grad_alpha = torch.sum(dLdy_q * torch.ge(x, alpha).float()).view(-1) return dLdy_q * x_range.float(), grad_alpha, None - + + def get_activation(name="silu", inplace=True): - #name = 'relu' + # name = 'relu' name = name.lower() if name == "silu": module = nn.SiLU(inplace=inplace) @@ -57,9 +64,9 @@ def get_activation(name="silu", inplace=True): module = nn.ReLU(inplace=inplace) elif name == "lrelu": module = nn.LeakyReLU(0.1, inplace=inplace) - elif name == "pact": + elif name == "pact": module = Pact.apply - elif name == "pactfix": + elif name == "pactfix": module = PactFix() elif name == "relu6": module = nn.ReLU6() @@ -67,14 +74,15 @@ def get_activation(name="silu", inplace=True): raise AttributeError("Unsupported act type: {}".format(name)) return module + K = 2 -#fisrt = True + + +# fisrt = True class BaseConv(nn.Module): """A Conv2d -> Batchnorm -> silu/leaky relu block""" - def __init__( - self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" - ): + def __init__(self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"): super().__init__() # same padding pad = (ksize - 1) // 2 @@ -90,15 +98,14 @@ def __init__( self.bn = nn.BatchNorm2d(out_channels) self.act = get_activation(act, inplace=True) self.act_name = act - if (self.act_name == "pact") : - self.alpha = nn.Parameter(torch.tensor(20.)) + if self.act_name == "pact": + self.alpha = nn.Parameter(torch.tensor(20.0)) - def forward(self, x): - if (self.act_name == "pact") : + if self.act_name == "pact": return self.act(self.bn(self.conv(x)), self.alpha, 2) else: - #print(self.conv) + # print(self.conv) return self.act(self.bn(self.conv(x))) def fuseforward(self, x): @@ -118,9 +125,7 @@ def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): groups=in_channels, act=act, ) - self.pconv = BaseConv( - in_channels, out_channels, ksize=1, stride=1, groups=1, act=act - ) + self.pconv = BaseConv(in_channels, out_channels, ksize=1, stride=1, groups=1, act=act) def forward(self, x): x = self.dconv(x) @@ -129,16 +134,7 @@ def forward(self, x): class Bottleneck(nn.Module): # Standard bottleneck - def __init__( - self, - in_channels, - out_channels, - shortcut=True, - expansion=0.5, - depthwise=False, - act="silu", - kernel=3 - ): + def __init__(self, in_channels, out_channels, shortcut=True, expansion=0.5, depthwise=False, act="silu", kernel=3): super().__init__() hidden_channels = int(out_channels * expansion) Conv = DWConv if depthwise else BaseConv @@ -179,7 +175,6 @@ def forward(self, x): return y - class Bottleneck_EFF(nn.Module): # Standard bottleneck def __init__( @@ -190,7 +185,7 @@ def __init__( expansion=0.5, depthwise=False, act="silu", - kernel=3, + kernel=3, ): super().__init__() hidden_channels = int(out_channels * expansion) @@ -203,7 +198,7 @@ def forward(self, x): y = self.conv2(self.conv1(x)) if self.use_add: y = y + x - return y + return y class ResLayer(nn.Module): @@ -212,12 +207,8 @@ class ResLayer(nn.Module): def __init__(self, in_channels: int): super().__init__() mid_channels = in_channels // 2 - self.layer1 = BaseConv( - in_channels, mid_channels, ksize=1, stride=1, act="lrelu" - ) - self.layer2 = BaseConv( - mid_channels, in_channels, ksize=3, stride=1, act="lrelu" - ) + self.layer1 = 
BaseConv(in_channels, mid_channels, ksize=1, stride=1, act="lrelu") + self.layer2 = BaseConv(mid_channels, in_channels, ksize=3, stride=1, act="lrelu") def forward(self, x): out = self.layer2(self.layer1(x)) @@ -227,18 +218,11 @@ def forward(self, x): class SPPBottleneck(nn.Module): """Spatial pyramid pooling layer used in YOLOv3-SPP""" - def __init__( - self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" - ): + def __init__(self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"): super().__init__() hidden_channels = in_channels // 2 self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) - self.m = nn.ModuleList( - [ - nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) - for ks in kernel_sizes - ] - ) + self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes]) conv2_channels = hidden_channels * (len(kernel_sizes) + 1) self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) @@ -277,20 +261,16 @@ def __init__( self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) self.elan = elan - if (self.elan == True) : - self.conv3 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act) + if self.elan == True: + self.conv3 = BaseConv((n + 1) * hidden_channels, out_channels, 1, stride=1, act=act) module_list = [ - BottleneckV8( - hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel - ) + BottleneckV8(hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel) for _ in range(n) - ] + ] else: self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act) module_list = [ - Bottleneck( - hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel - ) + Bottleneck(hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel) for _ in range(n) ] self.m = nn.Sequential(*module_list) @@ -299,17 +279,18 @@ def forward(self, x): x_1 = self.conv1(x) x_2 = self.conv2(x) el = [] - if (self.elan == True) : + if self.elan == True: x = x_1 for m in self.m: x = m(x) el.append(x) x = torch.cat([x_2] + [m for m in el], dim=1) - else : + else: x_1 = self.m(x_1) x = torch.cat((x_1, x_2), dim=1) return self.conv3(x) + class CSPLayer_EFF(nn.Module): """C3 in yolov5, CSP Bottleneck with 3 convolutions""" @@ -319,12 +300,12 @@ def __init__( out_channels, n=1, shortcut=True, - #expansion=0.5, + # expansion=0.5, expansion=1.0, depthwise=False, act="silu", kernel=3, - elan=False, + elan=False, ): """ Args: @@ -339,21 +320,16 @@ def __init__( self.conv2 = BaseConv(in_channels, hidden_channels, 3, stride=1, act=act) self.elan = elan - if (self.elan == True) : - self.conv3 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act) + if self.elan == True: + self.conv3 = BaseConv((n + 1) * hidden_channels, out_channels, 1, stride=1, act=act) module_list = [ - BottleneckV8( - hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act - ) - for _ in range(n) - ] - else : + BottleneckV8(hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act) for _ in range(n) + ] + else: self.conv3 = BaseConv(2 * hidden_channels, out_channels, 3, stride=1, act=act) module_list = [ - Bottleneck_EFF( - hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel - ) + Bottleneck_EFF(hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel) for _ in range(n) ] self.m = 
nn.Sequential(*module_list) @@ -362,13 +338,13 @@ def forward(self, x): x_1 = self.conv1(x) x_2 = self.conv2(x) el = [] - if (self.elan == True) : + if self.elan == True: x = x_1 for m in self.m: x = m(x) el.append(x) x = torch.cat([x_2] + [m for m in el], dim=1) - else : + else: x_1 = self.m(x_1) x = torch.cat((x_1, x_2), dim=1) return self.conv3(x) @@ -383,7 +359,7 @@ def __init__( out_channels, n=3, shortcut=True, - #expansion=0.5, + # expansion=0.5, expansion=1.0, depthwise=False, act="silu", @@ -398,15 +374,11 @@ def __init__( # ch_in, ch_out, number, shortcut, groups, expansion super().__init__() hidden_channels = int(out_channels * expansion) # hidden channels - self.conv1 = BaseConv(hidden_channels, out_channels, 3, stride=1, act=act) + self.conv1 = BaseConv(hidden_channels, out_channels, 3, stride=1, act=act) self.conv_c = BaseConv(in_channels, hidden_channels, kernel, stride=1, act=act) - self.conv2 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act) - + self.conv2 = BaseConv((n + 1) * hidden_channels, out_channels, 1, stride=1, act=act) - module_list = [ - self.conv_c - for _ in range(n) - ] + module_list = [self.conv_c for _ in range(n)] self.m = nn.Sequential(*module_list) def forward(self, x): @@ -419,7 +391,6 @@ def forward(self, x): return self.conv2(x) - class Focus(nn.Module): """Focus width and height information into channel space.""" @@ -445,22 +416,20 @@ def forward(self, x): return self.conv(x) - class SimpleStem(nn.Module): """Simple Stem for Acceleration on Embedded Devices""" def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): super().__init__() - #self.conv1 = BaseConv(in_channels, out_channels, ksize, stride, act=act) - #self.down1 = BaseConv(out_channels, out_channels, ksize, 2, act=act) - self.down1 = BaseConv(in_channels, out_channels, ksize, 2, act=act) + # self.conv1 = BaseConv(in_channels, out_channels, ksize, stride, act=act) + # self.down1 = BaseConv(out_channels, out_channels, ksize, 2, act=act) + self.down1 = BaseConv(in_channels, out_channels, ksize, 2, act=act) self.conv2 = BaseConv(out_channels, out_channels, ksize, stride, act=act) - #self.down2 = BaseConv(out_channels, out_channels, ksize, 2, act=act) + # self.down2 = BaseConv(out_channels, out_channels, ksize, 2, act=act) def forward(self, x): - #x = self.conv1(x) + # x = self.conv1(x) x = self.down1(x) x = self.conv2(x) - #x = self.down2(x) + # x = self.down2(x) return x - diff --git a/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py b/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py index 5e7ac8746..ac52e9cf1 100644 --- a/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py +++ b/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py @@ -1,20 +1,17 @@ import torch import torch.nn as nn import torch.nn.functional as F - -from torch import Tensor - -from mmengine.model import BaseModule -from mmdet.registry import MODELS from mmdet.models import BaseDetector +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample, SampleList from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig -from mmdet.structures import DetDataSample -from mmengine.structures import PixelData, InstanceData -from mmdet.structures import SampleList +from mmengine.logging import print_log +from mmengine.model import BaseModule +from mmengine.structures import InstanceData, PixelData +from torch import Tensor from .heads import YOLOXSegHead -from mmengine.logging import print_log @MODELS.register_module() class 
YOLOXMultiTask(BaseDetector): @@ -23,16 +20,18 @@ class YOLOXMultiTask(BaseDetector): Supports bbox + mask heads. """ - def __init__(self, - backbone, - neck, - bbox_head, - mask_head=None, - train_cfg: OptConfigType = None, - test_cfg: OptConfigType = None, - data_preprocessor=None, - init_cfg=None, - **kwargs): + def __init__( + self, + backbone, + neck, + bbox_head, + mask_head=None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor=None, + init_cfg=None, + **kwargs, + ): super().__init__(init_cfg=init_cfg) self.backbone = MODELS.build(backbone) self.neck = MODELS.build(neck) if neck is not None else None @@ -54,7 +53,7 @@ def extract_feat(self, inputs): def _forward(self, imgs, **kwargs): return self.forward(imgs, **kwargs) - + def forward_train(self, imgs, gt_bboxes, gt_labels, gt_masks=None, **kwargs): feats = self.extract_feat(imgs) losses = dict() @@ -72,21 +71,21 @@ def forward_test(self, imgs, **kwargs): mask_results = self.mask_head(feats) return dict(bboxes=bbox_results, masks=mask_results) - def forward(self, inputs, data_samples=None, mode='tensor'): + def forward(self, inputs, data_samples=None, mode="tensor"): """Forward function with training and testing mode.""" feats = self.extract_feat(inputs) - if mode == 'tensor': + if mode == "tensor": return self.bbox_head(feats) - elif mode == 'loss': + elif mode == "loss": s = self.loss(feats, data_samples) - return s - elif mode == 'predict': + return s + elif mode == "predict": pred_instances = self.predict(inputs, data_samples) - + for pred, data_sample in zip(pred_instances, data_samples): pred.gt_instances = data_sample.gt_instances - if hasattr(data_sample, 'gt_sem_seg'): + if hasattr(data_sample, "gt_sem_seg"): pred.gt_sem_seg = data_sample.gt_sem_seg return pred_instances @@ -101,9 +100,7 @@ def loss(self, feats, data_samples): batch_img_metas = [d.metainfo for d in data_samples] loss.update( - self.bbox_head.loss_by_feat( - cls_scores, bbox_preds, objectnesses, batch_gt_instances, batch_img_metas - ) + self.bbox_head.loss_by_feat(cls_scores, bbox_preds, objectnesses, batch_gt_instances, batch_img_metas) ) # mask head @@ -111,7 +108,7 @@ def loss(self, feats, data_samples): seg_pred = self.mask_head(feats) target_size = data_samples[0].gt_sem_seg.sem_seg.shape[-2:] if seg_pred.shape[-2:] != target_size: - seg_pred = F.interpolate(seg_pred, size=target_size, mode='bilinear', align_corners=False) + seg_pred = F.interpolate(seg_pred, size=target_size, mode="bilinear", align_corners=False) gt_masks_tensor = [] gt_masks = torch.stack([d.gt_sem_seg.sem_seg.squeeze(0) for d in data_samples], dim=0) # (B, H, W) @@ -126,14 +123,12 @@ def loss(self, feats, data_samples): return loss - def predict(self, - batch_inputs: Tensor, - batch_data_samples: SampleList, - rescale: bool = True, - **kwargs) -> SampleList: - + def predict( + self, batch_inputs: Tensor, batch_data_samples: SampleList, rescale: bool = True, **kwargs + ) -> SampleList: + x = self.extract_feat(batch_inputs) - + if self.with_bbox: bbox_results_list = self.bbox_head.predict(x, batch_data_samples, rescale=True) else: @@ -146,42 +141,44 @@ def predict(self, results = [] for i, data_sample in enumerate(batch_data_samples): data_sample.pred_instances = bbox_results_list[i] - + if seg_results_list is not None: pixel_data = PixelData() pixel_data.data = seg_results_list[i] pixel_data.sem_seg = seg_results_list[i] data_sample.pred_sem_seg = pixel_data - - img_h, img_w = data_sample.metainfo['img_shape'] - ori_h, ori_w = 
data_sample.metainfo['ori_shape'] - - if hasattr(data_sample, 'gt_instances'): - scale_factor = data_sample.metainfo['scale_factor'] # (w_scale, h_scale) + img_h, img_w = data_sample.metainfo["img_shape"] + ori_h, ori_w = data_sample.metainfo["ori_shape"] + + if hasattr(data_sample, "gt_instances"): + + scale_factor = data_sample.metainfo["scale_factor"] # (w_scale, h_scale) scale_factor_bbox = [scale_factor[0], scale_factor[1], scale_factor[0], scale_factor[1]] scale_tensor = data_sample.gt_instances.bboxes.new_tensor(scale_factor_bbox) data_sample.gt_instances.bboxes = data_sample.gt_instances.bboxes / scale_tensor - if hasattr(data_sample, 'gt_sem_seg') and data_sample.gt_sem_seg is not None: - gt_sem_seg_data = data_sample.gt_sem_seg.sem_seg # [H_pad, W_pad] - - gt_valid = gt_sem_seg_data[..., :img_h, :img_w] + if hasattr(data_sample, "gt_sem_seg") and data_sample.gt_sem_seg is not None: + gt_sem_seg_data = data_sample.gt_sem_seg.sem_seg # [H_pad, W_pad] + + gt_valid = gt_sem_seg_data[..., :img_h, :img_w] if gt_valid.shape[-2:] != (ori_h, ori_w): - gt_resized = F.interpolate( - gt_valid.unsqueeze(0).float(), # [1, 1, h, w] - size=(ori_h, ori_w), - mode='nearest' - ).squeeze(0).long() - + gt_resized = ( + F.interpolate( + gt_valid.unsqueeze(0).float(), size=(ori_h, ori_w), mode="nearest" # [1, 1, h, w] + ) + .squeeze(0) + .long() + ) + new_gt_pixel_data = PixelData() new_gt_pixel_data.sem_seg = gt_resized new_gt_pixel_data.data = gt_resized data_sample.gt_sem_seg = new_gt_pixel_data - elif 'data' not in data_sample.gt_sem_seg: + elif "data" not in data_sample.gt_sem_seg: data_sample.gt_sem_seg.data = data_sample.gt_sem_seg.sem_seg results.append(data_sample) diff --git a/projects/YOLOX_opt_elan/yolox/transforms.py b/projects/YOLOX_opt_elan/yolox/transforms.py index 939c57102..3aad09cff 100644 --- a/projects/YOLOX_opt_elan/yolox/transforms.py +++ b/projects/YOLOX_opt_elan/yolox/transforms.py @@ -1,9 +1,10 @@ +import os.path as osp + import torch import torch.nn.functional as F -from mmdet.registry import TRANSFORMS from mmcv.transforms import BaseTransform +from mmdet.registry import TRANSFORMS -import os.path as osp @TRANSFORMS.register_module() class ResizeSegMask: @@ -11,27 +12,28 @@ def __init__(self, size): self.size = size # (H_out, W_out) def __call__(self, results): - if 'gt_seg_map' in results and results['gt_seg_map'] is not None: - seg = results['gt_seg_map'] # numpy array (H, W) + if "gt_seg_map" in results and results["gt_seg_map"] is not None: + seg = results["gt_seg_map"] # numpy array (H, W) seg = torch.from_numpy(seg).unsqueeze(0).unsqueeze(0).float() # (1,1,H,W) - seg = F.interpolate(seg, size=self.size, mode='nearest') - results['gt_seg_map'] = seg.squeeze(0).squeeze(0).long().numpy() # back to (H_out,W_out) + seg = F.interpolate(seg, size=self.size, mode="nearest") + results["gt_seg_map"] = seg.squeeze(0).squeeze(0).long().numpy() # back to (H_out,W_out) return results + @TRANSFORMS.register_module() class FixCityscapesPath(BaseTransform): - def __init__(self, data_root, split='train'): + def __init__(self, data_root, split="train"): self.data_root = data_root self.split = split def transform(self, results): - img_path = results['img_path'] + img_path = results["img_path"] filename = osp.basename(img_path) - - seg_filename = filename.replace('_leftImg8bit.png', '_gtFine_labelTrainIds.png') - city = filename.split('_')[0] - seg_path = osp.join(self.data_root, 'gtFine', self.split, city, seg_filename) - - results['seg_map_path'] = seg_path - + + seg_filename = 
filename.replace("_leftImg8bit.png", "_gtFine_labelTrainIds.png") + city = filename.split("_")[0] + seg_path = osp.join(self.data_root, "gtFine", self.split, city, seg_filename) + + results["seg_map_path"] = seg_path + return results