From 08a2a99a7ed444be57759ac03681637574b65bec Mon Sep 17 00:00:00 2001
From: tzhong518
Date: Tue, 23 Dec 2025 14:35:59 +0900
Subject: [PATCH 1/2] add: yolox_multitask model

Signed-off-by: tzhong518
---
 ...opt-elan-semseg_960x960_300e_cityscapes.py | 267 ++++++++++
 ...-opt-elan-semseg_960x960_300e_t4dataset.py | 372 ++++++++++++++
 .../yolox/models/heads/__init__.py            |   3 +
 .../yolox/models/heads/seg_head.py            | 122 +++++
 .../yolox/models/layers/network_blocks.py     | 466 ++++++++++++++++++
 .../yolox/models/yolox_multitask.py           | 189 +++++++
 projects/YOLOX_opt_elan/yolox/transforms.py   |  37 ++
 7 files changed, 1456 insertions(+)
 create mode 100644 projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py
 create mode 100644 projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/models/heads/__init__.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py
 create mode 100644 projects/YOLOX_opt_elan/yolox/transforms.py

diff --git a/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py b/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py
new file mode 100644
index 000000000..05af852f9
--- /dev/null
+++ b/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py
@@ -0,0 +1,267 @@
+_base_ = [
+    "../../../../../autoware_ml/configs/detection2d/default_runtime.py",
+    "../../../../../autoware_ml/configs/detection2d/schedules/schedule_1x.py",
+]
+
+custom_imports = dict(
+    imports=[
+        "projects.YOLOX_opt_elan.yolox",
+        "autoware_ml.detection2d.metrics",
+        "autoware_ml.detection2d.datasets",
+        "projects.YOLOX_opt_elan.yolox.models",
+        "projects.YOLOX_opt_elan.yolox.models.yolox_multitask",
+        "projects.YOLOX_opt_elan.yolox.transforms",
+        "mmseg.evaluation.metrics", # import the segmentation evaluation metrics
+    ],
+    allow_failed_imports=False,
+)
+
+# parameter settings
+# IMG_SCALE = (960, 960)
+IMG_SCALE = (1024, 512)
+max_epochs = 300
+num_last_epochs = 15
+resume_from = None
+interval = 1
+batch_size = 16
+activation = "ReLU6"
+num_workers = 4
+base_lr = 0.001
+
+
+classes = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle')
+palette = [
+    (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70),
+    (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)
+]
+
+seg_classes = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle')
+seg_palette = [(128, 64, 128), (244, 35, 232), (70, 70, 70), (102, 102, 156), (190, 153, 153), (153, 153, 153), (250, 170, 30), (220, 220, 0), (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)]
+
+# metainfo = dict(classes=classes, palette=palette)
+metainfo = dict(classes=seg_classes, palette=seg_palette)
+
+model = dict(
+    type="YOLOXMultiTask",
+    data_preprocessor=dict(
+        type="DetDataPreprocessor",
+        pad_size_divisor=32,
+        # batch_augments=[
+        #     dict(
+        #         type="BatchSyncRandomResize",
+        # 
random_size_range=(480, 800), + # size_divisor=32, + # interval=10, + # ) + # ], + ), + backbone=dict( + type="ELANDarknet", + deepen_factor=2, + widen_factor=1, + out_indices=(2, 3, 4), + act_cfg=dict(type=activation), + ), + neck=dict( + type="YOLOXPAFPN_ELAN", + in_channels=[128, 256, 512], + out_channels=128, + num_elan_blocks=2, + act_cfg=dict(type=activation), + ), + bbox_head=dict( + type="YOLOXHead", + num_classes=8, + in_channels=128, + feat_channels=128, + act_cfg=dict(type=activation), + loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), + loss_bbox=dict(type='IoULoss', loss_weight=0.0), + loss_obj=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), + loss_l1=dict(type='L1Loss', loss_weight=0.0), + ), + mask_head=dict( + type="YOLOXSegHead", + in_channels=[128, 128, 128], + feat_channels=128, + num_classes=19, + act_cfg=dict(type=activation), + loss=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0), + ), + train_cfg=dict(assigner=dict(type="SimOTAAssigner", center_radius=2.5)), + test_cfg=dict(score_thr=0.01, nms=dict(type="nms", iou_threshold=0.65)), +) + +dataset_type = 'CocoDataset' +data_root = 'data/cityscapes/' +backend_args = None + +train_pipeline = [ + # dict(type="Mosaic", img_scale=IMG_SCALE, pad_val=114.0), + # dict(type="MixUp", img_scale=IMG_SCALE, ratio_range=(0.8, 1.6), pad_val=114.0), + dict(type="YOLOXHSVRandomAug"), + dict(type="RandomFlip", prob=0.5), + dict(type="Resize", scale=IMG_SCALE, keep_ratio=False), + dict( + type="Pad", + pad_to_square=False, + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0), seg=255), + ), + dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type="PackDetInputs"), +] + +test_pipeline = [ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="FixCityscapesPath", data_root=data_root, split='val'), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + dict(type="Resize", scale=IMG_SCALE, keep_ratio=False), + dict(type="Pad", pad_to_square=False, size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0), seg=255)), + dict( + type="PackDetInputs", + meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor"), + ), +] + +train_dataset = dict( + type="MultiImageMixDataset", + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='leftImg8bit/train', + seg_map_path='gtFine/train' + ), + ann_file='annotations/instancesonly_filtered_gtFine_train.json', + pipeline=[ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="FixCityscapesPath", data_root=data_root, split='train'), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + ], + filter_cfg=dict(filter_empty_gt=False, min_size=8), + backend_args=backend_args, + metainfo=metainfo, + ), + pipeline=train_pipeline, +) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + dataset=train_dataset, +) + +val_dataloader = dict( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + drop_last=False, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='leftImg8bit/val', + seg_map_path='gtFine/val' + ), + ann_file='annotations/instancesonly_filtered_gtFine_val.json', + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args, + metainfo=metainfo, + ), +) +test_dataloader = 
val_dataloader + +val_evaluator = [ + dict(type='mmseg.IoUMetric', ignore_index=255, iou_metrics=['mIoU'], prefix="seg", classes=seg_classes) +] +test_evaluator = val_evaluator + +train_cfg = dict(max_epochs=max_epochs, val_interval=interval) + +optimizer = dict( + type="OptimWrapper", + optimizer=dict(type="SGD", lr=base_lr, momentum=0.9, weight_decay=5e-4, nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), +) + +if max_epochs > 5: + param_scheduler = [ + dict( + type="mmdet.QuadraticWarmupLR", + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True, + ), + dict( + type="CosineAnnealingLR", + eta_min=base_lr * 0.05, + begin=5, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True, + ), + dict( + type="ConstantLR", + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ), + ] +else: + param_scheduler = [] + +log_config = dict( + interval=1, + hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")], +) + +default_hooks = dict( + checkpoint=dict( + interval=interval, + max_keep_ckpts=3, + save_best='seg/mIoU', + rule='greater' + ), + visualization=dict( + type='DetVisualizationHook', + draw=False, + interval=50, + show=False, + wait_time=2, + test_out_dir='vis_data' + ), +) + +custom_hooks = [ + dict(type="YOLOXModeSwitchHook", num_last_epochs=num_last_epochs, priority=48), + dict(type="SyncNormHook", priority=48), + dict( + type="EMAHook", + ema_type="ExpMomentumEMA", + momentum=0.0001, + update_buffers=True, + priority=4, + ), +] + +auto_scale_lr = dict(base_batch_size=batch_size) + +vis_backends = [ + dict(type="LocalVisBackend"), + dict(type="TensorboardVisBackend"), +] + +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], + name='visualizer', + alpha=0.5, +) diff --git a/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py b/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py new file mode 100644 index 000000000..d8f5bee0a --- /dev/null +++ b/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py @@ -0,0 +1,372 @@ +_base_ = [ + "../../../../../autoware_ml/configs/detection2d/default_runtime.py", + "../../../../../autoware_ml/configs/detection2d/schedules/schedule_1x.py", + "../../../../../autoware_ml/configs/detection2d/dataset/t4dataset/comlops.py", +] + +custom_imports = dict( + imports=[ + "projects.YOLOX_opt_elan.yolox", + "autoware_ml.detection2d.metrics", + "autoware_ml.detection2d.datasets", + "projects.YOLOX_opt_elan.yolox.models", + "projects.YOLOX_opt_elan.yolox.models.yolox_multitask", + "projects.YOLOX_opt_elan.yolox.transforms", + ], + allow_failed_imports=False, +) + +IMG_SCALE = (960, 960) + +# parameter settings +img_scale = (960, 960) +max_epochs = 300 +num_last_epochs = 15 +resume_from = None +interval = 1 +batch_size = 12 +activation = "ReLU6" +num_workers = 4 + +base_lr = 0.001 + +# model settings +model = dict( + type="YOLOXMultiTask", + data_preprocessor=dict( + type="DetDataPreprocessor", + pad_size_divisor=32, + batch_augments=[ + dict( + type="BatchSyncRandomResize", + random_size_range=(480, 800), + size_divisor=32, + interval=10, + ) + ], + ), + backbone=dict( + type="ELANDarknet", + deepen_factor=2, + widen_factor=1, + 
out_indices=(2, 3, 4), + act_cfg=dict(type=activation), + ), + neck=dict( + type="YOLOXPAFPN_ELAN", + in_channels=[128, 256, 512], + out_channels=128, + num_elan_blocks=2, + act_cfg=dict(type=activation), + ), + bbox_head=dict( + type="YOLOXHead", + num_classes=40, + in_channels=128, + feat_channels=128, + act_cfg=dict(type=activation), + loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), + loss_bbox=dict(type='IoULoss', loss_weight=0.0), + loss_obj=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), + loss_l1=dict(type='L1Loss', loss_weight=0.0), + ), + mask_head=dict( + type="YOLOXSegHead", + in_channels=[128, 128, 128], + feat_channels=128, + num_classes=40, + act_cfg=dict(type=activation), + loss=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0), + ), + train_cfg=dict(assigner=dict(type="SimOTAAssigner", center_radius=2.5)), + test_cfg=dict(score_thr=0.01, nms=dict(type="nms", iou_threshold=0.65)), +) + +data_root = "" +anno_file_root = "./data/comlops/semseg/" +dataset_type = "T4Dataset" + +backend_args = None + +# pipeline +train_pipeline = [ + # dict(type="Mosaic", img_scale=IMG_SCALE, pad_val=114.0), + # dict( + # type="RandomAffine", + # scaling_ratio_range=(0.1, 2), + # border=(-IMG_SCALE[0] // 2, -IMG_SCALE[1] // 2), + # ), + # dict(type="MixUp", img_scale=IMG_SCALE, ratio_range=(0.8, 1.6), pad_val=114.0), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + dict(type="YOLOXHSVRandomAug"), + dict(type="RandomFlip", prob=0.5), + dict(type="Resize", scale=IMG_SCALE, keep_ratio=False), + dict( + type="Pad", + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0), seg=255), + ), + dict(type="FilterAnnotations", min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type="PackDetInputs"), +] + +classes = ( + "animal", + "bicycle", + "building", + "bus", + "car", + "cone", + "construction", + "crosswalk", + "dashed_lane_marking", + "deceleration_line", + "gate", + "guide_post", + "laneline_dash_white", + "laneline_dash_yellow", + "laneline_solid_green", + "laneline_solid_red", + "laneline_solid_white", + "laneline_solid_yellow", + "marking_arrow", + "marking_character", + "marking_other", + "motorcycle", + "other_obstacle", + "other_pedestrian", + "other_vehicle", + "parking_lot", + "pedestrian", + "pole", + "road", + "road_debris", + "sidewalk", + "sky", + "stopline", + "striped_road_marking", + "traffic_light", + "traffic_sign", + "train", + "truck", + # "unknown", + "vegetation/terrain", + "wall/fence", +) + +palette = [ + (150, 120, 90), # 0: animal + (119, 11, 32), # 1: bicycle + (70, 70, 70), # 2: building + (0, 60, 100), # 3: bus + (0, 0, 142), # 4: car + (250, 170, 30), # 5: cone + (230, 150, 140), # 6: construction + (140, 140, 200), # 7: crosswalk + (255, 255, 255), # 8: dashed_lane_marking + (200, 200, 200), # 9: deceleration_line + (190, 153, 153), # 10: gate + (250, 170, 30), # 11: guide_post + (255, 255, 255), # 12: laneline_dash_white + (255, 255, 0), # 13: laneline_dash_yellow + (0, 255, 0), # 14: laneline_solid_green + (255, 0, 0), # 15: laneline_solid_red + (255, 255, 255), # 16: laneline_solid_white + (255, 215, 0), # 17: laneline_solid_yellow + (0, 255, 255), # 18: marking_arrow + (200, 0, 200), # 19: marking_character + (150, 0, 150), # 20: marking_other + (0, 0, 230), # 21: motorcycle + (80, 80, 80), # 22: other_obstacle + (250, 170, 160), # 23: other_pedestrian + (100, 80, 200), # 24: other_vehicle + (180, 165, 180), # 25: parking_lot + (220, 20, 60), # 26: pedestrian + (153, 153, 153), # 27: 
pole + (128, 64, 128), # 28: road + (110, 110, 110), # 29: road_debris + (244, 35, 232), # 30: sidewalk + (70, 130, 180), # 31: sky + (220, 220, 220), # 32: stopline + (160, 150, 180), # 33: striped_road_marking + (250, 170, 30), # 34: traffic_light + (220, 220, 0), # 35: traffic_sign + (0, 80, 100), # 36: train + (0, 0, 70), # 37: truck + (107, 142, 35), # 38: vegetation/terrain + (102, 102, 156), # 39: wall/fence +] +metainfo = dict(classes=classes, palette=palette) + +train_dataset = dict( + type="MultiImageMixDataset", + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=anno_file_root + "comlops_infos_train_cleaned.json", + pipeline=[ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + ], + filter_cfg=dict(filter_empty_gt=False, min_size=8), + backend_args=backend_args, + metainfo=metainfo, + ), + pipeline=train_pipeline, +) + +test_pipeline = [ + dict(type="LoadImageFromFile", backend_args=backend_args), + dict(type="LoadAnnotations", with_bbox=True, with_seg=True), + dict(type="Resize", scale=img_scale, keep_ratio=False), + dict(type="Pad", pad_to_square=True, pad_val=dict(img=(114.0, 114.0, 114.0), seg=255)), + dict( + type="PackDetInputs", + meta_keys=( + "img_id", + "img_path", + "ori_shape", + "img_shape", + "scale_factor", + ), + ), +] + +train_dataloader = dict( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + dataset=train_dataset, +) + +val_dataloader = dict( + batch_size=batch_size, + num_workers=16, + persistent_workers=True, + drop_last=False, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=anno_file_root + "comlops_infos_val_cleaned.json", + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args, + metainfo=metainfo, + indices=2000, + ), +) + +test_dataloader = val_dataloader + +val_evaluator = [ + dict(type="VOCMetric", metric="mAP", prefix="det"), + dict(type='mmseg.IoUMetric', ignore_index=255, iou_metrics=['mIoU'], prefix="seg") +] + +test_evaluator = val_evaluator + +# train_cfg = dict(max_epochs=max_epochs, val_interval=interval) +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=200000, + val_interval=1000 +) + +# optimizer +optimizer = dict( + type="OptimWrapper", + optimizer=dict(type="SGD", lr=base_lr, momentum=0.9, weight_decay=5e-4, nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), +) + +# learning rate scheduler +if max_epochs > 5: + param_scheduler = [ + dict( + type="mmdet.QuadraticWarmupLR", + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True, + ), + dict( + type="CosineAnnealingLR", + eta_min=base_lr * 0.05, + begin=5, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True, + ), + dict( + type="ConstantLR", + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ), + ] +else: + param_scheduler = [] + +# logging +log_config = dict( + interval=1, + hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")], +) + +# default_hooks = dict( +# checkpoint=dict(interval=interval, max_keep_ckpts=3), +# ) +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=1000, + by_epoch=False, + max_keep_ckpts=5, + save_best='seg/mIoU', + rule='greater' + ), + logger=dict( + type='LoggerHook', + interval=50 + ), 
+ visualization=dict( + type='DetVisualizationHook', + draw=False, + interval=100, + show=False, + wait_time=2, + test_out_dir='vis_data' + ), +) + +custom_hooks = [ + dict(type="YOLOXModeSwitchHook", num_last_epochs=num_last_epochs, priority=48), + dict(type="SyncNormHook", priority=48), + dict( + type="EMAHook", + ema_type="ExpMomentumEMA", + momentum=0.0001, + update_buffers=True, + priority=4, + ), +] + +auto_scale_lr = dict(base_batch_size=batch_size) + + +vis_backends = [ + dict(type="LocalVisBackend"), + dict(type="TensorboardVisBackend"), +] + +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], + name='visualizer', + alpha=0.3, +) diff --git a/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py b/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py new file mode 100644 index 000000000..db9c396aa --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py @@ -0,0 +1,3 @@ +from .seg_head import YOLOXSegHead + +__all__ = ("YOLOXSegHead",) \ No newline at end of file diff --git a/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py b/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py new file mode 100644 index 000000000..ea5652d4f --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py @@ -0,0 +1,122 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Dict, List, Tuple, Union + +from torch import Tensor + +from mmdet.models.seg_heads.base_semantic_head import BaseSemanticHead +from ..layers.network_blocks import BaseConv, CSPLayer, DWConv + +from mmengine.registry import MODELS + +def get_activation(name="ReLU6"): + if name.lower() == "relu6": + return nn.ReLU6(inplace=True) + elif name.lower() == "relu": + return nn.ReLU(inplace=True) + elif name.lower() == "silu": + return nn.SiLU(inplace=True) + elif name.lower() == "lrelu": + return nn.LeakyReLU(0.1, inplace=True) + else: + raise AttributeError(f"Unsupported act type {name}") + +@MODELS.register_module() +class YOLOXSegHead(nn.Module): + def __init__(self, in_channels, num_classes, feat_channels=None, act_cfg=dict(type="ReLU6"), width=1.0, depthwise=False, train_cfg=None, + test_cfg=None, **kwargs): + super().__init__() + self.num_classes = num_classes + self.width = width + # self.stem_channels = feat_channels if feat_channels is not None else int(64 * width) + self.stem_channels = sum(in_channels) + + act_type = act_cfg.get("type", "ReLU6") + self.act_fn = get_activation(act_type) + + self.train_cfg = train_cfg + + Conv = DWConv if depthwise else BaseConv + + # mask head layers + self.conv1 = Conv(self.stem_channels, self.stem_channels, 3, 1, act=act_type) + self.conv2 = Conv(self.stem_channels, self.stem_channels, 3, 1, act=act_type) + self.up1 = nn.Upsample(scale_factor=2, mode="nearest") + self.conv3 = Conv(self.stem_channels, self.stem_channels // 2, 3, 1, act=act_type) + self.up2 = nn.Upsample(scale_factor=2, mode="nearest") + self.conv4 = Conv(self.stem_channels // 2, self.stem_channels // 2, 3, 1, act=act_type) + self.up3 = nn.Upsample(scale_factor=2, mode="nearest") + self.out_conv = nn.Conv2d(self.stem_channels // 2, num_classes, kernel_size=1, stride=1, padding=0) + + def forward(self, feats): + """ + Args: + feats (list[Tensor] or Tensor): features from backbone+neck + Returns: + seg_pred (Tensor): [B, num_classes, H, W] + """ + if isinstance(feats, (list, tuple)): + target_size = feats[0].shape[2:] + up_feats = [F.interpolate(f, size=target_size, 
mode='bilinear', align_corners=False) for f in feats] + x = torch.cat(up_feats, dim=1) # [B, sum(C_i), H, W] + else: + x = feats + + x = self.conv1(x) + x = self.conv2(x) + x = self.up1(x) + x = self.conv3(x) + x = self.up2(x) + x = self.conv4(x) + x = self.up3(x) + seg_pred = self.out_conv(x) + return seg_pred + + def loss(self, seg_pred, gt_masks): + """ + Args: + seg_pred: [B, C, H, W] + gt_masks: [B, H, W] long + Returns: + dict: {'loss_mask': ...} + """ + return dict(loss_mask=F.cross_entropy(seg_pred, gt_masks.long(), ignore_index=255)) + + def predict(self, + x: Union[Tensor, Tuple[Tensor]], + batch_data_samples, + rescale: bool = False) -> List[Tensor]: + + batch_img_metas = [ + data_sample.metainfo for data_sample in batch_data_samples + ] + seg_preds = self.forward(x) + + input_shape = batch_img_metas[0]['batch_input_shape'] + seg_preds = F.interpolate( + seg_preds, + size=input_shape, + mode='bilinear', + align_corners=False) + + result_list = [] + for i in range(len(batch_img_metas)): + img_meta = batch_img_metas[i] + h, w = img_meta['img_shape'] + + seg_pred = seg_preds[i][:, :h, :w] + + if rescale: + ori_h, ori_w = img_meta['ori_shape'] + seg_pred = F.interpolate( + seg_pred.unsqueeze(0), + size=(ori_h, ori_w), + mode='bilinear', + align_corners=False).squeeze(0) + + seg_pred = seg_pred.argmax(dim=0).to(torch.int64) + + result_list.append(seg_pred) + + return result_list diff --git a/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py b/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py new file mode 100644 index 000000000..1aab7efcb --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) Megvii Inc. All rights reserved. 
+ +import torch +import torch.nn as nn +from torch.autograd import Function + +class SiLU(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + +class PactFix(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x, alpha=4.0): + y = torch.clamp(x, min = 0, max = alpha) + return y + +class Pact(Function): + @staticmethod + def forward(ctx, x, alpha, k): + ctx.save_for_backward(x, alpha) + # y_1 = 0.5 * ( torch.abs(x).detach() - torch.abs(x - alpha).detach() + alpha.item() ) + y = torch.clamp(x, min = 0, max = alpha.item()) + scale = (2**k - 1) / alpha + y_q = torch.round( y * scale) / scale + return y_q + + @staticmethod + def backward(ctx, dLdy_q): + # Backward function, I borrowed code from + # https://github.com/obilaniu/GradOverride/blob/master/functional.py + # We get dL / dy_q as a gradient + x, alpha, = ctx.saved_tensors + # Weight gradient is only valid when [0, alpha] + # Actual gradient for alpha, + # By applying Chain Rule, we get dL / dy_q * dy_q / dy * dy / dalpha + # dL / dy_q = argument, dy_q / dy * dy / dalpha = 0, 1 with x value range + lower_bound = x < 0 + upper_bound = x > alpha + # x_range = 1.0-lower_bound-upper_bound + x_range = ~(lower_bound|upper_bound) + grad_alpha = torch.sum(dLdy_q * torch.ge(x, alpha).float()).view(-1) + return dLdy_q * x_range.float(), grad_alpha, None + +def get_activation(name="silu", inplace=True): + #name = 'relu' + name = name.lower() + if name == "silu": + module = nn.SiLU(inplace=inplace) + elif name == "relu": + module = nn.ReLU(inplace=inplace) + elif name == "lrelu": + module = nn.LeakyReLU(0.1, inplace=inplace) + elif name == "pact": + module = Pact.apply + elif name == "pactfix": + module = PactFix() + elif name == "relu6": + module = nn.ReLU6() + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + +K = 2 +#fisrt = True +class BaseConv(nn.Module): + """A Conv2d -> Batchnorm -> silu/leaky relu block""" + + def __init__( + self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" + ): + super().__init__() + # same padding + pad = (ksize - 1) // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=pad, + groups=groups, + bias=bias, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = get_activation(act, inplace=True) + self.act_name = act + if (self.act_name == "pact") : + self.alpha = nn.Parameter(torch.tensor(20.)) + + + def forward(self, x): + if (self.act_name == "pact") : + return self.act(self.bn(self.conv(x)), self.alpha, 2) + else: + #print(self.conv) + return self.act(self.bn(self.conv(x))) + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class DWConv(nn.Module): + """Depthwise Conv + Conv""" + + def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): + super().__init__() + self.dconv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + act=act, + ) + self.pconv = BaseConv( + in_channels, out_channels, ksize=1, stride=1, groups=1, act=act + ) + + def forward(self, x): + x = self.dconv(x) + return self.pconv(x) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + kernel=3 + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + 
self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, kernel, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class BottleneckV8(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + kernel=3, + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv(in_channels, hidden_channels, kernel, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, kernel, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + + +class Bottleneck_EFF(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + kernel=3, + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv(in_channels, hidden_channels, kernel, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, 5, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class ResLayer(nn.Module): + "Residual layer with `in_channels` inputs." + + def __init__(self, in_channels: int): + super().__init__() + mid_channels = in_channels // 2 + self.layer1 = BaseConv( + in_channels, mid_channels, ksize=1, stride=1, act="lrelu" + ) + self.layer2 = BaseConv( + mid_channels, in_channels, ksize=3, stride=1, act="lrelu" + ) + + def forward(self, x): + out = self.layer2(self.layer1(x)) + return x + out + + +class SPPBottleneck(nn.Module): + """Spatial pyramid pooling layer used in YOLOv3-SPP""" + + def __init__( + self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" + ): + super().__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) + self.m = nn.ModuleList( + [ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ] + ) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) + + def forward(self, x): + x = self.conv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.conv2(x) + return x + + +class CSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + elan=False, + kernel=3, + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. 
+        """
+        # ch_in, ch_out, number, shortcut, groups, expansion
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+        self.elan = elan
+
+        if (self.elan == True) :
+            self.conv3 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act)
+            module_list = [
+                BottleneckV8(
+                    hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel
+                )
+                for _ in range(n)
+            ]
+        else:
+            self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act)
+            module_list = [
+                Bottleneck(
+                    hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel
+                )
+                for _ in range(n)
+            ]
+        self.m = nn.Sequential(*module_list)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_2 = self.conv2(x)
+        el = []
+        if (self.elan == True) :
+            x = x_1
+            for m in self.m:
+                x = m(x)
+                el.append(x)
+            x = torch.cat([x_2] + [m for m in el], dim=1)
+        else :
+            x_1 = self.m(x_1)
+            x = torch.cat((x_1, x_2), dim=1)
+        return self.conv3(x)
+
+class CSPLayer_EFF(nn.Module):
+    """CSP bottleneck variant (EFF): CSPLayer with 3x3 stem convolutions and Bottleneck_EFF blocks"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=1,
+        shortcut=True,
+        #expansion=0.5,
+        expansion=1.0,
+        depthwise=False,
+        act="silu",
+        kernel=3,
+        elan=False,
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of Bottlenecks. Default value: 1.
+        """
+        # ch_in, ch_out, number, shortcut, groups, expansion
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(in_channels, hidden_channels, 3, stride=1, act=act)
+        self.conv2 = BaseConv(in_channels, hidden_channels, 3, stride=1, act=act)
+        self.elan = elan
+
+        if (self.elan == True) :
+            self.conv3 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act)
+            module_list = [
+                BottleneckV8(
+                    hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act
+                )
+                for _ in range(n)
+            ]
+        else :
+            self.conv3 = BaseConv(2 * hidden_channels, out_channels, 3, stride=1, act=act)
+
+            module_list = [
+                Bottleneck_EFF(
+                    hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel
+                )
+                for _ in range(n)
+            ]
+        self.m = nn.Sequential(*module_list)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_2 = self.conv2(x)
+        el = []
+        if (self.elan == True) :
+            x = x_1
+            for m in self.m:
+                x = m(x)
+                el.append(x)
+            x = torch.cat([x_2] + [m for m in el], dim=1)
+        else :
+            x_1 = self.m(x_1)
+            x = torch.cat((x_1, x_2), dim=1)
+        return self.conv3(x)
+
+
+class ELAN(nn.Module):
+    """ELAN aggregation block: concatenates intermediate conv outputs with a parallel branch and fuses them with a 1x1 conv"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=3,
+        shortcut=True,
+        #expansion=0.5,
+        expansion=1.0,
+        depthwise=False,
+        act="silu",
+        kernel=3,
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of stacked convolutions. Default value: 3. 
+ """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv(hidden_channels, out_channels, 3, stride=1, act=act) + self.conv_c = BaseConv(in_channels, hidden_channels, kernel, stride=1, act=act) + self.conv2 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act) + + + module_list = [ + self.conv_c + for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + el = [] + for m in self.m: + x = m(x) + el.append(x) + x = torch.cat([x_1] + [m for m in el], dim=1) + return self.conv2(x) + + + +class Focus(nn.Module): + """Focus width and height information into channel space.""" + + def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): + super().__init__() + self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) + + + +class SimpleStem(nn.Module): + """Simple Stem for Acceleration on Embedded Devices""" + + def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): + super().__init__() + #self.conv1 = BaseConv(in_channels, out_channels, ksize, stride, act=act) + #self.down1 = BaseConv(out_channels, out_channels, ksize, 2, act=act) + self.down1 = BaseConv(in_channels, out_channels, ksize, 2, act=act) + self.conv2 = BaseConv(out_channels, out_channels, ksize, stride, act=act) + #self.down2 = BaseConv(out_channels, out_channels, ksize, 2, act=act) + + def forward(self, x): + #x = self.conv1(x) + x = self.down1(x) + x = self.conv2(x) + #x = self.down2(x) + return x + diff --git a/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py b/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py new file mode 100644 index 000000000..5e7ac8746 --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py @@ -0,0 +1,189 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch import Tensor + +from mmengine.model import BaseModule +from mmdet.registry import MODELS +from mmdet.models import BaseDetector +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmdet.structures import DetDataSample +from mmengine.structures import PixelData, InstanceData +from mmdet.structures import SampleList + +from .heads import YOLOXSegHead + +from mmengine.logging import print_log + +@MODELS.register_module() +class YOLOXMultiTask(BaseDetector): + """ + YOLOX MultiTask detector + Supports bbox + mask heads. 
+ """ + + def __init__(self, + backbone, + neck, + bbox_head, + mask_head=None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor=None, + init_cfg=None, + **kwargs): + super().__init__(init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + self.neck = MODELS.build(neck) if neck is not None else None + if bbox_head is not None: + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = MODELS.build(bbox_head) + if mask_head is not None: + mask_head.update(train_cfg=train_cfg) + mask_head.update(test_cfg=test_cfg) + self.mask_head = MODELS.build(mask_head) if mask_head is not None else None + self.data_preprocessor = MODELS.build(data_preprocessor) if data_preprocessor else None + + def extract_feat(self, inputs): + x = self.backbone(inputs) + if self.neck is not None: + x = self.neck(x) + return x + + def _forward(self, imgs, **kwargs): + return self.forward(imgs, **kwargs) + + def forward_train(self, imgs, gt_bboxes, gt_labels, gt_masks=None, **kwargs): + feats = self.extract_feat(imgs) + losses = dict() + losses.update(self.bbox_head.loss(feats[-1], gt_bboxes, gt_labels)) + if self.mask_head is not None and gt_masks is not None: + mask_pred = self.mask_head(feats) + losses.update(self.mask_head.loss(mask_pred, gt_masks)) + return losses + + def forward_test(self, imgs, **kwargs): + feats = self.extract_feat(imgs) + bbox_results = self.bbox_head(feats[-1]) + mask_results = None + if self.mask_head is not None: + mask_results = self.mask_head(feats) + return dict(bboxes=bbox_results, masks=mask_results) + + def forward(self, inputs, data_samples=None, mode='tensor'): + """Forward function with training and testing mode.""" + feats = self.extract_feat(inputs) + + if mode == 'tensor': + return self.bbox_head(feats) + elif mode == 'loss': + s = self.loss(feats, data_samples) + return s + elif mode == 'predict': + pred_instances = self.predict(inputs, data_samples) + + for pred, data_sample in zip(pred_instances, data_samples): + pred.gt_instances = data_sample.gt_instances + if hasattr(data_sample, 'gt_sem_seg'): + pred.gt_sem_seg = data_sample.gt_sem_seg + + return pred_instances + else: + raise ValueError(f"Invalid mode {mode}") + + def loss(self, feats, data_samples): + loss = dict() + # bbox head forward + cls_scores, bbox_preds, objectnesses = self.bbox_head(feats) + batch_gt_instances = [d.gt_instances for d in data_samples] + batch_img_metas = [d.metainfo for d in data_samples] + + loss.update( + self.bbox_head.loss_by_feat( + cls_scores, bbox_preds, objectnesses, batch_gt_instances, batch_img_metas + ) + ) + + # mask head + if self.mask_head is not None: + seg_pred = self.mask_head(feats) + target_size = data_samples[0].gt_sem_seg.sem_seg.shape[-2:] + if seg_pred.shape[-2:] != target_size: + seg_pred = F.interpolate(seg_pred, size=target_size, mode='bilinear', align_corners=False) + + gt_masks_tensor = [] + gt_masks = torch.stack([d.gt_sem_seg.sem_seg.squeeze(0) for d in data_samples], dim=0) # (B, H, W) + gt_masks = gt_masks.to(seg_pred.device) + + mask_loss_dict = self.mask_head.loss(seg_pred, gt_masks) + for k, v in mask_loss_dict.items(): + if torch.is_tensor(v): + loss[k] = v + else: + raise TypeError(f"mask loss '{k}' is not a tensor") + + return loss + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + + x = self.extract_feat(batch_inputs) + + if self.with_bbox: + bbox_results_list = self.bbox_head.predict(x, 
batch_data_samples, rescale=True) + else: + bbox_results_list = [InstanceData() for _ in batch_data_samples] + + seg_results_list = None + if self.with_mask: + seg_results_list = self.mask_head.predict(x, batch_data_samples, rescale=True) + + results = [] + for i, data_sample in enumerate(batch_data_samples): + data_sample.pred_instances = bbox_results_list[i] + + if seg_results_list is not None: + pixel_data = PixelData() + pixel_data.data = seg_results_list[i] + pixel_data.sem_seg = seg_results_list[i] + data_sample.pred_sem_seg = pixel_data + + img_h, img_w = data_sample.metainfo['img_shape'] + ori_h, ori_w = data_sample.metainfo['ori_shape'] + + if hasattr(data_sample, 'gt_instances'): + + scale_factor = data_sample.metainfo['scale_factor'] # (w_scale, h_scale) + + scale_factor_bbox = [scale_factor[0], scale_factor[1], scale_factor[0], scale_factor[1]] + scale_tensor = data_sample.gt_instances.bboxes.new_tensor(scale_factor_bbox) + + data_sample.gt_instances.bboxes = data_sample.gt_instances.bboxes / scale_tensor + + if hasattr(data_sample, 'gt_sem_seg') and data_sample.gt_sem_seg is not None: + gt_sem_seg_data = data_sample.gt_sem_seg.sem_seg # [H_pad, W_pad] + + gt_valid = gt_sem_seg_data[..., :img_h, :img_w] + + if gt_valid.shape[-2:] != (ori_h, ori_w): + gt_resized = F.interpolate( + gt_valid.unsqueeze(0).float(), # [1, 1, h, w] + size=(ori_h, ori_w), + mode='nearest' + ).squeeze(0).long() + + new_gt_pixel_data = PixelData() + new_gt_pixel_data.sem_seg = gt_resized + new_gt_pixel_data.data = gt_resized + data_sample.gt_sem_seg = new_gt_pixel_data + elif 'data' not in data_sample.gt_sem_seg: + data_sample.gt_sem_seg.data = data_sample.gt_sem_seg.sem_seg + + results.append(data_sample) + + return results diff --git a/projects/YOLOX_opt_elan/yolox/transforms.py b/projects/YOLOX_opt_elan/yolox/transforms.py new file mode 100644 index 000000000..939c57102 --- /dev/null +++ b/projects/YOLOX_opt_elan/yolox/transforms.py @@ -0,0 +1,37 @@ +import torch +import torch.nn.functional as F +from mmdet.registry import TRANSFORMS +from mmcv.transforms import BaseTransform + +import os.path as osp + +@TRANSFORMS.register_module() +class ResizeSegMask: + def __init__(self, size): + self.size = size # (H_out, W_out) + + def __call__(self, results): + if 'gt_seg_map' in results and results['gt_seg_map'] is not None: + seg = results['gt_seg_map'] # numpy array (H, W) + seg = torch.from_numpy(seg).unsqueeze(0).unsqueeze(0).float() # (1,1,H,W) + seg = F.interpolate(seg, size=self.size, mode='nearest') + results['gt_seg_map'] = seg.squeeze(0).squeeze(0).long().numpy() # back to (H_out,W_out) + return results + +@TRANSFORMS.register_module() +class FixCityscapesPath(BaseTransform): + def __init__(self, data_root, split='train'): + self.data_root = data_root + self.split = split + + def transform(self, results): + img_path = results['img_path'] + filename = osp.basename(img_path) + + seg_filename = filename.replace('_leftImg8bit.png', '_gtFine_labelTrainIds.png') + city = filename.split('_')[0] + seg_path = osp.join(self.data_root, 'gtFine', self.split, city, seg_filename) + + results['seg_map_path'] = seg_path + + return results From 645bf2af90363f8d499d4b31f8e5550a53097b97 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Dec 2025 05:39:32 +0000 Subject: [PATCH 2/2] ci(pre-commit): autofix --- ...opt-elan-semseg_960x960_300e_cityscapes.py | 107 +++++++----- ...-opt-elan-semseg_960x960_300e_t4dataset.py | 110 +++++------- 
.../yolox/models/heads/__init__.py | 2 +- .../yolox/models/heads/seg_head.py | 58 +++---- .../yolox/models/layers/network_blocks.py | 159 +++++++----------- .../yolox/models/yolox_multitask.py | 109 ++++++------ projects/YOLOX_opt_elan/yolox/transforms.py | 32 ++-- 7 files changed, 274 insertions(+), 303 deletions(-) diff --git a/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py b/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py index 05af852f9..05eac75af 100644 --- a/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py +++ b/projects/YOLOX_opt_elan/configs/cityscapes/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_cityscapes.py @@ -11,7 +11,7 @@ "projects.YOLOX_opt_elan.yolox.models", "projects.YOLOX_opt_elan.yolox.models.yolox_multitask", "projects.YOLOX_opt_elan.yolox.transforms", - "mmseg.evaluation.metrics", # 引入分割评估指标 + "mmseg.evaluation.metrics", # 引入分割评估指标 ], allow_failed_imports=False, ) @@ -29,14 +29,51 @@ base_lr = 0.001 -classes = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle') -palette = [ - (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), - (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32) -] +classes = ("person", "rider", "car", "truck", "bus", "train", "motorcycle", "bicycle") +palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)] -seg_classes = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle') -seg_palette = [(128, 64, 128), (244, 35, 232), (70, 70, 70), (102, 102, 156), (190, 153, 153), (153, 153, 153), (250, 170, 30), (220, 220, 0), (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)] +seg_classes = ( + "road", + "sidewalk", + "building", + "wall", + "fence", + "pole", + "traffic light", + "traffic sign", + "vegetation", + "terrain", + "sky", + "person", + "rider", + "car", + "truck", + "bus", + "train", + "motorcycle", + "bicycle", +) +seg_palette = [ + (128, 64, 128), + (244, 35, 232), + (70, 70, 70), + (102, 102, 156), + (190, 153, 153), + (153, 153, 153), + (250, 170, 30), + (220, 220, 0), + (107, 142, 35), + (152, 251, 152), + (70, 130, 180), + (220, 20, 60), + (255, 0, 0), + (0, 0, 142), + (0, 0, 70), + (0, 60, 100), + (0, 80, 100), + (0, 0, 230), + (119, 11, 32), +] # metainfo = dict(classes=classes, palette=palette) metainfo = dict(classes=seg_classes, palette=seg_palette) @@ -58,7 +95,7 @@ backbone=dict( type="ELANDarknet", deepen_factor=2, - widen_factor=1, + widen_factor=1, out_indices=(2, 3, 4), act_cfg=dict(type=activation), ), @@ -75,10 +112,10 @@ in_channels=128, feat_channels=128, act_cfg=dict(type=activation), - loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), - loss_bbox=dict(type='IoULoss', loss_weight=0.0), - loss_obj=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), - loss_l1=dict(type='L1Loss', loss_weight=0.0), + loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=0.0), + loss_bbox=dict(type="IoULoss", loss_weight=0.0), + loss_obj=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=0.0), + 
loss_l1=dict(type="L1Loss", loss_weight=0.0), ), mask_head=dict( type="YOLOXSegHead", @@ -92,8 +129,8 @@ test_cfg=dict(score_thr=0.01, nms=dict(type="nms", iou_threshold=0.65)), ) -dataset_type = 'CocoDataset' -data_root = 'data/cityscapes/' +dataset_type = "CocoDataset" +data_root = "data/cityscapes/" backend_args = None train_pipeline = [ @@ -114,7 +151,7 @@ test_pipeline = [ dict(type="LoadImageFromFile", backend_args=backend_args), - dict(type="FixCityscapesPath", data_root=data_root, split='val'), + dict(type="FixCityscapesPath", data_root=data_root, split="val"), dict(type="LoadAnnotations", with_bbox=True, with_seg=True), dict(type="Resize", scale=IMG_SCALE, keep_ratio=False), dict(type="Pad", pad_to_square=False, size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0), seg=255)), @@ -129,14 +166,11 @@ dataset=dict( type=dataset_type, data_root=data_root, - data_prefix=dict( - img='leftImg8bit/train', - seg_map_path='gtFine/train' - ), - ann_file='annotations/instancesonly_filtered_gtFine_train.json', + data_prefix=dict(img="leftImg8bit/train", seg_map_path="gtFine/train"), + ann_file="annotations/instancesonly_filtered_gtFine_train.json", pipeline=[ dict(type="LoadImageFromFile", backend_args=backend_args), - dict(type="FixCityscapesPath", data_root=data_root, split='train'), + dict(type="FixCityscapesPath", data_root=data_root, split="train"), dict(type="LoadAnnotations", with_bbox=True, with_seg=True), ], filter_cfg=dict(filter_empty_gt=False, min_size=8), @@ -163,11 +197,8 @@ dataset=dict( type=dataset_type, data_root=data_root, - data_prefix=dict( - img='leftImg8bit/val', - seg_map_path='gtFine/val' - ), - ann_file='annotations/instancesonly_filtered_gtFine_val.json', + data_prefix=dict(img="leftImg8bit/val", seg_map_path="gtFine/val"), + ann_file="annotations/instancesonly_filtered_gtFine_val.json", test_mode=True, pipeline=test_pipeline, backend_args=backend_args, @@ -177,7 +208,7 @@ test_dataloader = val_dataloader val_evaluator = [ - dict(type='mmseg.IoUMetric', ignore_index=255, iou_metrics=['mIoU'], prefix="seg", classes=seg_classes) + dict(type="mmseg.IoUMetric", ignore_index=255, iou_metrics=["mIoU"], prefix="seg", classes=seg_classes) ] test_evaluator = val_evaluator @@ -224,19 +255,9 @@ ) default_hooks = dict( - checkpoint=dict( - interval=interval, - max_keep_ckpts=3, - save_best='seg/mIoU', - rule='greater' - ), + checkpoint=dict(interval=interval, max_keep_ckpts=3, save_best="seg/mIoU", rule="greater"), visualization=dict( - type='DetVisualizationHook', - draw=False, - interval=50, - show=False, - wait_time=2, - test_out_dir='vis_data' + type="DetVisualizationHook", draw=False, interval=50, show=False, wait_time=2, test_out_dir="vis_data" ), ) @@ -260,8 +281,8 @@ ] visualizer = dict( - type='DetLocalVisualizer', - vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], - name='visualizer', + type="DetLocalVisualizer", + vis_backends=[dict(type="LocalVisBackend"), dict(type="TensorboardVisBackend")], + name="visualizer", alpha=0.5, ) diff --git a/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py b/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py index d8f5bee0a..dc78b0080 100644 --- a/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py +++ 
b/projects/YOLOX_opt_elan/configs/t4dataset/YOLOX_opt-S-DynamicRecognition/yolox-s-opt-elan-semseg_960x960_300e_t4dataset.py @@ -65,10 +65,10 @@ in_channels=128, feat_channels=128, act_cfg=dict(type=activation), - loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), - loss_bbox=dict(type='IoULoss', loss_weight=0.0), - loss_obj=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.0), - loss_l1=dict(type='L1Loss', loss_weight=0.0), + loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=0.0), + loss_bbox=dict(type="IoULoss", loss_weight=0.0), + loss_obj=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=0.0), + loss_l1=dict(type="L1Loss", loss_weight=0.0), ), mask_head=dict( type="YOLOXSegHead", @@ -155,45 +155,45 @@ ) palette = [ - (150, 120, 90), # 0: animal - (119, 11, 32), # 1: bicycle - (70, 70, 70), # 2: building - (0, 60, 100), # 3: bus - (0, 0, 142), # 4: car - (250, 170, 30), # 5: cone - (230, 150, 140), # 6: construction - (140, 140, 200), # 7: crosswalk + (150, 120, 90), # 0: animal + (119, 11, 32), # 1: bicycle + (70, 70, 70), # 2: building + (0, 60, 100), # 3: bus + (0, 0, 142), # 4: car + (250, 170, 30), # 5: cone + (230, 150, 140), # 6: construction + (140, 140, 200), # 7: crosswalk (255, 255, 255), # 8: dashed_lane_marking - (200, 200, 200), # 9: deceleration_line - (190, 153, 153), # 10: gate - (250, 170, 30), # 11: guide_post + (200, 200, 200), # 9: deceleration_line + (190, 153, 153), # 10: gate + (250, 170, 30), # 11: guide_post (255, 255, 255), # 12: laneline_dash_white - (255, 255, 0), # 13: laneline_dash_yellow - (0, 255, 0), # 14: laneline_solid_green - (255, 0, 0), # 15: laneline_solid_red + (255, 255, 0), # 13: laneline_dash_yellow + (0, 255, 0), # 14: laneline_solid_green + (255, 0, 0), # 15: laneline_solid_red (255, 255, 255), # 16: laneline_solid_white - (255, 215, 0), # 17: laneline_solid_yellow - (0, 255, 255), # 18: marking_arrow - (200, 0, 200), # 19: marking_character - (150, 0, 150), # 20: marking_other - (0, 0, 230), # 21: motorcycle - (80, 80, 80), # 22: other_obstacle - (250, 170, 160), # 23: other_pedestrian - (100, 80, 200), # 24: other_vehicle + (255, 215, 0), # 17: laneline_solid_yellow + (0, 255, 255), # 18: marking_arrow + (200, 0, 200), # 19: marking_character + (150, 0, 150), # 20: marking_other + (0, 0, 230), # 21: motorcycle + (80, 80, 80), # 22: other_obstacle + (250, 170, 160), # 23: other_pedestrian + (100, 80, 200), # 24: other_vehicle (180, 165, 180), # 25: parking_lot - (220, 20, 60), # 26: pedestrian - (153, 153, 153), # 27: pole - (128, 64, 128), # 28: road + (220, 20, 60), # 26: pedestrian + (153, 153, 153), # 27: pole + (128, 64, 128), # 28: road (110, 110, 110), # 29: road_debris - (244, 35, 232), # 30: sidewalk - (70, 130, 180), # 31: sky - (220, 220, 220), # 32: stopline + (244, 35, 232), # 30: sidewalk + (70, 130, 180), # 31: sky + (220, 220, 220), # 32: stopline (160, 150, 180), # 33: striped_road_marking - (250, 170, 30), # 34: traffic_light - (220, 220, 0), # 35: traffic_sign - (0, 80, 100), # 36: train - (0, 0, 70), # 37: truck - (107, 142, 35), # 38: vegetation/terrain + (250, 170, 30), # 34: traffic_light + (220, 220, 0), # 35: traffic_sign + (0, 80, 100), # 36: train + (0, 0, 70), # 37: truck + (107, 142, 35), # 38: vegetation/terrain (102, 102, 156), # 39: wall/fence ] metainfo = dict(classes=classes, palette=palette) @@ -254,7 +254,7 @@ pipeline=test_pipeline, backend_args=backend_args, metainfo=metainfo, - indices=2000, + indices=2000, ), ) @@ -262,18 +262,13 @@ 
val_evaluator = [ dict(type="VOCMetric", metric="mAP", prefix="det"), - dict(type='mmseg.IoUMetric', ignore_index=255, iou_metrics=['mIoU'], prefix="seg") + dict(type="mmseg.IoUMetric", ignore_index=255, iou_metrics=["mIoU"], prefix="seg"), ] test_evaluator = val_evaluator # train_cfg = dict(max_epochs=max_epochs, val_interval=interval) -train_cfg = dict( - _delete_=True, - type='IterBasedTrainLoop', - max_iters=200000, - val_interval=1000 -) +train_cfg = dict(_delete_=True, type="IterBasedTrainLoop", max_iters=200000, val_interval=1000) # optimizer optimizer = dict( @@ -323,24 +318,11 @@ # ) default_hooks = dict( checkpoint=dict( - type='CheckpointHook', - interval=1000, - by_epoch=False, - max_keep_ckpts=5, - save_best='seg/mIoU', - rule='greater' - ), - logger=dict( - type='LoggerHook', - interval=50 + type="CheckpointHook", interval=1000, by_epoch=False, max_keep_ckpts=5, save_best="seg/mIoU", rule="greater" ), + logger=dict(type="LoggerHook", interval=50), visualization=dict( - type='DetVisualizationHook', - draw=False, - interval=100, - show=False, - wait_time=2, - test_out_dir='vis_data' + type="DetVisualizationHook", draw=False, interval=100, show=False, wait_time=2, test_out_dir="vis_data" ), ) @@ -365,8 +347,8 @@ ] visualizer = dict( - type='DetLocalVisualizer', - vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], - name='visualizer', + type="DetLocalVisualizer", + vis_backends=[dict(type="LocalVisBackend"), dict(type="TensorboardVisBackend")], + name="visualizer", alpha=0.3, ) diff --git a/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py b/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py index db9c396aa..598422de0 100644 --- a/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py +++ b/projects/YOLOX_opt_elan/yolox/models/heads/__init__.py @@ -1,3 +1,3 @@ from .seg_head import YOLOXSegHead -__all__ = ("YOLOXSegHead",) \ No newline at end of file +__all__ = ("YOLOXSegHead",) diff --git a/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py b/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py index ea5652d4f..866fc727d 100644 --- a/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py +++ b/projects/YOLOX_opt_elan/yolox/models/heads/seg_head.py @@ -1,14 +1,14 @@ +from typing import Dict, List, Tuple, Union + import torch import torch.nn as nn import torch.nn.functional as F -from typing import Dict, List, Tuple, Union - +from mmdet.models.seg_heads.base_semantic_head import BaseSemanticHead +from mmengine.registry import MODELS from torch import Tensor -from mmdet.models.seg_heads.base_semantic_head import BaseSemanticHead from ..layers.network_blocks import BaseConv, CSPLayer, DWConv -from mmengine.registry import MODELS def get_activation(name="ReLU6"): if name.lower() == "relu6": @@ -22,10 +22,21 @@ def get_activation(name="ReLU6"): else: raise AttributeError(f"Unsupported act type {name}") + @MODELS.register_module() class YOLOXSegHead(nn.Module): - def __init__(self, in_channels, num_classes, feat_channels=None, act_cfg=dict(type="ReLU6"), width=1.0, depthwise=False, train_cfg=None, - test_cfg=None, **kwargs): + def __init__( + self, + in_channels, + num_classes, + feat_channels=None, + act_cfg=dict(type="ReLU6"), + width=1.0, + depthwise=False, + train_cfg=None, + test_cfg=None, + **kwargs, + ): super().__init__() self.num_classes = num_classes self.width = width @@ -36,7 +47,7 @@ def __init__(self, in_channels, num_classes, feat_channels=None, act_cfg=dict(ty self.act_fn = get_activation(act_type) self.train_cfg = train_cfg 
- + Conv = DWConv if depthwise else BaseConv # mask head layers @@ -58,7 +69,7 @@ def forward(self, feats): """ if isinstance(feats, (list, tuple)): target_size = feats[0].shape[2:] - up_feats = [F.interpolate(f, size=target_size, mode='bilinear', align_corners=False) for f in feats] + up_feats = [F.interpolate(f, size=target_size, mode="bilinear", align_corners=False) for f in feats] x = torch.cat(up_feats, dim=1) # [B, sum(C_i), H, W] else: x = feats @@ -83,40 +94,29 @@ def loss(self, seg_pred, gt_masks): """ return dict(loss_mask=F.cross_entropy(seg_pred, gt_masks.long(), ignore_index=255)) - def predict(self, - x: Union[Tensor, Tuple[Tensor]], - batch_data_samples, - rescale: bool = False) -> List[Tensor]: - - batch_img_metas = [ - data_sample.metainfo for data_sample in batch_data_samples - ] + def predict(self, x: Union[Tensor, Tuple[Tensor]], batch_data_samples, rescale: bool = False) -> List[Tensor]: + + batch_img_metas = [data_sample.metainfo for data_sample in batch_data_samples] seg_preds = self.forward(x) - input_shape = batch_img_metas[0]['batch_input_shape'] - seg_preds = F.interpolate( - seg_preds, - size=input_shape, - mode='bilinear', - align_corners=False) + input_shape = batch_img_metas[0]["batch_input_shape"] + seg_preds = F.interpolate(seg_preds, size=input_shape, mode="bilinear", align_corners=False) result_list = [] for i in range(len(batch_img_metas)): img_meta = batch_img_metas[i] - h, w = img_meta['img_shape'] + h, w = img_meta["img_shape"] seg_pred = seg_preds[i][:, :h, :w] if rescale: - ori_h, ori_w = img_meta['ori_shape'] + ori_h, ori_w = img_meta["ori_shape"] seg_pred = F.interpolate( - seg_pred.unsqueeze(0), - size=(ori_h, ori_w), - mode='bilinear', - align_corners=False).squeeze(0) + seg_pred.unsqueeze(0), size=(ori_h, ori_w), mode="bilinear", align_corners=False + ).squeeze(0) seg_pred = seg_pred.argmax(dim=0).to(torch.int64) - + result_list.append(seg_pred) return result_list diff --git a/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py b/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py index 1aab7efcb..6edebdaae 100644 --- a/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py +++ b/projects/YOLOX_opt_elan/yolox/models/layers/network_blocks.py @@ -6,6 +6,7 @@ import torch.nn as nn from torch.autograd import Function + class SiLU(nn.Module): """export-friendly version of nn.SiLU()""" @@ -13,22 +14,24 @@ class SiLU(nn.Module): def forward(x): return x * torch.sigmoid(x) + class PactFix(nn.Module): """export-friendly version of nn.SiLU()""" @staticmethod def forward(x, alpha=4.0): - y = torch.clamp(x, min = 0, max = alpha) + y = torch.clamp(x, min=0, max=alpha) return y + class Pact(Function): - @staticmethod + @staticmethod def forward(ctx, x, alpha, k): ctx.save_for_backward(x, alpha) # y_1 = 0.5 * ( torch.abs(x).detach() - torch.abs(x - alpha).detach() + alpha.item() ) - y = torch.clamp(x, min = 0, max = alpha.item()) + y = torch.clamp(x, min=0, max=alpha.item()) scale = (2**k - 1) / alpha - y_q = torch.round( y * scale) / scale + y_q = torch.round(y * scale) / scale return y_q @staticmethod @@ -36,20 +39,24 @@ def backward(ctx, dLdy_q): # Backward function, I borrowed code from # https://github.com/obilaniu/GradOverride/blob/master/functional.py # We get dL / dy_q as a gradient - x, alpha, = ctx.saved_tensors + ( + x, + alpha, + ) = ctx.saved_tensors # Weight gradient is only valid when [0, alpha] # Actual gradient for alpha, # By applying Chain Rule, we get dL / dy_q * dy_q / dy * dy / dalpha # dL / dy_q = argument, 
dy_q / dy * dy / dalpha = 0, 1 with x value range - lower_bound = x < 0 - upper_bound = x > alpha + lower_bound = x < 0 + upper_bound = x > alpha # x_range = 1.0-lower_bound-upper_bound - x_range = ~(lower_bound|upper_bound) + x_range = ~(lower_bound | upper_bound) grad_alpha = torch.sum(dLdy_q * torch.ge(x, alpha).float()).view(-1) return dLdy_q * x_range.float(), grad_alpha, None - + + def get_activation(name="silu", inplace=True): - #name = 'relu' + # name = 'relu' name = name.lower() if name == "silu": module = nn.SiLU(inplace=inplace) @@ -57,9 +64,9 @@ def get_activation(name="silu", inplace=True): module = nn.ReLU(inplace=inplace) elif name == "lrelu": module = nn.LeakyReLU(0.1, inplace=inplace) - elif name == "pact": + elif name == "pact": module = Pact.apply - elif name == "pactfix": + elif name == "pactfix": module = PactFix() elif name == "relu6": module = nn.ReLU6() @@ -67,14 +74,15 @@ def get_activation(name="silu", inplace=True): raise AttributeError("Unsupported act type: {}".format(name)) return module + K = 2 -#fisrt = True + + +# fisrt = True class BaseConv(nn.Module): """A Conv2d -> Batchnorm -> silu/leaky relu block""" - def __init__( - self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" - ): + def __init__(self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"): super().__init__() # same padding pad = (ksize - 1) // 2 @@ -90,15 +98,14 @@ def __init__( self.bn = nn.BatchNorm2d(out_channels) self.act = get_activation(act, inplace=True) self.act_name = act - if (self.act_name == "pact") : - self.alpha = nn.Parameter(torch.tensor(20.)) + if self.act_name == "pact": + self.alpha = nn.Parameter(torch.tensor(20.0)) - def forward(self, x): - if (self.act_name == "pact") : + if self.act_name == "pact": return self.act(self.bn(self.conv(x)), self.alpha, 2) else: - #print(self.conv) + # print(self.conv) return self.act(self.bn(self.conv(x))) def fuseforward(self, x): @@ -118,9 +125,7 @@ def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): groups=in_channels, act=act, ) - self.pconv = BaseConv( - in_channels, out_channels, ksize=1, stride=1, groups=1, act=act - ) + self.pconv = BaseConv(in_channels, out_channels, ksize=1, stride=1, groups=1, act=act) def forward(self, x): x = self.dconv(x) @@ -129,16 +134,7 @@ def forward(self, x): class Bottleneck(nn.Module): # Standard bottleneck - def __init__( - self, - in_channels, - out_channels, - shortcut=True, - expansion=0.5, - depthwise=False, - act="silu", - kernel=3 - ): + def __init__(self, in_channels, out_channels, shortcut=True, expansion=0.5, depthwise=False, act="silu", kernel=3): super().__init__() hidden_channels = int(out_channels * expansion) Conv = DWConv if depthwise else BaseConv @@ -179,7 +175,6 @@ def forward(self, x): return y - class Bottleneck_EFF(nn.Module): # Standard bottleneck def __init__( @@ -190,7 +185,7 @@ def __init__( expansion=0.5, depthwise=False, act="silu", - kernel=3, + kernel=3, ): super().__init__() hidden_channels = int(out_channels * expansion) @@ -203,7 +198,7 @@ def forward(self, x): y = self.conv2(self.conv1(x)) if self.use_add: y = y + x - return y + return y class ResLayer(nn.Module): @@ -212,12 +207,8 @@ class ResLayer(nn.Module): def __init__(self, in_channels: int): super().__init__() mid_channels = in_channels // 2 - self.layer1 = BaseConv( - in_channels, mid_channels, ksize=1, stride=1, act="lrelu" - ) - self.layer2 = BaseConv( - mid_channels, in_channels, ksize=3, stride=1, act="lrelu" - ) + self.layer1 = 
BaseConv(in_channels, mid_channels, ksize=1, stride=1, act="lrelu") + self.layer2 = BaseConv(mid_channels, in_channels, ksize=3, stride=1, act="lrelu") def forward(self, x): out = self.layer2(self.layer1(x)) @@ -227,18 +218,11 @@ def forward(self, x): class SPPBottleneck(nn.Module): """Spatial pyramid pooling layer used in YOLOv3-SPP""" - def __init__( - self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" - ): + def __init__(self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"): super().__init__() hidden_channels = in_channels // 2 self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) - self.m = nn.ModuleList( - [ - nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) - for ks in kernel_sizes - ] - ) + self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes]) conv2_channels = hidden_channels * (len(kernel_sizes) + 1) self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) @@ -277,20 +261,16 @@ def __init__( self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) self.elan = elan - if (self.elan == True) : - self.conv3 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act) + if self.elan == True: + self.conv3 = BaseConv((n + 1) * hidden_channels, out_channels, 1, stride=1, act=act) module_list = [ - BottleneckV8( - hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel - ) + BottleneckV8(hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel) for _ in range(n) - ] + ] else: self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act) module_list = [ - Bottleneck( - hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel - ) + Bottleneck(hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel) for _ in range(n) ] self.m = nn.Sequential(*module_list) @@ -299,17 +279,18 @@ def forward(self, x): x_1 = self.conv1(x) x_2 = self.conv2(x) el = [] - if (self.elan == True) : + if self.elan == True: x = x_1 for m in self.m: x = m(x) el.append(x) x = torch.cat([x_2] + [m for m in el], dim=1) - else : + else: x_1 = self.m(x_1) x = torch.cat((x_1, x_2), dim=1) return self.conv3(x) + class CSPLayer_EFF(nn.Module): """C3 in yolov5, CSP Bottleneck with 3 convolutions""" @@ -319,12 +300,12 @@ def __init__( out_channels, n=1, shortcut=True, - #expansion=0.5, + # expansion=0.5, expansion=1.0, depthwise=False, act="silu", kernel=3, - elan=False, + elan=False, ): """ Args: @@ -339,21 +320,16 @@ def __init__( self.conv2 = BaseConv(in_channels, hidden_channels, 3, stride=1, act=act) self.elan = elan - if (self.elan == True) : - self.conv3 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act) + if self.elan == True: + self.conv3 = BaseConv((n + 1) * hidden_channels, out_channels, 1, stride=1, act=act) module_list = [ - BottleneckV8( - hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act - ) - for _ in range(n) - ] - else : + BottleneckV8(hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act) for _ in range(n) + ] + else: self.conv3 = BaseConv(2 * hidden_channels, out_channels, 3, stride=1, act=act) module_list = [ - Bottleneck_EFF( - hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel - ) + Bottleneck_EFF(hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act, kernel=kernel) for _ in range(n) ] self.m = 
nn.Sequential(*module_list) @@ -362,13 +338,13 @@ def forward(self, x): x_1 = self.conv1(x) x_2 = self.conv2(x) el = [] - if (self.elan == True) : + if self.elan == True: x = x_1 for m in self.m: x = m(x) el.append(x) x = torch.cat([x_2] + [m for m in el], dim=1) - else : + else: x_1 = self.m(x_1) x = torch.cat((x_1, x_2), dim=1) return self.conv3(x) @@ -383,7 +359,7 @@ def __init__( out_channels, n=3, shortcut=True, - #expansion=0.5, + # expansion=0.5, expansion=1.0, depthwise=False, act="silu", @@ -398,15 +374,11 @@ def __init__( # ch_in, ch_out, number, shortcut, groups, expansion super().__init__() hidden_channels = int(out_channels * expansion) # hidden channels - self.conv1 = BaseConv(hidden_channels, out_channels, 3, stride=1, act=act) + self.conv1 = BaseConv(hidden_channels, out_channels, 3, stride=1, act=act) self.conv_c = BaseConv(in_channels, hidden_channels, kernel, stride=1, act=act) - self.conv2 = BaseConv((n+1) * hidden_channels, out_channels, 1, stride=1, act=act) - + self.conv2 = BaseConv((n + 1) * hidden_channels, out_channels, 1, stride=1, act=act) - module_list = [ - self.conv_c - for _ in range(n) - ] + module_list = [self.conv_c for _ in range(n)] self.m = nn.Sequential(*module_list) def forward(self, x): @@ -419,7 +391,6 @@ def forward(self, x): return self.conv2(x) - class Focus(nn.Module): """Focus width and height information into channel space.""" @@ -445,22 +416,20 @@ def forward(self, x): return self.conv(x) - class SimpleStem(nn.Module): """Simple Stem for Acceleration on Embedded Devices""" def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): super().__init__() - #self.conv1 = BaseConv(in_channels, out_channels, ksize, stride, act=act) - #self.down1 = BaseConv(out_channels, out_channels, ksize, 2, act=act) - self.down1 = BaseConv(in_channels, out_channels, ksize, 2, act=act) + # self.conv1 = BaseConv(in_channels, out_channels, ksize, stride, act=act) + # self.down1 = BaseConv(out_channels, out_channels, ksize, 2, act=act) + self.down1 = BaseConv(in_channels, out_channels, ksize, 2, act=act) self.conv2 = BaseConv(out_channels, out_channels, ksize, stride, act=act) - #self.down2 = BaseConv(out_channels, out_channels, ksize, 2, act=act) + # self.down2 = BaseConv(out_channels, out_channels, ksize, 2, act=act) def forward(self, x): - #x = self.conv1(x) + # x = self.conv1(x) x = self.down1(x) x = self.conv2(x) - #x = self.down2(x) + # x = self.down2(x) return x - diff --git a/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py b/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py index 5e7ac8746..ac52e9cf1 100644 --- a/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py +++ b/projects/YOLOX_opt_elan/yolox/models/yolox_multitask.py @@ -1,20 +1,17 @@ import torch import torch.nn as nn import torch.nn.functional as F - -from torch import Tensor - -from mmengine.model import BaseModule -from mmdet.registry import MODELS from mmdet.models import BaseDetector +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample, SampleList from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig -from mmdet.structures import DetDataSample -from mmengine.structures import PixelData, InstanceData -from mmdet.structures import SampleList +from mmengine.logging import print_log +from mmengine.model import BaseModule +from mmengine.structures import InstanceData, PixelData +from torch import Tensor from .heads import YOLOXSegHead -from mmengine.logging import print_log @MODELS.register_module() class 
YOLOXMultiTask(BaseDetector): @@ -23,16 +20,18 @@ class YOLOXMultiTask(BaseDetector): Supports bbox + mask heads. """ - def __init__(self, - backbone, - neck, - bbox_head, - mask_head=None, - train_cfg: OptConfigType = None, - test_cfg: OptConfigType = None, - data_preprocessor=None, - init_cfg=None, - **kwargs): + def __init__( + self, + backbone, + neck, + bbox_head, + mask_head=None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor=None, + init_cfg=None, + **kwargs, + ): super().__init__(init_cfg=init_cfg) self.backbone = MODELS.build(backbone) self.neck = MODELS.build(neck) if neck is not None else None @@ -54,7 +53,7 @@ def extract_feat(self, inputs): def _forward(self, imgs, **kwargs): return self.forward(imgs, **kwargs) - + def forward_train(self, imgs, gt_bboxes, gt_labels, gt_masks=None, **kwargs): feats = self.extract_feat(imgs) losses = dict() @@ -72,21 +71,21 @@ def forward_test(self, imgs, **kwargs): mask_results = self.mask_head(feats) return dict(bboxes=bbox_results, masks=mask_results) - def forward(self, inputs, data_samples=None, mode='tensor'): + def forward(self, inputs, data_samples=None, mode="tensor"): """Forward function with training and testing mode.""" feats = self.extract_feat(inputs) - if mode == 'tensor': + if mode == "tensor": return self.bbox_head(feats) - elif mode == 'loss': + elif mode == "loss": s = self.loss(feats, data_samples) - return s - elif mode == 'predict': + return s + elif mode == "predict": pred_instances = self.predict(inputs, data_samples) - + for pred, data_sample in zip(pred_instances, data_samples): pred.gt_instances = data_sample.gt_instances - if hasattr(data_sample, 'gt_sem_seg'): + if hasattr(data_sample, "gt_sem_seg"): pred.gt_sem_seg = data_sample.gt_sem_seg return pred_instances @@ -101,9 +100,7 @@ def loss(self, feats, data_samples): batch_img_metas = [d.metainfo for d in data_samples] loss.update( - self.bbox_head.loss_by_feat( - cls_scores, bbox_preds, objectnesses, batch_gt_instances, batch_img_metas - ) + self.bbox_head.loss_by_feat(cls_scores, bbox_preds, objectnesses, batch_gt_instances, batch_img_metas) ) # mask head @@ -111,7 +108,7 @@ def loss(self, feats, data_samples): seg_pred = self.mask_head(feats) target_size = data_samples[0].gt_sem_seg.sem_seg.shape[-2:] if seg_pred.shape[-2:] != target_size: - seg_pred = F.interpolate(seg_pred, size=target_size, mode='bilinear', align_corners=False) + seg_pred = F.interpolate(seg_pred, size=target_size, mode="bilinear", align_corners=False) gt_masks_tensor = [] gt_masks = torch.stack([d.gt_sem_seg.sem_seg.squeeze(0) for d in data_samples], dim=0) # (B, H, W) @@ -126,14 +123,12 @@ def loss(self, feats, data_samples): return loss - def predict(self, - batch_inputs: Tensor, - batch_data_samples: SampleList, - rescale: bool = True, - **kwargs) -> SampleList: - + def predict( + self, batch_inputs: Tensor, batch_data_samples: SampleList, rescale: bool = True, **kwargs + ) -> SampleList: + x = self.extract_feat(batch_inputs) - + if self.with_bbox: bbox_results_list = self.bbox_head.predict(x, batch_data_samples, rescale=True) else: @@ -146,42 +141,44 @@ def predict(self, results = [] for i, data_sample in enumerate(batch_data_samples): data_sample.pred_instances = bbox_results_list[i] - + if seg_results_list is not None: pixel_data = PixelData() pixel_data.data = seg_results_list[i] pixel_data.sem_seg = seg_results_list[i] data_sample.pred_sem_seg = pixel_data - - img_h, img_w = data_sample.metainfo['img_shape'] - ori_h, ori_w = 
data_sample.metainfo['ori_shape'] - - if hasattr(data_sample, 'gt_instances'): - scale_factor = data_sample.metainfo['scale_factor'] # (w_scale, h_scale) + img_h, img_w = data_sample.metainfo["img_shape"] + ori_h, ori_w = data_sample.metainfo["ori_shape"] + + if hasattr(data_sample, "gt_instances"): + + scale_factor = data_sample.metainfo["scale_factor"] # (w_scale, h_scale) scale_factor_bbox = [scale_factor[0], scale_factor[1], scale_factor[0], scale_factor[1]] scale_tensor = data_sample.gt_instances.bboxes.new_tensor(scale_factor_bbox) data_sample.gt_instances.bboxes = data_sample.gt_instances.bboxes / scale_tensor - if hasattr(data_sample, 'gt_sem_seg') and data_sample.gt_sem_seg is not None: - gt_sem_seg_data = data_sample.gt_sem_seg.sem_seg # [H_pad, W_pad] - - gt_valid = gt_sem_seg_data[..., :img_h, :img_w] + if hasattr(data_sample, "gt_sem_seg") and data_sample.gt_sem_seg is not None: + gt_sem_seg_data = data_sample.gt_sem_seg.sem_seg # [H_pad, W_pad] + + gt_valid = gt_sem_seg_data[..., :img_h, :img_w] if gt_valid.shape[-2:] != (ori_h, ori_w): - gt_resized = F.interpolate( - gt_valid.unsqueeze(0).float(), # [1, 1, h, w] - size=(ori_h, ori_w), - mode='nearest' - ).squeeze(0).long() - + gt_resized = ( + F.interpolate( + gt_valid.unsqueeze(0).float(), size=(ori_h, ori_w), mode="nearest" # [1, 1, h, w] + ) + .squeeze(0) + .long() + ) + new_gt_pixel_data = PixelData() new_gt_pixel_data.sem_seg = gt_resized new_gt_pixel_data.data = gt_resized data_sample.gt_sem_seg = new_gt_pixel_data - elif 'data' not in data_sample.gt_sem_seg: + elif "data" not in data_sample.gt_sem_seg: data_sample.gt_sem_seg.data = data_sample.gt_sem_seg.sem_seg results.append(data_sample) diff --git a/projects/YOLOX_opt_elan/yolox/transforms.py b/projects/YOLOX_opt_elan/yolox/transforms.py index 939c57102..3aad09cff 100644 --- a/projects/YOLOX_opt_elan/yolox/transforms.py +++ b/projects/YOLOX_opt_elan/yolox/transforms.py @@ -1,9 +1,10 @@ +import os.path as osp + import torch import torch.nn.functional as F -from mmdet.registry import TRANSFORMS from mmcv.transforms import BaseTransform +from mmdet.registry import TRANSFORMS -import os.path as osp @TRANSFORMS.register_module() class ResizeSegMask: @@ -11,27 +12,28 @@ def __init__(self, size): self.size = size # (H_out, W_out) def __call__(self, results): - if 'gt_seg_map' in results and results['gt_seg_map'] is not None: - seg = results['gt_seg_map'] # numpy array (H, W) + if "gt_seg_map" in results and results["gt_seg_map"] is not None: + seg = results["gt_seg_map"] # numpy array (H, W) seg = torch.from_numpy(seg).unsqueeze(0).unsqueeze(0).float() # (1,1,H,W) - seg = F.interpolate(seg, size=self.size, mode='nearest') - results['gt_seg_map'] = seg.squeeze(0).squeeze(0).long().numpy() # back to (H_out,W_out) + seg = F.interpolate(seg, size=self.size, mode="nearest") + results["gt_seg_map"] = seg.squeeze(0).squeeze(0).long().numpy() # back to (H_out,W_out) return results + @TRANSFORMS.register_module() class FixCityscapesPath(BaseTransform): - def __init__(self, data_root, split='train'): + def __init__(self, data_root, split="train"): self.data_root = data_root self.split = split def transform(self, results): - img_path = results['img_path'] + img_path = results["img_path"] filename = osp.basename(img_path) - - seg_filename = filename.replace('_leftImg8bit.png', '_gtFine_labelTrainIds.png') - city = filename.split('_')[0] - seg_path = osp.join(self.data_root, 'gtFine', self.split, city, seg_filename) - - results['seg_map_path'] = seg_path - + + seg_filename = 
filename.replace("_leftImg8bit.png", "_gtFine_labelTrainIds.png") + city = filename.split("_")[0] + seg_path = osp.join(self.data_root, "gtFine", self.split, city, seg_filename) + + results["seg_map_path"] = seg_path + return results