diff --git a/visdet/cv/cnn/bricks/activation.py b/visdet/cv/cnn/bricks/activation.py index 0640af16..9a250e2a 100644 --- a/visdet/cv/cnn/bricks/activation.py +++ b/visdet/cv/cnn/bricks/activation.py @@ -45,12 +45,15 @@ class Clamp(nn.Module): Default to 1. """ + min: float + max: float + def __init__(self, min: float = -1.0, max: float = 1.0): super().__init__() - self.min = min - self.max = max + object.__setattr__(self, "min", min) + object.__setattr__(self, "max", max) - def forward(self, x) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward function. Args: diff --git a/visdet/cv/cnn/bricks/conv_module.py b/visdet/cv/cnn/bricks/conv_module.py index ae7ba541..8b64a540 100644 --- a/visdet/cv/cnn/bricks/conv_module.py +++ b/visdet/cv/cnn/bricks/conv_module.py @@ -1,10 +1,15 @@ +from __future__ import annotations + # Copyright (c) OpenMMLab. All rights reserved. import warnings from functools import partial +from typing import TYPE_CHECKING, Any, Callable, cast import torch import torch.nn as nn +from torch import Tensor from torch.nn.modules.batchnorm import _BatchNorm +from torch.nn.modules.conv import _ConvNd from torch.nn.modules.instancenorm import _InstanceNorm from visdet.cv.cnn.bricks.activation import build_activation_layer @@ -14,8 +19,10 @@ from visdet.engine.model import constant_init, kaiming_init from visdet.engine.registry import MODELS +EfficientConvBnEvalForward = Callable[[_BatchNorm, _ConvNd, Tensor], Tensor] + -def efficient_conv_bn_eval_forward(bn: _BatchNorm, conv: nn.modules.conv._ConvNd, x: torch.Tensor): +def efficient_conv_bn_eval_forward(bn: _BatchNorm, conv: _ConvNd, x: Tensor) -> Tensor: """ Implementation based on https://arxiv.org/abs/2305.11624 "Tune-Mode ConvBN Blocks For Efficient Transfer Learning" @@ -31,31 +38,37 @@ def efficient_conv_bn_eval_forward(bn: _BatchNorm, conv: nn.modules.conv._ConvNd """ # These lines of code are designed to deal with various cases # like bn without affine transform, and 
conv without bias + running_var = bn.running_var + running_mean = bn.running_mean + if running_var is None or running_mean is None: + msg = "BatchNorm running stats must exist when efficient_conv_bn_eval_forward is enabled" + raise RuntimeError(msg) + weight_on_the_fly = conv.weight if conv.bias is not None: bias_on_the_fly = conv.bias else: - bias_on_the_fly = torch.zeros_like(bn.running_var) + bias_on_the_fly = torch.zeros_like(running_var) if bn.weight is not None: bn_weight = bn.weight else: - bn_weight = torch.ones_like(bn.running_var) + bn_weight = torch.ones_like(running_var) if bn.bias is not None: bn_bias = bn.bias else: - bn_bias = torch.zeros_like(bn.running_var) + bn_bias = torch.zeros_like(running_var) # shape of [C_out, 1, 1, 1] in Conv2d - weight_coeff = torch.rsqrt(bn.running_var + bn.eps).reshape([-1] + [1] * (len(conv.weight.shape) - 1)) + weight_coeff = torch.rsqrt(running_var + bn.eps).reshape([-1] + [1] * (len(conv.weight.shape) - 1)) # shape of [C_out, 1, 1, 1] in Conv2d coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff # shape of [C_out, C_in, k, k] in Conv2d weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly # shape of [C_out] in Conv2d - bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() * (bias_on_the_fly - bn.running_mean) + bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() * (bias_on_the_fly - running_mean) return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly) @@ -117,6 +130,15 @@ class ConvModule(nn.Module): """ _abbr_ = "conv_block" + conv_cfg: dict[str, Any] | None + norm_cfg: dict[str, Any] | None + act_cfg: dict[str, Any] | None + order: tuple[str, str, str] + padding_layer: nn.Module | None + activate: nn.Module | None + efficient_conv_bn_eval_forward: EfficientConvBnEvalForward | None + norm_name: str | None + conv: _ConvNd def __init__( self, @@ -142,22 +164,26 @@ def __init__( assert norm_cfg is None or isinstance(norm_cfg, dict) assert act_cfg is None or isinstance(act_cfg, dict) 
official_padding_mode = ["zeros", "circular"] - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.inplace = inplace - self.with_spectral_norm = with_spectral_norm - self.with_explicit_padding = padding_mode not in official_padding_mode - self.order = order + # Store config dicts as attributes - these are simple data, not tensors + object.__setattr__(self, "conv_cfg", conv_cfg) + object.__setattr__(self, "norm_cfg", norm_cfg) + object.__setattr__(self, "act_cfg", act_cfg) + object.__setattr__(self, "inplace", inplace) + object.__setattr__(self, "with_spectral_norm", with_spectral_norm) + object.__setattr__(self, "with_explicit_padding", padding_mode not in official_padding_mode) + object.__setattr__(self, "order", order) assert isinstance(self.order, tuple) and len(self.order) == 3 assert set(order) == {"conv", "norm", "act"} - self.with_norm = norm_cfg is not None - self.with_activation = act_cfg is not None + object.__setattr__(self, "with_norm", norm_cfg is not None) + object.__setattr__(self, "with_activation", act_cfg is not None) + self.padding_layer: nn.Module | None = None + self.activate: nn.Module | None = None + object.__setattr__(self, "efficient_conv_bn_eval_forward", None) # if the conv layer is before a norm layer, bias is unnecessary. 
if bias == "auto": bias = not self.with_norm - self.with_bias = bias + object.__setattr__(self, "with_bias", bias) if self.with_explicit_padding: pad_cfg = dict(type=padding_mode) @@ -166,7 +192,7 @@ def __init__( # reset padding to 0 for conv module conv_padding = 0 if self.with_explicit_padding else padding # build convolution layer - self.conv = build_conv_layer( + conv_layer = build_conv_layer( conv_cfg, in_channels, out_channels, @@ -177,16 +203,17 @@ def __init__( groups=groups, bias=bias, ) + self.conv = cast(_ConvNd, conv_layer) # export the attributes of self.conv to a higher level for convenience - self.in_channels = self.conv.in_channels - self.out_channels = self.conv.out_channels - self.kernel_size = self.conv.kernel_size - self.stride = self.conv.stride - self.padding = padding - self.dilation = self.conv.dilation - self.transposed = self.conv.transposed - self.output_padding = self.conv.output_padding - self.groups = self.conv.groups + object.__setattr__(self, "in_channels", self.conv.in_channels) + object.__setattr__(self, "out_channels", self.conv.out_channels) + object.__setattr__(self, "kernel_size", self.conv.kernel_size) + object.__setattr__(self, "stride", self.conv.stride) + object.__setattr__(self, "padding", padding) + object.__setattr__(self, "dilation", self.conv.dilation) + object.__setattr__(self, "transposed", self.conv.transposed) + object.__setattr__(self, "output_padding", self.conv.output_padding) + object.__setattr__(self, "groups", self.conv.groups) if self.with_spectral_norm: self.conv = nn.utils.spectral_norm(self.conv) @@ -198,19 +225,22 @@ def __init__( norm_channels = out_channels else: norm_channels = in_channels - self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) # type: ignore - self.add_module(self.norm_name, norm) + assert norm_cfg is not None + norm_name, norm = build_norm_layer(norm_cfg, norm_channels) + object.__setattr__(self, "norm_name", norm_name) + self.add_module(norm_name, norm) if 
self.with_bias: if isinstance(norm, (_BatchNorm, _InstanceNorm)): warnings.warn("Unnecessary conv bias before batch/instance norm") else: - self.norm_name = None # type: ignore + object.__setattr__(self, "norm_name", None) self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) # build activation layer if self.with_activation: - act_cfg_ = act_cfg.copy() # type: ignore + assert act_cfg is not None + act_cfg_ = cast(dict[str, Any], act_cfg.copy()) # nn.Tanh has no 'inplace' argument if act_cfg_["type"] not in [ "Tanh", @@ -227,7 +257,7 @@ def __init__( self.init_weights() @property - def norm(self): + def norm(self) -> nn.Module | None: if self.norm_name: return getattr(self, self.norm_name) else: @@ -244,7 +274,7 @@ def init_weights(self): # Note: For PyTorch's conv layers, they will be overwritten by our # initialization implementation using default ``kaiming_init``. if not hasattr(self.conv, "init_weights"): - if self.with_activation and self.act_cfg["type"] == "LeakyReLU": + if self.with_activation and self.act_cfg is not None and self.act_cfg["type"] == "LeakyReLU": nonlinearity = "leaky_relu" a = self.act_cfg.get("negative_slope", 0.01) else: @@ -252,7 +282,9 @@ def init_weights(self): a = 0 kaiming_init(self.conv, a=a, nonlinearity=nonlinearity) if self.with_norm: - constant_init(self.norm, 1, bias=0) + norm_layer = self.norm + if norm_layer is not None: + constant_init(norm_layer, 1, bias=0) def forward(self, x: torch.Tensor, activate: bool = True, norm: bool = True) -> torch.Tensor: layer_index = 0 @@ -260,78 +292,95 @@ def forward(self, x: torch.Tensor, activate: bool = True, norm: bool = True) -> layer = self.order[layer_index] if layer == "conv": if self.with_explicit_padding: + if self.padding_layer is None: + raise RuntimeError("Padding layer is not initialized") x = self.padding_layer(x) # if the next operation is norm and we have a norm layer in # eval mode and we have enabled `efficient_conv_bn_eval` for # the conv operator, then activate the 
optimized forward and # skip the next norm operator since it has been fused + norm_layer = self.norm if ( layer_index + 1 < len(self.order) and self.order[layer_index + 1] == "norm" and norm and self.with_norm - and not self.norm.training + and norm_layer is not None + and not norm_layer.training and self.efficient_conv_bn_eval_forward is not None ): - self.conv.forward = partial(self.efficient_conv_bn_eval_forward, self.norm, self.conv) + bn_module = cast(_BatchNorm, norm_layer) + self.conv.forward = partial(self.efficient_conv_bn_eval_forward, bn_module, self.conv) # type: ignore[method-assign] layer_index += 1 x = self.conv(x) - del self.conv.forward + del self.conv.forward # type: ignore[attr-defined] else: x = self.conv(x) elif layer == "norm" and norm and self.with_norm: - x = self.norm(x) + norm_layer = self.norm + if norm_layer is None: + raise RuntimeError("Norm layer not initialized") + x = norm_layer(x) elif layer == "act" and activate and self.with_activation: + if self.activate is None: + raise RuntimeError("Activation layer not initialized") x = self.activate(x) layer_index += 1 return x - def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True): + def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval: bool = True) -> None: # efficient_conv_bn_eval works for conv + bn # with `track_running_stats` option - if efficient_conv_bn_eval and self.norm and isinstance(self.norm, _BatchNorm) and self.norm.track_running_stats: - self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward + norm_layer = self.norm + if ( + efficient_conv_bn_eval + and norm_layer is not None + and isinstance(norm_layer, _BatchNorm) + and norm_layer.track_running_stats + ): + object.__setattr__(self, "efficient_conv_bn_eval_forward", efficient_conv_bn_eval_forward) else: - self.efficient_conv_bn_eval_forward = None # type: ignore + object.__setattr__(self, "efficient_conv_bn_eval_forward", None) @staticmethod def create_from_conv_bn( - conv: 
torch.nn.modules.conv._ConvNd, - bn: torch.nn.modules.batchnorm._BatchNorm, - efficient_conv_bn_eval=True, + conv: _ConvNd, + bn: _BatchNorm, + efficient_conv_bn_eval: bool = True, ) -> "ConvModule": """Create a ConvModule from a conv and a bn module.""" self = ConvModule.__new__(ConvModule) super(ConvModule, self).__init__() - self.conv_cfg = None - self.norm_cfg = None - self.act_cfg = None - self.inplace = False - self.with_spectral_norm = False - self.with_explicit_padding = False - self.order = ("conv", "norm", "act") + object.__setattr__(self, "conv_cfg", None) + object.__setattr__(self, "norm_cfg", None) + object.__setattr__(self, "act_cfg", None) + object.__setattr__(self, "inplace", False) + object.__setattr__(self, "with_spectral_norm", False) + object.__setattr__(self, "with_explicit_padding", False) + object.__setattr__(self, "order", ("conv", "norm", "act")) - self.with_norm = True - self.with_activation = False - self.with_bias = conv.bias is not None + object.__setattr__(self, "with_norm", True) + object.__setattr__(self, "with_activation", False) + object.__setattr__(self, "with_bias", conv.bias is not None) # build convolution layer self.conv = conv # export the attributes of self.conv to a higher level for convenience - self.in_channels = self.conv.in_channels - self.out_channels = self.conv.out_channels - self.kernel_size = self.conv.kernel_size - self.stride = self.conv.stride - self.padding = self.conv.padding - self.dilation = self.conv.dilation - self.transposed = self.conv.transposed - self.output_padding = self.conv.output_padding - self.groups = self.conv.groups + object.__setattr__(self, "in_channels", self.conv.in_channels) + object.__setattr__(self, "out_channels", self.conv.out_channels) + object.__setattr__(self, "kernel_size", self.conv.kernel_size) + object.__setattr__(self, "stride", self.conv.stride) + object.__setattr__(self, "padding", self.conv.padding) + object.__setattr__(self, "dilation", self.conv.dilation) + 
object.__setattr__(self, "transposed", self.conv.transposed) + object.__setattr__(self, "output_padding", self.conv.output_padding) + object.__setattr__(self, "groups", self.conv.groups) # build normalization layers - self.norm_name, norm = "bn", bn - self.add_module(self.norm_name, norm) + norm_name: str = "bn" + object.__setattr__(self, "norm_name", norm_name) + self.add_module(norm_name, bn) self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) diff --git a/visdet/cv/cnn/bricks/drop.py b/visdet/cv/cnn/bricks/drop.py index 7938aa22..b9b7642e 100644 --- a/visdet/cv/cnn/bricks/drop.py +++ b/visdet/cv/cnn/bricks/drop.py @@ -37,9 +37,11 @@ class DropPath(nn.Module): drop_prob (float): Probability of the path to be zeroed. Default: 0.1 """ + drop_prob: float + def __init__(self, drop_prob: float = 0.1): super().__init__() - self.drop_prob = drop_prob + object.__setattr__(self, "drop_prob", drop_prob) def forward(self, x: torch.Tensor) -> torch.Tensor: return drop_path(x, self.drop_prob, self.training) @@ -61,10 +63,14 @@ def __init__(self, drop_prob: float = 0.5, inplace: bool = False): super().__init__(p=drop_prob, inplace=inplace) -def build_dropout(cfg: dict | float | None, default_args: dict | None = None) -> Any: +def build_dropout(cfg: dict[str, Any] | float | None, default_args: dict | None = None) -> Any: """Builder for drop out layers.""" if cfg is None: return None if isinstance(cfg, float): - cfg = dict(type="Dropout", drop_prob=cfg) - return MODELS.build(cfg, default_args=default_args) + cfg_dict: dict[str, Any] = dict(type="Dropout", drop_prob=cfg) + else: + if not isinstance(cfg, dict): + raise TypeError(f"cfg must be dict or float, but got {type(cfg)!r}") + cfg_dict = cfg + return MODELS.build(cfg_dict, default_args=default_args) diff --git a/visdet/cv/cnn/bricks/hsigmoid.py b/visdet/cv/cnn/bricks/hsigmoid.py index 949a4cfc..cef6e1ed 100644 --- a/visdet/cv/cnn/bricks/hsigmoid.py +++ b/visdet/cv/cnn/bricks/hsigmoid.py @@ -27,6 +27,11 @@ class 
HSigmoid(nn.Module): Tensor: The output tensor. """ + bias: float + divisor: float + min_value: float + max_value: float + def __init__( self, bias: float = 3.0, @@ -43,11 +48,11 @@ def __init__( "Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).", stacklevel=2, ) - self.bias = bias - self.divisor = divisor + object.__setattr__(self, "bias", bias) + object.__setattr__(self, "divisor", divisor) assert self.divisor != 0 - self.min_value = min_value - self.max_value = max_value + object.__setattr__(self, "min_value", min_value) + object.__setattr__(self, "max_value", max_value) def forward(self, x: torch.Tensor) -> torch.Tensor: x = (x + self.bias) / self.divisor diff --git a/visdet/cv/cnn/bricks/scale.py b/visdet/cv/cnn/bricks/scale.py index e708786c..c805f3bb 100644 --- a/visdet/cv/cnn/bricks/scale.py +++ b/visdet/cv/cnn/bricks/scale.py @@ -34,6 +34,9 @@ class LayerScale(nn.Module): scale (float): Initial value of scale factor. Default: 1.0 """ + inplace: bool + data_format: str + def __init__( self, dim: int, @@ -45,8 +48,8 @@ def __init__( assert data_format in ("channels_last", "channels_first"), ( "'data_format' could only be channels_last or channels_first." ) - self.inplace = inplace - self.data_format = data_format + object.__setattr__(self, "inplace", inplace) + object.__setattr__(self, "data_format", data_format) self.weight = nn.Parameter(torch.ones(dim) * scale) def forward(self, x) -> torch.Tensor: diff --git a/visdet/cv/cnn/bricks/transformer.py b/visdet/cv/cnn/bricks/transformer.py index 3de2b43f..178e5166 100644 --- a/visdet/cv/cnn/bricks/transformer.py +++ b/visdet/cv/cnn/bricks/transformer.py @@ -1,8 +1,11 @@ +from __future__ import annotations + # Copyright (c) OpenMMLab. All rights reserved. 
import copy import math import warnings from collections.abc import Sequence +from typing import Any, Iterable, cast import torch import torch.nn as nn @@ -20,6 +23,10 @@ from visdet.engine.utils import deprecated_api_warning, to_2tuple +def _tuple2(value: int | tuple[int, int] | Iterable[int]) -> tuple[int, int]: + return cast(tuple[int, int], to_2tuple(value)) + + def build_positional_encoding(cfg, default_args=None): """Builder for Position Encoding.""" return MODELS.build(cfg, default_args=default_args) @@ -80,18 +87,24 @@ class AdaptivePadding(nn.Module): >>> assert (out.shape[2], out.shape[3]) == (16, 32) """ - def __init__(self, kernel_size=1, stride=1, dilation=1, padding="corner"): + def __init__( + self, + kernel_size: int | tuple[int, int] = 1, + stride: int | tuple[int, int] = 1, + dilation: int | tuple[int, int] = 1, + padding: str = "corner", + ): super().__init__() assert padding in ("same", "corner") - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) + kernel_tuple = _tuple2(kernel_size) + stride_tuple = _tuple2(stride) + dilation_tuple = _tuple2(dilation) - self.padding = padding - self.kernel_size = kernel_size - self.stride = stride - self.dilation = dilation + object.__setattr__(self, "padding", padding) + object.__setattr__(self, "kernel_size", kernel_tuple) + object.__setattr__(self, "stride", stride_tuple) + object.__setattr__(self, "dilation", dilation_tuple) def get_pad_shape(self, input_shape): """Calculate the padding size of input. 
@@ -104,16 +117,17 @@ def get_pad_shape(self, input_shape): original H and W directions """ input_h, input_w = input_shape - kernel_h, kernel_w = self.kernel_size - stride_h, stride_w = self.stride + kernel_h, kernel_w = cast(tuple[int, int], self.kernel_size) + stride_h, stride_w = cast(tuple[int, int], self.stride) + dilation = cast(tuple[int, int], self.dilation) output_h = math.ceil(input_h / stride_h) output_w = math.ceil(input_w / stride_w) pad_h = max( - (output_h - 1) * stride_h + (kernel_h - 1) * self.dilation[0] + 1 - input_h, + (output_h - 1) * stride_h + (kernel_h - 1) * dilation[0] + 1 - input_h, 0, ) pad_w = max( - (output_w - 1) * stride_w + (kernel_w - 1) * self.dilation[1] + 1 - input_w, + (output_w - 1) * stride_w + (kernel_w - 1) * dilation[1] + 1 - input_w, 0, ) return pad_h, pad_w @@ -166,77 +180,81 @@ class PatchEmbed(BaseModule): def __init__( self, - in_channels=3, - embed_dims=768, - conv_type="Conv2d", - kernel_size=16, - stride=16, - padding="corner", - dilation=1, - bias=True, - norm_cfg=None, - input_size=None, - init_cfg=None, + in_channels: int = 3, + embed_dims: int = 768, + conv_type: str = "Conv2d", + kernel_size: int | tuple[int, int] = 16, + stride: int | tuple[int, int] | None = 16, + padding: int | tuple[int, int] | str = "corner", + dilation: int | tuple[int, int] = 1, + bias: bool = True, + norm_cfg: dict | None = None, + input_size: int | tuple[int, int] | None = None, + init_cfg: dict | None = None, ): super().__init__(init_cfg=init_cfg) - self.embed_dims = embed_dims + object.__setattr__(self, "embed_dims", embed_dims) if stride is None: stride = kernel_size - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) + kernel_tuple = _tuple2(kernel_size) + stride_tuple = _tuple2(stride) + dilation_tuple = _tuple2(dilation) if isinstance(padding, str): self.adaptive_padding = AdaptivePadding( - kernel_size=kernel_size, - stride=stride, - dilation=dilation, + kernel_size=kernel_tuple, 
+ stride=stride_tuple, + dilation=dilation_tuple, padding=padding, ) # disable the padding of conv padding = 0 else: - self.adaptive_padding = None - padding = to_2tuple(padding) + object.__setattr__(self, "adaptive_padding", None) + padding_tuple = _tuple2(padding) self.projection = build_conv_layer( dict(type=conv_type), in_channels=in_channels, out_channels=embed_dims, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, + kernel_size=kernel_tuple, + stride=stride_tuple, + padding=padding_tuple, + dilation=dilation_tuple, bias=bias, ) if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, embed_dims)[1] else: - self.norm = None + object.__setattr__(self, "norm", None) if input_size: - input_size = to_2tuple(input_size) + input_size_tuple = _tuple2(input_size) # `init_out_size` would be used outside to # calculate the num_patches # e.g. when `use_abs_pos_embed` outside - self.init_input_size = input_size + object.__setattr__(self, "init_input_size", input_size_tuple) if self.adaptive_padding: - pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size) - input_h, input_w = input_size + pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size_tuple) + input_h, input_w = input_size_tuple input_h = input_h + pad_h input_w = input_w + pad_w - input_size = (input_h, input_w) + input_size_tuple = (input_h, input_w) # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html - h_out = (input_size[0] + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) // stride[0] + 1 - w_out = (input_size[1] + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) // stride[1] + 1 - self.init_out_size = (h_out, w_out) + h_out = ( + input_size_tuple[0] + 2 * padding_tuple[0] - dilation_tuple[0] * (kernel_tuple[0] - 1) - 1 + ) // stride_tuple[0] + 1 + w_out = ( + input_size_tuple[1] + 2 * padding_tuple[1] - dilation_tuple[1] * (kernel_tuple[1] - 1) - 1 + ) // stride_tuple[1] + 1 + object.__setattr__(self, "init_out_size", 
(h_out, w_out)) else: - self.init_input_size = None - self.init_out_size = None + object.__setattr__(self, "init_input_size", None) + object.__setattr__(self, "init_out_size", None) def forward(self, x): """ @@ -294,51 +312,57 @@ class PatchMerging(BaseModule): Default: None. """ + adaptive_padding: AdaptivePadding | None + norm: nn.Module | None + sampler: nn.Unfold + def __init__( self, - in_channels, - out_channels, - kernel_size=2, - stride=None, - padding="corner", - dilation=1, - bias=False, - norm_cfg=dict(type="LN"), - init_cfg=None, + in_channels: int, + out_channels: int, + kernel_size: int | tuple[int, int] = 2, + stride: int | tuple[int, int] | None = None, + padding: int | tuple[int, int] | str = "corner", + dilation: int | tuple[int, int] = 1, + bias: bool = False, + norm_cfg: dict | None = dict(type="LN"), + init_cfg: dict | None = None, ): super().__init__(init_cfg=init_cfg) - self.in_channels = in_channels - self.out_channels = out_channels - if stride: - stride = stride - else: - stride = kernel_size + object.__setattr__(self, "in_channels", in_channels) + object.__setattr__(self, "out_channels", out_channels) + stride_value = stride if stride is not None else kernel_size - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) + kernel_size_tuple = _tuple2(kernel_size) + stride_tuple = _tuple2(stride_value) + dilation_tuple = _tuple2(dilation) if isinstance(padding, str): self.adaptive_padding = AdaptivePadding( - kernel_size=kernel_size, - stride=stride, - dilation=dilation, + kernel_size=kernel_size_tuple, + stride=stride_tuple, + dilation=dilation_tuple, padding=padding, ) # disable the padding of unfold padding = 0 else: - self.adaptive_padding = None - - padding = to_2tuple(padding) - self.sampler = nn.Unfold(kernel_size=kernel_size, dilation=dilation, padding=padding, stride=stride) + object.__setattr__(self, "adaptive_padding", None) + + padding_tuple = _tuple2(padding) + self.sampler = 
nn.Unfold( + kernel_size=kernel_size_tuple, + dilation=dilation_tuple, + padding=padding_tuple, + stride=stride_tuple, + ) - sample_dim = kernel_size[0] * kernel_size[1] * in_channels + sample_dim = kernel_size_tuple[0] * kernel_size_tuple[1] * in_channels if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, sample_dim)[1] else: - self.norm = None + object.__setattr__(self, "norm", None) self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) @@ -373,16 +397,17 @@ def forward(self, x, input_size): # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) x = self.sampler(x) - out_h = ( - H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * (self.sampler.kernel_size[0] - 1) - 1 - ) // self.sampler.stride[0] + 1 - out_w = ( - W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * (self.sampler.kernel_size[1] - 1) - 1 - ) // self.sampler.stride[1] + 1 + padding_hw = _tuple2(self.sampler.padding) + dilation_hw = _tuple2(self.sampler.dilation) + kernel_hw = _tuple2(self.sampler.kernel_size) + stride_hw = _tuple2(self.sampler.stride) + out_h = (H + 2 * padding_hw[0] - dilation_hw[0] * (kernel_hw[0] - 1) - 1) // stride_hw[0] + 1 + out_w = (W + 2 * padding_hw[1] - dilation_hw[1] * (kernel_hw[1] - 1) - 1) // stride_hw[1] + 1 output_size = (out_h, out_w) x = x.transpose(1, 2) # B, H/2*W/2, 4*C - x = self.norm(x) if self.norm else x + if self.norm is not None: + x = self.norm(x) x = self.reduction(x) return x, output_size @@ -412,13 +437,13 @@ class MultiheadAttention(BaseModule): def __init__( self, - embed_dims, - num_heads, - attn_drop=0.0, - proj_drop=0.0, - dropout_layer=dict(type="Dropout", drop_prob=0.0), - init_cfg=None, - batch_first=False, + embed_dims: int, + num_heads: int, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + dropout_layer: dict | None = dict(type="Dropout", drop_prob=0.0), + init_cfg: dict | None = None, + batch_first: bool = False, **kwargs, ): super().__init__(init_cfg) @@ -431,11 +456,12 @@ def 
__init__( DeprecationWarning, ) attn_drop = kwargs["dropout"] - dropout_layer["drop_prob"] = kwargs.pop("dropout") + if dropout_layer is not None: + dropout_layer["drop_prob"] = kwargs.pop("dropout") - self.embed_dims = embed_dims - self.num_heads = num_heads - self.batch_first = batch_first + object.__setattr__(self, "embed_dims", embed_dims) + object.__setattr__(self, "num_heads", num_heads) + object.__setattr__(self, "batch_first", batch_first) self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, **kwargs) @@ -566,21 +592,21 @@ class FFN(BaseModule): @deprecated_api_warning({"dropout": "ffn_drop", "add_residual": "add_identity"}, cls_name="FFN") def __init__( self, - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - act_cfg=dict(type="ReLU", inplace=True), - ffn_drop=0.0, - dropout_layer=None, - add_identity=True, - init_cfg=None, - layer_scale_init_value=0.0, + embed_dims: int = 256, + feedforward_channels: int = 1024, + num_fcs: int = 2, + act_cfg: dict = dict(type="ReLU", inplace=True), + ffn_drop: float = 0.0, + dropout_layer: dict | None = None, + add_identity: bool = True, + init_cfg: dict | None = None, + layer_scale_init_value: float = 0.0, ): super().__init__(init_cfg) assert num_fcs >= 2, f"num_fcs should be no less than 2. got {num_fcs}." 
- self.embed_dims = embed_dims - self.feedforward_channels = feedforward_channels - self.num_fcs = num_fcs + object.__setattr__(self, "embed_dims", embed_dims) + object.__setattr__(self, "feedforward_channels", feedforward_channels) + object.__setattr__(self, "num_fcs", num_fcs) layers = [] in_channels = embed_dims @@ -597,7 +623,7 @@ def __init__( layers.append(nn.Dropout(ffn_drop)) self.layers = Sequential(*layers) self.dropout_layer = build_dropout(dropout_layer) if dropout_layer else torch.nn.Identity() - self.add_identity = add_identity + object.__setattr__(self, "add_identity", add_identity) if layer_scale_init_value > 0: self.gamma2 = LayerScale(embed_dims, scale=layer_scale_init_value) @@ -656,10 +682,17 @@ class BaseTransformerLayer(BaseModule): or (n, batch, embed_dim). Default to False. """ + num_attn: int + operation_order: tuple[str, ...] + norm_cfg: dict + pre_norm: bool + embed_dims: int + batch_first: bool + def __init__( self, - attn_cfgs=None, - ffn_cfgs=dict( + attn_cfgs: dict[str, Any] | list[dict[str, Any]] | None = None, + ffn_cfgs: dict[str, Any] | list[dict[str, Any]] = dict( type="FFN", embed_dims=256, feedforward_channels=1024, @@ -667,12 +700,17 @@ def __init__( ffn_drop=0.0, act_cfg=dict(type="ReLU", inplace=True), ), - operation_order=None, - norm_cfg=dict(type="LN"), - init_cfg=None, - batch_first=False, + operation_order: tuple[str, ...] | None = None, + norm_cfg: dict = dict(type="LN"), + init_cfg: dict | None = None, + batch_first: bool = False, **kwargs, ): + if isinstance(ffn_cfgs, list): + ffn_cfg_data: dict[str, Any] | list[dict[str, Any]] = [copy.deepcopy(cfg) for cfg in ffn_cfgs] + else: + ffn_cfg_data = copy.deepcopy(ffn_cfgs) + deprecated_args = dict( feedforward_channels="feedforward_channels", ffn_dropout="ffn_drop", @@ -687,62 +725,75 @@ def __init__( f"to a dict named `ffn_cfgs`. 
", DeprecationWarning, ) - ffn_cfgs[new_name] = kwargs[ori_name] + if isinstance(ffn_cfg_data, dict): + ffn_cfg_data[new_name] = kwargs[ori_name] super().__init__(init_cfg) - self.batch_first = batch_first + if operation_order is None: + raise ValueError("operation_order must be provided") + object.__setattr__(self, "batch_first", batch_first) + operation_order_tuple = operation_order - assert set(operation_order) & {"self_attn", "norm", "ffn", "cross_attn"} == set(operation_order), ( + assert set(operation_order_tuple) & {"self_attn", "norm", "ffn", "cross_attn"} == set(operation_order_tuple), ( f"The operation_order of {self.__class__.__name__} should contains all four operation type {['self_attn', 'norm', 'ffn', 'cross_attn']}" ) - num_attn = operation_order.count("self_attn") + operation_order.count("cross_attn") - if isinstance(attn_cfgs, dict): - attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + num_attn = operation_order_tuple.count("self_attn") + operation_order_tuple.count("cross_attn") + attn_cfg_list: list[dict[str, Any]] + if num_attn == 0: + attn_cfg_list = [] + elif isinstance(attn_cfgs, dict): + attn_cfg_list = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + elif attn_cfgs is None: + raise ValueError("attn_cfgs must be provided when attention ops exist") else: - assert num_attn == len(attn_cfgs), ( - f"The length of attn_cfg {num_attn} is not consistent with the number of attentionin operation_order {operation_order}." 
- ) - - self.num_attn = num_attn - self.operation_order = operation_order - self.norm_cfg = norm_cfg - self.pre_norm = operation_order[0] == "norm" + attn_cfg_list = attn_cfgs + if len(attn_cfg_list) != num_attn: + raise ValueError(f"Expected {num_attn} attention configs but received {len(attn_cfg_list)}") + + object.__setattr__(self, "num_attn", num_attn) + object.__setattr__(self, "operation_order", operation_order_tuple) + object.__setattr__(self, "norm_cfg", norm_cfg) + object.__setattr__(self, "pre_norm", operation_order_tuple[0] == "norm") self.attentions = ModuleList() index = 0 - for operation_name in operation_order: + for operation_name in operation_order_tuple: if operation_name in ["self_attn", "cross_attn"]: - if "batch_first" in attn_cfgs[index]: - assert self.batch_first == attn_cfgs[index]["batch_first"] + cfg = attn_cfg_list[index] + if "batch_first" in cfg: + assert self.batch_first == cfg["batch_first"] else: - attn_cfgs[index]["batch_first"] = self.batch_first - attention = build_attention(attn_cfgs[index]) + cfg["batch_first"] = self.batch_first + attention = build_attention(cfg) # Some custom attentions used as `self_attn` # or `cross_attn` can have different behavior. 
attention.operation_name = operation_name self.attentions.append(attention) index += 1 - self.embed_dims = self.attentions[0].embed_dims + if not self.attentions: + raise ValueError("At least one attention module is required") + object.__setattr__(self, "embed_dims", self.attentions[0].embed_dims) self.ffns = ModuleList() - num_ffns = operation_order.count("ffn") - if isinstance(ffn_cfgs, dict): - ffn_cfgs = ConfigDict(ffn_cfgs) - if isinstance(ffn_cfgs, dict): - ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] - assert len(ffn_cfgs) == num_ffns + num_ffns = operation_order_tuple.count("ffn") + if isinstance(ffn_cfg_data, dict): + base_ffn_cfg = ConfigDict(ffn_cfg_data) + ffn_cfg_list = [copy.deepcopy(base_ffn_cfg) for _ in range(num_ffns)] + else: + ffn_cfg_list = ffn_cfg_data + assert len(ffn_cfg_list) == num_ffns for ffn_index in range(num_ffns): - if "embed_dims" not in ffn_cfgs[ffn_index]: - ffn_cfgs[ffn_index]["embed_dims"] = self.embed_dims + if "embed_dims" not in ffn_cfg_list[ffn_index]: + ffn_cfg_list[ffn_index]["embed_dims"] = self.embed_dims else: - assert ffn_cfgs[ffn_index]["embed_dims"] == self.embed_dims - self.ffns.append(build_feedforward_network(ffn_cfgs[ffn_index], dict(type="FFN"))) + assert ffn_cfg_list[ffn_index]["embed_dims"] == self.embed_dims + self.ffns.append(build_feedforward_network(ffn_cfg_list[ffn_index], dict(type="FFN"))) self.norms = ModuleList() - num_norms = operation_order.count("norm") + num_norms = operation_order_tuple.count("norm") for _ in range(num_norms): self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) @@ -866,18 +917,28 @@ class TransformerLayerSequence(BaseModule): Default: None. 
""" - def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): + num_layers: int + embed_dims: int + pre_norm: bool + + def __init__( + self, + transformerlayers: dict | list[dict] | None = None, + num_layers: int | None = None, + init_cfg: dict | None = None, + ): super().__init__(init_cfg) + assert num_layers is not None, "num_layers must be provided" if isinstance(transformerlayers, dict): transformerlayers = [copy.deepcopy(transformerlayers) for _ in range(num_layers)] else: assert isinstance(transformerlayers, list) and len(transformerlayers) == num_layers - self.num_layers = num_layers + object.__setattr__(self, "num_layers", num_layers) self.layers = ModuleList() for i in range(num_layers): self.layers.append(build_transformer_layer(transformerlayers[i])) - self.embed_dims = self.layers[0].embed_dims - self.pre_norm = self.layers[0].pre_norm + object.__setattr__(self, "embed_dims", self.layers[0].embed_dims) + object.__setattr__(self, "pre_norm", self.layers[0].pre_norm) def forward( self, @@ -929,3 +990,12 @@ def forward( **kwargs, ) return query + + padding: str + kernel_size: tuple[int, int] + stride: tuple[int, int] + dilation: tuple[int, int] + adaptive_padding: AdaptivePadding | None + norm: nn.Module | None + init_input_size: tuple[int, int] | None + init_out_size: tuple[int, int] | None diff --git a/visdet/cv/cnn/bricks/upsample.py b/visdet/cv/cnn/bricks/upsample.py index a14a4e50..f6a856c0 100644 --- a/visdet/cv/cnn/bricks/upsample.py +++ b/visdet/cv/cnn/bricks/upsample.py @@ -27,6 +27,11 @@ class PixelShufflePack(nn.Module): channels. 
""" + in_channels: int + out_channels: int + scale_factor: int + upsample_kernel: int + def __init__( self, in_channels: int, @@ -35,10 +40,10 @@ def __init__( upsample_kernel: int, ): super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.scale_factor = scale_factor - self.upsample_kernel = upsample_kernel + object.__setattr__(self, "in_channels", in_channels) + object.__setattr__(self, "out_channels", out_channels) + object.__setattr__(self, "scale_factor", scale_factor) + object.__setattr__(self, "upsample_kernel", upsample_kernel) self.upsample_conv = nn.Conv2d( self.in_channels, self.out_channels * scale_factor * scale_factor, diff --git a/visdet/cv/cnn/bricks/wrappers.py b/visdet/cv/cnn/bricks/wrappers.py index c33f95b3..7eb40e36 100644 --- a/visdet/cv/cnn/bricks/wrappers.py +++ b/visdet/cv/cnn/bricks/wrappers.py @@ -7,6 +7,7 @@ """ import math +from typing import cast import torch import torch.nn as nn @@ -26,6 +27,14 @@ def obsolete_torch_version(torch_version, version_threshold) -> bool: return torch_version == "parrots" or torch_version <= version_threshold +def _zero_dummy_grad(module: nn.Module, reference: torch.Tensor) -> torch.Tensor: + """Return a zero tensor that participates in autograd like module params.""" + total = torch.zeros([], device=reference.device, dtype=reference.dtype) + for parameter in module.parameters(): + total = total + parameter.view(-1)[0] + return total * 0.0 + + class NewEmptyTensorOp(torch.autograd.Function): @staticmethod def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor: @@ -43,20 +52,24 @@ class Conv2d(nn.Conv2d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] + kernel = tuple(int(v) for v in _pair(self.kernel_size)) + padding = tuple(int(v) for v in _pair(self.padding)) + stride = tuple(int(v) for v in _pair(self.stride)) + dilation = tuple(int(v) 
for v in _pair(self.dilation)) for i, k, p, s, d in zip( x.shape[-2:], - self.kernel_size, - self.padding, - self.stride, - self.dilation, + kernel, + padding, + stride, + dilation, strict=False, ): o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. - dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty @@ -69,20 +82,24 @@ class Conv3d(nn.Conv3d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] + kernel = tuple(int(v) for v in _triple(self.kernel_size)) + padding = tuple(int(v) for v in _triple(self.padding)) + stride = tuple(int(v) for v in _triple(self.stride)) + dilation = tuple(int(v) for v in _triple(self.dilation)) for i, k, p, s, d in zip( x.shape[-3:], - self.kernel_size, - self.padding, - self.stride, - self.dilation, + kernel, + padding, + stride, + dilation, strict=False, ): o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. 
- dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty @@ -96,20 +113,25 @@ class ConvTranspose2d(nn.ConvTranspose2d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] + kernel = tuple(int(v) for v in _pair(self.kernel_size)) + padding = tuple(int(v) for v in _pair(self.padding)) + stride = tuple(int(v) for v in _pair(self.stride)) + dilation = tuple(int(v) for v in _pair(self.dilation)) + output_padding = tuple(int(v) for v in _pair(self.output_padding)) for i, k, p, s, d, op in zip( x.shape[-2:], - self.kernel_size, - self.padding, - self.stride, - self.dilation, - self.output_padding, + kernel, + padding, + stride, + dilation, + output_padding, strict=False, ): out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. 
- dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty @@ -123,20 +145,25 @@ class ConvTranspose3d(nn.ConvTranspose3d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] + kernel = tuple(int(v) for v in _triple(self.kernel_size)) + padding = tuple(int(v) for v in _triple(self.padding)) + stride = tuple(int(v) for v in _triple(self.stride)) + dilation = tuple(int(v) for v in _triple(self.dilation)) + output_padding = tuple(int(v) for v in _triple(self.output_padding)) for i, k, p, s, d, op in zip( x.shape[-3:], - self.kernel_size, - self.padding, - self.stride, - self.dilation, - self.output_padding, + kernel, + padding, + stride, + dilation, + output_padding, strict=False, ): out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. 
- dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty @@ -145,58 +172,66 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MaxPool2d(nn.MaxPool2d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # PyTorch 1.9 does not support empty tensor inference yet if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: out_shape = list(x.shape[:2]) + kernel = tuple(int(v) for v in _pair(self.kernel_size)) + padding = tuple(int(v) for v in _pair(self.padding)) + stride = tuple(int(v) for v in _pair(self.stride)) + dilation = tuple(int(v) for v in _pair(self.dilation)) for i, k, p, s, d in zip( x.shape[-2:], - _pair(self.kernel_size), - _pair(self.padding), - _pair(self.stride), - _pair(self.dilation), + kernel, + padding, + stride, + dilation, strict=False, ): o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 o = math.ceil(o) if self.ceil_mode else math.floor(o) out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) return empty return super().forward(x) class MaxPool3d(nn.MaxPool3d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # PyTorch 1.9 does not support empty tensor inference yet if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: out_shape = list(x.shape[:2]) + kernel = tuple(int(v) for v in _triple(self.kernel_size)) + padding = tuple(int(v) for v in _triple(self.padding)) + stride = tuple(int(v) for v in _triple(self.stride)) + dilation = tuple(int(v) for v in _triple(self.dilation)) for i, k, p, s, d in zip( x.shape[-3:], - _triple(self.kernel_size), - _triple(self.padding), - _triple(self.stride), - _triple(self.dilation), + kernel, + padding, + stride, + dilation, strict=False, 
): o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 o = math.ceil(o) if self.ceil_mode else math.floor(o) out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) return empty return super().forward(x) -class Linear(torch.nn.Linear): +class Linear(nn.Linear): def forward(self, x: torch.Tensor) -> torch.Tensor: # empty tensor forward of Linear layer is supported in Pytorch 1.6 if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0: out_shape = [x.shape[0], self.out_features] - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. - dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty diff --git a/visdet/cv/image/io.py b/visdet/cv/image/io.py index d7896e2d..f733f3a0 100644 --- a/visdet/cv/image/io.py +++ b/visdet/cv/image/io.py @@ -13,19 +13,19 @@ ) try: - from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG + from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG # type: ignore[import-untyped] except ImportError: - TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None + TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None # type: ignore[assignment] try: from PIL import Image, ImageOps except ImportError: - Image = None + Image = None # type: ignore[assignment] try: - import tifffile + import tifffile # type: ignore[import-untyped] except ImportError: - tifffile = None + tifffile = None # type: ignore[assignment] jpeg = None supported_backends = ["cv2", "turbojpeg", "pillow", "tifffile"] @@ -137,11 +137,11 @@ def imfrombytes( else: # cv2 backend if len(content) == 0: - return None + return None # type: ignore[return-value] img_np = np.frombuffer(content, np.uint8) - flag = imread_flags[flag] if isinstance(flag, str) else flag - img = cv2.imdecode(img_np, flag) - if 
img is not None and flag == IMREAD_COLOR and channel_order == "rgb": + flag_int: int = imread_flags[flag] if isinstance(flag, str) else flag + img = cv2.imdecode(img_np, flag_int) # type: ignore[arg-type] + if img is not None and flag_int == IMREAD_COLOR and channel_order == "rgb": cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) return img @@ -264,7 +264,7 @@ def imwrite( # Encode image according to image suffix. # For example, if image path is '/path/your/img.jpg', the encode # format is '.jpg'. - flag, img_buff = cv2.imencode(img_ext, img, params) + flag, img_buff = cv2.imencode(img_ext, img, params) # type: ignore[arg-type] if flag: with open(file_path, "wb") as f: diff --git a/visdet/cv/ops/roi_align.py b/visdet/cv/ops/roi_align.py index 0f93678e..9d2530f1 100644 --- a/visdet/cv/ops/roi_align.py +++ b/visdet/cv/ops/roi_align.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + import torch.nn as nn from torchvision.ops import RoIAlign as TVRoIAlign from torchvision.ops import roi_align as tv_roi_align @@ -19,25 +21,33 @@ class RoIAlign(nn.Module): We set this to True by default for better performance. 
""" + output_size: Union[int, Tuple[int, int]] + spatial_scale: float + sampling_ratio: int + pool_mode: str + aligned: bool + use_torchvision: bool + roi_align: TVRoIAlign + def __init__( self, - output_size, - spatial_scale=1.0, - sampling_ratio=0, - pool_mode="avg", - aligned=True, - use_torchvision=True, - ): + output_size: Union[int, Tuple[int, int]], + spatial_scale: float = 1.0, + sampling_ratio: int = 0, + pool_mode: str = "avg", + aligned: bool = True, + use_torchvision: bool = True, + ) -> None: super().__init__() - self.output_size = output_size - self.spatial_scale = spatial_scale - self.sampling_ratio = sampling_ratio - self.pool_mode = pool_mode - self.aligned = aligned - self.use_torchvision = use_torchvision + self.output_size = output_size # type: ignore[misc] + self.spatial_scale = spatial_scale # type: ignore[misc] + self.sampling_ratio = sampling_ratio # type: ignore[misc] + self.pool_mode = pool_mode # type: ignore[misc] + self.aligned = aligned # type: ignore[misc] + self.use_torchvision = use_torchvision # type: ignore[misc] if isinstance(self.output_size, int): - self.output_size = (self.output_size, self.output_size) + self.output_size = (self.output_size, self.output_size) # type: ignore[misc] # We always use torchvision's implementation for simplicity self.roi_align = TVRoIAlign( diff --git a/visdet/cv/transforms/builder.py b/visdet/cv/transforms/builder.py index 6a26953e..d38b302f 100644 --- a/visdet/cv/transforms/builder.py +++ b/visdet/cv/transforms/builder.py @@ -135,7 +135,7 @@ def build_transforms(cfg): transforms.append(transform) # Import Compose here to avoid circular imports - from visdet.cv.transforms.compose import Compose + from visdet.cv.transforms.wrappers import Compose return Compose(transforms) else: diff --git a/visdet/cv/transforms/loading.py b/visdet/cv/transforms/loading.py index 4545820a..851d645a 100644 --- a/visdet/cv/transforms/loading.py +++ b/visdet/cv/transforms/loading.py @@ -360,7 +360,11 @@ def 
_load_seg_map(self, results: dict) -> None: else: img_bytes = engine_fileio.get(results["seg_map_path"], backend_args=self.backend_args) - results["gt_seg_map"] = imfrombytes(img_bytes, flag="unchanged", backend=self.imdecode_backend).squeeze() + # Convert memoryview to bytes if needed + img_bytes_for_decode = bytes(img_bytes) if isinstance(img_bytes, memoryview) else img_bytes + results["gt_seg_map"] = imfrombytes( + img_bytes_for_decode, flag="unchanged", backend=self.imdecode_backend + ).squeeze() def _load_kps(self, results: dict) -> None: """Private function to load keypoints annotations. diff --git a/visdet/cv/transforms/processing.py b/visdet/cv/transforms/processing.py index 029e445b..3630ee49 100644 --- a/visdet/cv/transforms/processing.py +++ b/visdet/cv/transforms/processing.py @@ -242,6 +242,7 @@ def random_sample_ratio(img_scale, ratio_range): def _random_scale(self, results): """Randomly sample an img_scale.""" + assert self.img_scale is not None, "img_scale must be initialized" if self.ratio_range is not None: scale, scale_idx = self.random_sample_ratio(self.img_scale[0], self.ratio_range) elif len(self.img_scale) == 1: @@ -483,9 +484,12 @@ def _pad_img(self, results: dict) -> None: size = (pad_h, pad_w) elif self.size is not None: size = self.size[::-1] + pad_val_for_impad: int | float | tuple if isinstance(pad_val, int) and results["img"].ndim == 3: - pad_val = tuple(pad_val for _ in range(results["img"].shape[2])) - padded_img = impad(results["img"], shape=size, pad_val=pad_val, padding_mode=self.padding_mode) + pad_val_for_impad = tuple(pad_val for _ in range(results["img"].shape[2])) + else: + pad_val_for_impad = pad_val + padded_img = impad(results["img"], shape=size, pad_val=pad_val_for_impad, padding_mode=self.padding_mode) results["img"] = padded_img @@ -510,13 +514,16 @@ def _pad_seg(self, results: dict) -> None: """Pad semantic segmentation map according to ``results['pad_shape']``.""" if results.get("gt_seg_map", None) is not None: - 
pad_val = self.pad_val.get("seg", 255) - if isinstance(pad_val, int) and results["gt_seg_map"].ndim == 3: - pad_val = tuple(pad_val for _ in range(results["gt_seg_map"].shape[2])) + pad_val_seg = self.pad_val.get("seg", 255) + pad_val_for_seg: int | float | list + if isinstance(pad_val_seg, int) and results["gt_seg_map"].ndim == 3: + pad_val_for_seg = list(pad_val_seg for _ in range(results["gt_seg_map"].shape[2])) + else: + pad_val_for_seg = pad_val_seg results["gt_seg_map"] = impad( results["gt_seg_map"], shape=results["pad_shape"][:2], - pad_val=pad_val, + pad_val=pad_val_for_seg, padding_mode=self.padding_mode, ) @@ -716,7 +723,7 @@ def transform(self, results: dict) -> dict: img_width = max(img_width, crop_width) pad_size = (img_width, img_height) _pad_cfg = self.pad_cfg.copy() - _pad_cfg.update(dict(size=pad_size)) + _pad_cfg["size"] = pad_size # type: ignore[index] pad_transform = TRANSFORMS.build(_pad_cfg) results = pad_transform(results) else: @@ -966,12 +973,13 @@ def transform(self, results: dict) -> dict: for scale in self.scales: for flip, direction in flip_args: _resize_cfg = self.resize_cfg.copy() - _resize_cfg.update({self.scale_key: scale}) + _resize_cfg[self.scale_key] = scale # type: ignore[index] _resize_flip = [_resize_cfg] if flip: _flip_cfg = self.flip_cfg.copy() - _flip_cfg.update(prob=1.0, direction=direction) + _flip_cfg["prob"] = 1.0 # type: ignore[index] + _flip_cfg["direction"] = direction # type: ignore[index] _resize_flip.append(_flip_cfg) else: results["flip"] = False @@ -1331,8 +1339,8 @@ def _flip_bbox(self, bboxes: np.ndarray, img_shape: tuple[int, int], direction: """ # Handle BaseBoxes objects using their own flip method if hasattr(bboxes, "flip_"): - flipped = bboxes.clone() - flipped.flip_(img_shape, direction) + flipped = bboxes.clone() # type: ignore[attr-defined] + flipped.flip_(img_shape, direction) # type: ignore[attr-defined] return flipped # Handle numpy arrays @@ -1394,7 +1402,7 @@ def _flip_keypoints( flipped = 
np.concatenate([flipped, meta_info], axis=-1) return flipped - def _flip_seg_map(self, seg_map: dict, direction: str) -> np.ndarray: + def _flip_seg_map(self, seg_map: np.ndarray, direction: str) -> np.ndarray: """Flip segmentation map horizontally, vertically or diagonally. Args: diff --git a/visdet/cv/transforms/utils.py b/visdet/cv/transforms/utils.py index b88a6739..b3bd07c0 100644 --- a/visdet/cv/transforms/utils.py +++ b/visdet/cv/transforms/utils.py @@ -51,8 +51,9 @@ def __set_name__(self, owner, name): def __call__(self, *args, **kwargs): # Get the transform instance whose method is decorated # by cache_randomness + assert self.instance_ref is not None, "instance_ref must be set" instance = self.instance_ref() - name = self.__name__ + name: str = self.__name__ # type: ignore[misc] # Check the flag ``self._cache_enabled``, which should be # set by the contextmanagers like ``cache_random_parameters``` @@ -63,12 +64,12 @@ def __call__(self, *args, **kwargs): # ``cache_enabled``` is set by contextmanagers like # ``cache_random_params```. 
if not hasattr(instance, "_cache"): - instance._cache = {} + instance._cache = {} # type: ignore[attr-defined] - if name not in instance._cache: - instance._cache[name] = self.func(instance, *args, **kwargs) + if name not in instance._cache: # type: ignore[attr-defined] + instance._cache[name] = self.func(instance, *args, **kwargs) # type: ignore[attr-defined] # Return the cached value - return instance._cache[name] + return instance._cache[name] # type: ignore[attr-defined] else: # Clear cache if hasattr(instance, "_cache"): @@ -210,12 +211,12 @@ def _start_cache(t: BaseTransform): return # Set cache enabled flag - t._cache_enabled = True + t._cache_enabled = True # type: ignore[attr-defined] # Store the original method and init the counter if hasattr(t, "_methods_with_randomness"): - t.transform = _add_invoke_checker(t, "transform") - for name in t._methods_with_randomness: + t.transform = _add_invoke_checker(t, "transform") # type: ignore[method-assign] + for name in t._methods_with_randomness: # type: ignore[attr-defined] setattr(t, name, _add_invoke_counter(t, name)) def _end_cache(t: BaseTransform): @@ -230,19 +231,19 @@ def _end_cache(t: BaseTransform): # Restore the original method if hasattr(t, "_methods_with_randomness"): - for name in t._methods_with_randomness: + for name in t._methods_with_randomness: # type: ignore[attr-defined] key = f"{id(t)}.{name}" setattr(t, name, key2method[key]) key_transform = f"{id(t)}.transform" - t.transform = key2method[key_transform] + t.transform = key2method[key_transform] # type: ignore[method-assign] def _apply(t: BaseTransform | Iterable, func: Callable[[BaseTransform], None]): if isinstance(t, BaseTransform): func(t) if isinstance(t, Iterable): for _t in t: - _apply(_t, func) + _apply(_t, func) # type: ignore[arg-type] try: _apply(transforms, _start_cache) diff --git a/visdet/datasets/api_wrappers/cocoeval_mp.py b/visdet/datasets/api_wrappers/cocoeval_mp.py index e7337784..c2056b88 100644 --- 
a/visdet/datasets/api_wrappers/cocoeval_mp.py +++ b/visdet/datasets/api_wrappers/cocoeval_mp.py @@ -3,6 +3,7 @@ import itertools import time from collections import defaultdict +from typing import Any, Iterable, cast import numpy as np import torch.multiprocessing as mp @@ -13,6 +14,12 @@ class COCOevalMP(COCOeval): + _gts: dict[tuple[int, int], list[dict[str, Any]]] + _dts: dict[tuple[int, int], list[dict[str, Any]]] + evalImgs: Any + eval: dict[str, Any] + stats: np.ndarray + def _prepare(self): """ Prepare ._gts and ._dts for evaluation based on params @@ -26,24 +33,26 @@ def _toMask(anns, coco): ann["segmentation"] = rle p = self.params + gts: list[dict[str, Any]] + dts: list[dict[str, Any]] if p.useCats: gts = [] dts = [] img_ids = set(p.imgIds) cat_ids = set(p.catIds) - for gt in self.cocoGt.dataset["annotations"]: + for gt in self.cocoGt.dataset["annotations"]: # type: ignore[attr-defined] if (gt["category_id"] in cat_ids) and (gt["image_id"] in img_ids): - gts.append(gt) - for dt in self.cocoDt.dataset["annotations"]: + gts.append(cast(dict[str, Any], gt)) + for dt in self.cocoDt.dataset["annotations"]: # type: ignore[attr-defined] if (dt["category_id"] in cat_ids) and (dt["image_id"] in img_ids): - dts.append(dt) + dts.append(cast(dict[str, Any], dt)) # gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # gts=self.cocoGt.dataset['annotations'] # dts=self.cocoDt.dataset['annotations'] else: - gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) - dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + gts = [cast(dict[str, Any], ann) for ann in self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))] # type: ignore[attr-defined] + dts = [cast(dict[str, Any], ann) for ann in self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))] # type: ignore[attr-defined] # convert ground truth to mask if iouType == 
'segm' if p.iouType == "segm": @@ -55,14 +64,14 @@ def _toMask(anns, coco): gt["ignore"] = "iscrowd" in gt and gt["iscrowd"] if p.iouType == "keypoints": gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"] - self._gts = defaultdict(list) # gt for evaluation - self._dts = defaultdict(list) # dt for evaluation + self._gts = defaultdict(list) + self._dts = defaultdict(list) for gt in gts: self._gts[gt["image_id"], gt["category_id"]].append(gt) for dt in dts: self._dts[dt["image_id"], dt["category_id"]].append(dt) - self.evalImgs = defaultdict(list) # per-image per-category evaluation results - self.eval = {} # accumulated evaluation results + self.evalImgs = defaultdict(list) + self.eval = {} def evaluate(self): """Run per image evaluation on given images and store results (a list @@ -144,8 +153,14 @@ def evaluateImg(self, imgId, catId, aRng, maxDet): iscrowd = [int(o["iscrowd"]) for o in gt] # load computed ious # ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] - ious = self.computeIoU(imgId, catId) - ious = ious[:, gtind] if len(ious) > 0 else ious + ious_raw = self.computeIoU(imgId, catId) + ious_array = np.asarray(ious_raw, dtype=float) + if ious_array.size == 0: + ious = np.zeros((0, 0)) + else: + if ious_array.ndim == 1: + ious_array = ious_array.reshape((-1, 1)) + ious = ious_array[:, gtind] T = len(p.iouThrs) G = len(gt) @@ -221,15 +236,15 @@ def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): # IoU if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] - s = s[t] - s = s[:, :, :, aind, mind] + s = s[t] # type: ignore[index] + s = s[:, :, :, aind, mind] # type: ignore[index] else: # dimension of recall: [TxKxAxM] s = self.eval["recall"] if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] - s = s[t] - s = s[:, :, aind, mind] + s = s[t] # type: ignore[index] + s = s[:, :, aind, mind] # type: ignore[index] if len(s[s > -1]) == 0: mean_s = -1 else: diff --git 
a/visdet/datasets/builder.py b/visdet/datasets/builder.py index 0b906e23..867dbc9f 100644 --- a/visdet/datasets/builder.py +++ b/visdet/datasets/builder.py @@ -3,28 +3,16 @@ import platform import random import warnings +from collections.abc import Mapping, Sequence from functools import partial +from typing import Any import numpy as np import torch -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, Dataset, default_collate from visdet.cv import build_from_cfg -from visdet.engine.dist import get_dist_info -from visdet.engine.registry import Registry -from visdet.engine.utils import TORCH_VERSION, digit_version - -try: - from torch.utils.data import collate_fn - - def collate(batch): - return collate_fn(batch) -except ImportError: - # Fallback implementation - def collate(batch): - return batch - - +from visdet.datasets.dataset_wrappers import ClassBalancedDataset, ConcatDataset, RepeatDataset from visdet.datasets.samplers import ( ClassAwareSampler, DistributedGroupSampler, @@ -33,6 +21,9 @@ def collate(batch): InfiniteBatchSampler, InfiniteGroupBatchSampler, ) +from visdet.engine.dist import get_dist_info +from visdet.engine.registry import Registry +from visdet.engine.utils import digit_version if platform.system() != "Windows": # https://github.com/pytorch/pytorch/issues/973 @@ -48,9 +39,12 @@ def collate(batch): PIPELINES = Registry("pipeline") -def _concat_dataset(cfg, default_args=None): - from visdet.datasets.dataset_wrappers import ConcatDataset +def collate(batch, samples_per_gpu: int = 1): # noqa: ARG001 - kept for backward compat + """Wrap PyTorch's default collate to match mmengine's signature.""" + return default_collate(batch) + +def _concat_dataset(cfg: dict[str, Any], default_args: dict[str, Any] | None = None): ann_files = cfg["ann_file"] img_prefixes = cfg.get("img_prefix", None) seg_prefixes = cfg.get("seg_prefix", None) @@ -76,14 +70,9 @@ def _concat_dataset(cfg, default_args=None): return ConcatDataset(datasets, 
separate_eval) -def build_dataset(cfg, default_args=None): - from visdet.datasets.dataset_wrappers import ( - ClassBalancedDataset, - ConcatDataset, - MultiImageMixDataset, - RepeatDataset, - ) - +def build_dataset( + cfg: dict[str, Any] | list[dict[str, Any]] | tuple[dict[str, Any], ...], default_args: dict[str, Any] | None = None +): if isinstance(cfg, (list, tuple)): dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) elif cfg["type"] == "ConcatDataset": @@ -96,10 +85,7 @@ def build_dataset(cfg, default_args=None): elif cfg["type"] == "ClassBalancedDataset": dataset = ClassBalancedDataset(build_dataset(cfg["dataset"], default_args), cfg["oversample_thr"]) elif cfg["type"] == "MultiImageMixDataset": - cp_cfg = copy.deepcopy(cfg) - cp_cfg["dataset"] = build_dataset(cp_cfg["dataset"]) - cp_cfg.pop("type") - dataset = MultiImageMixDataset(**cp_cfg) + raise NotImplementedError("MultiImageMixDataset is not yet available in visdet") elif isinstance(cfg.get("ann_file"), (list, tuple)): dataset = _concat_dataset(cfg, default_args) else: @@ -109,18 +95,18 @@ def build_dataset(cfg, default_args=None): def build_dataloader( - dataset, - samples_per_gpu, - workers_per_gpu, - num_gpus=1, - dist=True, - shuffle=True, - seed=None, - runner_type="EpochBasedRunner", - persistent_workers=False, - class_aware_sampler=None, - **kwargs, -): + dataset: Dataset, + samples_per_gpu: int, + workers_per_gpu: int, + num_gpus: int = 1, + dist: bool = True, + shuffle: bool = True, + seed: int | None = None, + runner_type: str = "EpochBasedRunner", + persistent_workers: bool = False, + class_aware_sampler: dict[str, Any] | None = None, + **kwargs: Any, +) -> DataLoader: """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. @@ -150,18 +136,22 @@ def build_dataloader( DataLoader: A PyTorch dataloader. 
""" rank, world_size = get_dist_info() + samples_per_gpu_int = int(samples_per_gpu) + workers_per_gpu_int = int(workers_per_gpu) + num_gpus_int = int(num_gpus) + seed_int = int(seed) if seed is not None else None if dist: # When model is :obj:`DistributedDataParallel`, # `batch_size` of :obj:`dataloader` is the # number of training samples on each GPU. - batch_size = samples_per_gpu - num_workers = workers_per_gpu + batch_size = samples_per_gpu_int + num_workers = workers_per_gpu_int else: # When model is obj:`DataParallel` # the batch size is samples on all the GPUS - batch_size = num_gpus * samples_per_gpu - num_workers = num_gpus * workers_per_gpu + batch_size = num_gpus_int * samples_per_gpu_int + num_workers = num_gpus_int * workers_per_gpu_int if runner_type == "IterBasedRunner": # this is a batch sampler, which can yield @@ -169,38 +159,42 @@ def build_dataloader( # it can be used in both `DataParallel` and # `DistributedDataParallel` if shuffle: - batch_sampler = InfiniteGroupBatchSampler(dataset, batch_size, world_size, rank, seed=seed) + batch_sampler = InfiniteGroupBatchSampler(dataset, batch_size, world_size, rank, seed=seed_int) else: - batch_sampler = InfiniteBatchSampler(dataset, batch_size, world_size, rank, seed=seed, shuffle=False) + batch_sampler = InfiniteBatchSampler(dataset, batch_size, world_size, rank, seed=seed_int, shuffle=False) batch_size = 1 sampler = None else: if class_aware_sampler is not None: # ClassAwareSampler can be used in both distributed and # non-distributed training. 
- num_sample_class = class_aware_sampler.get("num_sample_class", 1) + num_sample_class = int(class_aware_sampler.get("num_sample_class", 1)) sampler = ClassAwareSampler( dataset, - samples_per_gpu, + samples_per_gpu_int, world_size, rank, - seed=seed, + seed=seed_int, num_sample_class=num_sample_class, ) elif dist: # DistributedGroupSampler will definitely shuffle the data to # satisfy that images on each GPU are in the same group if shuffle: - sampler = DistributedGroupSampler(dataset, samples_per_gpu, world_size, rank, seed=seed) + sampler = DistributedGroupSampler(dataset, samples_per_gpu_int, world_size, rank, seed=seed_int) else: - sampler = DistributedSampler(dataset, world_size, rank, shuffle=False, seed=seed) + # DistributedSampler signature differs between PyTorch versions + sampler = DistributedSampler(dataset, world_size, rank, shuffle=False, seed=seed_int) # type: ignore[call-arg] else: - sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + sampler = GroupSampler(dataset, samples_per_gpu_int) if shuffle else None batch_sampler = None - init_fn = partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None + init_fn = ( + partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed_int) if seed_int is not None else None + ) - if TORCH_VERSION != "parrots" and digit_version(TORCH_VERSION) >= digit_version("1.7.0"): + # Check PyTorch version for persistent_workers support (available in 1.7.0+) + if digit_version(torch.__version__) >= digit_version("1.7.0"): kwargs["persistent_workers"] = persistent_workers elif persistent_workers is True: warnings.warn("persistent_workers is invalid because your pytorch version is lower than 1.7.0") @@ -211,7 +205,7 @@ def build_dataloader( sampler=sampler, num_workers=num_workers, batch_sampler=batch_sampler, - collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu_int), 
pin_memory=kwargs.pop("pin_memory", False), worker_init_fn=init_fn, **kwargs, diff --git a/visdet/datasets/dataset_wrappers.py b/visdet/datasets/dataset_wrappers.py new file mode 100644 index 00000000..8176bef7 --- /dev/null +++ b/visdet/datasets/dataset_wrappers.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Dataset wrapper aliases used across visdet.""" + +from visdet.engine.dataset.dataset_wrapper import ( + ClassBalancedDataset as _ClassBalancedDataset, +) +from visdet.engine.dataset.dataset_wrapper import ( + ConcatDataset as _ConcatDataset, +) +from visdet.engine.dataset.dataset_wrapper import ( + RepeatDataset as _RepeatDataset, +) + +ClassBalancedDataset = _ClassBalancedDataset +ConcatDataset = _ConcatDataset +RepeatDataset = _RepeatDataset + +__all__ = ["ClassBalancedDataset", "ConcatDataset", "RepeatDataset"] diff --git a/visdet/datasets/pipelines.py b/visdet/datasets/pipelines.py index 9ec6551a..03e70fc4 100644 --- a/visdet/datasets/pipelines.py +++ b/visdet/datasets/pipelines.py @@ -5,6 +5,8 @@ backward compatibility with the old pipelines namespace. 
""" +# Import available transforms +from visdet.datasets.transforms.formatting import PackDetInputs from visdet.datasets.transforms.load_image import ( LoadImageFromFile, LoadImageFromWebcam, @@ -20,23 +22,11 @@ RandomFlip, ) -# Try to import optional transforms that may not exist in all versions -try: - from visdet.datasets.transforms.transforms import RandomResize -except ImportError: - RandomResize = None - -try: - from visdet.datasets.transforms.formatting import DefaultFormatBundle, PackDetInputs -except ImportError: - PackDetInputs = None - DefaultFormatBundle = None - -try: - from visdet.datasets.transforms.wrappers import RandomApply, RandomChoice -except ImportError: - RandomApply = None - RandomChoice = None +# These transforms don't exist in visdet yet - set to None for compatibility +RandomResize = None # type: ignore[assignment] +DefaultFormatBundle = None # type: ignore[assignment] +RandomApply = None # type: ignore[assignment] +RandomChoice = None # type: ignore[assignment] __all__ = [ "FilterAnnotations", diff --git a/visdet/datasets/samplers/distributed_sampler.py b/visdet/datasets/samplers/distributed_sampler.py index 76756b59..d8eef7ca 100644 --- a/visdet/datasets/samplers/distributed_sampler.py +++ b/visdet/datasets/samplers/distributed_sampler.py @@ -1,19 +1,28 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import math +from typing import Iterator, Optional import torch +from torch.utils.data import Dataset from torch.utils.data import DistributedSampler as _DistributedSampler from visdet.engine.dist import sync_random_seed -def get_device(): +def get_device() -> str: """Returns an available device, cuda or cpu.""" return "cuda" if torch.cuda.is_available() else "cpu" class DistributedSampler(_DistributedSampler): - def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0): + def __init__( + self, + dataset: Dataset, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + shuffle: bool = True, + seed: int = 0, + ) -> None: super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) # In distributed sampling, different ranks should sample @@ -25,7 +34,7 @@ def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0): device = get_device() self.seed = sync_random_seed(seed, device) - def __iter__(self): + def __iter__(self) -> Iterator[int]: # deterministically shuffle based on epoch if self.shuffle: g = torch.Generator() @@ -34,9 +43,9 @@ def __iter__(self): # Otherwise, the next iteration of this sampler will # yield the same ordering. 
g.manual_seed(self.epoch + self.seed) - indices = torch.randperm(len(self.dataset), generator=g).tolist() + indices = torch.randperm(len(self.dataset), generator=g).tolist() # type: ignore[arg-type] else: - indices = torch.arange(len(self.dataset)).tolist() + indices = torch.arange(len(self.dataset)).tolist() # type: ignore[arg-type] # add extra samples to make it evenly divisible # in case that indices is shorter than half of total_size diff --git a/visdet/datasets/transforms/formatting.py b/visdet/datasets/transforms/formatting.py index 64825210..dec2480e 100644 --- a/visdet/datasets/transforms/formatting.py +++ b/visdet/datasets/transforms/formatting.py @@ -127,8 +127,8 @@ def transform(self, results: dict) -> dict: data_sample.proposals = proposals if "gt_seg_map" in results: - gt_sem_seg_data = dict(sem_seg=to_tensor(results["gt_seg_map"][None, ...].copy())) - gt_sem_seg_data = PixelData(**gt_sem_seg_data) + gt_sem_seg_tensor = to_tensor(results["gt_seg_map"][None, ...].copy()) + gt_sem_seg_data = PixelData(sem_seg=gt_sem_seg_tensor) if "ignore_index" in results: metainfo = dict(ignore_index=results["ignore_index"]) gt_sem_seg_data.set_metainfo(metainfo) diff --git a/visdet/datasets/transforms/loading.py b/visdet/datasets/transforms/loading.py index 295299d3..4864cc10 100644 --- a/visdet/datasets/transforms/loading.py +++ b/visdet/datasets/transforms/loading.py @@ -96,8 +96,8 @@ def __init__( to_float32: bool = False, color_type: str = "unchanged", imdecode_backend: str = "cv2", - file_client_args: dict = None, - backend_args: dict = None, + file_client_args: dict | None = None, + backend_args: dict | None = None, ) -> None: self.to_float32 = to_float32 self.color_type = color_type @@ -552,10 +552,10 @@ def __init__( with_seg: bool = True, box_type: str = "hbox", imdecode_backend: str = "cv2", - backend_args: dict = None, + backend_args: dict | None = None, ) -> None: try: - from panopticapi import utils + from panopticapi import utils # type: 
ignore[import-untyped] except ImportError: raise ImportError( "panopticapi is not installed, please install it by: " diff --git a/visdet/engine/config/config_wrapper.py b/visdet/engine/config/config_wrapper.py index bcaaa53f..60b1bf46 100644 --- a/visdet/engine/config/config_wrapper.py +++ b/visdet/engine/config/config_wrapper.py @@ -6,7 +6,7 @@ import warnings from pathlib import Path -from typing import Any, Dict, Union +from typing import TYPE_CHECKING, Any, Dict, Union, cast from visdet.engine.config import Config as BaseConfig from visdet.engine.config.schema_generator import validate_config_with_schema @@ -73,7 +73,9 @@ def fromfile( stacklevel=2, ) # Use the parent class's fromfile method for .py files - return super(Config, Config).fromfile(str(filename)) + # The base Config.fromfile returns BaseConfig, but since Config extends BaseConfig, + # we can safely cast it back to Config + return cast("Config", super(Config, Config).fromfile(str(filename))) else: raise ValueError(f"Unsupported config file extension: {filename.suffix}. Supported: .yaml, .yml, .py") diff --git a/visdet/engine/config/yaml_loader.py b/visdet/engine/config/yaml_loader.py index a510510b..dad2817e 100644 --- a/visdet/engine/config/yaml_loader.py +++ b/visdet/engine/config/yaml_loader.py @@ -162,13 +162,13 @@ def _resolve_path(self, ref_path: str, current_file: Path) -> Path: Returns: Absolute resolved path """ - ref_path = Path(ref_path) + ref_path_obj = Path(ref_path) - if ref_path.is_absolute(): - return ref_path + if ref_path_obj.is_absolute(): + return ref_path_obj else: # Resolve relative to the directory containing current_file - return (current_file.parent / ref_path).resolve() + return (current_file.parent / ref_path_obj).resolve() def _deep_merge(self, base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: """Deep merge two dictionaries, with override taking precedence. 
diff --git a/visdet/engine/dist/__init__.py b/visdet/engine/dist/__init__.py index 920ff72c..149b5919 100644 --- a/visdet/engine/dist/__init__.py +++ b/visdet/engine/dist/__init__.py @@ -5,11 +5,13 @@ import os import pickle import warnings -from typing import Any, List, Optional +from typing import Any, Callable, List, Optional, TypeVar import torch import torch.distributed as dist_lib +F = TypeVar("F", bound=Callable) + def _is_dist_available_and_initialized(): """Check if distributed training is available and initialized.""" @@ -55,13 +57,14 @@ def is_main_process(): return get_rank() == 0 -def master_only(func): +def master_only(func: F) -> F: """Decorator to make a function only execute on master process.""" @functools.wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs): # type: ignore[no-untyped-def] if is_main_process(): return func(*args, **kwargs) - return wrapper + return None + return wrapper # type: ignore[return-value] def barrier(): diff --git a/visdet/engine/hooks/visualization_hook.py b/visdet/engine/hooks/visualization_hook.py index 0b016d88..8ecee135 100644 --- a/visdet/engine/hooks/visualization_hook.py +++ b/visdet/engine/hooks/visualization_hook.py @@ -4,6 +4,7 @@ from collections.abc import Sequence import numpy as np +import torch from visdet.cv import imfrombytes, imwrite from visdet.engine.fileio import get @@ -57,7 +58,7 @@ def __init__( show: bool = False, wait_time: float = 0.0, test_out_dir: str | None = None, - backend_args: dict = None, + backend_args: dict | None = None, ): self._visualizer: Visualizer = Visualizer.get_current_instance() self.interval = interval @@ -83,8 +84,8 @@ def after_val_iter( self, runner: Runner, batch_idx: int, - data_batch: dict, - outputs: Sequence[DetDataSample], + data_batch: dict | tuple | list | None = None, + outputs: Sequence[DetDataSample] | None = None, ) -> None: """Run after every ``self.interval`` validation iterations. 
@@ -95,7 +96,7 @@ def after_val_iter( outputs (Sequence[:obj:`DetDataSample`]]): A batch of data samples that contain annotations and predictions. """ - if self.draw is False: + if self.draw is False or outputs is None: return # There is no guarantee that the same batch of images @@ -122,8 +123,8 @@ def after_test_iter( self, runner: Runner, batch_idx: int, - data_batch: dict, - outputs: Sequence[DetDataSample], + data_batch: dict | tuple | list | None = None, + outputs: Sequence[DetDataSample] | None = None, ) -> None: """Run after every testing iterations. @@ -134,7 +135,7 @@ def after_test_iter( outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples that contain annotations and predictions. """ - if self.draw is False: + if self.draw is False or outputs is None: return if self.test_out_dir is not None: @@ -200,8 +201,8 @@ def after_test_iter( self, runner: Runner, batch_idx: int, - data_batch: dict, - outputs: Sequence[DetDataSample], + data_batch: dict | tuple | list | None = None, + outputs: Sequence[DetDataSample] | None = None, ) -> None: """Run after every testing iterations. @@ -212,7 +213,7 @@ def after_test_iter( outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples that contain annotations and predictions. 
""" - if self.draw is False: + if self.draw is False or outputs is None: return if self.test_out_dir is not None: @@ -236,20 +237,32 @@ def after_test_iter( text = data_sample.text if isinstance(text, str): # VG gt_instances = data_sample.gt_instances + if gt_instances is None: + continue tokens_positive = data_sample.tokens_positive if "phrase_ids" in data_sample: # flickr30k gt_labels = data_sample.phrase_ids else: gt_labels = gt_instances.labels - gt_bboxes = gt_instances.get("bboxes", None) - if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes): - gt_instances.bboxes = gt_bboxes.tensor + gt_bboxes_raw = gt_instances.get("bboxes", None) + if gt_bboxes_raw is not None and isinstance(gt_bboxes_raw, BaseBoxes): + gt_instances.bboxes = gt_bboxes_raw.tensor + gt_bboxes = gt_bboxes_raw.tensor + else: + gt_bboxes = gt_bboxes_raw print(gt_labels, tokens_positive, gt_bboxes, img_path) pred_instances = data_sample.pred_instances + if pred_instances is None: + continue pred_instances = pred_instances[pred_instances.scores > self.score_thr] pred_labels = pred_instances.labels - pred_bboxes = pred_instances.bboxes + pred_bboxes_raw = pred_instances.bboxes + # Convert BaseBoxes to tensor + if isinstance(pred_bboxes_raw, BaseBoxes): + pred_bboxes = pred_bboxes_raw.tensor + else: + pred_bboxes = pred_bboxes_raw pred_scores = pred_instances.scores max_label = 0 @@ -311,6 +324,9 @@ def after_test_iter( self._visualizer.draw_bboxes(bbox, edge_colors=color, alpha=1) print(pred_labels, pred_bboxes, pred_scores, colors) areas = (pred_bboxes[:, 3] - pred_bboxes[:, 1]) * (pred_bboxes[:, 2] - pred_bboxes[:, 0]) + # Convert to numpy if it's a tensor + if isinstance(areas, torch.Tensor): + areas = areas.cpu().numpy() scales = _get_adaptive_scales(areas) score = [str(round(s.item(), 2)) for s in pred_scores] font_sizes = [int(13 * scales[i]) for i in range(len(scales))] @@ -347,7 +363,7 @@ def after_test_iter( if out_file is not None: imwrite(drawn_img[..., ::-1], out_file) else: 
- self.add_image("test_img", drawn_img, self._test_index) + self._visualizer.add_image("test_img", drawn_img, self._test_index) else: # OD self._visualizer.add_datasample( osp.basename(img_path) if self.show else "test_img", diff --git a/visdet/engine/optim/optimizer/builder.py b/visdet/engine/optim/optimizer/builder.py index 5f0ee77f..db42b40c 100644 --- a/visdet/engine/optim/optimizer/builder.py +++ b/visdet/engine/optim/optimizer/builder.py @@ -52,7 +52,7 @@ def register_dadaptation_optimizers() -> list[str]: """ dadaptation_optimizers = [] try: - import dadaptation + import dadaptation # type: ignore[import-untyped] except ImportError: pass else: @@ -75,7 +75,7 @@ def register_lion_optimizers() -> list[str]: """ optimizers = [] try: - from lion_pytorch import Lion + from lion_pytorch import Lion # type: ignore[import-untyped] except ImportError: pass else: @@ -95,7 +95,7 @@ def register_sophia_optimizers() -> list[str]: """ optimizers = [] try: - import Sophia + import Sophia # type: ignore[import-untyped] except ImportError: pass else: @@ -122,7 +122,7 @@ def register_bitsandbytes_optimizers() -> list[str]: """ dadaptation_optimizers = [] try: - import bitsandbytes as bnb + import bitsandbytes as bnb # type: ignore[import-untyped] except ImportError: # bitsandbytes is an optional dependency return dadaptation_optimizers @@ -150,7 +150,7 @@ def register_bitsandbytes_optimizers() -> list[str]: def register_transformers_optimizers(): transformer_optimizers = [] try: - from transformers import Adafactor + from transformers import Adafactor # type: ignore[import-untyped] except ImportError: pass else: diff --git a/visdet/engine/optim/optimizer/default_constructor.py b/visdet/engine/optim/optimizer/default_constructor.py index 9cdbbde2..8b827baf 100644 --- a/visdet/engine/optim/optimizer/default_constructor.py +++ b/visdet/engine/optim/optimizer/default_constructor.py @@ -271,7 +271,8 @@ def add_params( def __call__(self, model: nn.Module): # -> OptimWrapper: if 
hasattr(model, "module"): - model = model.module + # DistributedDataParallel wraps the model in a .module attribute + model = model.module # type: ignore[assignment] optim_wrapper_cfg = self.optim_wrapper_cfg.copy() optim_wrapper_cfg.setdefault("type", "OptimWrapper") diff --git a/visdet/engine/structures/__init__.py b/visdet/engine/structures/__init__.py index bf9532c0..98f214f9 100644 --- a/visdet/engine/structures/__init__.py +++ b/visdet/engine/structures/__init__.py @@ -1,5 +1,3 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. from visdet.engine.structures.base_data_element import BaseDataElement from visdet.engine.structures.instance_data import InstanceData diff --git a/visdet/engine/structures/base_data_element.py b/visdet/engine/structures/base_data_element.py index 58e6b38f..9d9eff01 100644 --- a/visdet/engine/structures/base_data_element.py +++ b/visdet/engine/structures/base_data_element.py @@ -1,13 +1,13 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. import copy from collections.abc import Iterator -from typing import Any +from typing import Any, TypeVar import numpy as np import torch +T = TypeVar("T", bound="BaseDataElement") + class BaseDataElement: """A base data interface that supports Tensor-like and dict-like @@ -210,16 +210,16 @@ class BaseDataElement: ... 
det_sample.proposals = torch.rand((5, 4)) """ - def __init__(self, *, metainfo: dict | None = None, **kwargs) -> None: - self._metainfo_fields: set = set() - self._data_fields: set = set() + def __init__(self, *, metainfo: dict[str, Any] | None = None, **kwargs: Any) -> None: + self._metainfo_fields: set[str] = set() + self._data_fields: set[str] = set() if metainfo is not None: self.set_metainfo(metainfo=metainfo) if kwargs: self.set_data(kwargs) - def set_metainfo(self, metainfo: dict) -> None: + def set_metainfo(self, metainfo: dict[str, Any]) -> None: """Set or change key-value pairs in ``metainfo_field`` by parameter ``metainfo``. @@ -232,7 +232,7 @@ def set_metainfo(self, metainfo: dict) -> None: for k, v in meta.items(): self.set_field(name=k, value=v, field_type="metainfo", dtype=None) - def set_data(self, data: dict) -> None: + def set_data(self, data: dict[str, Any]) -> None: """Set or change key-value pairs in ``data_field`` by parameter ``data``. @@ -258,7 +258,7 @@ def update(self, instance: "BaseDataElement") -> None: self.set_metainfo(dict(instance.metainfo_items())) self.set_data(dict(instance.items())) - def new(self, *, metainfo: dict | None = None, **kwargs) -> "BaseDataElement": + def new(self: T, *, metainfo: dict[str, Any] | None = None, **kwargs: Any) -> T: """Return a new data element with same type. If ``metainfo`` and ``data`` are None, the new data element will have same metainfo and data. If metainfo or data is not None, the new result will overwrite it @@ -284,9 +284,9 @@ def new(self, *, metainfo: dict | None = None, **kwargs) -> "BaseDataElement": new_data.set_data(kwargs) else: new_data.set_data(dict(self.items())) - return new_data + return new_data # type: ignore[return-value] - def clone(self): + def clone(self: T) -> T: """Deep copy the current data element. 
Returns: @@ -295,9 +295,9 @@ def clone(self): clone_data = self.__class__() clone_data.set_metainfo(dict(self.metainfo_items())) clone_data.set_data(dict(self.items())) - return clone_data + return clone_data # type: ignore[return-value] - def keys(self) -> list: + def keys(self) -> list[str]: """ Returns: list: Contains all keys in data_fields. @@ -309,35 +309,35 @@ def keys(self) -> list: private_keys = {"_" + key for key in self._data_fields if isinstance(getattr(type(self), key, None), property)} return list(self._data_fields - private_keys) - def metainfo_keys(self) -> list: + def metainfo_keys(self) -> list[str]: """ Returns: list: Contains all keys in metainfo_fields. """ return list(self._metainfo_fields) - def values(self) -> list: + def values(self) -> list[Any]: """ Returns: list: Contains all values in data. """ return [getattr(self, k) for k in self.keys()] - def metainfo_values(self) -> list: + def metainfo_values(self) -> list[Any]: """ Returns: list: Contains all values in metainfo. """ return [getattr(self, k) for k in self.metainfo_keys()] - def all_keys(self) -> list: + def all_keys(self) -> list[str]: """ Returns: list: Contains all keys in metainfo and data. """ return self.metainfo_keys() + self.keys() - def all_values(self) -> list: + def all_values(self) -> list[Any]: """ Returns: list: Contains all values in metainfo and data. @@ -372,7 +372,7 @@ def metainfo_items(self) -> Iterator[tuple[str, Any]]: yield (k, getattr(self, k)) @property - def metainfo(self) -> dict: + def metainfo(self) -> dict[str, Any]: """dict: A dict contains metainfo of current data element.""" return dict(self.metainfo_items()) @@ -403,31 +403,35 @@ def __delattr__(self, item: str): # dict-like methods __delitem__ = __delattr__ - def get(self, key, default=None) -> Any: + def get(self, key: str, default: Any = None) -> Any: """Get property in data and metainfo as the same as python.""" # Use `getattr()` rather than `self.__dict__.get()` to allow getting # properties. 
return getattr(self, key, default) - def pop(self, *args) -> Any: + def pop(self, key: str, default: Any = ...) -> Any: """Pop property in data and metainfo as the same as python.""" - assert len(args) < 3, "``pop`` get more than 2 arguments" - name = args[0] - if name in self._metainfo_fields: - self._metainfo_fields.remove(args[0]) - return self.__dict__.pop(*args) + if key in self._metainfo_fields: + self._metainfo_fields.remove(key) + if default is ...: + return self.__dict__.pop(key) + else: + return self.__dict__.pop(key, default) - elif name in self._data_fields: - self._data_fields.remove(args[0]) - return self.__dict__.pop(*args) + elif key in self._data_fields: + self._data_fields.remove(key) + if default is ...: + return self.__dict__.pop(key) + else: + return self.__dict__.pop(key, default) # with default value - elif len(args) == 2: - return args[1] + elif default is not ...: + return default else: # don't just use 'self.__dict__.pop(*args)' for only popping key in # metainfo or data - raise KeyError(f"{args[0]} is not contained in metainfo or data") + raise KeyError(f"{key} is not contained in metainfo or data") def __contains__(self, item: str) -> bool: """Whether the item is in dataelement. 
@@ -465,7 +469,7 @@ def set_field( super().__setattr__(name, value) # Tensor-like methods - def to(self, *args, **kwargs) -> "BaseDataElement": + def to(self: T, *args: Any, **kwargs: Any) -> T: """Apply same name function to all tensors in data_fields.""" new_data = self.new() for k, v in self.items(): @@ -476,7 +480,7 @@ def to(self, *args, **kwargs) -> "BaseDataElement": return new_data # Tensor-like methods - def cpu(self) -> "BaseDataElement": + def cpu(self: T) -> T: """Convert all tensors to CPU in data.""" new_data = self.new() for k, v in self.items(): @@ -487,7 +491,7 @@ def cpu(self) -> "BaseDataElement": return new_data # Tensor-like methods - def cuda(self) -> "BaseDataElement": + def cuda(self: T) -> T: """Convert all tensors to GPU in data.""" new_data = self.new() for k, v in self.items(): @@ -498,60 +502,83 @@ def cuda(self) -> "BaseDataElement": return new_data # Tensor-like methods - def musa(self) -> "BaseDataElement": + def musa(self: T) -> T: """Convert all tensors to musa in data.""" new_data = self.new() for k, v in self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.musa() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + if hasattr(v, "musa"): + v = v.musa() # type: ignore[attr-defined] + data = {k: v} + new_data.set_data(data) return new_data # Tensor-like methods - def npu(self) -> "BaseDataElement": + def npu(self: T) -> T: """Convert all tensors to NPU in data.""" new_data = self.new() for k, v in self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.npu() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + if hasattr(v, "npu"): + v = v.npu() # type: ignore[attr-defined] + data = {k: v} + new_data.set_data(data) return new_data - def mlu(self) -> "BaseDataElement": + def mlu(self: T) -> T: """Convert all tensors to MLU in data.""" new_data = self.new() for k, v in 
self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.mlu() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + if hasattr(v, "mlu"): + v = v.mlu() # type: ignore[attr-defined] + data = {k: v} + new_data.set_data(data) return new_data # Tensor-like methods - def detach(self) -> "BaseDataElement": + def detach(self: T) -> T: """Detach all tensors in data.""" new_data = self.new() for k, v in self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.detach() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + v = v.detach() # type: ignore[misc] + data = {k: v} + new_data.set_data(data) return new_data # Tensor-like methods - def numpy(self) -> "BaseDataElement": + def numpy(self: T) -> T: """Convert all tensors to np.ndarray in data.""" new_data = self.new() for k, v in self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.detach().cpu().numpy() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + v = v.detach().cpu().numpy() # type: ignore[misc] + data = {k: v} + new_data.set_data(data) return new_data - def to_tensor(self) -> "BaseDataElement": + def to_tensor(self: T) -> T: """Convert all np.ndarray to tensor in data.""" new_data = self.new() for k, v in self.items(): @@ -565,7 +592,7 @@ def to_tensor(self) -> "BaseDataElement": new_data.set_data(data) return new_data - def to_dict(self) -> dict: + def to_dict(self) -> dict[str, Any]: """Convert BaseDataElement to dict.""" return {k: v.to_dict() if isinstance(v, BaseDataElement) else v for k, v in self.all_items()} diff --git a/visdet/engine/structures/instance_data.py b/visdet/engine/structures/instance_data.py index ad2a2c2f..f93fd78a 100644 --- a/visdet/engine/structures/instance_data.py +++ b/visdet/engine/structures/instance_data.py @@ -1,34 +1,35 @@ -# ruff: noqa -# type: ignore # 
Copyright (c) OpenMMLab. All rights reserved. import itertools from collections.abc import Sized -from typing import Any, Union +from typing import TYPE_CHECKING, Any, Union, overload import numpy as np import torch from visdet.engine.device import get_device - from visdet.engine.structures.base_data_element import BaseDataElement -BoolTypeTensor: Any -LongTypeTensor: Any +if TYPE_CHECKING: + from visdet.structures.bbox import BaseBoxes + from visdet.structures.mask import BitmapMasks, PolygonMasks + +BoolTypeTensor: type[torch.Tensor] +LongTypeTensor: type[torch.Tensor] if get_device() == "npu": - BoolTypeTensor = Union[torch.BoolTensor, torch.npu.BoolTensor] - LongTypeTensor = Union[torch.LongTensor, torch.npu.LongTensor] + BoolTypeTensor = Union[torch.BoolTensor, torch.npu.BoolTensor] # type: ignore[misc,assignment,name-defined] + LongTypeTensor = Union[torch.LongTensor, torch.npu.LongTensor] # type: ignore[misc,assignment,name-defined] elif get_device() == "mlu": - BoolTypeTensor = Union[torch.BoolTensor, torch.mlu.BoolTensor] - LongTypeTensor = Union[torch.LongTensor, torch.mlu.LongTensor] + BoolTypeTensor = Union[torch.BoolTensor, torch.mlu.BoolTensor] # type: ignore[misc,assignment,name-defined] + LongTypeTensor = Union[torch.LongTensor, torch.mlu.LongTensor] # type: ignore[misc,assignment,name-defined] elif get_device() == "musa": - BoolTypeTensor = Union[torch.BoolTensor, torch.musa.BoolTensor] - LongTypeTensor = Union[torch.LongTensor, torch.musa.LongTensor] + BoolTypeTensor = Union[torch.BoolTensor, torch.musa.BoolTensor] # type: ignore[misc,assignment,name-defined] + LongTypeTensor = Union[torch.LongTensor, torch.musa.LongTensor] # type: ignore[misc,assignment,name-defined] else: - BoolTypeTensor = Union[torch.BoolTensor, torch.cuda.BoolTensor] - LongTypeTensor = Union[torch.LongTensor, torch.cuda.LongTensor] + BoolTypeTensor = Union[torch.BoolTensor, torch.cuda.BoolTensor] # type: ignore[misc,assignment,name-defined] + LongTypeTensor = 
Union[torch.LongTensor, torch.cuda.LongTensor] # type: ignore[misc,assignment,name-defined] -IndexType: Any = Union[str, slice, int, list, LongTypeTensor, BoolTypeTensor, np.ndarray] +IndexType = Union[str, slice, int, list[int], torch.Tensor, np.ndarray] # Modified from @@ -169,7 +170,6 @@ def __getitem__(self, item: IndexType) -> "InstanceData": Returns: :obj:`InstanceData`: Corresponding values. """ - assert isinstance(item, IndexType.__args__) if isinstance(item, list): item = np.array(item) if isinstance(item, np.ndarray): @@ -181,10 +181,10 @@ def __getitem__(self, item: IndexType) -> "InstanceData": item = torch.from_numpy(item) if isinstance(item, str): - return getattr(self, item) + return getattr(self, item) # type: ignore[return-value] if isinstance(item, int): - if item >= len(self) or item < -len(self): # type:ignore + if item >= len(self) or item < -len(self): raise IndexError(f"Index {item} out of range!") else: # keep the dimension @@ -193,7 +193,9 @@ def __getitem__(self, item: IndexType) -> "InstanceData": new_data = self.__class__(metainfo=self.metainfo) if isinstance(item, torch.Tensor): assert item.dim() == 1, "Only support to get the values along the first dimension." 
- if isinstance(item, BoolTypeTensor.__args__): + # Check if it's a boolean tensor + is_bool_tensor = item.dtype == torch.bool + if is_bool_tensor: assert len(item) == len(self), ( "The shape of the " "input(BoolTensor) " @@ -212,14 +214,14 @@ def __getitem__(self, item: IndexType) -> "InstanceData": new_data[k] = v[item.cpu().numpy()] elif isinstance(v, str | list | tuple) or (hasattr(v, "__getitem__") and hasattr(v, "cat")): # convert to indexes from BoolTensor - if isinstance(item, BoolTypeTensor.__args__): + if is_bool_tensor: indexes = torch.nonzero(item).view(-1).cpu().numpy().tolist() else: indexes = item.cpu().numpy().tolist() slice_list = [] if indexes: for index in indexes: - slice_list.append(slice(index, None, len(v))) + slice_list.append(slice(index, None, len(v))) # type: ignore[arg-type] else: slice_list.append(slice(None, 0, None)) r_list = [v[s] for s in slice_list] @@ -228,7 +230,7 @@ def __getitem__(self, item: IndexType) -> "InstanceData": for r in r_list[1:]: new_value = new_value + r else: - new_value = v.cat(r_list) + new_value = v.cat(r_list) # type: ignore[attr-defined] new_data[k] = new_value else: raise ValueError( @@ -239,7 +241,7 @@ def __getitem__(self, item: IndexType) -> "InstanceData": # item is a slice for k, v in self.items(): new_data[k] = v[item] - return new_data # type:ignore + return new_data @staticmethod def cat(instances_list: list["InstanceData"]) -> "InstanceData": @@ -276,22 +278,24 @@ def cat(instances_list: list["InstanceData"]) -> "InstanceData": new_data = instances_list[0].__class__(metainfo=instances_list[0].metainfo) for k in instances_list[0].keys(): - values = [results[k] for results in instances_list] + values: list[Any] = [results[k] for results in instances_list] v0 = values[0] - if isinstance(v0, torch.Tensor): - new_values = torch.cat(values, dim=0) - elif isinstance(v0, np.ndarray): - new_values = np.concatenate(values, axis=0) - elif isinstance(v0, str | list | tuple): + new_values: Any + # Use explicit 
type checking instead of isinstance to avoid mypy narrowing issues + if type(v0).__name__ == "Tensor" or isinstance(v0, torch.Tensor): + new_values = torch.cat(values, dim=0) # type: ignore[arg-type] + elif type(v0).__name__ == "ndarray" or isinstance(v0, np.ndarray): + new_values = np.concatenate(values, axis=0) # type: ignore[arg-type] + elif isinstance(v0, (str, list, tuple)): new_values = v0[:] for v in values[1:]: - new_values += v + new_values += v # type: ignore[operator] elif hasattr(v0, "cat"): - new_values = v0.cat(values) + new_values = v0.cat(values) # type: ignore[attr-defined] else: raise ValueError(f"The type of `{k}` is `{type(v0)}` which has no attribute of `cat`") new_data[k] = new_values - return new_data # type:ignore + return new_data def __len__(self) -> int: """int: The length of InstanceData.""" @@ -299,3 +303,14 @@ def __len__(self) -> int: return len(self.values()[0]) else: return 0 + + # Provide type hints for commonly accessed dynamic attributes + if TYPE_CHECKING: + # These are the most commonly accessed attributes in visualization code + bboxes: torch.Tensor | "BaseBoxes" + labels: torch.Tensor + scores: torch.Tensor + masks: torch.Tensor | "BitmapMasks" | "PolygonMasks" + label_names: list[str] + priors: torch.Tensor # Used in dense heads for anchor-based detection + level_ids: torch.Tensor # Used to track which FPN level each instance belongs to diff --git a/visdet/engine/structures/label_data.py b/visdet/engine/structures/label_data.py index a21f2424..a8bf9407 100644 --- a/visdet/engine/structures/label_data.py +++ b/visdet/engine/structures/label_data.py @@ -1,5 +1,3 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
import torch diff --git a/visdet/engine/structures/pixel_data.py b/visdet/engine/structures/pixel_data.py index dbf4de61..0a24f8cc 100644 --- a/visdet/engine/structures/pixel_data.py +++ b/visdet/engine/structures/pixel_data.py @@ -1,8 +1,7 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. import warnings from collections.abc import Sequence +from typing import TYPE_CHECKING, Any, overload import numpy as np import torch @@ -95,10 +94,12 @@ def __getitem__(self, item: Sequence[int | slice]) -> "PixelData": new_data = self.__class__(metainfo=self.metainfo) if isinstance(item, tuple): assert len(item) == 2, "Only support to slice height and width" + shape = self.shape + assert shape is not None, "Cannot slice PixelData with no data fields" tmp_item: list[slice] = [] for index, single_item in enumerate(item[::-1]): if isinstance(single_item, int): - tmp_item.insert(0, slice(single_item, None, self.shape[-index - 1])) + tmp_item.insert(0, slice(single_item, None, shape[-index - 1])) elif isinstance(single_item, slice): tmp_item.insert(0, single_item) else: @@ -112,11 +113,19 @@ def __getitem__(self, item: Sequence[int | slice]) -> "PixelData": return new_data @property - def shape(self): + def shape(self) -> tuple[int, int] | None: """The shape of pixel data.""" if len(self._data_fields) > 0: - return tuple(self.values()[0].shape[-2:]) + first_value = self.values()[0] + if isinstance(first_value, (torch.Tensor, np.ndarray)): + return tuple(first_value.shape[-2:]) # type: ignore[return-value] + return None else: return None + # Provide specific type hints for common attributes + if TYPE_CHECKING: + # These are the most commonly accessed attributes in visualization code + sem_seg: torch.Tensor | np.ndarray + # TODO padding, resize diff --git a/visdet/engine/utils/dl_utils/collect_env.py b/visdet/engine/utils/dl_utils/collect_env.py index 1558618b..ab51157a 100644 --- a/visdet/engine/utils/dl_utils/collect_env.py +++ 
b/visdet/engine/utils/dl_utils/collect_env.py @@ -38,7 +38,7 @@ def collect_env(): cuda_available = is_cuda_available() env_info["CUDA available"] = cuda_available - env_info["numpy_random_seed"] = np.random.get_state()[1][0] + env_info["numpy_random_seed"] = np.random.get_state()[1][0] # type: ignore[misc] if cuda_available: devices = defaultdict(list) @@ -48,7 +48,7 @@ def collect_env(): env_info["GPU " + ",".join(device_ids)] = name env_info["PyTorch"] = torch.__version__ - env_info["TorchVision"] = torchvision.__version__ + env_info["TorchVision"] = torchvision.__version__ # type: ignore[attr-defined] env_info["OpenCV"] = cv2.__version__ env_info["VisEngine"] = visengine_version diff --git a/visdet/engine/visualization/visualizer.py b/visdet/engine/visualization/visualizer.py index e3ea53b0..ba5e3caf 100644 --- a/visdet/engine/visualization/visualizer.py +++ b/visdet/engine/visualization/visualizer.py @@ -197,7 +197,7 @@ def __init__( if ( save_dir_arg is not None and save_dir_arg.default is save_dir_arg.empty - and vis_backend._save_dir is None + and getattr(vis_backend, "_save_dir", None) is None ): warnings.warn(f"Failed to add {vis_backend.__class__}, please provide the `save_dir` argument.") continue @@ -271,7 +271,7 @@ def show( # Find a better way for inline to show the image if is_inline: - return fig + return fig # type: ignore[return-value] wait_continue(fig, timeout=wait_time, continue_key=continue_key) elif backend == "cv2": # Keep images are shown in the same window, and the title of window @@ -383,7 +383,7 @@ def _is_posion_valid(self, position: np.ndarray) -> bool: Returns: bool: Whether the position is in image. 
""" - flag = ( + flag = bool( (position[..., 0] < self.width).all() and (position[..., 0] >= 0).all() and (position[..., 1] < self.height).all() @@ -694,7 +694,7 @@ def draw_circles( face_colors = color_val_matplotlib(face_colors) # type: ignore circles = [] for i in range(len(center)): - circles.append(Circle(tuple(center[i]), radius[i])) + circles.append(Circle(tuple(center[i]), float(radius[i]))) if isinstance(line_widths, (int, float)): line_widths = [line_widths] * len(circles) @@ -776,7 +776,7 @@ def draw_bboxes( ).reshape(-1, 4, 2) poly = [p for p in poly] return self.draw_polygons( - poly, + poly, # type: ignore[arg-type] alpha=alpha, edge_colors=edge_colors, line_styles=line_styles, @@ -833,12 +833,14 @@ def draw_polygons( polygons = [polygons] if isinstance(polygons, list): for polygon in polygons: - assert polygon.shape[1] == 2, ( - f"The shape of each polygon in `polygons` should be (M, 2), but got {polygon.shape}" - ) + # Type narrowing: polygon is either np.ndarray or torch.Tensor here + if isinstance(polygon, (np.ndarray, torch.Tensor)): + assert polygon.shape[1] == 2, ( # type: ignore + f"The shape of each polygon in `polygons` should be (M, 2), but got {polygon.shape}" + ) polygons = [tensor2ndarray(polygon) for polygon in polygons] for polygon in polygons: - if not self._is_posion_valid(polygon): + if not self._is_posion_valid(tensor2ndarray(polygon)): warnings.warn( "Warning: The polygon is out of bounds, the drawn polygon may not be in the image", UserWarning, @@ -914,7 +916,7 @@ def draw_binary_masks( for channel in color: assert 0 <= channel <= 255 # type: ignore - if isinstance(alphas, float): + if isinstance(alphas, (int, float)): alphas = [alphas] * binary_mask_len for binary_mask, color, alpha in zip(binary_masks, colors, alphas, strict=False): @@ -991,7 +993,7 @@ def draw_featmap( assert isinstance(featmap, torch.Tensor), f"`featmap` should be torch.Tensor, but got {type(featmap)}" assert featmap.ndim == 3, f"Input dimension must be 3, 
but got {featmap.ndim}" - featmap = featmap.detach().cpu() + featmap = featmap.detach().cpu() # type: ignore[misc] if overlaid_image is not None: if overlaid_image.ndim == 2: @@ -1062,7 +1064,7 @@ def draw_featmap( axes.axis("off") axes.text(2, 15, f"channel: {indices[i]}", fontsize=10) axes.imshow(convert_overlay_heatmap(topk_featmap[i], overlaid_image, alpha)) - image = img_from_canvas(fig.canvas) + image = img_from_canvas(fig.canvas) # type: ignore[arg-type] plt.close(fig) return image @@ -1131,14 +1133,15 @@ def add_scalars(self, scalar_dict: dict, step: int = 0, file_path: str | None = @master_only def add_datasample( self, - name, + name: str, image: np.ndarray, data_sample: Optional["BaseDataElement"] = None, draw_gt: bool = True, draw_pred: bool = True, show: bool = False, - wait_time: int = 0, + wait_time: int | float = 0, step: int = 0, + **kwargs, # Allow subclasses to add extra arguments like pred_score_thr, out_file, etc. ) -> None: """Draw datasample.""" pass @@ -1181,5 +1184,8 @@ def get_instance(cls, name: str, **kwargs) -> "Visualizer": object: Corresponding name instance. 
""" instance = super().get_instance(name, **kwargs) - Visualizer._instance_dict[name] = instance + # Store instance in the class-level dict for get_current_instance() + if not hasattr(Visualizer, "_instance_dict"): + Visualizer._instance_dict = {} # type: ignore[attr-defined] + Visualizer._instance_dict[name] = instance # type: ignore[attr-defined,index,assignment] return instance diff --git a/visdet/evaluation/metrics/coco_metric.py b/visdet/evaluation/metrics/coco_metric.py index d19a390d..4479d7e0 100644 --- a/visdet/evaluation/metrics/coco_metric.py +++ b/visdet/evaluation/metrics/coco_metric.py @@ -78,8 +78,8 @@ def __init__( metric_items: Sequence[str] | None = None, format_only: bool = False, outfile_prefix: str | None = None, - file_client_args: dict = None, - backend_args: dict = None, + file_client_args: dict | None = None, + backend_args: dict | None = None, collect_device: str = "cpu", prefix: str | None = None, sort_categories: bool = False, @@ -106,7 +106,7 @@ def __init__( # iou_thrs used to compute recall or precision. 
if iou_thrs is None: iou_thrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True) - self.iou_thrs = iou_thrs + self.iou_thrs: Sequence[float] = iou_thrs if isinstance(iou_thrs, Sequence) else [iou_thrs] self.metric_items = metric_items self.format_only = format_only if self.format_only: @@ -143,8 +143,8 @@ def __init__( self._coco_api = None # handle dataset lazy init - self.cat_ids = None - self.img_ids = None + self.cat_ids: list[int] | None = None + self.img_ids: list[int] | None = None def fast_eval_recall( self, @@ -167,6 +167,8 @@ def fast_eval_recall( """ gt_bboxes = [] pred_bboxes = [result["bboxes"] for result in results] + assert self.img_ids is not None, "img_ids must be initialized" + assert self._coco_api is not None, "coco_api must be initialized" for i in range(len(self.img_ids)): ann_ids = self._coco_api.get_ann_ids(img_ids=self.img_ids[i]) ann_info = self._coco_api.load_anns(ann_ids) @@ -240,6 +242,7 @@ def results2json(self, results: Sequence[dict], outfile_prefix: str) -> dict: data["image_id"] = image_id data["bbox"] = self.xyxy2xywh(bboxes[i]) data["score"] = float(scores[i]) + assert self.cat_ids is not None, "cat_ids must be initialized" data["category_id"] = self.cat_ids[label] bbox_json_results.append(data) @@ -254,6 +257,7 @@ def results2json(self, results: Sequence[dict], outfile_prefix: str) -> dict: data["image_id"] = image_id data["bbox"] = self.xyxy2xywh(bboxes[i]) data["score"] = float(mask_scores[i]) + assert self.cat_ids is not None, "cat_ids must be initialized" data["category_id"] = self.cat_ids[label] if isinstance(masks[i]["counts"], bytes): masks[i]["counts"] = masks[i]["counts"].decode() @@ -282,6 +286,7 @@ def gt_to_coco_json(self, gt_dicts: Sequence[dict], outfile_prefix: str) -> str: Returns: str: The filename of the json file. 
""" + assert self.dataset_meta is not None, "dataset_meta must be initialized" categories = [dict(id=id, name=name) for id, name in enumerate(self.dataset_meta["classes"])] image_infos = [] annotations = [] @@ -411,6 +416,8 @@ def compute_metrics(self, results: list) -> dict[str, float]: self._coco_api = COCO(coco_json_path) # handle lazy init + assert self._coco_api is not None, "coco_api must be initialized" + assert self.dataset_meta is not None, "dataset_meta must be initialized" if self.cat_ids is None: self.cat_ids = self._coco_api.get_cat_ids(cat_names=self.dataset_meta["classes"]) if self.img_ids is None: @@ -516,9 +523,14 @@ def compute_metrics(self, results: list) -> dict[str, float]: if self.classwise: # Compute per-category AP # Compute per-category AP # from https://github.com/facebookresearch/detectron2/ - precisions = coco_eval.eval["precision"] + precisions_raw = coco_eval.eval["precision"] # precision: (iou, recall, cls, area range, max dets) - assert len(self.cat_ids) == precisions.shape[2] + assert isinstance(precisions_raw, np.ndarray), "precisions must be ndarray" + precisions: np.ndarray = precisions_raw + assert self.cat_ids is not None, "cat_ids must be initialized" + # Type narrowing for ndarray shape attribute + precisions_shape: tuple[int, ...] = precisions.shape # type: ignore[assignment] + assert len(self.cat_ids) == precisions_shape[2] results_per_category = [] for idx, cat_id in enumerate(self.cat_ids): diff --git a/visdet/models/backbones/hrnet.py b/visdet/models/backbones/hrnet.py index 902a5e85..f988d374 100644 --- a/visdet/models/backbones/hrnet.py +++ b/visdet/models/backbones/hrnet.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from typing import Any import torch.nn as nn from torch.nn.modules.batchnorm import _BatchNorm @@ -17,6 +18,17 @@ class HRModule(BaseModule): is in this module. 
""" + block_init_cfg: dict[str, Any] | None + in_channels: list[int] + num_branches: int + multiscale_output: bool + norm_cfg: dict[str, Any] + conv_cfg: dict[str, Any] | None + with_cp: bool + branches: ModuleList + fuse_layers: nn.ModuleList | None + relu: nn.ReLU + def __init__( self, num_branches, @@ -32,16 +44,16 @@ def __init__( init_cfg=None, ): super(HRModule, self).__init__(init_cfg) - self.block_init_cfg = block_init_cfg + self.block_init_cfg = block_init_cfg # type: ignore[unresolved-attribute] self._check_branches(num_branches, num_blocks, in_channels, num_channels) self.in_channels = in_channels self.num_branches = num_branches - self.multiscale_output = multiscale_output - self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg - self.with_cp = with_cp + self.multiscale_output = multiscale_output # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] self.branches = self._make_branches(num_branches, blocks, num_blocks, num_channels) self.fuse_layers = self._make_fuse_layers() self.relu = nn.ReLU(inplace=False) @@ -187,6 +199,7 @@ def forward(self, x): x[i] = self.branches[i](x[i]) x_fuse = [] + assert self.fuse_layers is not None for i in range(len(self.fuse_layers)): y = 0 for j in range(self.num_branches): @@ -276,6 +289,30 @@ class HRNet(BaseModule): blocks_dict = {"BASIC": BasicBlock, "BOTTLENECK": Bottleneck} + pretrained: str | None + extra: dict[str, Any] + conv_cfg: dict[str, Any] | None + norm_cfg: dict[str, Any] + norm_eval: bool + with_cp: bool + zero_init_residual: bool + norm1_name: str + norm2_name: str + conv1: nn.Module + conv2: nn.Module + relu: nn.ReLU + layer1: Sequential + stage1_cfg: dict[str, Any] + stage2_cfg: dict[str, Any] + stage3_cfg: dict[str, Any] + stage4_cfg: dict[str, Any] + transition1: nn.ModuleList + transition2: nn.ModuleList + transition3: 
nn.ModuleList + stage2: Sequential + stage3: Sequential + stage4: Sequential + def __init__( self, extra, @@ -291,14 +328,14 @@ def __init__( ): super(HRNet, self).__init__(init_cfg) - self.pretrained = pretrained + self.pretrained = pretrained # type: ignore[unresolved-attribute] assert not (init_cfg and pretrained), "init_cfg and pretrained cannot be specified at the same time" if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, please use "init_cfg" instead') - self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) + self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) # type: ignore[unresolved-attribute] elif pretrained is None: if init_cfg is None: - self.init_cfg = [ + self.init_cfg = [ # type: ignore[unresolved-attribute] dict(type="Kaiming", layer="Conv2d"), dict(type="Constant", val=1, layer=["_BatchNorm", "GroupNorm"]), ] @@ -314,15 +351,15 @@ def __init__( assert len(cfg["num_blocks"]) == cfg["num_branches"] and len(cfg["num_channels"]) == cfg["num_branches"] self.extra = extra - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - self.zero_init_residual = zero_init_residual + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.norm_eval = norm_eval # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.zero_init_residual = zero_init_residual # type: ignore[unresolved-attribute] # stem net - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) - self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) # type: ignore[unresolved-attribute] + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( self.conv_cfg, diff --git 
a/visdet/models/backbones/regnet.py b/visdet/models/backbones/regnet.py index 546008de..84cf0f59 100644 --- a/visdet/models/backbones/regnet.py +++ b/visdet/models/backbones/regnet.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from typing import Any import numpy as np import torch.nn as nn @@ -79,6 +80,11 @@ class RegNet(ResNet): "regnetx_12gf": dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), } + bottleneck_ratio: list[float] + stage_widths: list[int] + group_widths: list[int] + plugins: Any + def __init__( self, arch, @@ -122,41 +128,41 @@ def __init__( stage_widths, stage_blocks = self.get_stages_from_blocks(widths) # Generate group widths and bot muls group_widths = [arch["group_w"] for _ in range(num_stages)] - self.bottleneck_ratio = [arch["bot_mul"] for _ in range(num_stages)] + self.bottleneck_ratio = [arch["bot_mul"] for _ in range(num_stages)] # type: ignore[unresolved-attribute] # Adjust the compatibility of stage_widths and group_widths stage_widths, group_widths = self.adjust_width_group(stage_widths, self.bottleneck_ratio, group_widths) # Group params by stage - self.stage_widths = stage_widths - self.group_widths = group_widths - self.depth = sum(stage_blocks) - self.stem_channels = stem_channels - self.base_channels = base_channels - self.num_stages = num_stages + self.stage_widths = stage_widths # type: ignore[unresolved-attribute] + self.group_widths = group_widths # type: ignore[unresolved-attribute] + self.depth = sum(stage_blocks) # type: ignore[unresolved-attribute] + self.stem_channels = stem_channels # type: ignore[unresolved-attribute] + self.base_channels = base_channels # type: ignore[unresolved-attribute] + self.num_stages = num_stages # type: ignore[unresolved-attribute] assert num_stages >= 1 and num_stages <= 4 - self.strides = strides - self.dilations = dilations + self.strides = strides # type: ignore[unresolved-attribute] + self.dilations = dilations # type: 
ignore[unresolved-attribute] assert len(strides) == len(dilations) == num_stages - self.out_indices = out_indices + self.out_indices = out_indices # type: ignore[unresolved-attribute] assert max(out_indices) < num_stages - self.style = style - self.deep_stem = deep_stem - self.avg_down = avg_down - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - self.norm_eval = norm_eval - self.dcn = dcn - self.stage_with_dcn = stage_with_dcn + self.style = style # type: ignore[unresolved-attribute] + self.deep_stem = deep_stem # type: ignore[unresolved-attribute] + self.avg_down = avg_down # type: ignore[unresolved-attribute] + self.frozen_stages = frozen_stages # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.norm_eval = norm_eval # type: ignore[unresolved-attribute] + self.dcn = dcn # type: ignore[unresolved-attribute] + self.stage_with_dcn = stage_with_dcn # type: ignore[unresolved-attribute] if dcn is not None: assert len(stage_with_dcn) == num_stages - self.plugins = plugins - self.zero_init_residual = zero_init_residual - self.block = Bottleneck + self.plugins = plugins # type: ignore[unresolved-attribute] + self.zero_init_residual = zero_init_residual # type: ignore[unresolved-attribute] + self.block = Bottleneck # type: ignore[unresolved-attribute] expansion_bak = self.block.expansion self.block.expansion = 1 - self.stage_blocks = stage_blocks[:num_stages] + self.stage_blocks = stage_blocks[:num_stages] # type: ignore[unresolved-attribute] self._make_stem_layer(in_channels, stem_channels) @@ -164,10 +170,10 @@ def __init__( assert not (init_cfg and pretrained), "init_cfg and pretrained cannot be specified at the same time" if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, please 
use "init_cfg" instead') - self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) + self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) # type: ignore[unresolved-attribute] elif pretrained is None: if init_cfg is None: - self.init_cfg = [ + self.init_cfg = [ # type: ignore[unresolved-attribute] dict(type="Kaiming", layer="Conv2d"), dict(type="Constant", val=1, layer=["_BatchNorm", "GroupNorm"]), ] @@ -176,8 +182,8 @@ def __init__( else: raise TypeError("pretrained must be a str or None") - self.inplanes = stem_channels - self.res_layers = [] + self.inplanes = stem_channels # type: ignore[unresolved-attribute] + self.res_layers = [] # type: ignore[unresolved-attribute] for i, num_blocks in enumerate(self.stage_blocks): stride = self.strides[i] dilation = self.dilations[i] @@ -187,7 +193,7 @@ def __init__( dcn = self.dcn if self.stage_with_dcn[i] else None if self.plugins is not None: - stage_plugins = self.make_stage_plugins(self.plugins, i) + stage_plugins = self.make_stage_plugins(self.plugins, i) # type: ignore[call-non-callable] else: stage_plugins = None @@ -210,14 +216,14 @@ def __init__( base_channels=self.stage_widths[i], init_cfg=block_init_cfg, ) - self.inplanes = self.stage_widths[i] + self.inplanes = self.stage_widths[i] # type: ignore[unresolved-attribute] layer_name = f"layer{i + 1}" self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) self._freeze_stages() - self.feat_dim = stage_widths[-1] + self.feat_dim = stage_widths[-1] # type: ignore[unresolved-attribute] self.block.expansion = expansion_bak def _make_stem_layer(self, in_channels, base_channels): @@ -230,7 +236,7 @@ def _make_stem_layer(self, in_channels, base_channels): padding=1, bias=False, ) - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, base_channels, postfix=1) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, base_channels, postfix=1) # type: ignore[unresolved-attribute] self.add_module(self.norm1_name, norm1) self.relu = 
nn.ReLU(inplace=True) diff --git a/visdet/models/backbones/res2net.py b/visdet/models/backbones/res2net.py index bb1c2e08..e9815aa0 100644 --- a/visdet/models/backbones/res2net.py +++ b/visdet/models/backbones/res2net.py @@ -42,8 +42,8 @@ def __init__( assert scales > 1, "Res2Net degenerates to ResNet when scales = 1." width = int(math.floor(self.planes * (base_width / 64))) - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width * scales, postfix=1) - self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width * scales, postfix=1) # type: ignore[unresolved-attribute] + self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( self.conv_cfg, @@ -87,9 +87,9 @@ def __init__( ) self.add_module(self.norm3_name, norm3) - self.stage_type = stage_type - self.scales = scales - self.width = width + self.stage_type = stage_type # type: ignore[unresolved-attribute] + self.scales = scales # type: ignore[unresolved-attribute] + self.width = width # type: ignore[unresolved-attribute] # Remove conv2 since we replaced it with multi-scale convs delattr(self, "conv2") @@ -291,8 +291,8 @@ def __init__( avg_down=True, **kwargs, ): - self.scales = scales - self.base_width = base_width + self.scales = scales # type: ignore[unresolved-attribute] + self.base_width = base_width # type: ignore[unresolved-attribute] super(Res2Net, self).__init__( depth=depth, deep_stem=deep_stem, diff --git a/visdet/models/backbones/resnest.py b/visdet/models/backbones/resnest.py index 7f213605..64780ad7 100644 --- a/visdet/models/backbones/resnest.py +++ b/visdet/models/backbones/resnest.py @@ -75,8 +75,8 @@ def __init__( ): super(SplitAttentionConv2d, self).__init__(init_cfg) inter_channels = max(in_channels * radix // reduction_factor, 32) - self.radix = radix - self.groups = groups + 
self.radix = radix # type: ignore[unresolved-attribute] + self.groups = groups # type: ignore[unresolved-attribute] self.channels = channels self.conv = build_conv_layer( conv_cfg, @@ -90,11 +90,11 @@ def __init__( bias=False, ) # To be consistent with original implementation, starting from 0 - self.norm0_name, norm0 = build_norm_layer(norm_cfg, channels * radix, postfix=0) + self.norm0_name, norm0 = build_norm_layer(norm_cfg, channels * radix, postfix=0) # type: ignore[unresolved-attribute] self.add_module(self.norm0_name, norm0) self.relu = nn.ReLU(inplace=True) self.fc1 = build_conv_layer(None, channels, inter_channels, 1, groups=self.groups) - self.norm1_name, norm1 = build_norm_layer(norm_cfg, inter_channels, postfix=1) + self.norm1_name, norm1 = build_norm_layer(norm_cfg, inter_channels, postfix=1) # type: ignore[unresolved-attribute] self.add_module(self.norm1_name, norm1) self.fc2 = build_conv_layer(None, inter_channels, channels * radix, 1, groups=self.groups) self.rsoftmax = RSoftmax(radix, groups) @@ -177,10 +177,10 @@ def __init__( else: width = math.floor(self.planes * (base_width / base_channels)) * groups - self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 # type: ignore[unresolved-attribute] - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width, postfix=1) - self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width, postfix=1) # type: ignore[unresolved-attribute] + self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( self.conv_cfg, @@ -290,11 +290,11 @@ def __init__( avg_down=True, **kwargs, ): - self.groups = groups - self.base_width = base_width - self.radix = radix - self.reduction_factor = reduction_factor - self.avg_down_stride = 
avg_down_stride + self.groups = groups # type: ignore[unresolved-attribute] + self.base_width = base_width # type: ignore[unresolved-attribute] + self.radix = radix # type: ignore[unresolved-attribute] + self.reduction_factor = reduction_factor # type: ignore[unresolved-attribute] + self.avg_down_stride = avg_down_stride # type: ignore[unresolved-attribute] super(ResNeSt, self).__init__( depth=depth, deep_stem=deep_stem, diff --git a/visdet/models/backbones/resnet.py b/visdet/models/backbones/resnet.py index 8de2bff3..f0a5c2af 100644 --- a/visdet/models/backbones/resnet.py +++ b/visdet/models/backbones/resnet.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging import warnings +from typing import Any import torch.nn as nn import torch.utils.checkpoint as cp @@ -34,6 +35,16 @@ class BasicBlock(BaseModule): expansion = 1 + norm1_name: str + norm2_name: str + conv1: nn.Module + conv2: nn.Module + relu: nn.ReLU + downsample: nn.Module | None + stride: int + dilation: int + with_cp: bool + def __init__( self, inplanes, @@ -53,8 +64,8 @@ def __init__( assert dcn is None, "DCN is not supported in BasicBlock" assert plugins is None, "Plugins are not supported yet" - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) # type: ignore[unresolved-attribute] + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( conv_cfg, @@ -71,10 +82,10 @@ def __init__( self.add_module(self.norm2_name, norm2) self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.with_cp = with_cp + self.downsample = downsample # type: ignore[unresolved-attribute] + self.stride = stride # type: ignore[unresolved-attribute] + self.dilation = dilation # type: 
ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] @property def norm1(self): @@ -136,6 +147,27 @@ class Bottleneck(BaseModule): expansion = 4 + inplanes: int + planes: int + stride: int + dilation: int + style: str + with_cp: bool + conv_cfg: dict[str, Any] | None + norm_cfg: dict[str, Any] + dcn: dict[str, Any] | None + with_dcn: bool + conv1_stride: int + conv2_stride: int + norm1_name: str + norm2_name: str + norm3_name: str + conv1: nn.Module + conv2: nn.Module + conv3: nn.Module + relu: nn.ReLU + downsample: nn.Module | None + def __init__( self, inplanes, @@ -161,27 +193,27 @@ def __init__( assert dcn is None or isinstance(dcn, dict) assert plugins is None, "Plugins are not supported yet" - self.inplanes = inplanes - self.planes = planes - self.stride = stride - self.dilation = dilation - self.style = style - self.with_cp = with_cp - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.dcn = dcn - self.with_dcn = dcn is not None + self.inplanes = inplanes # type: ignore[unresolved-attribute] + self.planes = planes # type: ignore[unresolved-attribute] + self.stride = stride # type: ignore[unresolved-attribute] + self.dilation = dilation # type: ignore[unresolved-attribute] + self.style = style # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.dcn = dcn # type: ignore[unresolved-attribute] + self.with_dcn = dcn is not None # type: ignore[unresolved-attribute] if self.style == "pytorch": - self.conv1_stride = 1 - self.conv2_stride = stride + self.conv1_stride = 1 # type: ignore[unresolved-attribute] + self.conv2_stride = stride # type: ignore[unresolved-attribute] else: - self.conv1_stride = stride - self.conv2_stride = 1 + self.conv1_stride = stride # type: ignore[unresolved-attribute] + self.conv2_stride = 1 # type: 
ignore[unresolved-attribute] - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) - self.norm3_name, norm3 = build_norm_layer(norm_cfg, planes * self.expansion, postfix=3) + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) # type: ignore[unresolved-attribute] + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) # type: ignore[unresolved-attribute] + self.norm3_name, norm3 = build_norm_layer(norm_cfg, planes * self.expansion, postfix=3) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( conv_cfg, @@ -215,7 +247,7 @@ def __init__( self.add_module(self.norm3_name, norm3) self.relu = nn.ReLU(inplace=True) - self.downsample = downsample + self.downsample = downsample # type: ignore[unresolved-attribute] @property def norm1(self): @@ -324,6 +356,35 @@ class ResNet(BaseModule): 152: (Bottleneck, (3, 8, 36, 3)), } + zero_init_residual: bool + depth: int + stem_channels: int + base_channels: int + num_stages: int + strides: tuple[int, ...] + dilations: tuple[int, ...] + out_indices: tuple[int, ...] + style: str + deep_stem: bool + avg_down: bool + frozen_stages: int + conv_cfg: dict[str, Any] | None + norm_cfg: dict[str, Any] + with_cp: bool + norm_eval: bool + dcn: dict[str, Any] | None + stage_with_dcn: tuple[bool, ...] + block: type[BasicBlock] | type[Bottleneck] + stage_blocks: tuple[int, ...] 
+ inplanes: int + res_layers: list[str] + feat_dim: int + stem: nn.Sequential + conv1: nn.Module + norm1_name: str + relu: nn.ReLU + maxpool: nn.MaxPool2d + def __init__( self, depth, @@ -349,7 +410,7 @@ def __init__( init_cfg=None, ): super(ResNet, self).__init__(init_cfg=init_cfg) - self.zero_init_residual = zero_init_residual + self.zero_init_residual = zero_init_residual # type: ignore[unresolved-attribute] if depth not in self.arch_settings: raise KeyError(f"invalid depth {depth} for resnet") @@ -357,10 +418,10 @@ def __init__( assert not (init_cfg and pretrained), "init_cfg and pretrained cannot be specified at the same time" if isinstance(pretrained, str): warnings.warn("DeprecationWarning: pretrained is deprecated, please use 'init_cfg' instead") - self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) + self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) # type: ignore[unresolved-attribute] elif pretrained is None: if init_cfg is None: - self.init_cfg = [ + self.init_cfg = [ # type: ignore[unresolved-attribute] dict(type="Kaiming", layer="Conv2d"), dict(type="Constant", val=1, layer=["_BatchNorm", "GroupNorm"]), ] @@ -373,37 +434,37 @@ def __init__( else: raise TypeError("pretrained must be a str or None") - self.depth = depth + self.depth = depth # type: ignore[unresolved-attribute] if stem_channels is None: stem_channels = base_channels - self.stem_channels = stem_channels - self.base_channels = base_channels - self.num_stages = num_stages + self.stem_channels = stem_channels # type: ignore[unresolved-attribute] + self.base_channels = base_channels # type: ignore[unresolved-attribute] + self.num_stages = num_stages # type: ignore[unresolved-attribute] assert num_stages >= 1 and num_stages <= 4 - self.strides = strides - self.dilations = dilations + self.strides = strides # type: ignore[unresolved-attribute] + self.dilations = dilations # type: ignore[unresolved-attribute] assert len(strides) == len(dilations) == num_stages - 
self.out_indices = out_indices + self.out_indices = out_indices # type: ignore[unresolved-attribute] assert max(out_indices) < num_stages - self.style = style - self.deep_stem = deep_stem - self.avg_down = avg_down - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - self.norm_eval = norm_eval - self.dcn = dcn - self.stage_with_dcn = stage_with_dcn + self.style = style # type: ignore[unresolved-attribute] + self.deep_stem = deep_stem # type: ignore[unresolved-attribute] + self.avg_down = avg_down # type: ignore[unresolved-attribute] + self.frozen_stages = frozen_stages # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.norm_eval = norm_eval # type: ignore[unresolved-attribute] + self.dcn = dcn # type: ignore[unresolved-attribute] + self.stage_with_dcn = stage_with_dcn # type: ignore[unresolved-attribute] if dcn is not None: assert len(stage_with_dcn) == num_stages - self.block, stage_blocks = self.arch_settings[depth] - self.stage_blocks = stage_blocks[:num_stages] - self.inplanes = stem_channels + self.block, stage_blocks = self.arch_settings[depth] # type: ignore[unresolved-attribute] + self.stage_blocks = stage_blocks[:num_stages] # type: ignore[unresolved-attribute] + self.inplanes = stem_channels # type: ignore[unresolved-attribute] self._make_stem_layer(in_channels, stem_channels) - self.res_layers = [] + self.res_layers = [] # type: ignore[unresolved-attribute] for i, num_blocks in enumerate(self.stage_blocks): stride = strides[i] dilation = dilations[i] @@ -424,14 +485,14 @@ def __init__( dcn=dcn, init_cfg=block_init_cfg, ) - self.inplanes = planes * self.block.expansion + self.inplanes = planes * self.block.expansion # type: ignore[unresolved-attribute] layer_name = f"layer{i + 1}" self.add_module(layer_name, 
res_layer) self.res_layers.append(layer_name) self._freeze_stages() - self.feat_dim = self.block.expansion * base_channels * 2 ** (len(self.stage_blocks) - 1) + self.feat_dim = self.block.expansion * base_channels * 2 ** (len(self.stage_blocks) - 1) # type: ignore[unresolved-attribute] def make_res_layer(self, **kwargs): """Pack all blocks in a stage into a ``ResLayer``.""" @@ -489,7 +550,7 @@ def _make_stem_layer(self, in_channels, stem_channels): padding=3, bias=False, ) - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, stem_channels, postfix=1) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, stem_channels, postfix=1) # type: ignore[unresolved-attribute] self.add_module(self.norm1_name, norm1) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) diff --git a/visdet/models/backbones/resnext.py b/visdet/models/backbones/resnext.py index 87110912..25b741a8 100644 --- a/visdet/models/backbones/resnext.py +++ b/visdet/models/backbones/resnext.py @@ -19,9 +19,9 @@ class Bottleneck(_Bottleneck): def __init__(self, inplanes, planes, groups=1, base_width=4, base_channels=64, **kwargs): # Extract groups and base_width before calling parent - self.groups = groups - self.base_width = base_width - self.base_channels = base_channels + self.groups = groups # type: ignore[unresolved-attribute] + self.base_width = base_width # type: ignore[unresolved-attribute] + self.base_channels = base_channels # type: ignore[unresolved-attribute] super(Bottleneck, self).__init__(inplanes, planes, **kwargs) @@ -31,9 +31,9 @@ def __init__(self, inplanes, planes, groups=1, base_width=4, base_channels=64, * width = math.floor(self.planes * (base_width / base_channels)) * groups # Rebuild norm and conv layers with grouped convolutions - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width, postfix=1) - self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, width, postfix=2) - self.norm3_name, norm3 = 
build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width, postfix=1) # type: ignore[unresolved-attribute] + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, width, postfix=2) # type: ignore[unresolved-attribute] + self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( self.conv_cfg, @@ -104,8 +104,8 @@ class ResNeXt(ResNet): } def __init__(self, groups=1, base_width=4, **kwargs): - self.groups = groups - self.base_width = base_width + self.groups = groups # type: ignore[unresolved-attribute] + self.base_width = base_width # type: ignore[unresolved-attribute] super(ResNeXt, self).__init__(**kwargs) def make_res_layer(self, **kwargs): diff --git a/visdet/models/backbones/swin.py b/visdet/models/backbones/swin.py index b53fecf8..85d1f8c2 100644 --- a/visdet/models/backbones/swin.py +++ b/visdet/models/backbones/swin.py @@ -61,14 +61,14 @@ def __init__( self.num_heads = num_heads head_embed_dims = embed_dims // num_heads self.scale = qk_scale or head_embed_dims**-0.5 - self.init_cfg = init_cfg + self.init_cfg = init_cfg # type: ignore[unresolved-attribute] if backend not in {"torch", "flash"}: raise ValueError(f"Unsupported attention backend: {backend}") # Fall back to torch if flash is not available if backend == "flash" and flash_swin_attn_func is None: - self.backend = "torch" + self.backend = "torch" # type: ignore[unresolved-attribute] else: - self.backend = backend + self.backend = backend # type: ignore[unresolved-attribute] self.head_embed_dims = head_embed_dims # define a parameter table of relative position bias @@ -89,7 +89,7 @@ def __init__( self.proj_drop = nn.Dropout(proj_drop_rate) self.softmax = nn.Softmax(dim=-1) - self._flash_fallback_warned = False + self._flash_fallback_warned = False # type: ignore[unresolved-attribute] def init_weights(self): 
trunc_normal_(self.relative_position_bias_table, std=0.02) @@ -108,7 +108,7 @@ def forward(self, x, mask=None): # make torchscript happy (cannot use tensor as tuple) q, k, v = qkv[0], qkv[1], qkv[2] - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( # type: ignore[call-non-callable,no-matching-overload] self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1, @@ -185,9 +185,9 @@ def __init__( super().__init__(init_cfg) self.window_size = window_size - self.shift_size = shift_size + self.shift_size = shift_size # type: ignore[unresolved-attribute] assert 0 <= self.shift_size < self.window_size - self.backend = backend + self.backend = backend # type: ignore[unresolved-attribute] self.w_msa = WindowMSA( embed_dims=embed_dims, @@ -349,9 +349,9 @@ def __init__( ): super(SwinBlock, self).__init__() - self.init_cfg = init_cfg - self.with_cp = with_cp - self.attn_backend = attn_backend + self.init_cfg = init_cfg # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.attn_backend = attn_backend # type: ignore[unresolved-attribute] self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] self.attn = ShiftWindowMSA( @@ -480,7 +480,7 @@ def __init__( ) self.blocks.append(block) - self.downsample = downsample + self.downsample = downsample # type: ignore[unresolved-attribute] def forward(self, x, hw_shape): for block in self.blocks: @@ -580,8 +580,8 @@ def __init__( frozen_stages=-1, init_cfg=None, ): - self.convert_weights = convert_weights - self.frozen_stages = frozen_stages + self.convert_weights = convert_weights # type: ignore[unresolved-attribute] + self.frozen_stages = frozen_stages # type: ignore[unresolved-attribute] if isinstance(pretrain_img_size, int): pretrain_img_size = to_2tuple(pretrain_img_size) elif 
isinstance(pretrain_img_size, tuple): @@ -594,17 +594,17 @@ def __init__( assert not (init_cfg and pretrained), "init_cfg and pretrained cannot be specified at the same time" if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, please use "init_cfg" instead') - self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) + self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) # type: ignore[unresolved-attribute] elif pretrained is None: - self.init_cfg = init_cfg + self.init_cfg = init_cfg # type: ignore[unresolved-attribute] else: raise TypeError("pretrained must be a str or None") super(SwinTransformer, self).__init__(init_cfg=init_cfg) num_layers = len(depths) - self.out_indices = out_indices - self.use_abs_pos_embed = use_abs_pos_embed + self.out_indices = out_indices # type: ignore[unresolved-attribute] + self.use_abs_pos_embed = use_abs_pos_embed # type: ignore[unresolved-attribute] assert strides[0] == patch_size, "Use non-overlapping patch embed." 
@@ -625,7 +625,7 @@ def __init__( self.absolute_pos_embed = nn.Parameter(torch.zeros((1, num_patches, embed_dims))) self.drop_after_pos = nn.Dropout(p=drop_rate) - self.attn_backend = attn_backend + self.attn_backend = attn_backend # type: ignore[unresolved-attribute] # set stochastic depth decay rule total_depth = sum(depths) @@ -667,7 +667,7 @@ def __init__( if downsample: in_channels = downsample.out_channels - self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] + self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] # type: ignore[unresolved-attribute] # Add a norm layer for each output for i in out_indices: layer = build_norm_layer(norm_cfg, self.num_features[i])[1] @@ -715,7 +715,7 @@ def init_weights(self): assert "checkpoint" in self.init_cfg, ( f"Only support specify `Pretrained` in `init_cfg` in {self.__class__.__name__} " ) - ckpt = CheckpointLoader.load_checkpoint(self.init_cfg.checkpoint, logger=logger, map_location="cpu") + ckpt = CheckpointLoader.load_checkpoint(self.init_cfg.checkpoint, logger=logger, map_location="cpu") # type: ignore[possibly-missing-attribute] if "state_dict" in ckpt: _state_dict = ckpt["state_dict"] elif "model" in ckpt: diff --git a/visdet/models/dense_heads/anchor_head.py b/visdet/models/dense_heads/anchor_head.py index 8de7a184..583f049d 100644 --- a/visdet/models/dense_heads/anchor_head.py +++ b/visdet/models/dense_heads/anchor_head.py @@ -1,5 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from __future__ import annotations + import warnings +from collections.abc import Mapping, Sequence +from typing import Any, cast import torch import torch.nn as nn @@ -15,13 +19,7 @@ from visdet.models.utils import images_to_levels, multi_apply, unmap from visdet.registry import MODELS, TASK_UTILS from visdet.structures.bbox import BaseBoxes, cat_boxes, get_box_tensor -from visdet.utils import ( - ConfigType, - InstanceList, - OptConfigType, - OptInstanceList, - OptMultiConfig, -) +from visdet.utils import ConfigType, InstanceList, OptConfigType, OptInstanceList, OptMultiConfig @MODELS.register_module() @@ -47,6 +45,19 @@ class AnchorHead(BaseDenseHead): init_cfg (dict or list[dict], optional): Initialization config dict. """ + # Type annotations for attributes + in_channels: int + num_classes: int + feat_channels: int + reg_decoded_bbox: bool + loss_bbox: nn.Module + train_cfg: ConfigType | None + assigner: nn.Module + sampler: nn.Module + num_base_priors: int + conv_cls: nn.Module + conv_reg: nn.Module + def __init__( self, num_classes: int, @@ -72,39 +83,52 @@ def __init__( init_cfg: OptMultiConfig = dict(type="Normal", layer="Conv2d", std=0.01), ) -> None: super().__init__(init_cfg=init_cfg) - self.in_channels = in_channels - self.num_classes = num_classes - self.feat_channels = feat_channels - self.use_sigmoid_cls = loss_cls.get("use_sigmoid", False) + self.in_channels: int = in_channels # type: ignore[misc] + self.num_classes: int = num_classes # type: ignore[misc] + self.feat_channels: int = feat_channels # type: ignore[misc] + if not isinstance(loss_cls, Mapping): + raise TypeError("loss_cls config must be a mapping") + loss_cls_cfg = dict(loss_cls) + self.use_sigmoid_cls: bool = bool(loss_cls_cfg.get("use_sigmoid", False)) # type: ignore[misc] if self.use_sigmoid_cls: - self.cls_out_channels = num_classes + self.cls_out_channels = num_classes # type: ignore[assignment] else: - self.cls_out_channels = num_classes + 1 + self.cls_out_channels = num_classes 
+ 1 # type: ignore[assignment] if self.cls_out_channels <= 0: raise ValueError(f"num_classes={num_classes} is too small") - self.reg_decoded_bbox = reg_decoded_bbox - - self.bbox_coder = TASK_UTILS.build(bbox_coder) - self.loss_cls = MODELS.build(loss_cls) - self.loss_bbox = MODELS.build(loss_bbox) - self.train_cfg = train_cfg - self.test_cfg = test_cfg - if self.train_cfg: - self.assigner = TASK_UTILS.build(self.train_cfg["assigner"]) - if train_cfg.get("sampler", None) is not None: - self.sampler = TASK_UTILS.build(self.train_cfg["sampler"], default_args=dict(context=self)) + self.reg_decoded_bbox = reg_decoded_bbox # type: ignore[assignment] + + if ( + not isinstance(bbox_coder, Mapping) + or not isinstance(loss_bbox, Mapping) + or not isinstance(anchor_generator, Mapping) + ): + raise TypeError("bbox_coder, loss_bbox and anchor_generator configs must be mappings") + self.bbox_coder = TASK_UTILS.build(dict(bbox_coder)) + self.loss_cls = MODELS.build(dict(loss_cls)) + self.loss_bbox = MODELS.build(dict(loss_bbox)) + self.train_cfg: ConfigType | None = dict(train_cfg) if isinstance(train_cfg, Mapping) else train_cfg # type: ignore[misc] + self.test_cfg = dict(test_cfg) if isinstance(test_cfg, Mapping) else test_cfg # type: ignore[assignment] + if isinstance(self.train_cfg, Mapping): + assigner_cfg = self.train_cfg["assigner"] + if not isinstance(assigner_cfg, Mapping): + raise TypeError("assigner cfg must be a mapping") + self.assigner = TASK_UTILS.build(dict(assigner_cfg)) + sampler_cfg = self.train_cfg.get("sampler") + if isinstance(sampler_cfg, Mapping): + self.sampler = TASK_UTILS.build(dict(sampler_cfg), default_args=dict(context=self)) # type: ignore[assignment] else: - self.sampler = PseudoSampler(context=self) + self.sampler = PseudoSampler(context=self) # type: ignore[assignment] - self.fp16_enabled = False + self.fp16_enabled: bool = False # type: ignore[misc] - self.prior_generator = TASK_UTILS.build(anchor_generator) + self.prior_generator: 
AnchorGenerator = cast(AnchorGenerator, TASK_UTILS.build(dict(anchor_generator))) # Usually the numbers of anchors for each level are the same # except SSD detectors. So it is an int in the most dense # heads but a list of int in SSDHead - self.num_base_priors = self.prior_generator.num_base_priors[0] + self.num_base_priors: int = int(self.prior_generator.num_base_priors[0]) # type: ignore[misc] self._init_layers() @property @@ -121,9 +145,10 @@ def anchor_generator(self) -> AnchorGenerator: def _init_layers(self) -> None: """Initialize layers of the head.""" - self.conv_cls = nn.Conv2d(self.in_channels, self.num_base_priors * self.cls_out_channels, 1) - reg_dim = self.bbox_coder.encode_size - self.conv_reg = nn.Conv2d(self.in_channels, self.num_base_priors * reg_dim, 1) + self.conv_cls = nn.Conv2d(self.in_channels, self.num_base_priors * self.cls_out_channels, 1) # type: ignore[assignment] + reg_dim = self.bbox_coder.encode_size # type: ignore[attr-defined] + assert isinstance(reg_dim, int), "reg_dim must be an integer" + self.conv_reg = nn.Conv2d(self.in_channels, self.num_base_priors * reg_dim, 1) # type: ignore[assignment] def forward_single(self, x: Tensor) -> tuple[Tensor, Tensor]: """Forward feature of a single scale level. @@ -142,7 +167,7 @@ def forward_single(self, x: Tensor) -> tuple[Tensor, Tensor]: bbox_pred = self.conv_reg(x) return cls_score, bbox_pred - def forward(self, x: tuple[Tensor]) -> tuple[list[Tensor]]: + def forward(self, x: tuple[Tensor]) -> tuple[list[Tensor], list[Tensor]]: """Forward features from the upstream network. Args: @@ -159,11 +184,12 @@ def forward(self, x: tuple[Tensor]) -> tuple[list[Tensor]]: scale levels, each is a 4D-tensor, the channels number \ is num_base_priors * 4. 
""" - return multi_apply(self.forward_single, x) + cls_scores, bbox_preds = multi_apply(self.forward_single, x) + return list(cls_scores), list(bbox_preds) def get_anchors( self, - featmap_sizes: list[tuple], + featmap_sizes: Sequence[tuple[int, int] | torch.Size], batch_img_metas: list[dict], device: torch.device | str = "cuda", ) -> tuple[list[list[Tensor]], list[list[Tensor]]]: @@ -182,17 +208,23 @@ def get_anchors( - valid_flag_list (list[list[Tensor]]): Valid flags of each image. """ + normalized_sizes: list[tuple[int, int]] = [] + for size in featmap_sizes: + h, w = size[:2] + normalized_sizes.append((int(h), int(w))) num_imgs = len(batch_img_metas) # since feature map sizes of all images are the same, we only compute # anchors for one time - multi_level_anchors = self.prior_generator.grid_priors(featmap_sizes, device=device) + # PyTorch stubs incorrectly type AnchorGenerator methods as Tensor (not callable) + multi_level_anchors = self.prior_generator.grid_priors(normalized_sizes, device=device) # type: ignore[call-non-callable] anchor_list = [multi_level_anchors for _ in range(num_imgs)] # for each image, we compute valid flags of multi level anchors valid_flag_list = [] for img_id, img_meta in enumerate(batch_img_metas): - multi_level_flags = self.prior_generator.valid_flags(featmap_sizes, img_meta["pad_shape"], device) + # PyTorch stubs incorrectly type AnchorGenerator.valid_flags as Tensor (not callable) + multi_level_flags = self.prior_generator.valid_flags(normalized_sizes, img_meta["pad_shape"], device) # type: ignore[call-non-callable] valid_flag_list.append(multi_level_flags) return anchor_list, valid_flag_list @@ -238,11 +270,16 @@ def _get_targets_single( - neg_inds (Tensor): negative samples indexes. - sampling_result (:obj:`SamplingResult`): Sampling results. 
""" + assert self.train_cfg is not None, "train_cfg must be set for training" + train_cfg = self.train_cfg + if not isinstance(train_cfg, Mapping): + raise TypeError("train_cfg must be a mapping when training") + allowed_border = int(train_cfg.get("allowed_border", 0)) inside_flags = anchor_inside_flags( flat_anchors, valid_flags, img_meta["img_shape"][:2], - self.train_cfg["allowed_border"], + allowed_border, ) if not inside_flags.any(): raise ValueError( @@ -254,13 +291,16 @@ def _get_targets_single( anchors = flat_anchors[inside_flags] pred_instances = InstanceData(priors=anchors) - assign_result = self.assigner.assign(pred_instances, gt_instances, gt_instances_ignore) + assign_result = self.assigner.assign(pred_instances, gt_instances, gt_instances_ignore) # type: ignore[attr-defined,call-arg] # No sampling is required except for RPN and # Guided Anchoring algorithms - sampling_result = self.sampler.sample(assign_result, pred_instances, gt_instances) + sampling_result = self.sampler.sample(assign_result, pred_instances, gt_instances) # type: ignore[attr-defined,call-arg] - num_valid_anchors = anchors.shape[0] - target_dim = gt_instances.bboxes.size(-1) if self.reg_decoded_bbox else self.bbox_coder.encode_size + num_valid_anchors = int(anchors.shape[0]) + encode_size = self.bbox_coder.encode_size # type: ignore[attr-defined] + assert isinstance(encode_size, int), "encode_size must be an integer" + target_dim_raw = gt_instances.bboxes.size(-1) if self.reg_decoded_bbox else encode_size + target_dim = int(target_dim_raw) # Convert to int to satisfy type checker bbox_targets = anchors.new_zeros(num_valid_anchors, target_dim) bbox_weights = anchors.new_zeros(num_valid_anchors, target_dim) @@ -275,7 +315,7 @@ def _get_targets_single( # box type `pos_bbox_targets` to tensor. 
if len(pos_inds) > 0: if not self.reg_decoded_bbox: - pos_bbox_targets = self.bbox_coder.encode(sampling_result.pos_priors, sampling_result.pos_gt_bboxes) + pos_bbox_targets = self.bbox_coder.encode(sampling_result.pos_priors, sampling_result.pos_gt_bboxes) # type: ignore[attr-defined,call-arg] else: pos_bbox_targets = sampling_result.pos_gt_bboxes pos_bbox_targets = get_box_tensor(pos_bbox_targets) @@ -283,16 +323,17 @@ def _get_targets_single( bbox_weights[pos_inds, :] = 1.0 labels[pos_inds] = sampling_result.pos_gt_labels - if self.train_cfg["pos_weight"] <= 0: + pos_weight = float(train_cfg.get("pos_weight", 0)) + if pos_weight <= 0: label_weights[pos_inds] = 1.0 else: - label_weights[pos_inds] = self.train_cfg["pos_weight"] + label_weights[pos_inds] = pos_weight if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 # map up to original set of anchors if unmap_outputs: - num_total_anchors = flat_anchors.size(0) + num_total_anchors = int(flat_anchors.size(0)) labels = unmap(labels, num_total_anchors, inside_flags, fill=self.num_classes) # fill bg label label_weights = unmap(label_weights, num_total_anchors, inside_flags) bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) @@ -367,7 +408,9 @@ def get_targets( assert len(anchor_list) == len(valid_flag_list) == num_imgs if batch_gt_instances_ignore is None: - batch_gt_instances_ignore = [None] * num_imgs + gt_instances_ignore_list: list[InstanceData | None] = [None for _ in range(num_imgs)] + else: + gt_instances_ignore_list = list(batch_gt_instances_ignore) # anchor number of multi levels num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] @@ -376,7 +419,7 @@ def get_targets( concat_valid_flag_list = [] for i in range(num_imgs): assert len(anchor_list[i]) == len(valid_flag_list[i]) - concat_anchor_list.append(cat_boxes(anchor_list[i])) + concat_anchor_list.append(cat_boxes(anchor_list[i])) # type: ignore[arg-type] concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) # compute 
targets for each image @@ -386,7 +429,7 @@ def get_targets( concat_valid_flag_list, batch_gt_instances, batch_img_metas, - batch_gt_instances_ignore, + gt_instances_ignore_list, unmap_outputs=unmap_outputs, ) ( @@ -469,13 +512,15 @@ def loss_by_feat_single( target_dim = bbox_targets.size(-1) bbox_targets = bbox_targets.reshape(-1, target_dim) bbox_weights = bbox_weights.reshape(-1, target_dim) - bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, self.bbox_coder.encode_size) + encode_size = self.bbox_coder.encode_size # type: ignore[attr-defined] + assert isinstance(encode_size, int), "encode_size must be an integer" + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, encode_size) if self.reg_decoded_bbox: # When the regression loss (e.g. `IouLoss`, `GIouLoss`) # is applied directly on the decoded bounding boxes, it # decodes the already encoded coordinates to absolute format. anchors = anchors.reshape(-1, anchors.size(-1)) - bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) # type: ignore[attr-defined,call-arg] bbox_pred = get_box_tensor(bbox_pred) loss_bbox = self.loss_bbox(bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) return loss_cls, loss_bbox @@ -535,7 +580,7 @@ def loss_by_feat( # concat all level anchors and flags to a single tensor concat_anchor_list = [] for i in range(len(anchor_list)): - concat_anchor_list.append(cat_boxes(anchor_list[i])) + concat_anchor_list.append(cat_boxes(anchor_list[i])) # type: ignore[arg-type] all_anchor_list = images_to_levels(concat_anchor_list, num_level_anchors) losses_cls, losses_bbox = multi_apply( diff --git a/visdet/models/dense_heads/base_dense_head.py b/visdet/models/dense_heads/base_dense_head.py index 0a993edb..337df274 100644 --- a/visdet/models/dense_heads/base_dense_head.py +++ b/visdet/models/dense_heads/base_dense_head.py @@ -2,9 +2,10 @@ import copy from abc import ABCMeta, abstractmethod from inspect import signature +from 
typing import Any, Mapping, Sequence, cast import torch -from torch import Tensor +from torch import Tensor, nn from visdet.cv.ops import batched_nms from visdet.engine.config import ConfigDict @@ -13,8 +14,8 @@ from visdet.models.test_time_augs import merge_aug_results from visdet.models.utils import filter_scores_and_topk, select_single_mlvl, unpack_gt_instances from visdet.structures import SampleList -from visdet.structures.bbox import cat_boxes, get_box_tensor, get_box_wh, scale_boxes -from visdet.utils import InstanceList, OptMultiConfig +from visdet.structures.bbox import BaseBoxes, cat_boxes, get_box_tensor, get_box_wh, scale_boxes +from visdet.utils import InstanceList class BaseDenseHead(BaseModule, metaclass=ABCMeta): @@ -54,11 +55,20 @@ class BaseDenseHead(BaseModule, metaclass=ABCMeta): loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat() """ - def __init__(self, init_cfg: OptMultiConfig = None) -> None: + # Type annotations for attributes set in subclasses + prior_generator: nn.Module + bbox_coder: nn.Module + loss_cls: nn.Module + use_sigmoid_cls: bool + cls_out_channels: int + test_cfg: ConfigDict | None + _raw_positive_infos: dict[str, Any] + + def __init__(self, init_cfg: dict[str, Any] | list[dict[str, Any]] | None = None) -> None: super().__init__(init_cfg=init_cfg) # `_raw_positive_infos` will be used in `get_positive_infos`, which # can get positive information. - self._raw_positive_infos = dict() + self._raw_positive_infos = {} # type: ignore[assignment] def init_weights(self) -> None: """Initialize the weights.""" @@ -69,7 +79,7 @@ def init_weights(self) -> None: if hasattr(m, "conv_offset"): constant_init(m.conv_offset, 0) - def get_positive_infos(self) -> InstanceList: + def get_positive_infos(self) -> InstanceList | None: """Get positive information from sampling results. 
Returns: @@ -83,7 +93,7 @@ def get_positive_infos(self) -> InstanceList: sampling_results = self._raw_positive_infos.get("sampling_results", None) assert sampling_results is not None positive_infos = [] - for sampling_result in enumerate(sampling_results): + for _, sampling_result in enumerate(sampling_results): pos_info = InstanceData() pos_info.bboxes = sampling_result.pos_gt_bboxes pos_info.labels = sampling_result.pos_gt_labels @@ -163,7 +173,17 @@ def loss_and_predict( ) losses = self.loss_by_feat(*loss_inputs) - predictions = self.predict_by_feat(*outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg) + # Unpack outs explicitly - forward returns (cls_scores, bbox_preds) or (cls_scores, bbox_preds, score_factors) + if len(outs) == 2: + cls_scores, bbox_preds = outs + predictions = self.predict_by_feat( + cls_scores, bbox_preds, batch_img_metas=batch_img_metas, cfg=proposal_cfg + ) + else: + cls_scores, bbox_preds, score_factors = outs + predictions = self.predict_by_feat( + cls_scores, bbox_preds, score_factors, batch_img_metas=batch_img_metas, cfg=proposal_cfg + ) return losses, predictions def predict(self, x: tuple[Tensor], batch_data_samples: SampleList, rescale: bool = False) -> InstanceList: @@ -187,7 +207,15 @@ def predict(self, x: tuple[Tensor], batch_data_samples: SampleList, rescale: boo outs = self(x) - predictions = self.predict_by_feat(*outs, batch_img_metas=batch_img_metas, rescale=rescale) + # Unpack outs explicitly - forward returns (cls_scores, bbox_preds) or (cls_scores, bbox_preds, score_factors) + if len(outs) == 2: + cls_scores, bbox_preds = outs + predictions = self.predict_by_feat(cls_scores, bbox_preds, batch_img_metas=batch_img_metas, rescale=rescale) + else: + cls_scores, bbox_preds, score_factors = outs + predictions = self.predict_by_feat( + cls_scores, bbox_preds, score_factors, batch_img_metas=batch_img_metas, rescale=rescale + ) return predictions def predict_by_feat( @@ -239,6 +267,7 @@ def predict_by_feat( the last dimension 
4 arrange as (x1, y1, x2, y2). """ assert len(cls_scores) == len(bbox_preds) + assert batch_img_metas is not None, "batch_img_metas must be provided" if score_factors is None: # e.g. Retina, FreeAnchor, Foveabox, etc. @@ -251,7 +280,9 @@ def predict_by_feat( num_levels = len(cls_scores) featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] - mlvl_priors = self.prior_generator.grid_priors( + # Type narrow prior_generator - it's a Module with grid_priors method + assert hasattr(self.prior_generator, "grid_priors"), "prior_generator must have grid_priors method" + mlvl_priors = self.prior_generator.grid_priors( # type: ignore[call-arg] featmap_sizes, dtype=cls_scores[0].dtype, device=cls_scores[0].device ) @@ -262,9 +293,15 @@ def predict_by_feat( cls_score_list = select_single_mlvl(cls_scores, img_id, detach=True) bbox_pred_list = select_single_mlvl(bbox_preds, img_id, detach=True) if with_score_factors: - score_factor_list = select_single_mlvl(score_factors, img_id, detach=True) + score_factor_raw = select_single_mlvl(score_factors, img_id, detach=True) + score_factor_list = cast(list[Tensor | None], list(score_factor_raw)) else: - score_factor_list = [None for _ in range(num_levels)] + empty_factors: list[Tensor | None] = [None for _ in range(num_levels)] + score_factor_list = empty_factors + + # Use test_cfg if cfg is not provided + effective_cfg = cfg if cfg is not None else self.test_cfg + assert effective_cfg is not None, "Either cfg or self.test_cfg must be provided" results = self._predict_by_feat_single( cls_score_list=cls_score_list, @@ -272,7 +309,7 @@ def predict_by_feat( score_factor_list=score_factor_list, mlvl_priors=mlvl_priors, img_meta=img_meta, - cfg=cfg, + cfg=effective_cfg, rescale=rescale, with_nms=with_nms, ) @@ -283,7 +320,7 @@ def _predict_by_feat_single( self, cls_score_list: list[Tensor], bbox_pred_list: list[Tensor], - score_factor_list: list[Tensor], + score_factor_list: list[Tensor | None], mlvl_priors: list[Tensor], 
img_meta: dict, cfg: ConfigDict, @@ -329,7 +366,7 @@ def _predict_by_feat_single( - bboxes (Tensor): Has a shape (num_instances, 4), the last dimension 4 arrange as (x1, y1, x2, y2). """ - if score_factor_list[0] is None: + if not score_factor_list or score_factor_list[0] is None: # e.g. Retina, FreeAnchor, etc. with_score_factors = False else: @@ -346,7 +383,7 @@ def _predict_by_feat_single( mlvl_scores = [] mlvl_labels = [] if with_score_factors: - mlvl_score_factors = [] + mlvl_score_factors: list[Tensor] | None = [] else: mlvl_score_factors = None for level_idx, (cls_score, bbox_pred, score_factor, priors) in enumerate( @@ -354,9 +391,11 @@ def _predict_by_feat_single( ): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] - dim = self.bbox_coder.encode_size + dim_raw = getattr(self.bbox_coder, "encode_size", 4) + dim = int(dim_raw) bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) if with_score_factors: + assert score_factor is not None score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid() cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.cls_out_channels) @@ -364,7 +403,7 @@ def _predict_by_feat_single( # CrossEntropyCustomLoss and FocalCustomLoss, and is currently used # in v3det. 
if getattr(self.loss_cls, "custom_cls_channels", False): - scores = self.loss_cls.get_activation(cls_score) + scores = self.loss_cls.get_activation(cls_score) # type: ignore[attr-defined,call-arg] elif self.use_sigmoid_cls: scores = cls_score.sigmoid() else: @@ -383,10 +422,12 @@ def _predict_by_feat_single( results = filter_scores_and_topk(scores, score_thr, nms_pre, dict(bbox_pred=bbox_pred, priors=priors)) scores, labels, keep_idxs, filtered_results = results + assert isinstance(filtered_results, dict) bbox_pred = filtered_results["bbox_pred"] priors = filtered_results["priors"] if with_score_factors: + assert score_factor is not None score_factor = score_factor[keep_idxs] mlvl_bbox_preds.append(bbox_pred) @@ -394,18 +435,23 @@ def _predict_by_feat_single( mlvl_scores.append(scores) mlvl_labels.append(labels) - if with_score_factors: + if with_score_factors and mlvl_score_factors is not None: + assert score_factor is not None mlvl_score_factors.append(score_factor) bbox_pred = torch.cat(mlvl_bbox_preds) - priors = cat_boxes(mlvl_valid_priors) - bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + if mlvl_valid_priors and isinstance(mlvl_valid_priors[0], BaseBoxes): + priors = cat_boxes(mlvl_valid_priors) # type: ignore[arg-type] + else: + priors = torch.cat(mlvl_valid_priors) # type: ignore[arg-type] + # Type narrow bbox_coder - it's a Module with decode method + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) # type: ignore[attr-defined,call-arg] results = InstanceData() results.bboxes = bboxes results.scores = torch.cat(mlvl_scores) results.labels = torch.cat(mlvl_labels) - if with_score_factors: + if with_score_factors and mlvl_score_factors is not None: results.score_factors = torch.cat(mlvl_score_factors) return self._bbox_post_process( @@ -453,8 +499,11 @@ def _bbox_post_process( the last dimension 4 arrange as (x1, y1, x2, y2). 
""" if rescale: + assert img_meta is not None assert img_meta.get("scale_factor") is not None - scale_factor = [1 / s for s in img_meta["scale_factor"]] + raw_scale_factor = img_meta["scale_factor"] + assert isinstance(raw_scale_factor, (tuple, list)) and len(raw_scale_factor) >= 2 + scale_factor = (1.0 / float(raw_scale_factor[0]), 1.0 / float(raw_scale_factor[1])) results.bboxes = scale_boxes(results.bboxes, scale_factor) if hasattr(results, "score_factors"): @@ -464,20 +513,26 @@ def _bbox_post_process( results.scores = results.scores * score_factors # filter small size bboxes - if cfg.get("min_bbox_size", -1) >= 0: + min_bbox_size = float(cfg.get("min_bbox_size", -1)) + if min_bbox_size >= 0: w, h = get_box_wh(results.bboxes) - valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + valid_mask = (w > min_bbox_size) & (h > min_bbox_size) if not valid_mask.all(): results = results[valid_mask] # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg if with_nms and results.bboxes.numel() > 0: bboxes = get_box_tensor(results.bboxes) - det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, results.labels, cfg.nms) + nms_cfg = cfg.get("nms") + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, results.labels, nms_cfg) results = results[keep_idxs] # some nms would reweight the score, such as softnms results.scores = det_bboxes[:, -1] - results = results[: cfg.max_per_img] + max_per_img = int(cfg.get("max_per_img", 100)) + if keep_idxs.size(0) > max_per_img: + _, inds = results.scores.sort(descending=True) + inds = inds[:max_per_img] + results = results[inds] return results @@ -519,11 +574,11 @@ def aug_test( the last dimension 4 arrange as (x1, y1, x2, y2). 
""" # TODO: remove this for detr and deformdetr - sig_of_get_results = signature(self.get_results) - get_results_args = [p.name for p in sig_of_get_results.parameters.values()] - get_results_single_sig = signature(self._get_results_single) - get_results_single_sig_args = [p.name for p in get_results_single_sig.parameters.values()] - assert ("with_nms" in get_results_args) and ("with_nms" in get_results_single_sig_args), ( + sig_of_predict_by_feat = signature(self.predict_by_feat) + predict_by_feat_args = [p.name for p in sig_of_predict_by_feat.parameters.values()] + sig_of_predict_by_feat_single = signature(self._predict_by_feat_single) + predict_by_feat_single_args = [p.name for p in sig_of_predict_by_feat_single.parameters.values()] + assert ("with_nms" in predict_by_feat_args) and ("with_nms" in predict_by_feat_single_args), ( f"{self.__class__.__name__}does not support test-time augmentation " ) @@ -531,27 +586,48 @@ def aug_test( aug_batch_results = [] for x, img_metas in zip(aug_batch_feats, aug_batch_img_metas): outs = self.forward(x) - batch_instance_results = self.get_results( - *outs, - img_metas=img_metas, - cfg=self.test_cfg, - rescale=False, - with_nms=with_ori_nms, - **kwargs, - ) + # Type narrow test_cfg - it's defined in subclasses as ConfigDict + test_cfg = self.test_cfg if hasattr(self, "test_cfg") else None + # Unpack outs for predict_by_feat call + if len(outs) == 2: + cls_scores, bbox_preds = outs + batch_instance_results = self.predict_by_feat( + cls_scores, + bbox_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=with_ori_nms, + ) + else: + cls_scores, bbox_preds, score_factors = outs + batch_instance_results = self.predict_by_feat( + cls_scores, + bbox_preds, + score_factors, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=with_ori_nms, + ) aug_batch_results.append(batch_instance_results) # after merging, bboxes will be rescaled to the original image batch_results = 
merge_aug_results(aug_batch_results, aug_batch_img_metas) + # Get test_cfg attributes with type narrowing + test_cfg = self.test_cfg if hasattr(self, "test_cfg") else None + nms_cfg = test_cfg.get("nms") if isinstance(test_cfg, ConfigDict) else None + max_per_img = test_cfg.get("max_per_img", 100) if isinstance(test_cfg, ConfigDict) else 100 + final_results = [] for img_id in range(num_imgs): results = batch_results[img_id] - det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores, results.labels, self.test_cfg.nms) + det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores, results.labels, nms_cfg) results = results[keep_idxs] # some nms operation may reweight the score such as softnms results.scores = det_bboxes[:, -1] - results = results[: self.test_cfg.max_per_img] + results = results[:max_per_img] if rescale: # all results have been mapped to the original scale # in `merge_aug_results`, so just pass diff --git a/visdet/models/dense_heads/rpn_head.py b/visdet/models/dense_heads/rpn_head.py index 027a1291..04531b7f 100644 --- a/visdet/models/dense_heads/rpn_head.py +++ b/visdet/models/dense_heads/rpn_head.py @@ -1,7 +1,6 @@ # ruff: noqa # fmt: off # isort: skip -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. import copy @@ -68,6 +67,12 @@ class RPNHead(AnchorHead): Defaults to 1. 
""" + # Type annotations + num_convs: int + rpn_conv: nn.Module + rpn_cls: nn.Module + rpn_reg: nn.Module + def __init__( self, in_channels: int, @@ -78,7 +83,7 @@ def __init__( ) -> None: if init_cfg is None: init_cfg = {"type": "Normal", "layer": "Conv2d", "std": 0.01} - self.num_convs = num_convs + self.num_convs = num_convs # type: ignore[assignment] assert num_classes == 1 super().__init__( num_classes=num_classes, in_channels=in_channels, init_cfg=init_cfg, **kwargs @@ -102,11 +107,12 @@ def _init_layers(self) -> None: self.rpn_conv = nn.Sequential(*rpn_convs) else: self.rpn_conv = nn.Conv2d(self.in_channels, self.feat_channels, 3, padding=1) - self.rpn_cls = nn.Conv2d( + self.rpn_cls = nn.Conv2d( # type: ignore[assignment] self.feat_channels, self.num_base_priors * self.cls_out_channels, 1 ) - reg_dim = self.bbox_coder.encode_size - self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_base_priors * reg_dim, 1) + reg_dim = self.bbox_coder.encode_size # type: ignore[attr-defined] + assert isinstance(reg_dim, int), "reg_dim must be an integer" + self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_base_priors * reg_dim, 1) # type: ignore[assignment] def forward_single(self, x: Tensor) -> tuple[Tensor, Tensor]: """Forward feature of a single scale level. 
@@ -253,7 +259,7 @@ def _predict_by_feat_single( bbox_pred = torch.cat(mlvl_bbox_preds) priors = cat_boxes(mlvl_valid_priors) - bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) # type: ignore[attr-defined,call-arg] results = InstanceData() results.bboxes = bboxes @@ -299,8 +305,11 @@ def _bbox_post_process( """ assert with_nms, "`with_nms` must be True in RPNHead" if rescale: + assert img_meta is not None assert img_meta.get("scale_factor") is not None - scale_factor = [1 / s for s in img_meta["scale_factor"]] + raw_scale_factor = img_meta["scale_factor"] + assert isinstance(raw_scale_factor, (tuple, list)) and len(raw_scale_factor) >= 2 + scale_factor = (1.0 / float(raw_scale_factor[0]), 1.0 / float(raw_scale_factor[1])) results.bboxes = scale_boxes(results.bboxes, scale_factor) # filter small size bboxes diff --git a/visdet/models/layers/__init__.py b/visdet/models/layers/__init__.py index daa3e1af..7f80b39e 100644 --- a/visdet/models/layers/__init__.py +++ b/visdet/models/layers/__init__.py @@ -33,7 +33,7 @@ def __init__(self, kernel_size=1, stride=1, dilation=1, padding="corner"): stride = to_2tuple(stride) dilation = to_2tuple(dilation) - self.padding = padding + self.padding: str = padding # type: ignore[misc] self.kernel_size = kernel_size self.stride = stride self.dilation = dilation @@ -90,8 +90,8 @@ def __init__( if stride is None: stride = patch_size - self.img_size = img_size - self.patch_size = patch_size + self.img_size: int = img_size # type: ignore[misc] + self.patch_size: int = patch_size # type: ignore[misc] kernel_size = to_2tuple(patch_size) stride = to_2tuple(stride) @@ -107,7 +107,7 @@ def __init__( # disable the padding of conv padding = 0 else: - self.adap_padding = None + self.adap_padding: None = None # type: ignore[misc] padding = to_2tuple(padding) @@ -124,7 +124,7 @@ def __init__( if norm_cfg is not None: self.norm = nn.LayerNorm(embed_dims) 
else: - self.norm = None + self.norm: None = None # type: ignore[misc] def forward(self, x): B, C, H, W = x.shape @@ -154,7 +154,7 @@ def __init__( super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels - self.stride = stride + self.stride: int = stride # type: ignore[misc] self.reduction = nn.Linear(stride * stride * in_channels, out_channels, bias=False) self.norm = nn.LayerNorm(stride * stride * in_channels) diff --git a/visdet/models/layers/normed_predictor.py b/visdet/models/layers/normed_predictor.py index 70f50a1f..3605ed26 100644 --- a/visdet/models/layers/normed_predictor.py +++ b/visdet/models/layers/normed_predictor.py @@ -16,23 +16,27 @@ class NormedLinear(nn.Linear): Args: tempeature (float, optional): Tempeature term. Defaults to 20. - power (int, optional): Power term. Defaults to 1.0. + power (float, optional): Power term. Defaults to 1.0. eps (float, optional): The minimal value of divisor to keep numerical stability. Defaults to 1e-6. """ + tempearture: float + power: float + eps: float + def __init__( self, *args, tempearture: float = 20, - power: int = 1.0, + power: float = 1.0, eps: float = 1e-6, **kwargs, ) -> None: super().__init__(*args, **kwargs) - self.tempearture = tempearture - self.power = power - self.eps = eps + self.tempearture = tempearture # type: ignore[misc] + self.power = power # type: ignore[misc] + self.eps = eps # type: ignore[misc] self.init_weights() def init_weights(self) -> None: @@ -56,27 +60,32 @@ class NormedConv2d(nn.Conv2d): Args: tempeature (float, optional): Tempeature term. Defaults to 20. - power (int, optional): Power term. Defaults to 1.0. + power (float, optional): Power term. Defaults to 1.0. eps (float, optional): The minimal value of divisor to keep numerical stability. Defaults to 1e-6. norm_over_kernel (bool, optional): Normalize over kernel. Defaults to False. 
""" + tempearture: float + power: float + norm_over_kernel: bool + eps: float + def __init__( self, *args, tempearture: float = 20, - power: int = 1.0, + power: float = 1.0, eps: float = 1e-6, norm_over_kernel: bool = False, **kwargs, ) -> None: super().__init__(*args, **kwargs) - self.tempearture = tempearture - self.power = power - self.norm_over_kernel = norm_over_kernel - self.eps = eps + self.tempearture = tempearture # type: ignore[misc] + self.power = power # type: ignore[misc] + self.norm_over_kernel = norm_over_kernel # type: ignore[misc] + self.eps = eps # type: ignore[misc] def forward(self, x: Tensor) -> Tensor: """Forward function for `NormedConv2d`.""" @@ -91,7 +100,7 @@ def forward(self, x: Tensor) -> Tensor: x_ = x_ * self.tempearture if hasattr(self, "conv2d_forward"): - x_ = self.conv2d_forward(x_, weight_) + x_ = self.conv2d_forward(x_, weight_) # type: ignore[misc] else: if digit_version(torch.__version__) >= digit_version("1.8"): x_ = self._conv_forward(x_, weight_, self.bias) diff --git a/visdet/models/losses/accuracy.py b/visdet/models/losses/accuracy.py index ce4a5272..c63c89c7 100644 --- a/visdet/models/losses/accuracy.py +++ b/visdet/models/losses/accuracy.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any + import torch.nn as nn @@ -49,7 +51,7 @@ def accuracy(pred, target, topk=1, thresh=None): class Accuracy(nn.Module): - def __init__(self, topk=(1,), thresh=None): + def __init__(self, topk: tuple[int, ...] | int = (1,), thresh: float | None = None): """Module to calculate the accuracy. Args: @@ -59,8 +61,8 @@ def __init__(self, topk=(1,), thresh=None): under this threshold are considered incorrect. Default to None. """ super().__init__() - self.topk = topk - self.thresh = thresh + self.topk: tuple[int, ...] | int = topk + self.thresh: float | None = thresh def forward(self, pred, target): """Forward function to calculate accuracy. 
diff --git a/visdet/models/losses/cross_entropy_loss.py b/visdet/models/losses/cross_entropy_loss.py index 9a6c0878..2e577fee 100644 --- a/visdet/models/losses/cross_entropy_loss.py +++ b/visdet/models/losses/cross_entropy_loss.py @@ -1,9 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from collections.abc import Callable +from typing import Any import torch import torch.nn as nn import torch.nn.functional as F +from torch import Tensor from visdet.models.losses.accuracy import accuracy from visdet.models.losses.utils import weight_reduce_loss @@ -196,14 +199,14 @@ def mask_cross_entropy( class CrossEntropyLoss(nn.Module): def __init__( self, - use_sigmoid=False, - use_mask=False, - reduction="mean", - class_weight=None, - ignore_index=None, - loss_weight=1.0, - avg_non_ignore=False, - ): + use_sigmoid: bool = False, + use_mask: bool = False, + reduction: str = "mean", + class_weight: list[float] | None = None, + ignore_index: int | None = None, + loss_weight: float = 1.0, + avg_non_ignore: bool = False, + ) -> None: """CrossEntropyLoss. 
Args: @@ -223,13 +226,13 @@ def __init__( """ super(CrossEntropyLoss, self).__init__() assert (use_sigmoid is False) or (use_mask is False) - self.use_sigmoid = use_sigmoid - self.use_mask = use_mask - self.reduction = reduction - self.loss_weight = loss_weight - self.class_weight = class_weight - self.ignore_index = ignore_index - self.avg_non_ignore = avg_non_ignore + self.use_sigmoid: bool = use_sigmoid + self.use_mask: bool = use_mask + self.reduction: str = reduction + self.loss_weight: float = loss_weight + self.class_weight: list[float] | None = class_weight + self.ignore_index: int | None = ignore_index + self.avg_non_ignore: bool = avg_non_ignore if (ignore_index is not None) and not self.avg_non_ignore and self.reduction == "mean": warnings.warn( "Default ``avg_non_ignore`` is False, if you would like to " @@ -239,11 +242,11 @@ def __init__( ) if self.use_sigmoid: - self.cls_criterion = binary_cross_entropy + self.cls_criterion: Callable[..., Tensor] = binary_cross_entropy elif self.use_mask: - self.cls_criterion = mask_cross_entropy + self.cls_criterion: Callable[..., Tensor] = mask_cross_entropy else: - self.cls_criterion = cross_entropy + self.cls_criterion: Callable[..., Tensor] = cross_entropy def extra_repr(self): """Extra repr.""" @@ -252,14 +255,14 @@ def extra_repr(self): def forward( self, - cls_score, - label, - weight=None, - avg_factor=None, - reduction_override=None, - ignore_index=None, - **kwargs, - ): + cls_score: Tensor, + label: Tensor, + weight: Tensor | None = None, + avg_factor: int | None = None, + reduction_override: str | None = None, + ignore_index: int | None = None, + **kwargs: Any, + ) -> Tensor: """Forward function. 
Args: @@ -302,15 +305,15 @@ def forward( class CrossEntropyCustomLoss(CrossEntropyLoss): def __init__( self, - use_sigmoid=False, - use_mask=False, - reduction="mean", - num_classes=-1, - class_weight=None, - ignore_index=None, - loss_weight=1.0, - avg_non_ignore=False, - ): + use_sigmoid: bool = False, + use_mask: bool = False, + reduction: str = "mean", + num_classes: int = -1, + class_weight: list[float] | None = None, + ignore_index: int | None = None, + loss_weight: float = 1.0, + avg_non_ignore: bool = False, + ) -> None: """CrossEntropyCustomLoss. Args: @@ -331,13 +334,13 @@ def __init__( """ super(CrossEntropyCustomLoss, self).__init__() assert (use_sigmoid is False) or (use_mask is False) - self.use_sigmoid = use_sigmoid - self.use_mask = use_mask - self.reduction = reduction - self.loss_weight = loss_weight - self.class_weight = class_weight - self.ignore_index = ignore_index - self.avg_non_ignore = avg_non_ignore + self.use_sigmoid: bool = use_sigmoid + self.use_mask: bool = use_mask + self.reduction: str = reduction + self.loss_weight: float = loss_weight + self.class_weight: list[float] | None = class_weight + self.ignore_index: int | None = ignore_index + self.avg_non_ignore: bool = avg_non_ignore if (ignore_index is not None) and not self.avg_non_ignore and self.reduction == "mean": warnings.warn( "Default ``avg_non_ignore`` is False, if you would like to " @@ -347,22 +350,22 @@ def __init__( ) if self.use_sigmoid: - self.cls_criterion = binary_cross_entropy + self.cls_criterion: Callable[..., Tensor] = binary_cross_entropy elif self.use_mask: - self.cls_criterion = mask_cross_entropy + self.cls_criterion: Callable[..., Tensor] = mask_cross_entropy else: - self.cls_criterion = cross_entropy + self.cls_criterion: Callable[..., Tensor] = cross_entropy - self.num_classes = num_classes + self.num_classes: int = num_classes assert self.num_classes != -1 # custom output channels of the classifier - self.custom_cls_channels = True + 
self.custom_cls_channels: bool = True # custom activation of cls_score - self.custom_activation = True + self.custom_activation: bool = True # custom accuracy of the classsifier - self.custom_accuracy = True + self.custom_accuracy: bool = True def get_cls_channels(self, num_classes): assert num_classes == self.num_classes diff --git a/visdet/models/losses/smooth_l1_loss.py b/visdet/models/losses/smooth_l1_loss.py index 0bfcf0ec..b9531c8f 100644 --- a/visdet/models/losses/smooth_l1_loss.py +++ b/visdet/models/losses/smooth_l1_loss.py @@ -64,9 +64,9 @@ class SmoothL1Loss(nn.Module): def __init__(self, beta: float = 1.0, reduction: str = "mean", loss_weight: float = 1.0) -> None: super().__init__() - self.beta = beta - self.reduction = reduction - self.loss_weight = loss_weight + self.beta: float = beta + self.reduction: str = reduction + self.loss_weight: float = loss_weight def forward( self, @@ -123,8 +123,8 @@ class L1Loss(nn.Module): def __init__(self, reduction: str = "mean", loss_weight: float = 1.0) -> None: super().__init__() - self.reduction = reduction - self.loss_weight = loss_weight + self.reduction: str = reduction + self.loss_weight: float = loss_weight def forward( self, diff --git a/visdet/models/losses/utils.py b/visdet/models/losses/utils.py index fad70d7b..ba49f1c8 100644 --- a/visdet/models/losses/utils.py +++ b/visdet/models/losses/utils.py @@ -1,5 +1,3 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. import functools from collections.abc import Callable @@ -19,14 +17,15 @@ def reduce_loss(loss: Tensor, reduction: str) -> Tensor: Return: Tensor: Reduced loss tensor. 
""" - reduction_enum = F._Reduction.get_enum(reduction) - # none: 0, elementwise_mean:1, sum: 2 - if reduction_enum == 0: + # Use string comparison instead of F._Reduction which is private + if reduction == "none": return loss - elif reduction_enum == 1: + elif reduction == "mean": return loss.mean() - elif reduction_enum == 2: + elif reduction == "sum": return loss.sum() + else: + raise ValueError(f"Invalid reduction mode: {reduction}") def weight_reduce_loss( diff --git a/visdet/models/roi_heads/base_roi_head.py b/visdet/models/roi_heads/base_roi_head.py index 01d1a568..98a50df2 100644 --- a/visdet/models/roi_heads/base_roi_head.py +++ b/visdet/models/roi_heads/base_roi_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod +from typing import Any from torch import Tensor @@ -12,6 +13,11 @@ class BaseRoIHead(BaseModule, metaclass=ABCMeta): """Base class for RoIHeads.""" + train_cfg: Any + test_cfg: Any + predict_bbox: Any # Method attribute defined in subclasses + predict_mask: Any # Method attribute defined in subclasses + def __init__( self, bbox_roi_extractor: OptMultiConfig = None, @@ -24,8 +30,8 @@ def __init__( init_cfg: OptMultiConfig = None, ) -> None: super().__init__(init_cfg=init_cfg) - self.train_cfg = train_cfg - self.test_cfg = test_cfg + self.train_cfg = train_cfg # type: ignore[unresolved-attribute] + self.test_cfg = test_cfg # type: ignore[unresolved-attribute] if shared_head is not None: self.shared_head = MODELS.build(shared_head) diff --git a/visdet/models/roi_heads/bbox_heads/bbox_head.py b/visdet/models/roi_heads/bbox_heads/bbox_head.py index eb26c7a4..6cd403d6 100644 --- a/visdet/models/roi_heads/bbox_heads/bbox_head.py +++ b/visdet/models/roi_heads/bbox_heads/bbox_head.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Any + import torch import torch.nn as nn import torch.nn.functional as F @@ -23,6 +25,26 @@ class BBoxHead(BaseModule): """Simplest RoI head, with only two fc layers for classification and regression respectively.""" + with_avg_pool: bool + with_cls: bool + with_reg: bool + roi_feat_size: tuple[int, int] + roi_feat_area: int + in_channels: int + num_classes: int + predict_box_type: str + reg_class_agnostic: bool + reg_decoded_bbox: bool + reg_predictor_cfg: Any + cls_predictor_cfg: Any + bbox_coder: Any + loss_cls: Any + loss_bbox: Any + fc_cls: Any + fc_reg: Any + avg_pool: Any + debug_imgs: Any + def __init__( self, with_avg_pool: bool = False, @@ -48,26 +70,26 @@ def __init__( ) -> None: super().__init__(init_cfg=init_cfg) assert with_cls or with_reg - self.with_avg_pool = with_avg_pool - self.with_cls = with_cls - self.with_reg = with_reg - self.roi_feat_size = _pair(roi_feat_size) - self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1] - self.in_channels = in_channels - self.num_classes = num_classes - self.predict_box_type = predict_box_type - self.reg_class_agnostic = reg_class_agnostic - self.reg_decoded_bbox = reg_decoded_bbox - self.reg_predictor_cfg = reg_predictor_cfg - self.cls_predictor_cfg = cls_predictor_cfg - - self.bbox_coder = TASK_UTILS.build(bbox_coder) - self.loss_cls = MODELS.build(loss_cls) - self.loss_bbox = MODELS.build(loss_bbox) + self.with_avg_pool = with_avg_pool # type: ignore[unresolved-attribute] + self.with_cls = with_cls # type: ignore[unresolved-attribute] + self.with_reg = with_reg # type: ignore[unresolved-attribute] + self.roi_feat_size = _pair(roi_feat_size) # type: ignore[unresolved-attribute] + self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1] # type: ignore[unresolved-attribute] + self.in_channels = in_channels # type: ignore[unresolved-attribute] + self.num_classes = num_classes # type: ignore[unresolved-attribute] + self.predict_box_type = predict_box_type # type: 
ignore[unresolved-attribute] + self.reg_class_agnostic = reg_class_agnostic # type: ignore[unresolved-attribute] + self.reg_decoded_bbox = reg_decoded_bbox # type: ignore[unresolved-attribute] + self.reg_predictor_cfg = reg_predictor_cfg # type: ignore[unresolved-attribute] + self.cls_predictor_cfg = cls_predictor_cfg # type: ignore[unresolved-attribute] + + self.bbox_coder = TASK_UTILS.build(bbox_coder) # type: ignore[unresolved-attribute] + self.loss_cls = MODELS.build(loss_cls) # type: ignore[unresolved-attribute] + self.loss_bbox = MODELS.build(loss_bbox) # type: ignore[unresolved-attribute] in_channels = self.in_channels if self.with_avg_pool: - self.avg_pool = nn.AvgPool2d(self.roi_feat_size) + self.avg_pool = nn.AvgPool2d(self.roi_feat_size) # type: ignore[unresolved-attribute] else: in_channels *= self.roi_feat_area if self.with_cls: @@ -78,17 +100,17 @@ def __init__( cls_channels = num_classes + 1 cls_predictor_cfg_ = self.cls_predictor_cfg.copy() cls_predictor_cfg_.update(in_features=in_channels, out_features=cls_channels) - self.fc_cls = MODELS.build(cls_predictor_cfg_) + self.fc_cls = MODELS.build(cls_predictor_cfg_) # type: ignore[unresolved-attribute] if self.with_reg: box_dim = self.bbox_coder.encode_size out_dim_reg = box_dim if reg_class_agnostic else box_dim * num_classes reg_predictor_cfg_ = self.reg_predictor_cfg.copy() if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): reg_predictor_cfg_.update(in_features=in_channels, out_features=out_dim_reg) - self.fc_reg = MODELS.build(reg_predictor_cfg_) - self.debug_imgs = None + self.fc_reg = MODELS.build(reg_predictor_cfg_) # type: ignore[unresolved-attribute] + self.debug_imgs = None # type: ignore[unresolved-attribute] if init_cfg is None: - self.init_cfg = [] + self.init_cfg = [] # type: ignore[unresolved-attribute] if self.with_cls: self.init_cfg += [dict(type="Normal", std=0.01, override=dict(name="fc_cls"))] if self.with_reg: @@ -112,11 +134,11 @@ def custom_accuracy(self) -> bool: """get 
custom_accuracy from loss_cls.""" return getattr(self.loss_cls, "custom_accuracy", False) - def forward(self, x: tuple[Tensor]) -> tuple: + def forward(self, x: Tensor) -> tuple: """Forward features from the upstream network. Args: - x (tuple[Tensor]): Features from the upstream network, each is + x (Tensor): Features from the upstream network, each is a 4D-tensor. Returns: @@ -136,7 +158,7 @@ def forward(self, x: tuple[Tensor]) -> tuple: else: # avg_pool does not support empty tensor, # so use torch.mean instead it - x = torch.mean(x, dim=(-1, -2)) + x = torch.mean(x, dim=(-1, -2), keepdim=False) cls_score = self.fc_cls(x) if self.with_cls else None bbox_pred = self.fc_reg(x) if self.with_reg else None return cls_score, bbox_pred @@ -199,7 +221,8 @@ def get_bboxes( if cfg is None: return bboxes, scores else: - det_bboxes, det_labels = multiclass_nms(bboxes, scores, cfg.score_thr, cfg.nms, cfg.max_per_img) + nms_result = multiclass_nms(bboxes, scores, cfg.score_thr, cfg.nms, cfg.max_per_img) + det_bboxes, det_labels = nms_result[0], nms_result[1] return det_bboxes, det_labels @@ -381,7 +404,7 @@ def loss_and_target( bbox_pred, rois, *cls_reg_targets, - reduction_override=reduction_override, + reduction_override=reduction_override, # type: ignore[arg-type] ) # cls_reg_targets is only for cascade rcnn @@ -607,8 +630,9 @@ def _predict_by_feat_single( if rescale and bboxes.size(0) > 0: assert img_meta.get("scale_factor") is not None - scale_factor = [1 / s for s in img_meta["scale_factor"]] - bboxes = scale_boxes(bboxes, scale_factor) + scale_factor_list = [1 / s for s in img_meta["scale_factor"]] + scale_factor_tuple = (scale_factor_list[0], scale_factor_list[1]) + bboxes = scale_boxes(bboxes, scale_factor_tuple) # Get the inside tensor when `bboxes` is a box type bboxes = get_box_tensor(bboxes) @@ -621,7 +645,7 @@ def _predict_by_feat_single( results.bboxes = bboxes results.scores = scores else: - det_bboxes, det_labels = multiclass_nms( + nms_result = 
multiclass_nms( bboxes, scores, rcnn_test_cfg["score_thr"], @@ -629,6 +653,7 @@ def _predict_by_feat_single( rcnn_test_cfg["max_per_img"], box_dim=box_dim, ) + det_bboxes, det_labels = nms_result[0], nms_result[1] results.bboxes = det_bboxes[:, :-1] results.scores = det_bboxes[:, -1] results.labels = det_labels diff --git a/visdet/models/roi_heads/bbox_heads/convfc_bbox_head.py b/visdet/models/roi_heads/bbox_heads/convfc_bbox_head.py index 968b6326..f97273f2 100644 --- a/visdet/models/roi_heads/bbox_heads/convfc_bbox_head.py +++ b/visdet/models/roi_heads/bbox_heads/convfc_bbox_head.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any + import torch.nn as nn from torch import Tensor @@ -21,6 +23,27 @@ class ConvFCBBoxHead(BBoxHead): \-> reg convs -> reg fcs -> reg """ + num_shared_convs: int + num_shared_fcs: int + num_cls_convs: int + num_cls_fcs: int + num_reg_convs: int + num_reg_fcs: int + conv_out_channels: int + fc_out_channels: int + conv_cfg: Any + norm_cfg: Any + shared_convs: Any + shared_fcs: Any + shared_out_channels: int + cls_convs: Any + cls_fcs: Any + cls_last_dim: int + reg_convs: Any + reg_fcs: Any + reg_last_dim: int + relu: nn.ReLU + def __init__( self, num_shared_convs: int = 0, @@ -37,7 +60,7 @@ def __init__( *args, **kwargs, ) -> None: - super().__init__(*args, init_cfg=init_cfg, **kwargs) + super().__init__(*args, init_cfg=init_cfg, **kwargs) # type: ignore[misc] assert num_shared_convs + num_shared_fcs + num_cls_convs + num_cls_fcs + num_reg_convs + num_reg_fcs > 0 if num_cls_convs > 0 or num_reg_convs > 0: assert num_shared_fcs == 0 @@ -45,30 +68,30 @@ def __init__( assert num_cls_convs == 0 and num_cls_fcs == 0 if not self.with_reg: assert num_reg_convs == 0 and num_reg_fcs == 0 - self.num_shared_convs = num_shared_convs - self.num_shared_fcs = num_shared_fcs - self.num_cls_convs = num_cls_convs - self.num_cls_fcs = num_cls_fcs - self.num_reg_convs = num_reg_convs - self.num_reg_fcs = num_reg_fcs - 
self.conv_out_channels = conv_out_channels - self.fc_out_channels = fc_out_channels - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg + self.num_shared_convs = num_shared_convs # type: ignore[unresolved-attribute] + self.num_shared_fcs = num_shared_fcs # type: ignore[unresolved-attribute] + self.num_cls_convs = num_cls_convs # type: ignore[unresolved-attribute] + self.num_cls_fcs = num_cls_fcs # type: ignore[unresolved-attribute] + self.num_reg_convs = num_reg_convs # type: ignore[unresolved-attribute] + self.num_reg_fcs = num_reg_fcs # type: ignore[unresolved-attribute] + self.conv_out_channels = conv_out_channels # type: ignore[unresolved-attribute] + self.fc_out_channels = fc_out_channels # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] # add shared convs and fcs - self.shared_convs, self.shared_fcs, last_layer_dim = self._add_conv_fc_branch( + self.shared_convs, self.shared_fcs, last_layer_dim = self._add_conv_fc_branch( # type: ignore[unresolved-attribute] self.num_shared_convs, self.num_shared_fcs, self.in_channels, True ) - self.shared_out_channels = last_layer_dim + self.shared_out_channels = last_layer_dim # type: ignore[unresolved-attribute] # add cls specific branch - self.cls_convs, self.cls_fcs, self.cls_last_dim = self._add_conv_fc_branch( + self.cls_convs, self.cls_fcs, self.cls_last_dim = self._add_conv_fc_branch( # type: ignore[unresolved-attribute] self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels ) # add reg specific branch - self.reg_convs, self.reg_fcs, self.reg_last_dim = self._add_conv_fc_branch( + self.reg_convs, self.reg_fcs, self.reg_last_dim = self._add_conv_fc_branch( # type: ignore[unresolved-attribute] self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels ) @@ -78,7 +101,7 @@ def __init__( if self.num_reg_fcs == 0: self.reg_last_dim *= self.roi_feat_area - self.relu = nn.ReLU(inplace=True) + 
self.relu = nn.ReLU(inplace=True) # type: ignore[unresolved-attribute] # reconstruct fc_cls and fc_reg since input channels are changed if self.with_cls: if self.custom_cls_channels: @@ -87,17 +110,17 @@ def __init__( cls_channels = self.num_classes + 1 cls_predictor_cfg_ = self.cls_predictor_cfg.copy() cls_predictor_cfg_.update(in_features=self.cls_last_dim, out_features=cls_channels) - self.fc_cls = MODELS.build(cls_predictor_cfg_) + self.fc_cls = MODELS.build(cls_predictor_cfg_) # type: ignore[unresolved-attribute] if self.with_reg: box_dim = self.bbox_coder.encode_size out_dim_reg = box_dim if self.reg_class_agnostic else box_dim * self.num_classes reg_predictor_cfg_ = self.reg_predictor_cfg.copy() if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): reg_predictor_cfg_.update(in_features=self.reg_last_dim, out_features=out_dim_reg) - self.fc_reg = MODELS.build(reg_predictor_cfg_) + self.fc_reg = MODELS.build(reg_predictor_cfg_) # type: ignore[unresolved-attribute] if init_cfg is None: - self.init_cfg += [ + self.init_cfg += [ # type: ignore[unresolved-attribute,operator] dict( type="Xavier", distribution="uniform", @@ -150,7 +173,7 @@ def _add_conv_fc_branch( last_layer_dim = self.fc_out_channels return branch_convs, branch_fcs, last_layer_dim - def forward(self, x: tuple[Tensor]) -> tuple: + def forward(self, x: Tensor) -> tuple: """Forward features from the upstream network. 
Args: @@ -211,7 +234,7 @@ def forward(self, x: tuple[Tensor]) -> tuple: # reduce the dumb classifications errors @MODELS.register_module() class Shared2FCBBoxHead(ConvFCBBoxHead): - def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: + def __init__(self, fc_out_channels: int = 1024, **kwargs) -> None: super().__init__( num_shared_convs=0, num_shared_fcs=2, @@ -220,14 +243,13 @@ def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: num_reg_convs=0, num_reg_fcs=0, fc_out_channels=fc_out_channels, - *args, **kwargs, ) @MODELS.register_module() class Shared4Conv1FCBBoxHead(ConvFCBBoxHead): - def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: + def __init__(self, fc_out_channels: int = 1024, **kwargs) -> None: super().__init__( num_shared_convs=4, num_shared_fcs=1, @@ -236,6 +258,5 @@ def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: num_reg_convs=0, num_reg_fcs=0, fc_out_channels=fc_out_channels, - *args, **kwargs, ) diff --git a/visdet/models/roi_heads/cascade_roi_head.py b/visdet/models/roi_heads/cascade_roi_head.py index a5cdabdf..ae6e7fc3 100644 --- a/visdet/models/roi_heads/cascade_roi_head.py +++ b/visdet/models/roi_heads/cascade_roi_head.py @@ -1,5 +1,3 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
"""Cascade RoI head for visdet.""" @@ -12,7 +10,7 @@ from torch import Tensor from visdet.engine.structures import InstanceData - +from visdet.models.roi_heads.base_roi_head import BaseRoIHead from visdet.models.task_modules.samplers import SamplingResult from visdet.models.test_time_augs import merge_aug_masks from visdet.models.utils import empty_instances, unpack_gt_instances @@ -27,8 +25,6 @@ OptMultiConfig, ) -from visdet.models.roi_heads.base_roi_head import BaseRoIHead - @MODELS.register_module() class CascadeRoIHead(BaseRoIHead): @@ -54,8 +50,8 @@ def __init__( assert bbox_head is not None assert shared_head is None, "Shared head is not supported in Cascade RCNN." - self.num_stages = num_stages - self.stage_loss_weights = stage_loss_weights + self.num_stages = num_stages # type: ignore[misc] + self.stage_loss_weights = stage_loss_weights # type: ignore[misc] super().__init__( bbox_roi_extractor=bbox_roi_extractor, bbox_head=bbox_head, @@ -93,29 +89,29 @@ def init_mask_head(self, mask_roi_extractor: MultiConfig, mask_head: MultiConfig self.mask_head.append(MODELS.build(head_cfg)) if mask_roi_extractor is not None: - self.share_roi_extractor = False - self.mask_roi_extractor = nn.ModuleList() + self.share_roi_extractor = False # type: ignore[misc] + self.mask_roi_extractor = nn.ModuleList() # type: ignore[misc] if not isinstance(mask_roi_extractor, list): mask_roi_extractor = [mask_roi_extractor for _ in range(self.num_stages)] assert len(mask_roi_extractor) == self.num_stages for roi_extractor_cfg in mask_roi_extractor: - self.mask_roi_extractor.append(MODELS.build(roi_extractor_cfg)) + self.mask_roi_extractor.append(MODELS.build(roi_extractor_cfg)) # type: ignore[union-attr] else: - self.share_roi_extractor = True - self.mask_roi_extractor = self.bbox_roi_extractor + self.share_roi_extractor = True # type: ignore[misc] + self.mask_roi_extractor = self.bbox_roi_extractor # type: ignore[misc] def init_assigner_sampler(self) -> None: """Initialize assigner and 
sampler for each stage.""" - self.bbox_assigner: list | None = [] - self.bbox_sampler: list | None = [] + self.bbox_assigner: list | None = [] # type: ignore[misc] + self.bbox_sampler: list | None = [] # type: ignore[misc] if self.train_cfg is not None: assert isinstance(self.train_cfg, (list, tuple)), ( "Cascade RCNN expects list-style train_cfg for each stage." ) for idx, rcnn_train_cfg in enumerate(self.train_cfg): - self.bbox_assigner.append(TASK_UTILS.build(rcnn_train_cfg["assigner"])) - self.current_stage = idx - self.bbox_sampler.append( + self.bbox_assigner.append(TASK_UTILS.build(rcnn_train_cfg["assigner"])) # type: ignore[union-attr] + self.current_stage = idx # type: ignore[misc] + self.bbox_sampler.append( # type: ignore[union-attr] TASK_UTILS.build(rcnn_train_cfg["sampler"], default_args=dict(context=self)), ) @@ -185,13 +181,13 @@ def loss( losses: dict[str, Tensor] = {} results_list = rpn_results_list for stage in range(self.num_stages): - self.current_stage = stage + self.current_stage = stage # type: ignore[misc] stage_loss_weight = self.stage_loss_weights[stage] sampling_results: list[SamplingResult] = [] if self.with_bbox or self.with_mask: - bbox_assigner = self.bbox_assigner[stage] - bbox_sampler = self.bbox_sampler[stage] + bbox_assigner = self.bbox_assigner[stage] # type: ignore[index] + bbox_sampler = self.bbox_sampler[stage] # type: ignore[index] for i in range(num_imgs): results = results_list[i] results.priors = results.pop("bboxes") @@ -346,7 +342,10 @@ def _refine_roi( refined_bboxes = get_box_tensor(refined_bboxes) refined_rois = torch.cat([rois[img_idx][:, [0]], refined_bboxes], dim=1) refine_rois_list.append(refined_rois) - rois = torch.cat(refine_rois_list) if refine_rois_list else rois[0].new_zeros((0, 5)) + if refine_rois_list: + rois = torch.cat(refine_rois_list, dim=0) + else: + rois = rois[0].new_zeros((0, 5)) cls_scores = [ sum(score_set[i] for score_set in ms_scores) / float(len(ms_scores)) for i in 
range(len(batch_img_metas)) @@ -371,7 +370,7 @@ def forward( if self.with_mask: aug_masks = [] if isinstance(rois, (list, tuple)): - rois = torch.cat(rois) + rois = torch.cat(list(rois), dim=0) # type: ignore[arg-type] for stage in range(self.num_stages): mask_results = self._mask_forward(stage, x, rois) mask_preds = mask_results["mask_preds"].split(num_proposals_per_img, 0) diff --git a/visdet/models/roi_heads/mask_heads/fcn_mask_head.py b/visdet/models/roi_heads/mask_heads/fcn_mask_head.py index d7ebbd05..b6a6db6a 100644 --- a/visdet/models/roi_heads/mask_heads/fcn_mask_head.py +++ b/visdet/models/roi_heads/mask_heads/fcn_mask_head.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any + import numpy as np import torch import torch.nn as nn @@ -14,6 +16,7 @@ from visdet.models.task_modules.samplers import SamplingResult from visdet.models.utils import empty_instances from visdet.registry import MODELS +from visdet.structures.bbox import get_box_tensor from visdet.structures.mask import mask_target from visdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig @@ -25,6 +28,26 @@ @MODELS.register_module() class FCNMaskHead(BaseModule): + num_convs: int + roi_feat_size: tuple[int, int] + in_channels: int + conv_kernel_size: int + conv_out_channels: int + upsample_cfg: Any + upsample_method: Any + scale_factor: Any + num_classes: int + class_agnostic: bool + conv_cfg: Any + norm_cfg: Any + predictor_cfg: Any + loss_mask: Any + convs: Any + upsample: Any + conv_logits: Any + relu: nn.ReLU + debug_imgs: Any + def __init__( self, num_convs: int = 4, @@ -43,27 +66,27 @@ ) -> None: assert init_cfg is None, "To prevent abnormal initialization behavior, init_cfg is not allowed to be set" super().__init__(init_cfg=init_cfg) - self.upsample_cfg = upsample_cfg.copy() + self.upsample_cfg = upsample_cfg.copy() # type: ignore[unresolved-attribute] if self.upsample_cfg["type"] not in [None, "deconv", "nearest", 
"bilinear"]: raise ValueError( f'Invalid upsample method {self.upsample_cfg["type"]}, accepted methods are "deconv", "nearest", "bilinear"' ) - self.num_convs = num_convs + self.num_convs = num_convs # type: ignore[unresolved-attribute] # WARN: roi_feat_size is reserved and not used - self.roi_feat_size = _pair(roi_feat_size) - self.in_channels = in_channels - self.conv_kernel_size = conv_kernel_size - self.conv_out_channels = conv_out_channels - self.upsample_method = self.upsample_cfg.get("type") - self.scale_factor = self.upsample_cfg.pop("scale_factor", None) - self.num_classes = num_classes - self.class_agnostic = class_agnostic - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.predictor_cfg = predictor_cfg - self.loss_mask = MODELS.build(loss_mask) - - self.convs = ModuleList() + self.roi_feat_size = _pair(roi_feat_size) # type: ignore[unresolved-attribute] + self.in_channels = in_channels # type: ignore[unresolved-attribute] + self.conv_kernel_size = conv_kernel_size # type: ignore[unresolved-attribute] + self.conv_out_channels = conv_out_channels # type: ignore[unresolved-attribute] + self.upsample_method = self.upsample_cfg.get("type") # type: ignore[unresolved-attribute] + self.scale_factor = self.upsample_cfg.pop("scale_factor", None) # type: ignore[unresolved-attribute] + self.num_classes = num_classes # type: ignore[unresolved-attribute] + self.class_agnostic = class_agnostic # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.predictor_cfg = predictor_cfg # type: ignore[unresolved-attribute] + self.loss_mask = MODELS.build(loss_mask) # type: ignore[unresolved-attribute] + + self.convs = ModuleList() # type: ignore[unresolved-attribute] for i in range(self.num_convs): in_channels = self.in_channels if i == 0 else self.conv_out_channels padding = (self.conv_kernel_size - 1) // 2 @@ -80,7 +103,7 @@ def __init__( 
upsample_in_channels = self.conv_out_channels if self.num_convs > 0 else in_channels upsample_cfg_ = self.upsample_cfg.copy() if self.upsample_method is None: - self.upsample = None + self.upsample = None # type: ignore[unresolved-attribute] elif self.upsample_method == "deconv": upsample_cfg_.update( in_channels=upsample_in_channels, @@ -88,7 +111,7 @@ def __init__( kernel_size=self.scale_factor, stride=self.scale_factor, ) - self.upsample = build_upsample_layer(upsample_cfg_) + self.upsample = build_upsample_layer(upsample_cfg_) # type: ignore[unresolved-attribute] else: # suppress warnings align_corners = None if self.upsample_method == "nearest" else False @@ -97,13 +120,13 @@ def __init__( mode=self.upsample_method, align_corners=align_corners, ) - self.upsample = build_upsample_layer(upsample_cfg_) + self.upsample = build_upsample_layer(upsample_cfg_) # type: ignore[unresolved-attribute] out_channels = 1 if self.class_agnostic else self.num_classes logits_in_channel = self.conv_out_channels if self.upsample_method == "deconv" else upsample_in_channels - self.conv_logits = build_conv_layer(self.predictor_cfg, logits_in_channel, out_channels, 1) - self.relu = nn.ReLU(inplace=True) - self.debug_imgs = None + self.conv_logits = build_conv_layer(self.predictor_cfg, logits_in_channel, out_channels, 1) # type: ignore[unresolved-attribute] + self.relu = nn.ReLU(inplace=True) # type: ignore[unresolved-attribute] + self.debug_imgs = None # type: ignore[unresolved-attribute] def init_weights(self) -> None: """Initialize the weights.""" @@ -111,9 +134,11 @@ def init_weights(self) -> None: for m in [self.upsample, self.conv_logits]: if m is None: continue - elif hasattr(m, "weight") and hasattr(m, "bias"): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - nn.init.constant_(m.bias, 0) + elif isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.Linear)): + if m.weight is not None: + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") 
+ if m.bias is not None: + nn.init.constant_(m.bias, 0) def forward(self, x: Tensor) -> Tensor: """Forward features from the upstream network. @@ -253,9 +278,11 @@ def predict_by_feat( mask_thr_binary=rcnn_test_cfg.mask_thr_binary, )[0] else: + # Convert bboxes to tensor if needed + bboxes_tensor = get_box_tensor(bboxes) im_mask = self._predict_by_feat_single( mask_preds=mask_preds[img_id], - bboxes=bboxes, + bboxes=bboxes_tensor, labels=results.labels, img_meta=img_meta, rcnn_test_cfg=rcnn_test_cfg, diff --git a/visdet/models/roi_heads/roi_extractors/base_roi_extractor.py b/visdet/models/roi_heads/roi_extractors/base_roi_extractor.py index bb1c1f40..f67dd277 100644 --- a/visdet/models/roi_heads/roi_extractors/base_roi_extractor.py +++ b/visdet/models/roi_heads/roi_extractors/base_roi_extractor.py @@ -22,6 +22,9 @@ class BaseRoIExtractor(BaseModule, metaclass=ABCMeta): dict], optional): Initialization config dict. Defaults to None. """ + out_channels: int + featmap_strides: list[int] + def __init__( self, roi_layer: ConfigType, @@ -31,8 +34,8 @@ def __init__( ) -> None: super().__init__(init_cfg=init_cfg) self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides) - self.out_channels = out_channels - self.featmap_strides = featmap_strides + self.out_channels = out_channels # type: ignore[unresolved-attribute] + self.featmap_strides = featmap_strides # type: ignore[unresolved-attribute] @property def num_inputs(self) -> int: diff --git a/visdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/visdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py index 2c588232..29102c72 100644 --- a/visdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py +++ b/visdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py @@ -27,6 +27,8 @@ class SingleRoIExtractor(BaseRoIExtractor): dict], optional): Initialization config dict. Defaults to None. 
""" + finest_scale: int + def __init__( self, roi_layer: ConfigType, @@ -41,7 +43,7 @@ def __init__( featmap_strides=featmap_strides, init_cfg=init_cfg, ) - self.finest_scale = finest_scale + self.finest_scale = finest_scale # type: ignore[unresolved-attribute] def map_roi_levels(self, rois: Tensor, num_levels: int) -> Tensor: """Map rois to corresponding feature levels by scales. diff --git a/visdet/models/roi_heads/standard_roi_head.py b/visdet/models/roi_heads/standard_roi_head.py index 69c08c2b..d7a8ce80 100644 --- a/visdet/models/roi_heads/standard_roi_head.py +++ b/visdet/models/roi_heads/standard_roi_head.py @@ -1,18 +1,18 @@ -# ruff: noqa -# type: ignore +from typing import List, Optional, Tuple + import torch import torch.nn as nn -from visdet.registry import MODELS, TASK_UTILS -from visdet.structures.bbox import bbox2roi -from visdet.models.utils import empty_instances, unpack_gt_instances -from visdet.engine.structures import InstanceData -from visdet.utils.typing_utils import ConfigType -from visdet.utils import InstanceList, OptConfigType, OptMultiConfig -from visdet.structures import DetDataSample, SampleList -from typing import List, Optional, Tuple from torch import Tensor + +from visdet.engine.structures import InstanceData from visdet.models.roi_heads.base_roi_head import BaseRoIHead from visdet.models.task_modules.samplers import SamplingResult +from visdet.models.utils import empty_instances, unpack_gt_instances +from visdet.registry import MODELS, TASK_UTILS +from visdet.structures import DetDataSample, SampleList +from visdet.structures.bbox import bbox2roi +from visdet.utils import InstanceList, OptConfigType, OptMultiConfig +from visdet.utils.typing_utils import ConfigType @MODELS.register_module() @@ -39,17 +39,17 @@ def init_mask_head(self, mask_roi_extractor: ConfigType, mask_head: ConfigType) mask_head (dict or ConfigDict): Config of mask in mask head. 
""" if mask_roi_extractor is not None: - self.mask_roi_extractor = MODELS.build(mask_roi_extractor) - self.share_roi_extractor = False + self.mask_roi_extractor = MODELS.build(mask_roi_extractor) # type: ignore[misc] + self.share_roi_extractor = False # type: ignore[misc] else: - self.share_roi_extractor = True - self.mask_roi_extractor = self.bbox_roi_extractor - self.mask_head = MODELS.build(mask_head) + self.share_roi_extractor = True # type: ignore[misc] + self.mask_roi_extractor = self.bbox_roi_extractor # type: ignore[misc] + self.mask_head = MODELS.build(mask_head) # type: ignore[misc] def init_assigner_sampler(self) -> None: """Initialize assigner and sampler.""" - self.bbox_assigner = None - self.bbox_sampler = None + self.bbox_assigner = None # type: ignore[misc] + self.bbox_sampler = None # type: ignore[misc] if self.train_cfg: # Support both direct train_cfg and nested under 'rcnn' key if "rcnn" in self.train_cfg: @@ -63,7 +63,7 @@ def forward( self, x: Tuple[Tensor], rpn_results_list: InstanceList, - batch_data_samples: SampleList = None, + batch_data_samples: SampleList | None = None, ) -> tuple: """Network forward process. Usually includes backbone, neck and head forward without any post-processing. 
@@ -127,8 +127,8 @@ def loss( rpn_results = rpn_results_list[i] rpn_results.priors = rpn_results.pop("bboxes") - assign_result = self.bbox_assigner.assign(rpn_results, batch_gt_instances[i], batch_gt_instances_ignore[i]) - sampling_result = self.bbox_sampler.sample( + assign_result = self.bbox_assigner.assign(rpn_results, batch_gt_instances[i], batch_gt_instances_ignore[i]) # type: ignore[union-attr] + sampling_result = self.bbox_sampler.sample( # type: ignore[union-attr] assign_result, rpn_results, batch_gt_instances[i], @@ -256,7 +256,7 @@ def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: def _mask_forward( self, x: Tuple[Tensor], - rois: Tensor = None, + rois: Tensor | None = None, pos_inds: Optional[Tensor] = None, bbox_feats: Optional[Tensor] = None, ) -> dict: diff --git a/visdet/models/task_modules/assigners/assign_result.py b/visdet/models/task_modules/assigners/assign_result.py index afd6db1e..99b572c4 100644 --- a/visdet/models/task_modules/assigners/assign_result.py +++ b/visdet/models/task_modules/assigners/assign_result.py @@ -113,7 +113,7 @@ def random(cls, **kwargs): >>> self = AssignResult.random() >>> print(self.info) """ - from ..samplers.sampling_result import ensure_rng + from visdet.core.bbox.demodata import ensure_rng rng = ensure_rng(kwargs.get("rng", None)) diff --git a/visdet/models/task_modules/samplers/__init__.py b/visdet/models/task_modules/samplers/__init__.py index 83b056e8..73c0b625 100644 --- a/visdet/models/task_modules/samplers/__init__.py +++ b/visdet/models/task_modules/samplers/__init__.py @@ -3,6 +3,7 @@ from visdet.registry import TASK_UTILS from visdet.utils import util_mixins from visdet.engine.structures import InstanceData +from visdet.structures.bbox import BaseBoxes class SamplingResult(util_mixins.NiceRepr): @@ -74,6 +75,10 @@ def sample( priors = pred_instances.priors gt_bboxes = gt_instances.bboxes + # Convert BaseBoxes to tensor if needed + if isinstance(gt_bboxes, BaseBoxes): + gt_bboxes = 
gt_bboxes.tensor + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False).squeeze(-1) neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False).squeeze(-1) @@ -111,6 +116,10 @@ def sample( priors = pred_instances.priors gt_bboxes = gt_instances.bboxes + # Convert BaseBoxes to tensor if needed + if isinstance(gt_bboxes, BaseBoxes): + gt_bboxes = gt_bboxes.tensor + num_expected_pos = int(self.num * self.pos_fraction) pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False).squeeze(-1) if pos_inds.numel() > num_expected_pos: diff --git a/visdet/models/utils/image.py b/visdet/models/utils/image.py index c09d197b..da572880 100644 --- a/visdet/models/utils/image.py +++ b/visdet/models/utils/image.py @@ -21,7 +21,7 @@ def imrenormalize(img: Tensor | np.ndarray, img_norm_cfg: dict, new_img_norm_cfg assert img.ndim == 4 and img.shape[0] == 1 new_img = img.squeeze(0).cpu().numpy().transpose(1, 2, 0) new_img = _imrenormalize(new_img, img_norm_cfg, new_img_norm_cfg) - new_img = new_img.transpose(2, 0, 1)[None] + new_img = new_img.transpose(2, 0, 1)[None] # type: ignore[misc] return torch.from_numpy(new_img).to(img) else: return _imrenormalize(img, img_norm_cfg, new_img_norm_cfg) diff --git a/visdet/models/utils/res_layer.py b/visdet/models/utils/res_layer.py index 5b9c192f..c9ec00e9 100644 --- a/visdet/models/utils/res_layer.py +++ b/visdet/models/utils/res_layer.py @@ -149,7 +149,7 @@ def __init__( assert dcn is None, "Not implemented yet." assert plugins is None, "Not implemented yet." assert not with_cp, "Not implemented yet." 
- self.with_norm = norm_cfg is not None + self.with_norm: bool = norm_cfg is not None # type: ignore[misc] with_bias = True if norm_cfg is None else False self.conv1 = build_conv_layer( conv_cfg, @@ -162,18 +162,20 @@ def __init__( bias=with_bias, ) if self.with_norm: - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm1_name: str = norm1_name # type: ignore[misc] self.add_module(self.norm1_name, norm1) self.conv2 = build_conv_layer(conv_cfg, planes, planes, 3, padding=1, bias=with_bias) if self.with_norm: - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm2_name: str = norm2_name # type: ignore[misc] self.add_module(self.norm2_name, norm2) self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.with_cp = with_cp + self.downsample: nn.Module | None = downsample # type: ignore[misc] + self.stride: int = stride # type: ignore[misc] + self.dilation: int = dilation # type: ignore[misc] + self.with_cp: bool = with_cp # type: ignore[misc] @property def norm1(self): diff --git a/visdet/runner.py b/visdet/runner.py index 7d78f155..7cb9e577 100644 --- a/visdet/runner.py +++ b/visdet/runner.py @@ -128,7 +128,7 @@ def _deep_merge(self, base: dict, override: dict) -> dict: def _build_config(self) -> None: """Build a full MMEngine-compatible configuration from resolved presets.""" - from visdet.engine import Config + from visdet.engine.config import Config # Automatically sync num_classes from dataset to model self._sync_num_classes() @@ -241,7 +241,7 @@ def train(self) -> None: """ # MMEngineRunner is imported here to avoid potential circular dependencies # and to ensure registries are populated first. 
- from visdet.engine import DefaultScope + from visdet.engine.registry import DefaultScope from visdet.engine.runner import Runner as MMEngineRunner # Ensure the 'visdet' scope is active for component registration. diff --git a/visdet/structures/bbox/base_boxes.py b/visdet/structures/bbox/base_boxes.py index 572ebc77..71ba7f7c 100644 --- a/visdet/structures/bbox/base_boxes.py +++ b/visdet/structures/bbox/base_boxes.py @@ -1,9 +1,8 @@ # ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. -from abc import ABCMeta, abstractmethod, abstractproperty, abstractstaticmethod +from abc import ABCMeta, abstractmethod from collections.abc import Sequence -from typing import TypeVar, Union +from typing import TypeVar, Union, cast import numpy as np import torch @@ -11,16 +10,13 @@ from visdet.structures.mask.structures import BitmapMasks, PolygonMasks -T = TypeVar("T") +T = TypeVar("T", bound="BaseBoxes") DeviceType = Union[str, torch.device] IndexType = Union[ slice, int, list, - torch.LongTensor, - torch.cuda.LongTensor, - torch.BoolTensor, - torch.cuda.BoolTensor, + Tensor, np.ndarray, ] MaskType = Union[BitmapMasks, PolygonMasks] @@ -117,7 +113,7 @@ def empty_boxes(self: T, dtype: torch.dtype | None = None, device: DeviceType | T: empty boxes with shape of (0, box_dim). """ empty_box = self.tensor.new_zeros(0, self.box_dim, dtype=dtype, device=device) - return type(self)(empty_box, clone=False) + return cast(T, type(self)(empty_box, clone=False)) def fake_boxes( self: T, @@ -139,7 +135,7 @@ def fake_boxes( T: Fake boxes with shape of ``sizes``. 
""" fake_boxes = self.tensor.new_full(sizes, fill, dtype=dtype, device=device) - return type(self)(fake_boxes, clone=False) + return cast(T, type(self)(fake_boxes, clone=False)) def __getitem__(self: T, index: IndexType) -> T: """Rewrite getitem to protect the last dimension shape.""" @@ -158,12 +154,13 @@ def __getitem__(self: T, index: IndexType) -> T: boxes = boxes[index] if boxes.dim() == 1: boxes = boxes.reshape(1, -1) - return type(self)(boxes, clone=False) + return cast(T, type(self)(boxes, clone=False)) - def __setitem__(self: T, index: IndexType, values: Tensor | T) -> T: + def __setitem__(self: T, index: IndexType, values: Tensor | T) -> None: """Rewrite setitem to protect the last dimension shape.""" + assert isinstance(values, BaseBoxes), "The value to be set must be a BaseBoxes instance" assert type(values) is type(self), "The value to be set must be the same box type as self" - values = values.tensor + values_tensor = values.tensor if isinstance(index, np.ndarray): index = torch.as_tensor(index, device=self.device) @@ -176,7 +173,7 @@ def __setitem__(self: T, index: IndexType, values: Tensor | T) -> T: if Ellipsis in index: assert index[-1] is Ellipsis - self.tensor[index] = values + self.tensor[index] = values_tensor def __len__(self) -> int: """Return the length of self.tensor first dimension.""" @@ -247,84 +244,84 @@ def numpy(self) -> np.ndarray: def to(self: T, *args, **kwargs) -> T: """Reload ``to`` from self.tensor.""" - return type(self)(self.tensor.to(*args, **kwargs), clone=False) + return cast(T, type(self)(self.tensor.to(*args, **kwargs), clone=False)) def cpu(self: T) -> T: """Reload ``cpu`` from self.tensor.""" - return type(self)(self.tensor.cpu(), clone=False) + return cast(T, type(self)(self.tensor.cpu(), clone=False)) def cuda(self: T, *args, **kwargs) -> T: """Reload ``cuda`` from self.tensor.""" - return type(self)(self.tensor.cuda(*args, **kwargs), clone=False) + return cast(T, type(self)(self.tensor.cuda(*args, **kwargs), 
clone=False)) def clone(self: T) -> T: """Reload ``clone`` from self.tensor.""" - return type(self)(self.tensor) + return cast(T, type(self)(self.tensor)) def detach(self: T) -> T: """Reload ``detach`` from self.tensor.""" - return type(self)(self.tensor.detach(), clone=False) + return cast(T, type(self)(self.tensor.detach(), clone=False)) - def view(self: T, *shape: tuple[int]) -> T: + def view(self: T, *shape: int) -> T: """Reload ``view`` from self.tensor.""" - return type(self)(self.tensor.view(shape), clone=False) + return cast(T, type(self)(self.tensor.view(*shape), clone=False)) - def reshape(self: T, *shape: tuple[int]) -> T: + def reshape(self: T, *shape: int) -> T: """Reload ``reshape`` from self.tensor.""" - return type(self)(self.tensor.reshape(shape), clone=False) + return cast(T, type(self)(self.tensor.reshape(*shape), clone=False)) - def expand(self: T, *sizes: tuple[int]) -> T: + def expand(self: T, *sizes: int) -> T: """Reload ``expand`` from self.tensor.""" - return type(self)(self.tensor.expand(sizes), clone=False) + return cast(T, type(self)(self.tensor.expand(*sizes), clone=False)) - def repeat(self: T, *sizes: tuple[int]) -> T: + def repeat(self: T, *sizes: int) -> T: """Reload ``repeat`` from self.tensor.""" - return type(self)(self.tensor.repeat(sizes), clone=False) + return cast(T, type(self)(self.tensor.repeat(*sizes), clone=False)) def transpose(self: T, dim0: int, dim1: int) -> T: """Reload ``transpose`` from self.tensor.""" ndim = self.tensor.dim() assert dim0 != -1 and dim0 != ndim - 1 assert dim1 != -1 and dim1 != ndim - 1 - return type(self)(self.tensor.transpose(dim0, dim1), clone=False) + return cast(T, type(self)(self.tensor.transpose(dim0, dim1), clone=False)) - def permute(self: T, *dims: tuple[int]) -> T: + def permute(self: T, *dims: int) -> T: """Reload ``permute`` from self.tensor.""" assert dims[-1] == -1 or dims[-1] == self.tensor.dim() - 1 - return type(self)(self.tensor.permute(dims), clone=False) + return cast(T, 
type(self)(self.tensor.permute(*dims), clone=False)) def split(self: T, split_size_or_sections: int | Sequence[int], dim: int = 0) -> list[T]: """Reload ``split`` from self.tensor.""" assert dim != -1 and dim != self.tensor.dim() - 1 boxes_list = self.tensor.split(split_size_or_sections, dim=dim) - return [type(self)(boxes, clone=False) for boxes in boxes_list] + return [cast(T, type(self)(boxes, clone=False)) for boxes in boxes_list] def chunk(self: T, chunks: int, dim: int = 0) -> list[T]: """Reload ``chunk`` from self.tensor.""" assert dim != -1 and dim != self.tensor.dim() - 1 boxes_list = self.tensor.chunk(chunks, dim=dim) - return [type(self)(boxes, clone=False) for boxes in boxes_list] + return [cast(T, type(self)(boxes, clone=False)) for boxes in boxes_list] - def unbind(self: T, dim: int = 0) -> T: + def unbind(self: T, dim: int = 0) -> list[T]: """Reload ``unbind`` from self.tensor.""" assert dim != -1 and dim != self.tensor.dim() - 1 boxes_list = self.tensor.unbind(dim=dim) - return [type(self)(boxes, clone=False) for boxes in boxes_list] + return [cast(T, type(self)(boxes, clone=False)) for boxes in boxes_list] def flatten(self: T, start_dim: int = 0, end_dim: int = -2) -> T: """Reload ``flatten`` from self.tensor.""" assert end_dim != -1 and end_dim != self.tensor.dim() - 1 - return type(self)(self.tensor.flatten(start_dim, end_dim), clone=False) + return cast(T, type(self)(self.tensor.flatten(start_dim, end_dim), clone=False)) def squeeze(self: T, dim: int | None = None) -> T: """Reload ``squeeze`` from self.tensor.""" boxes = self.tensor.squeeze() if dim is None else self.tensor.squeeze(dim) - return type(self)(boxes, clone=False) + return cast(T, type(self)(boxes, clone=False)) def unsqueeze(self: T, dim: int) -> T: """Reload ``unsqueeze`` from self.tensor.""" assert dim != -1 and dim != self.tensor.dim() - return type(self)(self.tensor.unsqueeze(dim), clone=False) + return cast(T, type(self)(self.tensor.unsqueeze(dim), clone=False)) @classmethod 
def cat(cls: type[T], box_list: Sequence[T], dim: int = 0) -> T: @@ -371,22 +368,26 @@ def stack(cls: type[T], box_list: Sequence[T], dim: int = 0) -> T: th_box_list = [boxes.tensor for boxes in box_list] return cls(torch.stack(th_box_list, dim=dim), clone=False) - @abstractproperty + @property + @abstractmethod def centers(self) -> Tensor: """Return a tensor representing the centers of boxes.""" pass - @abstractproperty + @property + @abstractmethod def areas(self) -> Tensor: """Return a tensor representing the areas of boxes.""" pass - @abstractproperty + @property + @abstractmethod def widths(self) -> Tensor: """Return a tensor representing the widths of boxes.""" pass - @abstractproperty + @property + @abstractmethod def heights(self) -> Tensor: """Return a tensor representing the heights of boxes.""" pass @@ -516,7 +517,8 @@ def find_inside_points(self, points: Tensor, is_aligned: bool = False) -> BoolTe """ pass - @abstractstaticmethod + @staticmethod + @abstractmethod def overlaps( boxes1: "BaseBoxes", boxes2: "BaseBoxes", @@ -544,7 +546,8 @@ def overlaps( """ pass - @abstractstaticmethod + @staticmethod + @abstractmethod def from_instance_masks(masks: MaskType) -> "BaseBoxes": """Create boxes from instance masks. diff --git a/visdet/structures/bbox/bbox_overlaps.py b/visdet/structures/bbox/bbox_overlaps.py index 6a15bb4d..7772ab4b 100644 --- a/visdet/structures/bbox/bbox_overlaps.py +++ b/visdet/structures/bbox/bbox_overlaps.py @@ -1,10 +1,11 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Literal + import torch +from torch import Tensor -def fp16_clamp(x, min=None, max=None): +def fp16_clamp(x: Tensor, min: float | None = None, max: float | None = None) -> Tensor: if not x.is_cuda and x.dtype == torch.float16: # clamp for cpu float16, tensor fp16 has no clamp implementation return x.float().clamp(min, max).half() @@ -12,7 +13,13 @@ def fp16_clamp(x, min=None, max=None): return x.clamp(min, max) -def bbox_overlaps(bboxes1, bboxes2, mode="iou", is_aligned=False, eps=1e-6): +def bbox_overlaps( + bboxes1: Tensor, + bboxes2: Tensor, + mode: Literal["iou", "iof", "giou"] = "iou", + is_aligned: bool = False, + eps: float = 1e-6, +) -> Tensor: """Calculate overlap between two set of bboxes. FP16 Contributed by https://github.com/open-mmlab/mmdetection/pull/4889 @@ -146,14 +153,14 @@ def bbox_overlaps(bboxes1, bboxes2, mode="iou", is_aligned=False, eps=1e-6): enclosed_lt = torch.min(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) enclosed_rb = torch.max(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) - eps = union.new_tensor([eps]) - union = torch.max(union, eps) + eps_tensor = union.new_tensor([eps]) + union = torch.max(union, eps_tensor) ious = overlap / union if mode in ["iou", "iof"]: return ious # calculate gious enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0) enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] - enclose_area = torch.max(enclose_area, eps) + enclose_area = torch.max(enclose_area, eps_tensor) gious = ious - (enclose_area - union) / enclose_area return gious diff --git a/visdet/structures/bbox/box_type.py b/visdet/structures/bbox/box_type.py index ba019196..85400b62 100644 --- a/visdet/structures/bbox/box_type.py +++ b/visdet/structures/bbox/box_type.py @@ -1,8 +1,6 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
from collections.abc import Callable -from typing import Union +from typing import Any, TypeVar, Union import numpy as np import torch @@ -11,10 +9,12 @@ from visdet.structures.bbox.base_boxes import BaseBoxes BoxType = Union[np.ndarray, Tensor, BaseBoxes] +T = TypeVar("T", bound=type) +F = TypeVar("F", bound=Callable) -box_types: dict = {} -_box_type_to_name: dict = {} -box_converters: dict = {} +box_types: dict[str, type] = {} +_box_type_to_name: dict[type, str] = {} +box_converters: dict[str, Callable] = {} def _register_box(name: str, box_type: type, force: bool = False) -> None: @@ -42,7 +42,7 @@ def _register_box(name: str, box_type: type, force: bool = False) -> None: _box_type_to_name[box_type] = name -def register_box(name: str, box_type: type | None = None, force: bool = False) -> type | Callable: +def register_box(name: str, box_type: T | None = None, force: bool = False) -> T | Callable[[T], T]: """Register a box type. A record will be added to ``bbox_types``, whose key is the box type name @@ -80,7 +80,7 @@ def register_box(name: str, box_type: type | None = None, force: bool = False) - return box_type # use it as a decorator: @register_box(name) - def _register(cls): + def _register(cls: T) -> T: _register_box(name=name, box_type=cls, force=force) return cls @@ -113,9 +113,9 @@ def _register_box_converter( def register_box_converter( src_type: str | type, dst_type: str | type, - converter: Callable | None = None, + converter: F | None = None, force: bool = False, -) -> Callable: +) -> F | Callable[[F], F]: """Register a box converter. 
A record will be added to ``box_converter``, whose key is @@ -151,7 +151,7 @@ def register_box_converter( return converter # use it as a decorator: @register_box_converter(name) - def _register(func): + def _register(func: F) -> F: _register_box_converter(src_type=src_type, dst_type=dst_type, converter=func, force=force) return func @@ -226,16 +226,17 @@ def convert_box_type( converter = box_converters[converter_name] if is_box_cls: - boxes = converter(boxes.tensor) - return dst_type_cls(boxes) + converted_boxes: Tensor = converter(boxes.tensor) # type: ignore[arg-type] + return dst_type_cls(converted_boxes) elif is_numpy: - boxes = converter(torch.from_numpy(boxes)) - return boxes.numpy() + converted_boxes = converter(torch.from_numpy(boxes)) # type: ignore[arg-type] + assert isinstance(converted_boxes, Tensor) + return converted_boxes.numpy() else: - return converter(boxes) + return converter(boxes) # type: ignore[arg-type,return-value] -def autocast_box_type(dst_box_type="hbox") -> Callable: +def autocast_box_type(dst_box_type: str = "hbox") -> Callable[[Callable], Callable]: """A decorator which automatically casts results['gt_bboxes'] to the destination box type. @@ -253,7 +254,7 @@ def autocast_box_type(dst_box_type="hbox") -> Callable: _, box_type_cls = get_box_type(dst_box_type) def decorator(func: Callable) -> Callable: - def wrapper(self, results: dict, *args, **kwargs) -> dict: + def wrapper(self: Any, results: dict, *args: Any, **kwargs: Any) -> dict: if "gt_bboxes" not in results or isinstance(results["gt_bboxes"], BaseBoxes): return func(self, results) elif isinstance(results["gt_bboxes"], np.ndarray): diff --git a/visdet/structures/bbox/coders/base_bbox_coder.py b/visdet/structures/bbox/coders/base_bbox_coder.py index 96ba1e3e..f5958c96 100644 --- a/visdet/structures/bbox/coders/base_bbox_coder.py +++ b/visdet/structures/bbox/coders/base_bbox_coder.py @@ -1,7 +1,8 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
from abc import ABCMeta, abstractmethod +from typing import Any + +from torch import Tensor class BaseBBoxCoder(metaclass=ABCMeta): @@ -15,14 +16,14 @@ class BaseBBoxCoder(metaclass=ABCMeta): # The size of the last of dimension of the encoded tensor. encode_size = 4 - def __init__(self, use_box_type: bool = False, **kwargs): + def __init__(self, use_box_type: bool = False, **kwargs: Any) -> None: self.use_box_type = use_box_type @abstractmethod - def encode(self, bboxes, gt_bboxes): + def encode(self, bboxes: Tensor, gt_bboxes: Tensor) -> Tensor: """Encode deltas between bboxes and ground truth boxes.""" @abstractmethod - def decode(self, bboxes, bboxes_pred): + def decode(self, bboxes: Tensor, bboxes_pred: Tensor) -> Tensor: """Decode the predicted bboxes according to prediction and base boxes.""" diff --git a/visdet/structures/bbox/horizontal_boxes.py b/visdet/structures/bbox/horizontal_boxes.py index 1691aeea..eb4ff98d 100644 --- a/visdet/structures/bbox/horizontal_boxes.py +++ b/visdet/structures/bbox/horizontal_boxes.py @@ -1,7 +1,6 @@ # ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import TypeVar, Union +from typing import Literal, TypeVar, Union, cast import cv2 import numpy as np @@ -52,8 +51,8 @@ class HorizontalBoxes(BaseBoxes): def __init__( self, data: Tensor | np.ndarray, - dtype: torch.dtype = None, - device: DeviceType = None, + dtype: torch.dtype | None = None, + device: DeviceType | None = None, clone: bool = True, in_mode: str | None = None, ) -> None: @@ -247,8 +246,8 @@ def rescale_(self, scale_factor: tuple[float, float]) -> None: """ boxes = self.tensor assert len(scale_factor) == 2 - scale_factor = boxes.new_tensor(scale_factor).repeat(2) - self.tensor = boxes * scale_factor + scale_factor_tensor = boxes.new_tensor(scale_factor).repeat(2) + self.tensor = boxes * scale_factor_tensor def resize_(self, scale_factor: tuple[float, float]) -> None: """Resize the box width and height w.r.t scale_factor in-place. @@ -267,8 +266,8 @@ def resize_(self, scale_factor: tuple[float, float]) -> None: assert len(scale_factor) == 2 ctrs = (boxes[..., 2:] + boxes[..., :2]) / 2 wh = boxes[..., 2:] - boxes[..., :2] - scale_factor = boxes.new_tensor(scale_factor) - wh = wh * scale_factor + scale_factor_tensor = boxes.new_tensor(scale_factor) + wh = wh * scale_factor_tensor xy1 = ctrs - 0.5 * wh xy2 = ctrs + 0.5 * wh self.tensor = torch.cat([xy1, xy2], dim=-1) @@ -296,19 +295,21 @@ def is_inside( img_h, img_w = img_shape boxes = self.tensor if all_inside: - return ( + result = ( (boxes[:, 0] >= -allowed_border) & (boxes[:, 1] >= -allowed_border) & (boxes[:, 2] < img_w + allowed_border) & (boxes[:, 3] < img_h + allowed_border) ) + return cast(BoolTensor, result) else: - return ( + result = ( (boxes[..., 0] < img_w + allowed_border) & (boxes[..., 1] < img_h + allowed_border) & (boxes[..., 2] > -allowed_border) & (boxes[..., 3] > -allowed_border) ) + return cast(BoolTensor, result) def find_inside_points(self, points: Tensor, is_aligned: bool = False) -> BoolTensor: """Find inside box points. Boxes dimension must be 2. 
@@ -335,12 +336,13 @@ def find_inside_points(self, points: Tensor, is_aligned: bool = False) -> BoolTe assert boxes.size(0) == points.size(0) x_min, y_min, x_max, y_max = boxes.unbind(dim=-1) - return ( + result = ( (points[..., 0] >= x_min) & (points[..., 0] <= x_max) & (points[..., 1] >= y_min) & (points[..., 1] <= y_max) ) + return cast(BoolTensor, result) def create_masks(self, img_shape: tuple[int, int]) -> BitmapMasks: """ @@ -388,7 +390,8 @@ def overlaps( """ boxes1 = boxes1.convert_to("hbox") boxes2 = boxes2.convert_to("hbox") - return bbox_overlaps(boxes1.tensor, boxes2.tensor, mode=mode, is_aligned=is_aligned, eps=eps) + mode_literal = cast(Literal["iou", "iof", "giou"], mode) + return bbox_overlaps(boxes1.tensor, boxes2.tensor, mode=mode_literal, is_aligned=is_aligned, eps=eps) @staticmethod def from_instance_masks(masks: MaskType) -> "HorizontalBoxes": diff --git a/visdet/structures/bbox/transforms.py b/visdet/structures/bbox/transforms.py index 836ccc79..4f69d9d9 100644 --- a/visdet/structures/bbox/transforms.py +++ b/visdet/structures/bbox/transforms.py @@ -1,7 +1,6 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. from collections.abc import Sequence +from typing import Literal import numpy as np import torch @@ -25,20 +24,23 @@ def find_inside_bboxes(bboxes: Tensor, img_h: int, img_w: int) -> Tensor: return inside_inds -def bbox_flip(bboxes: Tensor, img_shape: tuple[int], direction: str = "horizontal") -> Tensor: +def bbox_flip( + bboxes: Tensor, + img_shape: tuple[int, int], + direction: Literal["horizontal", "vertical", "diagonal"] = "horizontal", +) -> Tensor: """Flip bboxes horizontally or vertically. Args: bboxes (Tensor): Shape (..., 4*k) - img_shape (Tuple[int]): Image shape. - direction (str): Flip direction, options are "horizontal", "vertical", - "diagonal". Default: "horizontal" + img_shape (tuple[int, int]): Image shape as (height, width). 
+ direction (Literal["horizontal", "vertical", "diagonal"]): Flip direction. + Default: "horizontal" Returns: Tensor: Flipped bboxes. """ assert bboxes.shape[-1] % 4 == 0 - assert direction in ["horizontal", "vertical", "diagonal"] flipped = bboxes.clone() if direction == "horizontal": flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] @@ -56,10 +58,10 @@ def bbox_flip(bboxes: Tensor, img_shape: tuple[int], direction: str = "horizonta def bbox_mapping( bboxes: Tensor, - img_shape: tuple[int], - scale_factor: float | tuple[float], + img_shape: tuple[int, int], + scale_factor: float | tuple[float, float], flip: bool, - flip_direction: str = "horizontal", + flip_direction: Literal["horizontal", "vertical", "diagonal"] = "horizontal", ) -> Tensor: """Map bboxes from the original image scale to testing scale.""" new_bboxes = bboxes * bboxes.new_tensor(scale_factor) @@ -70,10 +72,10 @@ def bbox_mapping( def bbox_mapping_back( bboxes: Tensor, - img_shape: tuple[int], - scale_factor: float | tuple[float], + img_shape: tuple[int, int], + scale_factor: float | tuple[float, float], flip: bool, - flip_direction: str = "horizontal", + flip_direction: Literal["horizontal", "vertical", "diagonal"] = "horizontal", ) -> Tensor: """Map bboxes from testing scale to original image scale.""" new_bboxes = bbox_flip(bboxes, img_shape, flip_direction) if flip else bboxes @@ -140,6 +142,7 @@ def bbox2result(bboxes: Tensor | np.ndarray, labels: Tensor | np.ndarray, num_cl else: if isinstance(bboxes, torch.Tensor): bboxes = bboxes.detach().cpu().numpy() + if isinstance(labels, torch.Tensor): labels = labels.detach().cpu().numpy() return [bboxes[labels == i, :] for i in range(num_classes)] @@ -182,7 +185,7 @@ def distance2bbox( # clip bboxes with dynamic `min` and `max` for onnx if torch.onnx.is_in_onnx_export(): # TODO: delete - from visdet.core.export import dynamic_clip_for_onnx + from visdet.core.export import dynamic_clip_for_onnx # type: ignore[import-not-found] x1, y1, x2, y2 = 
dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape) bboxes = torch.stack([x1, y1, x2, y2], dim=-1) @@ -220,8 +223,8 @@ def scale_boxes(boxes: Tensor | BaseBoxes, scale_factor: tuple[float, float]) -> else: # Tensor boxes will be treated as horizontal boxes repeat_num = int(boxes.size(-1) / 2) - scale_factor = boxes.new_tensor(scale_factor).repeat((1, repeat_num)) - return boxes * scale_factor + scale_factor_tensor = boxes.new_tensor(scale_factor).repeat((1, repeat_num)) + return boxes * scale_factor_tensor def get_box_tensor(boxes: Tensor | BaseBoxes) -> Tensor: @@ -324,7 +327,7 @@ def bbox_xyxy_to_cxcywh(bbox: Tensor) -> Tensor: return torch.cat(bbox_new, dim=-1) -def bbox2corner(bboxes: torch.Tensor) -> torch.Tensor: +def bbox2corner(bboxes: Tensor) -> Tensor: """Convert bbox coordinates from (x1, y1, x2, y2) to corners ((x1, y1), (x2, y1), (x1, y2), (x2, y2)). @@ -337,7 +340,7 @@ def bbox2corner(bboxes: torch.Tensor) -> torch.Tensor: return torch.cat([x1, y1, x2, y1, x1, y2, x2, y2], dim=1).reshape(-1, 2) -def corner2bbox(corners: torch.Tensor) -> torch.Tensor: +def corner2bbox(corners: Tensor) -> Tensor: """Convert bbox coordinates from corners ((x1, y1), (x2, y1), (x1, y2), (x2, y2)) to (x1, y1, x2, y2). @@ -353,10 +356,10 @@ def corner2bbox(corners: torch.Tensor) -> torch.Tensor: def bbox_project( - bboxes: torch.Tensor | np.ndarray, - homography_matrix: torch.Tensor | np.ndarray, + bboxes: Tensor | np.ndarray, + homography_matrix: Tensor | np.ndarray, img_shape: tuple[int, int] | None = None, -) -> torch.Tensor | np.ndarray: +) -> Tensor | np.ndarray: """Geometric transformation for bbox. 
Args: @@ -372,6 +375,9 @@ def bbox_project( bboxes = torch.from_numpy(bboxes) if isinstance(homography_matrix, np.ndarray): homography_matrix = torch.from_numpy(homography_matrix) + + # At this point bboxes must be a Tensor + assert isinstance(bboxes, torch.Tensor) corners = bbox2corner(bboxes) corners = torch.cat([corners, corners.new_ones(corners.shape[0], 1)], dim=1) corners = torch.matmul(homography_matrix, corners.t()).t() @@ -401,7 +407,9 @@ def cat_boxes(data_list: list[Tensor | BaseBoxes], dim: int = 0) -> Tensor | Bas if data_list and isinstance(data_list[0], BaseBoxes): return data_list[0].cat(data_list, dim=dim) else: - return torch.cat(data_list, dim=dim) + # Type checker needs to know these are all Tensors + tensor_list: list[Tensor] = [x for x in data_list if isinstance(x, Tensor)] + return torch.cat(tensor_list, dim=dim) def stack_boxes(data_list: list[Tensor | BaseBoxes], dim: int = 0) -> Tensor | BaseBoxes: @@ -419,29 +427,9 @@ def stack_boxes(data_list: list[Tensor | BaseBoxes], dim: int = 0) -> Tensor | B if data_list and isinstance(data_list[0], BaseBoxes): return data_list[0].stack(data_list, dim=dim) else: - return torch.stack(data_list, dim=dim) - - -def scale_boxes(boxes: Tensor | BaseBoxes, scale_factor: tuple[float, float]) -> Tensor | BaseBoxes: - """Scale boxes with type of tensor or box type. - - Args: - boxes (Tensor or :obj:`BaseBoxes`): boxes need to be scaled. Its type - can be a tensor or a box type. - scale_factor (Tuple[float, float]): factors for scaling boxes. - The length should be 2. - - Returns: - Union[Tensor, :obj:`BaseBoxes`]: Scaled boxes. 
- """ - if isinstance(boxes, BaseBoxes): - boxes.rescale_(scale_factor) - return boxes - else: - # Tensor boxes will be treated as horizontal boxes - repeat_num = int(boxes.size(-1) / 2) - scale_factor = boxes.new_tensor(scale_factor).repeat((1, repeat_num)) - return boxes * scale_factor + # Type checker needs to know these are all Tensors + tensor_list: list[Tensor] = [x for x in data_list if isinstance(x, Tensor)] + return torch.stack(tensor_list, dim=dim) def get_box_wh(boxes: Tensor | BaseBoxes) -> tuple[Tensor, Tensor]: @@ -464,22 +452,6 @@ def get_box_wh(boxes: Tensor | BaseBoxes) -> tuple[Tensor, Tensor]: return w, h -def get_box_tensor(boxes: Tensor | BaseBoxes) -> Tensor: - """Get tensor data from box type boxes. - - Args: - boxes (Tensor or BaseBoxes): boxes with type of tensor or box type. - If its type is a tensor, the boxes will be directly returned. - If its type is a box type, the `boxes.tensor` will be returned. - - Returns: - Tensor: boxes tensor. - """ - if isinstance(boxes, BaseBoxes): - boxes = boxes.tensor - return boxes - - def empty_box_as(boxes: Tensor | BaseBoxes) -> Tensor | BaseBoxes: """Generate empty box according to input ``boxes` type and device. @@ -497,7 +469,7 @@ def empty_box_as(boxes: Tensor | BaseBoxes) -> Tensor | BaseBoxes: return boxes.new_zeros(0, 4) -def bbox_xyxy_to_cxcyah(bboxes: torch.Tensor) -> torch.Tensor: +def bbox_xyxy_to_cxcyah(bboxes: Tensor) -> Tensor: """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, ratio, h). Args: @@ -514,7 +486,7 @@ def bbox_xyxy_to_cxcyah(bboxes: torch.Tensor) -> torch.Tensor: return xyah -def bbox_cxcyah_to_xyxy(bboxes: torch.Tensor) -> torch.Tensor: +def bbox_cxcyah_to_xyxy(bboxes: Tensor) -> Tensor: """Convert bbox coordinates from (cx, cy, ratio, h) to (x1, y1, x2, y2). 
Args: diff --git a/visdet/structures/det_data_sample.py b/visdet/structures/det_data_sample.py index 71244f14..d9de95e1 100644 --- a/visdet/structures/det_data_sample.py +++ b/visdet/structures/det_data_sample.py @@ -1,7 +1,6 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional + +from typing import TYPE_CHECKING, Any, overload from visdet.engine.structures import BaseDataElement, InstanceData, PixelData @@ -118,101 +117,109 @@ class DetDataSample(BaseDataElement): """ @property - def proposals(self) -> InstanceData: + def proposals(self) -> InstanceData | None: return getattr(self, "_proposals", None) @proposals.setter - def proposals(self, value: InstanceData): + def proposals(self, value: InstanceData) -> None: self.set_field(value, "_proposals", dtype=InstanceData) @proposals.deleter - def proposals(self): - del self._proposals + def proposals(self) -> None: + del self._proposals # type: ignore[has-type] @property - def gt_instances(self) -> InstanceData: + def gt_instances(self) -> InstanceData | None: return getattr(self, "_gt_instances", None) @gt_instances.setter - def gt_instances(self, value: InstanceData): + def gt_instances(self, value: InstanceData) -> None: self.set_field(value, "_gt_instances", dtype=InstanceData) @gt_instances.deleter - def gt_instances(self): - del self._gt_instances + def gt_instances(self) -> None: + del self._gt_instances # type: ignore[has-type] @property - def pred_instances(self) -> InstanceData: + def pred_instances(self) -> InstanceData | None: return getattr(self, "_pred_instances", None) @pred_instances.setter - def pred_instances(self, value: InstanceData): + def pred_instances(self, value: InstanceData) -> None: self.set_field(value, "_pred_instances", dtype=InstanceData) @pred_instances.deleter - def pred_instances(self): - del self._pred_instances + def pred_instances(self) -> None: + del self._pred_instances # type: ignore[has-type] @property - def 
ignored_instances(self) -> InstanceData: + def ignored_instances(self) -> InstanceData | None: return getattr(self, "_ignored_instances", None) @ignored_instances.setter - def ignored_instances(self, value: InstanceData): + def ignored_instances(self, value: InstanceData) -> None: self.set_field(value, "_ignored_instances", dtype=InstanceData) @ignored_instances.deleter - def ignored_instances(self): - del self._ignored_instances + def ignored_instances(self) -> None: + del self._ignored_instances # type: ignore[has-type] @property - def gt_panoptic_seg(self) -> PixelData: + def gt_panoptic_seg(self) -> PixelData | None: return getattr(self, "_gt_panoptic_seg", None) @gt_panoptic_seg.setter - def gt_panoptic_seg(self, value: PixelData): + def gt_panoptic_seg(self, value: PixelData) -> None: self.set_field(value, "_gt_panoptic_seg", dtype=PixelData) @gt_panoptic_seg.deleter - def gt_panoptic_seg(self): - del self._gt_panoptic_seg + def gt_panoptic_seg(self) -> None: + del self._gt_panoptic_seg # type: ignore[has-type] @property - def pred_panoptic_seg(self) -> PixelData: + def pred_panoptic_seg(self) -> PixelData | None: return getattr(self, "_pred_panoptic_seg", None) @pred_panoptic_seg.setter - def pred_panoptic_seg(self, value: PixelData): + def pred_panoptic_seg(self, value: PixelData) -> None: self.set_field(value, "_pred_panoptic_seg", dtype=PixelData) @pred_panoptic_seg.deleter - def pred_panoptic_seg(self): - del self._pred_panoptic_seg + def pred_panoptic_seg(self) -> None: + del self._pred_panoptic_seg # type: ignore[has-type] @property - def gt_sem_seg(self) -> PixelData: + def gt_sem_seg(self) -> PixelData | None: return getattr(self, "_gt_sem_seg", None) @gt_sem_seg.setter - def gt_sem_seg(self, value: PixelData): + def gt_sem_seg(self, value: PixelData) -> None: self.set_field(value, "_gt_sem_seg", dtype=PixelData) @gt_sem_seg.deleter - def gt_sem_seg(self): - del self._gt_sem_seg + def gt_sem_seg(self) -> None: + del self._gt_sem_seg # type: 
ignore[has-type] @property - def pred_sem_seg(self) -> PixelData: + def pred_sem_seg(self) -> PixelData | None: return getattr(self, "_pred_sem_seg", None) @pred_sem_seg.setter - def pred_sem_seg(self, value: PixelData): + def pred_sem_seg(self, value: PixelData) -> None: self.set_field(value, "_pred_sem_seg", dtype=PixelData) @pred_sem_seg.deleter - def pred_sem_seg(self): - del self._pred_sem_seg + def pred_sem_seg(self) -> None: + del self._pred_sem_seg # type: ignore[has-type] + + # Provide specific type hints for common attributes + if TYPE_CHECKING: + # These are commonly accessed metainfo attributes in visualization code + img_path: str + text: str | list[str] + tokens_positive: list[list[tuple[int, int]]] + phrase_ids: list[int] SampleList = list[DetDataSample] -OptSampleList = Optional[SampleList] +OptSampleList = SampleList | None diff --git a/visdet/structures/mask/utils.py b/visdet/structures/mask/utils.py index 75b37707..34e2c56f 100644 --- a/visdet/structures/mask/utils.py +++ b/visdet/structures/mask/utils.py @@ -1,13 +1,17 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any + import numpy as np import pycocotools.mask as mask_util import torch +from torch import Tensor + from visdet.engine.utils import slice_list -def split_combined_polys(polys, poly_lens, polys_per_mask): +def split_combined_polys( + polys: list[Tensor], poly_lens: list[Tensor], polys_per_mask: list[Tensor] +) -> list[list[list[Any]]]: """Split the combined 1-D polys into masks. A mask is represented as a list of polys, and a poly is represented as @@ -37,7 +41,7 @@ def split_combined_polys(polys, poly_lens, polys_per_mask): # TODO: move this function to more proper place -def encode_mask_results(mask_results): +def encode_mask_results(mask_results: list[np.ndarray]) -> list[dict[str, Any]]: """Encode bitmap mask to RLE code. 
Args: @@ -54,7 +58,7 @@ def encode_mask_results(mask_results): return encoded_mask_results -def mask2bbox(masks): +def mask2bbox(masks: Tensor) -> Tensor: """Obtain tight bounding boxes of binary masks. Args: diff --git a/visdet/tests/test_models/test_backbones/test_swin.py b/visdet/tests/test_models/test_backbones/test_swin.py index 881c3a4b..ecb1150c 100644 --- a/visdet/tests/test_models/test_backbones/test_swin.py +++ b/visdet/tests/test_models/test_backbones/test_swin.py @@ -1,4 +1,4 @@ -import pytest +import pytest # type: ignore[import-not-found] import torch from visdet.models.backbones.swin import ( diff --git a/visdet/tests/test_models/test_roi_heads/test_bbox_heads.py b/visdet/tests/test_models/test_roi_heads/test_bbox_heads.py index bcb34c76..80eece8d 100644 --- a/visdet/tests/test_models/test_roi_heads/test_bbox_heads.py +++ b/visdet/tests/test_models/test_roi_heads/test_bbox_heads.py @@ -1,6 +1,6 @@ """Test cases for bbox heads.""" -import pytest +import pytest # type: ignore[import-not-found] import torch from visdet.engine.config import Config diff --git a/visdet/tests/test_models/test_roi_heads/test_cascade_roi_head.py b/visdet/tests/test_models/test_roi_heads/test_cascade_roi_head.py index e06db3e9..05088ba6 100644 --- a/visdet/tests/test_models/test_roi_heads/test_cascade_roi_head.py +++ b/visdet/tests/test_models/test_roi_heads/test_cascade_roi_head.py @@ -1,6 +1,6 @@ """Tests for CascadeRoIHead to validate cascade logic and mask prediction.""" -import pytest +import pytest # type: ignore[import-not-found] import torch from visdet.engine.config import ConfigDict diff --git a/visdet/utils/setup_env.py b/visdet/utils/setup_env.py index 692e7dd0..7d589501 100644 --- a/visdet/utils/setup_env.py +++ b/visdet/utils/setup_env.py @@ -32,6 +32,7 @@ def register_all_modules(init_default_scope: bool = True) -> None: DefaultScope.get_instance("visdet", scope_name="visdet") return current_scope = DefaultScope.get_current_instance() + assert current_scope is 
not None, "DefaultScope instance should exist at this point" if current_scope.scope_name != "visdet": warnings.warn( "The current default scope " diff --git a/visdet/utils/typing_utils.py b/visdet/utils/typing_utils.py index 0214403f..18e76bec 100644 --- a/visdet/utils/typing_utils.py +++ b/visdet/utils/typing_utils.py @@ -1,28 +1,24 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. """Collecting some commonly used type hint in mmdetection.""" from collections.abc import Sequence -from typing import Optional, Union +from typing import Any, Union from visdet.engine.config import ConfigDict from visdet.engine.structures import InstanceData, PixelData # TODO: Need to avoid circular import with assigner and sampler # Type hint of config data -from typing import Dict, Any - -ConfigType = Union[ConfigDict, dict, str, Dict[str, Any]] -OptConfigType = Optional[ConfigType] +ConfigType = Union[ConfigDict, dict, str, dict[str, Any]] +OptConfigType = ConfigType | None # Type hint of one or more config data MultiConfig = Union[ConfigType, list[ConfigType]] -OptMultiConfig = Optional[MultiConfig] +OptMultiConfig = MultiConfig | None InstanceList = list[InstanceData] -OptInstanceList = Optional[InstanceList] +OptInstanceList = InstanceList | None PixelList = list[PixelData] -OptPixelList = Optional[PixelList] +OptPixelList = PixelList | None RangeType = Sequence[tuple[int, int]] diff --git a/visdet/visualization/local_visualizer.py b/visdet/visualization/local_visualizer.py index 6972676e..ec2e8877 100644 --- a/visdet/visualization/local_visualizer.py +++ b/visdet/visualization/local_visualizer.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Optional +from __future__ import annotations + +from typing import Any, Sequence, cast import cv2 import numpy as np @@ -12,9 +14,13 @@ from visdet.evaluation import INSTANCE_OFFSET from visdet.registry import VISUALIZERS from visdet.structures import DetDataSample +from visdet.structures.bbox import BaseBoxes from visdet.structures.mask import BitmapMasks, PolygonMasks, bitmap_to_polygon from visdet.visualization.palette import _get_adaptive_scales, get_palette, jitter_color +ColorTuple = tuple[int, int, int] +PaletteInput = list[ColorTuple] | ColorTuple | str | None + @VISUALIZERS.register_module() class DetLocalVisualizer(Visualizer): @@ -80,9 +86,9 @@ def __init__( image: np.ndarray | None = None, vis_backends: dict | None = None, save_dir: str | None = None, - bbox_color: str | tuple[int] | None = None, - text_color: str | tuple[int] | None = (200, 200, 200), - mask_color: str | tuple[int] | None = None, + bbox_color: str | ColorTuple | None = None, + text_color: str | ColorTuple = (200, 200, 200), + mask_color: str | ColorTuple | None = None, line_width: int | float = 3, alpha: float = 0.8, ) -> None: @@ -95,14 +101,50 @@ def __init__( # Set default value. When calling # `DetLocalVisualizer().dataset_meta=xxx`, # it will override the default value. - self.dataset_meta = {} + # Meta information attached by runner/metrics; values may vary so keep + # loose typing internally and normalize on read. 
+ self.dataset_meta: dict[str, Any] = {} + + def _meta_classes(self) -> list[str] | None: + meta = self.dataset_meta or {} + classes = meta.get("classes") + if isinstance(classes, Sequence) and all(isinstance(name, str) for name in classes): + return list(classes) + return None + + def _meta_palette(self) -> PaletteInput: + meta = self.dataset_meta or {} + palette = meta.get("palette") + return self._normalize_palette_input(palette) + + def _normalize_palette_input(self, palette: Any) -> PaletteInput: + if palette is None: + return None + if isinstance(palette, list): + normalized: list[ColorTuple] = [] + for color in palette: + if isinstance(color, tuple): + normalized.append(cast(ColorTuple, tuple(int(c) for c in color[:3]))) + elif isinstance(color, list): + normalized.append(cast(ColorTuple, tuple(int(c) for c in color[:3]))) + return normalized or None + if isinstance(palette, np.ndarray): + if palette.ndim == 2: + return [cast(ColorTuple, tuple(int(c) for c in row.tolist()[:3])) for row in palette] + if palette.ndim == 1: + return cast(ColorTuple, tuple(int(c) for c in palette.tolist()[:3])) + if isinstance(palette, tuple): + return cast(ColorTuple, tuple(int(c) for c in palette[:3])) + if isinstance(palette, str): + return palette + return None def _draw_instances( self, image: np.ndarray, - instances: ["InstanceData"], + instances: InstanceData, classes: list[str] | None, - palette: list[tuple] | None, + palette: PaletteInput, ) -> np.ndarray: """Draw instances of GT or prediction. 
@@ -119,54 +161,71 @@ def _draw_instances( """ self.set_image(image) - if "bboxes" in instances and instances.bboxes.sum() > 0: - bboxes = instances.bboxes - labels = instances.labels + if "bboxes" in instances: + bboxes_raw = instances.bboxes + # Convert BaseBoxes to tensor + if isinstance(bboxes_raw, BaseBoxes): + bboxes = bboxes_raw.tensor + else: + bboxes = bboxes_raw - max_label = int(max(labels) if len(labels) > 0 else 0) - text_palette = get_palette(self.text_color, max_label + 1) - text_colors = [text_palette[label] for label in labels] - - bbox_color = palette if self.bbox_color is None else self.bbox_color - bbox_palette = get_palette(bbox_color, max_label + 1) - colors = [bbox_palette[label] for label in labels] - self.draw_bboxes( - bboxes, - edge_colors=colors, - alpha=self.alpha, - line_widths=self.line_width, - ) + if bboxes.sum() > 0: + label_tensor = cast(torch.Tensor, instances.labels) + label_ids = label_tensor.to(dtype=torch.int64).tolist() - positions = bboxes[:, :2] + self.line_width - areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) - scales = _get_adaptive_scales(areas) + max_label = int(max(label_ids) if len(label_ids) > 0 else 0) + text_palette = get_palette(self.text_color, max_label + 1) + text_colors = [text_palette[label] for label in label_ids] - for i, (pos, label) in enumerate(zip(positions, labels)): - if "label_names" in instances: - label_text = instances.label_names[i] + if self.bbox_color is None: + bbox_color: PaletteInput = palette else: - label_text = classes[label] if classes is not None else f"class {label}" - if "scores" in instances: - score = round(float(instances.scores[i]) * 100, 1) - label_text += f": {score}" - - self.draw_texts( - label_text, - pos, - colors=text_colors[i], - font_sizes=int(13 * scales[i]), - bboxes=[ - { - "facecolor": "black", - "alpha": 0.8, - "pad": 0.7, - "edgecolor": "none", - } - ], + bbox_color = cast(PaletteInput, self.bbox_color) + bbox_palette = 
get_palette(bbox_color, max_label + 1) + colors = [bbox_palette[label] for label in label_ids] + self.draw_bboxes( + bboxes, + edge_colors=colors, + alpha=self.alpha, + line_widths=self.line_width, ) + positions = bboxes[:, :2] + self.line_width + if isinstance(positions, torch.Tensor): + positions = positions.cpu().numpy() + areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) + # Convert to numpy if it's a tensor + if isinstance(areas, torch.Tensor): + areas = areas.cpu().numpy() + scales = _get_adaptive_scales(areas) + + for i, (pos, label) in enumerate(zip(positions, label_ids)): + if "label_names" in instances: + label_text = str(instances.label_names[i]) + else: + label_text = classes[label] if classes is not None else f"class {label}" + if "scores" in instances: + score = round(float(instances.scores[i]) * 100, 1) + label_text += f": {score}" + + self.draw_texts( + label_text, + pos, + colors=text_colors[i], + font_sizes=int(13 * scales[i]), + bboxes=[ + { + "facecolor": "black", + "alpha": 0.8, + "pad": 0.7, + "edgecolor": "none", + } + ], + ) + if "masks" in instances: - labels = instances.labels + label_tensor = cast(torch.Tensor, instances.labels) + label_ids = label_tensor.to(dtype=torch.int64).tolist() masks = instances.masks if isinstance(masks, torch.Tensor): masks = masks.numpy() @@ -182,14 +241,17 @@ def _draw_instances( logger.debug(f"Image shape: {image.shape}") logger.debug(f"Masks shape: {masks.shape}") logger.debug(f"Masks dtype: {masks.dtype}") - logger.debug(f"Number of instances: {len(labels)}") + logger.debug(f"Number of instances: {len(label_ids)}") - max_label = int(max(labels) if len(labels) > 0 else 0) - mask_color = palette if self.mask_color is None else self.mask_color + max_label = int(max(label_ids) if len(label_ids) > 0 else 0) + if self.mask_color is None: + mask_color: PaletteInput = palette + else: + mask_color = cast(PaletteInput, self.mask_color) mask_palette = get_palette(mask_color, max_label + 1) - colors 
= [jitter_color(mask_palette[label]) for label in labels] + colors = [jitter_color(mask_palette[label]) for label in label_ids] text_palette = get_palette(self.text_color, max_label + 1) - text_colors = [text_palette[label] for label in labels] + text_colors = [text_palette[label] for label in label_ids] polygons = [] for i, mask in enumerate(masks): @@ -198,7 +260,16 @@ def _draw_instances( self.draw_polygons(polygons, edge_colors="w", alpha=self.alpha) self.draw_binary_masks(masks, colors=colors, alphas=self.alpha) - if len(labels) > 0 and ("bboxes" not in instances or instances.bboxes.sum() == 0): + # Check if we need to draw text labels for masks + has_valid_bboxes = False + if "bboxes" in instances: + bboxes_raw = instances.bboxes + if isinstance(bboxes_raw, BaseBoxes): + has_valid_bboxes = bboxes_raw.tensor.sum() > 0 + else: + has_valid_bboxes = bboxes_raw.sum() > 0 + + if len(label_ids) > 0 and not has_valid_bboxes: # instances.bboxes.sum()==0 represent dummy bboxes. # A typical example of SOLO does not exist bbox branch. areas = [] @@ -212,7 +283,7 @@ def _draw_instances( areas = np.stack(areas, axis=0) scales = _get_adaptive_scales(areas) - for i, (pos, label) in enumerate(zip(positions, labels)): + for i, (pos, label) in enumerate(zip(positions, label_ids)): if "label_names" in instances: label_text = instances.label_names[i] else: @@ -241,9 +312,9 @@ def _draw_instances( def _draw_panoptic_seg( self, image: np.ndarray, - panoptic_seg: ["PixelData"], + panoptic_seg: PixelData, classes: list[str] | None, - palette: list | None, + palette: PaletteInput, ) -> np.ndarray: """Draw panoptic seg of GT or prediction. @@ -257,7 +328,10 @@ def _draw_panoptic_seg( np.ndarray: the drawn image which channel is RGB. """ # TODO: Is there a way to bypass? 
- num_classes = len(classes) + if classes is None: + raise ValueError("classes should not be None when drawing panoptic segmentation") + class_list = list(classes) + num_classes = len(class_list) panoptic_seg_data = panoptic_seg.sem_seg[0] @@ -265,8 +339,8 @@ def _draw_panoptic_seg( if "label_names" in panoptic_seg: # open set panoptic segmentation - classes = panoptic_seg.metainfo["label_names"] - ignore_index = panoptic_seg.metainfo.get("ignore_index", len(classes)) + class_list = list(panoptic_seg.metainfo["label_names"]) + ignore_index = panoptic_seg.metainfo.get("ignore_index", len(class_list)) ids = ids[ids != ignore_index] else: # for VOID label @@ -274,11 +348,15 @@ def _draw_panoptic_seg( labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64) segms = panoptic_seg_data[None] == ids[:, None, None] + segms = np.asarray(segms, dtype=bool) max_label = int(max(labels) if len(labels) > 0 else 0) - mask_color = palette if self.mask_color is None else self.mask_color - mask_palette = get_palette(mask_color, max_label + 1) + if palette is not None: + mask_color_input: PaletteInput = palette + else: + mask_color_input = cast(PaletteInput, self.mask_color) + mask_palette = get_palette(mask_color_input, max_label + 1) colors = [mask_palette[label] for label in labels] self.set_image(image) @@ -306,7 +384,7 @@ def _draw_panoptic_seg( text_colors = [text_palette[label] for label in labels] for i, (pos, label) in enumerate(zip(positions, labels)): - label_text = classes[label] + label_text = class_list[label] if label < len(class_list) else f"class {label}" self.draw_texts( label_text, @@ -329,8 +407,8 @@ def _draw_sem_seg( self, image: np.ndarray, sem_seg: PixelData, - classes: list | None, - palette: list | None, + classes: list[str] | None, + palette: PaletteInput, ) -> np.ndarray: """Draw semantic seg of GT or prediction. 
@@ -362,20 +440,26 @@ def _draw_sem_seg( if "label_names" in sem_seg: # open set semseg - label_names = sem_seg.metainfo["label_names"] + label_names_seq = sem_seg.metainfo["label_names"] + label_names = list(label_names_seq) else: + if classes is None: + raise ValueError("label_names should not be None") label_names = classes + palette_source: PaletteInput = palette if palette is not None else self.mask_color + palette_list = get_palette(palette_source, len(label_names)) + labels = np.array(ids, dtype=np.int64) - colors = [palette[label] for label in labels] + colors = [palette_list[label] for label in labels] self.set_image(image) # draw semantic masks for i, (label, color) in enumerate(zip(labels, colors)): - masks = sem_seg_data == label + masks = (sem_seg_data == label).astype(bool) self.draw_binary_masks(masks, colors=[color], alphas=self.alpha) - label_text = label_names[label] + label_text = label_names[label] if label < len(label_names) else f"class {label}" _, _, stats, centroids = cv2.connectedComponentsWithStats(masks[0].astype(np.uint8), connectivity=8) if stats.shape[0] > 1: largest_id = np.argmax(stats[1:, -1]) + 1 @@ -407,7 +491,7 @@ def add_datasample( self, name: str, image: np.ndarray, - data_sample: Optional["DetDataSample"] = None, + data_sample: DetDataSample | None = None, draw_gt: bool = True, draw_pred: bool = True, show: bool = False, @@ -445,8 +529,8 @@ def add_datasample( step (int): Global step value to record. Defaults to 0. 
""" image = image.clip(0, 255).astype(np.uint8) - classes = self.dataset_meta.get("classes", None) - palette = self.dataset_meta.get("palette", None) + classes = self._meta_classes() + palette = self._meta_palette() gt_img_data = None pred_img_data = None @@ -456,12 +540,12 @@ def add_datasample( if draw_gt and data_sample is not None: gt_img_data = image - if "gt_instances" in data_sample: + if "gt_instances" in data_sample and data_sample.gt_instances is not None: gt_img_data = self._draw_instances(image, data_sample.gt_instances, classes, palette) - if "gt_sem_seg" in data_sample: + if "gt_sem_seg" in data_sample and data_sample.gt_sem_seg is not None: gt_img_data = self._draw_sem_seg(gt_img_data, data_sample.gt_sem_seg, classes, palette) - if "gt_panoptic_seg" in data_sample: + if "gt_panoptic_seg" in data_sample and data_sample.gt_panoptic_seg is not None: assert classes is not None, ( "class information is not provided when visualizing panoptic segmentation results." ) @@ -469,15 +553,15 @@ def add_datasample( if draw_pred and data_sample is not None: pred_img_data = image - if "pred_instances" in data_sample: + if "pred_instances" in data_sample and data_sample.pred_instances is not None: pred_instances = data_sample.pred_instances pred_instances = pred_instances[pred_instances.scores > pred_score_thr] pred_img_data = self._draw_instances(image, pred_instances, classes, palette) - if "pred_sem_seg" in data_sample: + if "pred_sem_seg" in data_sample and data_sample.pred_sem_seg is not None: pred_img_data = self._draw_sem_seg(pred_img_data, data_sample.pred_sem_seg, classes, palette) - if "pred_panoptic_seg" in data_sample: + if "pred_panoptic_seg" in data_sample and data_sample.pred_panoptic_seg is not None: assert classes is not None, ( "class information is not provided when visualizing panoptic segmentation results." 
) diff --git a/visdet/visualization/palette.py b/visdet/visualization/palette.py index 75d2a677..8f506378 100644 --- a/visdet/visualization/palette.py +++ b/visdet/visualization/palette.py @@ -1,11 +1,32 @@ # ruff: noqa # Copyright (c) OpenMMLab. All rights reserved. +from __future__ import annotations + +from collections.abc import Sequence +from typing import Any, cast + import numpy as np -from visdet.engine.utils import is_str + +ColorTuple = tuple[int, int, int] + + +def _as_color_tuple(color: Any) -> ColorTuple | None: + if isinstance(color, np.ndarray): + values = color.tolist() + ints = [int(c) for c in values[:3]] + elif isinstance(color, (list, tuple)): + ints = [int(c) for c in list(color)[:3]] + else: + return None + if not ints: + return None + while len(ints) < 3: + ints.append(0) + return cast(ColorTuple, tuple(ints[:3])) -def palette_val(palette: list[tuple]) -> list[tuple]: +def palette_val(palette: Sequence[Sequence[int]] | Sequence[int]) -> list[ColorTuple]: """Convert palette to matplotlib palette. Args: @@ -16,35 +37,63 @@ def palette_val(palette: list[tuple]) -> list[tuple]: """ new_palette = [] for color in palette: - color = [c / 255 for c in color] + color_tuple = _as_color_tuple(color) + if color_tuple is None: + continue + color = [c / 255 for c in color_tuple] new_palette.append(tuple(color)) return new_palette -def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tuple[int]]: +def get_palette( + palette: Sequence[Sequence[int]] | Sequence[int] | np.ndarray | str | ColorTuple | None, + num_classes: int, +) -> list[ColorTuple]: """Get palette from various inputs. Args: - palette (list[tuple] | str | tuple): palette inputs. + palette (list[tuple] | str | tuple | None): palette inputs. num_classes (int): the number of classes. Returns: - list[tuple[int]]: A list of color tuples. + list[tuple[int, ...]]: A list of color tuples. 
""" assert isinstance(num_classes, int) + dataset_palette: list[ColorTuple] | None = None if isinstance(palette, list): - dataset_palette = palette - elif isinstance(palette, tuple): - dataset_palette = [palette] * num_classes - elif palette == "random" or palette is None: + colors: list[ColorTuple] = [] + for color in palette: + color_tuple = _as_color_tuple(color) + if color_tuple is not None: + colors.append(color_tuple) + if colors: + dataset_palette = colors + elif isinstance(palette, np.ndarray): + if palette.ndim == 1: + color_tuple = _as_color_tuple(palette) + if color_tuple is not None: + dataset_palette = [color_tuple] * num_classes + else: + colors = [] + for row in palette: + color_tuple = _as_color_tuple(row) + if color_tuple is not None: + colors.append(color_tuple) + if colors: + dataset_palette = colors + else: + color_tuple = _as_color_tuple(palette) + if color_tuple is not None: + dataset_palette = [color_tuple] * num_classes + if dataset_palette is None and (palette == "random" or palette is None): state = np.random.get_state() # random color np.random.seed(42) palette = np.random.randint(0, 256, size=(num_classes, 3)) np.random.set_state(state) - dataset_palette = [tuple(c) for c in palette] - elif palette == "coco": + dataset_palette = [cast(ColorTuple, tuple(int(x) for x in c[:3])) for c in palette] + elif dataset_palette is None and palette == "coco": # For now, we'll use a predefined COCO palette # This avoids circular imports from datasets coco_palette = [ @@ -134,8 +183,8 @@ def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tu # Generate additional colors if needed np.random.seed(42) extra_colors = np.random.randint(0, 256, size=(num_classes - len(dataset_palette), 3)) - dataset_palette.extend([tuple(c) for c in extra_colors]) - elif palette == "citys": + dataset_palette.extend([cast(ColorTuple, tuple(int(x) for x in c[:3])) for c in extra_colors]) + elif dataset_palette is None and palette == "citys": # 
Cityscapes palette - simplified version citys_palette = [ (128, 64, 128), @@ -162,8 +211,8 @@ def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tu if len(dataset_palette) < num_classes: np.random.seed(42) extra_colors = np.random.randint(0, 256, size=(num_classes - len(dataset_palette), 3)) - dataset_palette.extend([tuple(c) for c in extra_colors]) - elif palette == "voc": + dataset_palette.extend([cast(ColorTuple, tuple(int(x) for x in c[:3])) for c in extra_colors]) + elif dataset_palette is None and palette == "voc": # VOC palette voc_palette = [ (0, 0, 0), @@ -192,8 +241,8 @@ def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tu if len(dataset_palette) < num_classes: np.random.seed(42) extra_colors = np.random.randint(0, 256, size=(num_classes - len(dataset_palette), 3)) - dataset_palette.extend([tuple(c) for c in extra_colors]) - elif is_str(palette): + dataset_palette.extend([cast(ColorTuple, tuple(int(x) for x in c[:3])) for c in extra_colors]) + elif dataset_palette is None and isinstance(palette, str): # Convert color string to RGB tuple # Simple color name to RGB mapping color_map = { @@ -208,14 +257,17 @@ def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tu } rgb = color_map.get(palette.lower(), (128, 128, 128)) dataset_palette = [rgb] * num_classes - else: + if dataset_palette is None: raise TypeError(f"Invalid type for palette: {type(palette)}") - assert len(dataset_palette) >= num_classes, "The length of palette should not be less than `num_classes`." 
-    return dataset_palette
+    if len(dataset_palette) < num_classes:
+        last_color = dataset_palette[-1] if dataset_palette else (0, 0, 0)
+        dataset_palette = dataset_palette + [last_color] * (num_classes - len(dataset_palette))
+    return dataset_palette[:num_classes]
 
 
-def _get_adaptive_scales(areas: np.ndarray, min_area: int = 800, max_area: int = 30000) -> np.ndarray:
+
+def _get_adaptive_scales(areas: np.ndarray | float, min_area: int = 800, max_area: int = 30000) -> np.ndarray:
     """Get adaptive scales according to areas.
 
     The scale range is [0.5, 1.0]. When the area is less than
@@ -223,19 +275,24 @@
     ``max_area``, the scale is 1.0.
 
     Args:
-        areas (ndarray): The areas of bboxes or masks with the
-            shape of (n, ).
+        areas (ndarray | float): The areas of bboxes or masks with the
+            shape of (n, ) or a single float value.
         min_area (int): Lower bound areas for adaptive scales.
             Defaults to 800.
         max_area (int): Upper bound areas for adaptive scales.
             Defaults to 30000.
 
     Returns:
-        ndarray: The adaotive scales with the shape of (n, ).
+        ndarray: The adaptive scales with the shape of (n, ) or (1,).
""" - scales = 0.5 + (areas - min_area) // (max_area - min_area) - scales = np.clip(scales, 0.5, 1.0) - return scales + if isinstance(areas, np.ndarray): + scales = 0.5 + (areas - min_area) // (max_area - min_area) + scales = np.clip(scales, 0.5, 1.0) + return scales + else: + # Handle scalar case - convert to array + scale = 0.5 + (areas - min_area) / (max_area - min_area) + return np.array([np.clip(scale, 0.5, 1.0)]) def jitter_color(color: tuple) -> tuple: @@ -250,5 +307,5 @@ def jitter_color(color: tuple) -> tuple: """ jitter = np.random.rand(3) jitter = (jitter / np.linalg.norm(jitter) - 0.5) * 0.5 * 255 - color = np.clip(jitter + color, 0, 255).astype(np.uint8) - return tuple(color) + clipped = np.clip(jitter + color, 0, 255).astype(np.uint8) + return cast(ColorTuple, tuple(int(c) for c in clipped.tolist()[:3]))