diff --git a/visdet/cv/cnn/bricks/activation.py b/visdet/cv/cnn/bricks/activation.py index 0640af16..9a250e2a 100644 --- a/visdet/cv/cnn/bricks/activation.py +++ b/visdet/cv/cnn/bricks/activation.py @@ -45,12 +45,15 @@ class Clamp(nn.Module): Default to 1. """ + min: float + max: float + def __init__(self, min: float = -1.0, max: float = 1.0): super().__init__() - self.min = min - self.max = max + object.__setattr__(self, "min", min) + object.__setattr__(self, "max", max) - def forward(self, x) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward function. Args: diff --git a/visdet/cv/cnn/bricks/conv_module.py b/visdet/cv/cnn/bricks/conv_module.py index ae7ba541..8b64a540 100644 --- a/visdet/cv/cnn/bricks/conv_module.py +++ b/visdet/cv/cnn/bricks/conv_module.py @@ -1,10 +1,15 @@ +from __future__ import annotations + # Copyright (c) OpenMMLab. All rights reserved. import warnings from functools import partial +from typing import TYPE_CHECKING, Any, Callable, cast import torch import torch.nn as nn +from torch import Tensor from torch.nn.modules.batchnorm import _BatchNorm +from torch.nn.modules.conv import _ConvNd from torch.nn.modules.instancenorm import _InstanceNorm from visdet.cv.cnn.bricks.activation import build_activation_layer @@ -14,8 +19,10 @@ from visdet.engine.model import constant_init, kaiming_init from visdet.engine.registry import MODELS +EfficientConvBnEvalForward = Callable[[_BatchNorm, _ConvNd, Tensor], Tensor] + -def efficient_conv_bn_eval_forward(bn: _BatchNorm, conv: nn.modules.conv._ConvNd, x: torch.Tensor): +def efficient_conv_bn_eval_forward(bn: _BatchNorm, conv: _ConvNd, x: Tensor) -> Tensor: """ Implementation based on https://arxiv.org/abs/2305.11624 "Tune-Mode ConvBN Blocks For Efficient Transfer Learning" @@ -31,31 +38,37 @@ def efficient_conv_bn_eval_forward(bn: _BatchNorm, conv: nn.modules.conv._ConvNd """ # These lines of code are designed to deal with various cases # like bn without affine transform, and 
conv without bias + running_var = bn.running_var + running_mean = bn.running_mean + if running_var is None or running_mean is None: + msg = "BatchNorm running stats must exist when efficient_conv_bn_eval_forward is enabled" + raise RuntimeError(msg) + weight_on_the_fly = conv.weight if conv.bias is not None: bias_on_the_fly = conv.bias else: - bias_on_the_fly = torch.zeros_like(bn.running_var) + bias_on_the_fly = torch.zeros_like(running_var) if bn.weight is not None: bn_weight = bn.weight else: - bn_weight = torch.ones_like(bn.running_var) + bn_weight = torch.ones_like(running_var) if bn.bias is not None: bn_bias = bn.bias else: - bn_bias = torch.zeros_like(bn.running_var) + bn_bias = torch.zeros_like(running_var) # shape of [C_out, 1, 1, 1] in Conv2d - weight_coeff = torch.rsqrt(bn.running_var + bn.eps).reshape([-1] + [1] * (len(conv.weight.shape) - 1)) + weight_coeff = torch.rsqrt(running_var + bn.eps).reshape([-1] + [1] * (len(conv.weight.shape) - 1)) # shape of [C_out, 1, 1, 1] in Conv2d coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff # shape of [C_out, C_in, k, k] in Conv2d weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly # shape of [C_out] in Conv2d - bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() * (bias_on_the_fly - bn.running_mean) + bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() * (bias_on_the_fly - running_mean) return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly) @@ -117,6 +130,15 @@ class ConvModule(nn.Module): """ _abbr_ = "conv_block" + conv_cfg: dict[str, Any] | None + norm_cfg: dict[str, Any] | None + act_cfg: dict[str, Any] | None + order: tuple[str, str, str] + padding_layer: nn.Module | None + activate: nn.Module | None + efficient_conv_bn_eval_forward: EfficientConvBnEvalForward | None + norm_name: str | None + conv: _ConvNd def __init__( self, @@ -142,22 +164,26 @@ def __init__( assert norm_cfg is None or isinstance(norm_cfg, dict) assert act_cfg is None or isinstance(act_cfg, dict) 
official_padding_mode = ["zeros", "circular"] - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.inplace = inplace - self.with_spectral_norm = with_spectral_norm - self.with_explicit_padding = padding_mode not in official_padding_mode - self.order = order + # Store config dicts as attributes - these are simple data, not tensors + object.__setattr__(self, "conv_cfg", conv_cfg) + object.__setattr__(self, "norm_cfg", norm_cfg) + object.__setattr__(self, "act_cfg", act_cfg) + object.__setattr__(self, "inplace", inplace) + object.__setattr__(self, "with_spectral_norm", with_spectral_norm) + object.__setattr__(self, "with_explicit_padding", padding_mode not in official_padding_mode) + object.__setattr__(self, "order", order) assert isinstance(self.order, tuple) and len(self.order) == 3 assert set(order) == {"conv", "norm", "act"} - self.with_norm = norm_cfg is not None - self.with_activation = act_cfg is not None + object.__setattr__(self, "with_norm", norm_cfg is not None) + object.__setattr__(self, "with_activation", act_cfg is not None) + self.padding_layer: nn.Module | None = None + self.activate: nn.Module | None = None + object.__setattr__(self, "efficient_conv_bn_eval_forward", None) # if the conv layer is before a norm layer, bias is unnecessary. 
if bias == "auto": bias = not self.with_norm - self.with_bias = bias + object.__setattr__(self, "with_bias", bias) if self.with_explicit_padding: pad_cfg = dict(type=padding_mode) @@ -166,7 +192,7 @@ def __init__( # reset padding to 0 for conv module conv_padding = 0 if self.with_explicit_padding else padding # build convolution layer - self.conv = build_conv_layer( + conv_layer = build_conv_layer( conv_cfg, in_channels, out_channels, @@ -177,16 +203,17 @@ def __init__( groups=groups, bias=bias, ) + self.conv = cast(_ConvNd, conv_layer) # export the attributes of self.conv to a higher level for convenience - self.in_channels = self.conv.in_channels - self.out_channels = self.conv.out_channels - self.kernel_size = self.conv.kernel_size - self.stride = self.conv.stride - self.padding = padding - self.dilation = self.conv.dilation - self.transposed = self.conv.transposed - self.output_padding = self.conv.output_padding - self.groups = self.conv.groups + object.__setattr__(self, "in_channels", self.conv.in_channels) + object.__setattr__(self, "out_channels", self.conv.out_channels) + object.__setattr__(self, "kernel_size", self.conv.kernel_size) + object.__setattr__(self, "stride", self.conv.stride) + object.__setattr__(self, "padding", padding) + object.__setattr__(self, "dilation", self.conv.dilation) + object.__setattr__(self, "transposed", self.conv.transposed) + object.__setattr__(self, "output_padding", self.conv.output_padding) + object.__setattr__(self, "groups", self.conv.groups) if self.with_spectral_norm: self.conv = nn.utils.spectral_norm(self.conv) @@ -198,19 +225,22 @@ def __init__( norm_channels = out_channels else: norm_channels = in_channels - self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) # type: ignore - self.add_module(self.norm_name, norm) + assert norm_cfg is not None + norm_name, norm = build_norm_layer(norm_cfg, norm_channels) + object.__setattr__(self, "norm_name", norm_name) + self.add_module(norm_name, norm) if 
self.with_bias: if isinstance(norm, (_BatchNorm, _InstanceNorm)): warnings.warn("Unnecessary conv bias before batch/instance norm") else: - self.norm_name = None # type: ignore + object.__setattr__(self, "norm_name", None) self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) # build activation layer if self.with_activation: - act_cfg_ = act_cfg.copy() # type: ignore + assert act_cfg is not None + act_cfg_ = cast(dict[str, Any], act_cfg.copy()) # nn.Tanh has no 'inplace' argument if act_cfg_["type"] not in [ "Tanh", @@ -227,7 +257,7 @@ def __init__( self.init_weights() @property - def norm(self): + def norm(self) -> nn.Module | None: if self.norm_name: return getattr(self, self.norm_name) else: @@ -244,7 +274,7 @@ def init_weights(self): # Note: For PyTorch's conv layers, they will be overwritten by our # initialization implementation using default ``kaiming_init``. if not hasattr(self.conv, "init_weights"): - if self.with_activation and self.act_cfg["type"] == "LeakyReLU": + if self.with_activation and self.act_cfg is not None and self.act_cfg["type"] == "LeakyReLU": nonlinearity = "leaky_relu" a = self.act_cfg.get("negative_slope", 0.01) else: @@ -252,7 +282,9 @@ def init_weights(self): a = 0 kaiming_init(self.conv, a=a, nonlinearity=nonlinearity) if self.with_norm: - constant_init(self.norm, 1, bias=0) + norm_layer = self.norm + if norm_layer is not None: + constant_init(norm_layer, 1, bias=0) def forward(self, x: torch.Tensor, activate: bool = True, norm: bool = True) -> torch.Tensor: layer_index = 0 @@ -260,78 +292,95 @@ def forward(self, x: torch.Tensor, activate: bool = True, norm: bool = True) -> layer = self.order[layer_index] if layer == "conv": if self.with_explicit_padding: + if self.padding_layer is None: + raise RuntimeError("Padding layer is not initialized") x = self.padding_layer(x) # if the next operation is norm and we have a norm layer in # eval mode and we have enabled `efficient_conv_bn_eval` for # the conv operator, then activate the 
optimized forward and # skip the next norm operator since it has been fused + norm_layer = self.norm if ( layer_index + 1 < len(self.order) and self.order[layer_index + 1] == "norm" and norm and self.with_norm - and not self.norm.training + and norm_layer is not None + and not norm_layer.training and self.efficient_conv_bn_eval_forward is not None ): - self.conv.forward = partial(self.efficient_conv_bn_eval_forward, self.norm, self.conv) + bn_module = cast(_BatchNorm, norm_layer) + self.conv.forward = partial(self.efficient_conv_bn_eval_forward, bn_module, self.conv) # type: ignore[method-assign] layer_index += 1 x = self.conv(x) - del self.conv.forward + del self.conv.forward # type: ignore[attr-defined] else: x = self.conv(x) elif layer == "norm" and norm and self.with_norm: - x = self.norm(x) + norm_layer = self.norm + if norm_layer is None: + raise RuntimeError("Norm layer not initialized") + x = norm_layer(x) elif layer == "act" and activate and self.with_activation: + if self.activate is None: + raise RuntimeError("Activation layer not initialized") x = self.activate(x) layer_index += 1 return x - def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True): + def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval: bool = True) -> None: # efficient_conv_bn_eval works for conv + bn # with `track_running_stats` option - if efficient_conv_bn_eval and self.norm and isinstance(self.norm, _BatchNorm) and self.norm.track_running_stats: - self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward + norm_layer = self.norm + if ( + efficient_conv_bn_eval + and norm_layer is not None + and isinstance(norm_layer, _BatchNorm) + and norm_layer.track_running_stats + ): + object.__setattr__(self, "efficient_conv_bn_eval_forward", efficient_conv_bn_eval_forward) else: - self.efficient_conv_bn_eval_forward = None # type: ignore + object.__setattr__(self, "efficient_conv_bn_eval_forward", None) @staticmethod def create_from_conv_bn( - conv: 
torch.nn.modules.conv._ConvNd, - bn: torch.nn.modules.batchnorm._BatchNorm, - efficient_conv_bn_eval=True, + conv: _ConvNd, + bn: _BatchNorm, + efficient_conv_bn_eval: bool = True, ) -> "ConvModule": """Create a ConvModule from a conv and a bn module.""" self = ConvModule.__new__(ConvModule) super(ConvModule, self).__init__() - self.conv_cfg = None - self.norm_cfg = None - self.act_cfg = None - self.inplace = False - self.with_spectral_norm = False - self.with_explicit_padding = False - self.order = ("conv", "norm", "act") + object.__setattr__(self, "conv_cfg", None) + object.__setattr__(self, "norm_cfg", None) + object.__setattr__(self, "act_cfg", None) + object.__setattr__(self, "inplace", False) + object.__setattr__(self, "with_spectral_norm", False) + object.__setattr__(self, "with_explicit_padding", False) + object.__setattr__(self, "order", ("conv", "norm", "act")) - self.with_norm = True - self.with_activation = False - self.with_bias = conv.bias is not None + object.__setattr__(self, "with_norm", True) + object.__setattr__(self, "with_activation", False) + object.__setattr__(self, "with_bias", conv.bias is not None) # build convolution layer self.conv = conv # export the attributes of self.conv to a higher level for convenience - self.in_channels = self.conv.in_channels - self.out_channels = self.conv.out_channels - self.kernel_size = self.conv.kernel_size - self.stride = self.conv.stride - self.padding = self.conv.padding - self.dilation = self.conv.dilation - self.transposed = self.conv.transposed - self.output_padding = self.conv.output_padding - self.groups = self.conv.groups + object.__setattr__(self, "in_channels", self.conv.in_channels) + object.__setattr__(self, "out_channels", self.conv.out_channels) + object.__setattr__(self, "kernel_size", self.conv.kernel_size) + object.__setattr__(self, "stride", self.conv.stride) + object.__setattr__(self, "padding", self.conv.padding) + object.__setattr__(self, "dilation", self.conv.dilation) + 
object.__setattr__(self, "transposed", self.conv.transposed) + object.__setattr__(self, "output_padding", self.conv.output_padding) + object.__setattr__(self, "groups", self.conv.groups) # build normalization layers - self.norm_name, norm = "bn", bn - self.add_module(self.norm_name, norm) + norm_name: str = "bn" + object.__setattr__(self, "norm_name", norm_name) + self.add_module(norm_name, bn) self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) diff --git a/visdet/cv/cnn/bricks/drop.py b/visdet/cv/cnn/bricks/drop.py index 7938aa22..b9b7642e 100644 --- a/visdet/cv/cnn/bricks/drop.py +++ b/visdet/cv/cnn/bricks/drop.py @@ -37,9 +37,11 @@ class DropPath(nn.Module): drop_prob (float): Probability of the path to be zeroed. Default: 0.1 """ + drop_prob: float + def __init__(self, drop_prob: float = 0.1): super().__init__() - self.drop_prob = drop_prob + object.__setattr__(self, "drop_prob", drop_prob) def forward(self, x: torch.Tensor) -> torch.Tensor: return drop_path(x, self.drop_prob, self.training) @@ -61,10 +63,14 @@ def __init__(self, drop_prob: float = 0.5, inplace: bool = False): super().__init__(p=drop_prob, inplace=inplace) -def build_dropout(cfg: dict | float | None, default_args: dict | None = None) -> Any: +def build_dropout(cfg: dict[str, Any] | float | None, default_args: dict | None = None) -> Any: """Builder for drop out layers.""" if cfg is None: return None if isinstance(cfg, float): - cfg = dict(type="Dropout", drop_prob=cfg) - return MODELS.build(cfg, default_args=default_args) + cfg_dict: dict[str, Any] = dict(type="Dropout", drop_prob=cfg) + else: + if not isinstance(cfg, dict): + raise TypeError(f"cfg must be dict or float, but got {type(cfg)!r}") + cfg_dict = cfg + return MODELS.build(cfg_dict, default_args=default_args) diff --git a/visdet/cv/cnn/bricks/hsigmoid.py b/visdet/cv/cnn/bricks/hsigmoid.py index 949a4cfc..cef6e1ed 100644 --- a/visdet/cv/cnn/bricks/hsigmoid.py +++ b/visdet/cv/cnn/bricks/hsigmoid.py @@ -27,6 +27,11 @@ class 
HSigmoid(nn.Module): Tensor: The output tensor. """ + bias: float + divisor: float + min_value: float + max_value: float + def __init__( self, bias: float = 3.0, @@ -43,11 +48,11 @@ def __init__( "Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).", stacklevel=2, ) - self.bias = bias - self.divisor = divisor + object.__setattr__(self, "bias", bias) + object.__setattr__(self, "divisor", divisor) assert self.divisor != 0 - self.min_value = min_value - self.max_value = max_value + object.__setattr__(self, "min_value", min_value) + object.__setattr__(self, "max_value", max_value) def forward(self, x: torch.Tensor) -> torch.Tensor: x = (x + self.bias) / self.divisor diff --git a/visdet/cv/cnn/bricks/scale.py b/visdet/cv/cnn/bricks/scale.py index e708786c..c805f3bb 100644 --- a/visdet/cv/cnn/bricks/scale.py +++ b/visdet/cv/cnn/bricks/scale.py @@ -34,6 +34,9 @@ class LayerScale(nn.Module): scale (float): Initial value of scale factor. Default: 1.0 """ + inplace: bool + data_format: str + def __init__( self, dim: int, @@ -45,8 +48,8 @@ def __init__( assert data_format in ("channels_last", "channels_first"), ( "'data_format' could only be channels_last or channels_first." ) - self.inplace = inplace - self.data_format = data_format + object.__setattr__(self, "inplace", inplace) + object.__setattr__(self, "data_format", data_format) self.weight = nn.Parameter(torch.ones(dim) * scale) def forward(self, x) -> torch.Tensor: diff --git a/visdet/cv/cnn/bricks/transformer.py b/visdet/cv/cnn/bricks/transformer.py index 3de2b43f..178e5166 100644 --- a/visdet/cv/cnn/bricks/transformer.py +++ b/visdet/cv/cnn/bricks/transformer.py @@ -1,8 +1,11 @@ +from __future__ import annotations + # Copyright (c) OpenMMLab. All rights reserved. 
import copy import math import warnings from collections.abc import Sequence +from typing import Any, Iterable, cast import torch import torch.nn as nn @@ -20,6 +23,10 @@ from visdet.engine.utils import deprecated_api_warning, to_2tuple +def _tuple2(value: int | tuple[int, int] | Iterable[int]) -> tuple[int, int]: + return cast(tuple[int, int], to_2tuple(value)) + + def build_positional_encoding(cfg, default_args=None): """Builder for Position Encoding.""" return MODELS.build(cfg, default_args=default_args) @@ -80,18 +87,24 @@ class AdaptivePadding(nn.Module): >>> assert (out.shape[2], out.shape[3]) == (16, 32) """ - def __init__(self, kernel_size=1, stride=1, dilation=1, padding="corner"): + def __init__( + self, + kernel_size: int | tuple[int, int] = 1, + stride: int | tuple[int, int] = 1, + dilation: int | tuple[int, int] = 1, + padding: str = "corner", + ): super().__init__() assert padding in ("same", "corner") - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) + kernel_tuple = _tuple2(kernel_size) + stride_tuple = _tuple2(stride) + dilation_tuple = _tuple2(dilation) - self.padding = padding - self.kernel_size = kernel_size - self.stride = stride - self.dilation = dilation + object.__setattr__(self, "padding", padding) + object.__setattr__(self, "kernel_size", kernel_tuple) + object.__setattr__(self, "stride", stride_tuple) + object.__setattr__(self, "dilation", dilation_tuple) def get_pad_shape(self, input_shape): """Calculate the padding size of input. 
@@ -104,16 +117,17 @@ def get_pad_shape(self, input_shape): original H and W directions """ input_h, input_w = input_shape - kernel_h, kernel_w = self.kernel_size - stride_h, stride_w = self.stride + kernel_h, kernel_w = cast(tuple[int, int], self.kernel_size) + stride_h, stride_w = cast(tuple[int, int], self.stride) + dilation = cast(tuple[int, int], self.dilation) output_h = math.ceil(input_h / stride_h) output_w = math.ceil(input_w / stride_w) pad_h = max( - (output_h - 1) * stride_h + (kernel_h - 1) * self.dilation[0] + 1 - input_h, + (output_h - 1) * stride_h + (kernel_h - 1) * dilation[0] + 1 - input_h, 0, ) pad_w = max( - (output_w - 1) * stride_w + (kernel_w - 1) * self.dilation[1] + 1 - input_w, + (output_w - 1) * stride_w + (kernel_w - 1) * dilation[1] + 1 - input_w, 0, ) return pad_h, pad_w @@ -166,77 +180,81 @@ class PatchEmbed(BaseModule): def __init__( self, - in_channels=3, - embed_dims=768, - conv_type="Conv2d", - kernel_size=16, - stride=16, - padding="corner", - dilation=1, - bias=True, - norm_cfg=None, - input_size=None, - init_cfg=None, + in_channels: int = 3, + embed_dims: int = 768, + conv_type: str = "Conv2d", + kernel_size: int | tuple[int, int] = 16, + stride: int | tuple[int, int] | None = 16, + padding: int | tuple[int, int] | str = "corner", + dilation: int | tuple[int, int] = 1, + bias: bool = True, + norm_cfg: dict | None = None, + input_size: int | tuple[int, int] | None = None, + init_cfg: dict | None = None, ): super().__init__(init_cfg=init_cfg) - self.embed_dims = embed_dims + object.__setattr__(self, "embed_dims", embed_dims) if stride is None: stride = kernel_size - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) + kernel_tuple = _tuple2(kernel_size) + stride_tuple = _tuple2(stride) + dilation_tuple = _tuple2(dilation) if isinstance(padding, str): self.adaptive_padding = AdaptivePadding( - kernel_size=kernel_size, - stride=stride, - dilation=dilation, + kernel_size=kernel_tuple, 
+ stride=stride_tuple, + dilation=dilation_tuple, padding=padding, ) # disable the padding of conv padding = 0 else: - self.adaptive_padding = None - padding = to_2tuple(padding) + object.__setattr__(self, "adaptive_padding", None) + padding_tuple = _tuple2(padding) self.projection = build_conv_layer( dict(type=conv_type), in_channels=in_channels, out_channels=embed_dims, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, + kernel_size=kernel_tuple, + stride=stride_tuple, + padding=padding_tuple, + dilation=dilation_tuple, bias=bias, ) if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, embed_dims)[1] else: - self.norm = None + object.__setattr__(self, "norm", None) if input_size: - input_size = to_2tuple(input_size) + input_size_tuple = _tuple2(input_size) # `init_out_size` would be used outside to # calculate the num_patches # e.g. when `use_abs_pos_embed` outside - self.init_input_size = input_size + object.__setattr__(self, "init_input_size", input_size_tuple) if self.adaptive_padding: - pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size) - input_h, input_w = input_size + pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size_tuple) + input_h, input_w = input_size_tuple input_h = input_h + pad_h input_w = input_w + pad_w - input_size = (input_h, input_w) + input_size_tuple = (input_h, input_w) # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html - h_out = (input_size[0] + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) // stride[0] + 1 - w_out = (input_size[1] + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) // stride[1] + 1 - self.init_out_size = (h_out, w_out) + h_out = ( + input_size_tuple[0] + 2 * padding_tuple[0] - dilation_tuple[0] * (kernel_tuple[0] - 1) - 1 + ) // stride_tuple[0] + 1 + w_out = ( + input_size_tuple[1] + 2 * padding_tuple[1] - dilation_tuple[1] * (kernel_tuple[1] - 1) - 1 + ) // stride_tuple[1] + 1 + object.__setattr__(self, "init_out_size", 
(h_out, w_out)) else: - self.init_input_size = None - self.init_out_size = None + object.__setattr__(self, "init_input_size", None) + object.__setattr__(self, "init_out_size", None) def forward(self, x): """ @@ -294,51 +312,57 @@ class PatchMerging(BaseModule): Default: None. """ + adaptive_padding: AdaptivePadding | None + norm: nn.Module | None + sampler: nn.Unfold + def __init__( self, - in_channels, - out_channels, - kernel_size=2, - stride=None, - padding="corner", - dilation=1, - bias=False, - norm_cfg=dict(type="LN"), - init_cfg=None, + in_channels: int, + out_channels: int, + kernel_size: int | tuple[int, int] = 2, + stride: int | tuple[int, int] | None = None, + padding: int | tuple[int, int] | str = "corner", + dilation: int | tuple[int, int] = 1, + bias: bool = False, + norm_cfg: dict | None = dict(type="LN"), + init_cfg: dict | None = None, ): super().__init__(init_cfg=init_cfg) - self.in_channels = in_channels - self.out_channels = out_channels - if stride: - stride = stride - else: - stride = kernel_size + object.__setattr__(self, "in_channels", in_channels) + object.__setattr__(self, "out_channels", out_channels) + stride_value = stride if stride is not None else kernel_size - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) + kernel_size_tuple = _tuple2(kernel_size) + stride_tuple = _tuple2(stride_value) + dilation_tuple = _tuple2(dilation) if isinstance(padding, str): self.adaptive_padding = AdaptivePadding( - kernel_size=kernel_size, - stride=stride, - dilation=dilation, + kernel_size=kernel_size_tuple, + stride=stride_tuple, + dilation=dilation_tuple, padding=padding, ) # disable the padding of unfold padding = 0 else: - self.adaptive_padding = None - - padding = to_2tuple(padding) - self.sampler = nn.Unfold(kernel_size=kernel_size, dilation=dilation, padding=padding, stride=stride) + object.__setattr__(self, "adaptive_padding", None) + + padding_tuple = _tuple2(padding) + self.sampler = 
nn.Unfold( + kernel_size=kernel_size_tuple, + dilation=dilation_tuple, + padding=padding_tuple, + stride=stride_tuple, + ) - sample_dim = kernel_size[0] * kernel_size[1] * in_channels + sample_dim = kernel_size_tuple[0] * kernel_size_tuple[1] * in_channels if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, sample_dim)[1] else: - self.norm = None + object.__setattr__(self, "norm", None) self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) @@ -373,16 +397,17 @@ def forward(self, x, input_size): # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) x = self.sampler(x) - out_h = ( - H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * (self.sampler.kernel_size[0] - 1) - 1 - ) // self.sampler.stride[0] + 1 - out_w = ( - W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * (self.sampler.kernel_size[1] - 1) - 1 - ) // self.sampler.stride[1] + 1 + padding_hw = _tuple2(self.sampler.padding) + dilation_hw = _tuple2(self.sampler.dilation) + kernel_hw = _tuple2(self.sampler.kernel_size) + stride_hw = _tuple2(self.sampler.stride) + out_h = (H + 2 * padding_hw[0] - dilation_hw[0] * (kernel_hw[0] - 1) - 1) // stride_hw[0] + 1 + out_w = (W + 2 * padding_hw[1] - dilation_hw[1] * (kernel_hw[1] - 1) - 1) // stride_hw[1] + 1 output_size = (out_h, out_w) x = x.transpose(1, 2) # B, H/2*W/2, 4*C - x = self.norm(x) if self.norm else x + if self.norm is not None: + x = self.norm(x) x = self.reduction(x) return x, output_size @@ -412,13 +437,13 @@ class MultiheadAttention(BaseModule): def __init__( self, - embed_dims, - num_heads, - attn_drop=0.0, - proj_drop=0.0, - dropout_layer=dict(type="Dropout", drop_prob=0.0), - init_cfg=None, - batch_first=False, + embed_dims: int, + num_heads: int, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + dropout_layer: dict | None = dict(type="Dropout", drop_prob=0.0), + init_cfg: dict | None = None, + batch_first: bool = False, **kwargs, ): super().__init__(init_cfg) @@ -431,11 +456,12 @@ def 
__init__( DeprecationWarning, ) attn_drop = kwargs["dropout"] - dropout_layer["drop_prob"] = kwargs.pop("dropout") + if dropout_layer is not None: + dropout_layer["drop_prob"] = kwargs.pop("dropout") - self.embed_dims = embed_dims - self.num_heads = num_heads - self.batch_first = batch_first + object.__setattr__(self, "embed_dims", embed_dims) + object.__setattr__(self, "num_heads", num_heads) + object.__setattr__(self, "batch_first", batch_first) self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, **kwargs) @@ -566,21 +592,21 @@ class FFN(BaseModule): @deprecated_api_warning({"dropout": "ffn_drop", "add_residual": "add_identity"}, cls_name="FFN") def __init__( self, - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - act_cfg=dict(type="ReLU", inplace=True), - ffn_drop=0.0, - dropout_layer=None, - add_identity=True, - init_cfg=None, - layer_scale_init_value=0.0, + embed_dims: int = 256, + feedforward_channels: int = 1024, + num_fcs: int = 2, + act_cfg: dict = dict(type="ReLU", inplace=True), + ffn_drop: float = 0.0, + dropout_layer: dict | None = None, + add_identity: bool = True, + init_cfg: dict | None = None, + layer_scale_init_value: float = 0.0, ): super().__init__(init_cfg) assert num_fcs >= 2, f"num_fcs should be no less than 2. got {num_fcs}." 
- self.embed_dims = embed_dims - self.feedforward_channels = feedforward_channels - self.num_fcs = num_fcs + object.__setattr__(self, "embed_dims", embed_dims) + object.__setattr__(self, "feedforward_channels", feedforward_channels) + object.__setattr__(self, "num_fcs", num_fcs) layers = [] in_channels = embed_dims @@ -597,7 +623,7 @@ def __init__( layers.append(nn.Dropout(ffn_drop)) self.layers = Sequential(*layers) self.dropout_layer = build_dropout(dropout_layer) if dropout_layer else torch.nn.Identity() - self.add_identity = add_identity + object.__setattr__(self, "add_identity", add_identity) if layer_scale_init_value > 0: self.gamma2 = LayerScale(embed_dims, scale=layer_scale_init_value) @@ -656,10 +682,17 @@ class BaseTransformerLayer(BaseModule): or (n, batch, embed_dim). Default to False. """ + num_attn: int + operation_order: tuple[str, ...] + norm_cfg: dict + pre_norm: bool + embed_dims: int + batch_first: bool + def __init__( self, - attn_cfgs=None, - ffn_cfgs=dict( + attn_cfgs: dict[str, Any] | list[dict[str, Any]] | None = None, + ffn_cfgs: dict[str, Any] | list[dict[str, Any]] = dict( type="FFN", embed_dims=256, feedforward_channels=1024, @@ -667,12 +700,17 @@ def __init__( ffn_drop=0.0, act_cfg=dict(type="ReLU", inplace=True), ), - operation_order=None, - norm_cfg=dict(type="LN"), - init_cfg=None, - batch_first=False, + operation_order: tuple[str, ...] | None = None, + norm_cfg: dict = dict(type="LN"), + init_cfg: dict | None = None, + batch_first: bool = False, **kwargs, ): + if isinstance(ffn_cfgs, list): + ffn_cfg_data: dict[str, Any] | list[dict[str, Any]] = [copy.deepcopy(cfg) for cfg in ffn_cfgs] + else: + ffn_cfg_data = copy.deepcopy(ffn_cfgs) + deprecated_args = dict( feedforward_channels="feedforward_channels", ffn_dropout="ffn_drop", @@ -687,62 +725,75 @@ def __init__( f"to a dict named `ffn_cfgs`. 
", DeprecationWarning, ) - ffn_cfgs[new_name] = kwargs[ori_name] + if isinstance(ffn_cfg_data, dict): + ffn_cfg_data[new_name] = kwargs[ori_name] super().__init__(init_cfg) - self.batch_first = batch_first + if operation_order is None: + raise ValueError("operation_order must be provided") + object.__setattr__(self, "batch_first", batch_first) + operation_order_tuple = operation_order - assert set(operation_order) & {"self_attn", "norm", "ffn", "cross_attn"} == set(operation_order), ( + assert set(operation_order_tuple) & {"self_attn", "norm", "ffn", "cross_attn"} == set(operation_order_tuple), ( f"The operation_order of {self.__class__.__name__} should contains all four operation type {['self_attn', 'norm', 'ffn', 'cross_attn']}" ) - num_attn = operation_order.count("self_attn") + operation_order.count("cross_attn") - if isinstance(attn_cfgs, dict): - attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + num_attn = operation_order_tuple.count("self_attn") + operation_order_tuple.count("cross_attn") + attn_cfg_list: list[dict[str, Any]] + if num_attn == 0: + attn_cfg_list = [] + elif isinstance(attn_cfgs, dict): + attn_cfg_list = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + elif attn_cfgs is None: + raise ValueError("attn_cfgs must be provided when attention ops exist") else: - assert num_attn == len(attn_cfgs), ( - f"The length of attn_cfg {num_attn} is not consistent with the number of attentionin operation_order {operation_order}." 
- ) - - self.num_attn = num_attn - self.operation_order = operation_order - self.norm_cfg = norm_cfg - self.pre_norm = operation_order[0] == "norm" + attn_cfg_list = attn_cfgs + if len(attn_cfg_list) != num_attn: + raise ValueError(f"Expected {num_attn} attention configs but received {len(attn_cfg_list)}") + + object.__setattr__(self, "num_attn", num_attn) + object.__setattr__(self, "operation_order", operation_order_tuple) + object.__setattr__(self, "norm_cfg", norm_cfg) + object.__setattr__(self, "pre_norm", operation_order_tuple[0] == "norm") self.attentions = ModuleList() index = 0 - for operation_name in operation_order: + for operation_name in operation_order_tuple: if operation_name in ["self_attn", "cross_attn"]: - if "batch_first" in attn_cfgs[index]: - assert self.batch_first == attn_cfgs[index]["batch_first"] + cfg = attn_cfg_list[index] + if "batch_first" in cfg: + assert self.batch_first == cfg["batch_first"] else: - attn_cfgs[index]["batch_first"] = self.batch_first - attention = build_attention(attn_cfgs[index]) + cfg["batch_first"] = self.batch_first + attention = build_attention(cfg) # Some custom attentions used as `self_attn` # or `cross_attn` can have different behavior. 
attention.operation_name = operation_name self.attentions.append(attention) index += 1 - self.embed_dims = self.attentions[0].embed_dims + if not self.attentions: + raise ValueError("At least one attention module is required") + object.__setattr__(self, "embed_dims", self.attentions[0].embed_dims) self.ffns = ModuleList() - num_ffns = operation_order.count("ffn") - if isinstance(ffn_cfgs, dict): - ffn_cfgs = ConfigDict(ffn_cfgs) - if isinstance(ffn_cfgs, dict): - ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] - assert len(ffn_cfgs) == num_ffns + num_ffns = operation_order_tuple.count("ffn") + if isinstance(ffn_cfg_data, dict): + base_ffn_cfg = ConfigDict(ffn_cfg_data) + ffn_cfg_list = [copy.deepcopy(base_ffn_cfg) for _ in range(num_ffns)] + else: + ffn_cfg_list = ffn_cfg_data + assert len(ffn_cfg_list) == num_ffns for ffn_index in range(num_ffns): - if "embed_dims" not in ffn_cfgs[ffn_index]: - ffn_cfgs[ffn_index]["embed_dims"] = self.embed_dims + if "embed_dims" not in ffn_cfg_list[ffn_index]: + ffn_cfg_list[ffn_index]["embed_dims"] = self.embed_dims else: - assert ffn_cfgs[ffn_index]["embed_dims"] == self.embed_dims - self.ffns.append(build_feedforward_network(ffn_cfgs[ffn_index], dict(type="FFN"))) + assert ffn_cfg_list[ffn_index]["embed_dims"] == self.embed_dims + self.ffns.append(build_feedforward_network(ffn_cfg_list[ffn_index], dict(type="FFN"))) self.norms = ModuleList() - num_norms = operation_order.count("norm") + num_norms = operation_order_tuple.count("norm") for _ in range(num_norms): self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) @@ -866,18 +917,28 @@ class TransformerLayerSequence(BaseModule): Default: None. 
""" - def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): + num_layers: int + embed_dims: int + pre_norm: bool + + def __init__( + self, + transformerlayers: dict | list[dict] | None = None, + num_layers: int | None = None, + init_cfg: dict | None = None, + ): super().__init__(init_cfg) + assert num_layers is not None, "num_layers must be provided" if isinstance(transformerlayers, dict): transformerlayers = [copy.deepcopy(transformerlayers) for _ in range(num_layers)] else: assert isinstance(transformerlayers, list) and len(transformerlayers) == num_layers - self.num_layers = num_layers + object.__setattr__(self, "num_layers", num_layers) self.layers = ModuleList() for i in range(num_layers): self.layers.append(build_transformer_layer(transformerlayers[i])) - self.embed_dims = self.layers[0].embed_dims - self.pre_norm = self.layers[0].pre_norm + object.__setattr__(self, "embed_dims", self.layers[0].embed_dims) + object.__setattr__(self, "pre_norm", self.layers[0].pre_norm) def forward( self, @@ -929,3 +990,12 @@ def forward( **kwargs, ) return query + + padding: str + kernel_size: tuple[int, int] + stride: tuple[int, int] + dilation: tuple[int, int] + adaptive_padding: AdaptivePadding | None + norm: nn.Module | None + init_input_size: tuple[int, int] | None + init_out_size: tuple[int, int] | None diff --git a/visdet/cv/cnn/bricks/upsample.py b/visdet/cv/cnn/bricks/upsample.py index a14a4e50..f6a856c0 100644 --- a/visdet/cv/cnn/bricks/upsample.py +++ b/visdet/cv/cnn/bricks/upsample.py @@ -27,6 +27,11 @@ class PixelShufflePack(nn.Module): channels. 
""" + in_channels: int + out_channels: int + scale_factor: int + upsample_kernel: int + def __init__( self, in_channels: int, @@ -35,10 +40,10 @@ def __init__( upsample_kernel: int, ): super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.scale_factor = scale_factor - self.upsample_kernel = upsample_kernel + object.__setattr__(self, "in_channels", in_channels) + object.__setattr__(self, "out_channels", out_channels) + object.__setattr__(self, "scale_factor", scale_factor) + object.__setattr__(self, "upsample_kernel", upsample_kernel) self.upsample_conv = nn.Conv2d( self.in_channels, self.out_channels * scale_factor * scale_factor, diff --git a/visdet/cv/cnn/bricks/wrappers.py b/visdet/cv/cnn/bricks/wrappers.py index c33f95b3..7eb40e36 100644 --- a/visdet/cv/cnn/bricks/wrappers.py +++ b/visdet/cv/cnn/bricks/wrappers.py @@ -7,6 +7,7 @@ """ import math +from typing import cast import torch import torch.nn as nn @@ -26,6 +27,14 @@ def obsolete_torch_version(torch_version, version_threshold) -> bool: return torch_version == "parrots" or torch_version <= version_threshold +def _zero_dummy_grad(module: nn.Module, reference: torch.Tensor) -> torch.Tensor: + """Return a zero tensor that participates in autograd like module params.""" + total = torch.zeros([], device=reference.device, dtype=reference.dtype) + for parameter in module.parameters(): + total = total + parameter.view(-1)[0] + return total * 0.0 + + class NewEmptyTensorOp(torch.autograd.Function): @staticmethod def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor: @@ -43,20 +52,24 @@ class Conv2d(nn.Conv2d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] + kernel = tuple(int(v) for v in _pair(self.kernel_size)) + padding = tuple(int(v) for v in _pair(self.padding)) + stride = tuple(int(v) for v in _pair(self.stride)) + dilation = tuple(int(v) 
for v in _pair(self.dilation)) for i, k, p, s, d in zip( x.shape[-2:], - self.kernel_size, - self.padding, - self.stride, - self.dilation, + kernel, + padding, + stride, + dilation, strict=False, ): o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. - dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty @@ -69,20 +82,24 @@ class Conv3d(nn.Conv3d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] + kernel = tuple(int(v) for v in _triple(self.kernel_size)) + padding = tuple(int(v) for v in _triple(self.padding)) + stride = tuple(int(v) for v in _triple(self.stride)) + dilation = tuple(int(v) for v in _triple(self.dilation)) for i, k, p, s, d in zip( x.shape[-3:], - self.kernel_size, - self.padding, - self.stride, - self.dilation, + kernel, + padding, + stride, + dilation, strict=False, ): o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. 
- dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty @@ -96,20 +113,25 @@ class ConvTranspose2d(nn.ConvTranspose2d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] + kernel = tuple(int(v) for v in _pair(self.kernel_size)) + padding = tuple(int(v) for v in _pair(self.padding)) + stride = tuple(int(v) for v in _pair(self.stride)) + dilation = tuple(int(v) for v in _pair(self.dilation)) + output_padding = tuple(int(v) for v in _pair(self.output_padding)) for i, k, p, s, d, op in zip( x.shape[-2:], - self.kernel_size, - self.padding, - self.stride, - self.dilation, - self.output_padding, + kernel, + padding, + stride, + dilation, + output_padding, strict=False, ): out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. 
- dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty @@ -123,20 +145,25 @@ class ConvTranspose3d(nn.ConvTranspose3d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] + kernel = tuple(int(v) for v in _triple(self.kernel_size)) + padding = tuple(int(v) for v in _triple(self.padding)) + stride = tuple(int(v) for v in _triple(self.stride)) + dilation = tuple(int(v) for v in _triple(self.dilation)) + output_padding = tuple(int(v) for v in _triple(self.output_padding)) for i, k, p, s, d, op in zip( x.shape[-3:], - self.kernel_size, - self.padding, - self.stride, - self.dilation, - self.output_padding, + kernel, + padding, + stride, + dilation, + output_padding, strict=False, ): out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. 
- dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty @@ -145,58 +172,66 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MaxPool2d(nn.MaxPool2d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # PyTorch 1.9 does not support empty tensor inference yet if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: out_shape = list(x.shape[:2]) + kernel = tuple(int(v) for v in _pair(self.kernel_size)) + padding = tuple(int(v) for v in _pair(self.padding)) + stride = tuple(int(v) for v in _pair(self.stride)) + dilation = tuple(int(v) for v in _pair(self.dilation)) for i, k, p, s, d in zip( x.shape[-2:], - _pair(self.kernel_size), - _pair(self.padding), - _pair(self.stride), - _pair(self.dilation), + kernel, + padding, + stride, + dilation, strict=False, ): o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 o = math.ceil(o) if self.ceil_mode else math.floor(o) out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) return empty return super().forward(x) class MaxPool3d(nn.MaxPool3d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # PyTorch 1.9 does not support empty tensor inference yet if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: out_shape = list(x.shape[:2]) + kernel = tuple(int(v) for v in _triple(self.kernel_size)) + padding = tuple(int(v) for v in _triple(self.padding)) + stride = tuple(int(v) for v in _triple(self.stride)) + dilation = tuple(int(v) for v in _triple(self.dilation)) for i, k, p, s, d in zip( x.shape[-3:], - _triple(self.kernel_size), - _triple(self.padding), - _triple(self.stride), - _triple(self.dilation), + kernel, + padding, + stride, + dilation, strict=False, 
): o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 o = math.ceil(o) if self.ceil_mode else math.floor(o) out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) return empty return super().forward(x) -class Linear(torch.nn.Linear): +class Linear(nn.Linear): def forward(self, x: torch.Tensor) -> torch.Tensor: # empty tensor forward of Linear layer is supported in Pytorch 1.6 if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0: out_shape = [x.shape[0], self.out_features] - empty = NewEmptyTensorOp.apply(x, out_shape) + empty = cast(torch.Tensor, NewEmptyTensorOp.apply(x, out_shape)) if self.training: # produce dummy gradient to avoid DDP warning. - dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + dummy = _zero_dummy_grad(self, x) return empty + dummy else: return empty diff --git a/visdet/cv/image/io.py b/visdet/cv/image/io.py index d7896e2d..f733f3a0 100644 --- a/visdet/cv/image/io.py +++ b/visdet/cv/image/io.py @@ -13,19 +13,19 @@ ) try: - from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG + from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG # type: ignore[import-untyped] except ImportError: - TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None + TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None # type: ignore[assignment] try: from PIL import Image, ImageOps except ImportError: - Image = None + Image = None # type: ignore[assignment] try: - import tifffile + import tifffile # type: ignore[import-untyped] except ImportError: - tifffile = None + tifffile = None # type: ignore[assignment] jpeg = None supported_backends = ["cv2", "turbojpeg", "pillow", "tifffile"] @@ -137,11 +137,11 @@ def imfrombytes( else: # cv2 backend if len(content) == 0: - return None + return None # type: ignore[return-value] img_np = np.frombuffer(content, np.uint8) - flag = imread_flags[flag] if isinstance(flag, str) else flag - img = cv2.imdecode(img_np, flag) - if 
img is not None and flag == IMREAD_COLOR and channel_order == "rgb": + flag_int: int = imread_flags[flag] if isinstance(flag, str) else flag + img = cv2.imdecode(img_np, flag_int) # type: ignore[arg-type] + if img is not None and flag_int == IMREAD_COLOR and channel_order == "rgb": cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) return img @@ -264,7 +264,7 @@ def imwrite( # Encode image according to image suffix. # For example, if image path is '/path/your/img.jpg', the encode # format is '.jpg'. - flag, img_buff = cv2.imencode(img_ext, img, params) + flag, img_buff = cv2.imencode(img_ext, img, params) # type: ignore[arg-type] if flag: with open(file_path, "wb") as f: diff --git a/visdet/cv/ops/roi_align.py b/visdet/cv/ops/roi_align.py index 0f93678e..9d2530f1 100644 --- a/visdet/cv/ops/roi_align.py +++ b/visdet/cv/ops/roi_align.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + import torch.nn as nn from torchvision.ops import RoIAlign as TVRoIAlign from torchvision.ops import roi_align as tv_roi_align @@ -19,25 +21,33 @@ class RoIAlign(nn.Module): We set this to True by default for better performance. 
""" + output_size: Union[int, Tuple[int, int]] + spatial_scale: float + sampling_ratio: int + pool_mode: str + aligned: bool + use_torchvision: bool + roi_align: TVRoIAlign + def __init__( self, - output_size, - spatial_scale=1.0, - sampling_ratio=0, - pool_mode="avg", - aligned=True, - use_torchvision=True, - ): + output_size: Union[int, Tuple[int, int]], + spatial_scale: float = 1.0, + sampling_ratio: int = 0, + pool_mode: str = "avg", + aligned: bool = True, + use_torchvision: bool = True, + ) -> None: super().__init__() - self.output_size = output_size - self.spatial_scale = spatial_scale - self.sampling_ratio = sampling_ratio - self.pool_mode = pool_mode - self.aligned = aligned - self.use_torchvision = use_torchvision + self.output_size = output_size # type: ignore[misc] + self.spatial_scale = spatial_scale # type: ignore[misc] + self.sampling_ratio = sampling_ratio # type: ignore[misc] + self.pool_mode = pool_mode # type: ignore[misc] + self.aligned = aligned # type: ignore[misc] + self.use_torchvision = use_torchvision # type: ignore[misc] if isinstance(self.output_size, int): - self.output_size = (self.output_size, self.output_size) + self.output_size = (self.output_size, self.output_size) # type: ignore[misc] # We always use torchvision's implementation for simplicity self.roi_align = TVRoIAlign( diff --git a/visdet/cv/transforms/builder.py b/visdet/cv/transforms/builder.py index 6a26953e..d38b302f 100644 --- a/visdet/cv/transforms/builder.py +++ b/visdet/cv/transforms/builder.py @@ -135,7 +135,7 @@ def build_transforms(cfg): transforms.append(transform) # Import Compose here to avoid circular imports - from visdet.cv.transforms.compose import Compose + from visdet.cv.transforms.wrappers import Compose return Compose(transforms) else: diff --git a/visdet/cv/transforms/loading.py b/visdet/cv/transforms/loading.py index 4545820a..851d645a 100644 --- a/visdet/cv/transforms/loading.py +++ b/visdet/cv/transforms/loading.py @@ -360,7 +360,11 @@ def 
_load_seg_map(self, results: dict) -> None: else: img_bytes = engine_fileio.get(results["seg_map_path"], backend_args=self.backend_args) - results["gt_seg_map"] = imfrombytes(img_bytes, flag="unchanged", backend=self.imdecode_backend).squeeze() + # Convert memoryview to bytes if needed + img_bytes_for_decode = bytes(img_bytes) if isinstance(img_bytes, memoryview) else img_bytes + results["gt_seg_map"] = imfrombytes( + img_bytes_for_decode, flag="unchanged", backend=self.imdecode_backend + ).squeeze() def _load_kps(self, results: dict) -> None: """Private function to load keypoints annotations. diff --git a/visdet/cv/transforms/processing.py b/visdet/cv/transforms/processing.py index 029e445b..3630ee49 100644 --- a/visdet/cv/transforms/processing.py +++ b/visdet/cv/transforms/processing.py @@ -242,6 +242,7 @@ def random_sample_ratio(img_scale, ratio_range): def _random_scale(self, results): """Randomly sample an img_scale.""" + assert self.img_scale is not None, "img_scale must be initialized" if self.ratio_range is not None: scale, scale_idx = self.random_sample_ratio(self.img_scale[0], self.ratio_range) elif len(self.img_scale) == 1: @@ -483,9 +484,12 @@ def _pad_img(self, results: dict) -> None: size = (pad_h, pad_w) elif self.size is not None: size = self.size[::-1] + pad_val_for_impad: int | float | tuple if isinstance(pad_val, int) and results["img"].ndim == 3: - pad_val = tuple(pad_val for _ in range(results["img"].shape[2])) - padded_img = impad(results["img"], shape=size, pad_val=pad_val, padding_mode=self.padding_mode) + pad_val_for_impad = tuple(pad_val for _ in range(results["img"].shape[2])) + else: + pad_val_for_impad = pad_val + padded_img = impad(results["img"], shape=size, pad_val=pad_val_for_impad, padding_mode=self.padding_mode) results["img"] = padded_img @@ -510,13 +514,16 @@ def _pad_seg(self, results: dict) -> None: """Pad semantic segmentation map according to ``results['pad_shape']``.""" if results.get("gt_seg_map", None) is not None: - 
pad_val = self.pad_val.get("seg", 255) - if isinstance(pad_val, int) and results["gt_seg_map"].ndim == 3: - pad_val = tuple(pad_val for _ in range(results["gt_seg_map"].shape[2])) + pad_val_seg = self.pad_val.get("seg", 255) + pad_val_for_seg: int | float | list + if isinstance(pad_val_seg, int) and results["gt_seg_map"].ndim == 3: + pad_val_for_seg = list(pad_val_seg for _ in range(results["gt_seg_map"].shape[2])) + else: + pad_val_for_seg = pad_val_seg results["gt_seg_map"] = impad( results["gt_seg_map"], shape=results["pad_shape"][:2], - pad_val=pad_val, + pad_val=pad_val_for_seg, padding_mode=self.padding_mode, ) @@ -716,7 +723,7 @@ def transform(self, results: dict) -> dict: img_width = max(img_width, crop_width) pad_size = (img_width, img_height) _pad_cfg = self.pad_cfg.copy() - _pad_cfg.update(dict(size=pad_size)) + _pad_cfg["size"] = pad_size # type: ignore[index] pad_transform = TRANSFORMS.build(_pad_cfg) results = pad_transform(results) else: @@ -966,12 +973,13 @@ def transform(self, results: dict) -> dict: for scale in self.scales: for flip, direction in flip_args: _resize_cfg = self.resize_cfg.copy() - _resize_cfg.update({self.scale_key: scale}) + _resize_cfg[self.scale_key] = scale # type: ignore[index] _resize_flip = [_resize_cfg] if flip: _flip_cfg = self.flip_cfg.copy() - _flip_cfg.update(prob=1.0, direction=direction) + _flip_cfg["prob"] = 1.0 # type: ignore[index] + _flip_cfg["direction"] = direction # type: ignore[index] _resize_flip.append(_flip_cfg) else: results["flip"] = False @@ -1331,8 +1339,8 @@ def _flip_bbox(self, bboxes: np.ndarray, img_shape: tuple[int, int], direction: """ # Handle BaseBoxes objects using their own flip method if hasattr(bboxes, "flip_"): - flipped = bboxes.clone() - flipped.flip_(img_shape, direction) + flipped = bboxes.clone() # type: ignore[attr-defined] + flipped.flip_(img_shape, direction) # type: ignore[attr-defined] return flipped # Handle numpy arrays @@ -1394,7 +1402,7 @@ def _flip_keypoints( flipped = 
np.concatenate([flipped, meta_info], axis=-1) return flipped - def _flip_seg_map(self, seg_map: dict, direction: str) -> np.ndarray: + def _flip_seg_map(self, seg_map: np.ndarray, direction: str) -> np.ndarray: """Flip segmentation map horizontally, vertically or diagonally. Args: diff --git a/visdet/cv/transforms/utils.py b/visdet/cv/transforms/utils.py index b88a6739..b3bd07c0 100644 --- a/visdet/cv/transforms/utils.py +++ b/visdet/cv/transforms/utils.py @@ -51,8 +51,9 @@ def __set_name__(self, owner, name): def __call__(self, *args, **kwargs): # Get the transform instance whose method is decorated # by cache_randomness + assert self.instance_ref is not None, "instance_ref must be set" instance = self.instance_ref() - name = self.__name__ + name: str = self.__name__ # type: ignore[misc] # Check the flag ``self._cache_enabled``, which should be # set by the contextmanagers like ``cache_random_parameters``` @@ -63,12 +64,12 @@ def __call__(self, *args, **kwargs): # ``cache_enabled``` is set by contextmanagers like # ``cache_random_params```. 
if not hasattr(instance, "_cache"): - instance._cache = {} + instance._cache = {} # type: ignore[attr-defined] - if name not in instance._cache: - instance._cache[name] = self.func(instance, *args, **kwargs) + if name not in instance._cache: # type: ignore[attr-defined] + instance._cache[name] = self.func(instance, *args, **kwargs) # type: ignore[attr-defined] # Return the cached value - return instance._cache[name] + return instance._cache[name] # type: ignore[attr-defined] else: # Clear cache if hasattr(instance, "_cache"): @@ -210,12 +211,12 @@ def _start_cache(t: BaseTransform): return # Set cache enabled flag - t._cache_enabled = True + t._cache_enabled = True # type: ignore[attr-defined] # Store the original method and init the counter if hasattr(t, "_methods_with_randomness"): - t.transform = _add_invoke_checker(t, "transform") - for name in t._methods_with_randomness: + t.transform = _add_invoke_checker(t, "transform") # type: ignore[method-assign] + for name in t._methods_with_randomness: # type: ignore[attr-defined] setattr(t, name, _add_invoke_counter(t, name)) def _end_cache(t: BaseTransform): @@ -230,19 +231,19 @@ def _end_cache(t: BaseTransform): # Restore the original method if hasattr(t, "_methods_with_randomness"): - for name in t._methods_with_randomness: + for name in t._methods_with_randomness: # type: ignore[attr-defined] key = f"{id(t)}.{name}" setattr(t, name, key2method[key]) key_transform = f"{id(t)}.transform" - t.transform = key2method[key_transform] + t.transform = key2method[key_transform] # type: ignore[method-assign] def _apply(t: BaseTransform | Iterable, func: Callable[[BaseTransform], None]): if isinstance(t, BaseTransform): func(t) if isinstance(t, Iterable): for _t in t: - _apply(_t, func) + _apply(_t, func) # type: ignore[arg-type] try: _apply(transforms, _start_cache) diff --git a/visdet/datasets/api_wrappers/cocoeval_mp.py b/visdet/datasets/api_wrappers/cocoeval_mp.py index e7337784..c2056b88 100644 --- 
a/visdet/datasets/api_wrappers/cocoeval_mp.py +++ b/visdet/datasets/api_wrappers/cocoeval_mp.py @@ -3,6 +3,7 @@ import itertools import time from collections import defaultdict +from typing import Any, Iterable, cast import numpy as np import torch.multiprocessing as mp @@ -13,6 +14,12 @@ class COCOevalMP(COCOeval): + _gts: dict[tuple[int, int], list[dict[str, Any]]] + _dts: dict[tuple[int, int], list[dict[str, Any]]] + evalImgs: Any + eval: dict[str, Any] + stats: np.ndarray + def _prepare(self): """ Prepare ._gts and ._dts for evaluation based on params @@ -26,24 +33,26 @@ def _toMask(anns, coco): ann["segmentation"] = rle p = self.params + gts: list[dict[str, Any]] + dts: list[dict[str, Any]] if p.useCats: gts = [] dts = [] img_ids = set(p.imgIds) cat_ids = set(p.catIds) - for gt in self.cocoGt.dataset["annotations"]: + for gt in self.cocoGt.dataset["annotations"]: # type: ignore[attr-defined] if (gt["category_id"] in cat_ids) and (gt["image_id"] in img_ids): - gts.append(gt) - for dt in self.cocoDt.dataset["annotations"]: + gts.append(cast(dict[str, Any], gt)) + for dt in self.cocoDt.dataset["annotations"]: # type: ignore[attr-defined] if (dt["category_id"] in cat_ids) and (dt["image_id"] in img_ids): - dts.append(dt) + dts.append(cast(dict[str, Any], dt)) # gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # gts=self.cocoGt.dataset['annotations'] # dts=self.cocoDt.dataset['annotations'] else: - gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) - dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + gts = [cast(dict[str, Any], ann) for ann in self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))] # type: ignore[attr-defined] + dts = [cast(dict[str, Any], ann) for ann in self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))] # type: ignore[attr-defined] # convert ground truth to mask if iouType == 
'segm' if p.iouType == "segm": @@ -55,14 +64,14 @@ def _toMask(anns, coco): gt["ignore"] = "iscrowd" in gt and gt["iscrowd"] if p.iouType == "keypoints": gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"] - self._gts = defaultdict(list) # gt for evaluation - self._dts = defaultdict(list) # dt for evaluation + self._gts = defaultdict(list) + self._dts = defaultdict(list) for gt in gts: self._gts[gt["image_id"], gt["category_id"]].append(gt) for dt in dts: self._dts[dt["image_id"], dt["category_id"]].append(dt) - self.evalImgs = defaultdict(list) # per-image per-category evaluation results - self.eval = {} # accumulated evaluation results + self.evalImgs = defaultdict(list) + self.eval = {} def evaluate(self): """Run per image evaluation on given images and store results (a list @@ -144,8 +153,14 @@ def evaluateImg(self, imgId, catId, aRng, maxDet): iscrowd = [int(o["iscrowd"]) for o in gt] # load computed ious # ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] - ious = self.computeIoU(imgId, catId) - ious = ious[:, gtind] if len(ious) > 0 else ious + ious_raw = self.computeIoU(imgId, catId) + ious_array = np.asarray(ious_raw, dtype=float) + if ious_array.size == 0: + ious = np.zeros((0, 0)) + else: + if ious_array.ndim == 1: + ious_array = ious_array.reshape((-1, 1)) + ious = ious_array[:, gtind] T = len(p.iouThrs) G = len(gt) @@ -221,15 +236,15 @@ def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): # IoU if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] - s = s[t] - s = s[:, :, :, aind, mind] + s = s[t] # type: ignore[index] + s = s[:, :, :, aind, mind] # type: ignore[index] else: # dimension of recall: [TxKxAxM] s = self.eval["recall"] if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] - s = s[t] - s = s[:, :, aind, mind] + s = s[t] # type: ignore[index] + s = s[:, :, aind, mind] # type: ignore[index] if len(s[s > -1]) == 0: mean_s = -1 else: diff --git 
a/visdet/datasets/builder.py b/visdet/datasets/builder.py index 0b906e23..867dbc9f 100644 --- a/visdet/datasets/builder.py +++ b/visdet/datasets/builder.py @@ -3,28 +3,16 @@ import platform import random import warnings +from collections.abc import Mapping, Sequence from functools import partial +from typing import Any import numpy as np import torch -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, Dataset, default_collate from visdet.cv import build_from_cfg -from visdet.engine.dist import get_dist_info -from visdet.engine.registry import Registry -from visdet.engine.utils import TORCH_VERSION, digit_version - -try: - from torch.utils.data import collate_fn - - def collate(batch): - return collate_fn(batch) -except ImportError: - # Fallback implementation - def collate(batch): - return batch - - +from visdet.datasets.dataset_wrappers import ClassBalancedDataset, ConcatDataset, RepeatDataset from visdet.datasets.samplers import ( ClassAwareSampler, DistributedGroupSampler, @@ -33,6 +21,9 @@ def collate(batch): InfiniteBatchSampler, InfiniteGroupBatchSampler, ) +from visdet.engine.dist import get_dist_info +from visdet.engine.registry import Registry +from visdet.engine.utils import digit_version if platform.system() != "Windows": # https://github.com/pytorch/pytorch/issues/973 @@ -48,9 +39,12 @@ def collate(batch): PIPELINES = Registry("pipeline") -def _concat_dataset(cfg, default_args=None): - from visdet.datasets.dataset_wrappers import ConcatDataset +def collate(batch, samples_per_gpu: int = 1): # noqa: ARG001 - kept for backward compat + """Wrap PyTorch's default collate to match mmengine's signature.""" + return default_collate(batch) + +def _concat_dataset(cfg: dict[str, Any], default_args: dict[str, Any] | None = None): ann_files = cfg["ann_file"] img_prefixes = cfg.get("img_prefix", None) seg_prefixes = cfg.get("seg_prefix", None) @@ -76,14 +70,9 @@ def _concat_dataset(cfg, default_args=None): return ConcatDataset(datasets, 
separate_eval) -def build_dataset(cfg, default_args=None): - from visdet.datasets.dataset_wrappers import ( - ClassBalancedDataset, - ConcatDataset, - MultiImageMixDataset, - RepeatDataset, - ) - +def build_dataset( + cfg: dict[str, Any] | list[dict[str, Any]] | tuple[dict[str, Any], ...], default_args: dict[str, Any] | None = None +): if isinstance(cfg, (list, tuple)): dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) elif cfg["type"] == "ConcatDataset": @@ -96,10 +85,7 @@ def build_dataset(cfg, default_args=None): elif cfg["type"] == "ClassBalancedDataset": dataset = ClassBalancedDataset(build_dataset(cfg["dataset"], default_args), cfg["oversample_thr"]) elif cfg["type"] == "MultiImageMixDataset": - cp_cfg = copy.deepcopy(cfg) - cp_cfg["dataset"] = build_dataset(cp_cfg["dataset"]) - cp_cfg.pop("type") - dataset = MultiImageMixDataset(**cp_cfg) + raise NotImplementedError("MultiImageMixDataset is not yet available in visdet") elif isinstance(cfg.get("ann_file"), (list, tuple)): dataset = _concat_dataset(cfg, default_args) else: @@ -109,18 +95,18 @@ def build_dataset(cfg, default_args=None): def build_dataloader( - dataset, - samples_per_gpu, - workers_per_gpu, - num_gpus=1, - dist=True, - shuffle=True, - seed=None, - runner_type="EpochBasedRunner", - persistent_workers=False, - class_aware_sampler=None, - **kwargs, -): + dataset: Dataset, + samples_per_gpu: int, + workers_per_gpu: int, + num_gpus: int = 1, + dist: bool = True, + shuffle: bool = True, + seed: int | None = None, + runner_type: str = "EpochBasedRunner", + persistent_workers: bool = False, + class_aware_sampler: dict[str, Any] | None = None, + **kwargs: Any, +) -> DataLoader: """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. @@ -150,18 +136,22 @@ def build_dataloader( DataLoader: A PyTorch dataloader. 
""" rank, world_size = get_dist_info() + samples_per_gpu_int = int(samples_per_gpu) + workers_per_gpu_int = int(workers_per_gpu) + num_gpus_int = int(num_gpus) + seed_int = int(seed) if seed is not None else None if dist: # When model is :obj:`DistributedDataParallel`, # `batch_size` of :obj:`dataloader` is the # number of training samples on each GPU. - batch_size = samples_per_gpu - num_workers = workers_per_gpu + batch_size = samples_per_gpu_int + num_workers = workers_per_gpu_int else: # When model is obj:`DataParallel` # the batch size is samples on all the GPUS - batch_size = num_gpus * samples_per_gpu - num_workers = num_gpus * workers_per_gpu + batch_size = num_gpus_int * samples_per_gpu_int + num_workers = num_gpus_int * workers_per_gpu_int if runner_type == "IterBasedRunner": # this is a batch sampler, which can yield @@ -169,38 +159,42 @@ def build_dataloader( # it can be used in both `DataParallel` and # `DistributedDataParallel` if shuffle: - batch_sampler = InfiniteGroupBatchSampler(dataset, batch_size, world_size, rank, seed=seed) + batch_sampler = InfiniteGroupBatchSampler(dataset, batch_size, world_size, rank, seed=seed_int) else: - batch_sampler = InfiniteBatchSampler(dataset, batch_size, world_size, rank, seed=seed, shuffle=False) + batch_sampler = InfiniteBatchSampler(dataset, batch_size, world_size, rank, seed=seed_int, shuffle=False) batch_size = 1 sampler = None else: if class_aware_sampler is not None: # ClassAwareSampler can be used in both distributed and # non-distributed training. 
- num_sample_class = class_aware_sampler.get("num_sample_class", 1) + num_sample_class = int(class_aware_sampler.get("num_sample_class", 1)) sampler = ClassAwareSampler( dataset, - samples_per_gpu, + samples_per_gpu_int, world_size, rank, - seed=seed, + seed=seed_int, num_sample_class=num_sample_class, ) elif dist: # DistributedGroupSampler will definitely shuffle the data to # satisfy that images on each GPU are in the same group if shuffle: - sampler = DistributedGroupSampler(dataset, samples_per_gpu, world_size, rank, seed=seed) + sampler = DistributedGroupSampler(dataset, samples_per_gpu_int, world_size, rank, seed=seed_int) else: - sampler = DistributedSampler(dataset, world_size, rank, shuffle=False, seed=seed) + # DistributedSampler signature differs between PyTorch versions + sampler = DistributedSampler(dataset, world_size, rank, shuffle=False, seed=seed_int) # type: ignore[call-arg] else: - sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + sampler = GroupSampler(dataset, samples_per_gpu_int) if shuffle else None batch_sampler = None - init_fn = partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None + init_fn = ( + partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed_int) if seed_int is not None else None + ) - if TORCH_VERSION != "parrots" and digit_version(TORCH_VERSION) >= digit_version("1.7.0"): + # Check PyTorch version for persistent_workers support (available in 1.7.0+) + if digit_version(torch.__version__) >= digit_version("1.7.0"): kwargs["persistent_workers"] = persistent_workers elif persistent_workers is True: warnings.warn("persistent_workers is invalid because your pytorch version is lower than 1.7.0") @@ -211,7 +205,7 @@ def build_dataloader( sampler=sampler, num_workers=num_workers, batch_sampler=batch_sampler, - collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu_int), 
pin_memory=kwargs.pop("pin_memory", False), worker_init_fn=init_fn, **kwargs, diff --git a/visdet/datasets/dataset_wrappers.py b/visdet/datasets/dataset_wrappers.py new file mode 100644 index 00000000..8176bef7 --- /dev/null +++ b/visdet/datasets/dataset_wrappers.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Dataset wrapper aliases used across visdet.""" + +from visdet.engine.dataset.dataset_wrapper import ( + ClassBalancedDataset as _ClassBalancedDataset, +) +from visdet.engine.dataset.dataset_wrapper import ( + ConcatDataset as _ConcatDataset, +) +from visdet.engine.dataset.dataset_wrapper import ( + RepeatDataset as _RepeatDataset, +) + +ClassBalancedDataset = _ClassBalancedDataset +ConcatDataset = _ConcatDataset +RepeatDataset = _RepeatDataset + +__all__ = ["ClassBalancedDataset", "ConcatDataset", "RepeatDataset"] diff --git a/visdet/datasets/pipelines.py b/visdet/datasets/pipelines.py index 9ec6551a..03e70fc4 100644 --- a/visdet/datasets/pipelines.py +++ b/visdet/datasets/pipelines.py @@ -5,6 +5,8 @@ backward compatibility with the old pipelines namespace. 
""" +# Import available transforms +from visdet.datasets.transforms.formatting import PackDetInputs from visdet.datasets.transforms.load_image import ( LoadImageFromFile, LoadImageFromWebcam, @@ -20,23 +22,11 @@ RandomFlip, ) -# Try to import optional transforms that may not exist in all versions -try: - from visdet.datasets.transforms.transforms import RandomResize -except ImportError: - RandomResize = None - -try: - from visdet.datasets.transforms.formatting import DefaultFormatBundle, PackDetInputs -except ImportError: - PackDetInputs = None - DefaultFormatBundle = None - -try: - from visdet.datasets.transforms.wrappers import RandomApply, RandomChoice -except ImportError: - RandomApply = None - RandomChoice = None +# These transforms don't exist in visdet yet - set to None for compatibility +RandomResize = None # type: ignore[assignment] +DefaultFormatBundle = None # type: ignore[assignment] +RandomApply = None # type: ignore[assignment] +RandomChoice = None # type: ignore[assignment] __all__ = [ "FilterAnnotations", diff --git a/visdet/datasets/samplers/distributed_sampler.py b/visdet/datasets/samplers/distributed_sampler.py index 76756b59..d8eef7ca 100644 --- a/visdet/datasets/samplers/distributed_sampler.py +++ b/visdet/datasets/samplers/distributed_sampler.py @@ -1,19 +1,28 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import math +from typing import Iterator, Optional import torch +from torch.utils.data import Dataset from torch.utils.data import DistributedSampler as _DistributedSampler from visdet.engine.dist import sync_random_seed -def get_device(): +def get_device() -> str: """Returns an available device, cuda or cpu.""" return "cuda" if torch.cuda.is_available() else "cpu" class DistributedSampler(_DistributedSampler): - def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0): + def __init__( + self, + dataset: Dataset, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + shuffle: bool = True, + seed: int = 0, + ) -> None: super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) # In distributed sampling, different ranks should sample @@ -25,7 +34,7 @@ def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0): device = get_device() self.seed = sync_random_seed(seed, device) - def __iter__(self): + def __iter__(self) -> Iterator[int]: # deterministically shuffle based on epoch if self.shuffle: g = torch.Generator() @@ -34,9 +43,9 @@ def __iter__(self): # Otherwise, the next iteration of this sampler will # yield the same ordering. 
g.manual_seed(self.epoch + self.seed) - indices = torch.randperm(len(self.dataset), generator=g).tolist() + indices = torch.randperm(len(self.dataset), generator=g).tolist() # type: ignore[arg-type] else: - indices = torch.arange(len(self.dataset)).tolist() + indices = torch.arange(len(self.dataset)).tolist() # type: ignore[arg-type] # add extra samples to make it evenly divisible # in case that indices is shorter than half of total_size diff --git a/visdet/datasets/transforms/formatting.py b/visdet/datasets/transforms/formatting.py index 64825210..dec2480e 100644 --- a/visdet/datasets/transforms/formatting.py +++ b/visdet/datasets/transforms/formatting.py @@ -127,8 +127,8 @@ def transform(self, results: dict) -> dict: data_sample.proposals = proposals if "gt_seg_map" in results: - gt_sem_seg_data = dict(sem_seg=to_tensor(results["gt_seg_map"][None, ...].copy())) - gt_sem_seg_data = PixelData(**gt_sem_seg_data) + gt_sem_seg_tensor = to_tensor(results["gt_seg_map"][None, ...].copy()) + gt_sem_seg_data = PixelData(sem_seg=gt_sem_seg_tensor) if "ignore_index" in results: metainfo = dict(ignore_index=results["ignore_index"]) gt_sem_seg_data.set_metainfo(metainfo) diff --git a/visdet/datasets/transforms/loading.py b/visdet/datasets/transforms/loading.py index 295299d3..4864cc10 100644 --- a/visdet/datasets/transforms/loading.py +++ b/visdet/datasets/transforms/loading.py @@ -96,8 +96,8 @@ def __init__( to_float32: bool = False, color_type: str = "unchanged", imdecode_backend: str = "cv2", - file_client_args: dict = None, - backend_args: dict = None, + file_client_args: dict | None = None, + backend_args: dict | None = None, ) -> None: self.to_float32 = to_float32 self.color_type = color_type @@ -552,10 +552,10 @@ def __init__( with_seg: bool = True, box_type: str = "hbox", imdecode_backend: str = "cv2", - backend_args: dict = None, + backend_args: dict | None = None, ) -> None: try: - from panopticapi import utils + from panopticapi import utils # type: 
ignore[import-untyped] except ImportError: raise ImportError( "panopticapi is not installed, please install it by: " diff --git a/visdet/engine/config/config_wrapper.py b/visdet/engine/config/config_wrapper.py index bcaaa53f..60b1bf46 100644 --- a/visdet/engine/config/config_wrapper.py +++ b/visdet/engine/config/config_wrapper.py @@ -6,7 +6,7 @@ import warnings from pathlib import Path -from typing import Any, Dict, Union +from typing import TYPE_CHECKING, Any, Dict, Union, cast from visdet.engine.config import Config as BaseConfig from visdet.engine.config.schema_generator import validate_config_with_schema @@ -73,7 +73,9 @@ def fromfile( stacklevel=2, ) # Use the parent class's fromfile method for .py files - return super(Config, Config).fromfile(str(filename)) + # The base Config.fromfile returns BaseConfig, but since Config extends BaseConfig, + # we can safely cast it back to Config + return cast("Config", super(Config, Config).fromfile(str(filename))) else: raise ValueError(f"Unsupported config file extension: {filename.suffix}. Supported: .yaml, .yml, .py") diff --git a/visdet/engine/config/yaml_loader.py b/visdet/engine/config/yaml_loader.py index a510510b..dad2817e 100644 --- a/visdet/engine/config/yaml_loader.py +++ b/visdet/engine/config/yaml_loader.py @@ -162,13 +162,13 @@ def _resolve_path(self, ref_path: str, current_file: Path) -> Path: Returns: Absolute resolved path """ - ref_path = Path(ref_path) + ref_path_obj = Path(ref_path) - if ref_path.is_absolute(): - return ref_path + if ref_path_obj.is_absolute(): + return ref_path_obj else: # Resolve relative to the directory containing current_file - return (current_file.parent / ref_path).resolve() + return (current_file.parent / ref_path_obj).resolve() def _deep_merge(self, base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: """Deep merge two dictionaries, with override taking precedence. 
diff --git a/visdet/engine/dist/__init__.py b/visdet/engine/dist/__init__.py index 920ff72c..149b5919 100644 --- a/visdet/engine/dist/__init__.py +++ b/visdet/engine/dist/__init__.py @@ -5,11 +5,13 @@ import os import pickle import warnings -from typing import Any, List, Optional +from typing import Any, Callable, List, Optional, TypeVar import torch import torch.distributed as dist_lib +F = TypeVar("F", bound=Callable) + def _is_dist_available_and_initialized(): """Check if distributed training is available and initialized.""" @@ -55,13 +57,14 @@ def is_main_process(): return get_rank() == 0 -def master_only(func): +def master_only(func: F) -> F: """Decorator to make a function only execute on master process.""" @functools.wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs): # type: ignore[no-untyped-def] if is_main_process(): return func(*args, **kwargs) - return wrapper + return None + return wrapper # type: ignore[return-value] def barrier(): diff --git a/visdet/engine/hooks/visualization_hook.py b/visdet/engine/hooks/visualization_hook.py index 0b016d88..8ecee135 100644 --- a/visdet/engine/hooks/visualization_hook.py +++ b/visdet/engine/hooks/visualization_hook.py @@ -4,6 +4,7 @@ from collections.abc import Sequence import numpy as np +import torch from visdet.cv import imfrombytes, imwrite from visdet.engine.fileio import get @@ -57,7 +58,7 @@ def __init__( show: bool = False, wait_time: float = 0.0, test_out_dir: str | None = None, - backend_args: dict = None, + backend_args: dict | None = None, ): self._visualizer: Visualizer = Visualizer.get_current_instance() self.interval = interval @@ -83,8 +84,8 @@ def after_val_iter( self, runner: Runner, batch_idx: int, - data_batch: dict, - outputs: Sequence[DetDataSample], + data_batch: dict | tuple | list | None = None, + outputs: Sequence[DetDataSample] | None = None, ) -> None: """Run after every ``self.interval`` validation iterations. 
@@ -95,7 +96,7 @@ def after_val_iter( outputs (Sequence[:obj:`DetDataSample`]]): A batch of data samples that contain annotations and predictions. """ - if self.draw is False: + if self.draw is False or outputs is None: return # There is no guarantee that the same batch of images @@ -122,8 +123,8 @@ def after_test_iter( self, runner: Runner, batch_idx: int, - data_batch: dict, - outputs: Sequence[DetDataSample], + data_batch: dict | tuple | list | None = None, + outputs: Sequence[DetDataSample] | None = None, ) -> None: """Run after every testing iterations. @@ -134,7 +135,7 @@ def after_test_iter( outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples that contain annotations and predictions. """ - if self.draw is False: + if self.draw is False or outputs is None: return if self.test_out_dir is not None: @@ -200,8 +201,8 @@ def after_test_iter( self, runner: Runner, batch_idx: int, - data_batch: dict, - outputs: Sequence[DetDataSample], + data_batch: dict | tuple | list | None = None, + outputs: Sequence[DetDataSample] | None = None, ) -> None: """Run after every testing iterations. @@ -212,7 +213,7 @@ def after_test_iter( outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples that contain annotations and predictions. 
""" - if self.draw is False: + if self.draw is False or outputs is None: return if self.test_out_dir is not None: @@ -236,20 +237,32 @@ def after_test_iter( text = data_sample.text if isinstance(text, str): # VG gt_instances = data_sample.gt_instances + if gt_instances is None: + continue tokens_positive = data_sample.tokens_positive if "phrase_ids" in data_sample: # flickr30k gt_labels = data_sample.phrase_ids else: gt_labels = gt_instances.labels - gt_bboxes = gt_instances.get("bboxes", None) - if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes): - gt_instances.bboxes = gt_bboxes.tensor + gt_bboxes_raw = gt_instances.get("bboxes", None) + if gt_bboxes_raw is not None and isinstance(gt_bboxes_raw, BaseBoxes): + gt_instances.bboxes = gt_bboxes_raw.tensor + gt_bboxes = gt_bboxes_raw.tensor + else: + gt_bboxes = gt_bboxes_raw print(gt_labels, tokens_positive, gt_bboxes, img_path) pred_instances = data_sample.pred_instances + if pred_instances is None: + continue pred_instances = pred_instances[pred_instances.scores > self.score_thr] pred_labels = pred_instances.labels - pred_bboxes = pred_instances.bboxes + pred_bboxes_raw = pred_instances.bboxes + # Convert BaseBoxes to tensor + if isinstance(pred_bboxes_raw, BaseBoxes): + pred_bboxes = pred_bboxes_raw.tensor + else: + pred_bboxes = pred_bboxes_raw pred_scores = pred_instances.scores max_label = 0 @@ -311,6 +324,9 @@ def after_test_iter( self._visualizer.draw_bboxes(bbox, edge_colors=color, alpha=1) print(pred_labels, pred_bboxes, pred_scores, colors) areas = (pred_bboxes[:, 3] - pred_bboxes[:, 1]) * (pred_bboxes[:, 2] - pred_bboxes[:, 0]) + # Convert to numpy if it's a tensor + if isinstance(areas, torch.Tensor): + areas = areas.cpu().numpy() scales = _get_adaptive_scales(areas) score = [str(round(s.item(), 2)) for s in pred_scores] font_sizes = [int(13 * scales[i]) for i in range(len(scales))] @@ -347,7 +363,7 @@ def after_test_iter( if out_file is not None: imwrite(drawn_img[..., ::-1], out_file) else: 
- self.add_image("test_img", drawn_img, self._test_index) + self._visualizer.add_image("test_img", drawn_img, self._test_index) else: # OD self._visualizer.add_datasample( osp.basename(img_path) if self.show else "test_img", diff --git a/visdet/engine/optim/optimizer/builder.py b/visdet/engine/optim/optimizer/builder.py index 5f0ee77f..db42b40c 100644 --- a/visdet/engine/optim/optimizer/builder.py +++ b/visdet/engine/optim/optimizer/builder.py @@ -52,7 +52,7 @@ def register_dadaptation_optimizers() -> list[str]: """ dadaptation_optimizers = [] try: - import dadaptation + import dadaptation # type: ignore[import-untyped] except ImportError: pass else: @@ -75,7 +75,7 @@ def register_lion_optimizers() -> list[str]: """ optimizers = [] try: - from lion_pytorch import Lion + from lion_pytorch import Lion # type: ignore[import-untyped] except ImportError: pass else: @@ -95,7 +95,7 @@ def register_sophia_optimizers() -> list[str]: """ optimizers = [] try: - import Sophia + import Sophia # type: ignore[import-untyped] except ImportError: pass else: @@ -122,7 +122,7 @@ def register_bitsandbytes_optimizers() -> list[str]: """ dadaptation_optimizers = [] try: - import bitsandbytes as bnb + import bitsandbytes as bnb # type: ignore[import-untyped] except ImportError: # bitsandbytes is an optional dependency return dadaptation_optimizers @@ -150,7 +150,7 @@ def register_bitsandbytes_optimizers() -> list[str]: def register_transformers_optimizers(): transformer_optimizers = [] try: - from transformers import Adafactor + from transformers import Adafactor # type: ignore[import-untyped] except ImportError: pass else: diff --git a/visdet/engine/optim/optimizer/default_constructor.py b/visdet/engine/optim/optimizer/default_constructor.py index 9cdbbde2..8b827baf 100644 --- a/visdet/engine/optim/optimizer/default_constructor.py +++ b/visdet/engine/optim/optimizer/default_constructor.py @@ -271,7 +271,8 @@ def add_params( def __call__(self, model: nn.Module): # -> OptimWrapper: if 
hasattr(model, "module"): - model = model.module + # DistributedDataParallel wraps the model in a .module attribute + model = model.module # type: ignore[assignment] optim_wrapper_cfg = self.optim_wrapper_cfg.copy() optim_wrapper_cfg.setdefault("type", "OptimWrapper") diff --git a/visdet/engine/structures/__init__.py b/visdet/engine/structures/__init__.py index bf9532c0..98f214f9 100644 --- a/visdet/engine/structures/__init__.py +++ b/visdet/engine/structures/__init__.py @@ -1,5 +1,3 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. from visdet.engine.structures.base_data_element import BaseDataElement from visdet.engine.structures.instance_data import InstanceData diff --git a/visdet/engine/structures/base_data_element.py b/visdet/engine/structures/base_data_element.py index 58e6b38f..9d9eff01 100644 --- a/visdet/engine/structures/base_data_element.py +++ b/visdet/engine/structures/base_data_element.py @@ -1,13 +1,13 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. import copy from collections.abc import Iterator -from typing import Any +from typing import Any, TypeVar import numpy as np import torch +T = TypeVar("T", bound="BaseDataElement") + class BaseDataElement: """A base data interface that supports Tensor-like and dict-like @@ -210,16 +210,16 @@ class BaseDataElement: ... 
det_sample.proposals = torch.rand((5, 4)) """ - def __init__(self, *, metainfo: dict | None = None, **kwargs) -> None: - self._metainfo_fields: set = set() - self._data_fields: set = set() + def __init__(self, *, metainfo: dict[str, Any] | None = None, **kwargs: Any) -> None: + self._metainfo_fields: set[str] = set() + self._data_fields: set[str] = set() if metainfo is not None: self.set_metainfo(metainfo=metainfo) if kwargs: self.set_data(kwargs) - def set_metainfo(self, metainfo: dict) -> None: + def set_metainfo(self, metainfo: dict[str, Any]) -> None: """Set or change key-value pairs in ``metainfo_field`` by parameter ``metainfo``. @@ -232,7 +232,7 @@ def set_metainfo(self, metainfo: dict) -> None: for k, v in meta.items(): self.set_field(name=k, value=v, field_type="metainfo", dtype=None) - def set_data(self, data: dict) -> None: + def set_data(self, data: dict[str, Any]) -> None: """Set or change key-value pairs in ``data_field`` by parameter ``data``. @@ -258,7 +258,7 @@ def update(self, instance: "BaseDataElement") -> None: self.set_metainfo(dict(instance.metainfo_items())) self.set_data(dict(instance.items())) - def new(self, *, metainfo: dict | None = None, **kwargs) -> "BaseDataElement": + def new(self: T, *, metainfo: dict[str, Any] | None = None, **kwargs: Any) -> T: """Return a new data element with same type. If ``metainfo`` and ``data`` are None, the new data element will have same metainfo and data. If metainfo or data is not None, the new result will overwrite it @@ -284,9 +284,9 @@ def new(self, *, metainfo: dict | None = None, **kwargs) -> "BaseDataElement": new_data.set_data(kwargs) else: new_data.set_data(dict(self.items())) - return new_data + return new_data # type: ignore[return-value] - def clone(self): + def clone(self: T) -> T: """Deep copy the current data element. 
Returns: @@ -295,9 +295,9 @@ def clone(self): clone_data = self.__class__() clone_data.set_metainfo(dict(self.metainfo_items())) clone_data.set_data(dict(self.items())) - return clone_data + return clone_data # type: ignore[return-value] - def keys(self) -> list: + def keys(self) -> list[str]: """ Returns: list: Contains all keys in data_fields. @@ -309,35 +309,35 @@ def keys(self) -> list: private_keys = {"_" + key for key in self._data_fields if isinstance(getattr(type(self), key, None), property)} return list(self._data_fields - private_keys) - def metainfo_keys(self) -> list: + def metainfo_keys(self) -> list[str]: """ Returns: list: Contains all keys in metainfo_fields. """ return list(self._metainfo_fields) - def values(self) -> list: + def values(self) -> list[Any]: """ Returns: list: Contains all values in data. """ return [getattr(self, k) for k in self.keys()] - def metainfo_values(self) -> list: + def metainfo_values(self) -> list[Any]: """ Returns: list: Contains all values in metainfo. """ return [getattr(self, k) for k in self.metainfo_keys()] - def all_keys(self) -> list: + def all_keys(self) -> list[str]: """ Returns: list: Contains all keys in metainfo and data. """ return self.metainfo_keys() + self.keys() - def all_values(self) -> list: + def all_values(self) -> list[Any]: """ Returns: list: Contains all values in metainfo and data. @@ -372,7 +372,7 @@ def metainfo_items(self) -> Iterator[tuple[str, Any]]: yield (k, getattr(self, k)) @property - def metainfo(self) -> dict: + def metainfo(self) -> dict[str, Any]: """dict: A dict contains metainfo of current data element.""" return dict(self.metainfo_items()) @@ -403,31 +403,35 @@ def __delattr__(self, item: str): # dict-like methods __delitem__ = __delattr__ - def get(self, key, default=None) -> Any: + def get(self, key: str, default: Any = None) -> Any: """Get property in data and metainfo as the same as python.""" # Use `getattr()` rather than `self.__dict__.get()` to allow getting # properties. 
return getattr(self, key, default) - def pop(self, *args) -> Any: + def pop(self, key: str, default: Any = ...) -> Any: """Pop property in data and metainfo as the same as python.""" - assert len(args) < 3, "``pop`` get more than 2 arguments" - name = args[0] - if name in self._metainfo_fields: - self._metainfo_fields.remove(args[0]) - return self.__dict__.pop(*args) + if key in self._metainfo_fields: + self._metainfo_fields.remove(key) + if default is ...: + return self.__dict__.pop(key) + else: + return self.__dict__.pop(key, default) - elif name in self._data_fields: - self._data_fields.remove(args[0]) - return self.__dict__.pop(*args) + elif key in self._data_fields: + self._data_fields.remove(key) + if default is ...: + return self.__dict__.pop(key) + else: + return self.__dict__.pop(key, default) # with default value - elif len(args) == 2: - return args[1] + elif default is not ...: + return default else: # don't just use 'self.__dict__.pop(*args)' for only popping key in # metainfo or data - raise KeyError(f"{args[0]} is not contained in metainfo or data") + raise KeyError(f"{key} is not contained in metainfo or data") def __contains__(self, item: str) -> bool: """Whether the item is in dataelement. 
@@ -465,7 +469,7 @@ def set_field( super().__setattr__(name, value) # Tensor-like methods - def to(self, *args, **kwargs) -> "BaseDataElement": + def to(self: T, *args: Any, **kwargs: Any) -> T: """Apply same name function to all tensors in data_fields.""" new_data = self.new() for k, v in self.items(): @@ -476,7 +480,7 @@ def to(self, *args, **kwargs) -> "BaseDataElement": return new_data # Tensor-like methods - def cpu(self) -> "BaseDataElement": + def cpu(self: T) -> T: """Convert all tensors to CPU in data.""" new_data = self.new() for k, v in self.items(): @@ -487,7 +491,7 @@ def cpu(self) -> "BaseDataElement": return new_data # Tensor-like methods - def cuda(self) -> "BaseDataElement": + def cuda(self: T) -> T: """Convert all tensors to GPU in data.""" new_data = self.new() for k, v in self.items(): @@ -498,60 +502,83 @@ def cuda(self) -> "BaseDataElement": return new_data # Tensor-like methods - def musa(self) -> "BaseDataElement": + def musa(self: T) -> T: """Convert all tensors to musa in data.""" new_data = self.new() for k, v in self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.musa() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + if hasattr(v, "musa"): + v = v.musa() # type: ignore[attr-defined] + data = {k: v} + new_data.set_data(data) return new_data # Tensor-like methods - def npu(self) -> "BaseDataElement": + def npu(self: T) -> T: """Convert all tensors to NPU in data.""" new_data = self.new() for k, v in self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.npu() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + if hasattr(v, "npu"): + v = v.npu() # type: ignore[attr-defined] + data = {k: v} + new_data.set_data(data) return new_data - def mlu(self) -> "BaseDataElement": + def mlu(self: T) -> T: """Convert all tensors to MLU in data.""" new_data = self.new() for k, v in 
self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.mlu() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + if hasattr(v, "mlu"): + v = v.mlu() # type: ignore[attr-defined] + data = {k: v} + new_data.set_data(data) return new_data # Tensor-like methods - def detach(self) -> "BaseDataElement": + def detach(self: T) -> T: """Detach all tensors in data.""" new_data = self.new() for k, v in self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.detach() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + v = v.detach() # type: ignore[misc] + data = {k: v} + new_data.set_data(data) return new_data # Tensor-like methods - def numpy(self) -> "BaseDataElement": + def numpy(self: T) -> T: """Convert all tensors to np.ndarray in data.""" new_data = self.new() for k, v in self.items(): - if isinstance(v, torch.Tensor | BaseDataElement): + if isinstance(v, BaseDataElement): v = v.detach().cpu().numpy() data = {k: v} new_data.set_data(data) + elif isinstance(v, torch.Tensor): + v = v.detach().cpu().numpy() # type: ignore[misc] + data = {k: v} + new_data.set_data(data) return new_data - def to_tensor(self) -> "BaseDataElement": + def to_tensor(self: T) -> T: """Convert all np.ndarray to tensor in data.""" new_data = self.new() for k, v in self.items(): @@ -565,7 +592,7 @@ def to_tensor(self) -> "BaseDataElement": new_data.set_data(data) return new_data - def to_dict(self) -> dict: + def to_dict(self) -> dict[str, Any]: """Convert BaseDataElement to dict.""" return {k: v.to_dict() if isinstance(v, BaseDataElement) else v for k, v in self.all_items()} diff --git a/visdet/engine/structures/instance_data.py b/visdet/engine/structures/instance_data.py index ad2a2c2f..f93fd78a 100644 --- a/visdet/engine/structures/instance_data.py +++ b/visdet/engine/structures/instance_data.py @@ -1,34 +1,35 @@ -# ruff: noqa -# type: ignore # 
Copyright (c) OpenMMLab. All rights reserved. import itertools from collections.abc import Sized -from typing import Any, Union +from typing import TYPE_CHECKING, Any, Union, overload import numpy as np import torch from visdet.engine.device import get_device - from visdet.engine.structures.base_data_element import BaseDataElement -BoolTypeTensor: Any -LongTypeTensor: Any +if TYPE_CHECKING: + from visdet.structures.bbox import BaseBoxes + from visdet.structures.mask import BitmapMasks, PolygonMasks + +BoolTypeTensor: type[torch.Tensor] +LongTypeTensor: type[torch.Tensor] if get_device() == "npu": - BoolTypeTensor = Union[torch.BoolTensor, torch.npu.BoolTensor] - LongTypeTensor = Union[torch.LongTensor, torch.npu.LongTensor] + BoolTypeTensor = Union[torch.BoolTensor, torch.npu.BoolTensor] # type: ignore[misc,assignment,name-defined] + LongTypeTensor = Union[torch.LongTensor, torch.npu.LongTensor] # type: ignore[misc,assignment,name-defined] elif get_device() == "mlu": - BoolTypeTensor = Union[torch.BoolTensor, torch.mlu.BoolTensor] - LongTypeTensor = Union[torch.LongTensor, torch.mlu.LongTensor] + BoolTypeTensor = Union[torch.BoolTensor, torch.mlu.BoolTensor] # type: ignore[misc,assignment,name-defined] + LongTypeTensor = Union[torch.LongTensor, torch.mlu.LongTensor] # type: ignore[misc,assignment,name-defined] elif get_device() == "musa": - BoolTypeTensor = Union[torch.BoolTensor, torch.musa.BoolTensor] - LongTypeTensor = Union[torch.LongTensor, torch.musa.LongTensor] + BoolTypeTensor = Union[torch.BoolTensor, torch.musa.BoolTensor] # type: ignore[misc,assignment,name-defined] + LongTypeTensor = Union[torch.LongTensor, torch.musa.LongTensor] # type: ignore[misc,assignment,name-defined] else: - BoolTypeTensor = Union[torch.BoolTensor, torch.cuda.BoolTensor] - LongTypeTensor = Union[torch.LongTensor, torch.cuda.LongTensor] + BoolTypeTensor = Union[torch.BoolTensor, torch.cuda.BoolTensor] # type: ignore[misc,assignment,name-defined] + LongTypeTensor = 
Union[torch.LongTensor, torch.cuda.LongTensor] # type: ignore[misc,assignment,name-defined] -IndexType: Any = Union[str, slice, int, list, LongTypeTensor, BoolTypeTensor, np.ndarray] +IndexType = Union[str, slice, int, list[int], torch.Tensor, np.ndarray] # Modified from @@ -169,7 +170,6 @@ def __getitem__(self, item: IndexType) -> "InstanceData": Returns: :obj:`InstanceData`: Corresponding values. """ - assert isinstance(item, IndexType.__args__) if isinstance(item, list): item = np.array(item) if isinstance(item, np.ndarray): @@ -181,10 +181,10 @@ def __getitem__(self, item: IndexType) -> "InstanceData": item = torch.from_numpy(item) if isinstance(item, str): - return getattr(self, item) + return getattr(self, item) # type: ignore[return-value] if isinstance(item, int): - if item >= len(self) or item < -len(self): # type:ignore + if item >= len(self) or item < -len(self): raise IndexError(f"Index {item} out of range!") else: # keep the dimension @@ -193,7 +193,9 @@ def __getitem__(self, item: IndexType) -> "InstanceData": new_data = self.__class__(metainfo=self.metainfo) if isinstance(item, torch.Tensor): assert item.dim() == 1, "Only support to get the values along the first dimension." 
- if isinstance(item, BoolTypeTensor.__args__): + # Check if it's a boolean tensor + is_bool_tensor = item.dtype == torch.bool + if is_bool_tensor: assert len(item) == len(self), ( "The shape of the " "input(BoolTensor) " @@ -212,14 +214,14 @@ def __getitem__(self, item: IndexType) -> "InstanceData": new_data[k] = v[item.cpu().numpy()] elif isinstance(v, str | list | tuple) or (hasattr(v, "__getitem__") and hasattr(v, "cat")): # convert to indexes from BoolTensor - if isinstance(item, BoolTypeTensor.__args__): + if is_bool_tensor: indexes = torch.nonzero(item).view(-1).cpu().numpy().tolist() else: indexes = item.cpu().numpy().tolist() slice_list = [] if indexes: for index in indexes: - slice_list.append(slice(index, None, len(v))) + slice_list.append(slice(index, None, len(v))) # type: ignore[arg-type] else: slice_list.append(slice(None, 0, None)) r_list = [v[s] for s in slice_list] @@ -228,7 +230,7 @@ def __getitem__(self, item: IndexType) -> "InstanceData": for r in r_list[1:]: new_value = new_value + r else: - new_value = v.cat(r_list) + new_value = v.cat(r_list) # type: ignore[attr-defined] new_data[k] = new_value else: raise ValueError( @@ -239,7 +241,7 @@ def __getitem__(self, item: IndexType) -> "InstanceData": # item is a slice for k, v in self.items(): new_data[k] = v[item] - return new_data # type:ignore + return new_data @staticmethod def cat(instances_list: list["InstanceData"]) -> "InstanceData": @@ -276,22 +278,24 @@ def cat(instances_list: list["InstanceData"]) -> "InstanceData": new_data = instances_list[0].__class__(metainfo=instances_list[0].metainfo) for k in instances_list[0].keys(): - values = [results[k] for results in instances_list] + values: list[Any] = [results[k] for results in instances_list] v0 = values[0] - if isinstance(v0, torch.Tensor): - new_values = torch.cat(values, dim=0) - elif isinstance(v0, np.ndarray): - new_values = np.concatenate(values, axis=0) - elif isinstance(v0, str | list | tuple): + new_values: Any + # Use explicit 
type checking instead of isinstance to avoid mypy narrowing issues + if type(v0).__name__ == "Tensor" or isinstance(v0, torch.Tensor): + new_values = torch.cat(values, dim=0) # type: ignore[arg-type] + elif type(v0).__name__ == "ndarray" or isinstance(v0, np.ndarray): + new_values = np.concatenate(values, axis=0) # type: ignore[arg-type] + elif isinstance(v0, (str, list, tuple)): new_values = v0[:] for v in values[1:]: - new_values += v + new_values += v # type: ignore[operator] elif hasattr(v0, "cat"): - new_values = v0.cat(values) + new_values = v0.cat(values) # type: ignore[attr-defined] else: raise ValueError(f"The type of `{k}` is `{type(v0)}` which has no attribute of `cat`") new_data[k] = new_values - return new_data # type:ignore + return new_data def __len__(self) -> int: """int: The length of InstanceData.""" @@ -299,3 +303,14 @@ def __len__(self) -> int: return len(self.values()[0]) else: return 0 + + # Provide type hints for commonly accessed dynamic attributes + if TYPE_CHECKING: + # These are the most commonly accessed attributes in visualization code + bboxes: torch.Tensor | "BaseBoxes" + labels: torch.Tensor + scores: torch.Tensor + masks: torch.Tensor | "BitmapMasks" | "PolygonMasks" + label_names: list[str] + priors: torch.Tensor # Used in dense heads for anchor-based detection + level_ids: torch.Tensor # Used to track which FPN level each instance belongs to diff --git a/visdet/engine/structures/label_data.py b/visdet/engine/structures/label_data.py index a21f2424..a8bf9407 100644 --- a/visdet/engine/structures/label_data.py +++ b/visdet/engine/structures/label_data.py @@ -1,5 +1,3 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
import torch diff --git a/visdet/engine/structures/pixel_data.py b/visdet/engine/structures/pixel_data.py index dbf4de61..0a24f8cc 100644 --- a/visdet/engine/structures/pixel_data.py +++ b/visdet/engine/structures/pixel_data.py @@ -1,8 +1,7 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. import warnings from collections.abc import Sequence +from typing import TYPE_CHECKING, Any, overload import numpy as np import torch @@ -95,10 +94,12 @@ def __getitem__(self, item: Sequence[int | slice]) -> "PixelData": new_data = self.__class__(metainfo=self.metainfo) if isinstance(item, tuple): assert len(item) == 2, "Only support to slice height and width" + shape = self.shape + assert shape is not None, "Cannot slice PixelData with no data fields" tmp_item: list[slice] = [] for index, single_item in enumerate(item[::-1]): if isinstance(single_item, int): - tmp_item.insert(0, slice(single_item, None, self.shape[-index - 1])) + tmp_item.insert(0, slice(single_item, None, shape[-index - 1])) elif isinstance(single_item, slice): tmp_item.insert(0, single_item) else: @@ -112,11 +113,19 @@ def __getitem__(self, item: Sequence[int | slice]) -> "PixelData": return new_data @property - def shape(self): + def shape(self) -> tuple[int, int] | None: """The shape of pixel data.""" if len(self._data_fields) > 0: - return tuple(self.values()[0].shape[-2:]) + first_value = self.values()[0] + if isinstance(first_value, (torch.Tensor, np.ndarray)): + return tuple(first_value.shape[-2:]) # type: ignore[return-value] + return None else: return None + # Provide specific type hints for common attributes + if TYPE_CHECKING: + # These are the most commonly accessed attributes in visualization code + sem_seg: torch.Tensor | np.ndarray + # TODO padding, resize diff --git a/visdet/engine/utils/dl_utils/collect_env.py b/visdet/engine/utils/dl_utils/collect_env.py index 1558618b..ab51157a 100644 --- a/visdet/engine/utils/dl_utils/collect_env.py +++ 
b/visdet/engine/utils/dl_utils/collect_env.py @@ -38,7 +38,7 @@ def collect_env(): cuda_available = is_cuda_available() env_info["CUDA available"] = cuda_available - env_info["numpy_random_seed"] = np.random.get_state()[1][0] + env_info["numpy_random_seed"] = np.random.get_state()[1][0] # type: ignore[misc] if cuda_available: devices = defaultdict(list) @@ -48,7 +48,7 @@ def collect_env(): env_info["GPU " + ",".join(device_ids)] = name env_info["PyTorch"] = torch.__version__ - env_info["TorchVision"] = torchvision.__version__ + env_info["TorchVision"] = torchvision.__version__ # type: ignore[attr-defined] env_info["OpenCV"] = cv2.__version__ env_info["VisEngine"] = visengine_version diff --git a/visdet/engine/visualization/visualizer.py b/visdet/engine/visualization/visualizer.py index e3ea53b0..ba5e3caf 100644 --- a/visdet/engine/visualization/visualizer.py +++ b/visdet/engine/visualization/visualizer.py @@ -197,7 +197,7 @@ def __init__( if ( save_dir_arg is not None and save_dir_arg.default is save_dir_arg.empty - and vis_backend._save_dir is None + and getattr(vis_backend, "_save_dir", None) is None ): warnings.warn(f"Failed to add {vis_backend.__class__}, please provide the `save_dir` argument.") continue @@ -271,7 +271,7 @@ def show( # Find a better way for inline to show the image if is_inline: - return fig + return fig # type: ignore[return-value] wait_continue(fig, timeout=wait_time, continue_key=continue_key) elif backend == "cv2": # Keep images are shown in the same window, and the title of window @@ -383,7 +383,7 @@ def _is_posion_valid(self, position: np.ndarray) -> bool: Returns: bool: Whether the position is in image. 
""" - flag = ( + flag = bool( (position[..., 0] < self.width).all() and (position[..., 0] >= 0).all() and (position[..., 1] < self.height).all() @@ -694,7 +694,7 @@ def draw_circles( face_colors = color_val_matplotlib(face_colors) # type: ignore circles = [] for i in range(len(center)): - circles.append(Circle(tuple(center[i]), radius[i])) + circles.append(Circle(tuple(center[i]), float(radius[i]))) if isinstance(line_widths, (int, float)): line_widths = [line_widths] * len(circles) @@ -776,7 +776,7 @@ def draw_bboxes( ).reshape(-1, 4, 2) poly = [p for p in poly] return self.draw_polygons( - poly, + poly, # type: ignore[arg-type] alpha=alpha, edge_colors=edge_colors, line_styles=line_styles, @@ -833,12 +833,14 @@ def draw_polygons( polygons = [polygons] if isinstance(polygons, list): for polygon in polygons: - assert polygon.shape[1] == 2, ( - f"The shape of each polygon in `polygons` should be (M, 2), but got {polygon.shape}" - ) + # Type narrowing: polygon is either np.ndarray or torch.Tensor here + if isinstance(polygon, (np.ndarray, torch.Tensor)): + assert polygon.shape[1] == 2, ( # type: ignore + f"The shape of each polygon in `polygons` should be (M, 2), but got {polygon.shape}" + ) polygons = [tensor2ndarray(polygon) for polygon in polygons] for polygon in polygons: - if not self._is_posion_valid(polygon): + if not self._is_posion_valid(tensor2ndarray(polygon)): warnings.warn( "Warning: The polygon is out of bounds, the drawn polygon may not be in the image", UserWarning, @@ -914,7 +916,7 @@ def draw_binary_masks( for channel in color: assert 0 <= channel <= 255 # type: ignore - if isinstance(alphas, float): + if isinstance(alphas, (int, float)): alphas = [alphas] * binary_mask_len for binary_mask, color, alpha in zip(binary_masks, colors, alphas, strict=False): @@ -991,7 +993,7 @@ def draw_featmap( assert isinstance(featmap, torch.Tensor), f"`featmap` should be torch.Tensor, but got {type(featmap)}" assert featmap.ndim == 3, f"Input dimension must be 3, 
but got {featmap.ndim}" - featmap = featmap.detach().cpu() + featmap = featmap.detach().cpu() # type: ignore[misc] if overlaid_image is not None: if overlaid_image.ndim == 2: @@ -1062,7 +1064,7 @@ def draw_featmap( axes.axis("off") axes.text(2, 15, f"channel: {indices[i]}", fontsize=10) axes.imshow(convert_overlay_heatmap(topk_featmap[i], overlaid_image, alpha)) - image = img_from_canvas(fig.canvas) + image = img_from_canvas(fig.canvas) # type: ignore[arg-type] plt.close(fig) return image @@ -1131,14 +1133,15 @@ def add_scalars(self, scalar_dict: dict, step: int = 0, file_path: str | None = @master_only def add_datasample( self, - name, + name: str, image: np.ndarray, data_sample: Optional["BaseDataElement"] = None, draw_gt: bool = True, draw_pred: bool = True, show: bool = False, - wait_time: int = 0, + wait_time: int | float = 0, step: int = 0, + **kwargs, # Allow subclasses to add extra arguments like pred_score_thr, out_file, etc. ) -> None: """Draw datasample.""" pass @@ -1181,5 +1184,8 @@ def get_instance(cls, name: str, **kwargs) -> "Visualizer": object: Corresponding name instance. 
""" instance = super().get_instance(name, **kwargs) - Visualizer._instance_dict[name] = instance + # Store instance in the class-level dict for get_current_instance() + if not hasattr(Visualizer, "_instance_dict"): + Visualizer._instance_dict = {} # type: ignore[attr-defined] + Visualizer._instance_dict[name] = instance # type: ignore[attr-defined,index,assignment] return instance diff --git a/visdet/evaluation/metrics/coco_metric.py b/visdet/evaluation/metrics/coco_metric.py index d19a390d..4479d7e0 100644 --- a/visdet/evaluation/metrics/coco_metric.py +++ b/visdet/evaluation/metrics/coco_metric.py @@ -78,8 +78,8 @@ def __init__( metric_items: Sequence[str] | None = None, format_only: bool = False, outfile_prefix: str | None = None, - file_client_args: dict = None, - backend_args: dict = None, + file_client_args: dict | None = None, + backend_args: dict | None = None, collect_device: str = "cpu", prefix: str | None = None, sort_categories: bool = False, @@ -106,7 +106,7 @@ def __init__( # iou_thrs used to compute recall or precision. 
if iou_thrs is None: iou_thrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True) - self.iou_thrs = iou_thrs + self.iou_thrs: Sequence[float] = iou_thrs if isinstance(iou_thrs, Sequence) else [iou_thrs] self.metric_items = metric_items self.format_only = format_only if self.format_only: @@ -143,8 +143,8 @@ def __init__( self._coco_api = None # handle dataset lazy init - self.cat_ids = None - self.img_ids = None + self.cat_ids: list[int] | None = None + self.img_ids: list[int] | None = None def fast_eval_recall( self, @@ -167,6 +167,8 @@ def fast_eval_recall( """ gt_bboxes = [] pred_bboxes = [result["bboxes"] for result in results] + assert self.img_ids is not None, "img_ids must be initialized" + assert self._coco_api is not None, "coco_api must be initialized" for i in range(len(self.img_ids)): ann_ids = self._coco_api.get_ann_ids(img_ids=self.img_ids[i]) ann_info = self._coco_api.load_anns(ann_ids) @@ -240,6 +242,7 @@ def results2json(self, results: Sequence[dict], outfile_prefix: str) -> dict: data["image_id"] = image_id data["bbox"] = self.xyxy2xywh(bboxes[i]) data["score"] = float(scores[i]) + assert self.cat_ids is not None, "cat_ids must be initialized" data["category_id"] = self.cat_ids[label] bbox_json_results.append(data) @@ -254,6 +257,7 @@ def results2json(self, results: Sequence[dict], outfile_prefix: str) -> dict: data["image_id"] = image_id data["bbox"] = self.xyxy2xywh(bboxes[i]) data["score"] = float(mask_scores[i]) + assert self.cat_ids is not None, "cat_ids must be initialized" data["category_id"] = self.cat_ids[label] if isinstance(masks[i]["counts"], bytes): masks[i]["counts"] = masks[i]["counts"].decode() @@ -282,6 +286,7 @@ def gt_to_coco_json(self, gt_dicts: Sequence[dict], outfile_prefix: str) -> str: Returns: str: The filename of the json file. 
""" + assert self.dataset_meta is not None, "dataset_meta must be initialized" categories = [dict(id=id, name=name) for id, name in enumerate(self.dataset_meta["classes"])] image_infos = [] annotations = [] @@ -411,6 +416,8 @@ def compute_metrics(self, results: list) -> dict[str, float]: self._coco_api = COCO(coco_json_path) # handle lazy init + assert self._coco_api is not None, "coco_api must be initialized" + assert self.dataset_meta is not None, "dataset_meta must be initialized" if self.cat_ids is None: self.cat_ids = self._coco_api.get_cat_ids(cat_names=self.dataset_meta["classes"]) if self.img_ids is None: @@ -516,9 +523,14 @@ def compute_metrics(self, results: list) -> dict[str, float]: if self.classwise: # Compute per-category AP # Compute per-category AP # from https://github.com/facebookresearch/detectron2/ - precisions = coco_eval.eval["precision"] + precisions_raw = coco_eval.eval["precision"] # precision: (iou, recall, cls, area range, max dets) - assert len(self.cat_ids) == precisions.shape[2] + assert isinstance(precisions_raw, np.ndarray), "precisions must be ndarray" + precisions: np.ndarray = precisions_raw + assert self.cat_ids is not None, "cat_ids must be initialized" + # Type narrowing for ndarray shape attribute + precisions_shape: tuple[int, ...] = precisions.shape # type: ignore[assignment] + assert len(self.cat_ids) == precisions_shape[2] results_per_category = [] for idx, cat_id in enumerate(self.cat_ids): diff --git a/visdet/models/backbones/hrnet.py b/visdet/models/backbones/hrnet.py index 902a5e85..f988d374 100644 --- a/visdet/models/backbones/hrnet.py +++ b/visdet/models/backbones/hrnet.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from typing import Any import torch.nn as nn from torch.nn.modules.batchnorm import _BatchNorm @@ -17,6 +18,17 @@ class HRModule(BaseModule): is in this module. 
""" + block_init_cfg: dict[str, Any] | None + in_channels: list[int] + num_branches: int + multiscale_output: bool + norm_cfg: dict[str, Any] + conv_cfg: dict[str, Any] | None + with_cp: bool + branches: ModuleList + fuse_layers: nn.ModuleList | None + relu: nn.ReLU + def __init__( self, num_branches, @@ -32,16 +44,16 @@ def __init__( init_cfg=None, ): super(HRModule, self).__init__(init_cfg) - self.block_init_cfg = block_init_cfg + self.block_init_cfg = block_init_cfg # type: ignore[unresolved-attribute] self._check_branches(num_branches, num_blocks, in_channels, num_channels) self.in_channels = in_channels self.num_branches = num_branches - self.multiscale_output = multiscale_output - self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg - self.with_cp = with_cp + self.multiscale_output = multiscale_output # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] self.branches = self._make_branches(num_branches, blocks, num_blocks, num_channels) self.fuse_layers = self._make_fuse_layers() self.relu = nn.ReLU(inplace=False) @@ -187,6 +199,7 @@ def forward(self, x): x[i] = self.branches[i](x[i]) x_fuse = [] + assert self.fuse_layers is not None for i in range(len(self.fuse_layers)): y = 0 for j in range(self.num_branches): @@ -276,6 +289,30 @@ class HRNet(BaseModule): blocks_dict = {"BASIC": BasicBlock, "BOTTLENECK": Bottleneck} + pretrained: str | None + extra: dict[str, Any] + conv_cfg: dict[str, Any] | None + norm_cfg: dict[str, Any] + norm_eval: bool + with_cp: bool + zero_init_residual: bool + norm1_name: str + norm2_name: str + conv1: nn.Module + conv2: nn.Module + relu: nn.ReLU + layer1: Sequential + stage1_cfg: dict[str, Any] + stage2_cfg: dict[str, Any] + stage3_cfg: dict[str, Any] + stage4_cfg: dict[str, Any] + transition1: nn.ModuleList + transition2: nn.ModuleList + transition3: 
nn.ModuleList + stage2: Sequential + stage3: Sequential + stage4: Sequential + def __init__( self, extra, @@ -291,14 +328,14 @@ def __init__( ): super(HRNet, self).__init__(init_cfg) - self.pretrained = pretrained + self.pretrained = pretrained # type: ignore[unresolved-attribute] assert not (init_cfg and pretrained), "init_cfg and pretrained cannot be specified at the same time" if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, please use "init_cfg" instead') - self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) + self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) # type: ignore[unresolved-attribute] elif pretrained is None: if init_cfg is None: - self.init_cfg = [ + self.init_cfg = [ # type: ignore[unresolved-attribute] dict(type="Kaiming", layer="Conv2d"), dict(type="Constant", val=1, layer=["_BatchNorm", "GroupNorm"]), ] @@ -314,15 +351,15 @@ def __init__( assert len(cfg["num_blocks"]) == cfg["num_branches"] and len(cfg["num_channels"]) == cfg["num_branches"] self.extra = extra - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - self.zero_init_residual = zero_init_residual + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.norm_eval = norm_eval # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.zero_init_residual = zero_init_residual # type: ignore[unresolved-attribute] # stem net - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) - self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) # type: ignore[unresolved-attribute] + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( self.conv_cfg, diff --git 
a/visdet/models/backbones/regnet.py b/visdet/models/backbones/regnet.py index 546008de..84cf0f59 100644 --- a/visdet/models/backbones/regnet.py +++ b/visdet/models/backbones/regnet.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from typing import Any import numpy as np import torch.nn as nn @@ -79,6 +80,11 @@ class RegNet(ResNet): "regnetx_12gf": dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), } + bottleneck_ratio: list[float] + stage_widths: list[int] + group_widths: list[int] + plugins: Any + def __init__( self, arch, @@ -122,41 +128,41 @@ def __init__( stage_widths, stage_blocks = self.get_stages_from_blocks(widths) # Generate group widths and bot muls group_widths = [arch["group_w"] for _ in range(num_stages)] - self.bottleneck_ratio = [arch["bot_mul"] for _ in range(num_stages)] + self.bottleneck_ratio = [arch["bot_mul"] for _ in range(num_stages)] # type: ignore[unresolved-attribute] # Adjust the compatibility of stage_widths and group_widths stage_widths, group_widths = self.adjust_width_group(stage_widths, self.bottleneck_ratio, group_widths) # Group params by stage - self.stage_widths = stage_widths - self.group_widths = group_widths - self.depth = sum(stage_blocks) - self.stem_channels = stem_channels - self.base_channels = base_channels - self.num_stages = num_stages + self.stage_widths = stage_widths # type: ignore[unresolved-attribute] + self.group_widths = group_widths # type: ignore[unresolved-attribute] + self.depth = sum(stage_blocks) # type: ignore[unresolved-attribute] + self.stem_channels = stem_channels # type: ignore[unresolved-attribute] + self.base_channels = base_channels # type: ignore[unresolved-attribute] + self.num_stages = num_stages # type: ignore[unresolved-attribute] assert num_stages >= 1 and num_stages <= 4 - self.strides = strides - self.dilations = dilations + self.strides = strides # type: ignore[unresolved-attribute] + self.dilations = dilations # type: 
ignore[unresolved-attribute] assert len(strides) == len(dilations) == num_stages - self.out_indices = out_indices + self.out_indices = out_indices # type: ignore[unresolved-attribute] assert max(out_indices) < num_stages - self.style = style - self.deep_stem = deep_stem - self.avg_down = avg_down - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - self.norm_eval = norm_eval - self.dcn = dcn - self.stage_with_dcn = stage_with_dcn + self.style = style # type: ignore[unresolved-attribute] + self.deep_stem = deep_stem # type: ignore[unresolved-attribute] + self.avg_down = avg_down # type: ignore[unresolved-attribute] + self.frozen_stages = frozen_stages # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.norm_eval = norm_eval # type: ignore[unresolved-attribute] + self.dcn = dcn # type: ignore[unresolved-attribute] + self.stage_with_dcn = stage_with_dcn # type: ignore[unresolved-attribute] if dcn is not None: assert len(stage_with_dcn) == num_stages - self.plugins = plugins - self.zero_init_residual = zero_init_residual - self.block = Bottleneck + self.plugins = plugins # type: ignore[unresolved-attribute] + self.zero_init_residual = zero_init_residual # type: ignore[unresolved-attribute] + self.block = Bottleneck # type: ignore[unresolved-attribute] expansion_bak = self.block.expansion self.block.expansion = 1 - self.stage_blocks = stage_blocks[:num_stages] + self.stage_blocks = stage_blocks[:num_stages] # type: ignore[unresolved-attribute] self._make_stem_layer(in_channels, stem_channels) @@ -164,10 +170,10 @@ def __init__( assert not (init_cfg and pretrained), "init_cfg and pretrained cannot be specified at the same time" if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, please 
use "init_cfg" instead') - self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) + self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) # type: ignore[unresolved-attribute] elif pretrained is None: if init_cfg is None: - self.init_cfg = [ + self.init_cfg = [ # type: ignore[unresolved-attribute] dict(type="Kaiming", layer="Conv2d"), dict(type="Constant", val=1, layer=["_BatchNorm", "GroupNorm"]), ] @@ -176,8 +182,8 @@ def __init__( else: raise TypeError("pretrained must be a str or None") - self.inplanes = stem_channels - self.res_layers = [] + self.inplanes = stem_channels # type: ignore[unresolved-attribute] + self.res_layers = [] # type: ignore[unresolved-attribute] for i, num_blocks in enumerate(self.stage_blocks): stride = self.strides[i] dilation = self.dilations[i] @@ -187,7 +193,7 @@ def __init__( dcn = self.dcn if self.stage_with_dcn[i] else None if self.plugins is not None: - stage_plugins = self.make_stage_plugins(self.plugins, i) + stage_plugins = self.make_stage_plugins(self.plugins, i) # type: ignore[call-non-callable] else: stage_plugins = None @@ -210,14 +216,14 @@ def __init__( base_channels=self.stage_widths[i], init_cfg=block_init_cfg, ) - self.inplanes = self.stage_widths[i] + self.inplanes = self.stage_widths[i] # type: ignore[unresolved-attribute] layer_name = f"layer{i + 1}" self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) self._freeze_stages() - self.feat_dim = stage_widths[-1] + self.feat_dim = stage_widths[-1] # type: ignore[unresolved-attribute] self.block.expansion = expansion_bak def _make_stem_layer(self, in_channels, base_channels): @@ -230,7 +236,7 @@ def _make_stem_layer(self, in_channels, base_channels): padding=1, bias=False, ) - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, base_channels, postfix=1) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, base_channels, postfix=1) # type: ignore[unresolved-attribute] self.add_module(self.norm1_name, norm1) self.relu = 
nn.ReLU(inplace=True) diff --git a/visdet/models/backbones/res2net.py b/visdet/models/backbones/res2net.py index bb1c2e08..e9815aa0 100644 --- a/visdet/models/backbones/res2net.py +++ b/visdet/models/backbones/res2net.py @@ -42,8 +42,8 @@ def __init__( assert scales > 1, "Res2Net degenerates to ResNet when scales = 1." width = int(math.floor(self.planes * (base_width / 64))) - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width * scales, postfix=1) - self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width * scales, postfix=1) # type: ignore[unresolved-attribute] + self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( self.conv_cfg, @@ -87,9 +87,9 @@ def __init__( ) self.add_module(self.norm3_name, norm3) - self.stage_type = stage_type - self.scales = scales - self.width = width + self.stage_type = stage_type # type: ignore[unresolved-attribute] + self.scales = scales # type: ignore[unresolved-attribute] + self.width = width # type: ignore[unresolved-attribute] # Remove conv2 since we replaced it with multi-scale convs delattr(self, "conv2") @@ -291,8 +291,8 @@ def __init__( avg_down=True, **kwargs, ): - self.scales = scales - self.base_width = base_width + self.scales = scales # type: ignore[unresolved-attribute] + self.base_width = base_width # type: ignore[unresolved-attribute] super(Res2Net, self).__init__( depth=depth, deep_stem=deep_stem, diff --git a/visdet/models/backbones/resnest.py b/visdet/models/backbones/resnest.py index 7f213605..64780ad7 100644 --- a/visdet/models/backbones/resnest.py +++ b/visdet/models/backbones/resnest.py @@ -75,8 +75,8 @@ def __init__( ): super(SplitAttentionConv2d, self).__init__(init_cfg) inter_channels = max(in_channels * radix // reduction_factor, 32) - self.radix = radix - self.groups = groups + 
self.radix = radix # type: ignore[unresolved-attribute] + self.groups = groups # type: ignore[unresolved-attribute] self.channels = channels self.conv = build_conv_layer( conv_cfg, @@ -90,11 +90,11 @@ def __init__( bias=False, ) # To be consistent with original implementation, starting from 0 - self.norm0_name, norm0 = build_norm_layer(norm_cfg, channels * radix, postfix=0) + self.norm0_name, norm0 = build_norm_layer(norm_cfg, channels * radix, postfix=0) # type: ignore[unresolved-attribute] self.add_module(self.norm0_name, norm0) self.relu = nn.ReLU(inplace=True) self.fc1 = build_conv_layer(None, channels, inter_channels, 1, groups=self.groups) - self.norm1_name, norm1 = build_norm_layer(norm_cfg, inter_channels, postfix=1) + self.norm1_name, norm1 = build_norm_layer(norm_cfg, inter_channels, postfix=1) # type: ignore[unresolved-attribute] self.add_module(self.norm1_name, norm1) self.fc2 = build_conv_layer(None, inter_channels, channels * radix, 1, groups=self.groups) self.rsoftmax = RSoftmax(radix, groups) @@ -177,10 +177,10 @@ def __init__( else: width = math.floor(self.planes * (base_width / base_channels)) * groups - self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 # type: ignore[unresolved-attribute] - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width, postfix=1) - self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width, postfix=1) # type: ignore[unresolved-attribute] + self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( self.conv_cfg, @@ -290,11 +290,11 @@ def __init__( avg_down=True, **kwargs, ): - self.groups = groups - self.base_width = base_width - self.radix = radix - self.reduction_factor = reduction_factor - self.avg_down_stride = 
avg_down_stride + self.groups = groups # type: ignore[unresolved-attribute] + self.base_width = base_width # type: ignore[unresolved-attribute] + self.radix = radix # type: ignore[unresolved-attribute] + self.reduction_factor = reduction_factor # type: ignore[unresolved-attribute] + self.avg_down_stride = avg_down_stride # type: ignore[unresolved-attribute] super(ResNeSt, self).__init__( depth=depth, deep_stem=deep_stem, diff --git a/visdet/models/backbones/resnet.py b/visdet/models/backbones/resnet.py index 8de2bff3..f0a5c2af 100644 --- a/visdet/models/backbones/resnet.py +++ b/visdet/models/backbones/resnet.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging import warnings +from typing import Any import torch.nn as nn import torch.utils.checkpoint as cp @@ -34,6 +35,16 @@ class BasicBlock(BaseModule): expansion = 1 + norm1_name: str + norm2_name: str + conv1: nn.Module + conv2: nn.Module + relu: nn.ReLU + downsample: nn.Module | None + stride: int + dilation: int + with_cp: bool + def __init__( self, inplanes, @@ -53,8 +64,8 @@ def __init__( assert dcn is None, "DCN is not supported in BasicBlock" assert plugins is None, "Plugins are not supported yet" - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) # type: ignore[unresolved-attribute] + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( conv_cfg, @@ -71,10 +82,10 @@ def __init__( self.add_module(self.norm2_name, norm2) self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.with_cp = with_cp + self.downsample = downsample # type: ignore[unresolved-attribute] + self.stride = stride # type: ignore[unresolved-attribute] + self.dilation = dilation # type: 
ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] @property def norm1(self): @@ -136,6 +147,27 @@ class Bottleneck(BaseModule): expansion = 4 + inplanes: int + planes: int + stride: int + dilation: int + style: str + with_cp: bool + conv_cfg: dict[str, Any] | None + norm_cfg: dict[str, Any] + dcn: dict[str, Any] | None + with_dcn: bool + conv1_stride: int + conv2_stride: int + norm1_name: str + norm2_name: str + norm3_name: str + conv1: nn.Module + conv2: nn.Module + conv3: nn.Module + relu: nn.ReLU + downsample: nn.Module | None + def __init__( self, inplanes, @@ -161,27 +193,27 @@ def __init__( assert dcn is None or isinstance(dcn, dict) assert plugins is None, "Plugins are not supported yet" - self.inplanes = inplanes - self.planes = planes - self.stride = stride - self.dilation = dilation - self.style = style - self.with_cp = with_cp - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.dcn = dcn - self.with_dcn = dcn is not None + self.inplanes = inplanes # type: ignore[unresolved-attribute] + self.planes = planes # type: ignore[unresolved-attribute] + self.stride = stride # type: ignore[unresolved-attribute] + self.dilation = dilation # type: ignore[unresolved-attribute] + self.style = style # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.dcn = dcn # type: ignore[unresolved-attribute] + self.with_dcn = dcn is not None # type: ignore[unresolved-attribute] if self.style == "pytorch": - self.conv1_stride = 1 - self.conv2_stride = stride + self.conv1_stride = 1 # type: ignore[unresolved-attribute] + self.conv2_stride = stride # type: ignore[unresolved-attribute] else: - self.conv1_stride = stride - self.conv2_stride = 1 + self.conv1_stride = stride # type: ignore[unresolved-attribute] + self.conv2_stride = 1 # type: 
ignore[unresolved-attribute] - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) - self.norm3_name, norm3 = build_norm_layer(norm_cfg, planes * self.expansion, postfix=3) + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) # type: ignore[unresolved-attribute] + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) # type: ignore[unresolved-attribute] + self.norm3_name, norm3 = build_norm_layer(norm_cfg, planes * self.expansion, postfix=3) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( conv_cfg, @@ -215,7 +247,7 @@ def __init__( self.add_module(self.norm3_name, norm3) self.relu = nn.ReLU(inplace=True) - self.downsample = downsample + self.downsample = downsample # type: ignore[unresolved-attribute] @property def norm1(self): @@ -324,6 +356,35 @@ class ResNet(BaseModule): 152: (Bottleneck, (3, 8, 36, 3)), } + zero_init_residual: bool + depth: int + stem_channels: int + base_channels: int + num_stages: int + strides: tuple[int, ...] + dilations: tuple[int, ...] + out_indices: tuple[int, ...] + style: str + deep_stem: bool + avg_down: bool + frozen_stages: int + conv_cfg: dict[str, Any] | None + norm_cfg: dict[str, Any] + with_cp: bool + norm_eval: bool + dcn: dict[str, Any] | None + stage_with_dcn: tuple[bool, ...] + block: type[BasicBlock] | type[Bottleneck] + stage_blocks: tuple[int, ...] 
+ inplanes: int + res_layers: list[str] + feat_dim: int + stem: nn.Sequential + conv1: nn.Module + norm1_name: str + relu: nn.ReLU + maxpool: nn.MaxPool2d + def __init__( self, depth, @@ -349,7 +410,7 @@ def __init__( init_cfg=None, ): super(ResNet, self).__init__(init_cfg=init_cfg) - self.zero_init_residual = zero_init_residual + self.zero_init_residual = zero_init_residual # type: ignore[unresolved-attribute] if depth not in self.arch_settings: raise KeyError(f"invalid depth {depth} for resnet") @@ -357,10 +418,10 @@ def __init__( assert not (init_cfg and pretrained), "init_cfg and pretrained cannot be specified at the same time" if isinstance(pretrained, str): warnings.warn("DeprecationWarning: pretrained is deprecated, please use 'init_cfg' instead") - self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) + self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) # type: ignore[unresolved-attribute] elif pretrained is None: if init_cfg is None: - self.init_cfg = [ + self.init_cfg = [ # type: ignore[unresolved-attribute] dict(type="Kaiming", layer="Conv2d"), dict(type="Constant", val=1, layer=["_BatchNorm", "GroupNorm"]), ] @@ -373,37 +434,37 @@ def __init__( else: raise TypeError("pretrained must be a str or None") - self.depth = depth + self.depth = depth # type: ignore[unresolved-attribute] if stem_channels is None: stem_channels = base_channels - self.stem_channels = stem_channels - self.base_channels = base_channels - self.num_stages = num_stages + self.stem_channels = stem_channels # type: ignore[unresolved-attribute] + self.base_channels = base_channels # type: ignore[unresolved-attribute] + self.num_stages = num_stages # type: ignore[unresolved-attribute] assert num_stages >= 1 and num_stages <= 4 - self.strides = strides - self.dilations = dilations + self.strides = strides # type: ignore[unresolved-attribute] + self.dilations = dilations # type: ignore[unresolved-attribute] assert len(strides) == len(dilations) == num_stages - 
self.out_indices = out_indices + self.out_indices = out_indices # type: ignore[unresolved-attribute] assert max(out_indices) < num_stages - self.style = style - self.deep_stem = deep_stem - self.avg_down = avg_down - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - self.norm_eval = norm_eval - self.dcn = dcn - self.stage_with_dcn = stage_with_dcn + self.style = style # type: ignore[unresolved-attribute] + self.deep_stem = deep_stem # type: ignore[unresolved-attribute] + self.avg_down = avg_down # type: ignore[unresolved-attribute] + self.frozen_stages = frozen_stages # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.norm_eval = norm_eval # type: ignore[unresolved-attribute] + self.dcn = dcn # type: ignore[unresolved-attribute] + self.stage_with_dcn = stage_with_dcn # type: ignore[unresolved-attribute] if dcn is not None: assert len(stage_with_dcn) == num_stages - self.block, stage_blocks = self.arch_settings[depth] - self.stage_blocks = stage_blocks[:num_stages] - self.inplanes = stem_channels + self.block, stage_blocks = self.arch_settings[depth] # type: ignore[unresolved-attribute] + self.stage_blocks = stage_blocks[:num_stages] # type: ignore[unresolved-attribute] + self.inplanes = stem_channels # type: ignore[unresolved-attribute] self._make_stem_layer(in_channels, stem_channels) - self.res_layers = [] + self.res_layers = [] # type: ignore[unresolved-attribute] for i, num_blocks in enumerate(self.stage_blocks): stride = strides[i] dilation = dilations[i] @@ -424,14 +485,14 @@ def __init__( dcn=dcn, init_cfg=block_init_cfg, ) - self.inplanes = planes * self.block.expansion + self.inplanes = planes * self.block.expansion # type: ignore[unresolved-attribute] layer_name = f"layer{i + 1}" self.add_module(layer_name, 
res_layer) self.res_layers.append(layer_name) self._freeze_stages() - self.feat_dim = self.block.expansion * base_channels * 2 ** (len(self.stage_blocks) - 1) + self.feat_dim = self.block.expansion * base_channels * 2 ** (len(self.stage_blocks) - 1) # type: ignore[unresolved-attribute] def make_res_layer(self, **kwargs): """Pack all blocks in a stage into a ``ResLayer``.""" @@ -489,7 +550,7 @@ def _make_stem_layer(self, in_channels, stem_channels): padding=3, bias=False, ) - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, stem_channels, postfix=1) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, stem_channels, postfix=1) # type: ignore[unresolved-attribute] self.add_module(self.norm1_name, norm1) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) diff --git a/visdet/models/backbones/resnext.py b/visdet/models/backbones/resnext.py index 87110912..25b741a8 100644 --- a/visdet/models/backbones/resnext.py +++ b/visdet/models/backbones/resnext.py @@ -19,9 +19,9 @@ class Bottleneck(_Bottleneck): def __init__(self, inplanes, planes, groups=1, base_width=4, base_channels=64, **kwargs): # Extract groups and base_width before calling parent - self.groups = groups - self.base_width = base_width - self.base_channels = base_channels + self.groups = groups # type: ignore[unresolved-attribute] + self.base_width = base_width # type: ignore[unresolved-attribute] + self.base_channels = base_channels # type: ignore[unresolved-attribute] super(Bottleneck, self).__init__(inplanes, planes, **kwargs) @@ -31,9 +31,9 @@ def __init__(self, inplanes, planes, groups=1, base_width=4, base_channels=64, * width = math.floor(self.planes * (base_width / base_channels)) * groups # Rebuild norm and conv layers with grouped convolutions - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width, postfix=1) - self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, width, postfix=2) - self.norm3_name, norm3 = 
build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width, postfix=1) # type: ignore[unresolved-attribute] + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, width, postfix=2) # type: ignore[unresolved-attribute] + self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) # type: ignore[unresolved-attribute] self.conv1 = build_conv_layer( self.conv_cfg, @@ -104,8 +104,8 @@ class ResNeXt(ResNet): } def __init__(self, groups=1, base_width=4, **kwargs): - self.groups = groups - self.base_width = base_width + self.groups = groups # type: ignore[unresolved-attribute] + self.base_width = base_width # type: ignore[unresolved-attribute] super(ResNeXt, self).__init__(**kwargs) def make_res_layer(self, **kwargs): diff --git a/visdet/models/backbones/swin.py b/visdet/models/backbones/swin.py index b53fecf8..85d1f8c2 100644 --- a/visdet/models/backbones/swin.py +++ b/visdet/models/backbones/swin.py @@ -61,14 +61,14 @@ def __init__( self.num_heads = num_heads head_embed_dims = embed_dims // num_heads self.scale = qk_scale or head_embed_dims**-0.5 - self.init_cfg = init_cfg + self.init_cfg = init_cfg # type: ignore[unresolved-attribute] if backend not in {"torch", "flash"}: raise ValueError(f"Unsupported attention backend: {backend}") # Fall back to torch if flash is not available if backend == "flash" and flash_swin_attn_func is None: - self.backend = "torch" + self.backend = "torch" # type: ignore[unresolved-attribute] else: - self.backend = backend + self.backend = backend # type: ignore[unresolved-attribute] self.head_embed_dims = head_embed_dims # define a parameter table of relative position bias @@ -89,7 +89,7 @@ def __init__( self.proj_drop = nn.Dropout(proj_drop_rate) self.softmax = nn.Softmax(dim=-1) - self._flash_fallback_warned = False + self._flash_fallback_warned = False # type: ignore[unresolved-attribute] def init_weights(self): 
trunc_normal_(self.relative_position_bias_table, std=0.02) @@ -108,7 +108,7 @@ def forward(self, x, mask=None): # make torchscript happy (cannot use tensor as tuple) q, k, v = qkv[0], qkv[1], qkv[2] - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( # type: ignore[call-non-callable,no-matching-overload] self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1, @@ -185,9 +185,9 @@ def __init__( super().__init__(init_cfg) self.window_size = window_size - self.shift_size = shift_size + self.shift_size = shift_size # type: ignore[unresolved-attribute] assert 0 <= self.shift_size < self.window_size - self.backend = backend + self.backend = backend # type: ignore[unresolved-attribute] self.w_msa = WindowMSA( embed_dims=embed_dims, @@ -349,9 +349,9 @@ def __init__( ): super(SwinBlock, self).__init__() - self.init_cfg = init_cfg - self.with_cp = with_cp - self.attn_backend = attn_backend + self.init_cfg = init_cfg # type: ignore[unresolved-attribute] + self.with_cp = with_cp # type: ignore[unresolved-attribute] + self.attn_backend = attn_backend # type: ignore[unresolved-attribute] self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] self.attn = ShiftWindowMSA( @@ -480,7 +480,7 @@ def __init__( ) self.blocks.append(block) - self.downsample = downsample + self.downsample = downsample # type: ignore[unresolved-attribute] def forward(self, x, hw_shape): for block in self.blocks: @@ -580,8 +580,8 @@ def __init__( frozen_stages=-1, init_cfg=None, ): - self.convert_weights = convert_weights - self.frozen_stages = frozen_stages + self.convert_weights = convert_weights # type: ignore[unresolved-attribute] + self.frozen_stages = frozen_stages # type: ignore[unresolved-attribute] if isinstance(pretrain_img_size, int): pretrain_img_size = to_2tuple(pretrain_img_size) elif 
isinstance(pretrain_img_size, tuple): @@ -594,17 +594,17 @@ def __init__( assert not (init_cfg and pretrained), "init_cfg and pretrained cannot be specified at the same time" if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, please use "init_cfg" instead') - self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) + self.init_cfg = dict(type="Pretrained", checkpoint=pretrained) # type: ignore[unresolved-attribute] elif pretrained is None: - self.init_cfg = init_cfg + self.init_cfg = init_cfg # type: ignore[unresolved-attribute] else: raise TypeError("pretrained must be a str or None") super(SwinTransformer, self).__init__(init_cfg=init_cfg) num_layers = len(depths) - self.out_indices = out_indices - self.use_abs_pos_embed = use_abs_pos_embed + self.out_indices = out_indices # type: ignore[unresolved-attribute] + self.use_abs_pos_embed = use_abs_pos_embed # type: ignore[unresolved-attribute] assert strides[0] == patch_size, "Use non-overlapping patch embed." 
@@ -625,7 +625,7 @@ def __init__( self.absolute_pos_embed = nn.Parameter(torch.zeros((1, num_patches, embed_dims))) self.drop_after_pos = nn.Dropout(p=drop_rate) - self.attn_backend = attn_backend + self.attn_backend = attn_backend # type: ignore[unresolved-attribute] # set stochastic depth decay rule total_depth = sum(depths) @@ -667,7 +667,7 @@ def __init__( if downsample: in_channels = downsample.out_channels - self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] + self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] # type: ignore[unresolved-attribute] # Add a norm layer for each output for i in out_indices: layer = build_norm_layer(norm_cfg, self.num_features[i])[1] @@ -715,7 +715,7 @@ def init_weights(self): assert "checkpoint" in self.init_cfg, ( f"Only support specify `Pretrained` in `init_cfg` in {self.__class__.__name__} " ) - ckpt = CheckpointLoader.load_checkpoint(self.init_cfg.checkpoint, logger=logger, map_location="cpu") + ckpt = CheckpointLoader.load_checkpoint(self.init_cfg.checkpoint, logger=logger, map_location="cpu") # type: ignore[possibly-missing-attribute] if "state_dict" in ckpt: _state_dict = ckpt["state_dict"] elif "model" in ckpt: diff --git a/visdet/models/dense_heads/anchor_head.py b/visdet/models/dense_heads/anchor_head.py index 8de7a184..583f049d 100644 --- a/visdet/models/dense_heads/anchor_head.py +++ b/visdet/models/dense_heads/anchor_head.py @@ -1,5 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from __future__ import annotations + import warnings +from collections.abc import Mapping, Sequence +from typing import Any, cast import torch import torch.nn as nn @@ -15,13 +19,7 @@ from visdet.models.utils import images_to_levels, multi_apply, unmap from visdet.registry import MODELS, TASK_UTILS from visdet.structures.bbox import BaseBoxes, cat_boxes, get_box_tensor -from visdet.utils import ( - ConfigType, - InstanceList, - OptConfigType, - OptInstanceList, - OptMultiConfig, -) +from visdet.utils import ConfigType, InstanceList, OptConfigType, OptInstanceList, OptMultiConfig @MODELS.register_module() @@ -47,6 +45,19 @@ class AnchorHead(BaseDenseHead): init_cfg (dict or list[dict], optional): Initialization config dict. """ + # Type annotations for attributes + in_channels: int + num_classes: int + feat_channels: int + reg_decoded_bbox: bool + loss_bbox: nn.Module + train_cfg: ConfigType | None + assigner: nn.Module + sampler: nn.Module + num_base_priors: int + conv_cls: nn.Module + conv_reg: nn.Module + def __init__( self, num_classes: int, @@ -72,39 +83,52 @@ def __init__( init_cfg: OptMultiConfig = dict(type="Normal", layer="Conv2d", std=0.01), ) -> None: super().__init__(init_cfg=init_cfg) - self.in_channels = in_channels - self.num_classes = num_classes - self.feat_channels = feat_channels - self.use_sigmoid_cls = loss_cls.get("use_sigmoid", False) + self.in_channels: int = in_channels # type: ignore[misc] + self.num_classes: int = num_classes # type: ignore[misc] + self.feat_channels: int = feat_channels # type: ignore[misc] + if not isinstance(loss_cls, Mapping): + raise TypeError("loss_cls config must be a mapping") + loss_cls_cfg = dict(loss_cls) + self.use_sigmoid_cls: bool = bool(loss_cls_cfg.get("use_sigmoid", False)) # type: ignore[misc] if self.use_sigmoid_cls: - self.cls_out_channels = num_classes + self.cls_out_channels = num_classes # type: ignore[assignment] else: - self.cls_out_channels = num_classes + 1 + self.cls_out_channels = num_classes 
+ 1 # type: ignore[assignment] if self.cls_out_channels <= 0: raise ValueError(f"num_classes={num_classes} is too small") - self.reg_decoded_bbox = reg_decoded_bbox - - self.bbox_coder = TASK_UTILS.build(bbox_coder) - self.loss_cls = MODELS.build(loss_cls) - self.loss_bbox = MODELS.build(loss_bbox) - self.train_cfg = train_cfg - self.test_cfg = test_cfg - if self.train_cfg: - self.assigner = TASK_UTILS.build(self.train_cfg["assigner"]) - if train_cfg.get("sampler", None) is not None: - self.sampler = TASK_UTILS.build(self.train_cfg["sampler"], default_args=dict(context=self)) + self.reg_decoded_bbox = reg_decoded_bbox # type: ignore[assignment] + + if ( + not isinstance(bbox_coder, Mapping) + or not isinstance(loss_bbox, Mapping) + or not isinstance(anchor_generator, Mapping) + ): + raise TypeError("bbox_coder, loss_bbox and anchor_generator configs must be mappings") + self.bbox_coder = TASK_UTILS.build(dict(bbox_coder)) + self.loss_cls = MODELS.build(dict(loss_cls)) + self.loss_bbox = MODELS.build(dict(loss_bbox)) + self.train_cfg: ConfigType | None = dict(train_cfg) if isinstance(train_cfg, Mapping) else train_cfg # type: ignore[misc] + self.test_cfg = dict(test_cfg) if isinstance(test_cfg, Mapping) else test_cfg # type: ignore[assignment] + if isinstance(self.train_cfg, Mapping): + assigner_cfg = self.train_cfg["assigner"] + if not isinstance(assigner_cfg, Mapping): + raise TypeError("assigner cfg must be a mapping") + self.assigner = TASK_UTILS.build(dict(assigner_cfg)) + sampler_cfg = self.train_cfg.get("sampler") + if isinstance(sampler_cfg, Mapping): + self.sampler = TASK_UTILS.build(dict(sampler_cfg), default_args=dict(context=self)) # type: ignore[assignment] else: - self.sampler = PseudoSampler(context=self) + self.sampler = PseudoSampler(context=self) # type: ignore[assignment] - self.fp16_enabled = False + self.fp16_enabled: bool = False # type: ignore[misc] - self.prior_generator = TASK_UTILS.build(anchor_generator) + self.prior_generator: 
AnchorGenerator = cast(AnchorGenerator, TASK_UTILS.build(dict(anchor_generator))) # Usually the numbers of anchors for each level are the same # except SSD detectors. So it is an int in the most dense # heads but a list of int in SSDHead - self.num_base_priors = self.prior_generator.num_base_priors[0] + self.num_base_priors: int = int(self.prior_generator.num_base_priors[0]) # type: ignore[misc] self._init_layers() @property @@ -121,9 +145,10 @@ def anchor_generator(self) -> AnchorGenerator: def _init_layers(self) -> None: """Initialize layers of the head.""" - self.conv_cls = nn.Conv2d(self.in_channels, self.num_base_priors * self.cls_out_channels, 1) - reg_dim = self.bbox_coder.encode_size - self.conv_reg = nn.Conv2d(self.in_channels, self.num_base_priors * reg_dim, 1) + self.conv_cls = nn.Conv2d(self.in_channels, self.num_base_priors * self.cls_out_channels, 1) # type: ignore[assignment] + reg_dim = self.bbox_coder.encode_size # type: ignore[attr-defined] + assert isinstance(reg_dim, int), "reg_dim must be an integer" + self.conv_reg = nn.Conv2d(self.in_channels, self.num_base_priors * reg_dim, 1) # type: ignore[assignment] def forward_single(self, x: Tensor) -> tuple[Tensor, Tensor]: """Forward feature of a single scale level. @@ -142,7 +167,7 @@ def forward_single(self, x: Tensor) -> tuple[Tensor, Tensor]: bbox_pred = self.conv_reg(x) return cls_score, bbox_pred - def forward(self, x: tuple[Tensor]) -> tuple[list[Tensor]]: + def forward(self, x: tuple[Tensor]) -> tuple[list[Tensor], list[Tensor]]: """Forward features from the upstream network. Args: @@ -159,11 +184,12 @@ def forward(self, x: tuple[Tensor]) -> tuple[list[Tensor]]: scale levels, each is a 4D-tensor, the channels number \ is num_base_priors * 4. 
""" - return multi_apply(self.forward_single, x) + cls_scores, bbox_preds = multi_apply(self.forward_single, x) + return list(cls_scores), list(bbox_preds) def get_anchors( self, - featmap_sizes: list[tuple], + featmap_sizes: Sequence[tuple[int, int] | torch.Size], batch_img_metas: list[dict], device: torch.device | str = "cuda", ) -> tuple[list[list[Tensor]], list[list[Tensor]]]: @@ -182,17 +208,23 @@ def get_anchors( - valid_flag_list (list[list[Tensor]]): Valid flags of each image. """ + normalized_sizes: list[tuple[int, int]] = [] + for size in featmap_sizes: + h, w = size[:2] + normalized_sizes.append((int(h), int(w))) num_imgs = len(batch_img_metas) # since feature map sizes of all images are the same, we only compute # anchors for one time - multi_level_anchors = self.prior_generator.grid_priors(featmap_sizes, device=device) + # PyTorch stubs incorrectly type AnchorGenerator methods as Tensor (not callable) + multi_level_anchors = self.prior_generator.grid_priors(normalized_sizes, device=device) # type: ignore[call-non-callable] anchor_list = [multi_level_anchors for _ in range(num_imgs)] # for each image, we compute valid flags of multi level anchors valid_flag_list = [] for img_id, img_meta in enumerate(batch_img_metas): - multi_level_flags = self.prior_generator.valid_flags(featmap_sizes, img_meta["pad_shape"], device) + # PyTorch stubs incorrectly type AnchorGenerator.valid_flags as Tensor (not callable) + multi_level_flags = self.prior_generator.valid_flags(normalized_sizes, img_meta["pad_shape"], device) # type: ignore[call-non-callable] valid_flag_list.append(multi_level_flags) return anchor_list, valid_flag_list @@ -238,11 +270,16 @@ def _get_targets_single( - neg_inds (Tensor): negative samples indexes. - sampling_result (:obj:`SamplingResult`): Sampling results. 
""" + assert self.train_cfg is not None, "train_cfg must be set for training" + train_cfg = self.train_cfg + if not isinstance(train_cfg, Mapping): + raise TypeError("train_cfg must be a mapping when training") + allowed_border = int(train_cfg.get("allowed_border", 0)) inside_flags = anchor_inside_flags( flat_anchors, valid_flags, img_meta["img_shape"][:2], - self.train_cfg["allowed_border"], + allowed_border, ) if not inside_flags.any(): raise ValueError( @@ -254,13 +291,16 @@ def _get_targets_single( anchors = flat_anchors[inside_flags] pred_instances = InstanceData(priors=anchors) - assign_result = self.assigner.assign(pred_instances, gt_instances, gt_instances_ignore) + assign_result = self.assigner.assign(pred_instances, gt_instances, gt_instances_ignore) # type: ignore[attr-defined,call-arg] # No sampling is required except for RPN and # Guided Anchoring algorithms - sampling_result = self.sampler.sample(assign_result, pred_instances, gt_instances) + sampling_result = self.sampler.sample(assign_result, pred_instances, gt_instances) # type: ignore[attr-defined,call-arg] - num_valid_anchors = anchors.shape[0] - target_dim = gt_instances.bboxes.size(-1) if self.reg_decoded_bbox else self.bbox_coder.encode_size + num_valid_anchors = int(anchors.shape[0]) + encode_size = self.bbox_coder.encode_size # type: ignore[attr-defined] + assert isinstance(encode_size, int), "encode_size must be an integer" + target_dim_raw = gt_instances.bboxes.size(-1) if self.reg_decoded_bbox else encode_size + target_dim = int(target_dim_raw) # Convert to int to satisfy type checker bbox_targets = anchors.new_zeros(num_valid_anchors, target_dim) bbox_weights = anchors.new_zeros(num_valid_anchors, target_dim) @@ -275,7 +315,7 @@ def _get_targets_single( # box type `pos_bbox_targets` to tensor. 
if len(pos_inds) > 0: if not self.reg_decoded_bbox: - pos_bbox_targets = self.bbox_coder.encode(sampling_result.pos_priors, sampling_result.pos_gt_bboxes) + pos_bbox_targets = self.bbox_coder.encode(sampling_result.pos_priors, sampling_result.pos_gt_bboxes) # type: ignore[attr-defined,call-arg] else: pos_bbox_targets = sampling_result.pos_gt_bboxes pos_bbox_targets = get_box_tensor(pos_bbox_targets) @@ -283,16 +323,17 @@ def _get_targets_single( bbox_weights[pos_inds, :] = 1.0 labels[pos_inds] = sampling_result.pos_gt_labels - if self.train_cfg["pos_weight"] <= 0: + pos_weight = float(train_cfg.get("pos_weight", 0)) + if pos_weight <= 0: label_weights[pos_inds] = 1.0 else: - label_weights[pos_inds] = self.train_cfg["pos_weight"] + label_weights[pos_inds] = pos_weight if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 # map up to original set of anchors if unmap_outputs: - num_total_anchors = flat_anchors.size(0) + num_total_anchors = int(flat_anchors.size(0)) labels = unmap(labels, num_total_anchors, inside_flags, fill=self.num_classes) # fill bg label label_weights = unmap(label_weights, num_total_anchors, inside_flags) bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) @@ -367,7 +408,9 @@ def get_targets( assert len(anchor_list) == len(valid_flag_list) == num_imgs if batch_gt_instances_ignore is None: - batch_gt_instances_ignore = [None] * num_imgs + gt_instances_ignore_list: list[InstanceData | None] = [None for _ in range(num_imgs)] + else: + gt_instances_ignore_list = list(batch_gt_instances_ignore) # anchor number of multi levels num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] @@ -376,7 +419,7 @@ def get_targets( concat_valid_flag_list = [] for i in range(num_imgs): assert len(anchor_list[i]) == len(valid_flag_list[i]) - concat_anchor_list.append(cat_boxes(anchor_list[i])) + concat_anchor_list.append(cat_boxes(anchor_list[i])) # type: ignore[arg-type] concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) # compute 
targets for each image @@ -386,7 +429,7 @@ def get_targets( concat_valid_flag_list, batch_gt_instances, batch_img_metas, - batch_gt_instances_ignore, + gt_instances_ignore_list, unmap_outputs=unmap_outputs, ) ( @@ -469,13 +512,15 @@ def loss_by_feat_single( target_dim = bbox_targets.size(-1) bbox_targets = bbox_targets.reshape(-1, target_dim) bbox_weights = bbox_weights.reshape(-1, target_dim) - bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, self.bbox_coder.encode_size) + encode_size = self.bbox_coder.encode_size # type: ignore[attr-defined] + assert isinstance(encode_size, int), "encode_size must be an integer" + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, encode_size) if self.reg_decoded_bbox: # When the regression loss (e.g. `IouLoss`, `GIouLoss`) # is applied directly on the decoded bounding boxes, it # decodes the already encoded coordinates to absolute format. anchors = anchors.reshape(-1, anchors.size(-1)) - bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) # type: ignore[attr-defined,call-arg] bbox_pred = get_box_tensor(bbox_pred) loss_bbox = self.loss_bbox(bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) return loss_cls, loss_bbox @@ -535,7 +580,7 @@ def loss_by_feat( # concat all level anchors and flags to a single tensor concat_anchor_list = [] for i in range(len(anchor_list)): - concat_anchor_list.append(cat_boxes(anchor_list[i])) + concat_anchor_list.append(cat_boxes(anchor_list[i])) # type: ignore[arg-type] all_anchor_list = images_to_levels(concat_anchor_list, num_level_anchors) losses_cls, losses_bbox = multi_apply( diff --git a/visdet/models/dense_heads/base_dense_head.py b/visdet/models/dense_heads/base_dense_head.py index 0a993edb..337df274 100644 --- a/visdet/models/dense_heads/base_dense_head.py +++ b/visdet/models/dense_heads/base_dense_head.py @@ -2,9 +2,10 @@ import copy from abc import ABCMeta, abstractmethod from inspect import signature +from 
typing import Any, Mapping, Sequence, cast import torch -from torch import Tensor +from torch import Tensor, nn from visdet.cv.ops import batched_nms from visdet.engine.config import ConfigDict @@ -13,8 +14,8 @@ from visdet.models.test_time_augs import merge_aug_results from visdet.models.utils import filter_scores_and_topk, select_single_mlvl, unpack_gt_instances from visdet.structures import SampleList -from visdet.structures.bbox import cat_boxes, get_box_tensor, get_box_wh, scale_boxes -from visdet.utils import InstanceList, OptMultiConfig +from visdet.structures.bbox import BaseBoxes, cat_boxes, get_box_tensor, get_box_wh, scale_boxes +from visdet.utils import InstanceList class BaseDenseHead(BaseModule, metaclass=ABCMeta): @@ -54,11 +55,20 @@ class BaseDenseHead(BaseModule, metaclass=ABCMeta): loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat() """ - def __init__(self, init_cfg: OptMultiConfig = None) -> None: + # Type annotations for attributes set in subclasses + prior_generator: nn.Module + bbox_coder: nn.Module + loss_cls: nn.Module + use_sigmoid_cls: bool + cls_out_channels: int + test_cfg: ConfigDict | None + _raw_positive_infos: dict[str, Any] + + def __init__(self, init_cfg: dict[str, Any] | list[dict[str, Any]] | None = None) -> None: super().__init__(init_cfg=init_cfg) # `_raw_positive_infos` will be used in `get_positive_infos`, which # can get positive information. - self._raw_positive_infos = dict() + self._raw_positive_infos = {} # type: ignore[assignment] def init_weights(self) -> None: """Initialize the weights.""" @@ -69,7 +79,7 @@ def init_weights(self) -> None: if hasattr(m, "conv_offset"): constant_init(m.conv_offset, 0) - def get_positive_infos(self) -> InstanceList: + def get_positive_infos(self) -> InstanceList | None: """Get positive information from sampling results. 
Returns: @@ -83,7 +93,7 @@ def get_positive_infos(self) -> InstanceList: sampling_results = self._raw_positive_infos.get("sampling_results", None) assert sampling_results is not None positive_infos = [] - for sampling_result in enumerate(sampling_results): + for _, sampling_result in enumerate(sampling_results): pos_info = InstanceData() pos_info.bboxes = sampling_result.pos_gt_bboxes pos_info.labels = sampling_result.pos_gt_labels @@ -163,7 +173,17 @@ def loss_and_predict( ) losses = self.loss_by_feat(*loss_inputs) - predictions = self.predict_by_feat(*outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg) + # Unpack outs explicitly - forward returns (cls_scores, bbox_preds) or (cls_scores, bbox_preds, score_factors) + if len(outs) == 2: + cls_scores, bbox_preds = outs + predictions = self.predict_by_feat( + cls_scores, bbox_preds, batch_img_metas=batch_img_metas, cfg=proposal_cfg + ) + else: + cls_scores, bbox_preds, score_factors = outs + predictions = self.predict_by_feat( + cls_scores, bbox_preds, score_factors, batch_img_metas=batch_img_metas, cfg=proposal_cfg + ) return losses, predictions def predict(self, x: tuple[Tensor], batch_data_samples: SampleList, rescale: bool = False) -> InstanceList: @@ -187,7 +207,15 @@ def predict(self, x: tuple[Tensor], batch_data_samples: SampleList, rescale: boo outs = self(x) - predictions = self.predict_by_feat(*outs, batch_img_metas=batch_img_metas, rescale=rescale) + # Unpack outs explicitly - forward returns (cls_scores, bbox_preds) or (cls_scores, bbox_preds, score_factors) + if len(outs) == 2: + cls_scores, bbox_preds = outs + predictions = self.predict_by_feat(cls_scores, bbox_preds, batch_img_metas=batch_img_metas, rescale=rescale) + else: + cls_scores, bbox_preds, score_factors = outs + predictions = self.predict_by_feat( + cls_scores, bbox_preds, score_factors, batch_img_metas=batch_img_metas, rescale=rescale + ) return predictions def predict_by_feat( @@ -239,6 +267,7 @@ def predict_by_feat( the last dimension 
4 arrange as (x1, y1, x2, y2). """ assert len(cls_scores) == len(bbox_preds) + assert batch_img_metas is not None, "batch_img_metas must be provided" if score_factors is None: # e.g. Retina, FreeAnchor, Foveabox, etc. @@ -251,7 +280,9 @@ def predict_by_feat( num_levels = len(cls_scores) featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] - mlvl_priors = self.prior_generator.grid_priors( + # Type narrow prior_generator - it's a Module with grid_priors method + assert hasattr(self.prior_generator, "grid_priors"), "prior_generator must have grid_priors method" + mlvl_priors = self.prior_generator.grid_priors( # type: ignore[call-arg] featmap_sizes, dtype=cls_scores[0].dtype, device=cls_scores[0].device ) @@ -262,9 +293,15 @@ def predict_by_feat( cls_score_list = select_single_mlvl(cls_scores, img_id, detach=True) bbox_pred_list = select_single_mlvl(bbox_preds, img_id, detach=True) if with_score_factors: - score_factor_list = select_single_mlvl(score_factors, img_id, detach=True) + score_factor_raw = select_single_mlvl(score_factors, img_id, detach=True) + score_factor_list = cast(list[Tensor | None], list(score_factor_raw)) else: - score_factor_list = [None for _ in range(num_levels)] + empty_factors: list[Tensor | None] = [None for _ in range(num_levels)] + score_factor_list = empty_factors + + # Use test_cfg if cfg is not provided + effective_cfg = cfg if cfg is not None else self.test_cfg + assert effective_cfg is not None, "Either cfg or self.test_cfg must be provided" results = self._predict_by_feat_single( cls_score_list=cls_score_list, @@ -272,7 +309,7 @@ def predict_by_feat( score_factor_list=score_factor_list, mlvl_priors=mlvl_priors, img_meta=img_meta, - cfg=cfg, + cfg=effective_cfg, rescale=rescale, with_nms=with_nms, ) @@ -283,7 +320,7 @@ def _predict_by_feat_single( self, cls_score_list: list[Tensor], bbox_pred_list: list[Tensor], - score_factor_list: list[Tensor], + score_factor_list: list[Tensor | None], mlvl_priors: list[Tensor], 
img_meta: dict, cfg: ConfigDict, @@ -329,7 +366,7 @@ def _predict_by_feat_single( - bboxes (Tensor): Has a shape (num_instances, 4), the last dimension 4 arrange as (x1, y1, x2, y2). """ - if score_factor_list[0] is None: + if not score_factor_list or score_factor_list[0] is None: # e.g. Retina, FreeAnchor, etc. with_score_factors = False else: @@ -346,7 +383,7 @@ def _predict_by_feat_single( mlvl_scores = [] mlvl_labels = [] if with_score_factors: - mlvl_score_factors = [] + mlvl_score_factors: list[Tensor] | None = [] else: mlvl_score_factors = None for level_idx, (cls_score, bbox_pred, score_factor, priors) in enumerate( @@ -354,9 +391,11 @@ def _predict_by_feat_single( ): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] - dim = self.bbox_coder.encode_size + dim_raw = getattr(self.bbox_coder, "encode_size", 4) + dim = int(dim_raw) bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) if with_score_factors: + assert score_factor is not None score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid() cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.cls_out_channels) @@ -364,7 +403,7 @@ def _predict_by_feat_single( # CrossEntropyCustomLoss and FocalCustomLoss, and is currently used # in v3det. 
if getattr(self.loss_cls, "custom_cls_channels", False): - scores = self.loss_cls.get_activation(cls_score) + scores = self.loss_cls.get_activation(cls_score) # type: ignore[attr-defined,call-arg] elif self.use_sigmoid_cls: scores = cls_score.sigmoid() else: @@ -383,10 +422,12 @@ def _predict_by_feat_single( results = filter_scores_and_topk(scores, score_thr, nms_pre, dict(bbox_pred=bbox_pred, priors=priors)) scores, labels, keep_idxs, filtered_results = results + assert isinstance(filtered_results, dict) bbox_pred = filtered_results["bbox_pred"] priors = filtered_results["priors"] if with_score_factors: + assert score_factor is not None score_factor = score_factor[keep_idxs] mlvl_bbox_preds.append(bbox_pred) @@ -394,18 +435,23 @@ def _predict_by_feat_single( mlvl_scores.append(scores) mlvl_labels.append(labels) - if with_score_factors: + if with_score_factors and mlvl_score_factors is not None: + assert score_factor is not None mlvl_score_factors.append(score_factor) bbox_pred = torch.cat(mlvl_bbox_preds) - priors = cat_boxes(mlvl_valid_priors) - bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + if mlvl_valid_priors and isinstance(mlvl_valid_priors[0], BaseBoxes): + priors = cat_boxes(mlvl_valid_priors) # type: ignore[arg-type] + else: + priors = torch.cat(mlvl_valid_priors) # type: ignore[arg-type] + # Type narrow bbox_coder - it's a Module with decode method + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) # type: ignore[attr-defined,call-arg] results = InstanceData() results.bboxes = bboxes results.scores = torch.cat(mlvl_scores) results.labels = torch.cat(mlvl_labels) - if with_score_factors: + if with_score_factors and mlvl_score_factors is not None: results.score_factors = torch.cat(mlvl_score_factors) return self._bbox_post_process( @@ -453,8 +499,11 @@ def _bbox_post_process( the last dimension 4 arrange as (x1, y1, x2, y2). 
""" if rescale: + assert img_meta is not None assert img_meta.get("scale_factor") is not None - scale_factor = [1 / s for s in img_meta["scale_factor"]] + raw_scale_factor = img_meta["scale_factor"] + assert isinstance(raw_scale_factor, (tuple, list)) and len(raw_scale_factor) >= 2 + scale_factor = (1.0 / float(raw_scale_factor[0]), 1.0 / float(raw_scale_factor[1])) results.bboxes = scale_boxes(results.bboxes, scale_factor) if hasattr(results, "score_factors"): @@ -464,20 +513,26 @@ def _bbox_post_process( results.scores = results.scores * score_factors # filter small size bboxes - if cfg.get("min_bbox_size", -1) >= 0: + min_bbox_size = float(cfg.get("min_bbox_size", -1)) + if min_bbox_size >= 0: w, h = get_box_wh(results.bboxes) - valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + valid_mask = (w > min_bbox_size) & (h > min_bbox_size) if not valid_mask.all(): results = results[valid_mask] # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg if with_nms and results.bboxes.numel() > 0: bboxes = get_box_tensor(results.bboxes) - det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, results.labels, cfg.nms) + nms_cfg = cfg.get("nms") + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, results.labels, nms_cfg) results = results[keep_idxs] # some nms would reweight the score, such as softnms results.scores = det_bboxes[:, -1] - results = results[: cfg.max_per_img] + max_per_img = int(cfg.get("max_per_img", 100)) + if keep_idxs.size(0) > max_per_img: + _, inds = results.scores.sort(descending=True) + inds = inds[:max_per_img] + results = results[inds] return results @@ -519,11 +574,11 @@ def aug_test( the last dimension 4 arrange as (x1, y1, x2, y2). 
""" # TODO: remove this for detr and deformdetr - sig_of_get_results = signature(self.get_results) - get_results_args = [p.name for p in sig_of_get_results.parameters.values()] - get_results_single_sig = signature(self._get_results_single) - get_results_single_sig_args = [p.name for p in get_results_single_sig.parameters.values()] - assert ("with_nms" in get_results_args) and ("with_nms" in get_results_single_sig_args), ( + sig_of_predict_by_feat = signature(self.predict_by_feat) + predict_by_feat_args = [p.name for p in sig_of_predict_by_feat.parameters.values()] + sig_of_predict_by_feat_single = signature(self._predict_by_feat_single) + predict_by_feat_single_args = [p.name for p in sig_of_predict_by_feat_single.parameters.values()] + assert ("with_nms" in predict_by_feat_args) and ("with_nms" in predict_by_feat_single_args), ( f"{self.__class__.__name__}does not support test-time augmentation " ) @@ -531,27 +586,48 @@ def aug_test( aug_batch_results = [] for x, img_metas in zip(aug_batch_feats, aug_batch_img_metas): outs = self.forward(x) - batch_instance_results = self.get_results( - *outs, - img_metas=img_metas, - cfg=self.test_cfg, - rescale=False, - with_nms=with_ori_nms, - **kwargs, - ) + # Type narrow test_cfg - it's defined in subclasses as ConfigDict + test_cfg = self.test_cfg if hasattr(self, "test_cfg") else None + # Unpack outs for predict_by_feat call + if len(outs) == 2: + cls_scores, bbox_preds = outs + batch_instance_results = self.predict_by_feat( + cls_scores, + bbox_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=with_ori_nms, + ) + else: + cls_scores, bbox_preds, score_factors = outs + batch_instance_results = self.predict_by_feat( + cls_scores, + bbox_preds, + score_factors, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=with_ori_nms, + ) aug_batch_results.append(batch_instance_results) # after merging, bboxes will be rescaled to the original image batch_results = 
merge_aug_results(aug_batch_results, aug_batch_img_metas) + # Get test_cfg attributes with type narrowing + test_cfg = self.test_cfg if hasattr(self, "test_cfg") else None + nms_cfg = test_cfg.get("nms") if isinstance(test_cfg, ConfigDict) else None + max_per_img = test_cfg.get("max_per_img", 100) if isinstance(test_cfg, ConfigDict) else 100 + final_results = [] for img_id in range(num_imgs): results = batch_results[img_id] - det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores, results.labels, self.test_cfg.nms) + det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores, results.labels, nms_cfg) results = results[keep_idxs] # some nms operation may reweight the score such as softnms results.scores = det_bboxes[:, -1] - results = results[: self.test_cfg.max_per_img] + results = results[:max_per_img] if rescale: # all results have been mapped to the original scale # in `merge_aug_results`, so just pass diff --git a/visdet/models/dense_heads/rpn_head.py b/visdet/models/dense_heads/rpn_head.py index 027a1291..04531b7f 100644 --- a/visdet/models/dense_heads/rpn_head.py +++ b/visdet/models/dense_heads/rpn_head.py @@ -1,7 +1,6 @@ # ruff: noqa # fmt: off # isort: skip -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. import copy @@ -68,6 +67,12 @@ class RPNHead(AnchorHead): Defaults to 1. 
""" + # Type annotations + num_convs: int + rpn_conv: nn.Module + rpn_cls: nn.Module + rpn_reg: nn.Module + def __init__( self, in_channels: int, @@ -78,7 +83,7 @@ def __init__( ) -> None: if init_cfg is None: init_cfg = {"type": "Normal", "layer": "Conv2d", "std": 0.01} - self.num_convs = num_convs + self.num_convs = num_convs # type: ignore[assignment] assert num_classes == 1 super().__init__( num_classes=num_classes, in_channels=in_channels, init_cfg=init_cfg, **kwargs @@ -102,11 +107,12 @@ def _init_layers(self) -> None: self.rpn_conv = nn.Sequential(*rpn_convs) else: self.rpn_conv = nn.Conv2d(self.in_channels, self.feat_channels, 3, padding=1) - self.rpn_cls = nn.Conv2d( + self.rpn_cls = nn.Conv2d( # type: ignore[assignment] self.feat_channels, self.num_base_priors * self.cls_out_channels, 1 ) - reg_dim = self.bbox_coder.encode_size - self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_base_priors * reg_dim, 1) + reg_dim = self.bbox_coder.encode_size # type: ignore[attr-defined] + assert isinstance(reg_dim, int), "reg_dim must be an integer" + self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_base_priors * reg_dim, 1) # type: ignore[assignment] def forward_single(self, x: Tensor) -> tuple[Tensor, Tensor]: """Forward feature of a single scale level. 
@@ -253,7 +259,7 @@ def _predict_by_feat_single( bbox_pred = torch.cat(mlvl_bbox_preds) priors = cat_boxes(mlvl_valid_priors) - bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) # type: ignore[attr-defined,call-arg] results = InstanceData() results.bboxes = bboxes @@ -299,8 +305,11 @@ def _bbox_post_process( """ assert with_nms, "`with_nms` must be True in RPNHead" if rescale: + assert img_meta is not None assert img_meta.get("scale_factor") is not None - scale_factor = [1 / s for s in img_meta["scale_factor"]] + raw_scale_factor = img_meta["scale_factor"] + assert isinstance(raw_scale_factor, (tuple, list)) and len(raw_scale_factor) >= 2 + scale_factor = (1.0 / float(raw_scale_factor[0]), 1.0 / float(raw_scale_factor[1])) results.bboxes = scale_boxes(results.bboxes, scale_factor) # filter small size bboxes diff --git a/visdet/models/layers/__init__.py b/visdet/models/layers/__init__.py index daa3e1af..7f80b39e 100644 --- a/visdet/models/layers/__init__.py +++ b/visdet/models/layers/__init__.py @@ -33,7 +33,7 @@ def __init__(self, kernel_size=1, stride=1, dilation=1, padding="corner"): stride = to_2tuple(stride) dilation = to_2tuple(dilation) - self.padding = padding + self.padding: str = padding # type: ignore[misc] self.kernel_size = kernel_size self.stride = stride self.dilation = dilation @@ -90,8 +90,8 @@ def __init__( if stride is None: stride = patch_size - self.img_size = img_size - self.patch_size = patch_size + self.img_size: int = img_size # type: ignore[misc] + self.patch_size: int = patch_size # type: ignore[misc] kernel_size = to_2tuple(patch_size) stride = to_2tuple(stride) @@ -107,7 +107,7 @@ def __init__( # disable the padding of conv padding = 0 else: - self.adap_padding = None + self.adap_padding: None = None # type: ignore[misc] padding = to_2tuple(padding) @@ -124,7 +124,7 @@ def __init__( if norm_cfg is not None: self.norm = nn.LayerNorm(embed_dims) 
else: - self.norm = None + self.norm: None = None # type: ignore[misc] def forward(self, x): B, C, H, W = x.shape @@ -154,7 +154,7 @@ def __init__( super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels - self.stride = stride + self.stride: int = stride # type: ignore[misc] self.reduction = nn.Linear(stride * stride * in_channels, out_channels, bias=False) self.norm = nn.LayerNorm(stride * stride * in_channels) diff --git a/visdet/models/layers/normed_predictor.py b/visdet/models/layers/normed_predictor.py index 70f50a1f..3605ed26 100644 --- a/visdet/models/layers/normed_predictor.py +++ b/visdet/models/layers/normed_predictor.py @@ -16,23 +16,27 @@ class NormedLinear(nn.Linear): Args: tempeature (float, optional): Tempeature term. Defaults to 20. - power (int, optional): Power term. Defaults to 1.0. + power (float, optional): Power term. Defaults to 1.0. eps (float, optional): The minimal value of divisor to keep numerical stability. Defaults to 1e-6. """ + tempearture: float + power: float + eps: float + def __init__( self, *args, tempearture: float = 20, - power: int = 1.0, + power: float = 1.0, eps: float = 1e-6, **kwargs, ) -> None: super().__init__(*args, **kwargs) - self.tempearture = tempearture - self.power = power - self.eps = eps + self.tempearture = tempearture # type: ignore[misc] + self.power = power # type: ignore[misc] + self.eps = eps # type: ignore[misc] self.init_weights() def init_weights(self) -> None: @@ -56,27 +60,32 @@ class NormedConv2d(nn.Conv2d): Args: tempeature (float, optional): Tempeature term. Defaults to 20. - power (int, optional): Power term. Defaults to 1.0. + power (float, optional): Power term. Defaults to 1.0. eps (float, optional): The minimal value of divisor to keep numerical stability. Defaults to 1e-6. norm_over_kernel (bool, optional): Normalize over kernel. Defaults to False. 
""" + tempearture: float + power: float + norm_over_kernel: bool + eps: float + def __init__( self, *args, tempearture: float = 20, - power: int = 1.0, + power: float = 1.0, eps: float = 1e-6, norm_over_kernel: bool = False, **kwargs, ) -> None: super().__init__(*args, **kwargs) - self.tempearture = tempearture - self.power = power - self.norm_over_kernel = norm_over_kernel - self.eps = eps + self.tempearture = tempearture # type: ignore[misc] + self.power = power # type: ignore[misc] + self.norm_over_kernel = norm_over_kernel # type: ignore[misc] + self.eps = eps # type: ignore[misc] def forward(self, x: Tensor) -> Tensor: """Forward function for `NormedConv2d`.""" @@ -91,7 +100,7 @@ def forward(self, x: Tensor) -> Tensor: x_ = x_ * self.tempearture if hasattr(self, "conv2d_forward"): - x_ = self.conv2d_forward(x_, weight_) + x_ = self.conv2d_forward(x_, weight_) # type: ignore[misc] else: if digit_version(torch.__version__) >= digit_version("1.8"): x_ = self._conv_forward(x_, weight_, self.bias) diff --git a/visdet/models/losses/accuracy.py b/visdet/models/losses/accuracy.py index ce4a5272..c63c89c7 100644 --- a/visdet/models/losses/accuracy.py +++ b/visdet/models/losses/accuracy.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any + import torch.nn as nn @@ -49,7 +51,7 @@ def accuracy(pred, target, topk=1, thresh=None): class Accuracy(nn.Module): - def __init__(self, topk=(1,), thresh=None): + def __init__(self, topk: tuple[int, ...] | int = (1,), thresh: float | None = None): """Module to calculate the accuracy. Args: @@ -59,8 +61,8 @@ def __init__(self, topk=(1,), thresh=None): under this threshold are considered incorrect. Default to None. """ super().__init__() - self.topk = topk - self.thresh = thresh + self.topk: tuple[int, ...] | int = topk + self.thresh: float | None = thresh def forward(self, pred, target): """Forward function to calculate accuracy. 
diff --git a/visdet/models/losses/cross_entropy_loss.py b/visdet/models/losses/cross_entropy_loss.py index 9a6c0878..2e577fee 100644 --- a/visdet/models/losses/cross_entropy_loss.py +++ b/visdet/models/losses/cross_entropy_loss.py @@ -1,9 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from collections.abc import Callable +from typing import Any import torch import torch.nn as nn import torch.nn.functional as F +from torch import Tensor from visdet.models.losses.accuracy import accuracy from visdet.models.losses.utils import weight_reduce_loss @@ -196,14 +199,14 @@ def mask_cross_entropy( class CrossEntropyLoss(nn.Module): def __init__( self, - use_sigmoid=False, - use_mask=False, - reduction="mean", - class_weight=None, - ignore_index=None, - loss_weight=1.0, - avg_non_ignore=False, - ): + use_sigmoid: bool = False, + use_mask: bool = False, + reduction: str = "mean", + class_weight: list[float] | None = None, + ignore_index: int | None = None, + loss_weight: float = 1.0, + avg_non_ignore: bool = False, + ) -> None: """CrossEntropyLoss. 
Args: @@ -223,13 +226,13 @@ def __init__( """ super(CrossEntropyLoss, self).__init__() assert (use_sigmoid is False) or (use_mask is False) - self.use_sigmoid = use_sigmoid - self.use_mask = use_mask - self.reduction = reduction - self.loss_weight = loss_weight - self.class_weight = class_weight - self.ignore_index = ignore_index - self.avg_non_ignore = avg_non_ignore + self.use_sigmoid: bool = use_sigmoid + self.use_mask: bool = use_mask + self.reduction: str = reduction + self.loss_weight: float = loss_weight + self.class_weight: list[float] | None = class_weight + self.ignore_index: int | None = ignore_index + self.avg_non_ignore: bool = avg_non_ignore if (ignore_index is not None) and not self.avg_non_ignore and self.reduction == "mean": warnings.warn( "Default ``avg_non_ignore`` is False, if you would like to " @@ -239,11 +242,11 @@ def __init__( ) if self.use_sigmoid: - self.cls_criterion = binary_cross_entropy + self.cls_criterion: Callable[..., Tensor] = binary_cross_entropy elif self.use_mask: - self.cls_criterion = mask_cross_entropy + self.cls_criterion: Callable[..., Tensor] = mask_cross_entropy else: - self.cls_criterion = cross_entropy + self.cls_criterion: Callable[..., Tensor] = cross_entropy def extra_repr(self): """Extra repr.""" @@ -252,14 +255,14 @@ def extra_repr(self): def forward( self, - cls_score, - label, - weight=None, - avg_factor=None, - reduction_override=None, - ignore_index=None, - **kwargs, - ): + cls_score: Tensor, + label: Tensor, + weight: Tensor | None = None, + avg_factor: int | None = None, + reduction_override: str | None = None, + ignore_index: int | None = None, + **kwargs: Any, + ) -> Tensor: """Forward function. 
Args: @@ -302,15 +305,15 @@ def forward( class CrossEntropyCustomLoss(CrossEntropyLoss): def __init__( self, - use_sigmoid=False, - use_mask=False, - reduction="mean", - num_classes=-1, - class_weight=None, - ignore_index=None, - loss_weight=1.0, - avg_non_ignore=False, - ): + use_sigmoid: bool = False, + use_mask: bool = False, + reduction: str = "mean", + num_classes: int = -1, + class_weight: list[float] | None = None, + ignore_index: int | None = None, + loss_weight: float = 1.0, + avg_non_ignore: bool = False, + ) -> None: """CrossEntropyCustomLoss. Args: @@ -331,13 +334,13 @@ def __init__( """ super(CrossEntropyCustomLoss, self).__init__() assert (use_sigmoid is False) or (use_mask is False) - self.use_sigmoid = use_sigmoid - self.use_mask = use_mask - self.reduction = reduction - self.loss_weight = loss_weight - self.class_weight = class_weight - self.ignore_index = ignore_index - self.avg_non_ignore = avg_non_ignore + self.use_sigmoid: bool = use_sigmoid + self.use_mask: bool = use_mask + self.reduction: str = reduction + self.loss_weight: float = loss_weight + self.class_weight: list[float] | None = class_weight + self.ignore_index: int | None = ignore_index + self.avg_non_ignore: bool = avg_non_ignore if (ignore_index is not None) and not self.avg_non_ignore and self.reduction == "mean": warnings.warn( "Default ``avg_non_ignore`` is False, if you would like to " @@ -347,22 +350,22 @@ def __init__( ) if self.use_sigmoid: - self.cls_criterion = binary_cross_entropy + self.cls_criterion: Callable[..., Tensor] = binary_cross_entropy elif self.use_mask: - self.cls_criterion = mask_cross_entropy + self.cls_criterion: Callable[..., Tensor] = mask_cross_entropy else: - self.cls_criterion = cross_entropy + self.cls_criterion: Callable[..., Tensor] = cross_entropy - self.num_classes = num_classes + self.num_classes: int = num_classes assert self.num_classes != -1 # custom output channels of the classifier - self.custom_cls_channels = True + 
self.custom_cls_channels: bool = True # custom activation of cls_score - self.custom_activation = True + self.custom_activation: bool = True # custom accuracy of the classsifier - self.custom_accuracy = True + self.custom_accuracy: bool = True def get_cls_channels(self, num_classes): assert num_classes == self.num_classes diff --git a/visdet/models/losses/smooth_l1_loss.py b/visdet/models/losses/smooth_l1_loss.py index 0bfcf0ec..b9531c8f 100644 --- a/visdet/models/losses/smooth_l1_loss.py +++ b/visdet/models/losses/smooth_l1_loss.py @@ -64,9 +64,9 @@ class SmoothL1Loss(nn.Module): def __init__(self, beta: float = 1.0, reduction: str = "mean", loss_weight: float = 1.0) -> None: super().__init__() - self.beta = beta - self.reduction = reduction - self.loss_weight = loss_weight + self.beta: float = beta + self.reduction: str = reduction + self.loss_weight: float = loss_weight def forward( self, @@ -123,8 +123,8 @@ class L1Loss(nn.Module): def __init__(self, reduction: str = "mean", loss_weight: float = 1.0) -> None: super().__init__() - self.reduction = reduction - self.loss_weight = loss_weight + self.reduction: str = reduction + self.loss_weight: float = loss_weight def forward( self, diff --git a/visdet/models/losses/utils.py b/visdet/models/losses/utils.py index fad70d7b..ba49f1c8 100644 --- a/visdet/models/losses/utils.py +++ b/visdet/models/losses/utils.py @@ -1,5 +1,3 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. import functools from collections.abc import Callable @@ -19,14 +17,15 @@ def reduce_loss(loss: Tensor, reduction: str) -> Tensor: Return: Tensor: Reduced loss tensor. 
""" - reduction_enum = F._Reduction.get_enum(reduction) - # none: 0, elementwise_mean:1, sum: 2 - if reduction_enum == 0: + # Use string comparison instead of F._Reduction which is private + if reduction == "none": return loss - elif reduction_enum == 1: + elif reduction == "mean": return loss.mean() - elif reduction_enum == 2: + elif reduction == "sum": return loss.sum() + else: + raise ValueError(f"Invalid reduction mode: {reduction}") def weight_reduce_loss( diff --git a/visdet/models/roi_heads/base_roi_head.py b/visdet/models/roi_heads/base_roi_head.py index 01d1a568..98a50df2 100644 --- a/visdet/models/roi_heads/base_roi_head.py +++ b/visdet/models/roi_heads/base_roi_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod +from typing import Any from torch import Tensor @@ -12,6 +13,11 @@ class BaseRoIHead(BaseModule, metaclass=ABCMeta): """Base class for RoIHeads.""" + train_cfg: Any + test_cfg: Any + predict_bbox: Any # Method attribute defined in subclasses + predict_mask: Any # Method attribute defined in subclasses + def __init__( self, bbox_roi_extractor: OptMultiConfig = None, @@ -24,8 +30,8 @@ def __init__( init_cfg: OptMultiConfig = None, ) -> None: super().__init__(init_cfg=init_cfg) - self.train_cfg = train_cfg - self.test_cfg = test_cfg + self.train_cfg = train_cfg # type: ignore[unresolved-attribute] + self.test_cfg = test_cfg # type: ignore[unresolved-attribute] if shared_head is not None: self.shared_head = MODELS.build(shared_head) diff --git a/visdet/models/roi_heads/bbox_heads/bbox_head.py b/visdet/models/roi_heads/bbox_heads/bbox_head.py index eb26c7a4..6cd403d6 100644 --- a/visdet/models/roi_heads/bbox_heads/bbox_head.py +++ b/visdet/models/roi_heads/bbox_heads/bbox_head.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Any + import torch import torch.nn as nn import torch.nn.functional as F @@ -23,6 +25,26 @@ class BBoxHead(BaseModule): """Simplest RoI head, with only two fc layers for classification and regression respectively.""" + with_avg_pool: bool + with_cls: bool + with_reg: bool + roi_feat_size: tuple[int, int] + roi_feat_area: int + in_channels: int + num_classes: int + predict_box_type: str + reg_class_agnostic: bool + reg_decoded_bbox: bool + reg_predictor_cfg: Any + cls_predictor_cfg: Any + bbox_coder: Any + loss_cls: Any + loss_bbox: Any + fc_cls: Any + fc_reg: Any + avg_pool: Any + debug_imgs: Any + def __init__( self, with_avg_pool: bool = False, @@ -48,26 +70,26 @@ def __init__( ) -> None: super().__init__(init_cfg=init_cfg) assert with_cls or with_reg - self.with_avg_pool = with_avg_pool - self.with_cls = with_cls - self.with_reg = with_reg - self.roi_feat_size = _pair(roi_feat_size) - self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1] - self.in_channels = in_channels - self.num_classes = num_classes - self.predict_box_type = predict_box_type - self.reg_class_agnostic = reg_class_agnostic - self.reg_decoded_bbox = reg_decoded_bbox - self.reg_predictor_cfg = reg_predictor_cfg - self.cls_predictor_cfg = cls_predictor_cfg - - self.bbox_coder = TASK_UTILS.build(bbox_coder) - self.loss_cls = MODELS.build(loss_cls) - self.loss_bbox = MODELS.build(loss_bbox) + self.with_avg_pool = with_avg_pool # type: ignore[unresolved-attribute] + self.with_cls = with_cls # type: ignore[unresolved-attribute] + self.with_reg = with_reg # type: ignore[unresolved-attribute] + self.roi_feat_size = _pair(roi_feat_size) # type: ignore[unresolved-attribute] + self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1] # type: ignore[unresolved-attribute] + self.in_channels = in_channels # type: ignore[unresolved-attribute] + self.num_classes = num_classes # type: ignore[unresolved-attribute] + self.predict_box_type = predict_box_type # type: 
ignore[unresolved-attribute] + self.reg_class_agnostic = reg_class_agnostic # type: ignore[unresolved-attribute] + self.reg_decoded_bbox = reg_decoded_bbox # type: ignore[unresolved-attribute] + self.reg_predictor_cfg = reg_predictor_cfg # type: ignore[unresolved-attribute] + self.cls_predictor_cfg = cls_predictor_cfg # type: ignore[unresolved-attribute] + + self.bbox_coder = TASK_UTILS.build(bbox_coder) # type: ignore[unresolved-attribute] + self.loss_cls = MODELS.build(loss_cls) # type: ignore[unresolved-attribute] + self.loss_bbox = MODELS.build(loss_bbox) # type: ignore[unresolved-attribute] in_channels = self.in_channels if self.with_avg_pool: - self.avg_pool = nn.AvgPool2d(self.roi_feat_size) + self.avg_pool = nn.AvgPool2d(self.roi_feat_size) # type: ignore[unresolved-attribute] else: in_channels *= self.roi_feat_area if self.with_cls: @@ -78,17 +100,17 @@ def __init__( cls_channels = num_classes + 1 cls_predictor_cfg_ = self.cls_predictor_cfg.copy() cls_predictor_cfg_.update(in_features=in_channels, out_features=cls_channels) - self.fc_cls = MODELS.build(cls_predictor_cfg_) + self.fc_cls = MODELS.build(cls_predictor_cfg_) # type: ignore[unresolved-attribute] if self.with_reg: box_dim = self.bbox_coder.encode_size out_dim_reg = box_dim if reg_class_agnostic else box_dim * num_classes reg_predictor_cfg_ = self.reg_predictor_cfg.copy() if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): reg_predictor_cfg_.update(in_features=in_channels, out_features=out_dim_reg) - self.fc_reg = MODELS.build(reg_predictor_cfg_) - self.debug_imgs = None + self.fc_reg = MODELS.build(reg_predictor_cfg_) # type: ignore[unresolved-attribute] + self.debug_imgs = None # type: ignore[unresolved-attribute] if init_cfg is None: - self.init_cfg = [] + self.init_cfg = [] # type: ignore[unresolved-attribute] if self.with_cls: self.init_cfg += [dict(type="Normal", std=0.01, override=dict(name="fc_cls"))] if self.with_reg: @@ -112,11 +134,11 @@ def custom_accuracy(self) -> bool: """get 
custom_accuracy from loss_cls.""" return getattr(self.loss_cls, "custom_accuracy", False) - def forward(self, x: tuple[Tensor]) -> tuple: + def forward(self, x: Tensor) -> tuple: """Forward features from the upstream network. Args: - x (tuple[Tensor]): Features from the upstream network, each is + x (Tensor): Features from the upstream network, each is a 4D-tensor. Returns: @@ -136,7 +158,7 @@ def forward(self, x: tuple[Tensor]) -> tuple: else: # avg_pool does not support empty tensor, # so use torch.mean instead it - x = torch.mean(x, dim=(-1, -2)) + x = torch.mean(x, dim=(-1, -2), keepdim=False) cls_score = self.fc_cls(x) if self.with_cls else None bbox_pred = self.fc_reg(x) if self.with_reg else None return cls_score, bbox_pred @@ -199,7 +221,8 @@ def get_bboxes( if cfg is None: return bboxes, scores else: - det_bboxes, det_labels = multiclass_nms(bboxes, scores, cfg.score_thr, cfg.nms, cfg.max_per_img) + nms_result = multiclass_nms(bboxes, scores, cfg.score_thr, cfg.nms, cfg.max_per_img) + det_bboxes, det_labels = nms_result[0], nms_result[1] return det_bboxes, det_labels @@ -381,7 +404,7 @@ def loss_and_target( bbox_pred, rois, *cls_reg_targets, - reduction_override=reduction_override, + reduction_override=reduction_override, # type: ignore[arg-type] ) # cls_reg_targets is only for cascade rcnn @@ -607,8 +630,9 @@ def _predict_by_feat_single( if rescale and bboxes.size(0) > 0: assert img_meta.get("scale_factor") is not None - scale_factor = [1 / s for s in img_meta["scale_factor"]] - bboxes = scale_boxes(bboxes, scale_factor) + scale_factor_list = [1 / s for s in img_meta["scale_factor"]] + scale_factor_tuple = (scale_factor_list[0], scale_factor_list[1]) + bboxes = scale_boxes(bboxes, scale_factor_tuple) # Get the inside tensor when `bboxes` is a box type bboxes = get_box_tensor(bboxes) @@ -621,7 +645,7 @@ def _predict_by_feat_single( results.bboxes = bboxes results.scores = scores else: - det_bboxes, det_labels = multiclass_nms( + nms_result = 
multiclass_nms( bboxes, scores, rcnn_test_cfg["score_thr"], @@ -629,6 +653,7 @@ def _predict_by_feat_single( rcnn_test_cfg["max_per_img"], box_dim=box_dim, ) + det_bboxes, det_labels = nms_result[0], nms_result[1] results.bboxes = det_bboxes[:, :-1] results.scores = det_bboxes[:, -1] results.labels = det_labels diff --git a/visdet/models/roi_heads/bbox_heads/convfc_bbox_head.py b/visdet/models/roi_heads/bbox_heads/convfc_bbox_head.py index 968b6326..f97273f2 100644 --- a/visdet/models/roi_heads/bbox_heads/convfc_bbox_head.py +++ b/visdet/models/roi_heads/bbox_heads/convfc_bbox_head.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any + import torch.nn as nn from torch import Tensor @@ -21,6 +23,27 @@ class ConvFCBBoxHead(BBoxHead): \-> reg convs -> reg fcs -> reg """ + num_shared_convs: int + num_shared_fcs: int + num_cls_convs: int + num_cls_fcs: int + num_reg_convs: int + num_reg_fcs: int + conv_out_channels: int + fc_out_channels: int + conv_cfg: Any + norm_cfg: Any + shared_convs: Any + shared_fcs: Any + shared_out_channels: int + cls_convs: Any + cls_fcs: Any + cls_last_dim: int + reg_convs: Any + reg_fcs: Any + reg_last_dim: int + relu: nn.ReLU + def __init__( self, num_shared_convs: int = 0, @@ -37,7 +60,7 @@ def __init__( *args, **kwargs, ) -> None: - super().__init__(*args, init_cfg=init_cfg, **kwargs) + super().__init__(*args, init_cfg=init_cfg, **kwargs) # type: ignore[misc] assert num_shared_convs + num_shared_fcs + num_cls_convs + num_cls_fcs + num_reg_convs + num_reg_fcs > 0 if num_cls_convs > 0 or num_reg_convs > 0: assert num_shared_fcs == 0 @@ -45,30 +68,30 @@ def __init__( assert num_cls_convs == 0 and num_cls_fcs == 0 if not self.with_reg: assert num_reg_convs == 0 and num_reg_fcs == 0 - self.num_shared_convs = num_shared_convs - self.num_shared_fcs = num_shared_fcs - self.num_cls_convs = num_cls_convs - self.num_cls_fcs = num_cls_fcs - self.num_reg_convs = num_reg_convs - self.num_reg_fcs = num_reg_fcs - 
self.conv_out_channels = conv_out_channels - self.fc_out_channels = fc_out_channels - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg + self.num_shared_convs = num_shared_convs # type: ignore[unresolved-attribute] + self.num_shared_fcs = num_shared_fcs # type: ignore[unresolved-attribute] + self.num_cls_convs = num_cls_convs # type: ignore[unresolved-attribute] + self.num_cls_fcs = num_cls_fcs # type: ignore[unresolved-attribute] + self.num_reg_convs = num_reg_convs # type: ignore[unresolved-attribute] + self.num_reg_fcs = num_reg_fcs # type: ignore[unresolved-attribute] + self.conv_out_channels = conv_out_channels # type: ignore[unresolved-attribute] + self.fc_out_channels = fc_out_channels # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] # add shared convs and fcs - self.shared_convs, self.shared_fcs, last_layer_dim = self._add_conv_fc_branch( + self.shared_convs, self.shared_fcs, last_layer_dim = self._add_conv_fc_branch( # type: ignore[unresolved-attribute] self.num_shared_convs, self.num_shared_fcs, self.in_channels, True ) - self.shared_out_channels = last_layer_dim + self.shared_out_channels = last_layer_dim # type: ignore[unresolved-attribute] # add cls specific branch - self.cls_convs, self.cls_fcs, self.cls_last_dim = self._add_conv_fc_branch( + self.cls_convs, self.cls_fcs, self.cls_last_dim = self._add_conv_fc_branch( # type: ignore[unresolved-attribute] self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels ) # add reg specific branch - self.reg_convs, self.reg_fcs, self.reg_last_dim = self._add_conv_fc_branch( + self.reg_convs, self.reg_fcs, self.reg_last_dim = self._add_conv_fc_branch( # type: ignore[unresolved-attribute] self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels ) @@ -78,7 +101,7 @@ def __init__( if self.num_reg_fcs == 0: self.reg_last_dim *= self.roi_feat_area - self.relu = nn.ReLU(inplace=True) + 
self.relu = nn.ReLU(inplace=True) # type: ignore[unresolved-attribute] # reconstruct fc_cls and fc_reg since input channels are changed if self.with_cls: if self.custom_cls_channels: @@ -87,17 +110,17 @@ def __init__( cls_channels = self.num_classes + 1 cls_predictor_cfg_ = self.cls_predictor_cfg.copy() cls_predictor_cfg_.update(in_features=self.cls_last_dim, out_features=cls_channels) - self.fc_cls = MODELS.build(cls_predictor_cfg_) + self.fc_cls = MODELS.build(cls_predictor_cfg_) # type: ignore[unresolved-attribute] if self.with_reg: box_dim = self.bbox_coder.encode_size out_dim_reg = box_dim if self.reg_class_agnostic else box_dim * self.num_classes reg_predictor_cfg_ = self.reg_predictor_cfg.copy() if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): reg_predictor_cfg_.update(in_features=self.reg_last_dim, out_features=out_dim_reg) - self.fc_reg = MODELS.build(reg_predictor_cfg_) + self.fc_reg = MODELS.build(reg_predictor_cfg_) # type: ignore[unresolved-attribute] if init_cfg is None: - self.init_cfg += [ + self.init_cfg += [ # type: ignore[unresolved-attribute,operator] dict( type="Xavier", distribution="uniform", @@ -150,7 +173,7 @@ def _add_conv_fc_branch( last_layer_dim = self.fc_out_channels return branch_convs, branch_fcs, last_layer_dim - def forward(self, x: tuple[Tensor]) -> tuple: + def forward(self, x: Tensor) -> tuple: """Forward features from the upstream network. 
Args: @@ -211,7 +234,7 @@ def forward(self, x: tuple[Tensor]) -> tuple: # reduce the dumb classifications errors @MODELS.register_module() class Shared2FCBBoxHead(ConvFCBBoxHead): - def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: + def __init__(self, fc_out_channels: int = 1024, **kwargs) -> None: super().__init__( num_shared_convs=0, num_shared_fcs=2, @@ -220,14 +243,13 @@ def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: num_reg_convs=0, num_reg_fcs=0, fc_out_channels=fc_out_channels, - *args, **kwargs, ) @MODELS.register_module() class Shared4Conv1FCBBoxHead(ConvFCBBoxHead): - def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: + def __init__(self, fc_out_channels: int = 1024, **kwargs) -> None: super().__init__( num_shared_convs=4, num_shared_fcs=1, @@ -236,6 +258,5 @@ def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: num_reg_convs=0, num_reg_fcs=0, fc_out_channels=fc_out_channels, - *args, **kwargs, ) diff --git a/visdet/models/roi_heads/cascade_roi_head.py b/visdet/models/roi_heads/cascade_roi_head.py index a5cdabdf..ae6e7fc3 100644 --- a/visdet/models/roi_heads/cascade_roi_head.py +++ b/visdet/models/roi_heads/cascade_roi_head.py @@ -1,5 +1,3 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
"""Cascade RoI head for visdet.""" @@ -12,7 +10,7 @@ from torch import Tensor from visdet.engine.structures import InstanceData - +from visdet.models.roi_heads.base_roi_head import BaseRoIHead from visdet.models.task_modules.samplers import SamplingResult from visdet.models.test_time_augs import merge_aug_masks from visdet.models.utils import empty_instances, unpack_gt_instances @@ -27,8 +25,6 @@ OptMultiConfig, ) -from visdet.models.roi_heads.base_roi_head import BaseRoIHead - @MODELS.register_module() class CascadeRoIHead(BaseRoIHead): @@ -54,8 +50,8 @@ def __init__( assert bbox_head is not None assert shared_head is None, "Shared head is not supported in Cascade RCNN." - self.num_stages = num_stages - self.stage_loss_weights = stage_loss_weights + self.num_stages = num_stages # type: ignore[misc] + self.stage_loss_weights = stage_loss_weights # type: ignore[misc] super().__init__( bbox_roi_extractor=bbox_roi_extractor, bbox_head=bbox_head, @@ -93,29 +89,29 @@ def init_mask_head(self, mask_roi_extractor: MultiConfig, mask_head: MultiConfig self.mask_head.append(MODELS.build(head_cfg)) if mask_roi_extractor is not None: - self.share_roi_extractor = False - self.mask_roi_extractor = nn.ModuleList() + self.share_roi_extractor = False # type: ignore[misc] + self.mask_roi_extractor = nn.ModuleList() # type: ignore[misc] if not isinstance(mask_roi_extractor, list): mask_roi_extractor = [mask_roi_extractor for _ in range(self.num_stages)] assert len(mask_roi_extractor) == self.num_stages for roi_extractor_cfg in mask_roi_extractor: - self.mask_roi_extractor.append(MODELS.build(roi_extractor_cfg)) + self.mask_roi_extractor.append(MODELS.build(roi_extractor_cfg)) # type: ignore[union-attr] else: - self.share_roi_extractor = True - self.mask_roi_extractor = self.bbox_roi_extractor + self.share_roi_extractor = True # type: ignore[misc] + self.mask_roi_extractor = self.bbox_roi_extractor # type: ignore[misc] def init_assigner_sampler(self) -> None: """Initialize assigner and 
sampler for each stage.""" - self.bbox_assigner: list | None = [] - self.bbox_sampler: list | None = [] + self.bbox_assigner: list | None = [] # type: ignore[misc] + self.bbox_sampler: list | None = [] # type: ignore[misc] if self.train_cfg is not None: assert isinstance(self.train_cfg, (list, tuple)), ( "Cascade RCNN expects list-style train_cfg for each stage." ) for idx, rcnn_train_cfg in enumerate(self.train_cfg): - self.bbox_assigner.append(TASK_UTILS.build(rcnn_train_cfg["assigner"])) - self.current_stage = idx - self.bbox_sampler.append( + self.bbox_assigner.append(TASK_UTILS.build(rcnn_train_cfg["assigner"])) # type: ignore[union-attr] + self.current_stage = idx # type: ignore[misc] + self.bbox_sampler.append( # type: ignore[union-attr] TASK_UTILS.build(rcnn_train_cfg["sampler"], default_args=dict(context=self)), ) @@ -185,13 +181,13 @@ def loss( losses: dict[str, Tensor] = {} results_list = rpn_results_list for stage in range(self.num_stages): - self.current_stage = stage + self.current_stage = stage # type: ignore[misc] stage_loss_weight = self.stage_loss_weights[stage] sampling_results: list[SamplingResult] = [] if self.with_bbox or self.with_mask: - bbox_assigner = self.bbox_assigner[stage] - bbox_sampler = self.bbox_sampler[stage] + bbox_assigner = self.bbox_assigner[stage] # type: ignore[index] + bbox_sampler = self.bbox_sampler[stage] # type: ignore[index] for i in range(num_imgs): results = results_list[i] results.priors = results.pop("bboxes") @@ -346,7 +342,10 @@ def _refine_roi( refined_bboxes = get_box_tensor(refined_bboxes) refined_rois = torch.cat([rois[img_idx][:, [0]], refined_bboxes], dim=1) refine_rois_list.append(refined_rois) - rois = torch.cat(refine_rois_list) if refine_rois_list else rois[0].new_zeros((0, 5)) + if refine_rois_list: + rois = torch.cat(refine_rois_list, dim=0) + else: + rois = rois[0].new_zeros((0, 5)) cls_scores = [ sum(score_set[i] for score_set in ms_scores) / float(len(ms_scores)) for i in 
range(len(batch_img_metas)) @@ -371,7 +370,7 @@ def forward( if self.with_mask: aug_masks = [] if isinstance(rois, (list, tuple)): - rois = torch.cat(rois) + rois = torch.cat(list(rois), dim=0) # type: ignore[arg-type] for stage in range(self.num_stages): mask_results = self._mask_forward(stage, x, rois) mask_preds = mask_results["mask_preds"].split(num_proposals_per_img, 0) diff --git a/visdet/models/roi_heads/mask_heads/fcn_mask_head.py b/visdet/models/roi_heads/mask_heads/fcn_mask_head.py index d7ebbd05..b6a6db6a 100644 --- a/visdet/models/roi_heads/mask_heads/fcn_mask_head.py +++ b/visdet/models/roi_heads/mask_heads/fcn_mask_head.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any + import numpy as np import torch import torch.nn as nn @@ -14,6 +16,7 @@ from visdet.models.task_modules.samplers import SamplingResult from visdet.models.utils import empty_instances from visdet.registry import MODELS +from visdet.structures.bbox import get_box_tensor from visdet.structures.mask import mask_target from visdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig @@ -25,6 +28,26 @@ @MODELS.register_module() class FCNMaskHead(BaseModule): + num_convs: int + roi_feat_size: tuple[int, int] + in_channels: int + conv_kernel_size: int + conv_out_channels: int + upsample_cfg: Any + upsample_method: Any + scale_factor: Any + num_classes: int + class_agnostic: bool + conv_cfg: Any + norm_cfg: Any + predictor_cfg: Any + loss_mask: Any + convs: Any + upsample: Any + conv_logits: Any + relu: nn.ReLU + debug_imgs: Any + def __init__( self, num_convs: int = 4, @@ -43,27 +66,27 @@ ) -> None: assert init_cfg is None, "To prevent abnormal initialization behavior, init_cfg is not allowed to be set" super().__init__(init_cfg=init_cfg) - self.upsample_cfg = upsample_cfg.copy() + self.upsample_cfg = upsample_cfg.copy() # type: ignore[unresolved-attribute] if self.upsample_cfg["type"] not in [None, "deconv", "nearest", 
"bilinear"]: raise ValueError( f'Invalid upsample method {self.upsample_cfg["type"]}, accepted methods are "deconv", "nearest", "bilinear"' ) - self.num_convs = num_convs + self.num_convs = num_convs # type: ignore[unresolved-attribute] # WARN: roi_feat_size is reserved and not used - self.roi_feat_size = _pair(roi_feat_size) - self.in_channels = in_channels - self.conv_kernel_size = conv_kernel_size - self.conv_out_channels = conv_out_channels - self.upsample_method = self.upsample_cfg.get("type") - self.scale_factor = self.upsample_cfg.pop("scale_factor", None) - self.num_classes = num_classes - self.class_agnostic = class_agnostic - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.predictor_cfg = predictor_cfg - self.loss_mask = MODELS.build(loss_mask) - - self.convs = ModuleList() + self.roi_feat_size = _pair(roi_feat_size) # type: ignore[unresolved-attribute] + self.in_channels = in_channels # type: ignore[unresolved-attribute] + self.conv_kernel_size = conv_kernel_size # type: ignore[unresolved-attribute] + self.conv_out_channels = conv_out_channels # type: ignore[unresolved-attribute] + self.upsample_method = self.upsample_cfg.get("type") # type: ignore[unresolved-attribute] + self.scale_factor = self.upsample_cfg.pop("scale_factor", None) # type: ignore[unresolved-attribute] + self.num_classes = num_classes # type: ignore[unresolved-attribute] + self.class_agnostic = class_agnostic # type: ignore[unresolved-attribute] + self.conv_cfg = conv_cfg # type: ignore[unresolved-attribute] + self.norm_cfg = norm_cfg # type: ignore[unresolved-attribute] + self.predictor_cfg = predictor_cfg # type: ignore[unresolved-attribute] + self.loss_mask = MODELS.build(loss_mask) # type: ignore[unresolved-attribute] + + self.convs = ModuleList() # type: ignore[unresolved-attribute] for i in range(self.num_convs): in_channels = self.in_channels if i == 0 else self.conv_out_channels padding = (self.conv_kernel_size - 1) // 2 @@ -80,7 +103,7 @@ def __init__( 
upsample_in_channels = self.conv_out_channels if self.num_convs > 0 else in_channels upsample_cfg_ = self.upsample_cfg.copy() if self.upsample_method is None: - self.upsample = None + self.upsample = None # type: ignore[unresolved-attribute] elif self.upsample_method == "deconv": upsample_cfg_.update( in_channels=upsample_in_channels, @@ -88,7 +111,7 @@ def __init__( kernel_size=self.scale_factor, stride=self.scale_factor, ) - self.upsample = build_upsample_layer(upsample_cfg_) + self.upsample = build_upsample_layer(upsample_cfg_) # type: ignore[unresolved-attribute] else: # suppress warnings align_corners = None if self.upsample_method == "nearest" else False @@ -97,13 +120,13 @@ def __init__( mode=self.upsample_method, align_corners=align_corners, ) - self.upsample = build_upsample_layer(upsample_cfg_) + self.upsample = build_upsample_layer(upsample_cfg_) # type: ignore[unresolved-attribute] out_channels = 1 if self.class_agnostic else self.num_classes logits_in_channel = self.conv_out_channels if self.upsample_method == "deconv" else upsample_in_channels - self.conv_logits = build_conv_layer(self.predictor_cfg, logits_in_channel, out_channels, 1) - self.relu = nn.ReLU(inplace=True) - self.debug_imgs = None + self.conv_logits = build_conv_layer(self.predictor_cfg, logits_in_channel, out_channels, 1) # type: ignore[unresolved-attribute] + self.relu = nn.ReLU(inplace=True) # type: ignore[unresolved-attribute] + self.debug_imgs = None # type: ignore[unresolved-attribute] def init_weights(self) -> None: """Initialize the weights.""" @@ -111,9 +134,11 @@ def init_weights(self) -> None: for m in [self.upsample, self.conv_logits]: if m is None: continue - elif hasattr(m, "weight") and hasattr(m, "bias"): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - nn.init.constant_(m.bias, 0) + elif isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.Linear)): + if m.weight is not None: + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") 
+ if m.bias is not None: + nn.init.constant_(m.bias, 0) def forward(self, x: Tensor) -> Tensor: """Forward features from the upstream network. @@ -253,9 +278,11 @@ def predict_by_feat( mask_thr_binary=rcnn_test_cfg.mask_thr_binary, )[0] else: + # Convert bboxes to tensor if needed + bboxes_tensor = get_box_tensor(bboxes) im_mask = self._predict_by_feat_single( mask_preds=mask_preds[img_id], - bboxes=bboxes, + bboxes=bboxes_tensor, labels=results.labels, img_meta=img_meta, rcnn_test_cfg=rcnn_test_cfg, diff --git a/visdet/models/roi_heads/roi_extractors/base_roi_extractor.py b/visdet/models/roi_heads/roi_extractors/base_roi_extractor.py index bb1c1f40..f67dd277 100644 --- a/visdet/models/roi_heads/roi_extractors/base_roi_extractor.py +++ b/visdet/models/roi_heads/roi_extractors/base_roi_extractor.py @@ -22,6 +22,9 @@ class BaseRoIExtractor(BaseModule, metaclass=ABCMeta): dict], optional): Initialization config dict. Defaults to None. """ + out_channels: int + featmap_strides: list[int] + def __init__( self, roi_layer: ConfigType, @@ -31,8 +34,8 @@ def __init__( ) -> None: super().__init__(init_cfg=init_cfg) self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides) - self.out_channels = out_channels - self.featmap_strides = featmap_strides + self.out_channels = out_channels # type: ignore[unresolved-attribute] + self.featmap_strides = featmap_strides # type: ignore[unresolved-attribute] @property def num_inputs(self) -> int: diff --git a/visdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/visdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py index 2c588232..29102c72 100644 --- a/visdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py +++ b/visdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py @@ -27,6 +27,8 @@ class SingleRoIExtractor(BaseRoIExtractor): dict], optional): Initialization config dict. Defaults to None. 
""" + finest_scale: int + def __init__( self, roi_layer: ConfigType, @@ -41,7 +43,7 @@ def __init__( featmap_strides=featmap_strides, init_cfg=init_cfg, ) - self.finest_scale = finest_scale + self.finest_scale = finest_scale # type: ignore[unresolved-attribute] def map_roi_levels(self, rois: Tensor, num_levels: int) -> Tensor: """Map rois to corresponding feature levels by scales. diff --git a/visdet/models/roi_heads/standard_roi_head.py b/visdet/models/roi_heads/standard_roi_head.py index 69c08c2b..d7a8ce80 100644 --- a/visdet/models/roi_heads/standard_roi_head.py +++ b/visdet/models/roi_heads/standard_roi_head.py @@ -1,18 +1,18 @@ -# ruff: noqa -# type: ignore +from typing import List, Optional, Tuple + import torch import torch.nn as nn -from visdet.registry import MODELS, TASK_UTILS -from visdet.structures.bbox import bbox2roi -from visdet.models.utils import empty_instances, unpack_gt_instances -from visdet.engine.structures import InstanceData -from visdet.utils.typing_utils import ConfigType -from visdet.utils import InstanceList, OptConfigType, OptMultiConfig -from visdet.structures import DetDataSample, SampleList -from typing import List, Optional, Tuple from torch import Tensor + +from visdet.engine.structures import InstanceData from visdet.models.roi_heads.base_roi_head import BaseRoIHead from visdet.models.task_modules.samplers import SamplingResult +from visdet.models.utils import empty_instances, unpack_gt_instances +from visdet.registry import MODELS, TASK_UTILS +from visdet.structures import DetDataSample, SampleList +from visdet.structures.bbox import bbox2roi +from visdet.utils import InstanceList, OptConfigType, OptMultiConfig +from visdet.utils.typing_utils import ConfigType @MODELS.register_module() @@ -39,17 +39,17 @@ def init_mask_head(self, mask_roi_extractor: ConfigType, mask_head: ConfigType) mask_head (dict or ConfigDict): Config of mask in mask head. 
""" if mask_roi_extractor is not None: - self.mask_roi_extractor = MODELS.build(mask_roi_extractor) - self.share_roi_extractor = False + self.mask_roi_extractor = MODELS.build(mask_roi_extractor) # type: ignore[misc] + self.share_roi_extractor = False # type: ignore[misc] else: - self.share_roi_extractor = True - self.mask_roi_extractor = self.bbox_roi_extractor - self.mask_head = MODELS.build(mask_head) + self.share_roi_extractor = True # type: ignore[misc] + self.mask_roi_extractor = self.bbox_roi_extractor # type: ignore[misc] + self.mask_head = MODELS.build(mask_head) # type: ignore[misc] def init_assigner_sampler(self) -> None: """Initialize assigner and sampler.""" - self.bbox_assigner = None - self.bbox_sampler = None + self.bbox_assigner = None # type: ignore[misc] + self.bbox_sampler = None # type: ignore[misc] if self.train_cfg: # Support both direct train_cfg and nested under 'rcnn' key if "rcnn" in self.train_cfg: @@ -63,7 +63,7 @@ def forward( self, x: Tuple[Tensor], rpn_results_list: InstanceList, - batch_data_samples: SampleList = None, + batch_data_samples: SampleList | None = None, ) -> tuple: """Network forward process. Usually includes backbone, neck and head forward without any post-processing. 
@@ -127,8 +127,8 @@ def loss( rpn_results = rpn_results_list[i] rpn_results.priors = rpn_results.pop("bboxes") - assign_result = self.bbox_assigner.assign(rpn_results, batch_gt_instances[i], batch_gt_instances_ignore[i]) - sampling_result = self.bbox_sampler.sample( + assign_result = self.bbox_assigner.assign(rpn_results, batch_gt_instances[i], batch_gt_instances_ignore[i]) # type: ignore[union-attr] + sampling_result = self.bbox_sampler.sample( # type: ignore[union-attr] assign_result, rpn_results, batch_gt_instances[i], @@ -256,7 +256,7 @@ def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: def _mask_forward( self, x: Tuple[Tensor], - rois: Tensor = None, + rois: Tensor | None = None, pos_inds: Optional[Tensor] = None, bbox_feats: Optional[Tensor] = None, ) -> dict: diff --git a/visdet/models/task_modules/assigners/assign_result.py b/visdet/models/task_modules/assigners/assign_result.py index afd6db1e..99b572c4 100644 --- a/visdet/models/task_modules/assigners/assign_result.py +++ b/visdet/models/task_modules/assigners/assign_result.py @@ -113,7 +113,7 @@ def random(cls, **kwargs): >>> self = AssignResult.random() >>> print(self.info) """ - from ..samplers.sampling_result import ensure_rng + from visdet.core.bbox.demodata import ensure_rng rng = ensure_rng(kwargs.get("rng", None)) diff --git a/visdet/models/task_modules/samplers/__init__.py b/visdet/models/task_modules/samplers/__init__.py index 83b056e8..73c0b625 100644 --- a/visdet/models/task_modules/samplers/__init__.py +++ b/visdet/models/task_modules/samplers/__init__.py @@ -3,6 +3,7 @@ from visdet.registry import TASK_UTILS from visdet.utils import util_mixins from visdet.engine.structures import InstanceData +from visdet.structures.bbox import BaseBoxes class SamplingResult(util_mixins.NiceRepr): @@ -74,6 +75,10 @@ def sample( priors = pred_instances.priors gt_bboxes = gt_instances.bboxes + # Convert BaseBoxes to tensor if needed + if isinstance(gt_bboxes, BaseBoxes): + gt_bboxes = 
gt_bboxes.tensor + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False).squeeze(-1) neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False).squeeze(-1) @@ -111,6 +116,10 @@ def sample( priors = pred_instances.priors gt_bboxes = gt_instances.bboxes + # Convert BaseBoxes to tensor if needed + if isinstance(gt_bboxes, BaseBoxes): + gt_bboxes = gt_bboxes.tensor + num_expected_pos = int(self.num * self.pos_fraction) pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False).squeeze(-1) if pos_inds.numel() > num_expected_pos: diff --git a/visdet/models/utils/image.py b/visdet/models/utils/image.py index c09d197b..da572880 100644 --- a/visdet/models/utils/image.py +++ b/visdet/models/utils/image.py @@ -21,7 +21,7 @@ def imrenormalize(img: Tensor | np.ndarray, img_norm_cfg: dict, new_img_norm_cfg assert img.ndim == 4 and img.shape[0] == 1 new_img = img.squeeze(0).cpu().numpy().transpose(1, 2, 0) new_img = _imrenormalize(new_img, img_norm_cfg, new_img_norm_cfg) - new_img = new_img.transpose(2, 0, 1)[None] + new_img = new_img.transpose(2, 0, 1)[None] # type: ignore[misc] return torch.from_numpy(new_img).to(img) else: return _imrenormalize(img, img_norm_cfg, new_img_norm_cfg) diff --git a/visdet/models/utils/res_layer.py b/visdet/models/utils/res_layer.py index 5b9c192f..c9ec00e9 100644 --- a/visdet/models/utils/res_layer.py +++ b/visdet/models/utils/res_layer.py @@ -149,7 +149,7 @@ def __init__( assert dcn is None, "Not implemented yet." assert plugins is None, "Not implemented yet." assert not with_cp, "Not implemented yet." 
- self.with_norm = norm_cfg is not None + self.with_norm: bool = norm_cfg is not None # type: ignore[misc] with_bias = True if norm_cfg is None else False self.conv1 = build_conv_layer( conv_cfg, @@ -162,18 +162,20 @@ def __init__( bias=with_bias, ) if self.with_norm: - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm1_name: str = norm1_name # type: ignore[misc] self.add_module(self.norm1_name, norm1) self.conv2 = build_conv_layer(conv_cfg, planes, planes, 3, padding=1, bias=with_bias) if self.with_norm: - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm2_name: str = norm2_name # type: ignore[misc] self.add_module(self.norm2_name, norm2) self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.with_cp = with_cp + self.downsample: nn.Module | None = downsample # type: ignore[misc] + self.stride: int = stride # type: ignore[misc] + self.dilation: int = dilation # type: ignore[misc] + self.with_cp: bool = with_cp # type: ignore[misc] @property def norm1(self): diff --git a/visdet/runner.py b/visdet/runner.py index 7d78f155..7cb9e577 100644 --- a/visdet/runner.py +++ b/visdet/runner.py @@ -128,7 +128,7 @@ def _deep_merge(self, base: dict, override: dict) -> dict: def _build_config(self) -> None: """Build a full MMEngine-compatible configuration from resolved presets.""" - from visdet.engine import Config + from visdet.engine.config import Config # Automatically sync num_classes from dataset to model self._sync_num_classes() @@ -241,7 +241,7 @@ def train(self) -> None: """ # MMEngineRunner is imported here to avoid potential circular dependencies # and to ensure registries are populated first. 
- from visdet.engine import DefaultScope + from visdet.engine.registry import DefaultScope from visdet.engine.runner import Runner as MMEngineRunner # Ensure the 'visdet' scope is active for component registration. diff --git a/visdet/structures/bbox/base_boxes.py b/visdet/structures/bbox/base_boxes.py index 572ebc77..71ba7f7c 100644 --- a/visdet/structures/bbox/base_boxes.py +++ b/visdet/structures/bbox/base_boxes.py @@ -1,9 +1,8 @@ # ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. -from abc import ABCMeta, abstractmethod, abstractproperty, abstractstaticmethod +from abc import ABCMeta, abstractmethod from collections.abc import Sequence -from typing import TypeVar, Union +from typing import TypeVar, Union, cast import numpy as np import torch @@ -11,16 +10,13 @@ from visdet.structures.mask.structures import BitmapMasks, PolygonMasks -T = TypeVar("T") +T = TypeVar("T", bound="BaseBoxes") DeviceType = Union[str, torch.device] IndexType = Union[ slice, int, list, - torch.LongTensor, - torch.cuda.LongTensor, - torch.BoolTensor, - torch.cuda.BoolTensor, + Tensor, np.ndarray, ] MaskType = Union[BitmapMasks, PolygonMasks] @@ -117,7 +113,7 @@ def empty_boxes(self: T, dtype: torch.dtype | None = None, device: DeviceType | T: empty boxes with shape of (0, box_dim). """ empty_box = self.tensor.new_zeros(0, self.box_dim, dtype=dtype, device=device) - return type(self)(empty_box, clone=False) + return cast(T, type(self)(empty_box, clone=False)) def fake_boxes( self: T, @@ -139,7 +135,7 @@ def fake_boxes( T: Fake boxes with shape of ``sizes``. 
""" fake_boxes = self.tensor.new_full(sizes, fill, dtype=dtype, device=device) - return type(self)(fake_boxes, clone=False) + return cast(T, type(self)(fake_boxes, clone=False)) def __getitem__(self: T, index: IndexType) -> T: """Rewrite getitem to protect the last dimension shape.""" @@ -158,12 +154,13 @@ def __getitem__(self: T, index: IndexType) -> T: boxes = boxes[index] if boxes.dim() == 1: boxes = boxes.reshape(1, -1) - return type(self)(boxes, clone=False) + return cast(T, type(self)(boxes, clone=False)) - def __setitem__(self: T, index: IndexType, values: Tensor | T) -> T: + def __setitem__(self: T, index: IndexType, values: Tensor | T) -> None: """Rewrite setitem to protect the last dimension shape.""" + assert isinstance(values, BaseBoxes), "The value to be set must be a BaseBoxes instance" assert type(values) is type(self), "The value to be set must be the same box type as self" - values = values.tensor + values_tensor = values.tensor if isinstance(index, np.ndarray): index = torch.as_tensor(index, device=self.device) @@ -176,7 +173,7 @@ def __setitem__(self: T, index: IndexType, values: Tensor | T) -> T: if Ellipsis in index: assert index[-1] is Ellipsis - self.tensor[index] = values + self.tensor[index] = values_tensor def __len__(self) -> int: """Return the length of self.tensor first dimension.""" @@ -247,84 +244,84 @@ def numpy(self) -> np.ndarray: def to(self: T, *args, **kwargs) -> T: """Reload ``to`` from self.tensor.""" - return type(self)(self.tensor.to(*args, **kwargs), clone=False) + return cast(T, type(self)(self.tensor.to(*args, **kwargs), clone=False)) def cpu(self: T) -> T: """Reload ``cpu`` from self.tensor.""" - return type(self)(self.tensor.cpu(), clone=False) + return cast(T, type(self)(self.tensor.cpu(), clone=False)) def cuda(self: T, *args, **kwargs) -> T: """Reload ``cuda`` from self.tensor.""" - return type(self)(self.tensor.cuda(*args, **kwargs), clone=False) + return cast(T, type(self)(self.tensor.cuda(*args, **kwargs), 
clone=False)) def clone(self: T) -> T: """Reload ``clone`` from self.tensor.""" - return type(self)(self.tensor) + return cast(T, type(self)(self.tensor)) def detach(self: T) -> T: """Reload ``detach`` from self.tensor.""" - return type(self)(self.tensor.detach(), clone=False) + return cast(T, type(self)(self.tensor.detach(), clone=False)) - def view(self: T, *shape: tuple[int]) -> T: + def view(self: T, *shape: int) -> T: """Reload ``view`` from self.tensor.""" - return type(self)(self.tensor.view(shape), clone=False) + return cast(T, type(self)(self.tensor.view(*shape), clone=False)) - def reshape(self: T, *shape: tuple[int]) -> T: + def reshape(self: T, *shape: int) -> T: """Reload ``reshape`` from self.tensor.""" - return type(self)(self.tensor.reshape(shape), clone=False) + return cast(T, type(self)(self.tensor.reshape(*shape), clone=False)) - def expand(self: T, *sizes: tuple[int]) -> T: + def expand(self: T, *sizes: int) -> T: """Reload ``expand`` from self.tensor.""" - return type(self)(self.tensor.expand(sizes), clone=False) + return cast(T, type(self)(self.tensor.expand(*sizes), clone=False)) - def repeat(self: T, *sizes: tuple[int]) -> T: + def repeat(self: T, *sizes: int) -> T: """Reload ``repeat`` from self.tensor.""" - return type(self)(self.tensor.repeat(sizes), clone=False) + return cast(T, type(self)(self.tensor.repeat(*sizes), clone=False)) def transpose(self: T, dim0: int, dim1: int) -> T: """Reload ``transpose`` from self.tensor.""" ndim = self.tensor.dim() assert dim0 != -1 and dim0 != ndim - 1 assert dim1 != -1 and dim1 != ndim - 1 - return type(self)(self.tensor.transpose(dim0, dim1), clone=False) + return cast(T, type(self)(self.tensor.transpose(dim0, dim1), clone=False)) - def permute(self: T, *dims: tuple[int]) -> T: + def permute(self: T, *dims: int) -> T: """Reload ``permute`` from self.tensor.""" assert dims[-1] == -1 or dims[-1] == self.tensor.dim() - 1 - return type(self)(self.tensor.permute(dims), clone=False) + return cast(T, 
type(self)(self.tensor.permute(*dims), clone=False)) def split(self: T, split_size_or_sections: int | Sequence[int], dim: int = 0) -> list[T]: """Reload ``split`` from self.tensor.""" assert dim != -1 and dim != self.tensor.dim() - 1 boxes_list = self.tensor.split(split_size_or_sections, dim=dim) - return [type(self)(boxes, clone=False) for boxes in boxes_list] + return [cast(T, type(self)(boxes, clone=False)) for boxes in boxes_list] def chunk(self: T, chunks: int, dim: int = 0) -> list[T]: """Reload ``chunk`` from self.tensor.""" assert dim != -1 and dim != self.tensor.dim() - 1 boxes_list = self.tensor.chunk(chunks, dim=dim) - return [type(self)(boxes, clone=False) for boxes in boxes_list] + return [cast(T, type(self)(boxes, clone=False)) for boxes in boxes_list] - def unbind(self: T, dim: int = 0) -> T: + def unbind(self: T, dim: int = 0) -> list[T]: """Reload ``unbind`` from self.tensor.""" assert dim != -1 and dim != self.tensor.dim() - 1 boxes_list = self.tensor.unbind(dim=dim) - return [type(self)(boxes, clone=False) for boxes in boxes_list] + return [cast(T, type(self)(boxes, clone=False)) for boxes in boxes_list] def flatten(self: T, start_dim: int = 0, end_dim: int = -2) -> T: """Reload ``flatten`` from self.tensor.""" assert end_dim != -1 and end_dim != self.tensor.dim() - 1 - return type(self)(self.tensor.flatten(start_dim, end_dim), clone=False) + return cast(T, type(self)(self.tensor.flatten(start_dim, end_dim), clone=False)) def squeeze(self: T, dim: int | None = None) -> T: """Reload ``squeeze`` from self.tensor.""" boxes = self.tensor.squeeze() if dim is None else self.tensor.squeeze(dim) - return type(self)(boxes, clone=False) + return cast(T, type(self)(boxes, clone=False)) def unsqueeze(self: T, dim: int) -> T: """Reload ``unsqueeze`` from self.tensor.""" assert dim != -1 and dim != self.tensor.dim() - return type(self)(self.tensor.unsqueeze(dim), clone=False) + return cast(T, type(self)(self.tensor.unsqueeze(dim), clone=False)) @classmethod 
def cat(cls: type[T], box_list: Sequence[T], dim: int = 0) -> T: @@ -371,22 +368,26 @@ def stack(cls: type[T], box_list: Sequence[T], dim: int = 0) -> T: th_box_list = [boxes.tensor for boxes in box_list] return cls(torch.stack(th_box_list, dim=dim), clone=False) - @abstractproperty + @property + @abstractmethod def centers(self) -> Tensor: """Return a tensor representing the centers of boxes.""" pass - @abstractproperty + @property + @abstractmethod def areas(self) -> Tensor: """Return a tensor representing the areas of boxes.""" pass - @abstractproperty + @property + @abstractmethod def widths(self) -> Tensor: """Return a tensor representing the widths of boxes.""" pass - @abstractproperty + @property + @abstractmethod def heights(self) -> Tensor: """Return a tensor representing the heights of boxes.""" pass @@ -516,7 +517,8 @@ def find_inside_points(self, points: Tensor, is_aligned: bool = False) -> BoolTe """ pass - @abstractstaticmethod + @staticmethod + @abstractmethod def overlaps( boxes1: "BaseBoxes", boxes2: "BaseBoxes", @@ -544,7 +546,8 @@ def overlaps( """ pass - @abstractstaticmethod + @staticmethod + @abstractmethod def from_instance_masks(masks: MaskType) -> "BaseBoxes": """Create boxes from instance masks. diff --git a/visdet/structures/bbox/bbox_overlaps.py b/visdet/structures/bbox/bbox_overlaps.py index 6a15bb4d..7772ab4b 100644 --- a/visdet/structures/bbox/bbox_overlaps.py +++ b/visdet/structures/bbox/bbox_overlaps.py @@ -1,10 +1,11 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Literal + import torch +from torch import Tensor -def fp16_clamp(x, min=None, max=None): +def fp16_clamp(x: Tensor, min: float | None = None, max: float | None = None) -> Tensor: if not x.is_cuda and x.dtype == torch.float16: # clamp for cpu float16, tensor fp16 has no clamp implementation return x.float().clamp(min, max).half() @@ -12,7 +13,13 @@ def fp16_clamp(x, min=None, max=None): return x.clamp(min, max) -def bbox_overlaps(bboxes1, bboxes2, mode="iou", is_aligned=False, eps=1e-6): +def bbox_overlaps( + bboxes1: Tensor, + bboxes2: Tensor, + mode: Literal["iou", "iof", "giou"] = "iou", + is_aligned: bool = False, + eps: float = 1e-6, +) -> Tensor: """Calculate overlap between two set of bboxes. FP16 Contributed by https://github.com/open-mmlab/mmdetection/pull/4889 @@ -146,14 +153,14 @@ def bbox_overlaps(bboxes1, bboxes2, mode="iou", is_aligned=False, eps=1e-6): enclosed_lt = torch.min(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) enclosed_rb = torch.max(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) - eps = union.new_tensor([eps]) - union = torch.max(union, eps) + eps_tensor = union.new_tensor([eps]) + union = torch.max(union, eps_tensor) ious = overlap / union if mode in ["iou", "iof"]: return ious # calculate gious enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0) enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] - enclose_area = torch.max(enclose_area, eps) + enclose_area = torch.max(enclose_area, eps_tensor) gious = ious - (enclose_area - union) / enclose_area return gious diff --git a/visdet/structures/bbox/box_type.py b/visdet/structures/bbox/box_type.py index ba019196..85400b62 100644 --- a/visdet/structures/bbox/box_type.py +++ b/visdet/structures/bbox/box_type.py @@ -1,8 +1,6 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
from collections.abc import Callable -from typing import Union +from typing import Any, TypeVar, Union import numpy as np import torch @@ -11,10 +9,12 @@ from visdet.structures.bbox.base_boxes import BaseBoxes BoxType = Union[np.ndarray, Tensor, BaseBoxes] +T = TypeVar("T", bound=type) +F = TypeVar("F", bound=Callable) -box_types: dict = {} -_box_type_to_name: dict = {} -box_converters: dict = {} +box_types: dict[str, type] = {} +_box_type_to_name: dict[type, str] = {} +box_converters: dict[str, Callable] = {} def _register_box(name: str, box_type: type, force: bool = False) -> None: @@ -42,7 +42,7 @@ def _register_box(name: str, box_type: type, force: bool = False) -> None: _box_type_to_name[box_type] = name -def register_box(name: str, box_type: type | None = None, force: bool = False) -> type | Callable: +def register_box(name: str, box_type: T | None = None, force: bool = False) -> T | Callable[[T], T]: """Register a box type. A record will be added to ``bbox_types``, whose key is the box type name @@ -80,7 +80,7 @@ def register_box(name: str, box_type: type | None = None, force: bool = False) - return box_type # use it as a decorator: @register_box(name) - def _register(cls): + def _register(cls: T) -> T: _register_box(name=name, box_type=cls, force=force) return cls @@ -113,9 +113,9 @@ def _register_box_converter( def register_box_converter( src_type: str | type, dst_type: str | type, - converter: Callable | None = None, + converter: F | None = None, force: bool = False, -) -> Callable: +) -> F | Callable[[F], F]: """Register a box converter. 
A record will be added to ``box_converter``, whose key is @@ -151,7 +151,7 @@ def register_box_converter( return converter # use it as a decorator: @register_box_converter(name) - def _register(func): + def _register(func: F) -> F: _register_box_converter(src_type=src_type, dst_type=dst_type, converter=func, force=force) return func @@ -226,16 +226,17 @@ def convert_box_type( converter = box_converters[converter_name] if is_box_cls: - boxes = converter(boxes.tensor) - return dst_type_cls(boxes) + converted_boxes: Tensor = converter(boxes.tensor) # type: ignore[arg-type] + return dst_type_cls(converted_boxes) elif is_numpy: - boxes = converter(torch.from_numpy(boxes)) - return boxes.numpy() + converted_boxes = converter(torch.from_numpy(boxes)) # type: ignore[arg-type] + assert isinstance(converted_boxes, Tensor) + return converted_boxes.numpy() else: - return converter(boxes) + return converter(boxes) # type: ignore[arg-type,return-value] -def autocast_box_type(dst_box_type="hbox") -> Callable: +def autocast_box_type(dst_box_type: str = "hbox") -> Callable[[Callable], Callable]: """A decorator which automatically casts results['gt_bboxes'] to the destination box type. @@ -253,7 +254,7 @@ def autocast_box_type(dst_box_type="hbox") -> Callable: _, box_type_cls = get_box_type(dst_box_type) def decorator(func: Callable) -> Callable: - def wrapper(self, results: dict, *args, **kwargs) -> dict: + def wrapper(self: Any, results: dict, *args: Any, **kwargs: Any) -> dict: if "gt_bboxes" not in results or isinstance(results["gt_bboxes"], BaseBoxes): return func(self, results) elif isinstance(results["gt_bboxes"], np.ndarray): diff --git a/visdet/structures/bbox/coders/base_bbox_coder.py b/visdet/structures/bbox/coders/base_bbox_coder.py index 96ba1e3e..f5958c96 100644 --- a/visdet/structures/bbox/coders/base_bbox_coder.py +++ b/visdet/structures/bbox/coders/base_bbox_coder.py @@ -1,7 +1,8 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
from abc import ABCMeta, abstractmethod +from typing import Any + +from torch import Tensor class BaseBBoxCoder(metaclass=ABCMeta): @@ -15,14 +16,14 @@ class BaseBBoxCoder(metaclass=ABCMeta): # The size of the last of dimension of the encoded tensor. encode_size = 4 - def __init__(self, use_box_type: bool = False, **kwargs): + def __init__(self, use_box_type: bool = False, **kwargs: Any) -> None: self.use_box_type = use_box_type @abstractmethod - def encode(self, bboxes, gt_bboxes): + def encode(self, bboxes: Tensor, gt_bboxes: Tensor) -> Tensor: """Encode deltas between bboxes and ground truth boxes.""" @abstractmethod - def decode(self, bboxes, bboxes_pred): + def decode(self, bboxes: Tensor, bboxes_pred: Tensor) -> Tensor: """Decode the predicted bboxes according to prediction and base boxes.""" diff --git a/visdet/structures/bbox/horizontal_boxes.py b/visdet/structures/bbox/horizontal_boxes.py index 1691aeea..eb4ff98d 100644 --- a/visdet/structures/bbox/horizontal_boxes.py +++ b/visdet/structures/bbox/horizontal_boxes.py @@ -1,7 +1,6 @@ # ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import TypeVar, Union +from typing import Literal, TypeVar, Union, cast import cv2 import numpy as np @@ -52,8 +51,8 @@ class HorizontalBoxes(BaseBoxes): def __init__( self, data: Tensor | np.ndarray, - dtype: torch.dtype = None, - device: DeviceType = None, + dtype: torch.dtype | None = None, + device: DeviceType | None = None, clone: bool = True, in_mode: str | None = None, ) -> None: @@ -247,8 +246,8 @@ def rescale_(self, scale_factor: tuple[float, float]) -> None: """ boxes = self.tensor assert len(scale_factor) == 2 - scale_factor = boxes.new_tensor(scale_factor).repeat(2) - self.tensor = boxes * scale_factor + scale_factor_tensor = boxes.new_tensor(scale_factor).repeat(2) + self.tensor = boxes * scale_factor_tensor def resize_(self, scale_factor: tuple[float, float]) -> None: """Resize the box width and height w.r.t scale_factor in-place. @@ -267,8 +266,8 @@ def resize_(self, scale_factor: tuple[float, float]) -> None: assert len(scale_factor) == 2 ctrs = (boxes[..., 2:] + boxes[..., :2]) / 2 wh = boxes[..., 2:] - boxes[..., :2] - scale_factor = boxes.new_tensor(scale_factor) - wh = wh * scale_factor + scale_factor_tensor = boxes.new_tensor(scale_factor) + wh = wh * scale_factor_tensor xy1 = ctrs - 0.5 * wh xy2 = ctrs + 0.5 * wh self.tensor = torch.cat([xy1, xy2], dim=-1) @@ -296,19 +295,21 @@ def is_inside( img_h, img_w = img_shape boxes = self.tensor if all_inside: - return ( + result = ( (boxes[:, 0] >= -allowed_border) & (boxes[:, 1] >= -allowed_border) & (boxes[:, 2] < img_w + allowed_border) & (boxes[:, 3] < img_h + allowed_border) ) + return cast(BoolTensor, result) else: - return ( + result = ( (boxes[..., 0] < img_w + allowed_border) & (boxes[..., 1] < img_h + allowed_border) & (boxes[..., 2] > -allowed_border) & (boxes[..., 3] > -allowed_border) ) + return cast(BoolTensor, result) def find_inside_points(self, points: Tensor, is_aligned: bool = False) -> BoolTensor: """Find inside box points. Boxes dimension must be 2. 
@@ -335,12 +336,13 @@ def find_inside_points(self, points: Tensor, is_aligned: bool = False) -> BoolTe assert boxes.size(0) == points.size(0) x_min, y_min, x_max, y_max = boxes.unbind(dim=-1) - return ( + result = ( (points[..., 0] >= x_min) & (points[..., 0] <= x_max) & (points[..., 1] >= y_min) & (points[..., 1] <= y_max) ) + return cast(BoolTensor, result) def create_masks(self, img_shape: tuple[int, int]) -> BitmapMasks: """ @@ -388,7 +390,8 @@ def overlaps( """ boxes1 = boxes1.convert_to("hbox") boxes2 = boxes2.convert_to("hbox") - return bbox_overlaps(boxes1.tensor, boxes2.tensor, mode=mode, is_aligned=is_aligned, eps=eps) + mode_literal = cast(Literal["iou", "iof", "giou"], mode) + return bbox_overlaps(boxes1.tensor, boxes2.tensor, mode=mode_literal, is_aligned=is_aligned, eps=eps) @staticmethod def from_instance_masks(masks: MaskType) -> "HorizontalBoxes": diff --git a/visdet/structures/bbox/transforms.py b/visdet/structures/bbox/transforms.py index 836ccc79..4f69d9d9 100644 --- a/visdet/structures/bbox/transforms.py +++ b/visdet/structures/bbox/transforms.py @@ -1,7 +1,6 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. from collections.abc import Sequence +from typing import Literal import numpy as np import torch @@ -25,20 +24,23 @@ def find_inside_bboxes(bboxes: Tensor, img_h: int, img_w: int) -> Tensor: return inside_inds -def bbox_flip(bboxes: Tensor, img_shape: tuple[int], direction: str = "horizontal") -> Tensor: +def bbox_flip( + bboxes: Tensor, + img_shape: tuple[int, int], + direction: Literal["horizontal", "vertical", "diagonal"] = "horizontal", +) -> Tensor: """Flip bboxes horizontally or vertically. Args: bboxes (Tensor): Shape (..., 4*k) - img_shape (Tuple[int]): Image shape. - direction (str): Flip direction, options are "horizontal", "vertical", - "diagonal". Default: "horizontal" + img_shape (tuple[int, int]): Image shape as (height, width). 
+ direction (Literal["horizontal", "vertical", "diagonal"]): Flip direction. + Default: "horizontal" Returns: Tensor: Flipped bboxes. """ assert bboxes.shape[-1] % 4 == 0 - assert direction in ["horizontal", "vertical", "diagonal"] flipped = bboxes.clone() if direction == "horizontal": flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] @@ -56,10 +58,10 @@ def bbox_flip(bboxes: Tensor, img_shape: tuple[int], direction: str = "horizonta def bbox_mapping( bboxes: Tensor, - img_shape: tuple[int], - scale_factor: float | tuple[float], + img_shape: tuple[int, int], + scale_factor: float | tuple[float, float], flip: bool, - flip_direction: str = "horizontal", + flip_direction: Literal["horizontal", "vertical", "diagonal"] = "horizontal", ) -> Tensor: """Map bboxes from the original image scale to testing scale.""" new_bboxes = bboxes * bboxes.new_tensor(scale_factor) @@ -70,10 +72,10 @@ def bbox_mapping( def bbox_mapping_back( bboxes: Tensor, - img_shape: tuple[int], - scale_factor: float | tuple[float], + img_shape: tuple[int, int], + scale_factor: float | tuple[float, float], flip: bool, - flip_direction: str = "horizontal", + flip_direction: Literal["horizontal", "vertical", "diagonal"] = "horizontal", ) -> Tensor: """Map bboxes from testing scale to original image scale.""" new_bboxes = bbox_flip(bboxes, img_shape, flip_direction) if flip else bboxes @@ -140,6 +142,7 @@ def bbox2result(bboxes: Tensor | np.ndarray, labels: Tensor | np.ndarray, num_cl else: if isinstance(bboxes, torch.Tensor): bboxes = bboxes.detach().cpu().numpy() + if isinstance(labels, torch.Tensor): labels = labels.detach().cpu().numpy() return [bboxes[labels == i, :] for i in range(num_classes)] @@ -182,7 +185,7 @@ def distance2bbox( # clip bboxes with dynamic `min` and `max` for onnx if torch.onnx.is_in_onnx_export(): # TODO: delete - from visdet.core.export import dynamic_clip_for_onnx + from visdet.core.export import dynamic_clip_for_onnx # type: ignore[import-not-found] x1, y1, x2, y2 = 
dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape) bboxes = torch.stack([x1, y1, x2, y2], dim=-1) @@ -220,8 +223,8 @@ def scale_boxes(boxes: Tensor | BaseBoxes, scale_factor: tuple[float, float]) -> else: # Tensor boxes will be treated as horizontal boxes repeat_num = int(boxes.size(-1) / 2) - scale_factor = boxes.new_tensor(scale_factor).repeat((1, repeat_num)) - return boxes * scale_factor + scale_factor_tensor = boxes.new_tensor(scale_factor).repeat((1, repeat_num)) + return boxes * scale_factor_tensor def get_box_tensor(boxes: Tensor | BaseBoxes) -> Tensor: @@ -324,7 +327,7 @@ def bbox_xyxy_to_cxcywh(bbox: Tensor) -> Tensor: return torch.cat(bbox_new, dim=-1) -def bbox2corner(bboxes: torch.Tensor) -> torch.Tensor: +def bbox2corner(bboxes: Tensor) -> Tensor: """Convert bbox coordinates from (x1, y1, x2, y2) to corners ((x1, y1), (x2, y1), (x1, y2), (x2, y2)). @@ -337,7 +340,7 @@ def bbox2corner(bboxes: torch.Tensor) -> torch.Tensor: return torch.cat([x1, y1, x2, y1, x1, y2, x2, y2], dim=1).reshape(-1, 2) -def corner2bbox(corners: torch.Tensor) -> torch.Tensor: +def corner2bbox(corners: Tensor) -> Tensor: """Convert bbox coordinates from corners ((x1, y1), (x2, y1), (x1, y2), (x2, y2)) to (x1, y1, x2, y2). @@ -353,10 +356,10 @@ def corner2bbox(corners: torch.Tensor) -> torch.Tensor: def bbox_project( - bboxes: torch.Tensor | np.ndarray, - homography_matrix: torch.Tensor | np.ndarray, + bboxes: Tensor | np.ndarray, + homography_matrix: Tensor | np.ndarray, img_shape: tuple[int, int] | None = None, -) -> torch.Tensor | np.ndarray: +) -> Tensor | np.ndarray: """Geometric transformation for bbox. 
Args: @@ -372,6 +375,9 @@ def bbox_project( bboxes = torch.from_numpy(bboxes) if isinstance(homography_matrix, np.ndarray): homography_matrix = torch.from_numpy(homography_matrix) + + # At this point bboxes must be a Tensor + assert isinstance(bboxes, torch.Tensor) corners = bbox2corner(bboxes) corners = torch.cat([corners, corners.new_ones(corners.shape[0], 1)], dim=1) corners = torch.matmul(homography_matrix, corners.t()).t() @@ -401,7 +407,9 @@ def cat_boxes(data_list: list[Tensor | BaseBoxes], dim: int = 0) -> Tensor | Bas if data_list and isinstance(data_list[0], BaseBoxes): return data_list[0].cat(data_list, dim=dim) else: - return torch.cat(data_list, dim=dim) + # Type checker needs to know these are all Tensors + tensor_list: list[Tensor] = [x for x in data_list if isinstance(x, Tensor)] + return torch.cat(tensor_list, dim=dim) def stack_boxes(data_list: list[Tensor | BaseBoxes], dim: int = 0) -> Tensor | BaseBoxes: @@ -419,29 +427,9 @@ def stack_boxes(data_list: list[Tensor | BaseBoxes], dim: int = 0) -> Tensor | B if data_list and isinstance(data_list[0], BaseBoxes): return data_list[0].stack(data_list, dim=dim) else: - return torch.stack(data_list, dim=dim) - - -def scale_boxes(boxes: Tensor | BaseBoxes, scale_factor: tuple[float, float]) -> Tensor | BaseBoxes: - """Scale boxes with type of tensor or box type. - - Args: - boxes (Tensor or :obj:`BaseBoxes`): boxes need to be scaled. Its type - can be a tensor or a box type. - scale_factor (Tuple[float, float]): factors for scaling boxes. - The length should be 2. - - Returns: - Union[Tensor, :obj:`BaseBoxes`]: Scaled boxes. 
- """ - if isinstance(boxes, BaseBoxes): - boxes.rescale_(scale_factor) - return boxes - else: - # Tensor boxes will be treated as horizontal boxes - repeat_num = int(boxes.size(-1) / 2) - scale_factor = boxes.new_tensor(scale_factor).repeat((1, repeat_num)) - return boxes * scale_factor + # Type checker needs to know these are all Tensors + tensor_list: list[Tensor] = [x for x in data_list if isinstance(x, Tensor)] + return torch.stack(tensor_list, dim=dim) def get_box_wh(boxes: Tensor | BaseBoxes) -> tuple[Tensor, Tensor]: @@ -464,22 +452,6 @@ def get_box_wh(boxes: Tensor | BaseBoxes) -> tuple[Tensor, Tensor]: return w, h -def get_box_tensor(boxes: Tensor | BaseBoxes) -> Tensor: - """Get tensor data from box type boxes. - - Args: - boxes (Tensor or BaseBoxes): boxes with type of tensor or box type. - If its type is a tensor, the boxes will be directly returned. - If its type is a box type, the `boxes.tensor` will be returned. - - Returns: - Tensor: boxes tensor. - """ - if isinstance(boxes, BaseBoxes): - boxes = boxes.tensor - return boxes - - def empty_box_as(boxes: Tensor | BaseBoxes) -> Tensor | BaseBoxes: """Generate empty box according to input ``boxes` type and device. @@ -497,7 +469,7 @@ def empty_box_as(boxes: Tensor | BaseBoxes) -> Tensor | BaseBoxes: return boxes.new_zeros(0, 4) -def bbox_xyxy_to_cxcyah(bboxes: torch.Tensor) -> torch.Tensor: +def bbox_xyxy_to_cxcyah(bboxes: Tensor) -> Tensor: """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, ratio, h). Args: @@ -514,7 +486,7 @@ def bbox_xyxy_to_cxcyah(bboxes: torch.Tensor) -> torch.Tensor: return xyah -def bbox_cxcyah_to_xyxy(bboxes: torch.Tensor) -> torch.Tensor: +def bbox_cxcyah_to_xyxy(bboxes: Tensor) -> Tensor: """Convert bbox coordinates from (cx, cy, ratio, h) to (x1, y1, x2, y2). 
Args: diff --git a/visdet/structures/det_data_sample.py b/visdet/structures/det_data_sample.py index 71244f14..d9de95e1 100644 --- a/visdet/structures/det_data_sample.py +++ b/visdet/structures/det_data_sample.py @@ -1,7 +1,6 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional + +from typing import TYPE_CHECKING, Any, overload from visdet.engine.structures import BaseDataElement, InstanceData, PixelData @@ -118,101 +117,109 @@ class DetDataSample(BaseDataElement): """ @property - def proposals(self) -> InstanceData: + def proposals(self) -> InstanceData | None: return getattr(self, "_proposals", None) @proposals.setter - def proposals(self, value: InstanceData): + def proposals(self, value: InstanceData) -> None: self.set_field(value, "_proposals", dtype=InstanceData) @proposals.deleter - def proposals(self): - del self._proposals + def proposals(self) -> None: + del self._proposals # type: ignore[has-type] @property - def gt_instances(self) -> InstanceData: + def gt_instances(self) -> InstanceData | None: return getattr(self, "_gt_instances", None) @gt_instances.setter - def gt_instances(self, value: InstanceData): + def gt_instances(self, value: InstanceData) -> None: self.set_field(value, "_gt_instances", dtype=InstanceData) @gt_instances.deleter - def gt_instances(self): - del self._gt_instances + def gt_instances(self) -> None: + del self._gt_instances # type: ignore[has-type] @property - def pred_instances(self) -> InstanceData: + def pred_instances(self) -> InstanceData | None: return getattr(self, "_pred_instances", None) @pred_instances.setter - def pred_instances(self, value: InstanceData): + def pred_instances(self, value: InstanceData) -> None: self.set_field(value, "_pred_instances", dtype=InstanceData) @pred_instances.deleter - def pred_instances(self): - del self._pred_instances + def pred_instances(self) -> None: + del self._pred_instances # type: ignore[has-type] @property - def 
ignored_instances(self) -> InstanceData: + def ignored_instances(self) -> InstanceData | None: return getattr(self, "_ignored_instances", None) @ignored_instances.setter - def ignored_instances(self, value: InstanceData): + def ignored_instances(self, value: InstanceData) -> None: self.set_field(value, "_ignored_instances", dtype=InstanceData) @ignored_instances.deleter - def ignored_instances(self): - del self._ignored_instances + def ignored_instances(self) -> None: + del self._ignored_instances # type: ignore[has-type] @property - def gt_panoptic_seg(self) -> PixelData: + def gt_panoptic_seg(self) -> PixelData | None: return getattr(self, "_gt_panoptic_seg", None) @gt_panoptic_seg.setter - def gt_panoptic_seg(self, value: PixelData): + def gt_panoptic_seg(self, value: PixelData) -> None: self.set_field(value, "_gt_panoptic_seg", dtype=PixelData) @gt_panoptic_seg.deleter - def gt_panoptic_seg(self): - del self._gt_panoptic_seg + def gt_panoptic_seg(self) -> None: + del self._gt_panoptic_seg # type: ignore[has-type] @property - def pred_panoptic_seg(self) -> PixelData: + def pred_panoptic_seg(self) -> PixelData | None: return getattr(self, "_pred_panoptic_seg", None) @pred_panoptic_seg.setter - def pred_panoptic_seg(self, value: PixelData): + def pred_panoptic_seg(self, value: PixelData) -> None: self.set_field(value, "_pred_panoptic_seg", dtype=PixelData) @pred_panoptic_seg.deleter - def pred_panoptic_seg(self): - del self._pred_panoptic_seg + def pred_panoptic_seg(self) -> None: + del self._pred_panoptic_seg # type: ignore[has-type] @property - def gt_sem_seg(self) -> PixelData: + def gt_sem_seg(self) -> PixelData | None: return getattr(self, "_gt_sem_seg", None) @gt_sem_seg.setter - def gt_sem_seg(self, value: PixelData): + def gt_sem_seg(self, value: PixelData) -> None: self.set_field(value, "_gt_sem_seg", dtype=PixelData) @gt_sem_seg.deleter - def gt_sem_seg(self): - del self._gt_sem_seg + def gt_sem_seg(self) -> None: + del self._gt_sem_seg # type: 
ignore[has-type] @property - def pred_sem_seg(self) -> PixelData: + def pred_sem_seg(self) -> PixelData | None: return getattr(self, "_pred_sem_seg", None) @pred_sem_seg.setter - def pred_sem_seg(self, value: PixelData): + def pred_sem_seg(self, value: PixelData) -> None: self.set_field(value, "_pred_sem_seg", dtype=PixelData) @pred_sem_seg.deleter - def pred_sem_seg(self): - del self._pred_sem_seg + def pred_sem_seg(self) -> None: + del self._pred_sem_seg # type: ignore[has-type] + + # Provide specific type hints for common attributes + if TYPE_CHECKING: + # These are commonly accessed metainfo attributes in visualization code + img_path: str + text: str | list[str] + tokens_positive: list[list[tuple[int, int]]] + phrase_ids: list[int] SampleList = list[DetDataSample] -OptSampleList = Optional[SampleList] +OptSampleList = SampleList | None diff --git a/visdet/structures/mask/utils.py b/visdet/structures/mask/utils.py index 75b37707..34e2c56f 100644 --- a/visdet/structures/mask/utils.py +++ b/visdet/structures/mask/utils.py @@ -1,13 +1,17 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any + import numpy as np import pycocotools.mask as mask_util import torch +from torch import Tensor + from visdet.engine.utils import slice_list -def split_combined_polys(polys, poly_lens, polys_per_mask): +def split_combined_polys( + polys: list[Tensor], poly_lens: list[Tensor], polys_per_mask: list[Tensor] +) -> list[list[list[Any]]]: """Split the combined 1-D polys into masks. A mask is represented as a list of polys, and a poly is represented as @@ -37,7 +41,7 @@ def split_combined_polys(polys, poly_lens, polys_per_mask): # TODO: move this function to more proper place -def encode_mask_results(mask_results): +def encode_mask_results(mask_results: list[np.ndarray]) -> list[dict[str, Any]]: """Encode bitmap mask to RLE code. 
Args: @@ -54,7 +58,7 @@ def encode_mask_results(mask_results): return encoded_mask_results -def mask2bbox(masks): +def mask2bbox(masks: Tensor) -> Tensor: """Obtain tight bounding boxes of binary masks. Args: diff --git a/visdet/tests/test_models/test_backbones/test_swin.py b/visdet/tests/test_models/test_backbones/test_swin.py index 881c3a4b..ecb1150c 100644 --- a/visdet/tests/test_models/test_backbones/test_swin.py +++ b/visdet/tests/test_models/test_backbones/test_swin.py @@ -1,4 +1,4 @@ -import pytest +import pytest # type: ignore[import-not-found] import torch from visdet.models.backbones.swin import ( diff --git a/visdet/tests/test_models/test_roi_heads/test_bbox_heads.py b/visdet/tests/test_models/test_roi_heads/test_bbox_heads.py index bcb34c76..80eece8d 100644 --- a/visdet/tests/test_models/test_roi_heads/test_bbox_heads.py +++ b/visdet/tests/test_models/test_roi_heads/test_bbox_heads.py @@ -1,6 +1,6 @@ """Test cases for bbox heads.""" -import pytest +import pytest # type: ignore[import-not-found] import torch from visdet.engine.config import Config diff --git a/visdet/tests/test_models/test_roi_heads/test_cascade_roi_head.py b/visdet/tests/test_models/test_roi_heads/test_cascade_roi_head.py index e06db3e9..05088ba6 100644 --- a/visdet/tests/test_models/test_roi_heads/test_cascade_roi_head.py +++ b/visdet/tests/test_models/test_roi_heads/test_cascade_roi_head.py @@ -1,6 +1,6 @@ """Tests for CascadeRoIHead to validate cascade logic and mask prediction.""" -import pytest +import pytest # type: ignore[import-not-found] import torch from visdet.engine.config import ConfigDict diff --git a/visdet/utils/setup_env.py b/visdet/utils/setup_env.py index 692e7dd0..7d589501 100644 --- a/visdet/utils/setup_env.py +++ b/visdet/utils/setup_env.py @@ -32,6 +32,7 @@ def register_all_modules(init_default_scope: bool = True) -> None: DefaultScope.get_instance("visdet", scope_name="visdet") return current_scope = DefaultScope.get_current_instance() + assert current_scope is 
not None, "DefaultScope instance should exist at this point" if current_scope.scope_name != "visdet": warnings.warn( "The current default scope " diff --git a/visdet/utils/typing_utils.py b/visdet/utils/typing_utils.py index 0214403f..18e76bec 100644 --- a/visdet/utils/typing_utils.py +++ b/visdet/utils/typing_utils.py @@ -1,28 +1,24 @@ -# ruff: noqa -# type: ignore # Copyright (c) OpenMMLab. All rights reserved. """Collecting some commonly used type hint in mmdetection.""" from collections.abc import Sequence -from typing import Optional, Union +from typing import Any, Union from visdet.engine.config import ConfigDict from visdet.engine.structures import InstanceData, PixelData # TODO: Need to avoid circular import with assigner and sampler # Type hint of config data -from typing import Dict, Any - -ConfigType = Union[ConfigDict, dict, str, Dict[str, Any]] -OptConfigType = Optional[ConfigType] +ConfigType = Union[ConfigDict, dict, str, dict[str, Any]] +OptConfigType = ConfigType | None # Type hint of one or more config data MultiConfig = Union[ConfigType, list[ConfigType]] -OptMultiConfig = Optional[MultiConfig] +OptMultiConfig = MultiConfig | None InstanceList = list[InstanceData] -OptInstanceList = Optional[InstanceList] +OptInstanceList = InstanceList | None PixelList = list[PixelData] -OptPixelList = Optional[PixelList] +OptPixelList = PixelList | None RangeType = Sequence[tuple[int, int]] diff --git a/visdet/visualization/local_visualizer.py b/visdet/visualization/local_visualizer.py index 6972676e..ec2e8877 100644 --- a/visdet/visualization/local_visualizer.py +++ b/visdet/visualization/local_visualizer.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Optional +from __future__ import annotations + +from typing import Any, Sequence, cast import cv2 import numpy as np @@ -12,9 +14,13 @@ from visdet.evaluation import INSTANCE_OFFSET from visdet.registry import VISUALIZERS from visdet.structures import DetDataSample +from visdet.structures.bbox import BaseBoxes from visdet.structures.mask import BitmapMasks, PolygonMasks, bitmap_to_polygon from visdet.visualization.palette import _get_adaptive_scales, get_palette, jitter_color +ColorTuple = tuple[int, int, int] +PaletteInput = list[ColorTuple] | ColorTuple | str | None + @VISUALIZERS.register_module() class DetLocalVisualizer(Visualizer): @@ -80,9 +86,9 @@ def __init__( image: np.ndarray | None = None, vis_backends: dict | None = None, save_dir: str | None = None, - bbox_color: str | tuple[int] | None = None, - text_color: str | tuple[int] | None = (200, 200, 200), - mask_color: str | tuple[int] | None = None, + bbox_color: str | ColorTuple | None = None, + text_color: str | ColorTuple = (200, 200, 200), + mask_color: str | ColorTuple | None = None, line_width: int | float = 3, alpha: float = 0.8, ) -> None: @@ -95,14 +101,50 @@ def __init__( # Set default value. When calling # `DetLocalVisualizer().dataset_meta=xxx`, # it will override the default value. - self.dataset_meta = {} + # Meta information attached by runner/metrics; values may vary so keep + # loose typing internally and normalize on read. 
+ self.dataset_meta: dict[str, Any] = {} + + def _meta_classes(self) -> list[str] | None: + meta = self.dataset_meta or {} + classes = meta.get("classes") + if isinstance(classes, Sequence) and all(isinstance(name, str) for name in classes): + return list(classes) + return None + + def _meta_palette(self) -> PaletteInput: + meta = self.dataset_meta or {} + palette = meta.get("palette") + return self._normalize_palette_input(palette) + + def _normalize_palette_input(self, palette: Any) -> PaletteInput: + if palette is None: + return None + if isinstance(palette, list): + normalized: list[ColorTuple] = [] + for color in palette: + if isinstance(color, tuple): + normalized.append(cast(ColorTuple, tuple(int(c) for c in color[:3]))) + elif isinstance(color, list): + normalized.append(cast(ColorTuple, tuple(int(c) for c in color[:3]))) + return normalized or None + if isinstance(palette, np.ndarray): + if palette.ndim == 2: + return [cast(ColorTuple, tuple(int(c) for c in row.tolist()[:3])) for row in palette] + if palette.ndim == 1: + return cast(ColorTuple, tuple(int(c) for c in palette.tolist()[:3])) + if isinstance(palette, tuple): + return cast(ColorTuple, tuple(int(c) for c in palette[:3])) + if isinstance(palette, str): + return palette + return None def _draw_instances( self, image: np.ndarray, - instances: ["InstanceData"], + instances: InstanceData, classes: list[str] | None, - palette: list[tuple] | None, + palette: PaletteInput, ) -> np.ndarray: """Draw instances of GT or prediction. 
@@ -119,54 +161,71 @@ def _draw_instances( """ self.set_image(image) - if "bboxes" in instances and instances.bboxes.sum() > 0: - bboxes = instances.bboxes - labels = instances.labels + if "bboxes" in instances: + bboxes_raw = instances.bboxes + # Convert BaseBoxes to tensor + if isinstance(bboxes_raw, BaseBoxes): + bboxes = bboxes_raw.tensor + else: + bboxes = bboxes_raw - max_label = int(max(labels) if len(labels) > 0 else 0) - text_palette = get_palette(self.text_color, max_label + 1) - text_colors = [text_palette[label] for label in labels] - - bbox_color = palette if self.bbox_color is None else self.bbox_color - bbox_palette = get_palette(bbox_color, max_label + 1) - colors = [bbox_palette[label] for label in labels] - self.draw_bboxes( - bboxes, - edge_colors=colors, - alpha=self.alpha, - line_widths=self.line_width, - ) + if bboxes.sum() > 0: + label_tensor = cast(torch.Tensor, instances.labels) + label_ids = label_tensor.to(dtype=torch.int64).tolist() - positions = bboxes[:, :2] + self.line_width - areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) - scales = _get_adaptive_scales(areas) + max_label = int(max(label_ids) if len(label_ids) > 0 else 0) + text_palette = get_palette(self.text_color, max_label + 1) + text_colors = [text_palette[label] for label in label_ids] - for i, (pos, label) in enumerate(zip(positions, labels)): - if "label_names" in instances: - label_text = instances.label_names[i] + if self.bbox_color is None: + bbox_color: PaletteInput = palette else: - label_text = classes[label] if classes is not None else f"class {label}" - if "scores" in instances: - score = round(float(instances.scores[i]) * 100, 1) - label_text += f": {score}" - - self.draw_texts( - label_text, - pos, - colors=text_colors[i], - font_sizes=int(13 * scales[i]), - bboxes=[ - { - "facecolor": "black", - "alpha": 0.8, - "pad": 0.7, - "edgecolor": "none", - } - ], + bbox_color = cast(PaletteInput, self.bbox_color) + bbox_palette = 
get_palette(bbox_color, max_label + 1) + colors = [bbox_palette[label] for label in label_ids] + self.draw_bboxes( + bboxes, + edge_colors=colors, + alpha=self.alpha, + line_widths=self.line_width, ) + positions = bboxes[:, :2] + self.line_width + if isinstance(positions, torch.Tensor): + positions = positions.cpu().numpy() + areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) + # Convert to numpy if it's a tensor + if isinstance(areas, torch.Tensor): + areas = areas.cpu().numpy() + scales = _get_adaptive_scales(areas) + + for i, (pos, label) in enumerate(zip(positions, label_ids)): + if "label_names" in instances: + label_text = str(instances.label_names[i]) + else: + label_text = classes[label] if classes is not None else f"class {label}" + if "scores" in instances: + score = round(float(instances.scores[i]) * 100, 1) + label_text += f": {score}" + + self.draw_texts( + label_text, + pos, + colors=text_colors[i], + font_sizes=int(13 * scales[i]), + bboxes=[ + { + "facecolor": "black", + "alpha": 0.8, + "pad": 0.7, + "edgecolor": "none", + } + ], + ) + if "masks" in instances: - labels = instances.labels + label_tensor = cast(torch.Tensor, instances.labels) + label_ids = label_tensor.to(dtype=torch.int64).tolist() masks = instances.masks if isinstance(masks, torch.Tensor): masks = masks.numpy() @@ -182,14 +241,17 @@ def _draw_instances( logger.debug(f"Image shape: {image.shape}") logger.debug(f"Masks shape: {masks.shape}") logger.debug(f"Masks dtype: {masks.dtype}") - logger.debug(f"Number of instances: {len(labels)}") + logger.debug(f"Number of instances: {len(label_ids)}") - max_label = int(max(labels) if len(labels) > 0 else 0) - mask_color = palette if self.mask_color is None else self.mask_color + max_label = int(max(label_ids) if len(label_ids) > 0 else 0) + if self.mask_color is None: + mask_color: PaletteInput = palette + else: + mask_color = cast(PaletteInput, self.mask_color) mask_palette = get_palette(mask_color, max_label + 1) - colors 
= [jitter_color(mask_palette[label]) for label in labels] + colors = [jitter_color(mask_palette[label]) for label in label_ids] text_palette = get_palette(self.text_color, max_label + 1) - text_colors = [text_palette[label] for label in labels] + text_colors = [text_palette[label] for label in label_ids] polygons = [] for i, mask in enumerate(masks): @@ -198,7 +260,16 @@ def _draw_instances( self.draw_polygons(polygons, edge_colors="w", alpha=self.alpha) self.draw_binary_masks(masks, colors=colors, alphas=self.alpha) - if len(labels) > 0 and ("bboxes" not in instances or instances.bboxes.sum() == 0): + # Check if we need to draw text labels for masks + has_valid_bboxes = False + if "bboxes" in instances: + bboxes_raw = instances.bboxes + if isinstance(bboxes_raw, BaseBoxes): + has_valid_bboxes = bboxes_raw.tensor.sum() > 0 + else: + has_valid_bboxes = bboxes_raw.sum() > 0 + + if len(label_ids) > 0 and not has_valid_bboxes: # instances.bboxes.sum()==0 represent dummy bboxes. # A typical example of SOLO does not exist bbox branch. areas = [] @@ -212,7 +283,7 @@ def _draw_instances( areas = np.stack(areas, axis=0) scales = _get_adaptive_scales(areas) - for i, (pos, label) in enumerate(zip(positions, labels)): + for i, (pos, label) in enumerate(zip(positions, label_ids)): if "label_names" in instances: label_text = instances.label_names[i] else: @@ -241,9 +312,9 @@ def _draw_instances( def _draw_panoptic_seg( self, image: np.ndarray, - panoptic_seg: ["PixelData"], + panoptic_seg: PixelData, classes: list[str] | None, - palette: list | None, + palette: PaletteInput, ) -> np.ndarray: """Draw panoptic seg of GT or prediction. @@ -257,7 +328,10 @@ def _draw_panoptic_seg( np.ndarray: the drawn image which channel is RGB. """ # TODO: Is there a way to bypass? 
- num_classes = len(classes) + if classes is None: + raise ValueError("classes should not be None when drawing panoptic segmentation") + class_list = list(classes) + num_classes = len(class_list) panoptic_seg_data = panoptic_seg.sem_seg[0] @@ -265,8 +339,8 @@ def _draw_panoptic_seg( if "label_names" in panoptic_seg: # open set panoptic segmentation - classes = panoptic_seg.metainfo["label_names"] - ignore_index = panoptic_seg.metainfo.get("ignore_index", len(classes)) + class_list = list(panoptic_seg.metainfo["label_names"]) + ignore_index = panoptic_seg.metainfo.get("ignore_index", len(class_list)) ids = ids[ids != ignore_index] else: # for VOID label @@ -274,11 +348,15 @@ def _draw_panoptic_seg( labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64) segms = panoptic_seg_data[None] == ids[:, None, None] + segms = np.asarray(segms, dtype=bool) max_label = int(max(labels) if len(labels) > 0 else 0) - mask_color = palette if self.mask_color is None else self.mask_color - mask_palette = get_palette(mask_color, max_label + 1) + if palette is not None: + mask_color_input: PaletteInput = palette + else: + mask_color_input = cast(PaletteInput, self.mask_color) + mask_palette = get_palette(mask_color_input, max_label + 1) colors = [mask_palette[label] for label in labels] self.set_image(image) @@ -306,7 +384,7 @@ def _draw_panoptic_seg( text_colors = [text_palette[label] for label in labels] for i, (pos, label) in enumerate(zip(positions, labels)): - label_text = classes[label] + label_text = class_list[label] if label < len(class_list) else f"class {label}" self.draw_texts( label_text, @@ -329,8 +407,8 @@ def _draw_sem_seg( self, image: np.ndarray, sem_seg: PixelData, - classes: list | None, - palette: list | None, + classes: list[str] | None, + palette: PaletteInput, ) -> np.ndarray: """Draw semantic seg of GT or prediction. 
@@ -362,20 +440,26 @@ def _draw_sem_seg( if "label_names" in sem_seg: # open set semseg - label_names = sem_seg.metainfo["label_names"] + label_names_seq = sem_seg.metainfo["label_names"] + label_names = list(label_names_seq) else: + if classes is None: + raise ValueError("label_names should not be None") label_names = classes + palette_source: PaletteInput = palette if palette is not None else self.mask_color + palette_list = get_palette(palette_source, len(label_names)) + labels = np.array(ids, dtype=np.int64) - colors = [palette[label] for label in labels] + colors = [palette_list[label] for label in labels] self.set_image(image) # draw semantic masks for i, (label, color) in enumerate(zip(labels, colors)): - masks = sem_seg_data == label + masks = (sem_seg_data == label).astype(bool) self.draw_binary_masks(masks, colors=[color], alphas=self.alpha) - label_text = label_names[label] + label_text = label_names[label] if label < len(label_names) else f"class {label}" _, _, stats, centroids = cv2.connectedComponentsWithStats(masks[0].astype(np.uint8), connectivity=8) if stats.shape[0] > 1: largest_id = np.argmax(stats[1:, -1]) + 1 @@ -407,7 +491,7 @@ def add_datasample( self, name: str, image: np.ndarray, - data_sample: Optional["DetDataSample"] = None, + data_sample: DetDataSample | None = None, draw_gt: bool = True, draw_pred: bool = True, show: bool = False, @@ -445,8 +529,8 @@ def add_datasample( step (int): Global step value to record. Defaults to 0. 
""" image = image.clip(0, 255).astype(np.uint8) - classes = self.dataset_meta.get("classes", None) - palette = self.dataset_meta.get("palette", None) + classes = self._meta_classes() + palette = self._meta_palette() gt_img_data = None pred_img_data = None @@ -456,12 +540,12 @@ def add_datasample( if draw_gt and data_sample is not None: gt_img_data = image - if "gt_instances" in data_sample: + if "gt_instances" in data_sample and data_sample.gt_instances is not None: gt_img_data = self._draw_instances(image, data_sample.gt_instances, classes, palette) - if "gt_sem_seg" in data_sample: + if "gt_sem_seg" in data_sample and data_sample.gt_sem_seg is not None: gt_img_data = self._draw_sem_seg(gt_img_data, data_sample.gt_sem_seg, classes, palette) - if "gt_panoptic_seg" in data_sample: + if "gt_panoptic_seg" in data_sample and data_sample.gt_panoptic_seg is not None: assert classes is not None, ( "class information is not provided when visualizing panoptic segmentation results." ) @@ -469,15 +553,15 @@ def add_datasample( if draw_pred and data_sample is not None: pred_img_data = image - if "pred_instances" in data_sample: + if "pred_instances" in data_sample and data_sample.pred_instances is not None: pred_instances = data_sample.pred_instances pred_instances = pred_instances[pred_instances.scores > pred_score_thr] pred_img_data = self._draw_instances(image, pred_instances, classes, palette) - if "pred_sem_seg" in data_sample: + if "pred_sem_seg" in data_sample and data_sample.pred_sem_seg is not None: pred_img_data = self._draw_sem_seg(pred_img_data, data_sample.pred_sem_seg, classes, palette) - if "pred_panoptic_seg" in data_sample: + if "pred_panoptic_seg" in data_sample and data_sample.pred_panoptic_seg is not None: assert classes is not None, ( "class information is not provided when visualizing panoptic segmentation results." 
) diff --git a/visdet/visualization/palette.py b/visdet/visualization/palette.py index 75d2a677..8f506378 100644 --- a/visdet/visualization/palette.py +++ b/visdet/visualization/palette.py @@ -1,11 +1,32 @@ # ruff: noqa # Copyright (c) OpenMMLab. All rights reserved. +from __future__ import annotations + +from collections.abc import Sequence +from typing import Any, cast + import numpy as np -from visdet.engine.utils import is_str + +ColorTuple = tuple[int, int, int] + + +def _as_color_tuple(color: Any) -> ColorTuple | None: + if isinstance(color, np.ndarray): + values = color.tolist() + ints = [int(c) for c in values[:3]] + elif isinstance(color, (list, tuple)): + ints = [int(c) for c in list(color)[:3]] + else: + return None + if not ints: + return None + while len(ints) < 3: + ints.append(0) + return cast(ColorTuple, tuple(ints[:3])) -def palette_val(palette: list[tuple]) -> list[tuple]: +def palette_val(palette: Sequence[Sequence[int]] | Sequence[int]) -> list[ColorTuple]: """Convert palette to matplotlib palette. Args: @@ -16,35 +37,63 @@ def palette_val(palette: list[tuple]) -> list[tuple]: """ new_palette = [] for color in palette: - color = [c / 255 for c in color] + color_tuple = _as_color_tuple(color) + if color_tuple is None: + continue + color = [c / 255 for c in color_tuple] new_palette.append(tuple(color)) return new_palette -def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tuple[int]]: +def get_palette( + palette: Sequence[Sequence[int]] | Sequence[int] | np.ndarray | str | ColorTuple | None, + num_classes: int, +) -> list[ColorTuple]: """Get palette from various inputs. Args: - palette (list[tuple] | str | tuple): palette inputs. + palette (list[tuple] | str | tuple | None): palette inputs. num_classes (int): the number of classes. Returns: - list[tuple[int]]: A list of color tuples. + list[tuple[int, ...]]: A list of color tuples. 
""" assert isinstance(num_classes, int) + dataset_palette: list[ColorTuple] | None = None if isinstance(palette, list): - dataset_palette = palette - elif isinstance(palette, tuple): - dataset_palette = [palette] * num_classes - elif palette == "random" or palette is None: + colors: list[ColorTuple] = [] + for color in palette: + color_tuple = _as_color_tuple(color) + if color_tuple is not None: + colors.append(color_tuple) + if colors: + dataset_palette = colors + elif isinstance(palette, np.ndarray): + if palette.ndim == 1: + color_tuple = _as_color_tuple(palette) + if color_tuple is not None: + dataset_palette = [color_tuple] * num_classes + else: + colors = [] + for row in palette: + color_tuple = _as_color_tuple(row) + if color_tuple is not None: + colors.append(color_tuple) + if colors: + dataset_palette = colors + else: + color_tuple = _as_color_tuple(palette) + if color_tuple is not None: + dataset_palette = [color_tuple] * num_classes + if dataset_palette is None and (palette == "random" or palette is None): state = np.random.get_state() # random color np.random.seed(42) palette = np.random.randint(0, 256, size=(num_classes, 3)) np.random.set_state(state) - dataset_palette = [tuple(c) for c in palette] - elif palette == "coco": + dataset_palette = [cast(ColorTuple, tuple(int(x) for x in c[:3])) for c in palette] + elif dataset_palette is None and palette == "coco": # For now, we'll use a predefined COCO palette # This avoids circular imports from datasets coco_palette = [ @@ -134,8 +183,8 @@ def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tu # Generate additional colors if needed np.random.seed(42) extra_colors = np.random.randint(0, 256, size=(num_classes - len(dataset_palette), 3)) - dataset_palette.extend([tuple(c) for c in extra_colors]) - elif palette == "citys": + dataset_palette.extend([cast(ColorTuple, tuple(int(x) for x in c[:3])) for c in extra_colors]) + elif dataset_palette is None and palette == "citys": # 
Cityscapes palette - simplified version citys_palette = [ (128, 64, 128), @@ -162,8 +211,8 @@ def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tu if len(dataset_palette) < num_classes: np.random.seed(42) extra_colors = np.random.randint(0, 256, size=(num_classes - len(dataset_palette), 3)) - dataset_palette.extend([tuple(c) for c in extra_colors]) - elif palette == "voc": + dataset_palette.extend([cast(ColorTuple, tuple(int(x) for x in c[:3])) for c in extra_colors]) + elif dataset_palette is None and palette == "voc": # VOC palette voc_palette = [ (0, 0, 0), @@ -192,8 +241,8 @@ def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tu if len(dataset_palette) < num_classes: np.random.seed(42) extra_colors = np.random.randint(0, 256, size=(num_classes - len(dataset_palette), 3)) - dataset_palette.extend([tuple(c) for c in extra_colors]) - elif is_str(palette): + dataset_palette.extend([cast(ColorTuple, tuple(int(x) for x in c[:3])) for c in extra_colors]) + elif dataset_palette is None and isinstance(palette, str): # Convert color string to RGB tuple # Simple color name to RGB mapping color_map = { @@ -208,14 +257,17 @@ def get_palette(palette: list[tuple] | str | tuple, num_classes: int) -> list[tu } rgb = color_map.get(palette.lower(), (128, 128, 128)) dataset_palette = [rgb] * num_classes - else: + if dataset_palette is None: raise TypeError(f"Invalid type for palette: {type(palette)}") - assert len(dataset_palette) >= num_classes, "The length of palette should not be less than `num_classes`." 
-    return dataset_palette
+    if len(dataset_palette) < num_classes:
+        last_color = dataset_palette[-1] if dataset_palette else (0, 0, 0)
+        dataset_palette = dataset_palette + [last_color] * (num_classes - len(dataset_palette))
+    return dataset_palette[:num_classes]
 
 
-def _get_adaptive_scales(areas: np.ndarray, min_area: int = 800, max_area: int = 30000) -> np.ndarray:
+
+def _get_adaptive_scales(areas: np.ndarray | float, min_area: int = 800, max_area: int = 30000) -> np.ndarray:
     """Get adaptive scales according to areas.
 
     The scale range is [0.5, 1.0]. When the area is less than
@@ -223,19 +275,24 @@
     ``max_area``, the scale is 1.0.
 
     Args:
-        areas (ndarray): The areas of bboxes or masks with the
-            shape of (n, ).
+        areas (ndarray | float): The areas of bboxes or masks with the
+            shape of (n, ) or a single float value.
         min_area (int): Lower bound areas for adaptive scales.
             Defaults to 800.
         max_area (int): Upper bound areas for adaptive scales.
             Defaults to 30000.
 
     Returns:
-        ndarray: The adaotive scales with the shape of (n, ).
+        ndarray: The adaptive scales with the shape of (n, ) or (1,).
""" - scales = 0.5 + (areas - min_area) // (max_area - min_area) - scales = np.clip(scales, 0.5, 1.0) - return scales + if isinstance(areas, np.ndarray): + scales = 0.5 + (areas - min_area) // (max_area - min_area) + scales = np.clip(scales, 0.5, 1.0) + return scales + else: + # Handle scalar case - convert to array + scale = 0.5 + (areas - min_area) / (max_area - min_area) + return np.array([np.clip(scale, 0.5, 1.0)]) def jitter_color(color: tuple) -> tuple: @@ -250,5 +307,5 @@ def jitter_color(color: tuple) -> tuple: """ jitter = np.random.rand(3) jitter = (jitter / np.linalg.norm(jitter) - 0.5) * 0.5 * 255 - color = np.clip(jitter + color, 0, 255).astype(np.uint8) - return tuple(color) + clipped = np.clip(jitter + color, 0, 255).astype(np.uint8) + return cast(ColorTuple, tuple(int(c) for c in clipped.tolist()[:3]))