From b56ceb28af3fcf27fed300afae70b907f51bb226 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Wed, 9 Jun 2021 12:52:15 +0800 Subject: [PATCH 01/96] [Feature]Segformer re-implementation --- configs/_base_/datasets/ade20k_align.py | 54 ++ configs/_base_/models/segformer.py | 35 ++ .../segformer_b0_512x512_160k_ade20k.py | 74 +++ mmseg/datasets/pipelines/transforms.py | 226 +++++++++ mmseg/models/backbones/__init__.py | 3 +- mmseg/models/backbones/mit.py | 479 ++++++++++++++++++ mmseg/models/decode_heads/__init__.py | 3 +- mmseg/models/decode_heads/segformer_head.py | 94 ++++ mmseg/models/utils/__init__.py | 5 +- mmseg/models/utils/helpers.py | 20 + 10 files changed, 990 insertions(+), 3 deletions(-) create mode 100644 configs/_base_/datasets/ade20k_align.py create mode 100644 configs/_base_/models/segformer.py create mode 100644 configs/segformer/segformer_b0_512x512_160k_ade20k.py create mode 100644 mmseg/models/backbones/mit.py create mode 100644 mmseg/models/decode_heads/segformer_head.py create mode 100644 mmseg/models/utils/helpers.py diff --git a/configs/_base_/datasets/ade20k_align.py b/configs/_base_/datasets/ade20k_align.py new file mode 100644 index 0000000000..51f24347ab --- /dev/null +++ b/configs/_base_/datasets/ade20k_align.py @@ -0,0 +1,54 @@ +# dataset settings +dataset_type = 'ADE20KDataset' +data_root = 'data/ade/ADEChallengeData2016' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +crop_size = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 512), + # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], + flip=False, + transforms=[ + dict(type='AlignedResize', keep_ratio=True, size_divisor=32), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/training', + ann_dir='annotations/training', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline)) diff --git a/configs/_base_/models/segformer.py b/configs/_base_/models/segformer.py new file mode 100644 index 0000000000..78498b11e5 --- /dev/null +++ b/configs/_base_/models/segformer.py @@ -0,0 +1,35 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained=None, + backbone=dict( + type='MixVisionTransformer', + img_size=224, + in_chans=3, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + 
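+        # drop_rate and attn_drop_rate are plain dropout probabilities;
+        # drop_path_rate below drives the stochastic depth (DropPath)
+        # schedule shared across the transformer blocks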
drop_path_rate=0., + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1]), + decode_head=dict( + type='SegFormerHead', + in_channels=[64, 128, 320, 512], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + decoder_params=dict(), + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/segformer/segformer_b0_512x512_160k_ade20k.py b/configs/segformer/segformer_b0_512x512_160k_ade20k.py new file mode 100644 index 0000000000..7c24194e1f --- /dev/null +++ b/configs/segformer/segformer_b0_512x512_160k_ade20k.py @@ -0,0 +1,74 @@ +_base_ = [ + '../_base_/models/segformer.py', '../_base_/datasets/ade20k_align.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='pretrain/mit_b0.pth', + backbone=dict( + patch_size=4, + embed_dims=[32, 64, 160, 256], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1), + decode_head=dict( + type='SegFormerHead', + in_channels=[32, 64, 160, 256], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + decoder_params=dict( + embed_dim=256, + depth=1, + num_heads=1, + mlp_ratio=3, + num_patches=128 * 128, + attn_ffn=False, + conv_cls_seg=True, + ), + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +find_unused_parameters = True + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0002, + betas=(0.9, 0.999), + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) +checkpoint_config = dict(by_epoch=False, interval=4000) +evaluation = dict(interval=4000, metric='mIoU') diff --git a/mmseg/datasets/pipelines/transforms.py b/mmseg/datasets/pipelines/transforms.py index 20753bb0fa..8800e84d34 100644 --- a/mmseg/datasets/pipelines/transforms.py +++ b/mmseg/datasets/pipelines/transforms.py @@ -6,6 +6,232 @@ from ..builder import PIPELINES +@PIPELINES.register_module() +class AlignedResize(object): + """Resize images & seg. 
Align + """ + + def __init__(self, + img_scale=None, + multiscale_mode='range', + ratio_range=None, + keep_ratio=True, + size_divisor=32): + if img_scale is None: + self.img_scale = None + else: + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + assert mmcv.is_list_of(self.img_scale, tuple) + + if ratio_range is not None: + # mode 1: given img_scale=None and a range of image ratio + # mode 2: given a scale and a range of image ratio + assert self.img_scale is None or len(self.img_scale) == 1 + else: + # mode 3 and 4: given multiple scales or a range of scales + assert multiscale_mode in ['value', 'range'] + + self.multiscale_mode = multiscale_mode + self.ratio_range = ratio_range + self.keep_ratio = keep_ratio + self.size_divisor = size_divisor + + @staticmethod + def random_select(img_scales): + """Randomly select an img_scale from given candidates. + + Args: + img_scales (list[tuple]): Images scales for selection. + + Returns: + (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, + where ``img_scale`` is the selected image scale and + ``scale_idx`` is the selected index in the given candidates. + """ + + assert mmcv.is_list_of(img_scales, tuple) + scale_idx = np.random.randint(len(img_scales)) + img_scale = img_scales[scale_idx] + return img_scale, scale_idx + + @staticmethod + def random_sample(img_scales): + """Randomly sample an img_scale when ``multiscale_mode=='range'``. + + Args: + img_scales (list[tuple]): Images scale range for sampling. + There must be two tuples in img_scales, which specify the lower + and uper bound of image scales. + + Returns: + (tuple, None): Returns a tuple ``(img_scale, None)``, where + ``img_scale`` is sampled scale and None is just a placeholder + to be consistent with :func:`random_select`. + """ + + assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), + max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), + max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale, None + + @staticmethod + def random_sample_ratio(img_scale, ratio_range): + """Randomly sample an img_scale when ``ratio_range`` is specified. + + A ratio will be randomly sampled from the range specified by + ``ratio_range``. Then it would be multiplied with ``img_scale`` to + generate sampled scale. + + Args: + img_scale (tuple): Images scale base to multiply with ratio. + ratio_range (tuple[float]): The minimum and maximum ratio to scale + the ``img_scale``. + + Returns: + (tuple, None): Returns a tuple ``(scale, None)``, where + ``scale`` is sampled ratio multiplied with ``img_scale`` and + None is just a placeholder to be consistent with + :func:`random_select`. + """ + + assert isinstance(img_scale, tuple) and len(img_scale) == 2 + min_ratio, max_ratio = ratio_range + assert min_ratio <= max_ratio + ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio + scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) + return scale, None + + def _random_scale(self, results): + """Randomly sample an img_scale according to ``ratio_range`` and + ``multiscale_mode``. + + If ``ratio_range`` is specified, a ratio will be sampled and be + multiplied with ``img_scale``. + If multiple scales are specified by ``img_scale``, a scale will be + sampled according to ``multiscale_mode``. 
+ Otherwise, single scale will be used. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: Two new keys 'scale` and 'scale_idx` are added into + ``results``, which would be used by subsequent pipelines. + """ + + if self.ratio_range is not None: + if self.img_scale is None: + h, w = results['img'].shape[:2] + scale, scale_idx = self.random_sample_ratio((w, h), + self.ratio_range) + else: + scale, scale_idx = self.random_sample_ratio( + self.img_scale[0], self.ratio_range) + elif len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + elif self.multiscale_mode == 'range': + scale, scale_idx = self.random_sample(self.img_scale) + elif self.multiscale_mode == 'value': + scale, scale_idx = self.random_select(self.img_scale) + else: + raise NotImplementedError + + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _align(self, img, size_divisor, interpolation=None): + align_h = int(np.ceil(img.shape[0] / size_divisor)) * size_divisor + align_w = int(np.ceil(img.shape[1] / size_divisor)) * size_divisor + if interpolation is None: + img = mmcv.imresize(img, (align_w, align_h)) + else: + img = mmcv.imresize( + img, (align_w, align_h), interpolation=interpolation) + return img + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + if self.keep_ratio: + img, scale_factor = mmcv.imrescale( + results['img'], results['scale'], return_scale=True) + # align # + img = self._align(img, self.size_divisor) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results['img'].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results['img'], results['scale'], return_scale=True) + + h, w = img.shape[:2] + + assert h % self.size_divisor == 0 and w % self.size_divisor == 0, \ + "img size not align. h:{} w:{}".format(h, w) + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img'] = img + results['img_shape'] = img.shape + results['pad_shape'] = img.shape # in case that there is no padding + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_seg(self, results): + """Resize semantic segmentation map with ``results['scale']``.""" + for key in results.get('seg_fields', []): + if self.keep_ratio: + gt_seg = mmcv.imrescale( + results[key], results['scale'], interpolation='nearest') + gt_seg = self._align( + gt_seg, self.size_divisor, interpolation='nearest') + else: + gt_seg = mmcv.imresize( + results[key], results['scale'], interpolation='nearest') + h, w = gt_seg.shape[:2] + + assert h % self.size_divisor == 0 and w % self.size_divisor ==\ + 0, "gt_seg size not align. h:{} w:{}".format(h, w) + results[key] = gt_seg + + def __call__(self, results): + """Call function to resize images, bounding boxes, masks, semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', + 'keep_ratio' keys are added into result dict. 
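+        Note:
+            Both the image and the segmentation map are additionally
+            resized by ``_align`` to the nearest multiple of
+            ``size_divisor``.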
+ """ + + if 'scale' not in results: + self._random_scale(results) + self._resize_img(results) + self._resize_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(img_scale={self.img_scale}, ' + f'multiscale_mode={self.multiscale_mode}, ' + f'ratio_range={self.ratio_range}, ' + f'keep_ratio={self.keep_ratio})') + return repr_str + + @PIPELINES.register_module() class Resize(object): """Resize images & seg. diff --git a/mmseg/models/backbones/__init__.py b/mmseg/models/backbones/__init__.py index eae064b6e5..e4c565b153 100644 --- a/mmseg/models/backbones/__init__.py +++ b/mmseg/models/backbones/__init__.py @@ -1,6 +1,7 @@ from .cgnet import CGNet from .fast_scnn import FastSCNN from .hrnet import HRNet +from .mit import MixVisionTransformer from .mobilenet_v2 import MobileNetV2 from .mobilenet_v3 import MobileNetV3 from .resnest import ResNeSt @@ -12,5 +13,5 @@ __all__ = [ 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN', 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', - 'VisionTransformer' + 'VisionTransformer', 'MixVisionTransformer' ] diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py new file mode 100644 index 0000000000..991dd1914f --- /dev/null +++ b/mmseg/models/backbones/mit.py @@ -0,0 +1,479 @@ +import math +import torch +import torch.nn as nn +from functools import partial + +from mmcv.runner import load_checkpoint + +from ..builder import BACKBONES +from ..utils import DropPath, to_2tuple, trunc_normal_ +from ...utils import get_root_logger + + +class DWConv(nn.Module): + + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = x.transpose(1, 2).view(B, C, H, W) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + + return x + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by " + f"num_heads {num_heads}." 
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2d( + dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape(B, N, self.num_heads, + C // self.num_heads).permute(0, 2, 1, 3) + + if self.sr_ratio > 1: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, + C // self.num_heads).permute( + 2, 0, 3, 1, 4) + else: + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, + C // self.num_heads).permute( + 2, 0, 3, 1, 4) + k, v = kv[0], kv[1] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Module): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio) + # NOTE: drop path for stochastic depth, we shall see if this is better + # than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class OverlapPatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2d( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = nn.LayerNorm(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + + return x, H, W + + +@BACKBONES.register_module() +class MixVisionTransformer(nn.Module): + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1]): + super().__init__() + self.depths = depths + + # patch_embed + self.patch_embed1 = OverlapPatchEmbed( + img_size=img_size, + patch_size=7, + stride=4, + in_chans=in_chans, + embed_dim=embed_dims[0]) + self.patch_embed2 = OverlapPatchEmbed( + img_size=img_size // 4, + patch_size=3, + stride=2, + in_chans=embed_dims[0], + embed_dim=embed_dims[1]) + self.patch_embed3 = OverlapPatchEmbed( + img_size=img_size // 8, + patch_size=3, + stride=2, + in_chans=embed_dims[1], + embed_dim=embed_dims[2]) + self.patch_embed4 = OverlapPatchEmbed( + img_size=img_size // 16, + patch_size=3, + stride=2, + in_chans=embed_dims[2], + embed_dim=embed_dims[3]) + + # transformer encoder + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + self.block1 = nn.ModuleList([ + Block( + dim=embed_dims[0], + num_heads=num_heads[0], + mlp_ratio=mlp_ratios[0], + qkv_bias=qkv_bias, 
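+                # drop_path below uses dpr[cur + i], the linear
+                # stochastic-depth schedule computed above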
+ qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[0]) for i in range(depths[0]) + ]) + self.norm1 = norm_layer(embed_dims[0]) + + cur += depths[0] + self.block2 = nn.ModuleList([ + Block( + dim=embed_dims[1], + num_heads=num_heads[1], + mlp_ratio=mlp_ratios[1], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[1]) for i in range(depths[1]) + ]) + self.norm2 = norm_layer(embed_dims[1]) + + cur += depths[1] + self.block3 = nn.ModuleList([ + Block( + dim=embed_dims[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratios[2], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[2]) for i in range(depths[2]) + ]) + self.norm3 = norm_layer(embed_dims[2]) + + cur += depths[2] + self.block4 = nn.ModuleList([ + Block( + dim=embed_dims[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratios[3], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[3]) for i in range(depths[3]) + ]) + self.norm4 = norm_layer(embed_dims[3]) + + # classification head + # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes + # > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint( + self, + pretrained, + map_location='cpu', + strict=False, + logger=logger) + + def reset_drop_path(self, drop_path_rate): + dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(self.depths)) + ] + cur = 0 + for i in range(self.depths[0]): + self.block1[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[0] + for i in range(self.depths[1]): + self.block2[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[1] + for i in range(self.depths[2]): + self.block3[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[2] + for i in range(self.depths[3]): + self.block4[i].drop_path.drop_prob = dpr[cur + i] + + def freeze_patch_emb(self): + self.patch_embed1.requires_grad = False + + @torch.jit.ignore + def no_weight_decay(self): + return { + 'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token' + } # has pos_embed may be better + + def forward_features(self, x): + B = x.shape[0] + outs = [] + + # stage 1 + x, H, W = self.patch_embed1(x) + for i, blk in enumerate(self.block1): + x = blk(x, H, W) + x = self.norm1(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + # stage 2 + x, H, W = self.patch_embed2(x) + for i, blk in enumerate(self.block2): + x = blk(x, H, W) + x = self.norm2(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + # stage 3 + x, H, W = self.patch_embed3(x) + for i, blk in 
enumerate(self.block3): + x = blk(x, H, W) + x = self.norm3(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + # stage 4 + x, H, W = self.patch_embed4(x) + for i, blk in enumerate(self.block4): + x = blk(x, H, W) + x = self.norm4(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + return outs + + def forward(self, x): + x = self.forward_features(x) + + return x diff --git a/mmseg/models/decode_heads/__init__.py b/mmseg/models/decode_heads/__init__.py index 662aae3c00..ebc40616c0 100644 --- a/mmseg/models/decode_heads/__init__.py +++ b/mmseg/models/decode_heads/__init__.py @@ -16,6 +16,7 @@ from .point_head import PointHead from .psa_head import PSAHead from .psp_head import PSPHead +from .segformer_head import SegFormerHead from .sep_aspp_head import DepthwiseSeparableASPPHead from .sep_fcn_head import DepthwiseSeparableFCNHead from .uper_head import UPerHead @@ -24,5 +25,5 @@ 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', - 'PointHead', 'APCHead', 'DMHead', 'LRASPPHead' + 'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'SegFormerHead' ] diff --git a/mmseg/models/decode_heads/segformer_head.py b/mmseg/models/decode_heads/segformer_head.py new file mode 100644 index 0000000000..cd26e2d089 --- /dev/null +++ b/mmseg/models/decode_heads/segformer_head.py @@ -0,0 +1,94 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmcv.cnn import ConvModule + +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +class MLP(nn.Module): + """ + Linear Embedding + """ + + def __init__(self, input_dim=2048, embed_dim=768): + super().__init__() + self.proj = nn.Linear(input_dim, embed_dim) + + def forward(self, x): + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +@HEADS.register_module() +class SegFormerHead(BaseDecodeHead): + """ + SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers + """ + + def __init__(self, feature_strides, decoder_params, **kwargs): + super(SegFormerHead, self).__init__( + input_transform='multiple_select', **kwargs) + assert len(feature_strides) == len(self.in_channels) + assert min(feature_strides) == feature_strides[0] + self.feature_strides = feature_strides + + c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = \ + self.in_channels + + decoder_params = decoder_params + embedding_dim = decoder_params['embed_dim'] + + self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=embedding_dim) + self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=embedding_dim) + self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=embedding_dim) + self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=embedding_dim) + + self.linear_fuse = ConvModule( + in_channels=embedding_dim * 4, + out_channels=embedding_dim, + kernel_size=1, + norm_cfg=dict(type='SyncBN', requires_grad=True)) + + self.linear_pred = nn.Conv2d( + embedding_dim, self.num_classes, kernel_size=1) + + def forward(self, inputs): + x = self._transform_inputs(inputs) # len=4, 1/4,1/8,1/16,1/32 + c1, c2, c3, c4 = x + + # MLP decoder on C1-C4 # + n, _, h, w = c4.shape + + _c4 = self.linear_c4(c4).permute(0, 2, + 1).reshape(n, -1, c4.shape[2], + c4.shape[3]) + _c4 = F.interpolate( + _c4, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c3 = 
self.linear_c3(c3).permute(0, 2, + 1).reshape(n, -1, c3.shape[2], + c3.shape[3]) + _c3 = F.interpolate( + _c3, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c2 = self.linear_c2(c2).permute(0, 2, + 1).reshape(n, -1, c2.shape[2], + c2.shape[3]) + _c2 = F.interpolate( + _c2, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c1 = self.linear_c1(c1).permute(0, 2, + 1).reshape(n, -1, c1.shape[2], + c1.shape[3]) + + _c = self.linear_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1)) + + x = self.dropout(_c) + x = self.linear_pred(x) + + return x diff --git a/mmseg/models/utils/__init__.py b/mmseg/models/utils/__init__.py index 3d3bdd349b..c29ad4f4c1 100644 --- a/mmseg/models/utils/__init__.py +++ b/mmseg/models/utils/__init__.py @@ -1,4 +1,5 @@ from .drop import DropPath +from .helpers import to_1tuple, to_2tuple, to_3tuple, to_4tuple, to_ntuple from .inverted_residual import InvertedResidual, InvertedResidualV3 from .make_divisible import make_divisible from .res_layer import ResLayer @@ -9,5 +10,7 @@ __all__ = [ 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', - 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_' + 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', + 'trunc_normal_', 'to_1tuple', 'to_2tuple', 'to_3tuple', 'to_4tuple', + 'to_ntuple' ] diff --git a/mmseg/models/utils/helpers.py b/mmseg/models/utils/helpers.py new file mode 100644 index 0000000000..9d2bd96dd2 --- /dev/null +++ b/mmseg/models/utils/helpers.py @@ -0,0 +1,20 @@ +import collections.abc +from itertools import repeat + + +# From PyTorch internals +def _ntuple(n): + + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple From 5a54664ba5cfb23ffa3f234900de4561fdf6de83 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Thu, 10 Jun 2021 02:01:29 +0800 Subject: [PATCH 02/96] Using act_cfg and norm_cfg to control activation and normalization --- mmseg/models/backbones/mit.py | 39 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 991dd1914f..93abd467bd 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -1,8 +1,8 @@ import math import torch import torch.nn as nn -from functools import partial +from mmcv.cnn import build_norm_layer, build_activation_layer from mmcv.runner import load_checkpoint from ..builder import BACKBONES @@ -31,14 +31,14 @@ def __init__(self, in_features, hidden_features=None, out_features=None, - act_layer=nn.GELU, + act_cfg=dict(type='GELU'), drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.dwconv = DWConv(hidden_features) - self.act = act_layer() + self.act = build_activation_layer(act_cfg) self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) @@ -73,6 +73,7 @@ class Attention(nn.Module): def __init__(self, dim, + norm_cfg=dict(type='LN'), num_heads=8, qkv_bias=False, qk_scale=None, @@ -98,7 +99,7 @@ def __init__(self, if sr_ratio > 1: self.sr = nn.Conv2d( dim, dim, kernel_size=sr_ratio, stride=sr_ratio) - self.norm = nn.LayerNorm(dim) + _, self.norm = build_norm_layer(norm_cfg, dim) self.apply(self._init_weights) @@ 
-157,11 +158,11 @@ def __init__(self, drop=0., attn_drop=0., drop_path=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), sr_ratio=1): super().__init__() - self.norm1 = norm_layer(dim) + _, self.norm1 = build_norm_layer(norm_cfg, dim) self.attn = Attention( dim, num_heads=num_heads, @@ -174,12 +175,12 @@ def __init__(self, # than dropout here self.drop_path = DropPath( drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = norm_layer(dim) + _, self.norm2 = build_norm_layer(norm_cfg, dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, - act_layer=act_layer, + act_cfg=act_cfg, drop=drop) self.apply(self._init_weights) @@ -274,12 +275,11 @@ def __init__(self, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., - norm_layer=partial(nn.LayerNorm, eps=1e-6), + norm_cfg=dict(type='LN', eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1]): super().__init__() self.depths = depths - # patch_embed self.patch_embed1 = OverlapPatchEmbed( img_size=img_size, @@ -321,11 +321,10 @@ def __init__(self, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], - norm_layer=norm_layer, + norm_cfg=norm_cfg, sr_ratio=sr_ratios[0]) for i in range(depths[0]) ]) - self.norm1 = norm_layer(embed_dims[0]) - + _, self.norm1 = build_norm_layer(norm_cfg, embed_dims[0]) cur += depths[0] self.block2 = nn.ModuleList([ Block( @@ -337,10 +336,10 @@ def __init__(self, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], - norm_layer=norm_layer, + norm_cfg=norm_cfg, sr_ratio=sr_ratios[1]) for i in range(depths[1]) ]) - self.norm2 = norm_layer(embed_dims[1]) + _, self.norm2 = build_norm_layer(norm_cfg, embed_dims[1]) cur += depths[1] self.block3 = nn.ModuleList([ @@ -353,10 +352,10 @@ def __init__(self, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], - norm_layer=norm_layer, + norm_cfg=norm_cfg, sr_ratio=sr_ratios[2]) for i in range(depths[2]) ]) - self.norm3 = norm_layer(embed_dims[2]) + _, self.norm3 = build_norm_layer(norm_cfg, embed_dims[2]) cur += depths[2] self.block4 = nn.ModuleList([ @@ -369,10 +368,10 @@ def __init__(self, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], - norm_layer=norm_layer, + norm_cfg=norm_cfg, sr_ratio=sr_ratios[3]) for i in range(depths[3]) ]) - self.norm4 = norm_layer(embed_dims[3]) + _, self.norm4 = build_norm_layer(norm_cfg, embed_dims[3]) # classification head # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes From 0d60aeb98ca5c682944f7598b101b38301420e02 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Thu, 10 Jun 2021 18:15:25 +0800 Subject: [PATCH 03/96] Split this PR into several little PRs --- configs/_base_/datasets/ade20k_align.py | 54 ----- configs/_base_/models/segformer.py | 35 --- .../segformer_b0_512x512_160k_ade20k.py | 74 ------ mmseg/datasets/pipelines/transforms.py | 226 ------------------ mmseg/models/decode_heads/segformer_head.py | 94 -------- 5 files changed, 483 deletions(-) delete mode 100644 configs/_base_/datasets/ade20k_align.py delete mode 100644 configs/_base_/models/segformer.py delete mode 100644 configs/segformer/segformer_b0_512x512_160k_ade20k.py delete mode 100644 mmseg/models/decode_heads/segformer_head.py diff --git a/configs/_base_/datasets/ade20k_align.py b/configs/_base_/datasets/ade20k_align.py deleted file mode 100644 index 51f24347ab..0000000000 --- a/configs/_base_/datasets/ade20k_align.py +++ /dev/null @@ -1,54 +0,0 @@ -# dataset 
settings -dataset_type = 'ADE20KDataset' -data_root = 'data/ade/ADEChallengeData2016' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -crop_size = (512, 512) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', reduce_zero_label=True), - dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', prob=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 512), - # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], - flip=False, - transforms=[ - dict(type='AlignedResize', keep_ratio=True, size_divisor=32), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data = dict( - samples_per_gpu=4, - workers_per_gpu=4, - train=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/training', - ann_dir='annotations/training', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline), - test=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline)) diff --git a/configs/_base_/models/segformer.py b/configs/_base_/models/segformer.py deleted file mode 100644 index 78498b11e5..0000000000 --- a/configs/_base_/models/segformer.py +++ /dev/null @@ -1,35 +0,0 @@ -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained=None, - backbone=dict( - type='MixVisionTransformer', - img_size=224, - in_chans=3, - embed_dims=[64, 128, 256, 512], - num_heads=[1, 2, 4, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=False, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - depths=[3, 4, 6, 3], - sr_ratios=[8, 4, 2, 1]), - decode_head=dict( - type='SegFormerHead', - in_channels=[64, 128, 320, 512], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - decoder_params=dict(), - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/configs/segformer/segformer_b0_512x512_160k_ade20k.py b/configs/segformer/segformer_b0_512x512_160k_ade20k.py deleted file mode 100644 index 7c24194e1f..0000000000 --- a/configs/segformer/segformer_b0_512x512_160k_ade20k.py +++ /dev/null @@ -1,74 +0,0 @@ -_base_ = [ - '../_base_/models/segformer.py', '../_base_/datasets/ade20k_align.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] - -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='pretrain/mit_b0.pth', - backbone=dict( - patch_size=4, - embed_dims=[32, 64, 160, 256], - num_heads=[1, 2, 5, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=True, - depths=[2, 2, 2, 2], - sr_ratios=[8, 4, 2, 1], - drop_rate=0.0, - drop_path_rate=0.1), - 
decode_head=dict( - type='SegFormerHead', - in_channels=[32, 64, 160, 256], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - decoder_params=dict( - embed_dim=256, - depth=1, - num_heads=1, - mlp_ratio=3, - num_patches=128 * 128, - attn_ffn=False, - conv_cls_seg=True, - ), - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) - -find_unused_parameters = True - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.0002, - betas=(0.9, 0.999), - weight_decay=0.0001, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) - })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) -checkpoint_config = dict(by_epoch=False, interval=4000) -evaluation = dict(interval=4000, metric='mIoU') diff --git a/mmseg/datasets/pipelines/transforms.py b/mmseg/datasets/pipelines/transforms.py index 8800e84d34..20753bb0fa 100644 --- a/mmseg/datasets/pipelines/transforms.py +++ b/mmseg/datasets/pipelines/transforms.py @@ -6,232 +6,6 @@ from ..builder import PIPELINES -@PIPELINES.register_module() -class AlignedResize(object): - """Resize images & seg. Align - """ - - def __init__(self, - img_scale=None, - multiscale_mode='range', - ratio_range=None, - keep_ratio=True, - size_divisor=32): - if img_scale is None: - self.img_scale = None - else: - if isinstance(img_scale, list): - self.img_scale = img_scale - else: - self.img_scale = [img_scale] - assert mmcv.is_list_of(self.img_scale, tuple) - - if ratio_range is not None: - # mode 1: given img_scale=None and a range of image ratio - # mode 2: given a scale and a range of image ratio - assert self.img_scale is None or len(self.img_scale) == 1 - else: - # mode 3 and 4: given multiple scales or a range of scales - assert multiscale_mode in ['value', 'range'] - - self.multiscale_mode = multiscale_mode - self.ratio_range = ratio_range - self.keep_ratio = keep_ratio - self.size_divisor = size_divisor - - @staticmethod - def random_select(img_scales): - """Randomly select an img_scale from given candidates. - - Args: - img_scales (list[tuple]): Images scales for selection. - - Returns: - (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, - where ``img_scale`` is the selected image scale and - ``scale_idx`` is the selected index in the given candidates. - """ - - assert mmcv.is_list_of(img_scales, tuple) - scale_idx = np.random.randint(len(img_scales)) - img_scale = img_scales[scale_idx] - return img_scale, scale_idx - - @staticmethod - def random_sample(img_scales): - """Randomly sample an img_scale when ``multiscale_mode=='range'``. - - Args: - img_scales (list[tuple]): Images scale range for sampling. - There must be two tuples in img_scales, which specify the lower - and uper bound of image scales. - - Returns: - (tuple, None): Returns a tuple ``(img_scale, None)``, where - ``img_scale`` is sampled scale and None is just a placeholder - to be consistent with :func:`random_select`. 
- """ - - assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 - img_scale_long = [max(s) for s in img_scales] - img_scale_short = [min(s) for s in img_scales] - long_edge = np.random.randint( - min(img_scale_long), - max(img_scale_long) + 1) - short_edge = np.random.randint( - min(img_scale_short), - max(img_scale_short) + 1) - img_scale = (long_edge, short_edge) - return img_scale, None - - @staticmethod - def random_sample_ratio(img_scale, ratio_range): - """Randomly sample an img_scale when ``ratio_range`` is specified. - - A ratio will be randomly sampled from the range specified by - ``ratio_range``. Then it would be multiplied with ``img_scale`` to - generate sampled scale. - - Args: - img_scale (tuple): Images scale base to multiply with ratio. - ratio_range (tuple[float]): The minimum and maximum ratio to scale - the ``img_scale``. - - Returns: - (tuple, None): Returns a tuple ``(scale, None)``, where - ``scale`` is sampled ratio multiplied with ``img_scale`` and - None is just a placeholder to be consistent with - :func:`random_select`. - """ - - assert isinstance(img_scale, tuple) and len(img_scale) == 2 - min_ratio, max_ratio = ratio_range - assert min_ratio <= max_ratio - ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio - scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) - return scale, None - - def _random_scale(self, results): - """Randomly sample an img_scale according to ``ratio_range`` and - ``multiscale_mode``. - - If ``ratio_range`` is specified, a ratio will be sampled and be - multiplied with ``img_scale``. - If multiple scales are specified by ``img_scale``, a scale will be - sampled according to ``multiscale_mode``. - Otherwise, single scale will be used. - - Args: - results (dict): Result dict from :obj:`dataset`. - - Returns: - dict: Two new keys 'scale` and 'scale_idx` are added into - ``results``, which would be used by subsequent pipelines. 
- """ - - if self.ratio_range is not None: - if self.img_scale is None: - h, w = results['img'].shape[:2] - scale, scale_idx = self.random_sample_ratio((w, h), - self.ratio_range) - else: - scale, scale_idx = self.random_sample_ratio( - self.img_scale[0], self.ratio_range) - elif len(self.img_scale) == 1: - scale, scale_idx = self.img_scale[0], 0 - elif self.multiscale_mode == 'range': - scale, scale_idx = self.random_sample(self.img_scale) - elif self.multiscale_mode == 'value': - scale, scale_idx = self.random_select(self.img_scale) - else: - raise NotImplementedError - - results['scale'] = scale - results['scale_idx'] = scale_idx - - def _align(self, img, size_divisor, interpolation=None): - align_h = int(np.ceil(img.shape[0] / size_divisor)) * size_divisor - align_w = int(np.ceil(img.shape[1] / size_divisor)) * size_divisor - if interpolation is None: - img = mmcv.imresize(img, (align_w, align_h)) - else: - img = mmcv.imresize( - img, (align_w, align_h), interpolation=interpolation) - return img - - def _resize_img(self, results): - """Resize images with ``results['scale']``.""" - if self.keep_ratio: - img, scale_factor = mmcv.imrescale( - results['img'], results['scale'], return_scale=True) - # align # - img = self._align(img, self.size_divisor) - # the w_scale and h_scale has minor difference - # a real fix should be done in the mmcv.imrescale in the future - new_h, new_w = img.shape[:2] - h, w = results['img'].shape[:2] - w_scale = new_w / w - h_scale = new_h / h - else: - img, w_scale, h_scale = mmcv.imresize( - results['img'], results['scale'], return_scale=True) - - h, w = img.shape[:2] - - assert h % self.size_divisor == 0 and w % self.size_divisor == 0, \ - "img size not align. h:{} w:{}".format(h, w) - scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], - dtype=np.float32) - results['img'] = img - results['img_shape'] = img.shape - results['pad_shape'] = img.shape # in case that there is no padding - results['scale_factor'] = scale_factor - results['keep_ratio'] = self.keep_ratio - - def _resize_seg(self, results): - """Resize semantic segmentation map with ``results['scale']``.""" - for key in results.get('seg_fields', []): - if self.keep_ratio: - gt_seg = mmcv.imrescale( - results[key], results['scale'], interpolation='nearest') - gt_seg = self._align( - gt_seg, self.size_divisor, interpolation='nearest') - else: - gt_seg = mmcv.imresize( - results[key], results['scale'], interpolation='nearest') - h, w = gt_seg.shape[:2] - - assert h % self.size_divisor == 0 and w % self.size_divisor ==\ - 0, "gt_seg size not align. h:{} w:{}".format(h, w) - results[key] = gt_seg - - def __call__(self, results): - """Call function to resize images, bounding boxes, masks, semantic - segmentation map. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', - 'keep_ratio' keys are added into result dict. - """ - - if 'scale' not in results: - self._random_scale(results) - self._resize_img(results) - self._resize_seg(results) - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += (f'(img_scale={self.img_scale}, ' - f'multiscale_mode={self.multiscale_mode}, ' - f'ratio_range={self.ratio_range}, ' - f'keep_ratio={self.keep_ratio})') - return repr_str - - @PIPELINES.register_module() class Resize(object): """Resize images & seg. 
diff --git a/mmseg/models/decode_heads/segformer_head.py b/mmseg/models/decode_heads/segformer_head.py deleted file mode 100644 index cd26e2d089..0000000000 --- a/mmseg/models/decode_heads/segformer_head.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from mmcv.cnn import ConvModule - -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -class MLP(nn.Module): - """ - Linear Embedding - """ - - def __init__(self, input_dim=2048, embed_dim=768): - super().__init__() - self.proj = nn.Linear(input_dim, embed_dim) - - def forward(self, x): - x = x.flatten(2).transpose(1, 2) - x = self.proj(x) - return x - - -@HEADS.register_module() -class SegFormerHead(BaseDecodeHead): - """ - SegFormer: Simple and Efficient Design for Semantic Segmentation with - Transformers - """ - - def __init__(self, feature_strides, decoder_params, **kwargs): - super(SegFormerHead, self).__init__( - input_transform='multiple_select', **kwargs) - assert len(feature_strides) == len(self.in_channels) - assert min(feature_strides) == feature_strides[0] - self.feature_strides = feature_strides - - c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = \ - self.in_channels - - decoder_params = decoder_params - embedding_dim = decoder_params['embed_dim'] - - self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=embedding_dim) - self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=embedding_dim) - self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=embedding_dim) - self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=embedding_dim) - - self.linear_fuse = ConvModule( - in_channels=embedding_dim * 4, - out_channels=embedding_dim, - kernel_size=1, - norm_cfg=dict(type='SyncBN', requires_grad=True)) - - self.linear_pred = nn.Conv2d( - embedding_dim, self.num_classes, kernel_size=1) - - def forward(self, inputs): - x = self._transform_inputs(inputs) # len=4, 1/4,1/8,1/16,1/32 - c1, c2, c3, c4 = x - - # MLP decoder on C1-C4 # - n, _, h, w = c4.shape - - _c4 = self.linear_c4(c4).permute(0, 2, - 1).reshape(n, -1, c4.shape[2], - c4.shape[3]) - _c4 = F.interpolate( - _c4, size=c1.size()[2:], mode='bilinear', align_corners=False) - - _c3 = self.linear_c3(c3).permute(0, 2, - 1).reshape(n, -1, c3.shape[2], - c3.shape[3]) - _c3 = F.interpolate( - _c3, size=c1.size()[2:], mode='bilinear', align_corners=False) - - _c2 = self.linear_c2(c2).permute(0, 2, - 1).reshape(n, -1, c2.shape[2], - c2.shape[3]) - _c2 = F.interpolate( - _c2, size=c1.size()[2:], mode='bilinear', align_corners=False) - - _c1 = self.linear_c1(c1).permute(0, 2, - 1).reshape(n, -1, c1.shape[2], - c1.shape[3]) - - _c = self.linear_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1)) - - x = self.dropout(_c) - x = self.linear_pred(x) - - return x From c2b0ffde210602d2f178586c72aa7102d15a6c5e Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Thu, 10 Jun 2021 18:17:35 +0800 Subject: [PATCH 04/96] Fix lint error --- mmseg/models/backbones/mit.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 93abd467bd..bf9baafcb0 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -1,13 +1,13 @@ import math + import torch import torch.nn as nn - -from mmcv.cnn import build_norm_layer, build_activation_layer +from mmcv.cnn import build_activation_layer, build_norm_layer from mmcv.runner import load_checkpoint +from ...utils import get_root_logger 
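+# get_root_logger supplies the logger that init_weights passes to
+# load_checkpoint when loading pretrained weights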
from ..builder import BACKBONES from ..utils import DropPath, to_2tuple, trunc_normal_ -from ...utils import get_root_logger class DWConv(nn.Module): @@ -81,8 +81,8 @@ def __init__(self, proj_drop=0., sr_ratio=1): super().__init__() - assert dim % num_heads == 0, f"dim {dim} should be divided by " - f"num_heads {num_heads}." + assert dim % num_heads == 0, f'dim {dim} should be divided by ' + f'num_heads {num_heads}.' self.dim = dim self.num_heads = num_heads @@ -208,8 +208,7 @@ def forward(self, x, H, W): class OverlapPatchEmbed(nn.Module): - """ Image to Patch Embedding - """ + """Image to Patch Embedding.""" def __init__(self, img_size=224, From c7e676e90dd9bc3b2f911e45eb8f92cf3ebb24b9 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Thu, 10 Jun 2021 18:23:55 +0800 Subject: [PATCH 05/96] Remove SegFormerHead --- mmseg/models/decode_heads/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mmseg/models/decode_heads/__init__.py b/mmseg/models/decode_heads/__init__.py index ebc40616c0..662aae3c00 100644 --- a/mmseg/models/decode_heads/__init__.py +++ b/mmseg/models/decode_heads/__init__.py @@ -16,7 +16,6 @@ from .point_head import PointHead from .psa_head import PSAHead from .psp_head import PSPHead -from .segformer_head import SegFormerHead from .sep_aspp_head import DepthwiseSeparableASPPHead from .sep_fcn_head import DepthwiseSeparableFCNHead from .uper_head import UPerHead @@ -25,5 +24,5 @@ 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', - 'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'SegFormerHead' + 'PointHead', 'APCHead', 'DMHead', 'LRASPPHead' ] From e7ec3dad92c80314571abd709792c0a3b514f03c Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 11 Jun 2021 15:47:26 +0800 Subject: [PATCH 06/96] [Feature] Add segformer decode head and related train config --- configs/_base_/models/segformer.py | 35 +++++++ .../segformer_b0_512x512_160k_ade20k.py | 74 +++++++++++++++ mmseg/models/decode_heads/__init__.py | 3 +- mmseg/models/decode_heads/segformer_head.py | 91 +++++++++++++++++++ 4 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 configs/_base_/models/segformer.py create mode 100644 configs/segformer/segformer_b0_512x512_160k_ade20k.py create mode 100644 mmseg/models/decode_heads/segformer_head.py diff --git a/configs/_base_/models/segformer.py b/configs/_base_/models/segformer.py new file mode 100644 index 0000000000..78498b11e5 --- /dev/null +++ b/configs/_base_/models/segformer.py @@ -0,0 +1,35 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained=None, + backbone=dict( + type='MixVisionTransformer', + img_size=224, + in_chans=3, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1]), + decode_head=dict( + type='SegFormerHead', + in_channels=[64, 128, 320, 512], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + decoder_params=dict(), + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + 
train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/segformer/segformer_b0_512x512_160k_ade20k.py b/configs/segformer/segformer_b0_512x512_160k_ade20k.py new file mode 100644 index 0000000000..eba2774608 --- /dev/null +++ b/configs/segformer/segformer_b0_512x512_160k_ade20k.py @@ -0,0 +1,74 @@ +_base_ = [ + '../_base_/models/segformer.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='pretrain/mit_b0.pth', + backbone=dict( + patch_size=4, + embed_dims=[32, 64, 160, 256], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1), + decode_head=dict( + type='SegFormerHead', + in_channels=[32, 64, 160, 256], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + decoder_params=dict( + embed_dim=256, + depth=1, + num_heads=1, + mlp_ratio=3, + num_patches=128 * 128, + attn_ffn=False, + conv_cls_seg=True, + ), + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +find_unused_parameters = True + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0002, + betas=(0.9, 0.999), + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) +checkpoint_config = dict(by_epoch=False, interval=4000) +evaluation = dict(interval=4000, metric='mIoU') diff --git a/mmseg/models/decode_heads/__init__.py b/mmseg/models/decode_heads/__init__.py index 662aae3c00..ebc40616c0 100644 --- a/mmseg/models/decode_heads/__init__.py +++ b/mmseg/models/decode_heads/__init__.py @@ -16,6 +16,7 @@ from .point_head import PointHead from .psa_head import PSAHead from .psp_head import PSPHead +from .segformer_head import SegFormerHead from .sep_aspp_head import DepthwiseSeparableASPPHead from .sep_fcn_head import DepthwiseSeparableFCNHead from .uper_head import UPerHead @@ -24,5 +25,5 @@ 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', - 'PointHead', 'APCHead', 'DMHead', 'LRASPPHead' + 'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'SegFormerHead' ] diff --git a/mmseg/models/decode_heads/segformer_head.py b/mmseg/models/decode_heads/segformer_head.py new file mode 100644 index 0000000000..7fba8b349a --- /dev/null +++ b/mmseg/models/decode_heads/segformer_head.py @@ -0,0 +1,91 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +class MLP(nn.Module): + """Linear Embedding.""" + + def __init__(self, input_dim=2048, embed_dim=768): + super().__init__() + self.proj = nn.Linear(input_dim, embed_dim) + + def forward(self, x): + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + 
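+# A minimal shape sketch for MLP above (illustrative values; c3 of the
+# MiT-B0 config has 160 channels):
+#     mlp = MLP(input_dim=160, embed_dim=256)
+#     tokens = mlp(torch.randn(2, 160, 16, 16))  # -> (2, 16 * 16, 256)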
+@HEADS.register_module() +class SegFormerHead(BaseDecodeHead): + """ + SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers + """ + + def __init__(self, feature_strides, decoder_params, **kwargs): + super(SegFormerHead, self).__init__( + input_transform='multiple_select', **kwargs) + assert len(feature_strides) == len(self.in_channels) + assert min(feature_strides) == feature_strides[0] + self.feature_strides = feature_strides + + c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = \ + self.in_channels + + decoder_params = decoder_params + embedding_dim = decoder_params['embed_dim'] + + self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=embedding_dim) + self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=embedding_dim) + self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=embedding_dim) + self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=embedding_dim) + + self.linear_fuse = ConvModule( + in_channels=embedding_dim * 4, + out_channels=embedding_dim, + kernel_size=1, + norm_cfg=dict(type='SyncBN', requires_grad=True)) + + self.linear_pred = nn.Conv2d( + embedding_dim, self.num_classes, kernel_size=1) + + def forward(self, inputs): + x = self._transform_inputs(inputs) # len=4, 1/4,1/8,1/16,1/32 + c1, c2, c3, c4 = x + + # MLP decoder on C1-C4 # + n, _, h, w = c4.shape + + _c4 = self.linear_c4(c4).permute(0, 2, + 1).reshape(n, -1, c4.shape[2], + c4.shape[3]) + _c4 = F.interpolate( + _c4, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c3 = self.linear_c3(c3).permute(0, 2, + 1).reshape(n, -1, c3.shape[2], + c3.shape[3]) + _c3 = F.interpolate( + _c3, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c2 = self.linear_c2(c2).permute(0, 2, + 1).reshape(n, -1, c2.shape[2], + c2.shape[3]) + _c2 = F.interpolate( + _c2, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c1 = self.linear_c1(c1).permute(0, 2, + 1).reshape(n, -1, c1.shape[2], + c1.shape[3]) + + _c = self.linear_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1)) + + x = self.dropout(_c) + x = self.linear_pred(x) + + return x From d27552f7af25052b494f8d42f2d880b0a91d8eb9 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 12 Jun 2021 01:19:34 +0800 Subject: [PATCH 07/96] Add ade20K trainval support for segformer 1. Add related train and val configs; 2. 
Add AlignedResize; --- configs/_base_/datasets/ade20k_aligned.py | 54 +++++ .../_base_/datasets/ade20k_aligned_640x640.py | 53 ++++ .../segformer_b0_512x512_160k_ade20k.py | 20 +- .../segformer_b1_512x512_160k_ade20k.py | 64 +++++ .../segformer_b2_512x512_160k_ade20k.py | 64 +++++ .../segformer_b3_512x512_160k_ade20k.py | 64 +++++ .../segformer_b4_512x512_160k_ade20k.py | 64 +++++ .../segformer_b5_512x512_160k_ade20k.py | 65 +++++ mmseg/datasets/pipelines/transforms.py | 228 ++++++++++++++++++ 9 files changed, 660 insertions(+), 16 deletions(-) create mode 100644 configs/_base_/datasets/ade20k_aligned.py create mode 100644 configs/_base_/datasets/ade20k_aligned_640x640.py create mode 100644 configs/segformer/segformer_b1_512x512_160k_ade20k.py create mode 100644 configs/segformer/segformer_b2_512x512_160k_ade20k.py create mode 100644 configs/segformer/segformer_b3_512x512_160k_ade20k.py create mode 100644 configs/segformer/segformer_b4_512x512_160k_ade20k.py create mode 100644 configs/segformer/segformer_b5_512x512_160k_ade20k.py diff --git a/configs/_base_/datasets/ade20k_aligned.py b/configs/_base_/datasets/ade20k_aligned.py new file mode 100644 index 0000000000..51f24347ab --- /dev/null +++ b/configs/_base_/datasets/ade20k_aligned.py @@ -0,0 +1,54 @@ +# dataset settings +dataset_type = 'ADE20KDataset' +data_root = 'data/ade/ADEChallengeData2016' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +crop_size = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 512), + # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], + flip=False, + transforms=[ + dict(type='AlignedResize', keep_ratio=True, size_divisor=32), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/training', + ann_dir='annotations/training', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline)) diff --git a/configs/_base_/datasets/ade20k_aligned_640x640.py b/configs/_base_/datasets/ade20k_aligned_640x640.py new file mode 100644 index 0000000000..1d65d9a92f --- /dev/null +++ b/configs/_base_/datasets/ade20k_aligned_640x640.py @@ -0,0 +1,53 @@ +dataset_type = 'ADE20KDataset' +data_root = 'data/ade/ADEChallengeData2016' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +crop_size = (640, 640) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='Resize', img_scale=(2048, 640), 
ratio_range=(0.5, 2.0)), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 640), + # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], + flip=False, + transforms=[ + dict(type='AlignedResize', keep_ratio=True, size_divisor=32), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/training', + ann_dir='annotations/training', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline)) diff --git a/configs/segformer/segformer_b0_512x512_160k_ade20k.py b/configs/segformer/segformer_b0_512x512_160k_ade20k.py index eba2774608..d3b2ff414d 100644 --- a/configs/segformer/segformer_b0_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_b0_512x512_160k_ade20k.py @@ -1,5 +1,5 @@ _base_ = [ - '../_base_/models/segformer.py', '../_base_/datasets/ade20k.py', + '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] @@ -28,30 +28,20 @@ num_classes=150, norm_cfg=norm_cfg, align_corners=False, - decoder_params=dict( - embed_dim=256, - depth=1, - num_heads=1, - mlp_ratio=3, - num_patches=128 * 128, - attn_ffn=False, - conv_cls_seg=True, - ), + decoder_params=dict(embed_dim=256), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) -find_unused_parameters = True - # optimizer optimizer = dict( _delete_=True, type='AdamW', - lr=0.0002, + lr=0.00006, betas=(0.9, 0.999), - weight_decay=0.0001, + weight_decay=0.01, paramwise_cfg=dict( custom_keys={ 'pos_block': dict(decay_mult=0.), @@ -70,5 +60,3 @@ by_epoch=False) data = dict(samples_per_gpu=2) -checkpoint_config = dict(by_epoch=False, interval=4000) -evaluation = dict(interval=4000, metric='mIoU') diff --git a/configs/segformer/segformer_b1_512x512_160k_ade20k.py b/configs/segformer/segformer_b1_512x512_160k_ade20k.py new file mode 100644 index 0000000000..cee3828c07 --- /dev/null +++ b/configs/segformer/segformer_b1_512x512_160k_ade20k.py @@ -0,0 +1,64 @@ +_base_ = [ + '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='pretrain/mit_b1.pth', + backbone=dict( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1), + decode_head=dict( + type='SegFormerHead', + in_channels=[64, 128, 320, 512], + 
in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + decoder_params=dict(embed_dim=256), + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +find_unused_parameters = True + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_b2_512x512_160k_ade20k.py b/configs/segformer/segformer_b2_512x512_160k_ade20k.py new file mode 100644 index 0000000000..fa6523fe95 --- /dev/null +++ b/configs/segformer/segformer_b2_512x512_160k_ade20k.py @@ -0,0 +1,64 @@ +_base_ = [ + '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='pretrain/mit_b2.pth', + backbone=dict( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1), + decode_head=dict( + type='SegFormerHead', + in_channels=[64, 128, 320, 512], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + decoder_params=dict(embed_dim=768), + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +find_unused_parameters = True + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) 
+ })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_b3_512x512_160k_ade20k.py b/configs/segformer/segformer_b3_512x512_160k_ade20k.py new file mode 100644 index 0000000000..335ed1e797 --- /dev/null +++ b/configs/segformer/segformer_b3_512x512_160k_ade20k.py @@ -0,0 +1,64 @@ +_base_ = [ + '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='pretrain/mit_b3.pth', + backbone=dict( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + depths=[3, 4, 18, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1), + decode_head=dict( + type='SegFormerHead', + in_channels=[64, 128, 320, 512], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + decoder_params=dict(embed_dim=768), + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +find_unused_parameters = True + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_b4_512x512_160k_ade20k.py b/configs/segformer/segformer_b4_512x512_160k_ade20k.py new file mode 100644 index 0000000000..3704614aac --- /dev/null +++ b/configs/segformer/segformer_b4_512x512_160k_ade20k.py @@ -0,0 +1,64 @@ +_base_ = [ + '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='pretrain/mit_b4.pth', + backbone=dict( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + depths=[3, 8, 27, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1), + decode_head=dict( + type='SegFormerHead', + in_channels=[64, 128, 320, 512], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + decoder_params=dict(embed_dim=768), + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +find_unused_parameters = True + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) 
+ })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_b5_512x512_160k_ade20k.py b/configs/segformer/segformer_b5_512x512_160k_ade20k.py new file mode 100644 index 0000000000..53d8a3c506 --- /dev/null +++ b/configs/segformer/segformer_b5_512x512_160k_ade20k.py @@ -0,0 +1,65 @@ +_base_ = [ + '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='pretrain/mit_b5.pth', + backbone=dict( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + depths=[3, 6, 40, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1), + decode_head=dict( + type='SegFormerHead', + in_channels=[64, 128, 320, 512], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + decoder_params=dict(embed_dim=768), + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +find_unused_parameters = True + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +# By default, models are trained on 8 GPUs with 2 images per GPU +data = dict(samples_per_gpu=2) diff --git a/mmseg/datasets/pipelines/transforms.py b/mmseg/datasets/pipelines/transforms.py index 20753bb0fa..b334c6182a 100644 --- a/mmseg/datasets/pipelines/transforms.py +++ b/mmseg/datasets/pipelines/transforms.py @@ -6,6 +6,234 @@ from ..builder import PIPELINES +@PIPELINES.register_module() +class AlignedResize(object): + """Resize images & seg. + + Align + """ + + def __init__(self, + img_scale=None, + multiscale_mode='range', + ratio_range=None, + keep_ratio=True, + size_divisor=32): + if img_scale is None: + self.img_scale = None + else: + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + assert mmcv.is_list_of(self.img_scale, tuple) + + if ratio_range is not None: + # mode 1: given img_scale=None and a range of image ratio + # mode 2: given a scale and a range of image ratio + assert self.img_scale is None or len(self.img_scale) == 1 + else: + # mode 3 and 4: given multiple scales or a range of scales + assert multiscale_mode in ['value', 'range'] + + self.multiscale_mode = multiscale_mode + self.ratio_range = ratio_range + self.keep_ratio = keep_ratio + self.size_divisor = size_divisor + + @staticmethod + def random_select(img_scales): + """Randomly select an img_scale from given candidates. + + Args: + img_scales (list[tuple]): Images scales for selection. + + Returns: + (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, + where ``img_scale`` is the selected image scale and + ``scale_idx`` is the selected index in the given candidates. 
+        """
+
+        assert mmcv.is_list_of(img_scales, tuple)
+        scale_idx = np.random.randint(len(img_scales))
+        img_scale = img_scales[scale_idx]
+        return img_scale, scale_idx
+
+    @staticmethod
+    def random_sample(img_scales):
+        """Randomly sample an img_scale when ``multiscale_mode=='range'``.
+
+        Args:
+            img_scales (list[tuple]): Images scale range for sampling.
+                There must be two tuples in img_scales, which specify the
+                lower and upper bound of image scales.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(img_scale, None)``, where
+                ``img_scale`` is the sampled scale and None is just a
+                placeholder to be consistent with :func:`random_select`.
+        """
+
+        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
+        img_scale_long = [max(s) for s in img_scales]
+        img_scale_short = [min(s) for s in img_scales]
+        long_edge = np.random.randint(
+            min(img_scale_long),
+            max(img_scale_long) + 1)
+        short_edge = np.random.randint(
+            min(img_scale_short),
+            max(img_scale_short) + 1)
+        img_scale = (long_edge, short_edge)
+        return img_scale, None
+
+    @staticmethod
+    def random_sample_ratio(img_scale, ratio_range):
+        """Randomly sample an img_scale when ``ratio_range`` is specified.
+
+        A ratio will be randomly sampled from the range specified by
+        ``ratio_range``. Then it would be multiplied with ``img_scale`` to
+        generate the sampled scale.
+
+        Args:
+            img_scale (tuple): Images scale base to multiply with ratio.
+            ratio_range (tuple[float]): The minimum and maximum ratio to scale
+                the ``img_scale``.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(scale, None)``, where
+                ``scale`` is the sampled ratio multiplied with ``img_scale``
+                and None is just a placeholder to be consistent with
+                :func:`random_select`.
+        """
+
+        assert isinstance(img_scale, tuple) and len(img_scale) == 2
+        min_ratio, max_ratio = ratio_range
+        assert min_ratio <= max_ratio
+        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
+        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
+        return scale, None
+
+    def _random_scale(self, results):
+        """Randomly sample an img_scale according to ``ratio_range`` and
+        ``multiscale_mode``.
+
+        If ``ratio_range`` is specified, a ratio will be sampled and
+        multiplied with ``img_scale``.
+        If multiple scales are specified by ``img_scale``, a scale will be
+        sampled according to ``multiscale_mode``.
+        Otherwise, a single scale will be used.
+
+        Args:
+            results (dict): Result dict from :obj:`dataset`.
+
+        Returns:
+            dict: Two new keys ``scale`` and ``scale_idx`` are added into
+                ``results``, which would be used by subsequent pipelines.
+ """ + + if self.ratio_range is not None: + if self.img_scale is None: + h, w = results['img'].shape[:2] + scale, scale_idx = self.random_sample_ratio((w, h), + self.ratio_range) + else: + scale, scale_idx = self.random_sample_ratio( + self.img_scale[0], self.ratio_range) + elif len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + elif self.multiscale_mode == 'range': + scale, scale_idx = self.random_sample(self.img_scale) + elif self.multiscale_mode == 'value': + scale, scale_idx = self.random_select(self.img_scale) + else: + raise NotImplementedError + + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _align(self, img, size_divisor, interpolation=None): + align_h = int(np.ceil(img.shape[0] / size_divisor)) * size_divisor + align_w = int(np.ceil(img.shape[1] / size_divisor)) * size_divisor + if interpolation is None: + img = mmcv.imresize(img, (align_w, align_h)) + else: + img = mmcv.imresize( + img, (align_w, align_h), interpolation=interpolation) + return img + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + if self.keep_ratio: + img, scale_factor = mmcv.imrescale( + results['img'], results['scale'], return_scale=True) + # align # + img = self._align(img, self.size_divisor) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results['img'].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results['img'], results['scale'], return_scale=True) + + h, w = img.shape[:2] + sd = self.size_divisor + assert h % sd == 0 and w % sd == 0, \ + 'img size not align. h:{} w:{}'.format(h, w) + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img'] = img + results['img_shape'] = img.shape + results['pad_shape'] = img.shape # in case that there is no padding + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_seg(self, results): + """Resize semantic segmentation map with ``results['scale']``.""" + for key in results.get('seg_fields', []): + if self.keep_ratio: + gt_seg = mmcv.imrescale( + results[key], results['scale'], interpolation='nearest') + gt_seg = self._align( + gt_seg, self.size_divisor, interpolation='nearest') + else: + gt_seg = mmcv.imresize( + results[key], results['scale'], interpolation='nearest') + h, w = gt_seg.shape[:2] + sd = self.size_divisor + assert h % sd == 0 and w % sd == 0, \ + 'gt_seg size not align. h:{} w:{}'.format(h, w) + results[key] = gt_seg + + def __call__(self, results): + """Call function to resize images, bounding boxes, masks, semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', + 'keep_ratio' keys are added into result dict. + """ + + if 'scale' not in results: + self._random_scale(results) + self._resize_img(results) + self._resize_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(img_scale={self.img_scale}, ' + f'multiscale_mode={self.multiscale_mode}, ' + f'ratio_range={self.ratio_range}, ' + f'keep_ratio={self.keep_ratio})') + return repr_str + + @PIPELINES.register_module() class Resize(object): """Resize images & seg. 
From e94d6d9dacfd7a0d5a2107b5725ef13ae64c07ad Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 12 Jun 2021 02:38:16 +0800 Subject: [PATCH 08/96] Set arg: find_unused_parameters = True --- configs/_base_/models/segformer.py | 2 ++ configs/segformer/segformer_b1_512x512_160k_ade20k.py | 2 -- configs/segformer/segformer_b2_512x512_160k_ade20k.py | 2 -- configs/segformer/segformer_b3_512x512_160k_ade20k.py | 2 -- configs/segformer/segformer_b4_512x512_160k_ade20k.py | 2 -- ...12_160k_ade20k.py => segformer_b5_640x640_160k_ade20k.py} | 5 ++--- 6 files changed, 4 insertions(+), 11 deletions(-) rename configs/segformer/{segformer_b5_512x512_160k_ade20k.py => segformer_b5_640x640_160k_ade20k.py} (93%) diff --git a/configs/_base_/models/segformer.py b/configs/_base_/models/segformer.py index 78498b11e5..83c7639fa9 100644 --- a/configs/_base_/models/segformer.py +++ b/configs/_base_/models/segformer.py @@ -33,3 +33,5 @@ # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) + +find_unused_parameters = True diff --git a/configs/segformer/segformer_b1_512x512_160k_ade20k.py b/configs/segformer/segformer_b1_512x512_160k_ade20k.py index cee3828c07..26f1ff05e0 100644 --- a/configs/segformer/segformer_b1_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_b1_512x512_160k_ade20k.py @@ -35,8 +35,6 @@ train_cfg=dict(), test_cfg=dict(mode='whole')) -find_unused_parameters = True - # optimizer optimizer = dict( _delete_=True, diff --git a/configs/segformer/segformer_b2_512x512_160k_ade20k.py b/configs/segformer/segformer_b2_512x512_160k_ade20k.py index fa6523fe95..2812d0fc7c 100644 --- a/configs/segformer/segformer_b2_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_b2_512x512_160k_ade20k.py @@ -35,8 +35,6 @@ train_cfg=dict(), test_cfg=dict(mode='whole')) -find_unused_parameters = True - # optimizer optimizer = dict( _delete_=True, diff --git a/configs/segformer/segformer_b3_512x512_160k_ade20k.py b/configs/segformer/segformer_b3_512x512_160k_ade20k.py index 335ed1e797..823daa0d5d 100644 --- a/configs/segformer/segformer_b3_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_b3_512x512_160k_ade20k.py @@ -35,8 +35,6 @@ train_cfg=dict(), test_cfg=dict(mode='whole')) -find_unused_parameters = True - # optimizer optimizer = dict( _delete_=True, diff --git a/configs/segformer/segformer_b4_512x512_160k_ade20k.py b/configs/segformer/segformer_b4_512x512_160k_ade20k.py index 3704614aac..59c2bcbede 100644 --- a/configs/segformer/segformer_b4_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_b4_512x512_160k_ade20k.py @@ -35,8 +35,6 @@ train_cfg=dict(), test_cfg=dict(mode='whole')) -find_unused_parameters = True - # optimizer optimizer = dict( _delete_=True, diff --git a/configs/segformer/segformer_b5_512x512_160k_ade20k.py b/configs/segformer/segformer_b5_640x640_160k_ade20k.py similarity index 93% rename from configs/segformer/segformer_b5_512x512_160k_ade20k.py rename to configs/segformer/segformer_b5_640x640_160k_ade20k.py index 53d8a3c506..92b40c91b1 100644 --- a/configs/segformer/segformer_b5_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_b5_640x640_160k_ade20k.py @@ -1,5 +1,6 @@ _base_ = [ - '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', + '../_base_/models/segformer.py', + '../_base_/datasets/ade20k_aligned_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] @@ -35,8 +36,6 @@ train_cfg=dict(), test_cfg=dict(mode='whole')) -find_unused_parameters = True - # 
optimizer optimizer = dict( _delete_=True, From 9edbdddc3734f104e2af0002e60178a2d9a50fe1 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 19 Jun 2021 13:38:25 +0800 Subject: [PATCH 09/96] parameters init refactor --- mmseg/models/backbones/mit.py | 231 +++++++++++++++++----------------- 1 file changed, 119 insertions(+), 112 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index bf9baafcb0..d90517f204 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -2,22 +2,30 @@ import torch import torch.nn as nn -from mmcv.cnn import build_activation_layer, build_norm_layer -from mmcv.runner import load_checkpoint +from mmcv.cnn import (ConvModule, Linear, build_activation_layer, + build_norm_layer) +from mmcv.runner import BaseModule, ModuleList, load_checkpoint from ...utils import get_root_logger from ..builder import BACKBONES from ..utils import DropPath, to_2tuple, trunc_normal_ -class DWConv(nn.Module): +class DWConv(BaseModule): def __init__(self, dim=768): super(DWConv, self).__init__() - self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + self.dwconv = ConvModule( + in_channels=dim, + out_channels=dim, + kernel_size=3, + stride=1, + padding=(3 - 1) // 2, + bias=True, + groups=dim) def forward(self, x, H, W): - B, N, C = x.shape + B, _, C = x.shape x = x.transpose(1, 2).view(B, C, H, W) x = self.dwconv(x) x = x.flatten(2).transpose(1, 2) @@ -25,7 +33,7 @@ def forward(self, x, H, W): return x -class Mlp(nn.Module): +class Mlp(BaseModule): def __init__(self, in_features, @@ -36,28 +44,27 @@ def __init__(self, super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) + self.fc1 = Linear(in_features, hidden_features) self.dwconv = DWConv(hidden_features) self.act = build_activation_layer(act_cfg) - self.fc2 = nn.Linear(hidden_features, out_features) + self.fc2 = Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() def forward(self, x, H, W): x = self.fc1(x) @@ -69,7 +76,7 @@ def forward(self, x, H, W): return x -class Attention(nn.Module): +class Attention(BaseModule): def __init__(self, dim, @@ -89,34 +96,36 @@ def __init__(self, head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 - self.q = nn.Linear(dim, dim, bias=qkv_bias) - self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.q = Linear(dim, dim, bias=qkv_bias) + self.kv = 
Linear(dim, dim * 2, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) + self.proj = Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) self.sr_ratio = sr_ratio if sr_ratio > 1: - self.sr = nn.Conv2d( - dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.sr = ConvModule( + in_channels=dim, + out_channels=dim, + kernel_size=sr_ratio, + stride=sr_ratio) _, self.norm = build_norm_layer(norm_cfg, dim) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: + def init_weights(self): + for m in self.modules: + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() def forward(self, x, H, W): B, N, C = x.shape @@ -147,7 +156,7 @@ def forward(self, x, H, W): return x -class Block(nn.Module): +class Block(BaseModule): def __init__(self, dim, @@ -183,22 +192,21 @@ def __init__(self, act_cfg=act_cfg, drop=drop) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: + def init_weights(self): + for m in self.modules: + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() def forward(self, x, H, W): x = x + self.drop_path(self.attn(self.norm1(x), H, W)) @@ -207,7 +215,7 @@ def forward(self, x, H, W): return x -class OverlapPatchEmbed(nn.Module): +class OverlapPatchEmbed(BaseModule): """Image to Patch Embedding.""" def __init__(self, @@ -233,22 +241,21 @@ def __init__(self, padding=(patch_size[0] // 2, patch_size[1] // 2)) self.norm = nn.LayerNorm(embed_dim) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: + def init_weights(self): + for m in self.modules: + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif 
isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-        elif isinstance(m, nn.Conv2d):
-            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-            fan_out //= m.groups
-            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
-            if m.bias is not None:
-                m.bias.data.zero_()
+                nn.init.constant_(m.weight, 1.0)
+            elif isinstance(m, nn.Conv2d):
+                fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                fan_out //= m.groups
+                m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+                if m.bias is not None:
+                    m.bias.data.zero_()

     def forward(self, x):
         x = self.proj(x)
@@ -260,7 +267,13 @@ def forward(self, x):


 @BACKBONES.register_module()
-class MixVisionTransformer(nn.Module):
+class MixVisionTransformer(BaseModule):
+    """Segformer.
+
+    A PyTorch implementation of: `SegFormer: Simple and Efficient Design
+    for Semantic Segmentation with Transformers` -
+        https://arxiv.org/abs/2105.15203
+    """

     def __init__(self,
                  img_size=224,
@@ -276,8 +289,12 @@ def __init__(self,
                  drop_path_rate=0.,
                  norm_cfg=dict(type='LN', eps=1e-6),
                  depths=[3, 4, 6, 3],
-                 sr_ratios=[8, 4, 2, 1]):
+                 sr_ratios=[8, 4, 2, 1],
+                 init_cfg=None,
+                 pretrained=None):
         super().__init__()
+        self.init_cfg = init_cfg
+        self.pretrained = pretrained
         self.depths = depths
         # patch_embed
         self.patch_embed1 = OverlapPatchEmbed(
@@ -310,7 +327,7 @@ def __init__(self,
             x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
         ]  # stochastic depth decay rule
         cur = 0
-        self.block1 = nn.ModuleList([
+        self.block1 = ModuleList([
             Block(
                 dim=embed_dims[0],
                 num_heads=num_heads[0],
@@ -325,7 +342,7 @@ def __init__(self,
         ])
         _, self.norm1 = build_norm_layer(norm_cfg, embed_dims[0])
         cur += depths[0]
-        self.block2 = nn.ModuleList([
+        self.block2 = ModuleList([
             Block(
                 dim=embed_dims[1],
                 num_heads=num_heads[1],
@@ -341,7 +358,7 @@ def __init__(self,

         _, self.norm2 = build_norm_layer(norm_cfg, embed_dims[1])
         cur += depths[1]
-        self.block3 = nn.ModuleList([
+        self.block3 = ModuleList([
             Block(
                 dim=embed_dims[2],
                 num_heads=num_heads[2],
@@ -357,7 +374,7 @@ def __init__(self,

         _, self.norm3 = build_norm_layer(norm_cfg, embed_dims[2])
         cur += depths[2]
-        self.block4 = nn.ModuleList([
+        self.block4 = ModuleList([
             Block(
                 dim=embed_dims[3],
                 num_heads=num_heads[3],
@@ -372,33 +389,28 @@ def __init__(self,
         ])
         _, self.norm4 = build_norm_layer(norm_cfg, embed_dims[3])

-        # classification head
-        # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes
-        # > 0 else nn.Identity()
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-        elif isinstance(m, nn.Conv2d):
-            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-            fan_out //= m.groups
-            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
-            if m.bias is not None:
-                m.bias.data.zero_()
-
-    def init_weights(self, pretrained=None):
-        if isinstance(pretrained, str):
+    def init_weights(self):
+        if isinstance(self.pretrained, type(None)):
+            for m in self.modules:
+                if isinstance(m, nn.Linear):
+                    trunc_normal_(m.weight, std=.02)
+                    if isinstance(m, nn.Linear) and m.bias is not None:
+                        nn.init.constant_(m.bias, 0)
+                elif isinstance(m, nn.LayerNorm):
+                    nn.init.constant_(m.bias, 0)
+                    nn.init.constant_(m.weight, 1.0)
+                elif isinstance(m, nn.Conv2d):
+                    fan_out
= m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(self.pretrained, str): logger = get_root_logger() load_checkpoint( self, - pretrained, + self.pretrained, map_location='cpu', strict=False, logger=logger) @@ -433,7 +445,7 @@ def no_weight_decay(self): 'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token' } # has pos_embed may be better - def forward_features(self, x): + def forward(self, x): B = x.shape[0] outs = [] @@ -470,8 +482,3 @@ def forward_features(self, x): outs.append(x) return outs - - def forward(self, x): - x = self.forward_features(x) - - return x From 99d7308626cc9fcb893bba13e2ab2bc10d9d0eee Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sun, 20 Jun 2021 11:36:25 +0800 Subject: [PATCH 10/96] 1. Refactor segformer backbone parameters init; 2. Remove rebundant functions and unit tests; --- mmseg/models/backbones/mit.py | 146 ++++++++-------------- mmseg/models/utils/__init__.py | 3 +- mmseg/models/utils/drop.py | 31 ----- mmseg/models/utils/helpers.py | 20 --- tests/test_models/test_utils/test_drop.py | 28 ----- 5 files changed, 50 insertions(+), 178 deletions(-) delete mode 100644 mmseg/models/utils/drop.py delete mode 100644 mmseg/models/utils/helpers.py delete mode 100644 tests/test_models/test_utils/test_drop.py diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index d90517f204..c2226815c7 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -3,18 +3,20 @@ import torch import torch.nn as nn from mmcv.cnn import (ConvModule, Linear, build_activation_layer, - build_norm_layer) + build_norm_layer, constant_init, normal_init, + trunc_normal_init) +from mmcv.cnn.bricks import DropPath from mmcv.runner import BaseModule, ModuleList, load_checkpoint +from torch.nn.modules.utils import _pair as to_2tuple from ...utils import get_root_logger from ..builder import BACKBONES -from ..utils import DropPath, to_2tuple, trunc_normal_ class DWConv(BaseModule): def __init__(self, dim=768): - super(DWConv, self).__init__() + super().__init__() self.dwconv = ConvModule( in_channels=dim, out_channels=dim, @@ -26,9 +28,9 @@ def __init__(self, dim=768): def forward(self, x, H, W): B, _, C = x.shape - x = x.transpose(1, 2).view(B, C, H, W) + x = x.transpose(1, 2).reshape(B, C, H, W) x = self.dwconv(x) - x = x.flatten(2).transpose(1, 2) + x = x.flatten(2).transpose(1, 2).contiguous() return x @@ -36,41 +38,23 @@ def forward(self, x, H, W): class Mlp(BaseModule): def __init__(self, - in_features, - hidden_features=None, - out_features=None, + embed_dims, + feedforward_channels, act_cfg=dict(type='GELU'), - drop=0.): + drop_rate=0.): super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = Linear(in_features, hidden_features) - self.dwconv = DWConv(hidden_features) - self.act = build_activation_layer(act_cfg) - self.fc2 = Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - def init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 
- fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() + in_channels = embed_dims + self.fc1 = Linear(in_channels, feedforward_channels) + self.dwconv = DWConv(feedforward_channels) + self.act = build_activation_layer(act_cfg) + self.fc2 = Linear(feedforward_channels, in_channels) + self.drop = nn.Dropout(drop_rate) def forward(self, x, H, W): x = self.fc1(x) x = self.dwconv(x, H, W) x = self.act(x) - x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x @@ -111,36 +95,20 @@ def __init__(self, stride=sr_ratio) _, self.norm = build_norm_layer(norm_cfg, dim) - def init_weights(self): - for m in self.modules: - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() - def forward(self, x, H, W): B, N, C = x.shape q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) if self.sr_ratio > 1: - x_ = x.permute(0, 2, 1).reshape(B, C, H, W) - x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) - x_ = self.norm(x_) - kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, - C // self.num_heads).permute( - 2, 0, 3, 1, 4) + x = x.permute(0, 2, 1).reshape(B, C, H, W) + x = self.sr(x).reshape(B, C, -1).permute(0, 2, 1) + x = self.norm(x) + kv = self.kv(x).reshape(B, N, 2, self.num_heads, + C // self.num_heads).permute( + 2, 0, 3, 1, 4) else: - kv = self.kv(x).reshape(B, -1, 2, self.num_heads, + kv = self.kv(x).reshape(B, N, 2, self.num_heads, C // self.num_heads).permute( 2, 0, 3, 1, 4) k, v = kv[0], kv[1] @@ -164,9 +132,9 @@ def __init__(self, mlp_ratio=4., qkv_bias=False, qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), sr_ratio=1): @@ -177,36 +145,36 @@ def __init__(self, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, sr_ratio=sr_ratio) # NOTE: drop path for stochastic depth, we shall see if this is better # than dropout here self.drop_path = DropPath( - drop_path) if drop_path > 0. else nn.Identity() + drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() _, self.norm2 = build_norm_layer(norm_cfg, dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, + embed_dims=dim, + feedforward_channels=mlp_hidden_dim, act_cfg=act_cfg, - drop=drop) + drop_rate=drop_rate) def init_weights(self): for m in self.modules: if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) + trunc_normal_init(m.weight, std=.02) + if m.bias is not None: + constant_init(m.bias, 0) elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) + constant_init(m.bias, 0) + constant_init(m.weight, 1.0) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + normal_init(m.weight, 0, math.sqrt(2.0 / fan_out)) if m.bias is not None: - m.bias.data.zero_() + constant_init(m.bias, 0) def forward(self, x, H, W): x = x + self.drop_path(self.attn(self.norm1(x), H, W)) @@ -230,9 +198,9 @@ def __init__(self, self.img_size = img_size self.patch_size = patch_size - self.H, self.W = img_size[0] // patch_size[0], img_size[ + num_rows, num_cols = img_size[0] // patch_size[0], img_size[ 1] // patch_size[1] - self.num_patches = self.H * self.W + self.num_patches = num_rows * num_cols self.proj = nn.Conv2d( in_chans, embed_dim, @@ -241,22 +209,6 @@ def __init__(self, padding=(patch_size[0] // 2, patch_size[1] // 2)) self.norm = nn.LayerNorm(embed_dim) - def init_weights(self): - for m in self.modules: - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() - def forward(self, x): x = self.proj(x) _, _, H, W = x.shape @@ -390,22 +342,22 @@ def __init__(self, _, self.norm4 = build_norm_layer(norm_cfg, embed_dims[3]) def init_weights(self): - if isinstance(self.pretrained, type(None)): + if self.pretrained is None: for m in self.modules: if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) + trunc_normal_init(m.weight, std=.02) + if m.bias is not None: + constant_init(m.bias, 0) elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) + constant_init(m.bias, 0) + constant_init(m.weight, 1.0) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[ 1] * m.out_channels fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + normal_init(m.weight, 0, math.sqrt(2.0 / fan_out)) if m.bias is not None: - m.bias.data.zero_() + constant_init(m.bias) elif isinstance(self.pretrained, str): logger = get_root_logger() load_checkpoint( diff --git a/mmseg/models/utils/__init__.py b/mmseg/models/utils/__init__.py index be11d77f4e..b7066eb03e 100644 --- a/mmseg/models/utils/__init__.py +++ b/mmseg/models/utils/__init__.py @@ -1,4 +1,3 @@ -from .drop import DropPath from .inverted_residual import InvertedResidual, InvertedResidualV3 from .make_divisible import make_divisible from .res_layer import ResLayer @@ -9,5 
+8,5 @@ __all__ = [ 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', - 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'vit_convert' + 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'vit_convert' ] diff --git a/mmseg/models/utils/drop.py b/mmseg/models/utils/drop.py deleted file mode 100644 index 4520b0ff40..0000000000 --- a/mmseg/models/utils/drop.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Modified from https://github.com/rwightman/pytorch-image- -models/blob/master/timm/models/layers/drop.py.""" - -import torch -from torch import nn - - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of - residual blocks). - - Args: - drop_prob (float): Drop rate for paths of model. Dropout rate has - to be between 0 and 1. Default: 0. - """ - - def __init__(self, drop_prob=0.): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - self.keep_prob = 1 - drop_prob - - def forward(self, x): - if self.drop_prob == 0. or not self.training: - return x - shape = (x.shape[0], ) + (1, ) * ( - x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = self.keep_prob + torch.rand( - shape, dtype=x.dtype, device=x.device) - random_tensor.floor_() # binarize - output = x.div(self.keep_prob) * random_tensor - return output diff --git a/mmseg/models/utils/helpers.py b/mmseg/models/utils/helpers.py deleted file mode 100644 index 9d2bd96dd2..0000000000 --- a/mmseg/models/utils/helpers.py +++ /dev/null @@ -1,20 +0,0 @@ -import collections.abc -from itertools import repeat - - -# From PyTorch internals -def _ntuple(n): - - def parse(x): - if isinstance(x, collections.abc.Iterable): - return x - return tuple(repeat(x, n)) - - return parse - - -to_1tuple = _ntuple(1) -to_2tuple = _ntuple(2) -to_3tuple = _ntuple(3) -to_4tuple = _ntuple(4) -to_ntuple = _ntuple diff --git a/tests/test_models/test_utils/test_drop.py b/tests/test_models/test_utils/test_drop.py deleted file mode 100644 index 1331af8d01..0000000000 --- a/tests/test_models/test_utils/test_drop.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch - -from mmseg.models.utils import DropPath - - -def test_drop_path(): - - # zero drop - layer = DropPath() - - # input NLC format feature - x = torch.randn((1, 16, 32)) - layer(x) - - # input NLHW format feature - x = torch.randn((1, 32, 4, 4)) - layer(x) - - # non-zero drop - layer = DropPath(0.1) - - # input NLC format feature - x = torch.randn((1, 16, 32)) - layer(x) - - # input NLHW format feature - x = torch.randn((1, 32, 4, 4)) - layer(x) From 337382776810068d3c1c3a23edbc3be3f19be3bd Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sun, 20 Jun 2021 11:39:41 +0800 Subject: [PATCH 11/96] Remove rebundant codes --- mmseg/models/backbones/mit.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index c2226815c7..ea4d5e3856 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -104,11 +104,11 @@ def forward(self, x, H, W): x = x.permute(0, 2, 1).reshape(B, C, H, W) x = self.sr(x).reshape(B, C, -1).permute(0, 2, 1) x = self.norm(x) - kv = self.kv(x).reshape(B, N, 2, self.num_heads, + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute( 2, 0, 3, 1, 4) else: - kv = self.kv(x).reshape(B, N, 2, self.num_heads, + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute( 2, 0, 3, 1, 4) k, v = kv[0], kv[1] @@ -160,22 
+160,6 @@ def __init__(self, act_cfg=act_cfg, drop_rate=drop_rate) - def init_weights(self): - for m in self.modules: - if isinstance(m, nn.Linear): - trunc_normal_init(m.weight, std=.02) - if m.bias is not None: - constant_init(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - constant_init(m.bias, 0) - constant_init(m.weight, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - fan_out //= m.groups - normal_init(m.weight, 0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - constant_init(m.bias, 0) - def forward(self, x, H, W): x = x + self.drop_path(self.attn(self.norm1(x), H, W)) x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) From a6b02ee0c4115ad2291d3d9909499bf72b220a53 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sun, 20 Jun 2021 21:56:38 +0800 Subject: [PATCH 12/96] Replace Linear Layer to 1X1 Conv --- mmseg/models/decode_heads/segformer_head.py | 128 ++++++++++---------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/mmseg/models/decode_heads/segformer_head.py b/mmseg/models/decode_heads/segformer_head.py index 7fba8b349a..69f82ed9b8 100644 --- a/mmseg/models/decode_heads/segformer_head.py +++ b/mmseg/models/decode_heads/segformer_head.py @@ -1,91 +1,91 @@ import torch -import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule -from ..builder import HEADS -from .decode_head import BaseDecodeHead - - -class MLP(nn.Module): - """Linear Embedding.""" - - def __init__(self, input_dim=2048, embed_dim=768): - super().__init__() - self.proj = nn.Linear(input_dim, embed_dim) - - def forward(self, x): - x = x.flatten(2).transpose(1, 2) - x = self.proj(x) - return x +from mmseg.models.builder import HEADS +from mmseg.models.decode_heads.decode_head import BaseDecodeHead @HEADS.register_module() class SegFormerHead(BaseDecodeHead): - """ - SegFormer: Simple and Efficient Design for Semantic Segmentation with - Transformers - """ + """The MLP Head of segformer.""" - def __init__(self, feature_strides, decoder_params, **kwargs): + def __init__(self, interpolate_mode='bilinear', **kwargs): super(SegFormerHead, self).__init__( input_transform='multiple_select', **kwargs) - assert len(feature_strides) == len(self.in_channels) - assert min(feature_strides) == feature_strides[0] - self.feature_strides = feature_strides - - c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = \ - self.in_channels + self.interpolate_mode = interpolate_mode - decoder_params = decoder_params - embedding_dim = decoder_params['embed_dim'] + embed_dim = self.channels - self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=embedding_dim) - self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=embedding_dim) - self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=embedding_dim) - self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=embedding_dim) - - self.linear_fuse = ConvModule( - in_channels=embedding_dim * 4, - out_channels=embedding_dim, + self.conv_c4 = ConvModule( + in_channels=self.in_channels[-1], + out_channels=embed_dim, + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.conv_c3 = ConvModule( + in_channels=self.in_channels[-2], + out_channels=embed_dim, + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.conv_c2 = ConvModule( + in_channels=self.in_channels[-3], + out_channels=embed_dim, kernel_size=1, - norm_cfg=dict(type='SyncBN', requires_grad=True)) + stride=1, + 
norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.conv_c1 = ConvModule( + in_channels=self.in_channels[-4], + out_channels=embed_dim, + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) - self.linear_pred = nn.Conv2d( - embedding_dim, self.num_classes, kernel_size=1) + self.conv_fuse = ConvModule( + in_channels=embed_dim * 4, + out_channels=embed_dim, + kernel_size=1, + norm_cfg=self.norm_cfg) def forward(self, inputs): - x = self._transform_inputs(inputs) # len=4, 1/4,1/8,1/16,1/32 + # Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32 + x = self._transform_inputs(inputs) c1, c2, c3, c4 = x - # MLP decoder on C1-C4 # - n, _, h, w = c4.shape + # MLP decoder on C1-C4 + n, _, h, w = c1.shape - _c4 = self.linear_c4(c4).permute(0, 2, - 1).reshape(n, -1, c4.shape[2], - c4.shape[3]) - _c4 = F.interpolate( - _c4, size=c1.size()[2:], mode='bilinear', align_corners=False) + out_c4 = self.conv_c4(c4) + out_c4 = F.interpolate( + out_c4, + size=(h, w), + mode=self.mode, + align_corners=self.align_corners) - _c3 = self.linear_c3(c3).permute(0, 2, - 1).reshape(n, -1, c3.shape[2], - c3.shape[3]) - _c3 = F.interpolate( - _c3, size=c1.size()[2:], mode='bilinear', align_corners=False) + out_c3 = self.conv_c3(c3) + out_c3 = F.interpolate( + out_c3, + size=(h, w), + mode=self.mode, + align_corners=self.align_corners) - _c2 = self.linear_c2(c2).permute(0, 2, - 1).reshape(n, -1, c2.shape[2], - c2.shape[3]) - _c2 = F.interpolate( - _c2, size=c1.size()[2:], mode='bilinear', align_corners=False) + out_c2 = self.conv_c2(c2) + out_c2 = F.interpolate( + out_c2, + size=(h, w), + mode=self.mode, + align_corners=self.align_corners) - _c1 = self.linear_c1(c1).permute(0, 2, - 1).reshape(n, -1, c1.shape[2], - c1.shape[3]) + out_c1 = self.conv_c1(c1) - _c = self.linear_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1)) + out = self.conv_fuse( + torch.cat([out_c4, out_c3, out_c2, out_c1], dim=1)) - x = self.dropout(_c) - x = self.linear_pred(x) + x = self.cls_seg(out) return x From 6365d168157b8c66f598923e04c18335d2840a9d Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sun, 20 Jun 2021 22:34:54 +0800 Subject: [PATCH 13/96] Use nn.ModuleList to refactor segformer head. --- mmseg/models/decode_heads/segformer_head.py | 99 +++++++-------------- 1 file changed, 34 insertions(+), 65 deletions(-) diff --git a/mmseg/models/decode_heads/segformer_head.py b/mmseg/models/decode_heads/segformer_head.py index 69f82ed9b8..0fdc3b9179 100644 --- a/mmseg/models/decode_heads/segformer_head.py +++ b/mmseg/models/decode_heads/segformer_head.py @@ -1,91 +1,60 @@ import torch -import torch.nn.functional as F +import torch.nn as nn from mmcv.cnn import ConvModule from mmseg.models.builder import HEADS from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.ops import resize @HEADS.register_module() class SegFormerHead(BaseDecodeHead): - """The MLP Head of segformer.""" + """The MLP Head of segformer. + + Args: + interpolate_mode: The interpolate mode of MLP head upsample operation. + Default: 'bilinear'. 
+ """ def __init__(self, interpolate_mode='bilinear', **kwargs): super(SegFormerHead, self).__init__( input_transform='multiple_select', **kwargs) + self.interpolate_mode = interpolate_mode - embed_dim = self.channels + num_inputs = len(self.in_channels) - self.conv_c4 = ConvModule( - in_channels=self.in_channels[-1], - out_channels=embed_dim, - kernel_size=1, - stride=1, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.conv_c3 = ConvModule( - in_channels=self.in_channels[-2], - out_channels=embed_dim, - kernel_size=1, - stride=1, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.conv_c2 = ConvModule( - in_channels=self.in_channels[-3], - out_channels=embed_dim, - kernel_size=1, - stride=1, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.conv_c1 = ConvModule( - in_channels=self.in_channels[-4], - out_channels=embed_dim, - kernel_size=1, - stride=1, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) + self.convs = nn.ModuleList() + for i in range(num_inputs): + self.convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.channels, + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) self.conv_fuse = ConvModule( - in_channels=embed_dim * 4, - out_channels=embed_dim, + in_channels=self.channels * num_inputs, + out_channels=self.channels, kernel_size=1, norm_cfg=self.norm_cfg) def forward(self, inputs): # Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32 - x = self._transform_inputs(inputs) - c1, c2, c3, c4 = x - - # MLP decoder on C1-C4 - n, _, h, w = c1.shape - - out_c4 = self.conv_c4(c4) - out_c4 = F.interpolate( - out_c4, - size=(h, w), - mode=self.mode, - align_corners=self.align_corners) - - out_c3 = self.conv_c3(c3) - out_c3 = F.interpolate( - out_c3, - size=(h, w), - mode=self.mode, - align_corners=self.align_corners) - - out_c2 = self.conv_c2(c2) - out_c2 = F.interpolate( - out_c2, - size=(h, w), - mode=self.mode, - align_corners=self.align_corners) - - out_c1 = self.conv_c1(c1) + inputs = self._transform_inputs(inputs) + outs = [] + for x, conv in zip(inputs, self.convs): + outs.append( + resize( + input=conv(x), + size=inputs[0].shape[2:], + mode=self.interpolate_mode, + align_corners=self.align_corners)) - out = self.conv_fuse( - torch.cat([out_c4, out_c3, out_c2, out_c1], dim=1)) + out = self.conv_fuse(torch.cat(outs, dim=1)) - x = self.cls_seg(out) + out = self.cls_seg(out) - return x + return out From 7e12d7c805a8be92456a362f1407f94ba8a1fe8f Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sun, 20 Jun 2021 22:40:27 +0800 Subject: [PATCH 14/96] Remove local to_xtuple --- mmseg/models/utils/helpers.py | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 mmseg/models/utils/helpers.py diff --git a/mmseg/models/utils/helpers.py b/mmseg/models/utils/helpers.py deleted file mode 100644 index 9d2bd96dd2..0000000000 --- a/mmseg/models/utils/helpers.py +++ /dev/null @@ -1,20 +0,0 @@ -import collections.abc -from itertools import repeat - - -# From PyTorch internals -def _ntuple(n): - - def parse(x): - if isinstance(x, collections.abc.Iterable): - return x - return tuple(repeat(x, n)) - - return parse - - -to_1tuple = _ntuple(1) -to_2tuple = _ntuple(2) -to_3tuple = _ntuple(3) -to_4tuple = _ntuple(4) -to_ntuple = _ntuple From d967a9f40492391fb0b0926f3c3b475803849553 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Mon, 21 Jun 2021 01:06:57 +0800 Subject: [PATCH 15/96] 1. Remove rebundant codes; 2. 
Modify module name; --- mmseg/models/backbones/mit.py | 108 +++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index ea4d5e3856..e32bff01c0 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -6,6 +6,7 @@ build_norm_layer, constant_init, normal_init, trunc_normal_init) from mmcv.cnn.bricks import DropPath +from mmcv.cnn.bricks.transformer import MultiheadAttention from mmcv.runner import BaseModule, ModuleList, load_checkpoint from torch.nn.modules.utils import _pair as to_2tuple @@ -13,29 +14,48 @@ from ..builder import BACKBONES -class DWConv(BaseModule): +def nlc2nchw(tensor, H, W): + assert len(tensor.shape) == 3 + B, _, C = tensor.shape + return tensor.transpose(1, 2).reshape(B, C, H, W) - def __init__(self, dim=768): + +def nchw2nlc(tensor): + assert len(tensor.shape) == 4 + return tensor.flatten(2).transpose(1, 2).contiguous() + + +class PEConv(BaseModule): + """Mix-FFN use 3x3 depth-wise Conv to provide positional encode + information. + + Args: + embed_dims (int): The channels of token after embedding. + kernel_size (int): The kernel size of convolution operation. + Default: 3. + stride (int): The kernel slide move distance of one step. + Default: 1. + """ + + def __init__(self, embed_dims, kernel_size=3, stride=1): super().__init__() - self.dwconv = ConvModule( - in_channels=dim, - out_channels=dim, - kernel_size=3, - stride=1, - padding=(3 - 1) // 2, + self.conv = ConvModule( + in_channels=embed_dims, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, bias=True, - groups=dim) + groups=embed_dims) def forward(self, x, H, W): - B, _, C = x.shape - x = x.transpose(1, 2).reshape(B, C, H, W) - x = self.dwconv(x) - x = x.flatten(2).transpose(1, 2).contiguous() - + x = nlc2nchw(x, H, W) + x = self.conv(x) + x = nchw2nlc(x) return x -class Mlp(BaseModule): +class MixFFN(BaseModule): def __init__(self, embed_dims, @@ -46,21 +66,24 @@ def __init__(self, in_channels = embed_dims self.fc1 = Linear(in_channels, feedforward_channels) - self.dwconv = DWConv(feedforward_channels) + self.pe_conv = PEConv(feedforward_channels) self.act = build_activation_layer(act_cfg) self.fc2 = Linear(feedforward_channels, in_channels) self.drop = nn.Dropout(drop_rate) def forward(self, x, H, W): x = self.fc1(x) - x = self.dwconv(x, H, W) + x = self.pe_conv(x, H, W) x = self.act(x) + x = self.drop(x) x = self.fc2(x) + # NOTE: Add a activation function to follow origin FFN. 
+ # x = self.act(x) x = self.drop(x) return x -class Attention(BaseModule): +class EfficientMultiheadAttention(MultiheadAttention): def __init__(self, dim, @@ -68,8 +91,8 @@ def __init__(self, num_heads=8, qkv_bias=False, qk_scale=None, - attn_drop=0., - proj_drop=0., + attn_drop_rate=0., + proj_drop_rate=0., sr_ratio=1): super().__init__() assert dim % num_heads == 0, f'dim {dim} should be divided by ' @@ -82,9 +105,9 @@ def __init__(self, self.q = Linear(dim, dim, bias=qkv_bias) self.kv = Linear(dim, dim * 2, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) + self.attn_drop = nn.Dropout(attn_drop_rate) self.proj = Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) + self.proj_drop = nn.Dropout(proj_drop_rate) self.sr_ratio = sr_ratio if sr_ratio > 1: @@ -140,21 +163,19 @@ def __init__(self, sr_ratio=1): super().__init__() _, self.norm1 = build_norm_layer(norm_cfg, dim) - self.attn = Attention( + self.attn = EfficientMultiheadAttention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, - attn_drop=attn_drop_rate, - proj_drop=drop_rate, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=drop_rate, sr_ratio=sr_ratio) - # NOTE: drop path for stochastic depth, we shall see if this is better - # than dropout here self.drop_path = DropPath( drop_path_rate) if drop_path_rate > 0. else nn.Identity() _, self.norm2 = build_norm_layer(norm_cfg, dim) mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( + self.mlp = MixFFN( embed_dims=dim, feedforward_channels=mlp_hidden_dim, act_cfg=act_cfg, @@ -170,27 +191,17 @@ def forward(self, x, H, W): class OverlapPatchEmbed(BaseModule): """Image to Patch Embedding.""" - def __init__(self, - img_size=224, - patch_size=7, - stride=4, - in_chans=3, - embed_dim=768): + def __init__(self, patch_size=7, stride=4, in_chans=3, embed_dim=768): super().__init__() - img_size = to_2tuple(img_size) patch_size = to_2tuple(patch_size) - self.img_size = img_size - self.patch_size = patch_size - num_rows, num_cols = img_size[0] // patch_size[0], img_size[ - 1] // patch_size[1] - self.num_patches = num_rows * num_cols - self.proj = nn.Conv2d( - in_chans, - embed_dim, + self.proj = ConvModule( + in_channels=in_chans, + out_channels=embed_dim, kernel_size=patch_size, stride=stride, - padding=(patch_size[0] // 2, patch_size[1] // 2)) + padding=(patch_size[0] // 2, patch_size[1] // 2), + act_cfg=None) self.norm = nn.LayerNorm(embed_dim) def forward(self, x): @@ -204,11 +215,11 @@ def forward(self, x): @BACKBONES.register_module() class MixVisionTransformer(BaseModule): - """Segformer. + """The backbone of Segformer. 
- A PyTorch implement of : `An Image is Worth 16x16 Words: - Transformers for Image Recognition at Scale` - - https://arxiv.org/abs/2010.11929 + A PyTorch implement of : `SegFormer: Simple and Efficient Design for + Semantic Segmentation with Transformers` - + https://arxiv.org/pdf/2105.15203.pdf """ def __init__(self, @@ -223,6 +234,7 @@ def __init__(self, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN', eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], From 47326de589ab1c50a5d2bd7f4a79454ac6d75cb6 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Mon, 21 Jun 2021 17:11:07 +0800 Subject: [PATCH 16/96] Refactor the backbone of segformer using mmcv.cnn.bricks.transformer.py --- mmseg/models/backbones/mit.py | 517 +++++++++++++++++----------------- 1 file changed, 254 insertions(+), 263 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index e32bff01c0..172c017674 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -1,26 +1,25 @@ import math +from functools import partial import torch import torch.nn as nn -from mmcv.cnn import (ConvModule, Linear, build_activation_layer, - build_norm_layer, constant_init, normal_init, - trunc_normal_init) -from mmcv.cnn.bricks import DropPath +from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer, + constant_init, normal_init, trunc_normal_init) +from mmcv.cnn.bricks.drop import build_dropout from mmcv.cnn.bricks.transformer import MultiheadAttention -from mmcv.runner import BaseModule, ModuleList, load_checkpoint -from torch.nn.modules.utils import _pair as to_2tuple +from mmcv.runner import BaseModule, ModuleList, Sequential, load_checkpoint from ...utils import get_root_logger from ..builder import BACKBONES -def nlc2nchw(tensor, H, W): +def nlc_to_nchw(tensor, H, W): assert len(tensor.shape) == 3 B, _, C = tensor.shape return tensor.transpose(1, 2).reshape(B, C, H, W) -def nchw2nlc(tensor): +def nchw_to_nlc(tensor): assert len(tensor.shape) == 4 return tensor.flatten(2).transpose(1, 2).contiguous() @@ -49,165 +48,267 @@ def __init__(self, embed_dims, kernel_size=3, stride=1): groups=embed_dims) def forward(self, x, H, W): - x = nlc2nchw(x, H, W) + x = self.conv(x) - x = nchw2nlc(x) + return x class MixFFN(BaseModule): + """An implementation of MixFFN of Segformer. + + The differences between MixFFN & FFN: + 1. Use 1X1 Conv to replace Linear layer. + 2. Introduce 3X3 Conv to encode positional information. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + num_fcs (int, optional): The number of fully-connected layers in + FFNs. Default: 2. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='ReLU') + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + add_identity (bool, optional): Whether to add the + identity connection. Default: `True`. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ def __init__(self, embed_dims, feedforward_channels, + num_fcs=2, act_cfg=dict(type='GELU'), - drop_rate=0.): - super().__init__() + ffn_drop=0., + dropout_layer=None, + add_identity=True, + init_cfg=None): + super(MixFFN, self).__init__(init_cfg) + assert num_fcs >= 2, 'num_fcs should be no less ' \ + f'than 2. got {num_fcs}.' + + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.num_fcs = num_fcs + self.act_cfg = act_cfg + self.activate = build_activation_layer(act_cfg) + + conv1x1 = partial( + ConvModule, + kernel_size=1, + stride=1, + bias=True, + norm_cfg=None, + act_cfg=None) + layers = [] in_channels = embed_dims - self.fc1 = Linear(in_channels, feedforward_channels) - self.pe_conv = PEConv(feedforward_channels) - self.act = build_activation_layer(act_cfg) - self.fc2 = Linear(feedforward_channels, in_channels) - self.drop = nn.Dropout(drop_rate) - - def forward(self, x, H, W): - x = self.fc1(x) - x = self.pe_conv(x, H, W) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - # NOTE: Add a activation function to follow origin FFN. - # x = self.act(x) - x = self.drop(x) - return x + for _ in range(num_fcs - 1): + layers.append( + Sequential( + conv1x1( + in_channels=in_channels, + out_channels=feedforward_channels), + PEConv(feedforward_channels), self.activate, + nn.Dropout(ffn_drop))) + layers.append( + conv1x1( + in_channels=in_channels, out_channels=feedforward_channels)) + layers.append(nn.Dropout(ffn_drop)) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + self.add_identity = add_identity + + def forward(self, x, H, W, identity=None): + out = nlc_to_nchw(x, H, W) + out = self.layers(out) + out = nchw_to_nlc(out) + if not self.add_identity: + return self.dropout_layer(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) class EfficientMultiheadAttention(MultiheadAttention): + """An implementation of Efficient Multi-head Attention of Segformer. + + This module is modified from MultiheadAttention which is a module from + mmcv.cnn.bricks.transformer. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default: False. + qkv_bias (bool): enable bias for qkv if True. Default True. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head + Attention of Segformer. Default: 1. + """ def __init__(self, - dim, - norm_cfg=dict(type='LN'), - num_heads=8, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=None, + init_cfg=None, + batch_first=True, qkv_bias=False, - qk_scale=None, - attn_drop_rate=0., - proj_drop_rate=0., + norm_cfg=dict(type='LN'), sr_ratio=1): - super().__init__() - assert dim % num_heads == 0, f'dim {dim} should be divided by ' - f'num_heads {num_heads}.' 
- - self.dim = dim - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.q = Linear(dim, dim, bias=qkv_bias) - self.kv = Linear(dim, dim * 2, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop_rate) - self.proj = Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop_rate) + super().__init__( + embed_dims, + num_heads, + attn_drop, + proj_drop, + dropout_layer=dropout_layer, + init_cfg=init_cfg, + batch_first=batch_first, + bias=qkv_bias) self.sr_ratio = sr_ratio if sr_ratio > 1: self.sr = ConvModule( - in_channels=dim, - out_channels=dim, + in_channels=embed_dims, + out_channels=embed_dims, kernel_size=sr_ratio, stride=sr_ratio) - _, self.norm = build_norm_layer(norm_cfg, dim) + _, self.norm = build_norm_layer(norm_cfg, embed_dims) - def forward(self, x, H, W): - B, N, C = x.shape - q = self.q(x).reshape(B, N, self.num_heads, - C // self.num_heads).permute(0, 2, 1, 3) + def forward(self, x, H, W, identity=None): + B, _, C = x.shape + x_q = x if self.sr_ratio > 1: - x = x.permute(0, 2, 1).reshape(B, C, H, W) - x = self.sr(x).reshape(B, C, -1).permute(0, 2, 1) - x = self.norm(x) - kv = self.kv(x).reshape(B, -1, 2, self.num_heads, - C // self.num_heads).permute( - 2, 0, 3, 1, 4) - else: - kv = self.kv(x).reshape(B, -1, 2, self.num_heads, - C // self.num_heads).permute( - 2, 0, 3, 1, 4) - k, v = kv[0], kv[1] - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) + x_kv = x.permute(0, 2, 1).reshape(B, C, H, W) + x_kv = self.sr(x_kv).reshape(B, C, -1).permute(0, 2, 1) + x_kv = self.norm(x_kv) - return x + if identity is None: + identity = x_q + out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] -class Block(BaseModule): + return identity + self.dropout_layer(self.proj_drop(out)) + + +class TransformerEncoderLayer(BaseModule): + """Implements one encoder layer in Segformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed. + after the feed forward layer. Default 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0. + drop_path_rate (float): stochastic depth rate. Default 0.0. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + qkv_bias (bool): enable bias for qkv if True. + Default: True. + act_cfg (dict): The activation config for FFNs. + Defalut: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default: False. + init_cfg (dict, optional): Initialization config dict. + Default:None. + sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head + Attention of Segformer. Default: 1. 
+ """ def __init__(self, - dim, + embed_dims, num_heads, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, + feedforward_channels, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + num_fcs=2, + qkv_bias=True, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), + batch_first=True, sr_ratio=1): - super().__init__() - _, self.norm1 = build_norm_layer(norm_cfg, dim) + super(TransformerEncoderLayer, self).__init__() + + _, self.norm1 = build_norm_layer(norm_cfg, embed_dims) + self.attn = EfficientMultiheadAttention( - dim, + embed_dims=embed_dims, num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + batch_first=batch_first, qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop_rate=attn_drop_rate, - proj_drop_rate=drop_rate, + norm_cfg=norm_cfg, sr_ratio=sr_ratio) - self.drop_path = DropPath( - drop_path_rate) if drop_path_rate > 0. else nn.Identity() - _, self.norm2 = build_norm_layer(norm_cfg, dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = MixFFN( - embed_dims=dim, - feedforward_channels=mlp_hidden_dim, - act_cfg=act_cfg, - drop_rate=drop_rate) - def forward(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + _, self.norm2 = build_norm_layer(norm_cfg, embed_dims) + + self.ffn = MixFFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=num_fcs, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg) + def forward(self, x, H, W): + x = self.attn(self.norm1(x), H, W, identity=x) + x = self.ffn(self.norm2(x), H, W, identity=x) return x class OverlapPatchEmbed(BaseModule): """Image to Patch Embedding.""" - def __init__(self, patch_size=7, stride=4, in_chans=3, embed_dim=768): + def __init__(self, + patch_size, + in_channels, + embed_dims, + stride, + norm_cfg=None): super().__init__() - patch_size = to_2tuple(patch_size) self.proj = ConvModule( - in_channels=in_chans, - out_channels=embed_dim, + in_channels=in_channels, + out_channels=embed_dims, kernel_size=patch_size, stride=stride, - padding=(patch_size[0] // 2, patch_size[1] // 2), - act_cfg=None) - self.norm = nn.LayerNorm(embed_dim) + padding=patch_size // 2, + act_cfg=None, + norm_cfg=None) + self.norm = build_norm_layer(norm_cfg, embed_dims) def forward(self, x): x = self.proj(x) _, _, H, W = x.shape - x = x.flatten(2).transpose(1, 2) + x = nchw_to_nlc(x) x = self.norm(x) return x, H, W @@ -223,119 +324,63 @@ class MixVisionTransformer(BaseModule): """ def __init__(self, - img_size=224, - patch_size=16, - in_chans=3, + in_channels=3, embed_dims=[64, 128, 256, 512], + num_layers=[3, 4, 6, 3], num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], - qkv_bias=False, - qk_scale=None, + out_indices=(0, 1, 2, 3), + qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN', eps=1e-6), - depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], - init_cfg=None, - pretrained=None): + pretrained=None, + init_cfg=None): super().__init__() + + self.out_indices = out_indices self.init_cfg = init_cfg self.pretrained = pretrained - self.depths = depths - # patch_embed - self.patch_embed1 = OverlapPatchEmbed( - img_size=img_size, - patch_size=7, - stride=4, - in_chans=in_chans, - embed_dim=embed_dims[0]) - self.patch_embed2 = OverlapPatchEmbed( - img_size=img_size // 4, - patch_size=3, - stride=2, - in_chans=embed_dims[0], - embed_dim=embed_dims[1]) - 
self.patch_embed3 = OverlapPatchEmbed( - img_size=img_size // 8, - patch_size=3, - stride=2, - in_chans=embed_dims[1], - embed_dim=embed_dims[2]) - self.patch_embed4 = OverlapPatchEmbed( - img_size=img_size // 16, - patch_size=3, - stride=2, - in_chans=embed_dims[2], - embed_dim=embed_dims[3]) + + patch_sizes = [7, 3, 3, 3] + strides = [4, 2, 2, 2] # transformer encoder dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + x.item() + for x in torch.linspace(0, drop_path_rate, sum(num_layers)) ] # stochastic depth decay rule + cur = 0 - self.block1 = ModuleList([ - Block( - dim=embed_dims[0], - num_heads=num_heads[0], - mlp_ratio=mlp_ratios[0], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[cur + i], - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[0]) for i in range(depths[0]) - ]) - _, self.norm1 = build_norm_layer(norm_cfg, embed_dims[0]) - cur += depths[0] - self.block2 = ModuleList([ - Block( - dim=embed_dims[1], - num_heads=num_heads[1], - mlp_ratio=mlp_ratios[1], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[cur + i], - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[1]) for i in range(depths[1]) - ]) - _, self.norm2 = build_norm_layer(norm_cfg, embed_dims[1]) - - cur += depths[1] - self.block3 = ModuleList([ - Block( - dim=embed_dims[2], - num_heads=num_heads[2], - mlp_ratio=mlp_ratios[2], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[cur + i], - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[2]) for i in range(depths[2]) - ]) - _, self.norm3 = build_norm_layer(norm_cfg, embed_dims[2]) - - cur += depths[2] - self.block4 = ModuleList([ - Block( - dim=embed_dims[3], - num_heads=num_heads[3], - mlp_ratio=mlp_ratios[3], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[cur + i], - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[3]) for i in range(depths[3]) - ]) - _, self.norm4 = build_norm_layer(norm_cfg, embed_dims[3]) + self.layers = ModuleList() + for stage_id in range(len(num_layers)): + patch_embed = OverlapPatchEmbed( + patch_size=patch_sizes[stage_id], + in_channels=in_channels, + embed_dims=embed_dims[0], + stride=strides[stage_id], + norm_cfg=norm_cfg) + layer = ModuleList([ + TransformerEncoderLayer( + embed_dims=embed_dims[stage_id], + num_heads=num_heads[stage_id], + feedforward_channels=mlp_ratios[stage_id] * embed_dims, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[cur + i], + num_fcs=2, + qkv_bias=qkv_bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + sr_ratio=sr_ratios[stage_id]) + for i in range(num_layers[stage_id]) + ]) + _, norm = build_norm_layer(norm_cfg, embed_dims[stage_id]) + self.layers.append(ModuleList(patch_embed, layer, norm)) + cur += num_layers[stage_id] def init_weights(self): if self.pretrained is None: @@ -363,70 +408,16 @@ def init_weights(self): strict=False, logger=logger) - def reset_drop_path(self, drop_path_rate): - dpr = [ - x.item() - for x in torch.linspace(0, drop_path_rate, sum(self.depths)) - ] - cur = 0 - for i in range(self.depths[0]): - self.block1[i].drop_path.drop_prob = dpr[cur + i] - - cur += self.depths[0] - for i in range(self.depths[1]): - self.block2[i].drop_path.drop_prob = dpr[cur + i] - - cur += self.depths[1] - for i in range(self.depths[2]): - self.block3[i].drop_path.drop_prob = dpr[cur + i] - - cur += self.depths[2] - for i in range(self.depths[3]): - 
self.block4[i].drop_path.drop_prob = dpr[cur + i] - - def freeze_patch_emb(self): - self.patch_embed1.requires_grad = False - - @torch.jit.ignore - def no_weight_decay(self): - return { - 'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token' - } # has pos_embed may be better - def forward(self, x): B = x.shape[0] outs = [] - # stage 1 - x, H, W = self.patch_embed1(x) - for i, blk in enumerate(self.block1): - x = blk(x, H, W) - x = self.norm1(x) - x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() - outs.append(x) - - # stage 2 - x, H, W = self.patch_embed2(x) - for i, blk in enumerate(self.block2): - x = blk(x, H, W) - x = self.norm2(x) - x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() - outs.append(x) - - # stage 3 - x, H, W = self.patch_embed3(x) - for i, blk in enumerate(self.block3): - x = blk(x, H, W) - x = self.norm3(x) - x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() - outs.append(x) - - # stage 4 - x, H, W = self.patch_embed4(x) - for i, blk in enumerate(self.block4): - x = blk(x, H, W) - x = self.norm4(x) - x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() - outs.append(x) + for i, layer in enumerate(self.layers): + x, H, W = layer[0](x) + x = layer[1](x, H, W) + x = layer[2](x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + if i in self.out_indices: + outs.append(x) return outs From 589af69ecb48a58f98460ae92a14fdad2fc5b5e0 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Tue, 22 Jun 2021 02:28:19 +0800 Subject: [PATCH 17/96] Fix some code logic bugs. --- mmseg/models/backbones/mit.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 172c017674..8899462f5d 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -47,7 +47,7 @@ def __init__(self, embed_dims, kernel_size=3, stride=1): bias=True, groups=embed_dims) - def forward(self, x, H, W): + def forward(self, x): x = self.conv(x) @@ -119,7 +119,7 @@ def __init__(self, nn.Dropout(ffn_drop))) layers.append( conv1x1( - in_channels=in_channels, out_channels=feedforward_channels)) + in_channels=feedforward_channels, out_channels=in_channels)) layers.append(nn.Dropout(ffn_drop)) self.layers = Sequential(*layers) self.dropout_layer = build_dropout( @@ -202,6 +202,8 @@ def forward(self, x, H, W, identity=None): x_kv = x.permute(0, 2, 1).reshape(B, C, H, W) x_kv = self.sr(x_kv).reshape(B, C, -1).permute(0, 2, 1) x_kv = self.norm(x_kv) + else: + x_kv = x if identity is None: identity = x_q @@ -303,7 +305,7 @@ def __init__(self, padding=patch_size // 2, act_cfg=None, norm_cfg=None) - self.norm = build_norm_layer(norm_cfg, embed_dims) + _, self.norm = build_norm_layer(norm_cfg, embed_dims) def forward(self, x): x = self.proj(x) @@ -360,14 +362,15 @@ def __init__(self, patch_embed = OverlapPatchEmbed( patch_size=patch_sizes[stage_id], in_channels=in_channels, - embed_dims=embed_dims[0], + embed_dims=embed_dims[stage_id], stride=strides[stage_id], norm_cfg=norm_cfg) layer = ModuleList([ TransformerEncoderLayer( embed_dims=embed_dims[stage_id], num_heads=num_heads[stage_id], - feedforward_channels=mlp_ratios[stage_id] * embed_dims, + feedforward_channels=mlp_ratios[stage_id] * + embed_dims[stage_id], drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[cur + i], @@ -378,8 +381,9 @@ def __init__(self, sr_ratio=sr_ratios[stage_id]) for i in range(num_layers[stage_id]) ]) + in_channels = 
embed_dims[stage_id] _, norm = build_norm_layer(norm_cfg, embed_dims[stage_id]) - self.layers.append(ModuleList(patch_embed, layer, norm)) + self.layers.append(ModuleList([patch_embed, layer, norm])) cur += num_layers[stage_id] def init_weights(self): @@ -414,7 +418,8 @@ def forward(self, x): for i, layer in enumerate(self.layers): x, H, W = layer[0](x) - x = layer[1](x, H, W) + for block in layer[1]: + x = block(x, H, W) x = layer[2](x) x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() if i in self.out_indices: From 28099de2dea41179c4cbfe0a04276e5fd5100961 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Wed, 23 Jun 2021 02:08:15 +0800 Subject: [PATCH 18/96] Add mit_convert.py to match pretrain keys of segformer. --- mmseg/models/backbones/mit.py | 46 +++++++++++---- mmseg/models/utils/__init__.py | 5 +- mmseg/models/utils/timm_convert.py | 32 ----------- mmseg/models/utils/weights_convert.py | 81 +++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 44 deletions(-) delete mode 100644 mmseg/models/utils/timm_convert.py create mode 100644 mmseg/models/utils/weights_convert.py diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 8899462f5d..31277327a8 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -1,4 +1,5 @@ import math +import warnings from functools import partial import torch @@ -7,10 +8,11 @@ constant_init, normal_init, trunc_normal_init) from mmcv.cnn.bricks.drop import build_dropout from mmcv.cnn.bricks.transformer import MultiheadAttention -from mmcv.runner import BaseModule, ModuleList, Sequential, load_checkpoint +from mmcv.runner import BaseModule, ModuleList, Sequential, _load_checkpoint from ...utils import get_root_logger from ..builder import BACKBONES +from ..utils import mit_convert def nlc_to_nchw(tensor, H, W): @@ -45,7 +47,9 @@ def __init__(self, embed_dims, kernel_size=3, stride=1): stride=stride, padding=(kernel_size - 1) // 2, bias=True, - groups=embed_dims) + groups=embed_dims, + norm_cfg=None, + act_cfg=None) def forward(self, x): @@ -191,7 +195,9 @@ def __init__(self, in_channels=embed_dims, out_channels=embed_dims, kernel_size=sr_ratio, - stride=sr_ratio) + stride=sr_ratio, + norm_cfg=None, + act_cfg=None) _, self.norm = build_norm_layer(norm_cfg, embed_dims) def forward(self, x, H, W, identity=None): @@ -339,13 +345,23 @@ def __init__(self, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN', eps=1e-6), sr_ratios=[8, 4, 2, 1], + pretrain_style='official', pretrained=None, init_cfg=None): super().__init__() + assert pretrain_style in ['official', 'mmcls'] + + if isinstance(pretrained, str) or pretrained is None: + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + else: + raise TypeError('pretrained must be a str or None') + self.out_indices = out_indices - self.init_cfg = init_cfg + self.pretrain_style = pretrain_style self.pretrained = pretrained + self.init_cfg = init_cfg patch_sizes = [7, 3, 3, 3] strides = [4, 2, 2, 2] @@ -405,12 +421,22 @@ def init_weights(self): constant_init(m.bias) elif isinstance(self.pretrained, str): logger = get_root_logger() - load_checkpoint( - self, - self.pretrained, - map_location='cpu', - strict=False, - logger=logger) + checkpoint = _load_checkpoint( + self.pretrained, logger=logger, map_location='cpu') + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = 
checkpoint + + if self.pretrain_style == 'official': + # Because segformer backbone is not support by mmcls, + # so we need to convert pretrain weights to match this + # implementation. + state_dict = mit_convert(state_dict) + + self.load_state_dict(state_dict, False) def forward(self, x): B = x.shape[0] diff --git a/mmseg/models/utils/__init__.py b/mmseg/models/utils/__init__.py index b7066eb03e..d90792c79e 100644 --- a/mmseg/models/utils/__init__.py +++ b/mmseg/models/utils/__init__.py @@ -3,10 +3,11 @@ from .res_layer import ResLayer from .se_layer import SELayer from .self_attention_block import SelfAttentionBlock -from .timm_convert import vit_convert from .up_conv_block import UpConvBlock +from .weights_convert import mit_convert, vit_convert __all__ = [ 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', - 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'vit_convert' + 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'vit_convert', + 'mit_convert' ] diff --git a/mmseg/models/utils/timm_convert.py b/mmseg/models/utils/timm_convert.py deleted file mode 100644 index 2ce48b06d6..0000000000 --- a/mmseg/models/utils/timm_convert.py +++ /dev/null @@ -1,32 +0,0 @@ -from collections import OrderedDict - - -def vit_convert(timm_dict): - - mmseg_dict = OrderedDict() - - for k, v in timm_dict.items(): - if k.startswith('head'): - continue - if k.startswith('norm'): - new_k = k.replace('norm.', 'ln1.') - elif k.startswith('patch_embed'): - if 'proj' in k: - new_k = k.replace('proj', 'projection') - elif k.startswith('blocks'): - new_k = k.replace('blocks.', 'layers.') - if 'norm' in new_k: - new_k = new_k.replace('norm', 'ln') - elif 'mlp.fc1' in new_k: - new_k = new_k.replace('mlp.fc1', 'ffn.layers.0.0') - elif 'mlp.fc2' in new_k: - new_k = new_k.replace('mlp.fc2', 'ffn.layers.1') - elif 'attn.qkv' in new_k: - new_k = new_k.replace('attn.qkv.', 'attn.attn.in_proj_') - elif 'attn.proj' in new_k: - new_k = new_k.replace('attn.proj', 'attn.attn.out_proj') - else: - new_k = k - mmseg_dict[new_k] = v - - return mmseg_dict diff --git a/mmseg/models/utils/weights_convert.py b/mmseg/models/utils/weights_convert.py new file mode 100644 index 0000000000..ce1fb3f6d2 --- /dev/null +++ b/mmseg/models/utils/weights_convert.py @@ -0,0 +1,81 @@ +from collections import OrderedDict + +import torch + + +def vit_convert(timm_dict): + + mmseg_dict = OrderedDict() + + for k, v in timm_dict.items(): + if k.startswith('head'): + continue + if k.startswith('norm'): + new_k = k.replace('norm.', 'ln1.') + elif k.startswith('patch_embed'): + if 'proj' in k: + new_k = k.replace('proj', 'projection') + elif k.startswith('blocks'): + new_k = k.replace('blocks.', 'layers.') + if 'norm' in new_k: + new_k = new_k.replace('norm', 'ln') + elif 'mlp.fc1' in new_k: + new_k = new_k.replace('mlp.fc1', 'ffn.layers.0.0') + elif 'mlp.fc2' in new_k: + new_k = new_k.replace('mlp.fc2', 'ffn.layers.1') + elif 'attn.qkv' in new_k: + new_k = new_k.replace('attn.qkv.', 'attn.attn.in_proj_') + elif 'attn.proj' in new_k: + new_k = new_k.replace('attn.proj', 'attn.attn.out_proj') + else: + new_k = k + mmseg_dict[new_k] = v + + return mmseg_dict + + +def mit_convert(ckpt): + new_ckpt = OrderedDict() + # Process the concat between q linear weights and kv linear weights + for k, v in ckpt.items(): + if k.startswith('head'): + continue + elif k.startswith('patch_embed'): + stage_i = int(k.split('.')[0].replace('patch_embed', '')) + new_k = k.replace(f'patch_embed{stage_i}', f'layers.{stage_i-1}.0') + new_v = v + if 'proj.' 
in new_k: + new_k = new_k.replace('proj.', 'proj.conv.') + elif k.startswith('block'): + stage_i = int(k.split('.')[0].replace('block', '')) + new_k = k.replace(f'block{stage_i}', f'layers.{stage_i-1}.1') + new_v = v + if 'attn.q.' in new_k: + sub_item_k = k.replace('q.', 'kv.') + new_k = new_k.replace('q.', 'attn.in_proj_') + new_v = torch.cat([v, ckpt[sub_item_k]], dim=0) + elif 'attn.kv.' in new_k: + continue + elif 'attn.proj.' in new_k: + new_k = new_k.replace('proj.', 'attn.out_proj.') + elif 'attn.sr.' in new_k: + new_k = new_k.replace('sr.', 'sr.conv.') + elif 'mlp.' in new_k: + string = f'{new_k}-' + new_k = new_k.replace('mlp.', 'ffn.layers.') + if 'fc1.weight' in new_k or 'fc2.weight' in new_k: + new_v = v.reshape((*v.shape, 1, 1)) + new_k = new_k.replace('fc1.', '0.0.conv.') + new_k = new_k.replace('dwconv.dwconv.', '0.1.conv.conv.') + new_k = new_k.replace('fc2.', '1.conv.') + string += f'{new_k} {v.shape}-{new_v.shape}' + # print(string) + elif k.startswith('norm'): + stage_i = int(k.split('.')[0].replace('norm', '')) + new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i-1}.2') + new_v = v + else: + new_k = k + new_v = v + new_ckpt[new_k] = new_v + return new_ckpt From a2110ae0016ab14b109e40683c6a5c4c44e2b11d Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Tue, 6 Jul 2021 15:42:11 +0800 Subject: [PATCH 19/96] Resolve some comments. --- mmseg/models/backbones/mit.py | 114 ++++++++----------- mmseg/models/utils/ckpt_convert.py | 2 +- mmseg/models/utils/embed.py | 22 ++-- tests/test_models/test_backbones/test_mit.py | 17 +++ 4 files changed, 80 insertions(+), 75 deletions(-) create mode 100644 tests/test_models/test_backbones/test_mit.py diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 31277327a8..257b5c1624 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -12,7 +12,7 @@ from ...utils import get_root_logger from ..builder import BACKBONES -from ..utils import mit_convert +from ..utils import PatchEmbed, mit_convert def nlc_to_nchw(tensor, H, W): @@ -90,6 +90,7 @@ def __init__(self, num_fcs=2, act_cfg=dict(type='GELU'), ffn_drop=0., + pe_index=0., dropout_layer=None, add_identity=True, init_cfg=None): @@ -113,14 +114,17 @@ def __init__(self, layers = [] in_channels = embed_dims - for _ in range(num_fcs - 1): - layers.append( - Sequential( - conv1x1( - in_channels=in_channels, - out_channels=feedforward_channels), - PEConv(feedforward_channels), self.activate, - nn.Dropout(ffn_drop))) + for idx in range(num_fcs - 1): + container = [] + container.append( + conv1x1( + in_channels=in_channels, + out_channels=feedforward_channels)) + if pe_index == idx: + container.append(PEConv(feedforward_channels)) + container.append(self.activate) + container.append(nn.Dropout(ffn_drop)) + layers.append(Sequential(*container)) layers.append( conv1x1( in_channels=feedforward_channels, out_channels=in_channels)) @@ -198,15 +202,16 @@ def __init__(self, stride=sr_ratio, norm_cfg=None, act_cfg=None) - _, self.norm = build_norm_layer(norm_cfg, embed_dims) + # The ret[0] of build_norm_layer is norm name. 
+ self.norm = build_norm_layer(norm_cfg, embed_dims)[1] def forward(self, x, H, W, identity=None): - B, _, C = x.shape x_q = x if self.sr_ratio > 1: - x_kv = x.permute(0, 2, 1).reshape(B, C, H, W) - x_kv = self.sr(x_kv).reshape(B, C, -1).permute(0, 2, 1) + x_kv = nlc_to_nchw(x, H, W) + x_kv = self.sr(x_kv) + x_kv = nchw_to_nlc(x_kv) x_kv = self.norm(x_kv) else: x_kv = x @@ -263,7 +268,8 @@ def __init__(self, sr_ratio=1): super(TransformerEncoderLayer, self).__init__() - _, self.norm1 = build_norm_layer(norm_cfg, embed_dims) + # The ret[0] of build_norm_layer is norm name. + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] self.attn = EfficientMultiheadAttention( embed_dims=embed_dims, @@ -276,7 +282,8 @@ def __init__(self, norm_cfg=norm_cfg, sr_ratio=sr_ratio) - _, self.norm2 = build_norm_layer(norm_cfg, embed_dims) + # The ret[0] of build_norm_layer is norm name. + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] self.ffn = MixFFN( embed_dims=embed_dims, @@ -292,36 +299,6 @@ def forward(self, x, H, W): return x -class OverlapPatchEmbed(BaseModule): - """Image to Patch Embedding.""" - - def __init__(self, - patch_size, - in_channels, - embed_dims, - stride, - norm_cfg=None): - super().__init__() - - self.proj = ConvModule( - in_channels=in_channels, - out_channels=embed_dims, - kernel_size=patch_size, - stride=stride, - padding=patch_size // 2, - act_cfg=None, - norm_cfg=None) - _, self.norm = build_norm_layer(norm_cfg, embed_dims) - - def forward(self, x): - x = self.proj(x) - _, _, H, W = x.shape - x = nchw_to_nlc(x) - x = self.norm(x) - - return x, H, W - - @BACKBONES.register_module() class MixVisionTransformer(BaseModule): """The backbone of Segformer. @@ -336,6 +313,8 @@ def __init__(self, embed_dims=[64, 128, 256, 512], num_layers=[3, 4, 6, 3], num_heads=[1, 2, 4, 8], + patch_sizes=[7, 3, 3, 3], + strides=[4, 2, 2, 2], mlp_ratios=[4, 4, 4, 4], out_indices=(0, 1, 2, 3), qkv_bias=True, @@ -350,7 +329,9 @@ def __init__(self, init_cfg=None): super().__init__() - assert pretrain_style in ['official', 'mmcls'] + assert pretrain_style in [ + 'official', 'mmcls' + ], 'we only support official weights or mmcls weights.' 
if isinstance(pretrained, str) or pretrained is None: warnings.warn('DeprecationWarning: pretrained is a deprecated, ' @@ -358,14 +339,14 @@ def __init__(self, else: raise TypeError('pretrained must be a str or None') + self.patch_sizes = patch_sizes + self.strides = strides + self.out_indices = out_indices self.pretrain_style = pretrain_style self.pretrained = pretrained self.init_cfg = init_cfg - patch_sizes = [7, 3, 3, 3] - strides = [4, 2, 2, 2] - # transformer encoder dpr = [ x.item() @@ -374,33 +355,33 @@ def __init__(self, cur = 0 self.layers = ModuleList() - for stage_id in range(len(num_layers)): - patch_embed = OverlapPatchEmbed( - patch_size=patch_sizes[stage_id], + for i, num_layer in enumerate(num_layers): + patch_embed = PatchEmbed( in_channels=in_channels, - embed_dims=embed_dims[stage_id], - stride=strides[stage_id], + embed_dims=embed_dims[i], + kernel_size=patch_sizes[i], + stride=strides[i], + padding=patch_sizes[i] // 2, norm_cfg=norm_cfg) layer = ModuleList([ TransformerEncoderLayer( - embed_dims=embed_dims[stage_id], - num_heads=num_heads[stage_id], - feedforward_channels=mlp_ratios[stage_id] * - embed_dims[stage_id], + embed_dims=embed_dims[i], + num_heads=num_heads[i], + feedforward_channels=mlp_ratios[i] * embed_dims[i], drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, - drop_path_rate=dpr[cur + i], + drop_path_rate=dpr[cur + idx], num_fcs=2, qkv_bias=qkv_bias, act_cfg=act_cfg, norm_cfg=norm_cfg, - sr_ratio=sr_ratios[stage_id]) - for i in range(num_layers[stage_id]) + sr_ratio=sr_ratios[i]) for idx in range(num_layer) ]) - in_channels = embed_dims[stage_id] - _, norm = build_norm_layer(norm_cfg, embed_dims[stage_id]) + in_channels = embed_dims[i] + # The ret[0] of build_norm_layer is norm name. + norm = build_norm_layer(norm_cfg, embed_dims[i])[1] self.layers.append(ModuleList([patch_embed, layer, norm])) - cur += num_layers[stage_id] + cur += num_layer def init_weights(self): if self.pretrained is None: @@ -439,15 +420,14 @@ def init_weights(self): self.load_state_dict(state_dict, False) def forward(self, x): - B = x.shape[0] outs = [] for i, layer in enumerate(self.layers): - x, H, W = layer[0](x) + x, H, W = layer[0](x), layer[0].DH, layer[0].DW for block in layer[1]: x = block(x, H, W) x = layer[2](x) - x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + x = nlc_to_nchw(x, H, W) if i in self.out_indices: outs.append(x) diff --git a/mmseg/models/utils/ckpt_convert.py b/mmseg/models/utils/ckpt_convert.py index 48979dff47..fce860a659 100644 --- a/mmseg/models/utils/ckpt_convert.py +++ b/mmseg/models/utils/ckpt_convert.py @@ -103,7 +103,7 @@ def mit_convert(ckpt): new_k = k.replace(f'patch_embed{stage_i}', f'layers.{stage_i-1}.0') new_v = v if 'proj.' in new_k: - new_k = new_k.replace('proj.', 'proj.conv.') + new_k = new_k.replace('proj.', 'projection.') elif k.startswith('block'): stage_i = int(k.split('.')[0].replace('block', '')) new_k = k.replace(f'block{stage_i}', f'layers.{stage_i-1}.1') diff --git a/mmseg/models/utils/embed.py b/mmseg/models/utils/embed.py index 3bbb45b37a..214bfe6a5c 100644 --- a/mmseg/models/utils/embed.py +++ b/mmseg/models/utils/embed.py @@ -42,6 +42,8 @@ def __init__(self, if stride is None: stride = kernel_size + self.overlapping = stride == kernel_size + # The default setting of patch size is eaual to kernel size. 
patch_size = kernel_size if isinstance(patch_size, int): @@ -56,7 +58,7 @@ def __init__(self, self.patch_size = patch_size # Use conv layer to embed - conv_type = conv_type or dict(type='Conv2d') + conv_type = conv_type or 'Conv2d' self.projection = build_conv_layer( dict(type=conv_type), in_channels=in_channels, @@ -73,12 +75,18 @@ def __init__(self, def forward(self, x): H, W = x.shape[2], x.shape[3] - if H % self.patch_size[0] != 0: - x = F.pad(x, - (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) - if W % self.patch_size[1] != 0: - x = F.pad(x, - (0, self.patch_size[1] - W % self.patch_size[1], 0, 0)) + + # Modify H, W to multiple of patch size. + if self.overlapping: + pass + else: + if H % self.patch_size[0] != 0: + x = F.pad( + x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + if W % self.patch_size[1] != 0: + x = F.pad( + x, (0, self.patch_size[1] - W % self.patch_size[1], 0, 0)) + x = self.projection(x) self.DH, self.DW = x.shape[2], x.shape[3] x = x.flatten(2).transpose(1, 2) diff --git a/tests/test_models/test_backbones/test_mit.py b/tests/test_models/test_backbones/test_mit.py new file mode 100644 index 0000000000..689b4df0ca --- /dev/null +++ b/tests/test_models/test_backbones/test_mit.py @@ -0,0 +1,17 @@ +import torch + +from mmseg.models.backbones import MixVisionTransformer + +# import pytest + + +def test_mit(): + H, W = (512, 224) + temp = torch.randn((1, 3, H, W)) + model = MixVisionTransformer( + embed_dims=(16, 32, 64, 128), out_indices=(0, 1, 2, 3)) + outs = model(temp) + assert outs[0].shape == (1, 16, H // 4, W // 4) + assert outs[1].shape == (1, 32, H // 8, W // 8) + assert outs[2].shape == (1, 64, H // 16, W // 16) + assert outs[3].shape == (1, 128, H // 32, W // 32) From 8b7ca036b3d491c8d2efb3c2fd7e525237044eb1 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 10 Jul 2021 01:08:38 +0800 Subject: [PATCH 20/96] 1. Add some assert to ensure right params; 2. 
Support flexible peconv position; --- mmseg/models/backbones/mit.py | 38 +++++++++++++++++++++++++---------- mmseg/models/utils/embed.py | 5 +++-- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 257b5c1624..aeea5df795 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -17,7 +17,8 @@ def nlc_to_nchw(tensor, H, W): assert len(tensor.shape) == 3 - B, _, C = tensor.shape + B, L, C = tensor.shape + assert L == H * W, 'The seq_len doesn\'t match H, W' return tensor.transpose(1, 2).reshape(B, C, H, W) @@ -114,13 +115,17 @@ def __init__(self, layers = [] in_channels = embed_dims + # first position of MixFFN + if pe_index == 0: + layers.append(PEConv(in_channels)) for idx in range(num_fcs - 1): container = [] container.append( conv1x1( in_channels=in_channels, out_channels=feedforward_channels)) - if pe_index == idx: + # middle position of MixFFN + if pe_index == idx + 1: container.append(PEConv(feedforward_channels)) container.append(self.activate) container.append(nn.Dropout(ffn_drop)) @@ -128,6 +133,9 @@ def __init__(self, layers.append( conv1x1( in_channels=feedforward_channels, out_channels=in_channels)) + # Last position of MixFFN + if pe_index == num_fcs: + layers.append(PEConv(feedforward_channels)) layers.append(nn.Dropout(ffn_drop)) self.layers = Sequential(*layers) self.dropout_layer = build_dropout( @@ -310,20 +318,21 @@ class MixVisionTransformer(BaseModule): def __init__(self, in_channels=3, + num_stages=4, embed_dims=[64, 128, 256, 512], - num_layers=[3, 4, 6, 3], + depths=[3, 4, 6, 3], num_heads=[1, 2, 4, 8], patch_sizes=[7, 3, 3, 3], strides=[4, 2, 2, 2], - mlp_ratios=[4, 4, 4, 4], + sr_ratios=[8, 4, 2, 1], out_indices=(0, 1, 2, 3), + mlp_ratio=4, qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN', eps=1e-6), - sr_ratios=[8, 4, 2, 1], pretrain_style='official', pretrained=None, init_cfg=None): @@ -339,23 +348,30 @@ def __init__(self, else: raise TypeError('pretrained must be a str or None') + self.num_stages = num_stages + self.embed_dims = embed_dims + self.depths = depths + self.num_heads = num_heads self.patch_sizes = patch_sizes self.strides = strides + self.sr_ratios = sr_ratios + assert num_stages == len(embed_dims) == len(depths) == len(num_heads) \ + == len(patch_sizes) == len(strides) == len(sr_ratios) self.out_indices = out_indices + assert max(out_indices) < self.num_stages self.pretrain_style = pretrain_style self.pretrained = pretrained self.init_cfg = init_cfg # transformer encoder dpr = [ - x.item() - for x in torch.linspace(0, drop_path_rate, sum(num_layers)) + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) ] # stochastic depth decay rule cur = 0 self.layers = ModuleList() - for i, num_layer in enumerate(num_layers): + for i, depth in enumerate(depths): patch_embed = PatchEmbed( in_channels=in_channels, embed_dims=embed_dims[i], @@ -367,7 +383,7 @@ def __init__(self, TransformerEncoderLayer( embed_dims=embed_dims[i], num_heads=num_heads[i], - feedforward_channels=mlp_ratios[i] * embed_dims[i], + feedforward_channels=mlp_ratio * embed_dims[i], drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[cur + idx], @@ -375,13 +391,13 @@ def __init__(self, qkv_bias=qkv_bias, act_cfg=act_cfg, norm_cfg=norm_cfg, - sr_ratio=sr_ratios[i]) for idx in range(num_layer) + sr_ratio=sr_ratios[i]) for idx in range(depth) ]) in_channels = embed_dims[i] # The ret[0] of 
build_norm_layer is norm name. norm = build_norm_layer(norm_cfg, embed_dims[i])[1] self.layers.append(ModuleList([patch_embed, layer, norm])) - cur += num_layer + cur += depth def init_weights(self): if self.pretrained is None: diff --git a/mmseg/models/utils/embed.py b/mmseg/models/utils/embed.py index 214bfe6a5c..6386c717df 100644 --- a/mmseg/models/utils/embed.py +++ b/mmseg/models/utils/embed.py @@ -44,7 +44,7 @@ def __init__(self, self.overlapping = stride == kernel_size - # The default setting of patch size is eaual to kernel size. + # The default setting of patch size is equal to kernel size. patch_size = kernel_size if isinstance(patch_size, int): patch_size = to_2tuple(patch_size) @@ -76,10 +76,11 @@ def __init__(self, def forward(self, x): H, W = x.shape[2], x.shape[3] - # Modify H, W to multiple of patch size. + # TODO: Process overlapping op if self.overlapping: pass else: + # Modify H, W to multiple of patch size. if H % self.patch_size[0] != 0: x = F.pad( x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) From e137df3ab843dca641f0e2cfcd7852ee1c3bf92a Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 10 Jul 2021 01:36:43 +0800 Subject: [PATCH 21/96] Add pe_index assert and fix unit test. --- mmseg/models/backbones/mit.py | 5 ++++- mmseg/models/utils/embed.py | 2 +- tests/test_models/test_backbones/test_mit.py | 14 ++++++++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index aeea5df795..f2c5c437b2 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -91,7 +91,7 @@ def __init__(self, num_fcs=2, act_cfg=dict(type='GELU'), ffn_drop=0., - pe_index=0., + pe_index=1, dropout_layer=None, add_identity=True, init_cfg=None): @@ -99,6 +99,9 @@ def __init__(self, assert num_fcs >= 2, 'num_fcs should be no less ' \ f'than 2. got {num_fcs}.' + assert pe_index <= num_fcs and pe_index >= 0, 'pe_index range from 0 ' + 'to num_fcs' + self.embed_dims = embed_dims self.feedforward_channels = feedforward_channels self.num_fcs = num_fcs diff --git a/mmseg/models/utils/embed.py b/mmseg/models/utils/embed.py index 6386c717df..796ee2f404 100644 --- a/mmseg/models/utils/embed.py +++ b/mmseg/models/utils/embed.py @@ -42,7 +42,7 @@ def __init__(self, if stride is None: stride = kernel_size - self.overlapping = stride == kernel_size + self.overlapping = stride < kernel_size # The default setting of patch size is equal to kernel size. 
patch_size = kernel_size diff --git a/tests/test_models/test_backbones/test_mit.py b/tests/test_models/test_backbones/test_mit.py index 689b4df0ca..eed5c5ee2b 100644 --- a/tests/test_models/test_backbones/test_mit.py +++ b/tests/test_models/test_backbones/test_mit.py @@ -6,12 +6,14 @@ def test_mit(): - H, W = (512, 224) + H, W = (224, 224) temp = torch.randn((1, 3, H, W)) model = MixVisionTransformer( - embed_dims=(16, 32, 64, 128), out_indices=(0, 1, 2, 3)) + embed_dims=[32, 64, 160, 256], + num_heads=[1, 2, 5, 8], + out_indices=(0, 1, 2, 3)) outs = model(temp) - assert outs[0].shape == (1, 16, H // 4, W // 4) - assert outs[1].shape == (1, 32, H // 8, W // 8) - assert outs[2].shape == (1, 64, H // 16, W // 16) - assert outs[3].shape == (1, 128, H // 32, W // 32) + assert outs[0].shape == (1, 32, H // 4, W // 4) + assert outs[1].shape == (1, 64, H // 8, W // 8) + assert outs[2].shape == (1, 160, H // 16, W // 16) + assert outs[3].shape == (1, 256, H // 32, W // 32) From b4ae0b528c3a54568c05041cd0ae6c622a6d1d32 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Tue, 13 Jul 2021 01:23:46 +0800 Subject: [PATCH 22/96] 1. Add doc string for MixVisionTransformer; 2. Add some unit tests for MixVisionTransformer; --- mmseg/models/backbones/mit.py | 82 +++++++++++++------- mmseg/models/utils/__init__.py | 3 +- mmseg/models/utils/embed.py | 4 +- mmseg/models/utils/shape_convert.py | 10 +++ tests/test_models/test_backbones/test_mit.py | 25 ++++-- 5 files changed, 87 insertions(+), 37 deletions(-) create mode 100644 mmseg/models/utils/shape_convert.py diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index f2c5c437b2..4d0e48f45e 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -12,19 +12,7 @@ from ...utils import get_root_logger from ..builder import BACKBONES -from ..utils import PatchEmbed, mit_convert - - -def nlc_to_nchw(tensor, H, W): - assert len(tensor.shape) == 3 - B, L, C = tensor.shape - assert L == H * W, 'The seq_len doesn\'t match H, W' - return tensor.transpose(1, 2).reshape(B, C, H, W) - - -def nchw_to_nlc(tensor): - assert len(tensor.shape) == 4 - return tensor.flatten(2).transpose(1, 2).contiguous() +from ..utils import PatchEmbed, mit_convert, nchw_to_nlc, nlc_to_nchw class PEConv(BaseModule): @@ -317,13 +305,48 @@ class MixVisionTransformer(BaseModule): A PyTorch implement of : `SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers` - https://arxiv.org/pdf/2105.15203.pdf + + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): Embedding dimension. Default: 768. + num_stags (int): The num of stages. Default: 4. + num_layers (list[int]): The layer number of each transformer encode + layer. Default: [3, 4, 6, 3]. + num_heads (list[int]): The attention heads of each transformer + encode layer. Default: [1, 2, 4, 8]. + patch_sizes (list[int]): The patch_size of each overlapped patch embedding. + Default: [7, 3, 3, 3]. + strides (list[int]): The stride of each overlapped patch embedding. + Default: [4, 2, 2, 2]. + sr_ratios (list[int]): The spatial reduction rate of each transformer + encode layer. Default: [8, 4, 2, 1]. + out_indices (list[int] | tuple[int] | int): Output from which stages. + Default: (0, 1, 2, 3). + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + out_indices (list | tuple | int): Output from which stages. + Default: -1. + qkv_bias (bool): Enable bias for qkv if True. Default: True. 
+ drop_rate (float): Probability of an element to be zeroed. + Default 0.0 + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0 + drop_path_rate (float): stochastic depth rate. Default 0.0 + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + act_cfg (dict): The activation config for FFNs. + Defalut: dict(type='GELU'). + pretrain_style (str): Choose to use official or mmcls pretrain weights. + Default: official. + pretrained (str, optional): model pretrained path. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. """ def __init__(self, in_channels=3, + embed_dims=64, num_stages=4, - embed_dims=[64, 128, 256, 512], - depths=[3, 4, 6, 3], + num_layers=[3, 4, 6, 3], num_heads=[1, 2, 4, 8], patch_sizes=[7, 3, 3, 3], strides=[4, 2, 2, 2], @@ -351,14 +374,15 @@ def __init__(self, else: raise TypeError('pretrained must be a str or None') - self.num_stages = num_stages self.embed_dims = embed_dims - self.depths = depths + + self.num_stages = num_stages + self.num_layers = num_layers self.num_heads = num_heads self.patch_sizes = patch_sizes self.strides = strides self.sr_ratios = sr_ratios - assert num_stages == len(embed_dims) == len(depths) == len(num_heads) \ + assert num_stages == len(num_layers) == len(num_heads) \ == len(patch_sizes) == len(strides) == len(sr_ratios) self.out_indices = out_indices @@ -369,24 +393,26 @@ def __init__(self, # transformer encoder dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) - ] # stochastic depth decay rule + x.item() + for x in torch.linspace(0, drop_path_rate, sum(num_layers)) + ] # stochastic num_layer decay rule cur = 0 self.layers = ModuleList() - for i, depth in enumerate(depths): + for i, num_layer in enumerate(num_layers): + embed_dims_i = embed_dims * num_heads[i] patch_embed = PatchEmbed( in_channels=in_channels, - embed_dims=embed_dims[i], + embed_dims=embed_dims_i, kernel_size=patch_sizes[i], stride=strides[i], padding=patch_sizes[i] // 2, norm_cfg=norm_cfg) layer = ModuleList([ TransformerEncoderLayer( - embed_dims=embed_dims[i], + embed_dims=embed_dims_i, num_heads=num_heads[i], - feedforward_channels=mlp_ratio * embed_dims[i], + feedforward_channels=mlp_ratio * embed_dims_i, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[cur + idx], @@ -394,13 +420,13 @@ def __init__(self, qkv_bias=qkv_bias, act_cfg=act_cfg, norm_cfg=norm_cfg, - sr_ratio=sr_ratios[i]) for idx in range(depth) + sr_ratio=sr_ratios[i]) for idx in range(num_layer) ]) - in_channels = embed_dims[i] + in_channels = embed_dims_i # The ret[0] of build_norm_layer is norm name. 
-            norm = build_norm_layer(norm_cfg, embed_dims[i])[1]
+            norm = build_norm_layer(norm_cfg, embed_dims_i)[1]
             self.layers.append(ModuleList([patch_embed, layer, norm]))
-            cur += depth
+            cur += num_layer
 
     def init_weights(self):
         if self.pretrained is None:
diff --git a/mmseg/models/utils/__init__.py b/mmseg/models/utils/__init__.py
index 761489f656..32a953b834 100644
--- a/mmseg/models/utils/__init__.py
+++ b/mmseg/models/utils/__init__.py
@@ -5,10 +5,11 @@
 from .res_layer import ResLayer
 from .se_layer import SELayer
 from .self_attention_block import SelfAttentionBlock
+from .shape_convert import nchw_to_nlc, nlc_to_nchw
 from .up_conv_block import UpConvBlock
 
 __all__ = [
     'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual',
     'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'vit_convert',
-    'mit_convert', 'swin_convert', 'PatchEmbed'
+    'mit_convert', 'swin_convert', 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw'
 ]
diff --git a/mmseg/models/utils/embed.py b/mmseg/models/utils/embed.py
index 796ee2f404..33a8c7db83 100644
--- a/mmseg/models/utils/embed.py
+++ b/mmseg/models/utils/embed.py
@@ -77,9 +77,7 @@ def forward(self, x):
         H, W = x.shape[2], x.shape[3]
 
         # TODO: Process overlapping op
-        if self.overlapping:
-            pass
-        else:
+        if not self.overlapping:
             # Modify H, W to multiple of patch size.
             if H % self.patch_size[0] != 0:
                 x = F.pad(
diff --git a/mmseg/models/utils/shape_convert.py b/mmseg/models/utils/shape_convert.py
new file mode 100644
index 0000000000..8362639cfa
--- /dev/null
+++ b/mmseg/models/utils/shape_convert.py
@@ -0,0 +1,10 @@
+def nlc_to_nchw(tensor, H, W):
+    assert len(tensor.shape) == 3
+    B, L, C = tensor.shape
+    assert L == H * W, 'The seq_len doesn\'t match H, W'
+    return tensor.transpose(1, 2).reshape(B, C, H, W)
+
+
+def nchw_to_nlc(tensor):
+    assert len(tensor.shape) == 4
+    return tensor.flatten(2).transpose(1, 2).contiguous()
diff --git a/tests/test_models/test_backbones/test_mit.py b/tests/test_models/test_backbones/test_mit.py
index eed5c5ee2b..8b826c5a3f 100644
--- a/tests/test_models/test_backbones/test_mit.py
+++ b/tests/test_models/test_backbones/test_mit.py
@@ -1,17 +1,32 @@
+import pytest
 import torch
 
 from mmseg.models.backbones import MixVisionTransformer
 
-# import pytest
-
 
 def test_mit():
+    with pytest.raises(AssertionError):
+        # Only 'official' and 'mmcls' pretrain styles are supported now.
+        MixVisionTransformer(pretrain_style='timm')
+
+    with pytest.raises(TypeError):
+        # pretrained must be a checkpoint path/url (str) or None.
+        MixVisionTransformer(pretrained=123)
+
+    # Test normal input
     H, W = (224, 224)
     temp = torch.randn((1, 3, H, W))
     model = MixVisionTransformer(
-        embed_dims=[32, 64, 160, 256],
-        num_heads=[1, 2, 5, 8],
-        out_indices=(0, 1, 2, 3))
+        embed_dims=32, num_heads=[1, 2, 5, 8], out_indices=(0, 1, 2, 3))
+    outs = model(temp)
+    assert outs[0].shape == (1, 32, H // 4, W // 4)
+    assert outs[1].shape == (1, 64, H // 8, W // 8)
+    assert outs[2].shape == (1, 160, H // 16, W // 16)
+    assert outs[3].shape == (1, 256, H // 32, W // 32)
+
+    # Test non-square input
+    H, W = (224, 320)
+    temp = torch.randn((1, 3, H, W))
     outs = model(temp)
     assert outs[0].shape == (1, 32, H // 4, W // 4)
     assert outs[1].shape == (1, 64, H // 8, W // 8)
From 38f7bbefe5480d0bbfde370a4b4b16dde4866f28 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 14 Jul 2021 11:06:43 +0800
Subject: [PATCH 23/96] Use hw_shape to pass shape of feature map.
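
All shape-dependent forward methods now take a single hw_shape tuple
instead of separate H and W arguments. A minimal sketch of the resulting
call pattern (illustrative only; shapes chosen arbitrarily):

    import torch
    from mmseg.models.backbones.mit import MixFFN

    hw_shape = (16, 16)                # spatial shape of the feature map
    x = torch.randn(1, 16 * 16, 64)    # [N, L, C] tokens, L must equal H * W
    ffn = MixFFN(embed_dims=64, feedforward_channels=256)
    out = ffn(x, hw_shape)             # hw_shape threaded through as one tuple
    assert out.shape == x.shape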
--- mmseg/models/backbones/mit.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 4d0e48f45e..1aba6e244e 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -133,8 +133,8 @@ def __init__(self, dropout_layer) if dropout_layer else torch.nn.Identity() self.add_identity = add_identity - def forward(self, x, H, W, identity=None): - out = nlc_to_nchw(x, H, W) + def forward(self, x, hw_shape, identity=None): + out = nlc_to_nchw(x, *hw_shape) out = self.layers(out) out = nchw_to_nlc(out) if not self.add_identity: @@ -204,11 +204,11 @@ def __init__(self, # The ret[0] of build_norm_layer is norm name. self.norm = build_norm_layer(norm_cfg, embed_dims)[1] - def forward(self, x, H, W, identity=None): + def forward(self, x, hw_shape, identity=None): x_q = x if self.sr_ratio > 1: - x_kv = nlc_to_nchw(x, H, W) + x_kv = nlc_to_nchw(x, *hw_shape) x_kv = self.sr(x_kv) x_kv = nchw_to_nlc(x_kv) x_kv = self.norm(x_kv) @@ -292,9 +292,9 @@ def __init__(self, dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), act_cfg=act_cfg) - def forward(self, x, H, W): - x = self.attn(self.norm1(x), H, W, identity=x) - x = self.ffn(self.norm2(x), H, W, identity=x) + def forward(self, x, hw_shape): + x = self.attn(self.norm1(x), hw_shape, identity=x) + x = self.ffn(self.norm2(x), hw_shape, identity=x) return x @@ -469,10 +469,11 @@ def forward(self, x): for i, layer in enumerate(self.layers): x, H, W = layer[0](x), layer[0].DH, layer[0].DW + hw_shape = (H, W) for block in layer[1]: - x = block(x, H, W) + x = block(x, hw_shape) x = layer[2](x) - x = nlc_to_nchw(x, H, W) + x = nlc_to_nchw(x, hw_shape) if i in self.out_indices: outs.append(x) From 443438f70054255a5287a3ec3c706185587558a8 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Wed, 14 Jul 2021 11:54:47 +0800 Subject: [PATCH 24/96] 1. Fix doc string of MixVisionTransformer; 2. Simplify MixFFN; 3. Modify H, W to hw_shape; --- mmseg/models/backbones/mit.py | 177 +++++++++------------------- mmseg/models/utils/shape_convert.py | 3 +- 2 files changed, 60 insertions(+), 120 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 1aba6e244e..4810ccf06c 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -1,10 +1,9 @@ import math import warnings -from functools import partial import torch import torch.nn as nn -from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer, +from mmcv.cnn import (Conv2d, build_activation_layer, build_norm_layer, constant_init, normal_init, trunc_normal_init) from mmcv.cnn.bricks.drop import build_dropout from mmcv.cnn.bricks.transformer import MultiheadAttention @@ -15,38 +14,6 @@ from ..utils import PatchEmbed, mit_convert, nchw_to_nlc, nlc_to_nchw -class PEConv(BaseModule): - """Mix-FFN use 3x3 depth-wise Conv to provide positional encode - information. - - Args: - embed_dims (int): The channels of token after embedding. - kernel_size (int): The kernel size of convolution operation. - Default: 3. - stride (int): The kernel slide move distance of one step. - Default: 1. 
- """ - - def __init__(self, embed_dims, kernel_size=3, stride=1): - super().__init__() - self.conv = ConvModule( - in_channels=embed_dims, - out_channels=embed_dims, - kernel_size=kernel_size, - stride=stride, - padding=(kernel_size - 1) // 2, - bias=True, - groups=embed_dims, - norm_cfg=None, - act_cfg=None) - - def forward(self, x): - - x = self.conv(x) - - return x - - class MixFFN(BaseModule): """An implementation of MixFFN of Segformer. @@ -59,16 +26,12 @@ class MixFFN(BaseModule): `MultiheadAttention`. Defaults: 256. feedforward_channels (int): The hidden dimension of FFNs. Defaults: 1024. - num_fcs (int, optional): The number of fully-connected layers in - FFNs. Default: 2. act_cfg (dict, optional): The activation config for FFNs. Default: dict(type='ReLU') ffn_drop (float, optional): Probability of an element to be zeroed in FFN. Default 0.0. dropout_layer (obj:`ConfigDict`): The dropout_layer used when adding the shortcut. - add_identity (bool, optional): Whether to add the - identity connection. Default: `True`. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ @@ -76,69 +39,49 @@ class MixFFN(BaseModule): def __init__(self, embed_dims, feedforward_channels, - num_fcs=2, act_cfg=dict(type='GELU'), ffn_drop=0., - pe_index=1, dropout_layer=None, - add_identity=True, init_cfg=None): super(MixFFN, self).__init__(init_cfg) - assert num_fcs >= 2, 'num_fcs should be no less ' \ - f'than 2. got {num_fcs}.' - - assert pe_index <= num_fcs and pe_index >= 0, 'pe_index range from 0 ' - 'to num_fcs' self.embed_dims = embed_dims self.feedforward_channels = feedforward_channels - self.num_fcs = num_fcs self.act_cfg = act_cfg self.activate = build_activation_layer(act_cfg) - conv1x1 = partial( - ConvModule, + in_channels = embed_dims + fc1 = Conv2d( + in_channels=in_channels, + out_channels=feedforward_channels, kernel_size=1, stride=1, + bias=True) + # 3x3 depth wise conv to provide positional encode information + pe_conv = Conv2d( + in_channels=feedforward_channels, + out_channels=feedforward_channels, + kernel_size=3, + stride=1, + padding=(3 - 1) // 2, bias=True, - norm_cfg=None, - act_cfg=None) - - layers = [] - in_channels = embed_dims - # first position of MixFFN - if pe_index == 0: - layers.append(PEConv(in_channels)) - for idx in range(num_fcs - 1): - container = [] - container.append( - conv1x1( - in_channels=in_channels, - out_channels=feedforward_channels)) - # middle position of MixFFN - if pe_index == idx + 1: - container.append(PEConv(feedforward_channels)) - container.append(self.activate) - container.append(nn.Dropout(ffn_drop)) - layers.append(Sequential(*container)) - layers.append( - conv1x1( - in_channels=feedforward_channels, out_channels=in_channels)) - # Last position of MixFFN - if pe_index == num_fcs: - layers.append(PEConv(feedforward_channels)) - layers.append(nn.Dropout(ffn_drop)) + groups=feedforward_channels) + fc2 = Conv2d( + in_channels=feedforward_channels, + out_channels=in_channels, + kernel_size=1, + stride=1, + bias=True) + drop = nn.Dropout(ffn_drop) + layers = [fc1, pe_conv, drop, fc2, drop] self.layers = Sequential(*layers) self.dropout_layer = build_dropout( dropout_layer) if dropout_layer else torch.nn.Identity() - self.add_identity = add_identity def forward(self, x, hw_shape, identity=None): - out = nlc_to_nchw(x, *hw_shape) + out = nlc_to_nchw(x, hw_shape) out = self.layers(out) out = nchw_to_nlc(out) - if not self.add_identity: - return self.dropout_layer(out) if identity is None: identity = x return identity + 
self.dropout_layer(out)
@@ -194,13 +137,11 @@ def __init__(self,
         self.sr_ratio = sr_ratio
 
         if sr_ratio > 1:
-            self.sr = ConvModule(
+            self.sr = Conv2d(
                 in_channels=embed_dims,
                 out_channels=embed_dims,
                 kernel_size=sr_ratio,
-                stride=sr_ratio,
-                norm_cfg=None,
-                act_cfg=None)
+                stride=sr_ratio)
 
             # The ret[0] of build_norm_layer is norm name.
             self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
@@ -208,7 +149,7 @@ def forward(self, x, hw_shape, identity=None):
         x_q = x
 
         if self.sr_ratio > 1:
-            x_kv = nlc_to_nchw(x, *hw_shape)
+            x_kv = nlc_to_nchw(x, hw_shape)
             x_kv = self.sr(x_kv)
             x_kv = nchw_to_nlc(x_kv)
             x_kv = self.norm(x_kv)
@@ -287,7 +228,6 @@ def __init__(self,
         self.ffn = MixFFN(
             embed_dims=embed_dims,
             feedforward_channels=feedforward_channels,
-            num_fcs=num_fcs,
             ffn_drop=drop_rate,
             dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
             act_cfg=act_cfg)
@@ -306,40 +246,39 @@ class MixVisionTransformer(BaseModule):
     Semantic Segmentation with Transformers` -
         https://arxiv.org/pdf/2105.15203.pdf
 
-        in_channels (int): Number of input channels. Default: 3.
-        embed_dims (int): Embedding dimension. Default: 768.
-        num_stags (int): The num of stages. Default: 4.
-        num_layers (list[int]): The layer number of each transformer encode
-            layer. Default: [3, 4, 6, 3].
-        num_heads (list[int]): The attention heads of each transformer
-            encode layer. Default: [1, 2, 4, 8].
-        patch_sizes (list[int]): The patch_size of each overlapped patch embedding.
-            Default: [7, 3, 3, 3].
-        strides (list[int]): The stride of each overlapped patch embedding.
-            Default: [4, 2, 2, 2].
-        sr_ratios (list[int]): The spatial reduction rate of each
-            transformer encode layer. Default: [8, 4, 2, 1].
-        out_indices (list[int] | tuple[int] | int): Output from which stages.
+    Args:
+        in_channels (int): Number of input channels. Default: 3.
+        embed_dims (int): Embedding dimension. Default: 768.
+        num_stages (int): The number of stages. Default: 4.
+        num_layers (Sequence[int]): The number of transformer encoder layers
+            in each stage. Default: [3, 4, 6, 3].
+        num_heads (Sequence[int]): The number of attention heads in each
+            stage. Default: [1, 2, 4, 8].
+        patch_sizes (Sequence[int]): The patch_size of each overlapped patch
+            embedding. Default: [7, 3, 3, 3].
+        strides (Sequence[int]): The stride of each overlapped patch embedding.
+            Default: [4, 2, 2, 2].
+        sr_ratios (Sequence[int]): The spatial reduction ratio of each
+            transformer encoder layer. Default: [8, 4, 2, 1].
+        out_indices (Sequence[int] | int): Output from which stages.
             Default: (0, 1, 2, 3).
-        mlp_ratio (int): ratio of mlp hidden dim to embedding dim.
-            Default: 4.
-        out_indices (list | tuple | int): Output from which stages.
-            Default: -1.
-        qkv_bias (bool): Enable bias for qkv if True. Default: True.
+        mlp_ratio (int): The ratio of MLP hidden dim to embedding dim.
+            Default: 4.
+        qkv_bias (bool): Enable bias for qkv if True. Default: True.
+        drop_rate (float): Probability of an element to be zeroed.
+            Default 0.0
+        attn_drop_rate (float): The drop out rate for attention layer.
+            Default 0.0
+        drop_path_rate (float): stochastic depth rate. Default 0.0
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN')
+        act_cfg (dict): The activation config for FFNs.
+            Default: dict(type='GELU').
+        pretrain_style (str): Choose to use official or mmcls pretrain weights.
+            Default: official.
+        pretrained (str, optional): model pretrained path. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
     """
 
     def __init__(self,
diff --git a/mmseg/models/utils/shape_convert.py b/mmseg/models/utils/shape_convert.py
index 8362639cfa..9d2051f31f 100644
--- a/mmseg/models/utils/shape_convert.py
+++ b/mmseg/models/utils/shape_convert.py
@@ -1,4 +1,5 @@
-def nlc_to_nchw(tensor, H, W):
+def nlc_to_nchw(tensor, hw_shape):
+    H, W = hw_shape
     assert len(tensor.shape) == 3
     B, L, C = tensor.shape
     assert L == H * W, 'The seq_len doesn\'t match H, W'
From 1affeb6c6e3b739b7ea4ff62f39114c2f85f1b9c Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 14 Jul 2021 12:29:57 +0800
Subject: [PATCH 25/96] Add more unit tests.

---
 mmseg/models/backbones/mit.py                |  4 ----
 tests/test_models/test_backbones/test_mit.py | 25 ++++++++++++++++++++
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py
index 4810ccf06c..1ef7ae9041 100644
--- a/mmseg/models/backbones/mit.py
+++ b/mmseg/models/backbones/mit.py
@@ -176,8 +176,6 @@ class TransformerEncoderLayer(BaseModule):
         attn_drop_rate (float): The drop out rate for attention layer.
             Default 0.0.
         drop_path_rate (float): stochastic depth rate. Default 0.0.
-        num_fcs (int): The number of fully-connected layers for FFNs.
-            Default: 2.
         qkv_bias (bool): enable bias for qkv if True. Default: True.
         act_cfg (dict): The activation config for FFNs.
@@ -200,7 +198,6 @@ def __init__(self,
                  drop_rate=0.,
                  attn_drop_rate=0.,
                  drop_path_rate=0.,
-                 num_fcs=2,
                  qkv_bias=True,
                  act_cfg=dict(type='GELU'),
                  norm_cfg=dict(type='LN'),
@@ -355,7 +352,6 @@ def __init__(self,
                     drop_rate=drop_rate,
                     attn_drop_rate=attn_drop_rate,
                     drop_path_rate=dpr[cur + idx],
-                    num_fcs=2,
                     qkv_bias=qkv_bias,
                     act_cfg=act_cfg,
                     norm_cfg=norm_cfg,
diff --git a/tests/test_models/test_backbones/test_mit.py b/tests/test_models/test_backbones/test_mit.py
index 8b826c5a3f..7d180ed973 100644
--- a/tests/test_models/test_backbones/test_mit.py
+++ b/tests/test_models/test_backbones/test_mit.py
@@ -2,6 +2,7 @@
 import torch
 
 from mmseg.models.backbones import MixVisionTransformer
+from mmseg.models.backbones.mit import EfficientMultiheadAttention, MixFFN
 
 
 def test_mit():
@@ -32,3 +33,27 @@ def test_mit():
     assert outs[1].shape == (1, 64, H // 8, W // 8)
     assert outs[2].shape == (1, 160, H // 16, W // 16)
     assert outs[3].shape == (1, 256, H // 32, W // 32)
+
+    # Test MixFFN
+    FFN = MixFFN(128, 512)
+    hw_shape = (32, 32)
+    token_len = 32 * 32
+    temp = torch.randn((1, token_len, 128))
+    # identity=None, the input itself is used as identity
+    out = FFN(temp, hw_shape)
+    assert out.shape == (1, token_len, 128)
+    # an identity tensor passed in explicitly
+    out = FFN(temp, hw_shape, temp)
+    assert out.shape == (1, token_len, 128)
+
+    # Test EfficientMHA
+    MHA = EfficientMultiheadAttention(128, 2)
+    hw_shape = (32, 32)
+    token_len = 32 * 32
+    temp = torch.randn((1, token_len, 128))
+    # identity=None, the input itself is used as identity
+    out = MHA(temp, hw_shape)
+    assert out.shape == (1, token_len, 128)
+    # an identity tensor passed in explicitly
+    out = MHA(temp, hw_shape, temp)
+    assert out.shape == (1, token_len, 128)
From e969496568ef16157e0572ad8d6dfa85f1e7f7a8 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 14 Jul 2021 12:35:10 +0800
Subject: [PATCH 26/96] Add doc string for shape conversion functions.

---
 mmseg/models/utils/shape_convert.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/mmseg/models/utils/shape_convert.py b/mmseg/models/utils/shape_convert.py
index 9d2051f31f..69388de49a 100644
--- a/mmseg/models/utils/shape_convert.py
+++ b/mmseg/models/utils/shape_convert.py
@@ -1,11 +1,22 @@
-def nlc_to_nchw(tensor, hw_shape):
+def nlc_to_nchw(x, hw_shape):
+    """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor for conversion.
+        hw_shape (Sequence[int]): The height and width of output feature map.
+    """
     H, W = hw_shape
-    assert len(tensor.shape) == 3
-    B, L, C = tensor.shape
+    assert len(x.shape) == 3
+    B, L, C = x.shape
     assert L == H * W, 'The seq_len doesn\'t match H, W'
-    return tensor.transpose(1, 2).reshape(B, C, H, W)
+    return x.transpose(1, 2).reshape(B, C, H, W)
+
 
-def nchw_to_nlc(tensor):
-    assert len(tensor.shape) == 4
-    return tensor.flatten(2).transpose(1, 2).contiguous()
+def nchw_to_nlc(x):
+    """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor for conversion.
+    """
+    assert len(x.shape) == 4
+    return x.flatten(2).transpose(1, 2).contiguous()
From 582939bb860bcdd6cc0801ac62da41554872d9a2 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 15 Jul 2021 00:50:01 +0800
Subject: [PATCH 27/96] Add some unit tests to improve code coverage.
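
The self.modules -> self.modules() fix below addresses a real failure:
nn.Module.modules is a method, so iterating over the attribute itself
raises a TypeError. A minimal standalone reproduction (not part of the
patch):

    import torch.nn as nn

    m = nn.Linear(4, 4)
    try:
        for _ in m.modules:    # method object, not iterable
            pass
    except TypeError as e:
        print(e)               # "'method' object is not iterable"
    for sub in m.modules():    # correct: yields m and its sub-modules
        print(type(sub).__name__)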
--- mmseg/models/backbones/mit.py | 4 ++-- tests/test_models/test_backbones/test_mit.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 1ef7ae9041..1e060697c2 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -365,7 +365,7 @@ def __init__(self, def init_weights(self): if self.pretrained is None: - for m in self.modules: + for m in self.modules(): if isinstance(m, nn.Linear): trunc_normal_init(m.weight, std=.02) if m.bias is not None: @@ -379,7 +379,7 @@ def init_weights(self): fan_out //= m.groups normal_init(m.weight, 0, math.sqrt(2.0 / fan_out)) if m.bias is not None: - constant_init(m.bias) + constant_init(m.bias, 0) elif isinstance(self.pretrained, str): logger = get_root_logger() checkpoint = _load_checkpoint( diff --git a/tests/test_models/test_backbones/test_mit.py b/tests/test_models/test_backbones/test_mit.py index 7d180ed973..bf6cca1649 100644 --- a/tests/test_models/test_backbones/test_mit.py +++ b/tests/test_models/test_backbones/test_mit.py @@ -19,6 +19,7 @@ def test_mit(): temp = torch.randn((1, 3, H, W)) model = MixVisionTransformer( embed_dims=32, num_heads=[1, 2, 5, 8], out_indices=(0, 1, 2, 3)) + model.init_weights() outs = model(temp) assert outs[0].shape == (1, 32, H // 4, W // 4) assert outs[1].shape == (1, 64, H // 8, W // 8) From e5acfe84523699c0530c84e338854d74ebfb8bf8 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Thu, 15 Jul 2021 12:55:21 +0800 Subject: [PATCH 28/96] Fix Segformer backbone pretrain weights match bug. --- mmseg/models/backbones/mit.py | 2 +- mmseg/models/utils/ckpt_convert.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 1e060697c2..6ba2fbdd9c 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -73,7 +73,7 @@ def __init__(self, stride=1, bias=True) drop = nn.Dropout(ffn_drop) - layers = [fc1, pe_conv, drop, fc2, drop] + layers = [fc1, pe_conv, self.activate, drop, fc2, drop] self.layers = Sequential(*layers) self.dropout_layer = build_dropout( dropout_layer) if dropout_layer else torch.nn.Identity() diff --git a/mmseg/models/utils/ckpt_convert.py b/mmseg/models/utils/ckpt_convert.py index fce860a659..26a1b96df9 100644 --- a/mmseg/models/utils/ckpt_convert.py +++ b/mmseg/models/utils/ckpt_convert.py @@ -117,15 +117,15 @@ def mit_convert(ckpt): elif 'attn.proj.' in new_k: new_k = new_k.replace('proj.', 'attn.out_proj.') elif 'attn.sr.' in new_k: - new_k = new_k.replace('sr.', 'sr.conv.') + new_k = new_k.replace('sr.', 'sr.') elif 'mlp.' in new_k: string = f'{new_k}-' new_k = new_k.replace('mlp.', 'ffn.layers.') if 'fc1.weight' in new_k or 'fc2.weight' in new_k: new_v = v.reshape((*v.shape, 1, 1)) - new_k = new_k.replace('fc1.', '0.0.conv.') - new_k = new_k.replace('dwconv.dwconv.', '0.1.conv.conv.') - new_k = new_k.replace('fc2.', '1.conv.') + new_k = new_k.replace('fc1.', '0.') + new_k = new_k.replace('dwconv.dwconv.', '1.') + new_k = new_k.replace('fc2.', '4.') string += f'{new_k} {v.shape}-{new_v.shape}' # print(string) elif k.startswith('norm'): From af8f309b45a57c82f20ba8172ddb5dc6086579c5 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Thu, 15 Jul 2021 13:20:17 +0800 Subject: [PATCH 29/96] Modify configs of segformer. 
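
With this change the per-stage channel widths are no longer listed
explicitly in configs; the backbone derives them internally as
embed_dims * num_heads[i]. A small sketch of the arithmetic (values
taken from the b0 config below; B1-B5 use embed_dims=64):

    embed_dims = 32
    num_heads = [1, 2, 5, 8]
    channels = [embed_dims * h for h in num_heads]
    assert channels == [32, 64, 160, 256]  # matches decode_head in_channels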
--- .../{segformer.py => segformer_mit-b0.py} | 34 +++++----- .../segformer_b0_512x512_160k_ade20k.py | 62 ------------------ .../segformer_b1_512x512_160k_ade20k.py | 62 ------------------ .../segformer_b2_512x512_160k_ade20k.py | 62 ------------------ .../segformer_b3_512x512_160k_ade20k.py | 62 ------------------ .../segformer_b4_512x512_160k_ade20k.py | 62 ------------------ .../segformer_b5_640x640_160k_ade20k.py | 64 ------------------- .../segformer_mit-b0_512x512_160k_ade20k.py | 31 +++++++++ .../segformer_mit-b1_512x512_160k_ade20k.py | 38 +++++++++++ .../segformer_mit-b2_512x512_160k_ade20k.py | 38 +++++++++++ .../segformer_mit-b3_512x512_160k_ade20k.py | 38 +++++++++++ .../segformer_mit-b4_512x512_160k_ade20k.py | 40 ++++++++++++ .../segformer_mit-b5_640x640_160k_ade20k.py | 39 +++++++++++ mmseg/models/decode_heads/segformer_head.py | 7 +- 14 files changed, 243 insertions(+), 396 deletions(-) rename configs/_base_/models/{segformer.py => segformer_mit-b0.py} (54%) delete mode 100644 configs/segformer/segformer_b0_512x512_160k_ade20k.py delete mode 100644 configs/segformer/segformer_b1_512x512_160k_ade20k.py delete mode 100644 configs/segformer/segformer_b2_512x512_160k_ade20k.py delete mode 100644 configs/segformer/segformer_b3_512x512_160k_ade20k.py delete mode 100644 configs/segformer/segformer_b4_512x512_160k_ade20k.py delete mode 100644 configs/segformer/segformer_b5_640x640_160k_ade20k.py create mode 100644 configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py create mode 100644 configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py create mode 100644 configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py create mode 100644 configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py create mode 100644 configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py create mode 100644 configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py diff --git a/configs/_base_/models/segformer.py b/configs/_base_/models/segformer_mit-b0.py similarity index 54% rename from configs/_base_/models/segformer.py rename to configs/_base_/models/segformer_mit-b0.py index 83c7639fa9..62e32e7610 100644 --- a/configs/_base_/models/segformer.py +++ b/configs/_base_/models/segformer_mit-b0.py @@ -5,33 +5,31 @@ pretrained=None, backbone=dict( type='MixVisionTransformer', - img_size=224, - in_chans=3, - embed_dims=[64, 128, 256, 512], - num_heads=[1, 2, 4, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=False, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - depths=[3, 4, 6, 3], - sr_ratios=[8, 4, 2, 1]), + in_channels=3, + embed_dims=32, + num_stages=4, + num_layers=[2, 2, 2, 2], + num_heads=[1, 2, 5, 8], + patch_sizes=[7, 4, 4, 4], + sr_ratios=[8, 4, 2, 1], + out_indices=(0, 1, 2, 3), + mlp_ratio=4, + qkv_bias=True, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.1), decode_head=dict( - type='SegFormerHead', - in_channels=[64, 128, 320, 512], + type='SegformerHead', + in_channels=[32, 64, 160, 256], in_index=[0, 1, 2, 3], feature_strides=[4, 8, 16, 32], - channels=128, + channels=256, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, - decoder_params=dict(), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) - -find_unused_parameters = True diff --git a/configs/segformer/segformer_b0_512x512_160k_ade20k.py b/configs/segformer/segformer_b0_512x512_160k_ade20k.py deleted file mode 100644 index d3b2ff414d..0000000000 --- 
a/configs/segformer/segformer_b0_512x512_160k_ade20k.py +++ /dev/null @@ -1,62 +0,0 @@ -_base_ = [ - '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] - -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='pretrain/mit_b0.pth', - backbone=dict( - patch_size=4, - embed_dims=[32, 64, 160, 256], - num_heads=[1, 2, 5, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=True, - depths=[2, 2, 2, 2], - sr_ratios=[8, 4, 2, 1], - drop_rate=0.0, - drop_path_rate=0.1), - decode_head=dict( - type='SegFormerHead', - in_channels=[32, 64, 160, 256], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - decoder_params=dict(embed_dim=256), - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) - })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_b1_512x512_160k_ade20k.py b/configs/segformer/segformer_b1_512x512_160k_ade20k.py deleted file mode 100644 index 26f1ff05e0..0000000000 --- a/configs/segformer/segformer_b1_512x512_160k_ade20k.py +++ /dev/null @@ -1,62 +0,0 @@ -_base_ = [ - '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] - -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='pretrain/mit_b1.pth', - backbone=dict( - patch_size=4, - embed_dims=[64, 128, 320, 512], - num_heads=[1, 2, 5, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=True, - depths=[2, 2, 2, 2], - sr_ratios=[8, 4, 2, 1], - drop_rate=0.0, - drop_path_rate=0.1), - decode_head=dict( - type='SegFormerHead', - in_channels=[64, 128, 320, 512], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - decoder_params=dict(embed_dim=256), - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) 
- })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_b2_512x512_160k_ade20k.py b/configs/segformer/segformer_b2_512x512_160k_ade20k.py deleted file mode 100644 index 2812d0fc7c..0000000000 --- a/configs/segformer/segformer_b2_512x512_160k_ade20k.py +++ /dev/null @@ -1,62 +0,0 @@ -_base_ = [ - '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] - -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='pretrain/mit_b2.pth', - backbone=dict( - patch_size=4, - embed_dims=[64, 128, 320, 512], - num_heads=[1, 2, 5, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=True, - depths=[3, 4, 6, 3], - sr_ratios=[8, 4, 2, 1], - drop_rate=0.0, - drop_path_rate=0.1), - decode_head=dict( - type='SegFormerHead', - in_channels=[64, 128, 320, 512], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - decoder_params=dict(embed_dim=768), - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) - })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_b3_512x512_160k_ade20k.py b/configs/segformer/segformer_b3_512x512_160k_ade20k.py deleted file mode 100644 index 823daa0d5d..0000000000 --- a/configs/segformer/segformer_b3_512x512_160k_ade20k.py +++ /dev/null @@ -1,62 +0,0 @@ -_base_ = [ - '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] - -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='pretrain/mit_b3.pth', - backbone=dict( - patch_size=4, - embed_dims=[64, 128, 320, 512], - num_heads=[1, 2, 5, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=True, - depths=[3, 4, 18, 3], - sr_ratios=[8, 4, 2, 1], - drop_rate=0.0, - drop_path_rate=0.1), - decode_head=dict( - type='SegFormerHead', - in_channels=[64, 128, 320, 512], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - decoder_params=dict(embed_dim=768), - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) 
- })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_b4_512x512_160k_ade20k.py b/configs/segformer/segformer_b4_512x512_160k_ade20k.py deleted file mode 100644 index 59c2bcbede..0000000000 --- a/configs/segformer/segformer_b4_512x512_160k_ade20k.py +++ /dev/null @@ -1,62 +0,0 @@ -_base_ = [ - '../_base_/models/segformer.py', '../_base_/datasets/ade20k_aligned.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] - -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='pretrain/mit_b4.pth', - backbone=dict( - patch_size=4, - embed_dims=[64, 128, 320, 512], - num_heads=[1, 2, 5, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=True, - depths=[3, 8, 27, 3], - sr_ratios=[8, 4, 2, 1], - drop_rate=0.0, - drop_path_rate=0.1), - decode_head=dict( - type='SegFormerHead', - in_channels=[64, 128, 320, 512], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - decoder_params=dict(embed_dim=768), - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) - })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_b5_640x640_160k_ade20k.py b/configs/segformer/segformer_b5_640x640_160k_ade20k.py deleted file mode 100644 index 92b40c91b1..0000000000 --- a/configs/segformer/segformer_b5_640x640_160k_ade20k.py +++ /dev/null @@ -1,64 +0,0 @@ -_base_ = [ - '../_base_/models/segformer.py', - '../_base_/datasets/ade20k_aligned_640x640.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] - -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='pretrain/mit_b5.pth', - backbone=dict( - patch_size=4, - embed_dims=[64, 128, 320, 512], - num_heads=[1, 2, 5, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=True, - depths=[3, 6, 40, 3], - sr_ratios=[8, 4, 2, 1], - drop_rate=0.0, - drop_path_rate=0.1), - decode_head=dict( - type='SegFormerHead', - in_channels=[64, 128, 320, 512], - in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], - channels=128, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - decoder_params=dict(embed_dim=768), - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) 
- })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -# By default, models are trained on 8 GPUs with 2 images per GPU -data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py new file mode 100644 index 0000000000..b9ca1eb7c7 --- /dev/null +++ b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py @@ -0,0 +1,31 @@ +_base_ = [ + '../_base_/models/segformer_mit-b0.py', + '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py new file mode 100644 index 0000000000..0f655ba603 --- /dev/null +++ b/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/segformer_mit-b0.py', + '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] + +# model settings +model = dict( + pretrained='pretrain/mit_b1.pth', + backbone=dict( + embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[2, 2, 2, 2]), + decode_head=dict(in_channels=[64, 128, 320, 512])) + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py new file mode 100644 index 0000000000..caef2197fd --- /dev/null +++ b/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/segformer_mit-b0.py', + '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] + +# model settings +model = dict( + pretrained='pretrain/mit_b2.pth', + backbone=dict( + embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 4, 6, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) 
+ })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py new file mode 100644 index 0000000000..4f8269c76b --- /dev/null +++ b/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/segformer_mit-b0.py', + '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] + +# model settings +model = dict( + pretrained='pretrain/mit_b3.pth', + backbone=dict( + embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 4, 18, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py new file mode 100644 index 0000000000..8b765b1dad --- /dev/null +++ b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py @@ -0,0 +1,40 @@ +_base_ = [ + '../_base_/models/segformer_mit-b0.py', + '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] + +# model settings +model = dict( + pretrained='pretrain/mit_b4.pth', + backbone=dict( + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + num_layers=[3, 8, 27, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=2) diff --git a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py new file mode 100644 index 0000000000..ea437142de --- /dev/null +++ b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/segformer_mit-b0.py', + '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] + +# model settings +model = dict( + pretrained='pretrain/mit_b5.pth', + backbone=dict( + embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 6, 40, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) 
+    }))
+
+lr_config = dict(
+    _delete_=True,
+    policy='poly',
+    warmup='linear',
+    warmup_iters=1500,
+    warmup_ratio=1e-6,
+    power=1.0,
+    min_lr=0.0,
+    by_epoch=False)
+
+# By default, models are trained on 8 GPUs with 2 images per GPU
+data = dict(samples_per_gpu=2)
diff --git a/mmseg/models/decode_heads/segformer_head.py b/mmseg/models/decode_heads/segformer_head.py
index 7e526a1cfe..5c1f9a8733 100644
--- a/mmseg/models/decode_heads/segformer_head.py
+++ b/mmseg/models/decode_heads/segformer_head.py
@@ -8,8 +8,8 @@
 
 
 @HEADS.register_module()
-class SegFormerHead(BaseDecodeHead):
-    """The MLP Head of segformer.
+class SegformerHead(BaseDecodeHead):
+    """The all-MLP head of SegFormer.
 
     Args:
         interpolate_mode: The interpolate mode of MLP head upsample operation.
@@ -17,8 +17,7 @@ class SegFormerHead(BaseDecodeHead):
 
     def __init__(self, interpolate_mode='bilinear', **kwargs):
-        super(SegFormerHead, self).__init__(
-            input_transform='multiple_select', **kwargs)
+        super().__init__(input_transform='multiple_select', **kwargs)
 
         self.interpolate_mode = interpolate_mode
 
From 71df61784743ffbd56e10e9a2f56dff0318ca907 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 15 Jul 2021 13:25:11 +0800
Subject: [PATCH 30/96] Resolve the shape conversion functions doc string.

---
 mmseg/models/utils/shape_convert.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/mmseg/models/utils/shape_convert.py b/mmseg/models/utils/shape_convert.py
index 9d2051f31f..744416092c 100644
--- a/mmseg/models/utils/shape_convert.py
+++ b/mmseg/models/utils/shape_convert.py
@@ -2,8 +2,11 @@ def nlc_to_nchw(x, hw_shape):
     """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
 
     Args:
-        x (Tensor): The input tensor for conversion.
+        x (Tensor): The input tensor of shape [N, L, C] before conversion.
         hw_shape (Sequence[int]): The height and width of output feature map.
+
+    Returns:
+        Tensor: The output tensor of shape [N, C, H, W] after conversion.
     """
     H, W = hw_shape
     assert len(x.shape) == 3
@@ -16,7 +19,10 @@ def nchw_to_nlc(x):
     """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
 
     Args:
-        x (Tensor): The input tensor for conversion.
+        x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
+
+    Returns:
+        Tensor: The output tensor of shape [N, L, C] after conversion.
     """
     assert len(x.shape) == 4
     return x.flatten(2).transpose(1, 2).contiguous()
From 38b2db81b6fbf85a0da954edb60fe6f0c4ce92df Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Fri, 16 Jul 2021 12:35:53 +0800
Subject: [PATCH 31/96] Add pad_to_patch_size arg.
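
When pad_to_patch_size is True, PatchEmbed pads the input so that H and
W become multiples of the patch size before the projection. A condensed
sketch of the padding logic in embed.py (not the literal code; shapes
chosen arbitrarily):

    import torch
    import torch.nn.functional as F

    patch_size = (16, 16)
    x = torch.randn(1, 3, 220, 230)    # H, W not multiples of 16
    H, W = x.shape[2], x.shape[3]
    if H % patch_size[0] != 0:
        # pad the bottom of the height dimension
        x = F.pad(x, (0, 0, 0, patch_size[0] - H % patch_size[0]))
    if W % patch_size[1] != 0:
        # pad the right of the width dimension
        x = F.pad(x, (0, patch_size[1] - W % patch_size[1]))
    assert x.shape[2] % 16 == 0 and x.shape[3] % 16 == 0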
--- mmseg/models/backbones/mit.py | 1 + mmseg/models/backbones/swin.py | 1 + mmseg/models/backbones/vit.py | 1 + mmseg/models/utils/embed.py | 7 +++++-- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 6ba2fbdd9c..cad0b43134 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -343,6 +343,7 @@ def __init__(self, kernel_size=patch_sizes[i], stride=strides[i], padding=patch_sizes[i] // 2, + pad_to_patch_size=False, norm_cfg=norm_cfg) layer = ModuleList([ TransformerEncoderLayer( diff --git a/mmseg/models/backbones/swin.py b/mmseg/models/backbones/swin.py index a798ad1ebf..1ea6389fa4 100644 --- a/mmseg/models/backbones/swin.py +++ b/mmseg/models/backbones/swin.py @@ -628,6 +628,7 @@ def __init__(self, conv_type='Conv2d', kernel_size=patch_size, stride=strides[0], + pad_to_patch_size=True, norm_cfg=norm_cfg if patch_norm else None, init_cfg=None) diff --git a/mmseg/models/backbones/vit.py b/mmseg/models/backbones/vit.py index 1ad20a1ca6..c7b8c1a1c5 100644 --- a/mmseg/models/backbones/vit.py +++ b/mmseg/models/backbones/vit.py @@ -210,6 +210,7 @@ def __init__(self, conv_type='Conv2d', kernel_size=patch_size, stride=patch_size, + pad_to_patch_size=True, norm_cfg=norm_cfg if patch_norm else None, init_cfg=None, ) diff --git a/mmseg/models/utils/embed.py b/mmseg/models/utils/embed.py index 33a8c7db83..e75f8e1977 100644 --- a/mmseg/models/utils/embed.py +++ b/mmseg/models/utils/embed.py @@ -19,6 +19,8 @@ class PatchEmbed(BaseModule): Default: None (Default to be equal with kernel_size). padding (int): The padding length of embedding conv. Default: 0. dilation (int): The dilation rate of embedding conv. Default: 1. + pad_to_patch_size (bool, optional): Whether to pad feature map shape + to multiple patch size. Default: False. norm_cfg (dict, optional): Config dict for normalization layer. init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. Default: None. @@ -32,6 +34,7 @@ def __init__(self, stride=16, padding=0, dilation=1, + pad_to_patch_size=False, norm_cfg=None, init_cfg=None): super(PatchEmbed, self).__init__() @@ -42,7 +45,7 @@ def __init__(self, if stride is None: stride = kernel_size - self.overlapping = stride < kernel_size + self.pad_to_patch_size = pad_to_patch_size # The default setting of patch size is equal to kernel size. patch_size = kernel_size @@ -77,7 +80,7 @@ def forward(self, x): H, W = x.shape[2], x.shape[3] # TODO: Process overlapping op - if not self.overlapping: + if self.pad_to_patch_size: # Modify H, W to multiple of patch size. if H % self.patch_size[0] != 0: x = F.pad( From f3aaeccc3ea7f725da325ea000dca5493a7a376e Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 16 Jul 2021 21:22:48 +0800 Subject: [PATCH 32/96] Support progressive test with fewer memory cost. 
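
The memory saving comes from never holding per-image predictions: only
four (num_classes,) histograms are accumulated, and the metrics are
derived from them once at the end. A condensed sketch of the
accumulation (simplified from intersect_and_union; ignore_index
hard-coded to 255 here):

    import torch

    num_classes = 150
    total_intersect = torch.zeros(num_classes, dtype=torch.float64)
    total_union = torch.zeros(num_classes, dtype=torch.float64)

    def accumulate(pred, label):
        mask = label != 255                      # drop ignored pixels
        pred, label = pred[mask], label[mask]
        intersect = pred[pred == label]
        hist = lambda t: torch.histc(
            t.float(), bins=num_classes, min=0, max=num_classes - 1)
        area_i, area_p, area_l = hist(intersect), hist(pred), hist(label)
        total_intersect.add_(area_i)
        total_union.add_(area_p + area_l - area_i)

    # After accumulating every image:
    # iou = total_intersect / total_union, averaged over valid classes.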
--- mmseg/apis/__init__.py | 6 +- mmseg/apis/test.py | 152 +++++++++++++++++++++++++++++- mmseg/core/evaluation/__init__.py | 5 +- mmseg/core/evaluation/metrics.py | 66 +++++++++++++ mmseg/datasets/custom.py | 84 +++++++++++++++++ tools/test.py | 34 +++++-- 6 files changed, 335 insertions(+), 12 deletions(-) diff --git a/mmseg/apis/__init__.py b/mmseg/apis/__init__.py index 170724be38..eada290f58 100644 --- a/mmseg/apis/__init__.py +++ b/mmseg/apis/__init__.py @@ -1,9 +1,11 @@ from .inference import inference_segmentor, init_segmentor, show_result_pyplot -from .test import multi_gpu_test, single_gpu_test +from .test import (multi_gpu_test, progressive_multi_gpu_test, + progressive_single_gpu_test, single_gpu_test) from .train import get_root_logger, set_random_seed, train_segmentor __all__ = [ 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor', 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test', - 'show_result_pyplot' + 'show_result_pyplot', 'progressive_single_gpu_test', + 'progressive_multi_gpu_test' ] diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py index 0034159689..cef47f46c3 100644 --- a/mmseg/apis/test.py +++ b/mmseg/apis/test.py @@ -8,6 +8,8 @@ from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info +from mmseg.core.evaluation.metrics import intersect_and_union + def np2tmp(array, temp_file_name=None, tmpdir=None): """Save ndarray to local numpy file. @@ -44,8 +46,8 @@ def single_gpu_test(model, show (bool): Whether show results during inference. Default: False. out_dir (str, optional): If specified, the results will be dumped into the directory to save output results. - efficient_test (bool): Whether save the results as local numpy files to - save CPU memory during evaluation. Default: False. + efficient_test (bool, optional): Whether save the results as local + numpy files to save CPU memory during evaluation. Default: False. opacity(float): Opacity of painted segmentation map. Default 0.5. Must be in (0, 1] range. 
@@ -163,3 +165,149 @@ def multi_gpu_test(model, else: results = collect_results_cpu(results, len(dataset), tmpdir) return results + + +def progressive_single_gpu_test(model, + data_loader, + show=False, + out_dir=None, + opacity=0.5): + model.eval() + dataset = data_loader.dataset + num_classes = len(dataset.CLASSES) + prog_bar = mmcv.ProgressBar(len(dataset)) + + total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_union = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_label = torch.zeros((num_classes, ), dtype=torch.float64) + + cur = 0 + for _, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, **data) + + if show or out_dir: + img_tensor = data['img'][0] + img_metas = data['img_metas'][0].data[0] + imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) + assert len(imgs) == len(img_metas) + + for img, img_meta in zip(imgs, img_metas): + h, w, _ = img_meta['img_shape'] + img_show = img[:h, :w, :] + + ori_h, ori_w = img_meta['ori_shape'][:-1] + img_show = mmcv.imresize(img_show, (ori_w, ori_h)) + + if out_dir: + out_file = osp.join(out_dir, img_meta['ori_filename']) + else: + out_file = None + + model.module.show_result( + img_show, + result, + palette=dataset.PALETTE, + show=show, + out_file=out_file, + opacity=opacity) + + for i in range(len(result)): + gt_semantic_map = dataset.get_gt_seg_map(cur + i) + + area_intersect, area_union, area_pred_label, area_label = \ + intersect_and_union( + result[i], gt_semantic_map, num_classes, + dataset.ignore_index, dataset.label_map, + dataset.reduce_zero_label) + + total_area_intersect += area_intersect + total_area_union += area_union + total_area_pred_label += area_pred_label + total_area_label += area_label + + print(total_area_intersect / total_area_union) + + prog_bar.update() + + cur += len(result) + + return total_area_intersect, total_area_union, total_area_pred_label, \ + total_area_label + + +# TODO: Support distributed test api +def progressive_multi_gpu_test(model, + data_loader, + tmpdir=None, + gpu_collect=False): + + model.eval() + dataset = data_loader.dataset + num_classes = len(dataset.CLASSES) + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + + total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_union = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_label = torch.zeros((num_classes, ), dtype=torch.float64) + + cur = 0 + for _, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + + for i in range(len(result)): + gt_semantic_map = dataset.get_gt_seg_map(cur + i * world_size) + + area_intersect, area_union, area_pred_label, area_label = \ + intersect_and_union( + result[i], gt_semantic_map, num_classes, + dataset.ignore_index, dataset.label_map, + dataset.reduce_zero_label) + + total_area_intersect += area_intersect + total_area_union += area_union + total_area_pred_label += area_pred_label + total_area_label += area_label + + if rank == 0: + for _ in range(len(result) * world_size): + prog_bar.update() + + cur += len(result) * world_size + + pixel_count_matrix = [ + total_area_intersect, total_area_union, total_area_pred_label, + total_area_label + ] + # collect results from all ranks + if gpu_collect: + results = 
collect_count_results_gpu(pixel_count_matrix,
+                                              4 * world_size)
+    else:
+        results = collect_count_results_cpu(pixel_count_matrix, 4 * world_size,
+                                            tmpdir)
+    return results
+
+
+def collect_count_results_gpu(result_part, size):
+    """Collect pixel count matrices under gpu mode.
+
+    On gpu mode, this function will encode results to gpu tensors and use gpu
+    communication for results collection.
+
+    Args:
+        result_part (list[Tensor]): The four pixel count tensors --
+            area_intersect, area_union, area_pred_label and area_label,
+            each of shape (num_classes, ).
+        size (int): Size of the results, commonly equal to length of
+            the results.
+    """
+    pass
+
+
+def collect_count_results_cpu(result_part, size, tmpdir=None):
+    pass
diff --git a/mmseg/core/evaluation/__init__.py b/mmseg/core/evaluation/__init__.py
index f7cc4b2341..2db4052490 100644
--- a/mmseg/core/evaluation/__init__.py
+++ b/mmseg/core/evaluation/__init__.py
@@ -1,8 +1,9 @@
 from .class_names import get_classes, get_palette
 from .eval_hooks import DistEvalHook, EvalHook
-from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou
+from .metrics import (calculate_metrics, eval_metrics, mean_dice, mean_fscore,
+                      mean_iou)
 
 __all__ = [
     'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore',
-    'eval_metrics', 'get_classes', 'get_palette'
+    'eval_metrics', 'get_classes', 'get_palette', 'calculate_metrics'
 ]
diff --git a/mmseg/core/evaluation/metrics.py b/mmseg/core/evaluation/metrics.py
index a216afefe6..7aeadc041f 100644
--- a/mmseg/core/evaluation/metrics.py
+++ b/mmseg/core/evaluation/metrics.py
@@ -324,3 +324,69 @@ def eval_metrics(results,
             for metric, metric_value in ret_metrics.items()
         })
     return ret_metrics
+
+
+def calculate_metrics(total_area_intersect,
+                      total_area_union,
+                      total_area_pred_label,
+                      total_area_label,
+                      metrics=['mIoU'],
+                      nan_to_num=None,
+                      beta=1):
+    """Calculate evaluation metrics from accumulated pixel count matrices.
+
+    Args:
+        total_area_intersect (Tensor): The intersection of prediction and
+            ground truth histograms on all classes, shape (num_classes, ).
+        total_area_union (Tensor): The union of prediction and ground truth
+            histograms on all classes, shape (num_classes, ).
+        total_area_pred_label (Tensor): The prediction histogram on all
+            classes, shape (num_classes, ).
+        total_area_label (Tensor): The ground truth histogram on all classes,
+            shape (num_classes, ).
+        metrics (list[str] | str): Metrics to be evaluated, 'mIoU', 'mDice'
+            and 'mFscore'. Default: ['mIoU'].
+        nan_to_num (int, optional): If specified, NaN values will be replaced
+            by the numbers defined by the user. Default: None.
+        beta (int): Determines the weight of recall in the combined Fscore.
+            Default: 1.
+    Returns:
+        dict[str, ndarray]: Overall accuracy under the key 'aAcc', and per
+            category metrics as ndarrays of shape (num_classes, ).
+ """ + if isinstance(metrics, str): + metrics = [metrics] + allowed_metrics = ['mIoU', 'mDice', 'mFscore'] + if not set(metrics).issubset(set(allowed_metrics)): + raise KeyError('metrics {} is not supported'.format(metrics)) + + all_acc = total_area_intersect.sum() / total_area_label.sum() + ret_metrics = OrderedDict({'aAcc': all_acc}) + for metric in metrics: + if metric == 'mIoU': + iou = total_area_intersect / total_area_union + acc = total_area_intersect / total_area_label + ret_metrics['IoU'] = iou + ret_metrics['Acc'] = acc + elif metric == 'mDice': + dice = 2 * total_area_intersect / ( + total_area_pred_label + total_area_label) + acc = total_area_intersect / total_area_label + ret_metrics['Dice'] = dice + ret_metrics['Acc'] = acc + elif metric == 'mFscore': + precision = total_area_intersect / total_area_pred_label + recall = total_area_intersect / total_area_label + f_value = torch.tensor( + [f_score(x[0], x[1], beta) for x in zip(precision, recall)]) + ret_metrics['Fscore'] = f_value + ret_metrics['Precision'] = precision + ret_metrics['Recall'] = recall + + ret_metrics = { + metric: value.numpy() + for metric, value in ret_metrics.items() + } + if nan_to_num is not None: + ret_metrics = OrderedDict({ + metric: np.nan_to_num(metric_value, nan=nan_to_num) + for metric, metric_value in ret_metrics.items() + }) + return ret_metrics diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py index 9c88235e39..753dcb103e 100644 --- a/mmseg/datasets/custom.py +++ b/mmseg/datasets/custom.py @@ -10,6 +10,7 @@ from torch.utils.data import Dataset from mmseg.core import eval_metrics +from mmseg.core.evaluation.metrics import calculate_metrics from mmseg.utils import get_root_logger from .builder import DATASETS from .pipelines import Compose @@ -240,6 +241,13 @@ def get_gt_seg_maps(self, efficient_test=False): gt_seg_maps.append(gt_seg_map) return gt_seg_maps + def get_gt_seg_map(self, idx): + """Get ground truth segmentation maps for evaluation.""" + seg_map = osp.join(self.ann_dir, self.img_infos[idx]['ann']['seg_map']) + gt_seg_map = mmcv.imread(seg_map, flag='unchanged', backend='pillow') + + return gt_seg_map + def get_classes_and_palette(self, classes=None, palette=None): """Get class names of current dataset. @@ -303,6 +311,82 @@ def get_palette_for_custom_classes(self, class_names, palette=None): return palette + def progressive_evaluate(self, + results, + metric='mIoU', + logger=None, + **kwargs): + if isinstance(metric, str): + metric = [metric] + allowed_metrics = ['mIoU', 'mDice', 'mFscore'] + if not set(metric).issubset(set(allowed_metrics)): + raise KeyError('metric {} is not supported'.format(metric)) + + eval_results = {} + + total_area_intersect, total_area_union, total_area_pred_label, \ + total_area_label = results + + ret_metrics = calculate_metrics(total_area_intersect, total_area_union, + total_area_pred_label, + total_area_label, metric) + + # Because dataset.CLASSES is required in progressive_single_gpu_test, + # progressive_multi_gpu_test, so it's necessary to keep + # dataset.CLASSES. 
+        class_names = self.CLASSES
+
+        # summary table
+        ret_metrics_summary = OrderedDict({
+            ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2)
+            for ret_metric, ret_metric_value in ret_metrics.items()
+        })
+
+        # each class table
+        ret_metrics.pop('aAcc', None)
+        ret_metrics_class = OrderedDict({
+            ret_metric: np.round(ret_metric_value * 100, 2)
+            for ret_metric, ret_metric_value in ret_metrics.items()
+        })
+        ret_metrics_class.update({'Class': class_names})
+        ret_metrics_class.move_to_end('Class', last=False)
+
+        # for logger
+        class_table_data = PrettyTable()
+        for key, val in ret_metrics_class.items():
+            class_table_data.add_column(key, val)
+
+        summary_table_data = PrettyTable()
+        for key, val in ret_metrics_summary.items():
+            if key == 'aAcc':
+                summary_table_data.add_column(key, [val])
+            else:
+                summary_table_data.add_column('m' + key, [val])
+
+        print_log('per class results:', logger)
+        print_log('\n' + class_table_data.get_string(), logger=logger)
+        print_log('Summary:', logger)
+        print_log('\n' + summary_table_data.get_string(), logger=logger)
+
+        # each metric dict
+        for key, value in ret_metrics_summary.items():
+            if key == 'aAcc':
+                eval_results[key] = value / 100.0
+            else:
+                eval_results['m' + key] = value / 100.0
+
+        ret_metrics_class.pop('Class', None)
+        for key, value in ret_metrics_class.items():
+            eval_results.update({
+                key + '.' + str(name): value[idx] / 100.0
+                for idx, name in enumerate(class_names)
+            })
+
+        if mmcv.is_list_of(results, str):
+            for file_name in results:
+                os.remove(file_name)
+        return eval_results
+
     def evaluate(self,
                  results,
                  metric='mIoU',
diff --git a/tools/test.py b/tools/test.py
index ab2bd60175..6539e1c3b0 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -8,7 +8,8 @@
                          wrap_fp16_model)
 from mmcv.utils import DictAction
 
-from mmseg.apis import multi_gpu_test, single_gpu_test
+from mmseg.apis import (multi_gpu_test, progressive_multi_gpu_test,
+                        progressive_single_gpu_test, single_gpu_test)
 from mmseg.datasets import build_dataloader, build_dataset
 from mmseg.models import build_segmentor
 
@@ -90,6 +91,8 @@ def main():
     if cfg.get('cudnn_benchmark', False):
         torch.backends.cudnn.benchmark = True
     if args.aug_test:
+        assert not (args.show or args.show_dir), \
+            'Showing results is not supported when aug test is enabled.'
         # hard code index
         cfg.data.test.pipeline[1].img_ratios = [
             0.5, 0.75, 1.0, 1.25, 1.5, 1.75
@@ -134,20 +137,36 @@ def main():
         model.PALETTE = dataset.PALETTE
 
     efficient_test = False
+    only_pixel_count = False
     if args.eval_options is not None:
         efficient_test = args.eval_options.get('efficient_test', False)
+        only_pixel_count = args.eval_options.get('only_pixel_count', False)
+        assert not (args.format_only and only_pixel_count), (
+            'format_only and only_pixel_count can\'t be set at the same'
+            ' time.')
+        assert not (args.out and only_pixel_count), (
+            'out and only_pixel_count can\'t be set at the same time.')
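+        # Note: when only_pixel_count is enabled the test loop returns the
+        # accumulated pixel count tensors instead of a per-image result
+        # list, so there is nothing for --out to dump or --format-only to
+        # format; hence the two asserts above.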
if not distributed: model = MMDataParallel(model, device_ids=[0]) - outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, - efficient_test, args.opacity) + if only_pixel_count: + outputs = progressive_single_gpu_test(model, data_loader, + args.show, args.show_dir, + args.opacity) + else: + outputs = single_gpu_test(model, data_loader, args.show, + args.show_dir, efficient_test, + args.opacity) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) - outputs = multi_gpu_test(model, data_loader, args.tmpdir, - args.gpu_collect, efficient_test) + if only_pixel_count: + outputs = progressive_multi_gpu_test(model, data_loader, + args.gpu_collect) + else: + outputs = multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect, efficient_test) rank, _ = get_dist_info() if rank == 0: @@ -158,7 +177,10 @@ def main(): if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: - dataset.evaluate(outputs, args.eval, **kwargs) + if only_pixel_count: + dataset.progressive_evaluate(outputs, args.eval, **kwargs) + else: + dataset.evaluate(outputs, args.eval, **kwargs) if __name__ == '__main__': From e7039ffceee9256a6234c576d10fa577fbcbfef3 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Mon, 19 Jul 2021 16:36:27 +0800 Subject: [PATCH 33/96] Modify default value of pad_to_patch_size arg. --- mmseg/models/utils/embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmseg/models/utils/embed.py b/mmseg/models/utils/embed.py index e75f8e1977..73d8ed1f11 100644 --- a/mmseg/models/utils/embed.py +++ b/mmseg/models/utils/embed.py @@ -20,7 +20,7 @@ class PatchEmbed(BaseModule): padding (int): The padding length of embedding conv. Default: 0. dilation (int): The dilation rate of embedding conv. Default: 1. pad_to_patch_size (bool, optional): Whether to pad feature map shape - to multiple patch size. Default: False. + to multiple patch size. Default: True. norm_cfg (dict, optional): Config dict for normalization layer. init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. Default: None. 
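For orientation, the effect of ``pad_to_patch_size`` can be sketched in a few
lines (an illustrative toy snippet under assumed shapes, not the actual
PatchEmbed implementation):

    import torch
    import torch.nn.functional as F

    patch = 16
    x = torch.zeros(1, 3, 33, 65)                 # (N, C, H, W) toy input
    pad_h = (patch - x.shape[2] % patch) % patch  # 33 -> padded up to 48
    pad_w = (patch - x.shape[3] % patch) % patch  # 65 -> padded up to 80
    x = F.pad(x, (0, pad_w, 0, pad_h))            # pad right and bottom edges
    assert x.shape[2] % patch == 0 and x.shape[3] % patch == 0

With the default flipped to True, backbones built on PatchEmbed no longer have
to guarantee patch-size-divisible inputs themselves.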
@@ -34,7 +34,7 @@ def __init__(self,
                  stride=16,
                  padding=0,
                  dilation=1,
-                 pad_to_patch_size=False,
+                 pad_to_patch_size=True,
                  norm_cfg=None,
                  init_cfg=None):
         super(PatchEmbed, self).__init__()

From 7c944cfb121cf6ec1989e64c02f1a964dfc4921e Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 22 Jul 2021 15:45:18 +0800
Subject: [PATCH 34/96] Temp code

---
 mmseg/apis/test.py                | 230 +++++++++++++++++++--------
 mmseg/core/evaluation/__init__.py |   7 +-
 mmseg/core/evaluation/metrics.py  |  94 ++++++++++-
 mmseg/datasets/cityscapes.py      |   5 +-
 mmseg/datasets/custom.py          | 254 ++++++++++++++++--------------
 tools/test.py                     |  46 ++----
 6 files changed, 412 insertions(+), 224 deletions(-)

diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index cef47f46c3..ffda7c8684 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -1,14 +1,17 @@
 import os.path as osp
+import pickle
+import shutil
 import tempfile
 
 import mmcv
 import numpy as np
 import torch
+import torch.distributed as dist
 from mmcv.engine import collect_results_cpu, collect_results_gpu
 from mmcv.image import tensor2imgs
 from mmcv.runner import get_dist_info
 
-from mmseg.core.evaluation.metrics import intersect_and_union
+from mmseg.core.evaluation.metrics import ResultProcessor
 
 
 def np2tmp(array, temp_file_name=None, tmpdir=None):
@@ -169,24 +172,55 @@ def multi_gpu_test(model,
 
 def progressive_single_gpu_test(model,
                                 data_loader,
+                                middle_save=False,
                                 show=False,
                                 out_dir=None,
                                 opacity=0.5):
+    """Test with single GPU by progressive mode.
+
+    Args:
+        model (nn.Module): Model to be tested.
+        data_loader (utils.data.Dataloader): Pytorch data loader.
+        middle_save (bool, optional): Whether to save middle variables when
+            progressive test. Default: False.
+        show (bool): Whether show results during inference. Default: False.
+        out_dir (str, optional): If specified, the results will be dumped into
+            the directory to save output results.
+        opacity (float): Opacity of painted segmentation map.
+            Default: 0.5. Must be in (0, 1] range.
+    Returns:
+        object: The result collector containing the accumulated results.
+    """
     model.eval()
     dataset = data_loader.dataset
-    num_classes = len(dataset.CLASSES)
     prog_bar = mmcv.ProgressBar(len(dataset))
 
-    total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64)
-    total_area_union = torch.zeros((num_classes, ), dtype=torch.float64)
-    total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64)
-    total_area_label = torch.zeros((num_classes, ), dtype=torch.float64)
+    if middle_save:
+        collector = ResultProcessor(
+            num_classes=len(dataset.CLASSES),
+            ignore_index=dataset.ignore_index,
+            collect_type='seg_map',
+            label_map=dataset.label_map,
+            reduce_zero_label=dataset.reduce_zero_label)
+    else:
+        collector = ResultProcessor(
+            num_classes=len(dataset.CLASSES),
+            ignore_index=dataset.ignore_index,
+            collect_type='pixels_count',
+            label_map=dataset.label_map,
+            reduce_zero_label=dataset.reduce_zero_label)
+
+    gt_maps_generator = dataset.get_gt_seg_maps()
 
-    cur = 0
     for _, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, **data)
 
+        gt_map = next(gt_maps_generator)
+        meta = data['img_metas'][0].data
+        collector.collect(result, gt_map, meta)
+
         if show or out_dir:
             img_tensor = data['img'][0]
             img_metas = data['img_metas'][0].data[0]
@@ -213,101 +247,163 @@ def progressive_single_gpu_test(model,
                     out_file=out_file,
                     opacity=opacity)
 
-        for i in range(len(result)):
-            gt_semantic_map = dataset.get_gt_seg_map(cur + i)
-
-            area_intersect, area_union, area_pred_label, area_label = \
-                intersect_and_union(
-                    result[i], gt_semantic_map, num_classes,
-                    dataset.ignore_index, dataset.label_map,
-                    dataset.reduce_zero_label)
-
-            total_area_intersect += area_intersect
-            total_area_union += area_union
-            total_area_pred_label += area_pred_label
-            total_area_label += area_label
-
-        print(total_area_intersect / total_area_union)
-
+        batch_size = len(result)
+        for _ in range(batch_size):
             prog_bar.update()
 
-        cur += len(result)
-
-    return total_area_intersect, total_area_union, total_area_pred_label, \
-        total_area_label
+    return collector
 
 
 # TODO: Support distributed test api
 def progressive_multi_gpu_test(model,
                                data_loader,
+                               middle_save=False,
                                tmpdir=None,
                                gpu_collect=False):
     model.eval()
     dataset = data_loader.dataset
-    num_classes = len(dataset.CLASSES)
+    if middle_save:
+        collector = ResultProcessor(
+            num_classes=len(dataset.CLASSES),
+            ignore_index=dataset.ignore_index,
+            collect_type='seg_map',
+            label_map=dataset.label_map,
+            reduce_zero_label=dataset.reduce_zero_label)
+    else:
+        collector = ResultProcessor(
+            num_classes=len(dataset.CLASSES),
+            ignore_index=dataset.ignore_index,
+            collect_type='pixels_count',
+            label_map=dataset.label_map,
+            reduce_zero_label=dataset.reduce_zero_label)
+
     rank, world_size = get_dist_info()
     if rank == 0:
         prog_bar = mmcv.ProgressBar(len(dataset))
-    total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64)
-    total_area_union = torch.zeros((num_classes, ), dtype=torch.float64)
-    total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64)
-    total_area_label = torch.zeros((num_classes, ), dtype=torch.float64)
     cur = 0
     for _, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, rescale=True, **data)
 
-        for i in range(len(result)):
-            gt_semantic_map = dataset.get_gt_seg_map(cur + i * world_size)
-
-            area_intersect, area_union, area_pred_label, area_label = \
-                intersect_and_union(
-                    result[i], gt_semantic_map, num_classes,
-                    dataset.ignore_index, dataset.label_map,
-                    dataset.reduce_zero_label)
-
-            total_area_intersect += area_intersect
-            total_area_union += area_union
-            total_area_pred_label += area_pred_label
-            total_area_label += area_label
+        gt_seg_map = dataset.index_gt_seg_maps(cur + rank)
+        meta = data['img_metas'][0].data
+        collector.collect(result, gt_seg_map, meta)
 
-            if rank == 0:
-                for _ in range(len(result) * world_size):
-                    prog_bar.update()
+        if rank == 0:
+            for _ in range(len(result) * world_size):
+                prog_bar.update()
 
         cur += len(result) * world_size
 
-    pixel_count_matrix = [
-        total_area_intersect, total_area_union, total_area_pred_label,
-        total_area_label
-    ]
     # collect results from all ranks
     if gpu_collect:
-        results = collect_count_results_gpu(pixel_count_matrix, 4 * world_size)
+        collector = collect_collector_gpu(collector)
     else:
-        results = collect_count_results_cpu(pixel_count_matrix, 4 * world_size,
-                                            tmpdir)
-    return results
+        collector = collect_collector_cpu(collector, tmpdir)
+    return collector
 
 
-def collect_count_results_gpu(result_part, size):
-    """Collect pixel count matrix result under gpu mode.
+def collect_collector_gpu(collector):
+    """Collect result collectors under gpu mode.
 
     On gpu mode, this function will encode results to gpu tensors and use gpu
     communication for results collection.
 
     Args:
-        result_part (list[Tensor]): four type of pixel count matrix --
-            {area_intersect, area_union, area_pred_label, area_label}, These
-            four tensor shape of (num_classes, ).
-        size (int): Size of the results, commonly equal to length of
-            the results.
+        collector (object): Result collector containing predictions and labels
+            to be collected.
+    Returns:
+        object: The gathered collector.
     """
-    pass
+    rank, world_size = get_dist_info()
+    # dump the collector to a gpu tensor with pickle
+    part_tensor = torch.tensor(
+        bytearray(pickle.dumps(collector)), dtype=torch.uint8, device='cuda')
+    # gather all result part tensor shape
+    shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
+    shape_list = [shape_tensor.clone() for _ in range(world_size)]
+    dist.all_gather(shape_list, shape_tensor)
+    # padding result part tensor to max length
+    shape_max = torch.tensor(shape_list).max()
+    part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
+    part_send[:shape_tensor[0]] = part_tensor
+    part_recv_list = [
+        part_tensor.new_zeros(shape_max) for _ in range(world_size)
+    ]
+    # gather all result part
+    dist.all_gather(part_recv_list, part_send)
+
+    if rank == 0:
+        # decode the collector of every rank from the gathered tensors;
+        # rank 0 keeps its own collector and merges the other ranks into it
+        main_collector = pickle.loads(
+            part_recv_list[0][:shape_list[0]].cpu().numpy().tobytes())
+        sub_collectors = []
+        for recv, shape in zip(part_recv_list[1:], shape_list[1:]):
+            part_collector = pickle.loads(
+                recv[:shape[0]].cpu().numpy().tobytes())
+            # When data is severely insufficient, an empty part_result
+            # on a certain gpu could make the overall outputs empty.
+            if part_collector:
+                sub_collectors.append(part_collector)
+        main_collector.merge(sub_collectors)
+        return main_collector
+
+
+def collect_collector_cpu(collector, tmpdir=None):
+    """Collect result collectors under cpu mode.
+
+    On cpu mode, this function will save the result collectors on different
+    gpus to ``tmpdir`` and collect them by the rank 0 worker.
+
+    Args:
+        collector (object): Result collector containing predictions and labels
+            to be collected.
+        tmpdir (str | None): temporary directory for collected results to
+            store. If set to None, it will create a random temporary
+            directory for it.
-
-def collect_count_results_cpu(result_part, size, tmpdir=None):
-    pass
+
+    Returns:
+        object: The gathered collector.
+    """
+    rank, world_size = get_dist_info()
+    # create a tmp dir if it is not specified
+    if tmpdir is None:
+        MAX_LEN = 512
+        # 32 is whitespace
+        dir_tensor = torch.full((MAX_LEN, ),
+                                32,
+                                dtype=torch.uint8,
+                                device='cuda')
+        if rank == 0:
+            mmcv.mkdir_or_exist('.dist_test')
+            tmpdir = tempfile.mkdtemp(dir='.dist_test')
+            tmpdir = torch.tensor(
+                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+            dir_tensor[:len(tmpdir)] = tmpdir
+        dist.broadcast(dir_tensor, 0)
+        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+    else:
+        mmcv.mkdir_or_exist(tmpdir)
+    # dump the part result to the dir
+    mmcv.dump(collector, osp.join(tmpdir, f'part_{rank}.pkl'))
+    dist.barrier()
+    # collect all parts
+    if rank != 0:
+        return None
+    else:
+        # load results of all parts from tmp dir
+        main_collector = mmcv.load(osp.join(tmpdir, f'part_{0}.pkl'))
+        sub_collectors = []
+        for i in range(1, world_size):
+            part_file = osp.join(tmpdir, f'part_{i}.pkl')
+            part_collector = mmcv.load(part_file)
+            # When data is severely insufficient, an empty part_result
+            # on a certain gpu could make the overall outputs empty.
+            if part_collector:
+                sub_collectors.append(part_collector)
+        main_collector.merge(sub_collectors)
+        # remove tmp dir
+        shutil.rmtree(tmpdir)
+        return main_collector
diff --git a/mmseg/core/evaluation/__init__.py b/mmseg/core/evaluation/__init__.py
index 2db4052490..56af4e91c2 100644
--- a/mmseg/core/evaluation/__init__.py
+++ b/mmseg/core/evaluation/__init__.py
@@ -1,9 +1,10 @@
 from .class_names import get_classes, get_palette
 from .eval_hooks import DistEvalHook, EvalHook
-from .metrics import (calculate_metrics, eval_metrics, mean_dice, mean_fscore,
-                      mean_iou)
+from .metrics import (ResultProcessor, calculate_metrics, eval_metrics,
+                      mean_dice, mean_fscore, mean_iou)
 
 __all__ = [
     'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore',
-    'eval_metrics', 'get_classes', 'get_palette', 'calculate_metrics'
+    'eval_metrics', 'get_classes', 'get_palette', 'calculate_metrics',
+    'ResultProcessor'
 ]
diff --git a/mmseg/core/evaluation/metrics.py b/mmseg/core/evaluation/metrics.py
index 7aeadc041f..ac1dd2eea3 100644
--- a/mmseg/core/evaluation/metrics.py
+++ b/mmseg/core/evaluation/metrics.py
@@ -1,4 +1,4 @@
-from collections import OrderedDict
+from collections import Iterable, OrderedDict
 
 import mmcv
 import numpy as np
@@ -390,3 +390,95 @@ def calculate_metrics(total_area_intersect,
         for metric, metric_value in ret_metrics.items()
     })
     return ret_metrics
+
+
+class ResultProcessor(object):
+    """Result collector used during progressive evaluation."""
+
+    def __init__(self,
+                 num_classes,
+                 ignore_index=255,
+                 collect_type='pixels_count',
+                 label_map=dict(),
+                 reduce_zero_label=False):
+        self.num_classes = num_classes
+        self.collect_type = collect_type
+
+        self.ignore_index = ignore_index
+        self.label_map = label_map
+        self.reduce_zero_label = reduce_zero_label
+
+        assert collect_type.lower() in ['pixels_count', 'seg_map']
+
+        self.prediction_pool = []
+        self.label_pool = []
+        self.meta_pool = []
+
+        self.total_area_intersect = torch.zeros((self.num_classes, ),
+                                                dtype=torch.float64)
+        self.total_area_union = torch.zeros((self.num_classes, ),
+                                            dtype=torch.float64)
+        self.total_area_pred_label = torch.zeros((self.num_classes, ),
+                                                 dtype=torch.float64)
+        self.total_area_label = torch.zeros((self.num_classes, ),
+                                            dtype=torch.float64)
+
+    def collect(self, preds, labels, metas):
+        if not isinstance(preds, Iterable):
+            preds = [preds]
+        if not isinstance(labels, Iterable):
+            labels = [labels]
+        if not isinstance(metas, Iterable):
+            metas = [metas]
+
+        ret_value = total_intersect_and_union(
+            preds,
+            labels,
+            self.num_classes,
+            ignore_index=self.ignore_index,
+            label_map=self.label_map,
+            reduce_zero_label=self.reduce_zero_label)
+        self.total_area_intersect += ret_value[0]
+        self.total_area_union += ret_value[1]
+        self.total_area_pred_label += ret_value[2]
+        self.total_area_label += ret_value[3]
+
+        if self.collect_type == 'seg_map':
+            if isinstance(preds, Iterable):
+                self.prediction_pool.extend(preds)
+                self.label_pool.extend(labels)
+                self.meta_pool.extend(metas)
+            else:
+                self.prediction_pool.append(preds)
+                self.label_pool.append(labels)
+                self.meta_pool.append(metas)
+
+    def retrieval(self):
+        if self.collect_type == 'pixels_count':
+            return (self.total_area_intersect, self.total_area_union,
+                    self.total_area_pred_label, self.total_area_label)
+        elif self.collect_type == 'seg_map':
+            return self.prediction_pool, self.label_pool, self.meta_pool
+
+    def calculate(self, metrics):
+        return calculate_metrics(
+            self.total_area_intersect,
+            self.total_area_union,
+            self.total_area_pred_label,
+            self.total_area_label,
+            metrics=metrics)
+
+    def merge(self, collectors):
+        if not isinstance(collectors, Iterable):
+            collectors = [collectors]
+        for collector in collectors:
+            self.total_area_intersect += collector.total_area_intersect
+            self.total_area_union += collector.total_area_union
+            self.total_area_pred_label += collector.total_area_pred_label
+            self.total_area_label += collector.total_area_label
+
+            if self.collect_type == 'seg_map':
+                self.prediction_pool.extend(collector.prediction_pool)
+                self.label_pool.extend(collector.label_pool)
+                self.meta_pool.extend(collector.meta_pool)
diff --git a/mmseg/datasets/cityscapes.py b/mmseg/datasets/cityscapes.py
index fa9958ac14..a6629d4019 100644
--- a/mmseg/datasets/cityscapes.py
+++ b/mmseg/datasets/cityscapes.py
@@ -125,8 +125,7 @@ def evaluate(self,
                  results,
                  metric='mIoU',
                  logger=None,
-                 imgfile_prefix=None,
-                 efficient_test=False):
+                 imgfile_prefix=None):
         """Evaluation in Cityscapes/default protocol.
 
Args: @@ -157,7 +156,7 @@ def evaluate(self, if len(metrics) > 0: eval_results.update( super(CityscapesDataset, - self).evaluate(results, metrics, logger, efficient_test)) + self).evaluate(results, metrics, logger)) return eval_results diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py index 753dcb103e..3cb59a8aba 100644 --- a/mmseg/datasets/custom.py +++ b/mmseg/datasets/custom.py @@ -1,7 +1,6 @@ import os import os.path as osp -from collections import OrderedDict -from functools import reduce +from collections import Iterable, OrderedDict import mmcv import numpy as np @@ -9,8 +8,6 @@ from prettytable import PrettyTable from torch.utils.data import Dataset -from mmseg.core import eval_metrics -from mmseg.core.evaluation.metrics import calculate_metrics from mmseg.utils import get_root_logger from .builder import DATASETS from .pipelines import Compose @@ -228,26 +225,39 @@ def prepare_test_img(self, idx): def format_results(self, results, **kwargs): """Place holder to format result to dataset specific output.""" - def get_gt_seg_maps(self, efficient_test=False): + # def get_gt_seg_maps(self, efficient_test): + # """Get ground truth segmentation maps for evaluation.""" + # gt_seg_maps = [] + # for img_info in self.img_infos: + # seg_map = osp.join(self.ann_dir, img_info['ann']['seg_map']) + # if efficient_test: + # gt_seg_map = seg_map + # else: + # gt_seg_map = mmcv.imread( + # seg_map, flag='unchanged', backend='pillow') + # gt_seg_maps.append(gt_seg_map) + # return gt_seg_maps + + def get_gt_seg_maps(self): """Get ground truth segmentation maps for evaluation.""" - gt_seg_maps = [] for img_info in self.img_infos: seg_map = osp.join(self.ann_dir, img_info['ann']['seg_map']) - if efficient_test: - gt_seg_map = seg_map - else: - gt_seg_map = mmcv.imread( - seg_map, flag='unchanged', backend='pillow') - gt_seg_maps.append(gt_seg_map) + gt_seg_map = mmcv.imread( + seg_map, flag='unchanged', backend='pillow') + yield [gt_seg_map] + + def index_gt_seg_maps(self, indexes): + """Get ground truth segmentation map by index for evaluation.""" + if not isinstance(indexes, Iterable): + indexes = [indexes] + gt_seg_maps = [] + for index in indexes: + seg_map = osp.join(self.ann_dir, + self.img_infos[index]['ann']['seg_map']) + gt_seg_maps.append( + mmcv.imread(seg_map, flag='unchanged', backend='pillow')) return gt_seg_maps - def get_gt_seg_map(self, idx): - """Get ground truth segmentation maps for evaluation.""" - seg_map = osp.join(self.ann_dir, self.img_infos[idx]['ann']['seg_map']) - gt_seg_map = mmcv.imread(seg_map, flag='unchanged', backend='pillow') - - return gt_seg_map - def get_classes_and_palette(self, classes=None, palette=None): """Get class names of current dataset. @@ -311,88 +321,103 @@ def get_palette_for_custom_classes(self, class_names, palette=None): return palette - def progressive_evaluate(self, - results, - metric='mIoU', - logger=None, - **kwargs): - if isinstance(metric, str): - metric = [metric] - allowed_metrics = ['mIoU', 'mDice', 'mFscore'] - if not set(metric).issubset(set(allowed_metrics)): - raise KeyError('metric {} is not supported'.format(metric)) - - eval_results = {} - - total_area_intersect, total_area_union, total_area_pred_label, \ - total_area_label = results - - ret_metrics = calculate_metrics(total_area_intersect, total_area_union, - total_area_pred_label, - total_area_label, metric) - - # Because dataset.CLASSES is required in progressive_single_gpu_test, - # progressive_multi_gpu_test, so it's necessary to keep - # dataset.CLASSES. 
- class_names = self.CLASSES - - # summary table - ret_metrics_summary = OrderedDict({ - ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2) - for ret_metric, ret_metric_value in ret_metrics.items() - }) - - # each class table - ret_metrics.pop('aAcc', None) - ret_metrics_class = OrderedDict({ - ret_metric: np.round(ret_metric_value * 100, 2) - for ret_metric, ret_metric_value in ret_metrics.items() - }) - ret_metrics_class.update({'Class': class_names}) - ret_metrics_class.move_to_end('Class', last=False) - - # for logger - class_table_data = PrettyTable() - for key, val in ret_metrics_class.items(): - class_table_data.add_column(key, val) - - summary_table_data = PrettyTable() - for key, val in ret_metrics_summary.items(): - if key == 'aAcc': - summary_table_data.add_column(key, [val]) - else: - summary_table_data.add_column('m' + key, [val]) - - print_log('per class results:', logger) - print_log('\n' + class_table_data.get_string(), logger=logger) - print_log('Summary:', logger) - print_log('\n' + summary_table_data.get_string(), logger=logger) - - # each metric dict - for key, value in ret_metrics_summary.items(): - if key == 'aAcc': - eval_results[key] = value / 100.0 - else: - eval_results['m' + key] = value / 100.0 - - ret_metrics_class.pop('Class', None) - for key, value in ret_metrics_class.items(): - eval_results.update({ - key + '.' + str(name): value[idx] / 100.0 - for idx, name in enumerate(class_names) - }) - - if mmcv.is_list_of(results, str): - for file_name in results: - os.remove(file_name) - return eval_results - - def evaluate(self, - results, - metric='mIoU', - logger=None, - efficient_test=False, - **kwargs): + # def evaluate(self, + # results, + # metric='mIoU', + # logger=None, + # efficient_test=False, + # **kwargs): + # """Evaluate the dataset. + + # Args: + # results (list): Testing results of the dataset. + # metric (str | list[str]): Metrics to be evaluated. 'mIoU', + # 'mDice' and 'mFscore' are supported. + # logger (logging.Logger | None | str): Logger used for printing + # related information during evaluation. Default: None. + + # Returns: + # dict[str, float]: Default metrics. 
+ # """ + + # if isinstance(metric, str): + # metric = [metric] + # allowed_metrics = ['mIoU', 'mDice', 'mFscore'] + # if not set(metric).issubset(set(allowed_metrics)): + # raise KeyError('metric {} is not supported'.format(metric)) + # eval_results = {} + # gt_seg_maps = self.get_gt_seg_maps(efficient_test) + # if self.CLASSES is None: + # num_classes = len( + # reduce(np.union1d, [np.unique(_) for _ in gt_seg_maps])) + # else: + # num_classes = len(self.CLASSES) + # ret_metrics = eval_metrics( + # results, + # gt_seg_maps, + # num_classes, + # self.ignore_index, + # metric, + # label_map=self.label_map, + # reduce_zero_label=self.reduce_zero_label) + + # if self.CLASSES is None: + # class_names = tuple(range(num_classes)) + # else: + # class_names = self.CLASSES + + # # summary table + # ret_metrics_summary = OrderedDict({ + # ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2) + # for ret_metric, ret_metric_value in ret_metrics.items() + # }) + + # # each class table + # ret_metrics.pop('aAcc', None) + # ret_metrics_class = OrderedDict({ + # ret_metric: np.round(ret_metric_value * 100, 2) + # for ret_metric, ret_metric_value in ret_metrics.items() + # }) + # ret_metrics_class.update({'Class': class_names}) + # ret_metrics_class.move_to_end('Class', last=False) + + # # for logger + # class_table_data = PrettyTable() + # for key, val in ret_metrics_class.items(): + # class_table_data.add_column(key, val) + + # summary_table_data = PrettyTable() + # for key, val in ret_metrics_summary.items(): + # if key == 'aAcc': + # summary_table_data.add_column(key, [val]) + # else: + # summary_table_data.add_column('m' + key, [val]) + + # print_log('per class results:', logger) + # print_log('\n' + class_table_data.get_string(), logger=logger) + # print_log('Summary:', logger) + # print_log('\n' + summary_table_data.get_string(), logger=logger) + + # # each metric dict + # for key, value in ret_metrics_summary.items(): + # if key == 'aAcc': + # eval_results[key] = value / 100.0 + # else: + # eval_results['m' + key] = value / 100.0 + + # ret_metrics_class.pop('Class', None) + # for key, value in ret_metrics_class.items(): + # eval_results.update({ + # key + '.' + str(name): value[idx] / 100.0 + # for idx, name in enumerate(class_names) + # }) + + # if mmcv.is_list_of(results, str): + # for file_name in results: + # os.remove(file_name) + # return eval_results + + def evaluate(self, results, metric='mIoU', logger=None, **kwargs): """Evaluate the dataset. Args: @@ -405,32 +430,19 @@ def evaluate(self, Returns: dict[str, float]: Default metrics. """ - if isinstance(metric, str): metric = [metric] allowed_metrics = ['mIoU', 'mDice', 'mFscore'] if not set(metric).issubset(set(allowed_metrics)): raise KeyError('metric {} is not supported'.format(metric)) + eval_results = {} - gt_seg_maps = self.get_gt_seg_maps(efficient_test) - if self.CLASSES is None: - num_classes = len( - reduce(np.union1d, [np.unique(_) for _ in gt_seg_maps])) - else: - num_classes = len(self.CLASSES) - ret_metrics = eval_metrics( - results, - gt_seg_maps, - num_classes, - self.ignore_index, - metric, - label_map=self.label_map, - reduce_zero_label=self.reduce_zero_label) - - if self.CLASSES is None: - class_names = tuple(range(num_classes)) - else: - class_names = self.CLASSES + ret_metrics = results.calculate(metric) + + # Because dataset.CLASSES is required in progressive_single_gpu_test, + # progressive_multi_gpu_test, so it's necessary to keep + # dataset.CLASSES. 
+ class_names = self.CLASSES # summary table ret_metrics_summary = OrderedDict({ diff --git a/tools/test.py b/tools/test.py index 6539e1c3b0..e85eb370f9 100644 --- a/tools/test.py +++ b/tools/test.py @@ -8,8 +8,7 @@ wrap_fp16_model) from mmcv.utils import DictAction -from mmseg.apis import (multi_gpu_test, progressive_multi_gpu_test, - progressive_single_gpu_test, single_gpu_test) +from mmseg.apis import progressive_multi_gpu_test, progressive_single_gpu_test from mmseg.datasets import build_dataloader, build_dataset from mmseg.models import build_segmentor @@ -136,37 +135,29 @@ def main(): print('"PALETTE" not found in meta, use dataset.PALETTE instead') model.PALETTE = dataset.PALETTE - efficient_test = False - only_pixel_count = False - if args.eval_options is not None: - efficient_test = args.eval_options.get('efficient_test', False) - only_pixel_count = args.eval_options.get('only_pixel_count', False) - assert not (args.format_only and only_pixel_count), 'format_only' - 'and only_pixel_count can\'t be set at the same time.' - assert not (args.out and only_pixel_count), 'format_only' - 'and only_pixel_count can\'t be set at the same time.' + # efficient_test = False + # if args.eval_options is not None: + # efficient_test = args.eval_options.get('efficient_test', False) + + torch.cuda.empty_cache() if not distributed: model = MMDataParallel(model, device_ids=[0]) - if only_pixel_count: - outputs = progressive_single_gpu_test(model, data_loader, - args.show, args.show_dir, - args.opacity) - else: - outputs = single_gpu_test(model, data_loader, args.show, - args.show_dir, efficient_test, - args.opacity) + outputs = progressive_single_gpu_test(model, data_loader, False, + args.show, args.show_dir, + args.opacity) + # outputs = single_gpu_test(model, data_loader, args.show, + # args.show_dir, efficient_test, + # args.opacity) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) - if only_pixel_count: - outputs = progressive_multi_gpu_test(model, data_loader, - args.gpu_collect) - else: - outputs = multi_gpu_test(model, data_loader, args.tmpdir, - args.gpu_collect, efficient_test) + outputs = progressive_multi_gpu_test(model, data_loader, False, + args.tmpdir, args.gpu_collect) + # outputs = multi_gpu_test(model, data_loader, args.tmpdir, + # args.gpu_collect, efficient_test) rank, _ = get_dist_info() if rank == 0: @@ -177,10 +168,7 @@ def main(): if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: - if only_pixel_count: - dataset.progressive_evaluate(outputs, args.eval, **kwargs) - else: - dataset.evaluate(outputs, args.eval, **kwargs) + dataset.evaluate(outputs, args.eval, **kwargs) if __name__ == '__main__': From 3a97df65c3b2dbf3c3ab36bdb7858cb68ed2fd2a Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Thu, 22 Jul 2021 16:41:05 +0800 Subject: [PATCH 35/96] Using processor to refactor evaluation workflow. 
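
The processor-based workflow relies on one observation: aAcc, mIoU, mDice and
mFscore are all functions of four running per-class pixel histograms
(intersection, union, prediction area, label area), so testing can stream over
the dataset without ever keeping per-image predictions. A minimal,
self-contained sketch of that accumulation (toy tensors and illustrative names
only -- this is not the ResultProcessor code itself):

    import torch


    def toy_intersect_and_union(pred, label, num_classes):
        # Per-class pixel histograms for a single image; a stripped-down
        # analogue of mmseg's intersect_and_union (ignore_index and
        # label_map handling omitted for brevity).
        pred = pred.flatten().float()
        label = label.flatten().float()
        intersect = pred[pred == label]
        area_intersect = torch.histc(
            intersect, bins=num_classes, min=0, max=num_classes - 1)
        area_pred = torch.histc(
            pred, bins=num_classes, min=0, max=num_classes - 1)
        area_label = torch.histc(
            label, bins=num_classes, min=0, max=num_classes - 1)
        return area_intersect, area_pred + area_label - area_intersect


    num_classes = 3
    total_intersect = torch.zeros(num_classes, dtype=torch.float64)
    total_union = torch.zeros(num_classes, dtype=torch.float64)
    for _ in range(4):  # stream over "images"; per-image maps are dropped
        pred = torch.randint(0, num_classes, (8, 8))
        label = torch.randint(0, num_classes, (8, 8))
        area_intersect, area_union = toy_intersect_and_union(
            pred, label, num_classes)
        total_intersect += area_intersect
        total_union += area_union
    iou = total_intersect / total_union
    print('per-class IoU:', iou, 'mIoU:', iou.mean().item())

Because the accumulation is a plain elementwise sum, it is associative, which
is what lets merge() combine per-rank processors after distributed testing
without any re-sorting of results.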
--- mmseg/apis/test.py | 254 ++++++++++++++++++++++-------- mmseg/core/evaluation/__init__.py | 7 +- mmseg/core/evaluation/metrics.py | 97 +++++++++++- mmseg/datasets/cityscapes.py | 76 +++++++++ mmseg/datasets/custom.py | 125 ++++++++------- tools/test.py | 49 +++--- 6 files changed, 447 insertions(+), 161 deletions(-) diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py index cef47f46c3..19edc6a904 100644 --- a/mmseg/apis/test.py +++ b/mmseg/apis/test.py @@ -1,14 +1,17 @@ import os.path as osp +import pickle +import shutil import tempfile import mmcv import numpy as np import torch +import torch.distributed as dist from mmcv.engine import collect_results_cpu, collect_results_gpu from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info -from mmseg.core.evaluation.metrics import intersect_and_union +from mmseg.core.evaluation.metrics import ResultProcessor def np2tmp(array, temp_file_name=None, tmpdir=None): @@ -169,24 +172,56 @@ def multi_gpu_test(model, def progressive_single_gpu_test(model, data_loader, + middle_save=False, show=False, out_dir=None, opacity=0.5): + """Test with single GPU by progressive mode. + + Args: + model (nn.Module): Model to be tested. + data_loader (utils.data.Dataloader): Pytorch data loader. + show (bool): Whether show results during inference. Default: False. + middle_save (bool, optional): Whether to save middle variables when + progressive test. Default: False. + out_dir (str, optional): If specified, the results will be dumped into + the directory to save output results. + opacity(float): Opacity of painted segmentation map. + Default 0.5. + Must be in (0, 1] range. + Returns: + object: The processor containing results. + """ model.eval() dataset = data_loader.dataset - num_classes = len(dataset.CLASSES) prog_bar = mmcv.ProgressBar(len(dataset)) - total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64) - total_area_union = torch.zeros((num_classes, ), dtype=torch.float64) - total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64) - total_area_label = torch.zeros((num_classes, ), dtype=torch.float64) + if middle_save: + processor = ResultProcessor( + num_classes=len(dataset.CLASSES), + ignore_index=dataset.ignore_index, + collect_type='seg_map', + label_map=dataset.label_map, + reduce_zero_label=dataset.reduce_zero_label) + else: + processor = ResultProcessor( + num_classes=len(dataset.CLASSES), + ignore_index=dataset.ignore_index, + collect_type='pixels_count', + label_map=dataset.label_map, + reduce_zero_label=dataset.reduce_zero_label) + + gt_maps_generator = dataset.get_gt_seg_maps() - cur = 0 for _, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, **data) + # Collect meta to avoid sorting for results collected from multi gpu. 
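+        # get_gt_seg_maps() is a generator here, so ground truth maps are
+        # read lazily, one image at a time, rather than preloaded as a list.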
+ gt_map = next(gt_maps_generator) + meta = data['img_metas'][0].data + processor.collect(result, gt_map, meta) + if show or out_dir: img_tensor = data['img'][0] img_metas = data['img_metas'][0].data[0] @@ -213,101 +248,184 @@ def progressive_single_gpu_test(model, out_file=out_file, opacity=opacity) - for i in range(len(result)): - gt_semantic_map = dataset.get_gt_seg_map(cur + i) - - area_intersect, area_union, area_pred_label, area_label = \ - intersect_and_union( - result[i], gt_semantic_map, num_classes, - dataset.ignore_index, dataset.label_map, - dataset.reduce_zero_label) - - total_area_intersect += area_intersect - total_area_union += area_union - total_area_pred_label += area_pred_label - total_area_label += area_label - - print(total_area_intersect / total_area_union) - + batch_size = len(result) + for _ in range(batch_size): prog_bar.update() - cur += len(result) - - return total_area_intersect, total_area_union, total_area_pred_label, \ - total_area_label + return processor -# TODO: Support distributed test api def progressive_multi_gpu_test(model, data_loader, + middle_save=False, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus by progressive mode. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (utils.data.Dataloader): Pytorch data loader. + middle_save (bool, optional): Whether to save middle variables when + progressive test. Default: False. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. The same path is used for efficient + test. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + object: The processor containing results + """ model.eval() dataset = data_loader.dataset - num_classes = len(dataset.CLASSES) + if middle_save: + processor = ResultProcessor( + num_classes=len(dataset.CLASSES), + ignore_index=dataset.ignore_index, + collect_type='seg_map', + label_map=dataset.label_map, + reduce_zero_label=dataset.reduce_zero_label) + else: + processor = ResultProcessor( + num_classes=len(dataset.CLASSES), + ignore_index=dataset.ignore_index, + collect_type='pixels_count', + label_map=dataset.label_map, + reduce_zero_label=dataset.reduce_zero_label) + rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) - total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64) - total_area_union = torch.zeros((num_classes, ), dtype=torch.float64) - total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64) - total_area_label = torch.zeros((num_classes, ), dtype=torch.float64) - cur = 0 for _, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) - for i in range(len(result)): - gt_semantic_map = dataset.get_gt_seg_map(cur + i * world_size) + # TODO: adapt samples_per_gpu > 1. 
+        # only samples_per_gpu=1 is supported for now
+        gt_seg_map = dataset.index_gt_seg_maps(cur + rank)
+        meta = data['img_metas'][0].data
+        processor.collect(result, gt_seg_map, meta)
 
-            area_intersect, area_union, area_pred_label, area_label = \
-                intersect_and_union(
-                    result[i], gt_semantic_map, num_classes,
-                    dataset.ignore_index, dataset.label_map,
-                    dataset.reduce_zero_label)
-
-            total_area_intersect += area_intersect
-            total_area_union += area_union
-            total_area_pred_label += area_pred_label
-            total_area_label += area_label
-
-            if rank == 0:
-                for _ in range(len(result) * world_size):
-                    prog_bar.update()
+        if rank == 0:
+            for _ in range(len(result) * world_size):
+                prog_bar.update()
 
         cur += len(result) * world_size
 
-    pixel_count_matrix = [
-        total_area_intersect, total_area_union, total_area_pred_label,
-        total_area_label
-    ]
     # collect results from all ranks
     if gpu_collect:
-        results = collect_count_results_gpu(pixel_count_matrix, 4 * world_size)
+        processor = collect_processors_gpu(processor)
     else:
-        results = collect_count_results_cpu(pixel_count_matrix, 4 * world_size,
-                                            tmpdir)
-    return results
+        processor = collect_processors_cpu(processor, tmpdir)
+    return processor
 
 
-def collect_count_results_gpu(result_part, size):
-    """Collect pixel count matrix result under gpu mode.
+def collect_processors_gpu(processor):
+    """Collect result processors under gpu mode.
 
     On gpu mode, this function will encode results to gpu tensors and use gpu
     communication for results collection.
 
     Args:
-        result_part (list[Tensor]): four type of pixel count matrix --
-            {area_intersect, area_union, area_pred_label, area_label}, These
-            four tensor shape of (num_classes, ).
-        size (int): Size of the results, commonly equal to length of
-            the results.
+        processor (object): Result processor containing predictions and labels
+            to be collected.
+    Returns:
+        object: The gathered processor.
     """
-    pass
+    rank, world_size = get_dist_info()
+    # dump the processor to a gpu tensor with pickle
+    part_tensor = torch.tensor(
+        bytearray(pickle.dumps(processor)), dtype=torch.uint8, device='cuda')
+    # gather all result part tensor shape
+    shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
+    shape_list = [shape_tensor.clone() for _ in range(world_size)]
+    dist.all_gather(shape_list, shape_tensor)
+    # padding result part tensor to max length
+    shape_max = torch.tensor(shape_list).max()
+    part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
+    part_send[:shape_tensor[0]] = part_tensor
+    part_recv_list = [
+        part_tensor.new_zeros(shape_max) for _ in range(world_size)
+    ]
+    # gather all result part
+    dist.all_gather(part_recv_list, part_send)
+    if rank == 0:
+        # decode the processor of every rank from the gathered tensors;
+        # rank 0 keeps its own processor and merges the other ranks into it
+        main_processor = pickle.loads(
+            part_recv_list[0][:shape_list[0]].cpu().numpy().tobytes())
+        sub_processors = []
+        for recv, shape in zip(part_recv_list[1:], shape_list[1:]):
+            part_processor = pickle.loads(
+                recv[:shape[0]].cpu().numpy().tobytes())
+            # When data is severely insufficient, an empty part_result
+            # on a certain gpu could make the overall outputs empty.
+            if part_processor:
+                sub_processors.append(part_processor)
+        main_processor.merge(sub_processors)
+        return main_processor
+
+
+def collect_processors_cpu(processor, tmpdir=None):
+    """Collect result processors under cpu mode.
+
+    On cpu mode, this function will save the result processors on different
+    gpus to ``tmpdir`` and collect them by the rank 0 worker.
-
-def collect_count_results_cpu(result_part, size, tmpdir=None):
-    pass
+
+    Args:
+        processor (object): Result processor containing predictions and labels
+            to be collected.
+        tmpdir (str | None): temporary directory for collected results to
+            store. If set to None, it will create a random temporary
+            directory for it.
+
+    Returns:
+        object: The gathered processor.
+    """
+    rank, world_size = get_dist_info()
+    # create a tmp dir if it is not specified
+    if tmpdir is None:
+        MAX_LEN = 512
+        # 32 is whitespace
+        dir_tensor = torch.full((MAX_LEN, ),
+                                32,
+                                dtype=torch.uint8,
+                                device='cuda')
+        if rank == 0:
+            mmcv.mkdir_or_exist('.dist_test')
+            tmpdir = tempfile.mkdtemp(dir='.dist_test')
+            tmpdir = torch.tensor(
+                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+            dir_tensor[:len(tmpdir)] = tmpdir
+        dist.broadcast(dir_tensor, 0)
+        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+    else:
+        mmcv.mkdir_or_exist(tmpdir)
+    # dump the part result to the dir
+    mmcv.dump(processor, osp.join(tmpdir, f'part_{rank}.pkl'))
+    dist.barrier()
+    # collect all parts
+    if rank != 0:
+        return None
+    else:
+        # load results of all parts from tmp dir
+        main_processor = mmcv.load(osp.join(tmpdir, f'part_{0}.pkl'))
+        sub_processors = []
+        for i in range(1, world_size):
+            part_file = osp.join(tmpdir, f'part_{i}.pkl')
+            part_processor = mmcv.load(part_file)
+            # When data is severely insufficient, an empty part_result
+            # on a certain gpu could make the overall outputs empty.
+            if part_processor:
+                sub_processors.append(part_processor)
+        main_processor.merge(sub_processors)
+        # remove tmp dir
+        shutil.rmtree(tmpdir)
+        return main_processor
diff --git a/mmseg/core/evaluation/__init__.py b/mmseg/core/evaluation/__init__.py
index 2db4052490..56af4e91c2 100644
--- a/mmseg/core/evaluation/__init__.py
+++ b/mmseg/core/evaluation/__init__.py
@@ -1,9 +1,10 @@
 from .class_names import get_classes, get_palette
 from .eval_hooks import DistEvalHook, EvalHook
-from .metrics import (calculate_metrics, eval_metrics, mean_dice, mean_fscore,
-                      mean_iou)
+from .metrics import (ResultProcessor, calculate_metrics, eval_metrics,
+                      mean_dice, mean_fscore, mean_iou)
 
 __all__ = [
     'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore',
-    'eval_metrics', 'get_classes', 'get_palette', 'calculate_metrics'
+    'eval_metrics', 'get_classes', 'get_palette', 'calculate_metrics',
+    'ResultProcessor'
 ]
diff --git a/mmseg/core/evaluation/metrics.py b/mmseg/core/evaluation/metrics.py
index 7aeadc041f..5052dd18e4 100644
--- a/mmseg/core/evaluation/metrics.py
+++ b/mmseg/core/evaluation/metrics.py
@@ -1,4 +1,4 @@
-from collections import OrderedDict
+from collections import Iterable, OrderedDict
 
 import mmcv
 import numpy as np
@@ -390,3 +390,98 @@ def calculate_metrics(total_area_intersect,
         for metric, metric_value in ret_metrics.items()
     })
     return ret_metrics
+
+
+class ResultProcessor(object):
+    """Collect and process results during progressive evaluation."""
+
+    def __init__(self,
+                 num_classes,
+                 ignore_index=255,
+                 collect_type='pixels_count',
+                 label_map=dict(),
+                 reduce_zero_label=False):
+        self.num_classes = num_classes
+        self.collect_type = collect_type
+
+        self.ignore_index = ignore_index
+        self.label_map = label_map
+        self.reduce_zero_label = reduce_zero_label
+
+        assert collect_type.lower() in ['pixels_count', 'seg_map']
+
+        self.prediction_ram = []
+        self.label_ram = []
+        self.meta_ram = []
+
+        self.total_area_intersect = torch.zeros((self.num_classes, ),
+                                                dtype=torch.float64)
+        self.total_area_union = torch.zeros((self.num_classes, ),
+                                            dtype=torch.float64)
+        self.total_area_pred_label = torch.zeros((self.num_classes, ),
+                                                 dtype=torch.float64)
+        self.total_area_label = torch.zeros((self.num_classes, ),
+                                            dtype=torch.float64)
+
+    def collect(self, preds, labels, metas):
+        """Collect predictions, ground truth labels and meta information."""
+        if not isinstance(preds, Iterable):
+            preds = [preds]
+        if not isinstance(labels, Iterable):
+            labels = [labels]
+        if not isinstance(metas, Iterable):
+            metas = [metas]
+
+        ret_value = total_intersect_and_union(
+            preds,
+            labels,
+            self.num_classes,
+            ignore_index=self.ignore_index,
+            label_map=self.label_map,
+            reduce_zero_label=self.reduce_zero_label)
+        self.total_area_intersect += ret_value[0]
+        self.total_area_union += ret_value[1]
+        self.total_area_pred_label += ret_value[2]
+        self.total_area_label += ret_value[3]
+
+        if self.collect_type == 'seg_map':
+            if isinstance(preds, Iterable):
+                self.prediction_ram.extend(preds)
+                self.label_ram.extend(labels)
+                self.meta_ram.extend(metas)
+            else:
+                self.prediction_ram.append(preds)
+                self.label_ram.append(labels)
+                self.meta_ram.append(metas)
+
+    def retrieval(self):
+        """Get processor content by collect type."""
+        if self.collect_type == 'pixels_count':
+            return (self.total_area_intersect, self.total_area_union,
+                    self.total_area_pred_label, self.total_area_label)
+        elif self.collect_type == 'seg_map':
+            return self.prediction_ram, self.label_ram, self.meta_ram
+
+    def calculate(self, metrics):
+        """Calculate metrics from the collected pixel count matrices."""
+        return calculate_metrics(
+            self.total_area_intersect,
+            self.total_area_union,
+            self.total_area_pred_label,
+            self.total_area_label,
+            metrics=metrics)
+
+    def merge(self, processors):
+        """Merge other processors into this processor."""
+        if not isinstance(processors, Iterable):
+            processors = [processors]
+        for processor in processors:
+            self.total_area_intersect += processor.total_area_intersect
+            self.total_area_union += processor.total_area_union
+            self.total_area_pred_label += processor.total_area_pred_label
+            self.total_area_label += processor.total_area_label
+
+            if self.collect_type == 'seg_map':
+                self.prediction_ram.extend(processor.prediction_ram)
+                self.label_ram.extend(processor.label_ram)
+                self.meta_ram.extend(processor.meta_ram)
diff --git a/mmseg/datasets/cityscapes.py b/mmseg/datasets/cityscapes.py
index fa9958ac14..3310886818 100644
--- a/mmseg/datasets/cityscapes.py
+++ b/mmseg/datasets/cityscapes.py
@@ -161,6 +161,82 @@ def evaluate(self,
 
         return eval_results
 
+    # TODO: Use processor to format results.
+    def progressive_format_results(self,
+                                   results,
+                                   imgfile_prefix=None,
+                                   to_label_id=True):
+        """Format the results into dir (standard format for Cityscapes
+        evaluation).
+
+        Args:
+            results (list): Testing results of the dataset.
+            imgfile_prefix (str | None): The prefix of image files. It
+                includes the file path and the prefix of filename, e.g.,
+                "a/b/prefix". If not specified, a temp file will be created.
+                Default: None.
+            to_label_id (bool): Whether to convert output to label_id for
+                submission. Default: True.
+
+        Returns:
+            tuple: (result_files, tmp_dir), result_files is a list containing
+                the image paths, tmp_dir is the temporary directory created
+                for saving json/png files when img_prefix is not specified.
+ """ + + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: ' + f'{len(results)} != {len(self)}') + + if imgfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + imgfile_prefix = tmp_dir.name + else: + tmp_dir = None + result_files = self.results2img(results, imgfile_prefix, to_label_id) + + return result_files, tmp_dir + + def progressive_evaluate(self, + results, + metric='mIoU', + logger=None, + imgfile_prefix=None): + """Evaluation in Cityscapes/default protocol. + + Args: + results (list): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | None | str): Logger used for printing + related information during evaluation. Default: None. + imgfile_prefix (str | None): The prefix of output image file, + for cityscapes evaluation only. It includes the file path and + the prefix of filename, e.g., "a/b/prefix". + If results are evaluated with cityscapes protocol, it would be + the prefix of output png files. The output files would be + png images under folder "a/b/prefix/xxx.png", where "xxx" is + the image name of cityscapes. If not specified, a temp file + will be created for evaluation. + Default: None. + + Returns: + dict[str, float]: Cityscapes/default metrics. + """ + + eval_results = dict() + metrics = metric.copy() if isinstance(metric, list) else [metric] + if 'cityscapes' in metrics: + eval_results.update( + self._evaluate_cityscapes(results, logger, imgfile_prefix)) + metrics.remove('cityscapes') + if len(metrics) > 0: + eval_results.update( + super(CityscapesDataset, + self).progressive_evaluate(results, metrics, logger)) + + return eval_results + def _evaluate_cityscapes(self, results, logger, imgfile_prefix): """Evaluation in Cityscapes protocol. 
diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py index 753dcb103e..e840efe426 100644 --- a/mmseg/datasets/custom.py +++ b/mmseg/datasets/custom.py @@ -1,6 +1,6 @@ import os import os.path as osp -from collections import OrderedDict +from collections import Iterable, OrderedDict from functools import reduce import mmcv @@ -9,8 +9,7 @@ from prettytable import PrettyTable from torch.utils.data import Dataset -from mmseg.core import eval_metrics -from mmseg.core.evaluation.metrics import calculate_metrics +from mmseg.core.evaluation import eval_metrics from mmseg.utils import get_root_logger from .builder import DATASETS from .pipelines import Compose @@ -228,26 +227,26 @@ def prepare_test_img(self, idx): def format_results(self, results, **kwargs): """Place holder to format result to dataset specific output.""" - def get_gt_seg_maps(self, efficient_test=False): + def get_gt_seg_maps(self): """Get ground truth segmentation maps for evaluation.""" - gt_seg_maps = [] for img_info in self.img_infos: seg_map = osp.join(self.ann_dir, img_info['ann']['seg_map']) - if efficient_test: - gt_seg_map = seg_map - else: - gt_seg_map = mmcv.imread( - seg_map, flag='unchanged', backend='pillow') - gt_seg_maps.append(gt_seg_map) + gt_seg_map = mmcv.imread( + seg_map, flag='unchanged', backend='pillow') + yield [gt_seg_map] + + def index_gt_seg_maps(self, indexes): + """Get ground truth segmentation map by index for evaluation.""" + if not isinstance(indexes, Iterable): + indexes = [indexes] + gt_seg_maps = [] + for index in indexes: + seg_map = osp.join(self.ann_dir, + self.img_infos[index]['ann']['seg_map']) + gt_seg_maps.append( + mmcv.imread(seg_map, flag='unchanged', backend='pillow')) return gt_seg_maps - def get_gt_seg_map(self, idx): - """Get ground truth segmentation maps for evaluation.""" - seg_map = osp.join(self.ann_dir, self.img_infos[idx]['ann']['seg_map']) - gt_seg_map = mmcv.imread(seg_map, flag='unchanged', backend='pillow') - - return gt_seg_map - def get_classes_and_palette(self, classes=None, palette=None): """Get class names of current dataset. @@ -311,30 +310,50 @@ def get_palette_for_custom_classes(self, class_names, palette=None): return palette - def progressive_evaluate(self, - results, - metric='mIoU', - logger=None, - **kwargs): + def evaluate(self, + results, + metric='mIoU', + logger=None, + efficient_test=False, + **kwargs): + """Evaluate the dataset. + + Args: + results (list): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. 'mIoU', + 'mDice' and 'mFscore' are supported. + logger (logging.Logger | None | str): Logger used for printing + related information during evaluation. Default: None. + + Returns: + dict[str, float]: Default metrics. 
+ """ + if isinstance(metric, str): metric = [metric] allowed_metrics = ['mIoU', 'mDice', 'mFscore'] if not set(metric).issubset(set(allowed_metrics)): raise KeyError('metric {} is not supported'.format(metric)) - eval_results = {} + gt_seg_maps = self.get_gt_seg_maps(efficient_test) + if self.CLASSES is None: + num_classes = len( + reduce(np.union1d, [np.unique(_) for _ in gt_seg_maps])) + else: + num_classes = len(self.CLASSES) + ret_metrics = eval_metrics( + results, + gt_seg_maps, + num_classes, + self.ignore_index, + metric, + label_map=self.label_map, + reduce_zero_label=self.reduce_zero_label) - total_area_intersect, total_area_union, total_area_pred_label, \ - total_area_label = results - - ret_metrics = calculate_metrics(total_area_intersect, total_area_union, - total_area_pred_label, - total_area_label, metric) - - # Because dataset.CLASSES is required in progressive_single_gpu_test, - # progressive_multi_gpu_test, so it's necessary to keep - # dataset.CLASSES. - class_names = self.CLASSES + if self.CLASSES is None: + class_names = tuple(range(num_classes)) + else: + class_names = self.CLASSES # summary table ret_metrics_summary = OrderedDict({ @@ -387,12 +406,11 @@ def progressive_evaluate(self, os.remove(file_name) return eval_results - def evaluate(self, - results, - metric='mIoU', - logger=None, - efficient_test=False, - **kwargs): + def progressive_evaluate(self, + results, + metric='mIoU', + logger=None, + **kwargs): """Evaluate the dataset. Args: @@ -405,32 +423,19 @@ def evaluate(self, Returns: dict[str, float]: Default metrics. """ - if isinstance(metric, str): metric = [metric] allowed_metrics = ['mIoU', 'mDice', 'mFscore'] if not set(metric).issubset(set(allowed_metrics)): raise KeyError('metric {} is not supported'.format(metric)) + eval_results = {} - gt_seg_maps = self.get_gt_seg_maps(efficient_test) - if self.CLASSES is None: - num_classes = len( - reduce(np.union1d, [np.unique(_) for _ in gt_seg_maps])) - else: - num_classes = len(self.CLASSES) - ret_metrics = eval_metrics( - results, - gt_seg_maps, - num_classes, - self.ignore_index, - metric, - label_map=self.label_map, - reduce_zero_label=self.reduce_zero_label) + ret_metrics = results.calculate(metric) - if self.CLASSES is None: - class_names = tuple(range(num_classes)) - else: - class_names = self.CLASSES + # Because dataset.CLASSES is required in progressive_single_gpu_test, + # progressive_multi_gpu_test, so it's necessary to keep + # dataset.CLASSES. 
+ class_names = self.CLASSES # summary table ret_metrics_summary = OrderedDict({ diff --git a/tools/test.py b/tools/test.py index 6539e1c3b0..0db262c103 100644 --- a/tools/test.py +++ b/tools/test.py @@ -8,8 +8,7 @@ wrap_fp16_model) from mmcv.utils import DictAction -from mmseg.apis import (multi_gpu_test, progressive_multi_gpu_test, - progressive_single_gpu_test, single_gpu_test) +from mmseg.apis import progressive_multi_gpu_test, progressive_single_gpu_test from mmseg.datasets import build_dataloader, build_dataset from mmseg.models import build_segmentor @@ -136,37 +135,32 @@ def main(): print('"PALETTE" not found in meta, use dataset.PALETTE instead') model.PALETTE = dataset.PALETTE - efficient_test = False - only_pixel_count = False + # efficient_test = False + middle_save = False if args.eval_options is not None: - efficient_test = args.eval_options.get('efficient_test', False) - only_pixel_count = args.eval_options.get('only_pixel_count', False) - assert not (args.format_only and only_pixel_count), 'format_only' - 'and only_pixel_count can\'t be set at the same time.' - assert not (args.out and only_pixel_count), 'format_only' - 'and only_pixel_count can\'t be set at the same time.' + # efficient_test = args.eval_options.get('efficient_test', False) + middle_save = args.eval_options.get('efficient_test', False) + + # clean gpu memory when starting a new evaluation. + torch.cuda.empty_cache() if not distributed: model = MMDataParallel(model, device_ids=[0]) - if only_pixel_count: - outputs = progressive_single_gpu_test(model, data_loader, - args.show, args.show_dir, - args.opacity) - else: - outputs = single_gpu_test(model, data_loader, args.show, - args.show_dir, efficient_test, - args.opacity) + outputs = progressive_single_gpu_test(model, data_loader, middle_save, + args.show, args.show_dir, + args.opacity) + # outputs = single_gpu_test(model, data_loader, args.show, + # args.show_dir, efficient_test, + # args.opacity) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) - if only_pixel_count: - outputs = progressive_multi_gpu_test(model, data_loader, - args.gpu_collect) - else: - outputs = multi_gpu_test(model, data_loader, args.tmpdir, - args.gpu_collect, efficient_test) + outputs = progressive_multi_gpu_test(model, data_loader, middle_save, + args.tmpdir, args.gpu_collect) + # outputs = multi_gpu_test(model, data_loader, args.tmpdir, + # args.gpu_collect, efficient_test) rank, _ = get_dist_info() if rank == 0: @@ -175,12 +169,9 @@ def main(): mmcv.dump(outputs, args.out) kwargs = {} if args.eval_options is None else args.eval_options if args.format_only: - dataset.format_results(outputs, **kwargs) + dataset.progressive_format_results(outputs, **kwargs) if args.eval: - if only_pixel_count: - dataset.progressive_evaluate(outputs, args.eval, **kwargs) - else: - dataset.evaluate(outputs, args.eval, **kwargs) + dataset.progressive_evaluate(outputs, args.eval, **kwargs) if __name__ == '__main__': From e6be6b40649d30121cd9e0dff57545fde1308a5b Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Thu, 22 Jul 2021 20:35:37 +0800 Subject: [PATCH 36/96] refactor eval hook. 
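
With the hooks switched over, evaluation during training follows the same path
as offline testing. A condensed sketch of the single-gpu flow (it mirrors the
hook code below; `runner` and `dataloader` are assumed to be in scope, and the
log_buffer handling follows mmcv's EvalHook conventions):

    from mmseg.apis import progressive_single_gpu_test

    # inside _do_evaluate(runner), conceptually:
    processor = progressive_single_gpu_test(
        runner.model, dataloader, False, show=False)
    eval_res = dataloader.dataset.progressive_evaluate(
        processor, metric='mIoU', logger=runner.logger)

    # metrics go through the plain log buffer until mmcv pr #1213 provides
    # an evaluation-specific buffer
    runner.log_buffer.clear()
    for name, val in eval_res.items():
        runner.log_buffer.output[name] = val
    runner.log_buffer.ready = True

    key_score = eval_res.get('mIoU')  # consumed by save_best checkpointing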
--- mmseg/core/evaluation/eval_hooks.py | 79 ++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py index 928f2ba612..97b2de4d42 100644 --- a/mmseg/core/evaluation/eval_hooks.py +++ b/mmseg/core/evaluation/eval_hooks.py @@ -25,19 +25,44 @@ def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) self.efficient_test = efficient_test + def progressive_evaluate(self, runner, results): + """Evaluate the results by progressive mode. + + Args: + runner (:obj:`mmcv.Runner`): The underlined training runner. + results (list): Output results. + """ + eval_res = self.dataloader.dataset.progressive_evaluate( + results, logger=runner.logger, **self.eval_kwargs) + + # TODO: Blocked by mmcv pr: #1213 + # evaluation info specific buffer + # runner.log_buffer.output['eval_res'] = {} + # for name, val in eval_res.items(): + # runner.log_buffer.output['eval_res'][name] = val + runner.log_buffer.clear() + for name, val in eval_res.items(): + runner.log_buffer.output[name] = val + runner.log_buffer.ready = True + + if self.save_best is not None: + if self.key_indicator == 'auto': + # infer from eval_results + self._init_rule(self.rule, list(eval_res.keys())[0]) + return eval_res[self.key_indicator] + + return None + def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" if not self._should_evaluate(runner): return - from mmseg.apis import single_gpu_test - results = single_gpu_test( - runner.model, - self.dataloader, - show=False, - efficient_test=self.efficient_test) + from mmseg.apis import progressive_single_gpu_test + results = progressive_single_gpu_test( + runner.model, self.dataloader, False, show=False) runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - key_score = self.evaluate(runner, results) + key_score = self.progressive_evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) @@ -57,9 +82,35 @@ class DistEvalHook(_DistEvalHook): greater_keys = ['mIoU', 'mAcc', 'aAcc'] - def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): + def __init__(self, *args, by_epoch=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) - self.efficient_test = efficient_test + + def progressive_evaluate(self, runner, results): + """Evaluate the results by progressive mode. + + Args: + runner (:obj:`mmcv.Runner`): The underlined training runner. + results (list): Output results. 
+        """
+        eval_res = self.dataloader.dataset.progressive_evaluate(
+            results, logger=runner.logger, **self.eval_kwargs)
+        # TODO: Blocked by mmcv pr: #1213
+        # evaluation info specific buffer
+        # runner.log_buffer.output['eval_res'] = {}
+        # for name, val in eval_res.items():
+        #     runner.log_buffer.output['eval_res'][name] = val
+        runner.log_buffer.clear()
+        for name, val in eval_res.items():
+            runner.log_buffer.output[name] = val
+        runner.log_buffer.ready = True
+
+        if self.save_best is not None:
+            if self.key_indicator == 'auto':
+                # infer from eval_results
+                self._init_rule(self.rule, list(eval_res.keys())[0])
+            return eval_res[self.key_indicator]
+
+        return None

     def _do_evaluate(self, runner):
         """perform evaluation and save ckpt."""
@@ -83,17 +134,17 @@ def _do_evaluate(self, runner):
         if tmpdir is None:
             tmpdir = osp.join(runner.work_dir, '.eval_hook')

-        from mmseg.apis import multi_gpu_test
-        results = multi_gpu_test(
+        from mmseg.apis import progressive_multi_gpu_test
+        results = progressive_multi_gpu_test(
             runner.model,
             self.dataloader,
+            False,
             tmpdir=tmpdir,
-            gpu_collect=self.gpu_collect,
-            efficient_test=self.efficient_test)
+            gpu_collect=self.gpu_collect)
         if runner.rank == 0:
             print('\n')
             runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
-        key_score = self.evaluate(runner, results)
+        key_score = self.progressive_evaluate(runner, results)

         if self.save_best:
             self._save_ckpt(runner, key_score)

From 8653c9615f2a1f2b5e86cc40009ff1fefaf757a9 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 22 Jul 2021 22:29:12 +0800
Subject: [PATCH 37/96] Fix progress bar.

---
 mmseg/apis/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index f263f1ef1a..19cecabd7a 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -317,13 +317,13 @@ def progressive_multi_gpu_test(model,
     if rank == 0:
         batch_size = len(result)
         if cur + world_size >= len(dataset):
-            total_samples = cur + world_size + 1 - len(dataset)
+            total_samples = len(dataset) - cur
         else:
             total_samples = batch_size * world_size
         for _ in range(total_samples):
             prog_bar.update()

-    cur += len(result) * world_size
+        cur += len(result) * world_size

     # collect results from all ranks
     if gpu_collect:

From 842bdef7eaf615c7d10ace8c91f60e5cdcdfa4a7 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 22 Jul 2021 22:47:08 +0800
Subject: [PATCH 38/96] Fix middle save argument.

---
 tools/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/test.py b/tools/test.py
index 0db262c103..d0cd266a70 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -139,7 +139,7 @@ def main():
     middle_save = False
     if args.eval_options is not None:
         # efficient_test = args.eval_options.get('efficient_test', False)
-        middle_save = args.eval_options.get('efficient_test', False)
+        middle_save = args.eval_options.get('middle_save', False)

From d3896657c1283eb1f015f5c9b81f892c5509821a Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 22 Jul 2021 23:03:09 +0800
Subject: [PATCH 39/96] Modify some variable name of dataset evaluate api.
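With this rename the dataset no longer receives a raw list of results; it
receives a processor object and relies only on its calculate() method. The
contract assumed by progressive_evaluate is roughly the following (sketch;
the concrete implementation is ResultProcessor in
mmseg/core/evaluation/metrics.py):

    class ProcessorContract:
        # assumed interface, shown for illustration only
        def collect(self, result, gt_map, meta):
            # fold one prediction into the accumulated statistics
            pass

        def calculate(self, metric):
            # return per-class metric arrays, e.g. {'IoU': ..., 'Acc': ...}
            pass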
---
 mmseg/datasets/custom.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py
index e840efe426..e4c357b8d0 100644
--- a/mmseg/datasets/custom.py
+++ b/mmseg/datasets/custom.py
@@ -407,14 +407,14 @@ def evaluate(self,
         return eval_results

     def progressive_evaluate(self,
-                             results,
+                             processor,
                              metric='mIoU',
                              logger=None,
                              **kwargs):
         """Evaluate the dataset.

         Args:
-            results (list): Testing results of the dataset.
+            processor (object): The result processor for progressive mode.
             metric (str | list[str]): Metrics to be evaluated. 'mIoU',
                 'mDice' and 'mFscore' are supported.
             logger (logging.Logger | None | str): Logger used for printing
@@ -430,7 +430,7 @@ def progressive_evaluate(self,
             raise KeyError('metric {} is not supported'.format(metric))
         eval_results = {}

-        ret_metrics = results.calculate(metric)
+        ret_metrics = processor.calculate(metric)

         # Because dataset.CLASSES is required in progressive_single_gpu_test,
         # progressive_multi_gpu_test, so it's necessary to keep
@@ -483,7 +483,4 @@ def progressive_evaluate(self,
             for idx, name in enumerate(class_names)
         })

-        if mmcv.is_list_of(results, str):
-            for file_name in results:
-                os.remove(file_name)
         return eval_results

From 9349970735852c3a940c399ffe23ef0d24f3c815 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 22 Jul 2021 23:17:37 +0800
Subject: [PATCH 40/96] Modify some variable name of eval hook.

---
 mmseg/core/evaluation/eval_hooks.py | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py
index 97b2de4d42..132b34ff88 100644
--- a/mmseg/core/evaluation/eval_hooks.py
+++ b/mmseg/core/evaluation/eval_hooks.py
@@ -13,27 +13,24 @@ class EvalHook(_EvalHook):
         by_epoch (bool): Determine perform evaluation by epoch or by
             iteration. If set to True, it will perform by epoch. Otherwise,
             by iteration. Default: False.
-        efficient_test (bool): Whether save the results as local numpy files to
-            save CPU memory during evaluation. Default: False.
     Returns:
         list: The prediction results.
     """

     greater_keys = ['mIoU', 'mAcc', 'aAcc']

-    def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs):
+    def __init__(self, *args, by_epoch=False, **kwargs):
         super().__init__(*args, by_epoch=by_epoch, **kwargs)
-        self.efficient_test = efficient_test

-    def progressive_evaluate(self, runner, results):
+    def progressive_evaluate(self, runner, processor):
         """Evaluate the results by progressive mode.

         Args:
             runner (:obj:`mmcv.Runner`): The underlined training runner.
-            results (list): Output results.
+            processor (object): Output processor.
""" eval_res = self.dataloader.dataset.progressive_evaluate( - results, logger=runner.logger, **self.eval_kwargs) + processor, logger=runner.logger, **self.eval_kwargs) # TODO: Blocked by mmcv pr: #1213 # evaluation info specific buffer @@ -59,10 +56,10 @@ def _do_evaluate(self, runner): return from mmseg.apis import progressive_single_gpu_test - results = progressive_single_gpu_test( + processor = progressive_single_gpu_test( runner.model, self.dataloader, False, show=False) runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - key_score = self.progressive_evaluate(runner, results) + key_score = self.progressive_evaluate(runner, processor) if self.save_best: self._save_ckpt(runner, key_score) @@ -74,8 +71,6 @@ class DistEvalHook(_DistEvalHook): by_epoch (bool): Determine perform evaluation by epoch or by iteration. If set to True, it will perform by epoch. Otherwise, by iteration. Default: False. - efficient_test (bool): Whether save the results as local numpy files to - save CPU memory during evaluation. Default: False. Returns: list: The prediction results. """ @@ -85,15 +80,15 @@ class DistEvalHook(_DistEvalHook): def __init__(self, *args, by_epoch=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) - def progressive_evaluate(self, runner, results): + def progressive_evaluate(self, runner, processor): """Evaluate the results by progressive mode. Args: runner (:obj:`mmcv.Runner`): The underlined training runner. - results (list): Output results. + processor (object): Output processor. """ eval_res = self.dataloader.dataset.progressive_evaluate( - results, logger=runner.logger, **self.eval_kwargs) + processor, logger=runner.logger, **self.eval_kwargs) # TODO: Blocked by mmcv pr: #1213 # evaluation info specific buffer # runner.log_buffer.output['eval_res'] = {} @@ -135,7 +130,7 @@ def _do_evaluate(self, runner): tmpdir = osp.join(runner.work_dir, '.eval_hook') from mmseg.apis import progressive_multi_gpu_test - results = progressive_multi_gpu_test( + processor = progressive_multi_gpu_test( runner.model, self.dataloader, False, @@ -144,7 +139,7 @@ def _do_evaluate(self, runner): if runner.rank == 0: print('\n') runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - key_score = self.progressive_evaluate(runner, results) + key_score = self.progressive_evaluate(runner, processor) if self.save_best: self._save_ckpt(runner, key_score) From 3a3b3ec563f159425193b3bfff1e57fd48773563 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Thu, 22 Jul 2021 23:58:04 +0800 Subject: [PATCH 41/96] Fix some priority bugs of eval hook. 
--- mmseg/apis/train.py | 3 ++- mmseg/core/evaluation/eval_hooks.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mmseg/apis/train.py b/mmseg/apis/train.py index 5f526df2b0..49c4be0f7e 100644 --- a/mmseg/apis/train.py +++ b/mmseg/apis/train.py @@ -107,7 +107,8 @@ def train_segmentor(model, eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook - runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + runner.register_hook( + eval_hook(val_dataloader, **eval_cfg), priority='LOW') if cfg.resume_from: runner.resume(cfg.resume_from) diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py index 132b34ff88..b2895c0c15 100644 --- a/mmseg/core/evaluation/eval_hooks.py +++ b/mmseg/core/evaluation/eval_hooks.py @@ -38,6 +38,7 @@ def progressive_evaluate(self, runner, processor): # for name, val in eval_res.items(): # runner.log_buffer.output['eval_res'][name] = val runner.log_buffer.clear() + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) for name, val in eval_res.items(): runner.log_buffer.output[name] = val runner.log_buffer.ready = True @@ -58,7 +59,6 @@ def _do_evaluate(self, runner): from mmseg.apis import progressive_single_gpu_test processor = progressive_single_gpu_test( runner.model, self.dataloader, False, show=False) - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.progressive_evaluate(runner, processor) if self.save_best: self._save_ckpt(runner, key_score) @@ -95,6 +95,7 @@ def progressive_evaluate(self, runner, processor): # for name, val in eval_res.items(): # runner.log_buffer.output['eval_res'][name] = val runner.log_buffer.clear() + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) for name, val in eval_res.items(): runner.log_buffer.output[name] = val runner.log_buffer.ready = True @@ -138,7 +139,6 @@ def _do_evaluate(self, runner): gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.progressive_evaluate(runner, processor) if self.save_best: From 1475f8014319f8ac2b1d8073437dce6d04c03193 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 24 Jul 2021 21:28:26 +0800 Subject: [PATCH 42/96] Fix some bugs about model loading and eval hook. 
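The model-loading half of this fix is the head rename below: decode heads are
looked up in the registry by their class-name string, so configs written
against the old 'SegFormerHead' spelling fail to build. A sketch of the lookup
that is affected (arguments abridged; the exact keyword set depends on the
head's current signature):

    from mmseg.models import HEADS

    head = HEADS.build(
        dict(
            type='SegformerHead',  # was 'SegFormerHead'; a mismatched string
            in_channels=[32, 64, 160, 256],  # raises KeyError in the registry
            in_index=[0, 1, 2, 3],
            channels=256,
            num_classes=150))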
---
 mmseg/apis/train.py                   | 3 ++-
 mmseg/models/decode_heads/__init__.py | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/mmseg/apis/train.py b/mmseg/apis/train.py
index 5f526df2b0..49c4be0f7e 100644
--- a/mmseg/apis/train.py
+++ b/mmseg/apis/train.py
@@ -107,7 +107,8 @@ def train_segmentor(model,
     eval_cfg = cfg.get('evaluation', {})
     eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
     eval_hook = DistEvalHook if distributed else EvalHook
-    runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
+    runner.register_hook(
+        eval_hook(val_dataloader, **eval_cfg), priority='LOW')

     if cfg.resume_from:
         runner.resume(cfg.resume_from)
diff --git a/mmseg/models/decode_heads/__init__.py b/mmseg/models/decode_heads/__init__.py
index 5872f69ae2..5b64125056 100644
--- a/mmseg/models/decode_heads/__init__.py
+++ b/mmseg/models/decode_heads/__init__.py
@@ -16,7 +16,7 @@
 from .point_head import PointHead
 from .psa_head import PSAHead
 from .psp_head import PSPHead
-from .segformer_head import SegFormerHead
+from .segformer_head import SegformerHead
 from .sep_aspp_head import DepthwiseSeparableASPPHead
 from .sep_fcn_head import DepthwiseSeparableFCNHead
 from .setr_mla_head import SETRMLAHead
@@ -28,5 +28,5 @@
     'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead',
     'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead',
     'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'SETRUPHead',
-    'SETRMLAHead', 'SegFormerHead'
+    'SETRMLAHead', 'SegformerHead'
 ]

From 35503a925e4ab26c1eda91027b6bb34b3218b641 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Sat, 24 Jul 2021 21:29:00 +0800
Subject: [PATCH 43/96] Add ade20k 640x640 dataset.

---
 configs/_base_/datasets/ade20k_640x640.py     | 54 +++++++++++++++++++
 .../_base_/datasets/ade20k_aligned_640x640.py |  2 +-
 2 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 configs/_base_/datasets/ade20k_640x640.py

diff --git a/configs/_base_/datasets/ade20k_640x640.py b/configs/_base_/datasets/ade20k_640x640.py
new file mode 100644
index 0000000000..58d36e4a38
--- /dev/null
+++ b/configs/_base_/datasets/ade20k_640x640.py
@@ -0,0 +1,54 @@
+# dataset settings
+dataset_type = 'ADE20KDataset'
+data_root = 'data/ade/ADEChallengeData2016'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (640, 640)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(type='Resize', img_scale=(2048, 640), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 640),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='images/training',
+        ann_dir='annotations/training',
+        pipeline=train_pipeline),
+    val=dict(
+ 
type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline)) diff --git a/configs/_base_/datasets/ade20k_aligned_640x640.py b/configs/_base_/datasets/ade20k_aligned_640x640.py index 1d65d9a92f..19a9061397 100644 --- a/configs/_base_/datasets/ade20k_aligned_640x640.py +++ b/configs/_base_/datasets/ade20k_aligned_640x640.py @@ -31,7 +31,7 @@ ]) ] data = dict( - samples_per_gpu=2, + samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, From ef7b2cabc069cc29e8f4b0b6997220883aa0f168 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 24 Jul 2021 21:29:35 +0800 Subject: [PATCH 44/96] Fix related segformer configs. --- configs/_base_/models/segformer_mit-b0.py | 1 - configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py | 2 ++ configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py | 2 +- configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py | 2 +- configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py | 2 +- configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py | 2 +- configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py | 2 +- 7 files changed, 7 insertions(+), 6 deletions(-) diff --git a/configs/_base_/models/segformer_mit-b0.py b/configs/_base_/models/segformer_mit-b0.py index 62e32e7610..5af1c08494 100644 --- a/configs/_base_/models/segformer_mit-b0.py +++ b/configs/_base_/models/segformer_mit-b0.py @@ -22,7 +22,6 @@ type='SegformerHead', in_channels=[32, 64, 160, 256], in_index=[0, 1, 2, 3], - feature_strides=[4, 8, 16, 32], channels=256, dropout_ratio=0.1, num_classes=19, diff --git a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py index b9ca1eb7c7..214d2f5662 100644 --- a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py @@ -4,6 +4,8 @@ '../_base_/schedules/schedule_160k.py' ] +model = dict(decode_head=dict(num_classes=150)) + # optimizer optimizer = dict( _delete_=True, diff --git a/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py index 0f655ba603..0c6eab4f72 100644 --- a/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py @@ -9,7 +9,7 @@ pretrained='pretrain/mit_b1.pth', backbone=dict( embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[2, 2, 2, 2]), - decode_head=dict(in_channels=[64, 128, 320, 512])) + decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) # optimizer optimizer = dict( diff --git a/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py index caef2197fd..6123ec8968 100644 --- a/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py @@ -9,7 +9,7 @@ pretrained='pretrain/mit_b2.pth', backbone=dict( embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 4, 6, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) + decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) # optimizer optimizer = dict( diff --git a/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py index 4f8269c76b..7e8cacddbc 
100644
--- a/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py
+++ b/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py
@@ -9,7 +9,7 @@
     pretrained='pretrain/mit_b3.pth',
     backbone=dict(
         embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 4, 18, 3]),
-    decode_head=dict(in_channels=[64, 128, 320, 512]))
+    decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150))

 # optimizer
 optimizer = dict(
diff --git a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py
index 8b765b1dad..afa6209727 100644
--- a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py
+++ b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py
@@ -11,7 +11,7 @@
         embed_dims=[64, 128, 320, 512],
         num_heads=[1, 2, 5, 8],
         num_layers=[3, 8, 27, 3]),
-    decode_head=dict(in_channels=[64, 128, 320, 512]))
+    decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150))

 # optimizer
 optimizer = dict(
diff --git a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py
index ea437142de..56dce70bad 100644
--- a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py
+++ b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py
@@ -9,7 +9,7 @@
     pretrained='pretrain/mit_b5.pth',
     backbone=dict(
         embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 6, 40, 3]),
-    decode_head=dict(in_channels=[64, 128, 320, 512]))
+    decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150))

 # optimizer
 optimizer = dict(

From e03ee571092ec6c7c016687bb93e6dbb248228c5 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Sat, 24 Jul 2021 22:02:41 +0800
Subject: [PATCH 45/96] Deprecated efficient_test.

---
 tools/deploy_test.py | 16 +++++++++-------
 tools/test.py        | 26 ++++++++++----------------
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/tools/deploy_test.py b/tools/deploy_test.py
index bef3512d71..a3b2e5f320 100644
--- a/tools/deploy_test.py
+++ b/tools/deploy_test.py
@@ -227,24 +227,26 @@ def main():
     model.CLASSES = dataset.CLASSES
     model.PALETTE = dataset.PALETTE

-    efficient_test = False
+    middle_save = False
     if args.eval_options is not None:
-        efficient_test = args.eval_options.get('efficient_test', False)
+        middle_save = args.eval_options.get('middle_save', False)

     model = MMDataParallel(model, device_ids=[0])
-    outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
-                              efficient_test, args.opacity)
+    processor = single_gpu_test(model, data_loader, args.show, args.show_dir,
+                                middle_save, args.opacity)

     rank, _ = get_dist_info()
     if rank == 0:
         if args.out:
             print(f'\nwriting results to {args.out}')
-            mmcv.dump(outputs, args.out)
+            mmcv.dump(processor.retrieval(), args.out)
         kwargs = {} if args.eval_options is None else args.eval_options
         if args.format_only:
-            dataset.format_results(outputs, **kwargs)
+            assert middle_save, ('`--format-only` is only valid when '
+                                 '`middle_save` is True.')
+            dataset.format_results(processor.retrieval(), **kwargs)
         if args.eval:
-            dataset.evaluate(outputs, args.eval, **kwargs)
+            dataset.evaluate(processor, args.eval, **kwargs)


 if __name__ == '__main__':
diff --git a/tools/test.py b/tools/test.py
index d0cd266a70..416ccac256 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -8,7 +8,7 @@
                          wrap_fp16_model)
 from mmcv.utils import DictAction

-from mmseg.apis import progressive_multi_gpu_test, progressive_single_gpu_test
+from mmseg.apis import multi_gpu_test, single_gpu_test
 from mmseg.datasets import build_dataloader, build_dataset
 from mmseg.models import build_segmentor

@@ -135,10 +135,8 @@ def main():
         print('"PALETTE" not found in meta, use dataset.PALETTE instead')
         model.PALETTE = dataset.PALETTE

-    # efficient_test = False
     middle_save = False
     if args.eval_options is not None:
-        # efficient_test = args.eval_options.get('efficient_test', False)
         middle_save = args.eval_options.get('middle_save', False)

     # clean gpu memory when starting a new evaluation.
@@ -146,32 +144,28 @@ def main():

     if not distributed:
         model = MMDataParallel(model, device_ids=[0])
-        outputs = progressive_single_gpu_test(model, data_loader, middle_save,
-                                              args.show, args.show_dir,
-                                              args.opacity)
-        # outputs = single_gpu_test(model, data_loader, args.show,
-        #                           args.show_dir, efficient_test,
-        #                           args.opacity)
+        processor = single_gpu_test(model, data_loader, middle_save, args.show,
+                                    args.show_dir, args.opacity)
     else:
         model = MMDistributedDataParallel(
             model.cuda(),
             device_ids=[torch.cuda.current_device()],
             broadcast_buffers=False)
-        outputs = progressive_multi_gpu_test(model, data_loader, middle_save,
-                                             args.tmpdir, args.gpu_collect)
-        # outputs = multi_gpu_test(model, data_loader, args.tmpdir,
-        #                          args.gpu_collect, efficient_test)
+        processor = multi_gpu_test(model, data_loader, middle_save,
+                                   args.tmpdir, args.gpu_collect)

     rank, _ = get_dist_info()
     if rank == 0:
         if args.out:
             print(f'\nwriting results to {args.out}')
-            mmcv.dump(outputs, args.out)
+            mmcv.dump(processor.retrieval(), args.out)
         kwargs = {} if args.eval_options is None else args.eval_options
         if args.format_only:
-            dataset.progressive_format_results(outputs, **kwargs)
+            assert middle_save, ('`--format-only` is only valid when '
+                                 '`middle_save` is True.')
+            dataset.format_results(processor.retrieval(), **kwargs)
         if args.eval:
-            dataset.progressive_evaluate(outputs, args.eval, **kwargs)
+            dataset.evaluate(processor, args.eval, **kwargs)


 if __name__ == '__main__':

From c3c491d142bdeea740aa040ef75b4c6c47d7b2f9 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Sat, 24 Jul 2021 22:03:11 +0800
Subject: [PATCH 46/96] Fix training progress blocked by eval hook.

---
 mmseg/core/evaluation/eval_hooks.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py
index b2895c0c15..f48e5dfaaa 100644
--- a/mmseg/core/evaluation/eval_hooks.py
+++ b/mmseg/core/evaluation/eval_hooks.py
@@ -22,14 +22,14 @@ class EvalHook(_EvalHook):
     def __init__(self, *args, by_epoch=False, **kwargs):
         super().__init__(*args, by_epoch=by_epoch, **kwargs)

-    def progressive_evaluate(self, runner, processor):
+    def evaluate(self, runner, processor):
         """Evaluate the results by progressive mode.

         Args:
             runner (:obj:`mmcv.Runner`): The underlined training runner.
             processor (object): Output processor.
""" - eval_res = self.dataloader.dataset.progressive_evaluate( + eval_res = self.dataloader.dataset.evaluate( processor, logger=runner.logger, **self.eval_kwargs) # TODO: Blocked by mmcv pr: #1213 @@ -37,7 +37,6 @@ def progressive_evaluate(self, runner, processor): # runner.log_buffer.output['eval_res'] = {} # for name, val in eval_res.items(): # runner.log_buffer.output['eval_res'][name] = val - runner.log_buffer.clear() runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) for name, val in eval_res.items(): runner.log_buffer.output[name] = val @@ -56,10 +55,13 @@ def _do_evaluate(self, runner): if not self._should_evaluate(runner): return - from mmseg.apis import progressive_single_gpu_test - processor = progressive_single_gpu_test( + from mmseg.apis import single_gpu_test + processor = single_gpu_test( runner.model, self.dataloader, False, show=False) - key_score = self.progressive_evaluate(runner, processor) + + runner.log_buffer.clear() + + key_score = self.evaluate(runner, processor) if self.save_best: self._save_ckpt(runner, key_score) @@ -80,21 +82,20 @@ class DistEvalHook(_DistEvalHook): def __init__(self, *args, by_epoch=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) - def progressive_evaluate(self, runner, processor): + def evaluate(self, runner, processor): """Evaluate the results by progressive mode. Args: runner (:obj:`mmcv.Runner`): The underlined training runner. processor (object): Output processor. """ - eval_res = self.dataloader.dataset.progressive_evaluate( + eval_res = self.dataloader.dataset.evaluate( processor, logger=runner.logger, **self.eval_kwargs) # TODO: Blocked by mmcv pr: #1213 # evaluation info specific buffer # runner.log_buffer.output['eval_res'] = {} # for name, val in eval_res.items(): # runner.log_buffer.output['eval_res'][name] = val - runner.log_buffer.clear() runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) for name, val in eval_res.items(): runner.log_buffer.output[name] = val @@ -130,16 +131,19 @@ def _do_evaluate(self, runner): if tmpdir is None: tmpdir = osp.join(runner.work_dir, '.eval_hook') - from mmseg.apis import progressive_multi_gpu_test - processor = progressive_multi_gpu_test( + from mmseg.apis import multi_gpu_test + processor = multi_gpu_test( runner.model, self.dataloader, False, tmpdir=tmpdir, gpu_collect=self.gpu_collect) + + runner.log_buffer.clear() + if runner.rank == 0: print('\n') - key_score = self.progressive_evaluate(runner, processor) + key_score = self.evaluate(runner, processor) if self.save_best: self._save_ckpt(runner, key_score) From 0cb97f456aac0ec4bebac2b9fa49fd017b57e14b Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 24 Jul 2021 22:04:42 +0800 Subject: [PATCH 47/96] Depreciated old test api. 
--- mmseg/apis/__init__.py | 6 +- mmseg/apis/test.py | 170 ++--------------------------------- mmseg/datasets/ade.py | 1 + mmseg/datasets/cityscapes.py | 82 +---------------- mmseg/datasets/custom.py | 109 +--------------------- tests/test_eval_hook.py | 6 +- 6 files changed, 16 insertions(+), 358 deletions(-) diff --git a/mmseg/apis/__init__.py b/mmseg/apis/__init__.py index eada290f58..170724be38 100644 --- a/mmseg/apis/__init__.py +++ b/mmseg/apis/__init__.py @@ -1,11 +1,9 @@ from .inference import inference_segmentor, init_segmentor, show_result_pyplot -from .test import (multi_gpu_test, progressive_multi_gpu_test, - progressive_single_gpu_test, single_gpu_test) +from .test import multi_gpu_test, single_gpu_test from .train import get_root_logger, set_random_seed, train_segmentor __all__ = [ 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor', 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test', - 'show_result_pyplot', 'progressive_single_gpu_test', - 'progressive_multi_gpu_test' + 'show_result_pyplot' ] diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py index 19cecabd7a..083fe1310c 100644 --- a/mmseg/apis/test.py +++ b/mmseg/apis/test.py @@ -4,178 +4,20 @@ import tempfile import mmcv -import numpy as np import torch import torch.distributed as dist -from mmcv.engine import collect_results_cpu, collect_results_gpu from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info from mmseg.core.evaluation.metrics import ResultProcessor -def np2tmp(array, temp_file_name=None, tmpdir=None): - """Save ndarray to local numpy file. - - Args: - array (ndarray): Ndarray to save. - temp_file_name (str): Numpy file name. If 'temp_file_name=None', this - function will generate a file name with tempfile.NamedTemporaryFile - to save ndarray. Default: None. - tmpdir (str): Temporary directory to save Ndarray files. Default: None. - - Returns: - str: The numpy file name. - """ - - if temp_file_name is None: - temp_file_name = tempfile.NamedTemporaryFile( - suffix='.npy', delete=False, dir=tmpdir).name - np.save(temp_file_name, array) - return temp_file_name - - def single_gpu_test(model, data_loader, + middle_save=False, show=False, out_dir=None, - efficient_test=False, opacity=0.5): - """Test with single GPU. - - Args: - model (nn.Module): Model to be tested. - data_loader (utils.data.Dataloader): Pytorch data loader. - show (bool): Whether show results during inference. Default: False. - out_dir (str, optional): If specified, the results will be dumped into - the directory to save output results. - efficient_test (bool, optional): Whether save the results as local - numpy files to save CPU memory during evaluation. Default: False. - opacity(float): Opacity of painted segmentation map. - Default 0.5. - Must be in (0, 1] range. - Returns: - list: The prediction results. 
- """ - - model.eval() - results = [] - dataset = data_loader.dataset - prog_bar = mmcv.ProgressBar(len(dataset)) - if efficient_test: - mmcv.mkdir_or_exist('.efficient_test') - for i, data in enumerate(data_loader): - with torch.no_grad(): - result = model(return_loss=False, **data) - - if show or out_dir: - img_tensor = data['img'][0] - img_metas = data['img_metas'][0].data[0] - imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) - assert len(imgs) == len(img_metas) - - for img, img_meta in zip(imgs, img_metas): - h, w, _ = img_meta['img_shape'] - img_show = img[:h, :w, :] - - ori_h, ori_w = img_meta['ori_shape'][:-1] - img_show = mmcv.imresize(img_show, (ori_w, ori_h)) - - if out_dir: - out_file = osp.join(out_dir, img_meta['ori_filename']) - else: - out_file = None - - model.module.show_result( - img_show, - result, - palette=dataset.PALETTE, - show=show, - out_file=out_file, - opacity=opacity) - - if isinstance(result, list): - if efficient_test: - result = [np2tmp(_, tmpdir='.efficient_test') for _ in result] - results.extend(result) - else: - if efficient_test: - result = np2tmp(result, tmpdir='.efficient_test') - results.append(result) - - batch_size = len(result) - for _ in range(batch_size): - prog_bar.update() - return results - - -def multi_gpu_test(model, - data_loader, - tmpdir=None, - gpu_collect=False, - efficient_test=False): - """Test model with multiple gpus. - - This method tests model with multiple gpus and collects the results - under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' - it encodes results to gpu tensors and use gpu communication for results - collection. On cpu mode it saves the results on different gpus to 'tmpdir' - and collects them by the rank 0 worker. - - Args: - model (nn.Module): Model to be tested. - data_loader (utils.data.Dataloader): Pytorch data loader. - tmpdir (str): Path of directory to save the temporary results from - different gpus under cpu mode. The same path is used for efficient - test. - gpu_collect (bool): Option to use either gpu or cpu to collect results. - efficient_test (bool): Whether save the results as local numpy files to - save CPU memory during evaluation. Default: False. - - Returns: - list: The prediction results. - """ - - model.eval() - results = [] - dataset = data_loader.dataset - rank, world_size = get_dist_info() - if rank == 0: - prog_bar = mmcv.ProgressBar(len(dataset)) - if efficient_test: - mmcv.mkdir_or_exist('.efficient_test') - for i, data in enumerate(data_loader): - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - - if isinstance(result, list): - if efficient_test: - result = [np2tmp(_, tmpdir='.efficient_test') for _ in result] - results.extend(result) - else: - if efficient_test: - result = np2tmp(result, tmpdir='.efficient_test') - results.append(result) - - if rank == 0: - batch_size = len(result) - for _ in range(batch_size * world_size): - prog_bar.update() - - # collect results from all ranks - if gpu_collect: - results = collect_results_gpu(results, len(dataset)) - else: - results = collect_results_cpu(results, len(dataset), tmpdir) - return results - - -def progressive_single_gpu_test(model, - data_loader, - middle_save=False, - show=False, - out_dir=None, - opacity=0.5): """Test with single GPU by progressive mode. 
Args: @@ -255,11 +97,11 @@ def progressive_single_gpu_test(model, return processor -def progressive_multi_gpu_test(model, - data_loader, - middle_save=False, - tmpdir=None, - gpu_collect=False): +def multi_gpu_test(model, + data_loader, + middle_save=False, + tmpdir=None, + gpu_collect=False): """Test model with multiple gpus by progressive mode. This method tests model with multiple gpus and collects the results diff --git a/mmseg/datasets/ade.py b/mmseg/datasets/ade.py index 5daf7e3731..942b37adaa 100644 --- a/mmseg/datasets/ade.py +++ b/mmseg/datasets/ade.py @@ -130,6 +130,7 @@ def results2img(self, results, imgfile_prefix, to_label_id): return result_files + # TODO: Use processor to format results. def format_results(self, results, imgfile_prefix=None, to_label_id=True): """Format the results into dir (standard format for ade20k evaluation). diff --git a/mmseg/datasets/cityscapes.py b/mmseg/datasets/cityscapes.py index 3310886818..5d1c142890 100644 --- a/mmseg/datasets/cityscapes.py +++ b/mmseg/datasets/cityscapes.py @@ -88,6 +88,7 @@ def results2img(self, results, imgfile_prefix, to_label_id): return result_files + # TODO: Use processor to format results. def format_results(self, results, imgfile_prefix=None, to_label_id=True): """Format the results into dir (standard format for Cityscapes evaluation). @@ -125,84 +126,7 @@ def evaluate(self, results, metric='mIoU', logger=None, - imgfile_prefix=None, - efficient_test=False): - """Evaluation in Cityscapes/default protocol. - - Args: - results (list): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. - logger (logging.Logger | None | str): Logger used for printing - related information during evaluation. Default: None. - imgfile_prefix (str | None): The prefix of output image file, - for cityscapes evaluation only. It includes the file path and - the prefix of filename, e.g., "a/b/prefix". - If results are evaluated with cityscapes protocol, it would be - the prefix of output png files. The output files would be - png images under folder "a/b/prefix/xxx.png", where "xxx" is - the image name of cityscapes. If not specified, a temp file - will be created for evaluation. - Default: None. - - Returns: - dict[str, float]: Cityscapes/default metrics. - """ - - eval_results = dict() - metrics = metric.copy() if isinstance(metric, list) else [metric] - if 'cityscapes' in metrics: - eval_results.update( - self._evaluate_cityscapes(results, logger, imgfile_prefix)) - metrics.remove('cityscapes') - if len(metrics) > 0: - eval_results.update( - super(CityscapesDataset, - self).evaluate(results, metrics, logger, efficient_test)) - - return eval_results - - # TODO: Use processor to format results. - def progressive_format_results(self, - results, - imgfile_prefix=None, - to_label_id=True): - """Format the results into dir (standard format for Cityscapes - evaluation). - - Args: - results (list): Testing results of the dataset. - imgfile_prefix (str | None): The prefix of images files. It - includes the file path and the prefix of filename, e.g., - "a/b/prefix". If not specified, a temp file will be created. - Default: None. - to_label_id (bool): whether convert output to label_id for - submission. Default: False - - Returns: - tuple: (result_files, tmp_dir), result_files is a list containing - the image paths, tmp_dir is the temporal directory created - for saving json/png files when img_prefix is not specified. 
- """ - - assert isinstance(results, list), 'results must be a list' - assert len(results) == len(self), ( - 'The length of results is not equal to the dataset len: ' - f'{len(results)} != {len(self)}') - - if imgfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - imgfile_prefix = tmp_dir.name - else: - tmp_dir = None - result_files = self.results2img(results, imgfile_prefix, to_label_id) - - return result_files, tmp_dir - - def progressive_evaluate(self, - results, - metric='mIoU', - logger=None, - imgfile_prefix=None): + imgfile_prefix=None): """Evaluation in Cityscapes/default protocol. Args: @@ -233,7 +157,7 @@ def progressive_evaluate(self, if len(metrics) > 0: eval_results.update( super(CityscapesDataset, - self).progressive_evaluate(results, metrics, logger)) + self).evaluate(results, metrics, logger)) return eval_results diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py index e4c357b8d0..9ba6d4e011 100644 --- a/mmseg/datasets/custom.py +++ b/mmseg/datasets/custom.py @@ -1,7 +1,5 @@ -import os import os.path as osp from collections import Iterable, OrderedDict -from functools import reduce import mmcv import numpy as np @@ -9,7 +7,6 @@ from prettytable import PrettyTable from torch.utils.data import Dataset -from mmseg.core.evaluation import eval_metrics from mmseg.utils import get_root_logger from .builder import DATASETS from .pipelines import Compose @@ -310,107 +307,7 @@ def get_palette_for_custom_classes(self, class_names, palette=None): return palette - def evaluate(self, - results, - metric='mIoU', - logger=None, - efficient_test=False, - **kwargs): - """Evaluate the dataset. - - Args: - results (list): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. 'mIoU', - 'mDice' and 'mFscore' are supported. - logger (logging.Logger | None | str): Logger used for printing - related information during evaluation. Default: None. - - Returns: - dict[str, float]: Default metrics. 
- """ - - if isinstance(metric, str): - metric = [metric] - allowed_metrics = ['mIoU', 'mDice', 'mFscore'] - if not set(metric).issubset(set(allowed_metrics)): - raise KeyError('metric {} is not supported'.format(metric)) - eval_results = {} - gt_seg_maps = self.get_gt_seg_maps(efficient_test) - if self.CLASSES is None: - num_classes = len( - reduce(np.union1d, [np.unique(_) for _ in gt_seg_maps])) - else: - num_classes = len(self.CLASSES) - ret_metrics = eval_metrics( - results, - gt_seg_maps, - num_classes, - self.ignore_index, - metric, - label_map=self.label_map, - reduce_zero_label=self.reduce_zero_label) - - if self.CLASSES is None: - class_names = tuple(range(num_classes)) - else: - class_names = self.CLASSES - - # summary table - ret_metrics_summary = OrderedDict({ - ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2) - for ret_metric, ret_metric_value in ret_metrics.items() - }) - - # each class table - ret_metrics.pop('aAcc', None) - ret_metrics_class = OrderedDict({ - ret_metric: np.round(ret_metric_value * 100, 2) - for ret_metric, ret_metric_value in ret_metrics.items() - }) - ret_metrics_class.update({'Class': class_names}) - ret_metrics_class.move_to_end('Class', last=False) - - # for logger - class_table_data = PrettyTable() - for key, val in ret_metrics_class.items(): - class_table_data.add_column(key, val) - - summary_table_data = PrettyTable() - for key, val in ret_metrics_summary.items(): - if key == 'aAcc': - summary_table_data.add_column(key, [val]) - else: - summary_table_data.add_column('m' + key, [val]) - - print_log('per class results:', logger) - print_log('\n' + class_table_data.get_string(), logger=logger) - print_log('Summary:', logger) - print_log('\n' + summary_table_data.get_string(), logger=logger) - - # each metric dict - for key, value in ret_metrics_summary.items(): - if key == 'aAcc': - eval_results[key] = value / 100.0 - else: - eval_results['m' + key] = value / 100.0 - - ret_metrics_class.pop('Class', None) - for key, value in ret_metrics_class.items(): - eval_results.update({ - key + '.' + str(name): value[idx] / 100.0 - for idx, name in enumerate(class_names) - }) - - if mmcv.is_list_of(results, str): - for file_name in results: - os.remove(file_name) - return eval_results - - def progressive_evaluate(self, - processor, - metric='mIoU', - logger=None, - **kwargs): + def evaluate(self, processor, metric='mIoU', logger=None, **kwargs): """Evaluate the dataset. Args: @@ -432,8 +329,8 @@ def progressive_evaluate(self, eval_results = {} ret_metrics = processor.calculate(metric) - # Because dataset.CLASSES is required in progressive_single_gpu_test, - # progressive_multi_gpu_test, so it's necessary to keep + # Because dataset.CLASSES is required in single_gpu_test, + # multi_gpu_test, so it's necessary to keep # dataset.CLASSES. class_names = self.CLASSES diff --git a/tests/test_eval_hook.py b/tests/test_eval_hook.py index 394051b0ba..c83623de0c 100644 --- a/tests/test_eval_hook.py +++ b/tests/test_eval_hook.py @@ -112,11 +112,7 @@ def test_epoch_eval_hook(): logger=runner.logger) -def multi_gpu_test(model, - data_loader, - tmpdir=None, - gpu_collect=False, - efficient_test=False): +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): results = single_gpu_test(model, data_loader) return results From ba6c9ff7c4839657388746c6f0bf226370cdf0e8 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 24 Jul 2021 22:35:43 +0800 Subject: [PATCH 48/96] Modify error patch size. 
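With stride 4 in the first stage (stride 2 afterwards) and padding of
patch_size // 2, an odd kernel keeps the overlap patch embedding output at
exactly H / stride, while the old even kernel of 4 lands one row and column
off. A quick check with the standard conv output formula (sketch, assuming the
embedding pads by patch_size // 2):

    def out_size(h, k, s):
        pad = k // 2
        return (h + 2 * pad - k) // s + 1

    assert out_size(128, 7, 4) == 32  # stage 1: exactly H / 4
    assert out_size(32, 3, 2) == 16   # later stages: exactly H / 2
    assert out_size(32, 4, 2) == 17   # k=4, pad=2: off by one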
--- configs/_base_/models/segformer_mit-b0.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/_base_/models/segformer_mit-b0.py b/configs/_base_/models/segformer_mit-b0.py index 5af1c08494..5b3e07331d 100644 --- a/configs/_base_/models/segformer_mit-b0.py +++ b/configs/_base_/models/segformer_mit-b0.py @@ -10,7 +10,7 @@ num_stages=4, num_layers=[2, 2, 2, 2], num_heads=[1, 2, 5, 8], - patch_sizes=[7, 4, 4, 4], + patch_sizes=[7, 3, 3, 3], sr_ratios=[8, 4, 2, 1], out_indices=(0, 1, 2, 3), mlp_ratio=4, From a631e76488e4953797b8c653eb1679e35381c757 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sun, 25 Jul 2021 00:20:43 +0800 Subject: [PATCH 49/96] Fix pretrain of mit_b0 --- configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py index 214d2f5662..65e2bd36ab 100644 --- a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py @@ -4,7 +4,8 @@ '../_base_/schedules/schedule_160k.py' ] -model = dict(decode_head=dict(num_classes=150)) +model = dict( + pretrained='pretrain/mit_b0.pth', decode_head=dict(num_classes=150)) # optimizer optimizer = dict( From 543b1ab1d62612d89b8122d3f6ca435cf5a4c4ef Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sun, 25 Jul 2021 16:21:52 +0800 Subject: [PATCH 50/96] Fix the test api error. --- mmseg/apis/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py index 083fe1310c..13fac960a7 100644 --- a/mmseg/apis/test.py +++ b/mmseg/apis/test.py @@ -165,7 +165,7 @@ def multi_gpu_test(model, for _ in range(total_samples): prog_bar.update() - cur += len(result) * world_size + cur += len(result) * world_size # collect results from all ranks if gpu_collect: From d5fd29a17bbe284c1ba884cc7e76085ac6a7df84 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sun, 25 Jul 2021 16:34:40 +0800 Subject: [PATCH 51/96] Modify dataset base config. 
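The SegFormer configs now inherit the stock ade20k.py base instead of the
aligned variants, so test-time preprocessing follows the standard keep-ratio
resize, the same pipeline as in the ade20k_640x640.py file added in an earlier
patch, e.g.:

    transforms=[
        dict(type='Resize', keep_ratio=True),
        dict(type='RandomFlip'),
        dict(type='Normalize', **img_norm_cfg),
        dict(type='ImageToTensor', keys=['img']),
        dict(type='Collect', keys=['img']),
    ]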
--- configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py | 5 ++--- configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py | 5 ++--- configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py | 5 ++--- configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py | 5 ++--- configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py | 5 ++--- configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py | 5 ++--- 6 files changed, 12 insertions(+), 18 deletions(-) diff --git a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py index 65e2bd36ab..d374a4f8a2 100644 --- a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py @@ -1,7 +1,6 @@ _base_ = [ - '../_base_/models/segformer_mit-b0.py', - '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' + '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] model = dict( diff --git a/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py index 0c6eab4f72..244af5ea54 100644 --- a/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py @@ -1,7 +1,6 @@ _base_ = [ - '../_base_/models/segformer_mit-b0.py', - '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' + '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] # model settings diff --git a/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py index 6123ec8968..1f2c07453b 100644 --- a/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py @@ -1,7 +1,6 @@ _base_ = [ - '../_base_/models/segformer_mit-b0.py', - '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' + '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] # model settings diff --git a/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py index 7e8cacddbc..70ece5eb13 100644 --- a/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py @@ -1,7 +1,6 @@ _base_ = [ - '../_base_/models/segformer_mit-b0.py', - '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' + '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] # model settings diff --git a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py index afa6209727..c0f6c0a072 100644 --- a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py @@ -1,7 +1,6 @@ _base_ = [ - '../_base_/models/segformer_mit-b0.py', - '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' + 
'../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] # model settings diff --git a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py index 56dce70bad..e63dfef290 100644 --- a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py @@ -1,7 +1,6 @@ _base_ = [ - '../_base_/models/segformer_mit-b0.py', - '../_base_/datasets/ade20k_aligned.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' + '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] # model settings From 25943c8fa1e93964ff407a89d786f487b6a909fe Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Mon, 26 Jul 2021 16:29:05 +0800 Subject: [PATCH 52/96] Fix test api error. --- mmseg/apis/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py index 083fe1310c..13fac960a7 100644 --- a/mmseg/apis/test.py +++ b/mmseg/apis/test.py @@ -165,7 +165,7 @@ def multi_gpu_test(model, for _ in range(total_samples): prog_bar.update() - cur += len(result) * world_size + cur += len(result) * world_size # collect results from all ranks if gpu_collect: From be13436ef6cb8476a4730701d227ae682303be82 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Tue, 27 Jul 2021 12:00:27 +0800 Subject: [PATCH 53/96] Modify outer api. --- mmseg/core/evaluation/eval_hooks.py | 19 ++++++++++--------- tools/deploy_test.py | 24 ++++++++++-------------- tools/test.py | 28 ++++++++++++---------------- 3 files changed, 32 insertions(+), 39 deletions(-) diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py index f48e5dfaaa..5529088a7a 100644 --- a/mmseg/core/evaluation/eval_hooks.py +++ b/mmseg/core/evaluation/eval_hooks.py @@ -22,15 +22,16 @@ class EvalHook(_EvalHook): def __init__(self, *args, by_epoch=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) - def evaluate(self, runner, processor): + def evaluate(self, runner, pre_eval_results): """Evaluate the results by progressive mode. Args: runner (:obj:`mmcv.Runner`): The underlined training runner. - processor (object): Output processor. + pre_eval_results (tuple[torch.Tensor]): per image eval results for + computing evaluation metric """ eval_res = self.dataloader.dataset.evaluate( - processor, logger=runner.logger, **self.eval_kwargs) + pre_eval_results, logger=runner.logger, **self.eval_kwargs) # TODO: Blocked by mmcv pr: #1213 # evaluation info specific buffer @@ -56,12 +57,12 @@ def _do_evaluate(self, runner): return from mmseg.apis import single_gpu_test - processor = single_gpu_test( + pre_eval_results = single_gpu_test( runner.model, self.dataloader, False, show=False) runner.log_buffer.clear() - key_score = self.evaluate(runner, processor) + key_score = self.evaluate(runner, pre_eval_results) if self.save_best: self._save_ckpt(runner, key_score) @@ -82,7 +83,7 @@ class DistEvalHook(_DistEvalHook): def __init__(self, *args, by_epoch=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) - def evaluate(self, runner, processor): + def evaluate(self, runner, pre_eval_results): """Evaluate the results by progressive mode. 
Args:
@@ -90,7 +91,8 @@ def evaluate(self, runner, pre_eval_results):
-            processor (object): Output processor.
+            pre_eval_results (tuple[torch.Tensor]): per image eval results
+                for computing evaluation metric
         """
         eval_res = self.dataloader.dataset.evaluate(
-            processor, logger=runner.logger, **self.eval_kwargs)
+            pre_eval_results, logger=runner.logger, **self.eval_kwargs)
         # TODO: Blocked by mmcv pr: #1213
         # evaluation info specific buffer
         # runner.log_buffer.output['eval_res'] = {}
@@ -132,7 +133,7 @@ def _do_evaluate(self, runner):
             tmpdir = osp.join(runner.work_dir, '.eval_hook')

         from mmseg.apis import multi_gpu_test
-        processor = multi_gpu_test(
+        pre_eval_results = multi_gpu_test(
             runner.model,
             self.dataloader,
             False,
@@ -143,7 +144,7 @@ def _do_evaluate(self, runner):
         if runner.rank == 0:
             print('\n')
-        key_score = self.evaluate(runner, processor)
+        key_score = self.evaluate(runner, pre_eval_results)
         if self.save_best:
             self._save_ckpt(runner, key_score)
diff --git a/tools/deploy_test.py b/tools/deploy_test.py
index a3b2e5f320..32f2a4e121 100644
--- a/tools/deploy_test.py
+++ b/tools/deploy_test.py
@@ -227,26 +227,22 @@ def main():
     model.CLASSES = dataset.CLASSES
     model.PALETTE = dataset.PALETTE

-    middle_save = False
-    if args.eval_options is not None:
-        middle_save = args.eval_options.get('middle_save', False)
-
     model = MMDataParallel(model, device_ids=[0])
-    processor = single_gpu_test(model, data_loader, args.show, args.show_dir,
-                                middle_save, args.opacity)
+    eval_results = single_gpu_test(model, data_loader, args.show,
+                                   args.show_dir, args.opacity)

     rank, _ = get_dist_info()
     if rank == 0:
-        if args.out:
-            print(f'\nwriting results to {args.out}')
-            mmcv.dump(processor.retrieval(), args.out)
+        # TODO: Move this to test api.
+        # if args.out:
+        #     print(f'\nwriting results to {args.out}')
+        #     mmcv.dump(results, args.out)
         kwargs = {} if args.eval_options is None else args.eval_options
-        if args.format_only:
-            assert middle_save, ('`--format-only` is only valid when '
-                                 '`middle_save` is True.')
-            dataset.format_results(processor.retrieval(), **kwargs)
+        # TODO: Move this to test api.
+        # if args.format_only:
+        #     dataset.format_results(results, **kwargs)
         if args.eval:
-            dataset.evaluate(processor, args.eval, **kwargs)
+            dataset.evaluate(eval_results, args.eval, **kwargs)


 if __name__ == '__main__':
diff --git a/tools/test.py b/tools/test.py
index 416ccac256..b74d97d533 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -135,37 +135,33 @@ def main():
         print('"PALETTE" not found in meta, use dataset.PALETTE instead')
         model.PALETTE = dataset.PALETTE

-    middle_save = False
-    if args.eval_options is not None:
-        middle_save = args.eval_options.get('middle_save', False)
-
     # clean gpu memory when starting a new evaluation.
torch.cuda.empty_cache()
     if not distributed:
         model = MMDataParallel(model, device_ids=[0])
-        processor = single_gpu_test(model, data_loader, middle_save, args.show,
-                                    args.show_dir, args.opacity)
+        eval_results = single_gpu_test(model, data_loader, args.show,
+                                       args.show_dir, args.opacity)
     else:
         model = MMDistributedDataParallel(
             model.cuda(),
             device_ids=[torch.cuda.current_device()],
             broadcast_buffers=False)
-        processor = multi_gpu_test(model, data_loader, middle_save,
-                                   args.tmpdir, args.gpu_collect)
+        eval_results = multi_gpu_test(model, data_loader, args.tmpdir,
+                                      args.gpu_collect)

     rank, _ = get_dist_info()
     if rank == 0:
-        if args.out:
-            print(f'\nwriting results to {args.out}')
-            mmcv.dump(processor.retrieval(), args.out)
+        # TODO: Move this to test api
+        # if args.out:
+        #     print(f'\nwriting results to {args.out}')
+        #     mmcv.dump(results, args.out)
         kwargs = {} if args.eval_options is None else args.eval_options
-        if args.format_only:
-            assert middle_save, 'When `middle_save` is True, the '
-            '`--format-only` is valid.'
-            dataset.format_results(processor.retrieval(), **kwargs)
+        # TODO: Move this to test api.
+        # if args.format_only:
+        #     dataset.format_results(results, **kwargs)
         if args.eval:
-            dataset.evaluate(processor, args.eval, **kwargs)
+            dataset.evaluate(eval_results, args.eval, **kwargs)


 if __name__ == '__main__':

From 6bb1c9144a6947ca53da0992f283859a51d3d191 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Tue, 27 Jul 2021 12:02:24 +0800
Subject: [PATCH 54/96] Build a sampler test api.

---
 mmseg/apis/test.py                | 199 +++---------------------------
 mmseg/core/evaluation/__init__.py |   6 +-
 mmseg/core/evaluation/metrics.py  | 133 ++++----------------
 mmseg/datasets/custom.py          |  44 ++++---
 4 files changed, 67 insertions(+), 315 deletions(-)

diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index 13fac960a7..6dab5cd226 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -1,68 +1,39 @@
 import os.path as osp
-import pickle
-import shutil
-import tempfile

 import mmcv
 import torch
-import torch.distributed as dist
+from mmcv.engine import collect_results_cpu, collect_results_gpu
 from mmcv.image import tensor2imgs
 from mmcv.runner import get_dist_info

-from mmseg.core.evaluation.metrics import ResultProcessor

-
-def single_gpu_test(model,
-                    data_loader,
-                    middle_save=False,
-                    show=False,
-                    out_dir=None,
-                    opacity=0.5):
+def single_gpu_test(model, data_loader, show=False, out_dir=None, opacity=0.5):
     """Test with single GPU by progressive mode.

     Args:
         model (nn.Module): Model to be tested.
         data_loader (utils.data.Dataloader): Pytorch data loader.
         show (bool): Whether show results during inference. Default: False.
-        middle_save (bool, optional): Whether to save middle variables when
-            progressive test. Default: False.
         out_dir (str, optional): If specified, the results will be dumped into
             the directory to save output results.
         opacity(float): Opacity of painted segmentation map.
             Default 0.5. Must be in (0, 1] range.
     Returns:
-        object: The processor containing results.
+        list: evaluation preparation results.
""" model.eval() + eval_results = [] dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) - if middle_save: - processor = ResultProcessor( - num_classes=len(dataset.CLASSES), - ignore_index=dataset.ignore_index, - collect_type='seg_map', - label_map=dataset.label_map, - reduce_zero_label=dataset.reduce_zero_label) - else: - processor = ResultProcessor( - num_classes=len(dataset.CLASSES), - ignore_index=dataset.ignore_index, - collect_type='pixels_count', - label_map=dataset.label_map, - reduce_zero_label=dataset.reduce_zero_label) - - gt_maps_generator = dataset.get_gt_seg_maps() + loader_indices = list(data_loader.sampler) - for _, data in enumerate(data_loader): + for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, **data) - # Collect meta to avoid sorting for results collected from multi gpu. - gt_map = next(gt_maps_generator) - meta = data['img_metas'][0].data[0] - processor.collect(result, gt_map, meta) + eval_results.extend(dataset.pre_eval(result, [loader_indices[i]])) if show or out_dir: img_tensor = data['img'][0] @@ -94,14 +65,10 @@ def single_gpu_test(model, for _ in range(batch_size): prog_bar.update() - return processor + return eval_results -def multi_gpu_test(model, - data_loader, - middle_save=False, - tmpdir=None, - gpu_collect=False): +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): """Test model with multiple gpus by progressive mode. This method tests model with multiple gpus and collects the results @@ -113,167 +80,39 @@ def multi_gpu_test(model, Args: model (nn.Module): Model to be tested. data_loader (utils.data.Dataloader): Pytorch data loader. - middle_save (bool, optional): Whether to save middle variables when - progressive test. Default: False. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. The same path is used for efficient test. gpu_collect (bool): Option to use either gpu or cpu to collect results. Returns: - object: The processor containing results + list: evaluation preparetion results. """ model.eval() + eval_results = [] dataset = data_loader.dataset - if middle_save: - processor = ResultProcessor( - num_classes=len(dataset.CLASSES), - ignore_index=dataset.ignore_index, - collect_type='seg_map', - label_map=dataset.label_map, - reduce_zero_label=dataset.reduce_zero_label) - else: - processor = ResultProcessor( - num_classes=len(dataset.CLASSES), - ignore_index=dataset.ignore_index, - collect_type='pixels_count', - label_map=dataset.label_map, - reduce_zero_label=dataset.reduce_zero_label) + loader_indices = data_loader.sampler rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) - cur = 0 - for _, data in enumerate(data_loader): + for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) # TODO: adapt samples_per_gpu > 1. 
# only samples_per_gpu=1 valid now - if (cur + rank) < len(dataset): - gt_seg_map = dataset.index_gt_seg_maps(cur + rank) - meta = data['img_metas'][0].data[0] - processor.collect(result, gt_seg_map, meta) + eval_results.extend(dataset.pre_eval(result, [loader_indices[i]])) if rank == 0: - batch_size = len(result) - if cur + world_size >= len(dataset): - total_samples = len(dataset) - cur - else: - total_samples = batch_size * world_size - for _ in range(total_samples): + batch_size = len(result) * world_size + for _ in range(batch_size): prog_bar.update() - cur += len(result) * world_size - # collect results from all ranks if gpu_collect: - processor = collect_processors_gpu(processor) - else: - processor = collect_processors_cpu(processor, tmpdir) - return processor - - -def collect_processors_gpu(processor): - """Collect result processors under gpu mode. - - On gpu mode, this function will encode results to gpu tensors and use gpu - communication for results collection. - - Args: - processor (object): Result processor containing predictions and labels - to be collected. - Returns: - object: The gathered processor. - """ - rank, world_size = get_dist_info() - # dump result part to tensor with pickle - part_tensor = torch.tensor( - bytearray(pickle.dumps(processor)), dtype=torch.uint8, device='cuda') - # gather all result part tensor shape - shape_tensor = torch.tensor(part_tensor.shape, device='cuda') - shape_list = [shape_tensor.clone() for _ in range(world_size)] - dist.all_gather(shape_list, shape_tensor) - # padding result part tensor to max length - shape_max = torch.tensor(shape_list).max() - part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') - part_send[:shape_tensor[0]] = part_tensor - part_recv_list = [ - part_tensor.new_zeros(shape_max) for _ in range(world_size) - ] - # gather all result part - dist.all_gather(part_recv_list, part_send) - - if rank == 0: - # load results of all parts from tmp dir - main_processor = pickle.loads( - part_recv_list[0][:shape_list[0]].cpu().numpy().tobytes()) - sub_processors = [] - for recv, shape in zip(part_recv_list, shape_list): - part_processor = pickle.loads( - recv[:shape[0]].cpu().numpy().tobytes()) - # When data is severely insufficient, an empty part_result - # on a certain gpu could makes the overall outputs empty. - if part_processor: - sub_processors.append(part_processor) - main_processor.merge(sub_processors) - return main_processor - - -def collect_processors_cpu(processor, tmpdir=None): - """Collect result processors under cpu mode. - - On cpu mode, this function will save the result processors on different - gpus to``tmpdir`` and collect them by the rank 0 worker. - - Args: - processor (object): Result processor containing predictions and labels - to be collected. - tmpdir (str | None): temporal directory for collected results to - store. If set to None, it will create a random temporal directory - for it. - - Returns: - object: The gathered processor. 
- """ - rank, world_size = get_dist_info() - # create a tmp dir if it is not specified - if tmpdir is None: - MAX_LEN = 512 - # 32 is whitespace - dir_tensor = torch.full((MAX_LEN, ), - 32, - dtype=torch.uint8, - device='cuda') - if rank == 0: - mmcv.mkdir_or_exist('.dist_test') - tmpdir = tempfile.mkdtemp(dir='.dist_test') - tmpdir = torch.tensor( - bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') - dir_tensor[:len(tmpdir)] = tmpdir - dist.broadcast(dir_tensor, 0) - tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() - else: - mmcv.mkdir_or_exist(tmpdir) - # dump the part result to the dir - mmcv.dump(processor, osp.join(tmpdir, f'part_{rank}.pkl')) - dist.barrier() - # collect all parts - if rank != 0: - return None + eval_results = collect_results_gpu(eval_results, len(dataset)) else: - # load results of all parts from tmp dir - main_processor = mmcv.load(osp.join(tmpdir, f'part_{0}.pkl')) - sub_processors = [] - for i in range(1, world_size): - part_file = osp.join(tmpdir, f'part_{i}.pkl') - part_processor = mmcv.load(part_file) - # When data is severely insufficient, an empty part_result - # on a certain gpu could makes the overall outputs empty. - if part_processor: - sub_processors.append(part_processor) - main_processor.merge(sub_processors) - # remove tmp dir - shutil.rmtree(tmpdir) - return main_processor + eval_results = collect_results_cpu(eval_results, len(dataset), tmpdir) + return eval_results diff --git a/mmseg/core/evaluation/__init__.py b/mmseg/core/evaluation/__init__.py index 56af4e91c2..d44931b53a 100644 --- a/mmseg/core/evaluation/__init__.py +++ b/mmseg/core/evaluation/__init__.py @@ -1,10 +1,10 @@ from .class_names import get_classes, get_palette from .eval_hooks import DistEvalHook, EvalHook -from .metrics import (ResultProcessor, calculate_metrics, eval_metrics, +from .metrics import (convert_pre_eval_results_metrics, eval_metrics, mean_dice, mean_fscore, mean_iou) __all__ = [ 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore', - 'eval_metrics', 'get_classes', 'get_palette', 'calculate_metrics', - 'ResultProcessor' + 'eval_metrics', 'get_classes', 'get_palette', + 'convert_pre_eval_results_metrics' ] diff --git a/mmseg/core/evaluation/metrics.py b/mmseg/core/evaluation/metrics.py index 5052dd18e4..bfb67e7617 100644 --- a/mmseg/core/evaluation/metrics.py +++ b/mmseg/core/evaluation/metrics.py @@ -1,4 +1,4 @@ -from collections import Iterable, OrderedDict +from collections import OrderedDict import mmcv import numpy as np @@ -326,26 +326,18 @@ def eval_metrics(results, return ret_metrics -def calculate_metrics(total_area_intersect, - total_area_union, - total_area_pred_label, - total_area_label, - metrics=['mIoU'], - nan_to_num=None, - beta=1): - """Calculate evaluation metrics +def convert_pre_eval_results_metrics(pre_eval_results, + metrics=['mIoU'], + nan_to_num=None, + beta=1): + """Convert pre-eval results to metrics. + Args: - results (list[ndarray] | list[str]): List of prediction segmentation - maps or list of prediction result filenames. - gt_seg_maps (list[ndarray] | list[str]): list of ground truth - segmentation maps or list of label filenames. - num_classes (int): Number of categories. - ignore_index (int): Index that will be ignored in evaluation. + pre_eval_results (tuple[torch.Tensor]): per image eval results for + computing evaluation metric metrics (list[str] | str): Metrics to be evaluated, 'mIoU' and 'mDice'. 
nan_to_num (int, optional): If specified, NaN values will be replaced by the numbers defined by the user. Default: None. - label_map (dict): Mapping old labels to new labels. Default: dict(). - reduce_zero_label (bool): Wether ignore zero label. Default: False. Returns: float: Overall accuracy on all images. ndarray: Per category accuracy, shape (num_classes, ). @@ -357,6 +349,18 @@ def calculate_metrics(total_area_intersect, if not set(metrics).issubset(set(allowed_metrics)): raise KeyError('metrics {} is not supported'.format(metrics)) + # convert list of tuples to tuple of lists, e.g. + # [(A_1, B_1, C_1, D_1), ..., (A_n, B_n, C_n, D_n)] to + # ([A_1, ..., A_n], ..., [D_1, ..., D_n]) + + pre_eval_results = tuple(zip(*pre_eval_results)) + assert len(pre_eval_results) == 4 + + total_area_intersect = sum(pre_eval_results[0]) + total_area_union = sum(pre_eval_results[1]) + total_area_pred_label = sum(pre_eval_results[2]) + total_area_label = sum(pre_eval_results[3]) + all_acc = total_area_intersect.sum() / total_area_label.sum() ret_metrics = OrderedDict({'aAcc': all_acc}) for metric in metrics: @@ -390,98 +394,3 @@ def calculate_metrics(total_area_intersect, for metric, metric_value in ret_metrics.items() }) return ret_metrics - - -class ResultProcessor(object): - """collect and process results when progressive evaluation.""" - - def __init__(self, - num_classes, - ignore_index=255, - collect_type='pixels_count', - label_map=dict(), - reduce_zero_label=False): - self.num_classes = num_classes - self.collect_type = collect_type - - self.ignore_index = ignore_index - self.label_map = label_map - self.reduce_zero_label = reduce_zero_label - - assert collect_type.lower() in ['pixels_count', 'seg_map'] - - self.prediction_ram = [] - self.label_ram = [] - self.meta_ram = [] - - self.total_area_intersect = torch.zeros((self.num_classes, ), - dtype=torch.float64) - self.total_area_union = torch.zeros((self.num_classes, ), - dtype=torch.float64) - self.total_area_pred_label = torch.zeros((self.num_classes, ), - dtype=torch.float64) - self.total_area_label = torch.zeros((self.num_classes, ), - dtype=torch.float64) - - def collect(self, preds, labels, metas): - """collect predictions, ground truth labels and meta information.""" - if not isinstance(preds, Iterable): - preds = [preds] - if not isinstance(labels, Iterable): - labels = [labels] - if not isinstance(metas, Iterable): - metas = [metas] - - ret_value = total_intersect_and_union( - preds, - labels, - self.num_classes, - ignore_index=self.ignore_index, - label_map=self.label_map, - reduce_zero_label=self.reduce_zero_label) - self.total_area_intersect += ret_value[0] - self.total_area_union += ret_value[1] - self.total_area_pred_label += ret_value[2] - self.total_area_label += ret_value[3] - - if self.collect_type == 'seg_map': - if isinstance(preds, Iterable): - self.prediction_ram.extend(preds) - self.label_ram.extend(labels) - self.meta_ram.extend(metas) - else: - self.prediction_ram.append(preds) - self.label_ram.append(labels) - self.meta_ram.append(metas) - - def retrieval(self): - """Get processor content by collect type.""" - if self.collect_type == 'pixels_count': - return (self.total_area_intersect, self.total_area_union, - self.total_area_pred_label, self.total_area_label) - elif self.collect_type == 'seg_map': - return self.prediction_ram, self.label_ram, self.meta_ram - - def calculate(self, metrics): - """Calculate metric by using collected pixelx count matrix.""" - return calculate_metrics( - self.total_area_intersect, - 
self.total_area_union, - self.total_area_pred_label, - self.total_area_label, - metrics=metrics) - - def merge(self, processors): - """Merge other processors into this processor.""" - if not isinstance(processors, Iterable): - processors = [processors] - for processor in processors: - self.total_area_intersect += processor.total_area_intersect - self.total_area_union += processor.total_area_union - self.total_area_pred_label += processor.total_area_pred_label - self.total_area_label += processor.total_area_label - - if self.collect_type == 'seg_map': - self.prediction_ram.extend(processor.prediction_ram) - self.label_ram.extend(processor.label_ram) - self.meta_ram.extend(processor.meta_ram) diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py index 9ba6d4e011..e5a8c2b3cf 100644 --- a/mmseg/datasets/custom.py +++ b/mmseg/datasets/custom.py @@ -1,5 +1,5 @@ import os.path as osp -from collections import Iterable, OrderedDict +from collections import OrderedDict import mmcv import numpy as np @@ -7,6 +7,8 @@ from prettytable import PrettyTable from torch.utils.data import Dataset +from mmseg.core.evaluation.metrics import (convert_pre_eval_results_metrics, + intersect_and_union) from mmseg.utils import get_root_logger from .builder import DATASETS from .pipelines import Compose @@ -224,25 +226,25 @@ def prepare_test_img(self, idx): def format_results(self, results, **kwargs): """Place holder to format result to dataset specific output.""" - def get_gt_seg_maps(self): - """Get ground truth segmentation maps for evaluation.""" - for img_info in self.img_infos: - seg_map = osp.join(self.ann_dir, img_info['ann']['seg_map']) - gt_seg_map = mmcv.imread( - seg_map, flag='unchanged', backend='pillow') - yield [gt_seg_map] - - def index_gt_seg_maps(self, indexes): - """Get ground truth segmentation map by index for evaluation.""" - if not isinstance(indexes, Iterable): + def pre_eval(self, preds, indexes): + # In order to compat with batch inference + if not isinstance(indexes, list): indexes = [indexes] - gt_seg_maps = [] - for index in indexes: + if not isinstance(preds, list): + preds = [preds] + + eval_results = [] + + for pred, index in zip(preds, indexes): seg_map = osp.join(self.ann_dir, self.img_infos[index]['ann']['seg_map']) - gt_seg_maps.append( - mmcv.imread(seg_map, flag='unchanged', backend='pillow')) - return gt_seg_maps + seg_map = mmcv.imread(seg_map, flag='unchanged', backend='pillow') + eval_results.append( + intersect_and_union(pred, seg_map, self.num_classes, + self.ignore_index, self.label_map, + self.reduce_zero_label)) + + return eval_results def get_classes_and_palette(self, classes=None, palette=None): """Get class names of current dataset. @@ -307,11 +309,12 @@ def get_palette_for_custom_classes(self, class_names, palette=None): return palette - def evaluate(self, processor, metric='mIoU', logger=None, **kwargs): + def evaluate(self, pre_eval_results, metric='mIoU', logger=None, **kwargs): """Evaluate the dataset. Args: - processor (object): The result processor for progressive mode. + pre_eval_results (tuple[torch.Tensor]): per image eval results for + computing evaluation metric metric (str | list[str]): Metrics to be evaluated. 'mIoU', 'mDice' and 'mFscore' are supported. 
logger (logging.Logger | None | str): Logger used for printing
@@ -327,7 +330,8 @@ def evaluate(self, processor, metric='mIoU', logger=None, **kwargs):
             raise KeyError('metric {} is not supported'.format(metric))

         eval_results = {}
-        ret_metrics = processor.calculate(metric)
+        ret_metrics = convert_pre_eval_results_metrics(pre_eval_results,
+                                                       metric)

         # Because dataset.CLASSES is required in single_gpu_test,
         # multi_gpu_test, so it's necessary to keep

From 25a74755cf8f7ead7d2dad5629665a7171194251 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Tue, 27 Jul 2021 12:06:36 +0800
Subject: [PATCH 55/96] TODO: Refactor format_results.

---
 mmseg/datasets/ade.py        | 2 +-
 mmseg/datasets/cityscapes.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mmseg/datasets/ade.py b/mmseg/datasets/ade.py
index 942b37adaa..2b1a11f6fe 100644
--- a/mmseg/datasets/ade.py
+++ b/mmseg/datasets/ade.py
@@ -130,7 +130,7 @@ def results2img(self, results, imgfile_prefix, to_label_id):

         return result_files

-    # TODO: Use processor to format results.
+    # TODO: Refactor format_results to compat with test api
     def format_results(self, results, imgfile_prefix=None, to_label_id=True):
         """Format the results into dir (standard format for ade20k
         evaluation).
diff --git a/mmseg/datasets/cityscapes.py b/mmseg/datasets/cityscapes.py
index 5d1c142890..e383d22e1f 100644
--- a/mmseg/datasets/cityscapes.py
+++ b/mmseg/datasets/cityscapes.py
@@ -88,7 +88,7 @@ def results2img(self, results, imgfile_prefix, to_label_id):

         return result_files

-    # TODO: Use processor to format results.
+    # TODO: Refactor format_results to compat with test api
     def format_results(self, results, imgfile_prefix=None, to_label_id=True):
         """Format the results into dir (standard format for Cityscapes
         evaluation).

From 25cc8bad57849c22027f6d373f13d2ebbd30283c Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Tue, 27 Jul 2021 12:07:27 +0800
Subject: [PATCH 56/96] Modify variable names.

---
 mmseg/apis/test.py       | 17 +++++++++--------
 mmseg/datasets/custom.py |  6 +++---
 tools/deploy_test.py     |  6 +++---
 tools/test.py            | 10 +++++-----
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index 6dab5cd226..4cc19aaa9f 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -23,7 +23,7 @@ def single_gpu_test(model, data_loader, show=False, out_dir=None, opacity=0.5):
         list: evaluation preparation results.
     """
     model.eval()
-    eval_results = []
+    pre_eval_results = []
     dataset = data_loader.dataset
     prog_bar = mmcv.ProgressBar(len(dataset))
@@ -33,7 +33,7 @@ def single_gpu_test(model, data_loader, show=False, out_dir=None, opacity=0.5):
         with torch.no_grad():
             result = model(return_loss=False, **data)

-        eval_results.extend(dataset.pre_eval(result, [loader_indices[i]]))
+        pre_eval_results.extend(dataset.pre_eval(result, [loader_indices[i]]))

         if show or out_dir:
             img_tensor = data['img'][0]
@@ -65,7 +65,7 @@ def single_gpu_test(model, data_loader, show=False, out_dir=None, opacity=0.5):
         for _ in range(batch_size):
             prog_bar.update()

-    return eval_results
+    return pre_eval_results


 def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
@@ -89,7 +89,7 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
         list: evaluation preparation results.
""" model.eval() - eval_results = [] + pre_eval_results = [] dataset = data_loader.dataset loader_indices = data_loader.sampler @@ -103,7 +103,7 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): # TODO: adapt samples_per_gpu > 1. # only samples_per_gpu=1 valid now - eval_results.extend(dataset.pre_eval(result, [loader_indices[i]])) + pre_eval_results.extend(dataset.pre_eval(result, [loader_indices[i]])) if rank == 0: batch_size = len(result) * world_size @@ -112,7 +112,8 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): # collect results from all ranks if gpu_collect: - eval_results = collect_results_gpu(eval_results, len(dataset)) + pre_eval_results = collect_results_gpu(pre_eval_results, len(dataset)) else: - eval_results = collect_results_cpu(eval_results, len(dataset), tmpdir) - return eval_results + pre_eval_results = collect_results_cpu(pre_eval_results, len(dataset), + tmpdir) + return pre_eval_results diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py index e5a8c2b3cf..bd1d61b81d 100644 --- a/mmseg/datasets/custom.py +++ b/mmseg/datasets/custom.py @@ -233,18 +233,18 @@ def pre_eval(self, preds, indexes): if not isinstance(preds, list): preds = [preds] - eval_results = [] + pre_eval_results = [] for pred, index in zip(preds, indexes): seg_map = osp.join(self.ann_dir, self.img_infos[index]['ann']['seg_map']) seg_map = mmcv.imread(seg_map, flag='unchanged', backend='pillow') - eval_results.append( + pre_eval_results.append( intersect_and_union(pred, seg_map, self.num_classes, self.ignore_index, self.label_map, self.reduce_zero_label)) - return eval_results + return pre_eval_results def get_classes_and_palette(self, classes=None, palette=None): """Get class names of current dataset. 
diff --git a/tools/deploy_test.py b/tools/deploy_test.py index 32f2a4e121..27eebca315 100644 --- a/tools/deploy_test.py +++ b/tools/deploy_test.py @@ -228,8 +228,8 @@ def main(): model.PALETTE = dataset.PALETTE model = MMDataParallel(model, device_ids=[0]) - eval_results = single_gpu_test(model, data_loader, args.show, - args.show_dir, args.opacity) + pre_eval_results = single_gpu_test(model, data_loader, args.show, + args.show_dir, args.opacity) rank, _ = get_dist_info() if rank == 0: @@ -242,7 +242,7 @@ def main(): # if args.format_only: # dataset.format_results(results, **kwargs) if args.eval: - dataset.evaluate(eval_results, args.eval, **kwargs) + dataset.evaluate(pre_eval_results, args.eval, **kwargs) if __name__ == '__main__': diff --git a/tools/test.py b/tools/test.py index b74d97d533..ec3b37f9ca 100644 --- a/tools/test.py +++ b/tools/test.py @@ -140,15 +140,15 @@ def main(): if not distributed: model = MMDataParallel(model, device_ids=[0]) - eval_results = single_gpu_test(model, data_loader, args.show, - args.show_dir, args.opacity) + pre_eval_results = single_gpu_test(model, data_loader, args.show, + args.show_dir, args.opacity) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) - eval_results = multi_gpu_test(model, data_loader, args.tmpdir, - args.gpu_collect) + pre_eval_results = multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect) rank, _ = get_dist_info() if rank == 0: @@ -161,7 +161,7 @@ def main(): # if args.format_only: # dataset.format_results(results, **kwargs) if args.eval: - dataset.evaluate(eval_results, args.eval, **kwargs) + dataset.evaluate(pre_eval_results, args.eval, **kwargs) if __name__ == '__main__': From af5443271f582b44c0727225d39efe6960e356e3 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Tue, 27 Jul 2021 12:17:37 +0800 Subject: [PATCH 57/96] Fix num_classes bug. --- mmseg/datasets/custom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py index bd1d61b81d..7e3d174464 100644 --- a/mmseg/datasets/custom.py +++ b/mmseg/datasets/custom.py @@ -240,7 +240,7 @@ def pre_eval(self, preds, indexes): self.img_infos[index]['ann']['seg_map']) seg_map = mmcv.imread(seg_map, flag='unchanged', backend='pillow') pre_eval_results.append( - intersect_and_union(pred, seg_map, self.num_classes, + intersect_and_union(pred, seg_map, len(self.CLASSES), self.ignore_index, self.label_map, self.reduce_zero_label)) From 1b7c9763abce02ed43e634088e99c98be53a4cbb Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Tue, 27 Jul 2021 13:41:31 +0800 Subject: [PATCH 58/96] Fix sampler index bug. 
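The previous revision materialized `list(data_loader.sampler)` once and indexed
it by the loop counter; this revision draws indices lazily from the sampler
iterator instead, so every prediction stays paired with the dataset index it
was actually computed for. A minimal, self-contained sketch of the alignment
concern, assuming a deterministic (non-shuffling) test-time sampler; the toy
dataset and rank/world sizes below are hypothetical:

```python
import torch
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset

dataset = TensorDataset(torch.arange(8))
# rank/num_replicas given explicitly, so no process group is needed here
sampler = DistributedSampler(dataset, num_replicas=2, rank=1, shuffle=False)
loader = DataLoader(dataset, sampler=sampler, batch_size=1)

indices = iter(sampler)
for i, batch in enumerate(loader):
    idx = next(indices)
    # enumerate position vs. real dataset index: i runs 0,1,2,3
    # while idx runs 1,3,5,7 on this rank
    assert batch[0].item() == idx
```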
--- mmseg/apis/test.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index 4cc19aaa9f..20e7afefef 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -27,13 +27,14 @@ def single_gpu_test(model, data_loader, show=False, out_dir=None, opacity=0.5):
     dataset = data_loader.dataset
     prog_bar = mmcv.ProgressBar(len(dataset))

-    loader_indices = list(data_loader.sampler)
+    loader_indices = iter(data_loader.sampler)

-    for i, data in enumerate(data_loader):
+    for _, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, **data)

-        pre_eval_results.extend(dataset.pre_eval(result, [loader_indices[i]]))
+        pre_eval_results.extend(
+            dataset.pre_eval(result, [next(loader_indices)]))

         if show or out_dir:
             img_tensor = data['img'][0]
@@ -92,7 +92,7 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
     model.eval()
     pre_eval_results = []
     dataset = data_loader.dataset
-    loader_indices = data_loader.sampler
+    loader_indices = iter(data_loader.sampler)

     rank, world_size = get_dist_info()
     if rank == 0:
         prog_bar = mmcv.ProgressBar(len(dataset))

-    for i, data in enumerate(data_loader):
+    for _, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, rescale=True, **data)

         # TODO: adapt samples_per_gpu > 1.
         # only samples_per_gpu=1 valid now
-        pre_eval_results.extend(dataset.pre_eval(result, [loader_indices[i]]))
+        pre_eval_results.extend(
+            dataset.pre_eval(result, [next(loader_indices)]))

         if rank == 0:
             batch_size = len(result) * world_size

From 049b307c9684e35672f182cf433b8ccb3415ccc1 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Tue, 27 Jul 2021 13:42:18 +0800
Subject: [PATCH 59/96] Fix grammar bug.

---
 mmseg/datasets/custom.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py
index 7e3d174464..8fc169b843 100644
--- a/mmseg/datasets/custom.py
+++ b/mmseg/datasets/custom.py
@@ -226,16 +226,16 @@ def prepare_test_img(self, idx):
     def format_results(self, results, **kwargs):
         """Place holder to format result to dataset specific output."""

-    def pre_eval(self, preds, indexes):
+    def pre_eval(self, preds, indices):
         # In order to compat with batch inference
-        if not isinstance(indexes, list):
-            indexes = [indexes]
+        if not isinstance(indices, list):
+            indices = [indices]
         if not isinstance(preds, list):
             preds = [preds]

         pre_eval_results = []

-        for pred, index in zip(preds, indexes):
+        for pred, index in zip(preds, indices):
             seg_map = osp.join(self.ann_dir,
                                self.img_infos[index]['ann']['seg_map'])
             seg_map = mmcv.imread(seg_map, flag='unchanged', backend='pillow')
             pre_eval_results.append(
                 intersect_and_union(pred, seg_map, len(self.CLASSES),
                                     self.ignore_index, self.label_map,
                                     self.reduce_zero_label))

From 932d0a41d1f78c0d79a0aa459cc9e59ea33eedf2 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Tue, 27 Jul 2021 14:30:55 +0800
Subject: [PATCH 60/96] Add part of benchmark results.
--- configs/segformer/readme.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 configs/segformer/readme.md

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
new file mode 100644
index 0000000000..35ab4cf8f5
--- /dev/null
+++ b/configs/segformer/readme.md
@@ -0,0 +1,27 @@
+# SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers
+
+## Introduction
+
+
+
+```latex
+@article{xie2021segformer,
+  title={SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers},
+  author={Xie, Enze and Wang, Wenhai and Yu, Zhiding and Anandkumar, Anima and Alvarez, Jose M and Luo, Ping},
+  journal={arXiv preprint arXiv:2105.15203},
+  year={2021}
+}
+```
+
+## Results and models
+
+### ADE20k
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
+| ------ | -------- | --------- | ------: | -------: | -------------- | ---: | ------------- | ------ | -------- |
+| Segformer | MIT-B0 | 512x512 | 160000 | - | - | 37.41 | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B1 | 512x512 | 160000 | - | - | 41.05 | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B2 | 512x512 | 160000 | - | - | 45.68 | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B3 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B4 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B5 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() &#124; [log]() |

From dc35f6db503b18255a9aa18d169dc07336ce22e8 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Tue, 27 Jul 2021 14:58:34 +0800
Subject: [PATCH 61/96] Support batch sampler.

---
 mmseg/apis/test.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index 20e7afefef..13c04d3bc3 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -27,14 +27,13 @@ def single_gpu_test(model, data_loader, show=False, out_dir=None, opacity=0.5):
     dataset = data_loader.dataset
     prog_bar = mmcv.ProgressBar(len(dataset))

-    loader_indices = iter(data_loader.sampler)
+    loader_indices = iter(data_loader.batch_sampler)

     for _, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, **data)

-        pre_eval_results.extend(
-            dataset.pre_eval(result, [next(loader_indices)]))
+        pre_eval_results.extend(dataset.pre_eval(result, next(loader_indices)))

         if show or out_dir:
             img_tensor = data['img'][0]
@@ -92,7 +91,7 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
     model.eval()
     pre_eval_results = []
     dataset = data_loader.dataset
-    loader_indices = iter(data_loader.sampler)
+    loader_indices = iter(data_loader.batch_sampler)

     rank, world_size = get_dist_info()
     if rank == 0:
         prog_bar = mmcv.ProgressBar(len(dataset))
@@ -104,8 +103,7 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
         with torch.no_grad():
             result = model(return_loss=False, rescale=True, **data)

         # TODO: adapt samples_per_gpu > 1.
         # only samples_per_gpu=1 valid now
-        pre_eval_results.extend(
-            dataset.pre_eval(result, [next(loader_indices)]))
+        pre_eval_results.extend(dataset.pre_eval(result, next(loader_indices)))

From 880fbcba572800b9731bd3a068adafb366fe0095 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Tue, 27 Jul 2021 15:25:01 +0800
Subject: [PATCH 62/96] More readable test api.
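The loop now zips the data loader with its own `batch_sampler`, so every
batch arrives together with the list of dataset indices it was drawn from
and no manual counter or iterator bookkeeping is needed. A small
self-contained illustration of the pattern (toy dataset, default sequential
sampling assumed):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(10))
loader = DataLoader(dataset, batch_size=4, shuffle=False)

# batch_sampler yields index lists in the same order the loader yields batches
for batch_indices, (batch,) in zip(loader.batch_sampler, loader):
    assert batch.tolist() == batch_indices
```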
--- mmseg/apis/test.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py index 13c04d3bc3..0e5031d0a9 100644 --- a/mmseg/apis/test.py +++ b/mmseg/apis/test.py @@ -27,13 +27,13 @@ def single_gpu_test(model, data_loader, show=False, out_dir=None, opacity=0.5): dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) - loader_indices = iter(data_loader.batch_sampler) + loader_indices = data_loader.batch_sampler - for _, data in enumerate(data_loader): + for batch_indices, data in zip(loader_indices, data_loader): with torch.no_grad(): result = model(return_loss=False, **data) - pre_eval_results.extend(dataset.pre_eval(result, next(loader_indices))) + pre_eval_results.extend(dataset.pre_eval(result, batch_indices)) if show or out_dir: img_tensor = data['img'][0] @@ -91,19 +91,20 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): model.eval() pre_eval_results = [] dataset = data_loader.dataset - loader_indices = iter(data_loader.batch_sampler) + + loader_indices = data_loader.batch_sampler rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) - for _, data in enumerate(data_loader): + for batch_indices, data in zip(loader_indices, data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) # TODO: adapt samples_per_gpu > 1. # only samples_per_gpu=1 valid now - pre_eval_results.extend(dataset.pre_eval(result, next(loader_indices))) + pre_eval_results.extend(dataset.pre_eval(result, batch_indices)) if rank == 0: batch_size = len(result) * world_size From f9eda3eb81bea44d7912f3884f7ac3063c06a17c Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Tue, 27 Jul 2021 21:30:25 +0800 Subject: [PATCH 63/96] Remove some command arg and fix eval hook bug. --- mmseg/core/evaluation/eval_hooks.py | 3 +-- tools/test.py | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py index 5529088a7a..7465eb69fa 100644 --- a/mmseg/core/evaluation/eval_hooks.py +++ b/mmseg/core/evaluation/eval_hooks.py @@ -58,7 +58,7 @@ def _do_evaluate(self, runner): from mmseg.apis import single_gpu_test pre_eval_results = single_gpu_test( - runner.model, self.dataloader, False, show=False) + runner.model, self.dataloader, show=False) runner.log_buffer.clear() @@ -136,7 +136,6 @@ def _do_evaluate(self, runner): pre_eval_results = multi_gpu_test( runner.model, self.dataloader, - False, tmpdir=tmpdir, gpu_collect=self.gpu_collect) diff --git a/tools/test.py b/tools/test.py index ec3b37f9ca..c73a4a29e8 100644 --- a/tools/test.py +++ b/tools/test.py @@ -20,7 +20,6 @@ def parse_args(): parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument( '--aug-test', action='store_true', help='Use Flip and Multi scale aug') - parser.add_argument('--out', help='output result file in pickle format') parser.add_argument( '--format-only', action='store_true', @@ -152,10 +151,6 @@ def main(): rank, _ = get_dist_info() if rank == 0: - # TODO: Move this to test api - # if args.out: - # print(f'\nwriting results to {args.out}') - # mmcv.dump(results, args.out) kwargs = {} if args.eval_options is None else args.eval_options # TODO: Move this to test api. 
# if args.format_only:

From 7be603d5c83ad4b71b765e3edd26c0357ba93efb Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 28 Jul 2021 11:59:37 +0800
Subject: [PATCH 64/96] Support format-only arg.

---
 mmseg/apis/test.py | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py
index 0e5031d0a9..01fcf80a34 100644
--- a/mmseg/apis/test.py
+++ b/mmseg/apis/test.py
@@ -7,12 +7,21 @@
 from mmcv.runner import get_dist_info


-def single_gpu_test(model, data_loader, show=False, out_dir=None, opacity=0.5):
+def single_gpu_test(model,
+                    data_loader,
+                    format_only=False,
+                    format_args={},
+                    show=False,
+                    out_dir=None,
+                    opacity=0.5):
     """Test with single GPU by progressive mode.

     Args:
         model (nn.Module): Model to be tested.
         data_loader (utils.data.Dataloader): Pytorch data loader.
+        format_only (bool): Only format results for submission, without
+            performing evaluation. Default: False.
+        format_args (dict): The args for format_results. Default: {}.
         show (bool): Whether show results during inference. Default: False.
         out_dir (str, optional): If specified, the results will be dumped into
             the directory to save output results.
@@ -33,7 +42,10 @@ def single_gpu_test(model,
         with torch.no_grad():
             result = model(return_loss=False, **data)

-        pre_eval_results.extend(dataset.pre_eval(result, batch_indices))
+        if format_only:
+            dataset.format_results(result, batch_indices, **format_args)
+        else:
+            pre_eval_results.extend(dataset.pre_eval(result, batch_indices))

         if show or out_dir:
             img_tensor = data['img'][0]
@@ -68,7 +80,12 @@ def single_gpu_test(model,
     return pre_eval_results


-def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
+def multi_gpu_test(model,
+                   data_loader,
+                   format_only=False,
+                   format_args={},
+                   tmpdir=None,
+                   gpu_collect=False):
     """Test model with multiple gpus by progressive mode.

     This method tests model with multiple gpus and collects the results
@@ -80,10 +97,14 @@ def multi_gpu_test(model,
     Args:
         model (nn.Module): Model to be tested.
         data_loader (utils.data.Dataloader): Pytorch data loader.
+        format_only (bool): Only format results for submission, without
+            performing evaluation. Default: False.
+        format_args (dict): The args for format_results. Default: {}.
         tmpdir (str): Path of directory to save the temporary results from
             different gpus under cpu mode. The same path is used for efficient
-            test.
+            test. Default: None.
         gpu_collect (bool): Option to use either gpu or cpu to collect results.
+            Default: False.

     Returns:
         list: evaluation preparation results.
@@ -102,9 +123,12 @@ def multi_gpu_test(model,
         with torch.no_grad():
             result = model(return_loss=False, rescale=True, **data)

-        # TODO: adapt samples_per_gpu > 1.
-        # only samples_per_gpu=1 valid now
-        pre_eval_results.extend(dataset.pre_eval(result, batch_indices))
+        if format_only:
+            dataset.format_results(result, batch_indices, **format_args)
+        else:
+            # TODO: adapt samples_per_gpu > 1.
+            # only samples_per_gpu=1 valid now
+            pre_eval_results.extend(dataset.pre_eval(result, batch_indices))

         if rank == 0:
             batch_size = len(result) * world_size

From 4243ea2893f0b67de8d58f799f871ed70303e4eb Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 28 Jul 2021 12:01:01 +0800
Subject: [PATCH 65/96] Modify format_results of datasets.
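With `format_only`, the test loop now hands each batch of raw predictions to
`format_results` together with that batch's dataset indices, so output files
can be written incrementally instead of after accumulating whole-dataset
results. A self-contained mock of the new contract (all names below are
illustrative stand-ins, not the real dataset classes):

```python
import os
import tempfile

import numpy as np


class MockDataset:
    img_infos = [{'filename': f'img_{i}.jpg'} for i in range(4)]

    def format_results(self, results, indices, imgfile_prefix):
        os.makedirs(imgfile_prefix, exist_ok=True)
        files = []
        for result, idx in zip(results, indices):
            base = os.path.splitext(self.img_infos[idx]['filename'])[0]
            path = os.path.join(imgfile_prefix, f'{base}.npy')
            np.save(path, result)  # the real datasets write indexed pngs
            files.append(path)
        return files


dataset = MockDataset()
batch_result = [np.zeros((2, 2), dtype=np.uint8)]  # one prediction
print(dataset.format_results(batch_result, [3], tempfile.mkdtemp()))
```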
--- mmseg/datasets/ade.py | 26 +++++++++++++------------- mmseg/datasets/cityscapes.py | 26 ++++++++++++++------------ mmseg/datasets/custom.py | 2 +- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/mmseg/datasets/ade.py b/mmseg/datasets/ade.py index 2b1a11f6fe..35a7a230b2 100644 --- a/mmseg/datasets/ade.py +++ b/mmseg/datasets/ade.py @@ -90,12 +90,13 @@ def __init__(self, **kwargs): reduce_zero_label=True, **kwargs) - def results2img(self, results, imgfile_prefix, to_label_id): + def results2img(self, results, indices, imgfile_prefix, to_label_id): """Write the segmentation results to images. Args: results (list[list | tuple | ndarray]): Testing results of the dataset. + indices (list[int]): Indices of input results. imgfile_prefix (str): The filename prefix of the png files. If the prefix is "somepath/xxx", the png files will be named "somepath/xxx.png". @@ -108,9 +109,7 @@ def results2img(self, results, imgfile_prefix, to_label_id): """ mmcv.mkdir_or_exist(imgfile_prefix) result_files = [] - prog_bar = mmcv.ProgressBar(len(self)) - for idx in range(len(self)): - result = results[idx] + for result, idx in zip(results, indices): filename = self.img_infos[idx]['filename'] basename = osp.splitext(osp.basename(filename))[0] @@ -126,16 +125,18 @@ def results2img(self, results, imgfile_prefix, to_label_id): output.save(png_filename) result_files.append(png_filename) - prog_bar.update() - return result_files - # TODO: Refactor format_results to compat with test api - def format_results(self, results, imgfile_prefix=None, to_label_id=True): + def format_results(self, + results, + indices, + imgfile_prefix=None, + to_label_id=True): """Format the results into dir (standard format for ade20k evaluation). Args: results (list): Testing results of the dataset. + indices (list[int]): Indices of input results. imgfile_prefix (str | None): The prefix of images files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. @@ -149,10 +150,8 @@ def format_results(self, results, imgfile_prefix=None, to_label_id=True): for saving json/png files when img_prefix is not specified. """ - assert isinstance(results, list), 'results must be a list' - assert len(results) == len(self), ( - 'The length of results is not equal to the dataset len: ' - f'{len(results)} != {len(self)}') + assert isinstance(results, list), 'results must be a list.' + assert isinstance(indices, list), 'indices must be a list.' if imgfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() @@ -160,5 +159,6 @@ def format_results(self, results, imgfile_prefix=None, to_label_id=True): else: tmp_dir = None - result_files = self.results2img(results, imgfile_prefix, to_label_id) + result_files = self.results2img(results, indices, imgfile_prefix, + to_label_id) return result_files, tmp_dir diff --git a/mmseg/datasets/cityscapes.py b/mmseg/datasets/cityscapes.py index e383d22e1f..289ebc0b57 100644 --- a/mmseg/datasets/cityscapes.py +++ b/mmseg/datasets/cityscapes.py @@ -47,12 +47,13 @@ def _convert_to_label_id(result): return result_copy - def results2img(self, results, imgfile_prefix, to_label_id): + def results2img(self, results, indices, imgfile_prefix, to_label_id): """Write the segmentation results to images. Args: results (list[list | tuple | ndarray]): Testing results of the dataset. + indices (list[int]): Indices of input results. imgfile_prefix (str): The filename prefix of the png files. 
If the prefix is "somepath/xxx", the png files will be named
                 "somepath/xxx.png".
@@ -65,9 +66,7 @@ def results2img(self, results, indices, imgfile_prefix, to_label_id):
         """
         mmcv.mkdir_or_exist(imgfile_prefix)
         result_files = []
-        prog_bar = mmcv.ProgressBar(len(self))
-        for idx in range(len(self)):
-            result = results[idx]
+        for result, idx in zip(results, indices):
             if to_label_id:
                 result = self._convert_to_label_id(result)
             filename = self.img_infos[idx]['filename']
@@ -84,17 +83,20 @@ def results2img(self, results, indices, imgfile_prefix, to_label_id):
             output.putpalette(palette)
             output.save(png_filename)
             result_files.append(png_filename)
-            prog_bar.update()

         return result_files

-    # TODO: Refactor format_results to compat with test api
-    def format_results(self, results, imgfile_prefix=None, to_label_id=True):
+    def format_results(self,
+                       results,
+                       indices,
+                       imgfile_prefix=None,
+                       to_label_id=True):
         """Format the results into dir (standard format for Cityscapes
         evaluation).

         Args:
             results (list): Testing results of the dataset.
+            indices (list[int]): Indices of input results.
             imgfile_prefix (str | None): The prefix of images files. It
                 includes the file path and the prefix of filename, e.g.,
                 "a/b/prefix". If not specified, a temp file will be created.
@@ -108,17 +110,16 @@ def format_results(self,
                 for saving json/png files when img_prefix is not specified.
         """

-        assert isinstance(results, list), 'results must be a list'
-        assert len(results) == len(self), (
-            'The length of results is not equal to the dataset len: '
-            f'{len(results)} != {len(self)}')
+        assert isinstance(results, list), 'results must be a list.'
+        assert isinstance(indices, list), 'indices must be a list.'

         if imgfile_prefix is None:
             tmp_dir = tempfile.TemporaryDirectory()
             imgfile_prefix = tmp_dir.name
         else:
             tmp_dir = None
-        result_files = self.results2img(results, imgfile_prefix, to_label_id)
+        result_files = self.results2img(results, indices, imgfile_prefix,
+                                        to_label_id)

         return result_files, tmp_dir
@@ -161,6 +162,7 @@ def evaluate(self,

         return eval_results

+    # TODO: Make this compatible with the new test api.
     def _evaluate_cityscapes(self, results, logger, imgfile_prefix):
         """Evaluation in Cityscapes protocol.

diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py
index 8fc169b843..612d50e080 100644
--- a/mmseg/datasets/custom.py
+++ b/mmseg/datasets/custom.py
@@ -223,7 +223,7 @@ def prepare_test_img(self, idx):
         self.pre_pipeline(results)
         return self.pipeline(results)

-    def format_results(self, results, **kwargs):
+    def format_results(self, results, indices, **kwargs):
         """Placeholder to format results to dataset-specific output."""

     def pre_eval(self, preds, indices):

From ce44bff90da621b99868111167b7317a126cc235 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 28 Jul 2021 12:01:30 +0800
Subject: [PATCH 66/96] Modify tools that use test apis.
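After this change the tools build `eval_args` once from `--eval-options` and
feed it both to the test api (as `format_args`) and to `dataset.evaluate`. A
hedged sketch of the resulting flow; `Namespace` stands in for the parsed CLI
arguments, and the heavyweight calls are left commented because they need a
real model and data loader:

```python
from argparse import Namespace

args = Namespace(eval_options=None, format_only=False, eval='mIoU')
eval_args = {} if args.eval_options is None else args.eval_options

# results = single_gpu_test(model, data_loader, args.format_only, eval_args,
#                           args.show, args.show_dir, args.opacity)
# if args.eval:
#     dataset.evaluate(results, args.eval, **eval_args)
print(eval_args)  # {} -> formatting and evaluation share the same kwargs
```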
--- tools/deploy_test.py | 17 ++++++----------- tools/test.py | 16 ++++++++-------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/tools/deploy_test.py b/tools/deploy_test.py index 27eebca315..a3d93ca01a 100644 --- a/tools/deploy_test.py +++ b/tools/deploy_test.py @@ -227,22 +227,17 @@ def main(): model.CLASSES = dataset.CLASSES model.PALETTE = dataset.PALETTE + eval_args = {} if args.eval_options is None else args.eval_options + model = MMDataParallel(model, device_ids=[0]) - pre_eval_results = single_gpu_test(model, data_loader, args.show, - args.show_dir, args.opacity) + pre_eval_results = single_gpu_test(model, data_loader, args.format_only, + eval_args, args.show, args.show_dir, + args.opacity) rank, _ = get_dist_info() if rank == 0: - # TODO: Move this to test api. - # if args.out: - # print(f'\nwriting results to {args.out}') - # mmcv.dump(results, args.out) - kwargs = {} if args.eval_options is None else args.eval_options - # TODO: Move this to test api. - # if args.format_only: - # dataset.format_results(results, **kwargs) if args.eval: - dataset.evaluate(pre_eval_results, args.eval, **kwargs) + dataset.evaluate(pre_eval_results, args.eval, **eval_args) if __name__ == '__main__': diff --git a/tools/test.py b/tools/test.py index c73a4a29e8..54ffb341de 100644 --- a/tools/test.py +++ b/tools/test.py @@ -136,27 +136,27 @@ def main(): # clean gpu memory when starting a new evaluation. torch.cuda.empty_cache() + eval_args = {} if args.eval_options is None else args.eval_options if not distributed: model = MMDataParallel(model, device_ids=[0]) - pre_eval_results = single_gpu_test(model, data_loader, args.show, - args.show_dir, args.opacity) + pre_eval_results = single_gpu_test(model, data_loader, + args.format_only, eval_args, + args.show, args.show_dir, + args.opacity) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) - pre_eval_results = multi_gpu_test(model, data_loader, args.tmpdir, + pre_eval_results = multi_gpu_test(model, data_loader, args.format_only, + eval_args, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: - kwargs = {} if args.eval_options is None else args.eval_options - # TODO: Move this to test api. - # if args.format_only: - # dataset.format_results(results, **kwargs) if args.eval: - dataset.evaluate(pre_eval_results, args.eval, **kwargs) + dataset.evaluate(pre_eval_results, args.eval, **eval_args) if __name__ == '__main__': From ff9e5f7f37df20513e11eb5170d352aac606b37f Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Wed, 28 Jul 2021 13:07:13 +0800 Subject: [PATCH 67/96] Update readme. 
--- configs/segformer/readme.md | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index 35ab4cf8f5..be884a9a4f 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -19,9 +19,20 @@

 | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
 | ------ | -------- | --------- | ------: | -------: | -------------- | ---: | ------------- | ------ | -------- |
-| Segformer | MIT-B0 | 512x512 | 160000 | - | - | 37.41 | - | [config]() | [model]() &#124; [log]() |
-| Segformer | MIT-B1 | 512x512 | 160000 | - | - | 41.05 | - | [config]() | [model]() &#124; [log]() |
-| Segformer | MIT-B2 | 512x512 | 160000 | - | - | 45.68 | - | [config]() | [model]() &#124; [log]() |
-| Segformer | MIT-B3 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B0 | 512x512 | 160000 | - | - | 37.41 | 38.34 | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B1 | 512x512 | 160000 | - | - | 40.97 | 42.54 | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B2 | 512x512 | 160000 | - | - | 45.58 | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | - | [config]() | [model]() &#124; [log]() |
 | Segformer | MIT-B4 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() &#124; [log]() |
-| Segformer | MIT-B5 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.13 | - | [config]() | [model]() &#124; [log]() |
+
+Evaluation with AlignedResize:
+
+| Method | Backbone | Crop Size | Lr schd | mIoU | mIoU(ms+flip) |
+| ------ | -------- | --------- | ------: | ---: | ------------- |
+| Segformer | MIT-B0 | 512x512 | 160000 | 37.41 | 38.34 |
+| Segformer | MIT-B1 | 512x512 | 160000 | - | - |
+| Segformer | MIT-B2 | 512x512 | 160000 | - | - |
+| Segformer | MIT-B3 | 512x512 | 160000 | - | - |
+| Segformer | MIT-B4 | 512x512 | 160000 | - | - |
+| Segformer | MIT-B5 | 512x512 | 160000 | - | - |

From da179df3be205abb93bd4bbec15772abd5e3da8e Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 28 Jul 2021 15:48:43 +0800
Subject: [PATCH 68/96] Update readme of segformer.
--- configs/segformer/readme.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index be884a9a4f..943a2568fb 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -21,18 +21,18 @@
 | ------ | -------- | --------- | ------: | -------: | -------------- | ---: | ------------- | ------ | -------- |
 | Segformer | MIT-B0 | 512x512 | 160000 | - | - | 37.41 | 38.34 | [config]() | [model]() &#124; [log]() |
 | Segformer | MIT-B1 | 512x512 | 160000 | - | - | 40.97 | 42.54 | [config]() | [model]() &#124; [log]() |
-| Segformer | MIT-B2 | 512x512 | 160000 | - | - | 45.58 | - | [config]() | [model]() &#124; [log]() |
-| Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B2 | 512x512 | 160000 | - | - | 45.58 | 47.03 | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | 48.81 | [config]() | [model]() &#124; [log]() |
 | Segformer | MIT-B4 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() &#124; [log]() |
-| Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.13 | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B5 | 640x640 | 160000 | - | - | 49.13 | 50.22 | [config]() | [model]() &#124; [log]() |

 Evaluation with AlignedResize:

 | Method | Backbone | Crop Size | Lr schd | mIoU | mIoU(ms+flip) |
 | ------ | -------- | --------- | ------: | ---: | ------------- |
-| Segformer | MIT-B0 | 512x512 | 160000 | 37.41 | 38.34 |
-| Segformer | MIT-B1 | 512x512 | 160000 | - | - |
-| Segformer | MIT-B2 | 512x512 | 160000 | - | - |
-| Segformer | MIT-B3 | 512x512 | 160000 | - | - |
+| Segformer | MIT-B0 | 512x512 | 160000 | 38.1 | 38.57 |
+| Segformer | MIT-B1 | 512x512 | 160000 | 41.64 | 42.76 |
+| Segformer | MIT-B2 | 512x512 | 160000 | - | 47.49 |
+| Segformer | MIT-B3 | 512x512 | 160000 | - | 49.14 |
 | Segformer | MIT-B4 | 512x512 | 160000 | - | - |
-| Segformer | MIT-B5 | 512x512 | 160000 | - | - |
+| Segformer | MIT-B5 | 640x640 | 160000 | - | 50.72 |

From cb77d5d586dafc11d2d97e306d247d497a516705 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 28 Jul 2021 22:08:15 +0800
Subject: [PATCH 69/96] Update readme of segformer.

---
 configs/segformer/readme.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index 943a2568fb..bdcad50bd4 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -32,7 +32,7 @@ Evaluation with AlignedResize:
 | ------ | -------- | --------- | ------: | ---: | ------------- |
 | Segformer | MIT-B0 | 512x512 | 160000 | 38.1 | 38.57 |
 | Segformer | MIT-B1 | 512x512 | 160000 | 41.64 | 42.76 |
-| Segformer | MIT-B2 | 512x512 | 160000 | - | 47.49 |
-| Segformer | MIT-B3 | 512x512 | 160000 | - | 49.14 |
+| Segformer | MIT-B2 | 512x512 | 160000 | 46.53 | 47.49 |
+| Segformer | MIT-B3 | 512x512 | 160000 | 48.46 | 49.14 |
 | Segformer | MIT-B4 | 512x512 | 160000 | - | - |
-| Segformer | MIT-B5 | 640x640 | 160000 | - | 50.72 |
+| Segformer | MIT-B5 | 640x640 | 160000 | 50.08 | 50.72 |

From 378083d3eb89f93a46265de445df611bb9ce529b Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 29 Jul 2021 13:21:18 +0800
Subject: [PATCH 70/96] Update segformer readme and fix segformer mit_b4.
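The b4 config fix relies on the refactored MIT backbone deriving its
per-stage widths from a single base dimension; under that assumption
(stage width = embed_dims * num_heads[i]), passing the base width 64
reproduces the b4 channels that the decode head expects. A quick
self-contained check of the arithmetic:

```python
embed_dims = 64
num_heads = [1, 2, 5, 8]
stage_dims = [embed_dims * heads for heads in num_heads]
assert stage_dims == [64, 128, 320, 512]  # matches decode_head in_channels
print(stage_dims)
```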
--- configs/segformer/readme.md                               | 4 ++--
 configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index bdcad50bd4..5f70b2c8fd 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -23,7 +23,7 @@
 | Segformer | MIT-B1 | 512x512 | 160000 | - | - | 40.97 | 42.54 | [config]() | [model]() &#124; [log]() |
 | Segformer | MIT-B2 | 512x512 | 160000 | - | - | 45.58 | 47.03 | [config]() | [model]() &#124; [log]() |
 | Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | 48.81 | [config]() | [model]() &#124; [log]() |
-| Segformer | MIT-B4 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B4 | 512x512 | 160000 | - | - | 48.46 | - | [config]() | [model]() &#124; [log]() |
 | Segformer | MIT-B5 | 640x640 | 160000 | - | - | 49.13 | 50.22 | [config]() | [model]() &#124; [log]() |
@@ -34,5 +34,5 @@ Evaluation with AlignedResize:
 | Segformer | MIT-B1 | 512x512 | 160000 | 41.64 | 42.76 |
 | Segformer | MIT-B2 | 512x512 | 160000 | 46.53 | 47.49 |
 | Segformer | MIT-B3 | 512x512 | 160000 | 48.46 | 49.14 |
-| Segformer | MIT-B4 | 512x512 | 160000 | - | - |
+| Segformer | MIT-B4 | 512x512 | 160000 | 49.34 | 50.29 |
 | Segformer | MIT-B5 | 640x640 | 160000 | 50.08 | 50.72 |
diff --git a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py
index c0f6c0a072..472263fb15 100644
--- a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py
+++ b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py
@@ -7,7 +7,7 @@
 model = dict(
     pretrained='pretrain/mit_b4.pth',
     backbone=dict(
-        embed_dims=[64, 128, 320, 512],
+        embed_dims=64,
         num_heads=[1, 2, 5, 8],
         num_layers=[3, 8, 27, 3]),
     decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150))

From d0a724046cf78b2ed166b0a863837548275c67aa Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 29 Jul 2021 13:38:51 +0800
Subject: [PATCH 71/96] Update readme of segformer.

---
 configs/segformer/readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index 5f70b2c8fd..a45f26a00c 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -23,7 +23,7 @@
 | Segformer | MIT-B1 | 512x512 | 160000 | - | - | 40.97 | 42.54 | [config]() | [model]() &#124; [log]() |
 | Segformer | MIT-B2 | 512x512 | 160000 | - | - | 45.58 | 47.03 | [config]() | [model]() &#124; [log]() |
 | Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | 48.81 | [config]() | [model]() &#124; [log]() |
-| Segformer | MIT-B4 | 512x512 | 160000 | - | - | 48.46 | - | [config]() | [model]() &#124; [log]() |
+| Segformer | MIT-B4 | 512x512 | 160000 | - | - | 48.46 | 49.76 | [config]() | [model]() &#124; [log]() |
 | Segformer | MIT-B5 | 640x640 | 160000 | - | - | 49.13 | 50.22 | [config]() | [model]() &#124; [log]() |

From 77b9850c4cc8ba71a485df4156039e774c28c6dd Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Fri, 30 Jul 2021 01:54:13 +0800
Subject: [PATCH 72/96] Clean AlignedResize related config.
--- configs/_base_/datasets/ade20k_640x640.py | 54 ----------------- configs/_base_/datasets/ade20k_aligned.py | 54 ----------------- .../_base_/datasets/ade20k_aligned_640x640.py | 53 ----------------- .../segformer_mit-b5_640x640_160k_ade20k.py | 59 ++++++++++++++++++- 4 files changed, 57 insertions(+), 163 deletions(-) delete mode 100644 configs/_base_/datasets/ade20k_640x640.py delete mode 100644 configs/_base_/datasets/ade20k_aligned.py delete mode 100644 configs/_base_/datasets/ade20k_aligned_640x640.py diff --git a/configs/_base_/datasets/ade20k_640x640.py b/configs/_base_/datasets/ade20k_640x640.py deleted file mode 100644 index 58d36e4a38..0000000000 --- a/configs/_base_/datasets/ade20k_640x640.py +++ /dev/null @@ -1,54 +0,0 @@ -# dataset settings -dataset_type = 'ADE20KDataset' -data_root = 'data/ade/ADEChallengeData2016' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -crop_size = (640, 640) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', reduce_zero_label=True), - dict(type='Resize', img_scale=(2048, 640), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', prob=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 640), - # img_ratios=[0.5640.75, 1.0, 1.25, 1.5, 1.75], - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data = dict( - samples_per_gpu=4, - workers_per_gpu=4, - train=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/training', - ann_dir='annotations/training', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline), - test=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline)) diff --git a/configs/_base_/datasets/ade20k_aligned.py b/configs/_base_/datasets/ade20k_aligned.py deleted file mode 100644 index 51f24347ab..0000000000 --- a/configs/_base_/datasets/ade20k_aligned.py +++ /dev/null @@ -1,54 +0,0 @@ -# dataset settings -dataset_type = 'ADE20KDataset' -data_root = 'data/ade/ADEChallengeData2016' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -crop_size = (512, 512) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', reduce_zero_label=True), - dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', prob=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 512), - # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], - flip=False, - 
transforms=[ - dict(type='AlignedResize', keep_ratio=True, size_divisor=32), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data = dict( - samples_per_gpu=4, - workers_per_gpu=4, - train=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/training', - ann_dir='annotations/training', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline), - test=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline)) diff --git a/configs/_base_/datasets/ade20k_aligned_640x640.py b/configs/_base_/datasets/ade20k_aligned_640x640.py deleted file mode 100644 index 19a9061397..0000000000 --- a/configs/_base_/datasets/ade20k_aligned_640x640.py +++ /dev/null @@ -1,53 +0,0 @@ -dataset_type = 'ADE20KDataset' -data_root = 'data/ade/ADEChallengeData2016' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -crop_size = (640, 640) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', reduce_zero_label=True), - dict(type='Resize', img_scale=(2048, 640), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', prob=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 640), - # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], - flip=False, - transforms=[ - dict(type='AlignedResize', keep_ratio=True, size_divisor=32), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data = dict( - samples_per_gpu=4, - workers_per_gpu=4, - train=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/training', - ann_dir='annotations/training', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline), - test=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline)) diff --git a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py index e63dfef290..47183d4ea2 100644 --- a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py @@ -1,8 +1,63 @@ _base_ = [ - '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' + '../_base_/models/segformer_mit-b0.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' ] +# dataset settings +dataset_type = 'ADE20KDataset' +data_root = 'data/ade/ADEChallengeData2016' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +crop_size = (640, 640) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', 
reduce_zero_label=True), + dict(type='Resize', img_scale=(2048, 640), ratio_range=(0.5, 2.0)), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 640), + # img_ratios=[0.5640.75, 1.0, 1.25, 1.5, 1.75], + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/training', + ann_dir='annotations/training', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + data_root=data_root, + img_dir='images/validation', + ann_dir='annotations/validation', + pipeline=test_pipeline)) + # model settings model = dict( pretrained='pretrain/mit_b5.pth', From d0d51001821075f2f183f9931cbda5ff72069fbb Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 30 Jul 2021 01:59:47 +0800 Subject: [PATCH 73/96] Clean code from pr #709 --- mmseg/apis/test.py | 113 +++++++++++++++++----------- mmseg/core/evaluation/__init__.py | 6 +- mmseg/core/evaluation/eval_hooks.py | 90 +++++----------------- mmseg/core/evaluation/metrics.py | 70 ----------------- mmseg/datasets/ade.py | 25 +++--- mmseg/datasets/cityscapes.py | 30 ++++---- mmseg/datasets/custom.py | 79 +++++++++++-------- tools/test.py | 28 +++---- 8 files changed, 176 insertions(+), 265 deletions(-) diff --git a/mmseg/apis/test.py b/mmseg/apis/test.py index 01fcf80a34..0034159689 100644 --- a/mmseg/apis/test.py +++ b/mmseg/apis/test.py @@ -1,52 +1,68 @@ import os.path as osp +import tempfile import mmcv +import numpy as np import torch from mmcv.engine import collect_results_cpu, collect_results_gpu from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info +def np2tmp(array, temp_file_name=None, tmpdir=None): + """Save ndarray to local numpy file. + + Args: + array (ndarray): Ndarray to save. + temp_file_name (str): Numpy file name. If 'temp_file_name=None', this + function will generate a file name with tempfile.NamedTemporaryFile + to save ndarray. Default: None. + tmpdir (str): Temporary directory to save Ndarray files. Default: None. + + Returns: + str: The numpy file name. + """ + + if temp_file_name is None: + temp_file_name = tempfile.NamedTemporaryFile( + suffix='.npy', delete=False, dir=tmpdir).name + np.save(temp_file_name, array) + return temp_file_name + + def single_gpu_test(model, data_loader, - format_only=False, - format_args={}, show=False, out_dir=None, + efficient_test=False, opacity=0.5): - """Test with single GPU by progressive mode. + """Test with single GPU. Args: model (nn.Module): Model to be tested. data_loader (utils.data.Dataloader): Pytorch data loader. - format_only (bool): Only format result for results commit. - Default: False. - format_args (dict): The args for format_results. Default: {}. show (bool): Whether show results during inference. 
Default: False. out_dir (str, optional): If specified, the results will be dumped into the directory to save output results. + efficient_test (bool): Whether save the results as local numpy files to + save CPU memory during evaluation. Default: False. opacity(float): Opacity of painted segmentation map. Default 0.5. Must be in (0, 1] range. Returns: - list: evaluation preparetion results. + list: The prediction results. """ + model.eval() - pre_eval_results = [] + results = [] dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) - - loader_indices = data_loader.batch_sampler - - for batch_indices, data in zip(loader_indices, data_loader): + if efficient_test: + mmcv.mkdir_or_exist('.efficient_test') + for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, **data) - if format_only: - dataset.format_results(result, batch_indices, **format_args) - else: - pre_eval_results.extend(dataset.pre_eval(result, batch_indices)) - if show or out_dir: img_tensor = data['img'][0] img_metas = data['img_metas'][0].data[0] @@ -73,20 +89,27 @@ def single_gpu_test(model, out_file=out_file, opacity=opacity) + if isinstance(result, list): + if efficient_test: + result = [np2tmp(_, tmpdir='.efficient_test') for _ in result] + results.extend(result) + else: + if efficient_test: + result = np2tmp(result, tmpdir='.efficient_test') + results.append(result) + batch_size = len(result) for _ in range(batch_size): prog_bar.update() - - return pre_eval_results + return results def multi_gpu_test(model, data_loader, - format_only=False, - format_args={}, tmpdir=None, - gpu_collect=False): - """Test model with multiple gpus by progressive mode. + gpu_collect=False, + efficient_test=False): + """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' @@ -97,48 +120,46 @@ def multi_gpu_test(model, Args: model (nn.Module): Model to be tested. data_loader (utils.data.Dataloader): Pytorch data loader. - format_only (bool): Only format result for results commit. - Default: False. - format_args (dict): The args for format_results. Default: {}. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. The same path is used for efficient - test. Default: None. + test. gpu_collect (bool): Option to use either gpu or cpu to collect results. - Default: False. + efficient_test (bool): Whether save the results as local numpy files to + save CPU memory during evaluation. Default: False. Returns: - list: evaluation preparetion results. + list: The prediction results. """ + model.eval() - pre_eval_results = [] + results = [] dataset = data_loader.dataset - - loader_indices = data_loader.batch_sampler - rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) - - for batch_indices, data in zip(loader_indices, data_loader): + if efficient_test: + mmcv.mkdir_or_exist('.efficient_test') + for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) - if format_only: - dataset.format_results(result, batch_indices, **format_args) + if isinstance(result, list): + if efficient_test: + result = [np2tmp(_, tmpdir='.efficient_test') for _ in result] + results.extend(result) else: - # TODO: adapt samples_per_gpu > 1. 
- # only samples_per_gpu=1 valid now - pre_eval_results.extend(dataset.pre_eval(result, batch_indices)) + if efficient_test: + result = np2tmp(result, tmpdir='.efficient_test') + results.append(result) if rank == 0: - batch_size = len(result) * world_size - for _ in range(batch_size): + batch_size = len(result) + for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks if gpu_collect: - pre_eval_results = collect_results_gpu(pre_eval_results, len(dataset)) + results = collect_results_gpu(results, len(dataset)) else: - pre_eval_results = collect_results_cpu(pre_eval_results, len(dataset), - tmpdir) - return pre_eval_results + results = collect_results_cpu(results, len(dataset), tmpdir) + return results diff --git a/mmseg/core/evaluation/__init__.py b/mmseg/core/evaluation/__init__.py index d44931b53a..f7cc4b2341 100644 --- a/mmseg/core/evaluation/__init__.py +++ b/mmseg/core/evaluation/__init__.py @@ -1,10 +1,8 @@ from .class_names import get_classes, get_palette from .eval_hooks import DistEvalHook, EvalHook -from .metrics import (convert_pre_eval_results_metrics, eval_metrics, - mean_dice, mean_fscore, mean_iou) +from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou __all__ = [ 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore', - 'eval_metrics', 'get_classes', 'get_palette', - 'convert_pre_eval_results_metrics' + 'eval_metrics', 'get_classes', 'get_palette' ] diff --git a/mmseg/core/evaluation/eval_hooks.py b/mmseg/core/evaluation/eval_hooks.py index 7465eb69fa..928f2ba612 100644 --- a/mmseg/core/evaluation/eval_hooks.py +++ b/mmseg/core/evaluation/eval_hooks.py @@ -13,43 +13,17 @@ class EvalHook(_EvalHook): by_epoch (bool): Determine perform evaluation by epoch or by iteration. If set to True, it will perform by epoch. Otherwise, by iteration. Default: False. + efficient_test (bool): Whether save the results as local numpy files to + save CPU memory during evaluation. Default: False. Returns: list: The prediction results. """ greater_keys = ['mIoU', 'mAcc', 'aAcc'] - def __init__(self, *args, by_epoch=False, **kwargs): + def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) - - def evaluate(self, runner, pre_eval_results): - """Evaluate the results by progressive mode. - - Args: - runner (:obj:`mmcv.Runner`): The underlined training runner. 
- pre_eval_results (tuple[torch.Tensor]): per image eval results for - computing evaluation metric - """ - eval_res = self.dataloader.dataset.evaluate( - pre_eval_results, logger=runner.logger, **self.eval_kwargs) - - # TODO: Blocked by mmcv pr: #1213 - # evaluation info specific buffer - # runner.log_buffer.output['eval_res'] = {} - # for name, val in eval_res.items(): - # runner.log_buffer.output['eval_res'][name] = val - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - for name, val in eval_res.items(): - runner.log_buffer.output[name] = val - runner.log_buffer.ready = True - - if self.save_best is not None: - if self.key_indicator == 'auto': - # infer from eval_results - self._init_rule(self.rule, list(eval_res.keys())[0]) - return eval_res[self.key_indicator] - - return None + self.efficient_test = efficient_test def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" @@ -57,12 +31,13 @@ def _do_evaluate(self, runner): return from mmseg.apis import single_gpu_test - pre_eval_results = single_gpu_test( - runner.model, self.dataloader, show=False) - - runner.log_buffer.clear() - - key_score = self.evaluate(runner, pre_eval_results) + results = single_gpu_test( + runner.model, + self.dataloader, + show=False, + efficient_test=self.efficient_test) + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) @@ -74,41 +49,17 @@ class DistEvalHook(_DistEvalHook): by_epoch (bool): Determine perform evaluation by epoch or by iteration. If set to True, it will perform by epoch. Otherwise, by iteration. Default: False. + efficient_test (bool): Whether save the results as local numpy files to + save CPU memory during evaluation. Default: False. Returns: list: The prediction results. """ greater_keys = ['mIoU', 'mAcc', 'aAcc'] - def __init__(self, *args, by_epoch=False, **kwargs): + def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) - - def evaluate(self, runner, pre_eval_results): - """Evaluate the results by progressive mode. - - Args: - runner (:obj:`mmcv.Runner`): The underlined training runner. - processor (object): Output processor. 
- """ - eval_res = self.dataloader.dataset.evaluate( - pre_eval_results, logger=runner.logger, **self.eval_kwargs) - # TODO: Blocked by mmcv pr: #1213 - # evaluation info specific buffer - # runner.log_buffer.output['eval_res'] = {} - # for name, val in eval_res.items(): - # runner.log_buffer.output['eval_res'][name] = val - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - for name, val in eval_res.items(): - runner.log_buffer.output[name] = val - runner.log_buffer.ready = True - - if self.save_best is not None: - if self.key_indicator == 'auto': - # infer from eval_results - self._init_rule(self.rule, list(eval_res.keys())[0]) - return eval_res[self.key_indicator] - - return None + self.efficient_test = efficient_test def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" @@ -133,17 +84,16 @@ def _do_evaluate(self, runner): tmpdir = osp.join(runner.work_dir, '.eval_hook') from mmseg.apis import multi_gpu_test - pre_eval_results = multi_gpu_test( + results = multi_gpu_test( runner.model, self.dataloader, tmpdir=tmpdir, - gpu_collect=self.gpu_collect) - - runner.log_buffer.clear() - + gpu_collect=self.gpu_collect, + efficient_test=self.efficient_test) if runner.rank == 0: print('\n') - key_score = self.evaluate(runner, pre_eval_results) + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) diff --git a/mmseg/core/evaluation/metrics.py b/mmseg/core/evaluation/metrics.py index bfb67e7617..a216afefe6 100644 --- a/mmseg/core/evaluation/metrics.py +++ b/mmseg/core/evaluation/metrics.py @@ -324,73 +324,3 @@ def eval_metrics(results, for metric, metric_value in ret_metrics.items() }) return ret_metrics - - -def convert_pre_eval_results_metrics(pre_eval_results, - metrics=['mIoU'], - nan_to_num=None, - beta=1): - """Convert pre-eval results to metrics. - - Args: - pre_eval_results (tuple[torch.Tensor]): per image eval results for - computing evaluation metric - metrics (list[str] | str): Metrics to be evaluated, 'mIoU' and 'mDice'. - nan_to_num (int, optional): If specified, NaN values will be replaced - by the numbers defined by the user. Default: None. - Returns: - float: Overall accuracy on all images. - ndarray: Per category accuracy, shape (num_classes, ). - ndarray: Per category evaluation metrics, shape (num_classes, ). - """ - if isinstance(metrics, str): - metrics = [metrics] - allowed_metrics = ['mIoU', 'mDice', 'mFscore'] - if not set(metrics).issubset(set(allowed_metrics)): - raise KeyError('metrics {} is not supported'.format(metrics)) - - # convert list of tuples to tuple of lists, e.g. 
- # [(A_1, B_1, C_1, D_1), ..., (A_n, B_n, C_n, D_n)] to - # ([A_1, ..., A_n], ..., [D_1, ..., D_n]) - - pre_eval_results = tuple(zip(*pre_eval_results)) - assert len(pre_eval_results) == 4 - - total_area_intersect = sum(pre_eval_results[0]) - total_area_union = sum(pre_eval_results[1]) - total_area_pred_label = sum(pre_eval_results[2]) - total_area_label = sum(pre_eval_results[3]) - - all_acc = total_area_intersect.sum() / total_area_label.sum() - ret_metrics = OrderedDict({'aAcc': all_acc}) - for metric in metrics: - if metric == 'mIoU': - iou = total_area_intersect / total_area_union - acc = total_area_intersect / total_area_label - ret_metrics['IoU'] = iou - ret_metrics['Acc'] = acc - elif metric == 'mDice': - dice = 2 * total_area_intersect / ( - total_area_pred_label + total_area_label) - acc = total_area_intersect / total_area_label - ret_metrics['Dice'] = dice - ret_metrics['Acc'] = acc - elif metric == 'mFscore': - precision = total_area_intersect / total_area_pred_label - recall = total_area_intersect / total_area_label - f_value = torch.tensor( - [f_score(x[0], x[1], beta) for x in zip(precision, recall)]) - ret_metrics['Fscore'] = f_value - ret_metrics['Precision'] = precision - ret_metrics['Recall'] = recall - - ret_metrics = { - metric: value.numpy() - for metric, value in ret_metrics.items() - } - if nan_to_num is not None: - ret_metrics = OrderedDict({ - metric: np.nan_to_num(metric_value, nan=nan_to_num) - for metric, metric_value in ret_metrics.items() - }) - return ret_metrics diff --git a/mmseg/datasets/ade.py b/mmseg/datasets/ade.py index 35a7a230b2..5daf7e3731 100644 --- a/mmseg/datasets/ade.py +++ b/mmseg/datasets/ade.py @@ -90,13 +90,12 @@ def __init__(self, **kwargs): reduce_zero_label=True, **kwargs) - def results2img(self, results, indices, imgfile_prefix, to_label_id): + def results2img(self, results, imgfile_prefix, to_label_id): """Write the segmentation results to images. Args: results (list[list | tuple | ndarray]): Testing results of the dataset. - indices (list[int]): Indices of input results. imgfile_prefix (str): The filename prefix of the png files. If the prefix is "somepath/xxx", the png files will be named "somepath/xxx.png". @@ -109,7 +108,9 @@ def results2img(self, results, indices, imgfile_prefix, to_label_id): """ mmcv.mkdir_or_exist(imgfile_prefix) result_files = [] - for result, idx in zip(results, indices): + prog_bar = mmcv.ProgressBar(len(self)) + for idx in range(len(self)): + result = results[idx] filename = self.img_infos[idx]['filename'] basename = osp.splitext(osp.basename(filename))[0] @@ -125,18 +126,15 @@ def results2img(self, results, indices, imgfile_prefix, to_label_id): output.save(png_filename) result_files.append(png_filename) + prog_bar.update() + return result_files - def format_results(self, - results, - indices, - imgfile_prefix=None, - to_label_id=True): + def format_results(self, results, imgfile_prefix=None, to_label_id=True): """Format the results into dir (standard format for ade20k evaluation). Args: results (list): Testing results of the dataset. - indices (list[int]): Indices of input results. imgfile_prefix (str | None): The prefix of images files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. @@ -150,8 +148,10 @@ def format_results(self, for saving json/png files when img_prefix is not specified. """ - assert isinstance(results, list), 'results must be a list.' - assert isinstance(indices, list), 'indices must be a list.' 
+ assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: ' + f'{len(results)} != {len(self)}') if imgfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() @@ -159,6 +159,5 @@ def format_results(self, else: tmp_dir = None - result_files = self.results2img(results, indices, imgfile_prefix, - to_label_id) + result_files = self.results2img(results, imgfile_prefix, to_label_id) return result_files, tmp_dir diff --git a/mmseg/datasets/cityscapes.py b/mmseg/datasets/cityscapes.py index 289ebc0b57..fa9958ac14 100644 --- a/mmseg/datasets/cityscapes.py +++ b/mmseg/datasets/cityscapes.py @@ -47,13 +47,12 @@ def _convert_to_label_id(result): return result_copy - def results2img(self, results, indices, imgfile_prefix, to_label_id): + def results2img(self, results, imgfile_prefix, to_label_id): """Write the segmentation results to images. Args: results (list[list | tuple | ndarray]): Testing results of the dataset. - indices (list[int]): Indices of input results. imgfile_prefix (str): The filename prefix of the png files. If the prefix is "somepath/xxx", the png files will be named "somepath/xxx.png". @@ -66,7 +65,9 @@ def results2img(self, results, indices, imgfile_prefix, to_label_id): """ mmcv.mkdir_or_exist(imgfile_prefix) result_files = [] - for result, idx in zip(results, indices): + prog_bar = mmcv.ProgressBar(len(self)) + for idx in range(len(self)): + result = results[idx] if to_label_id: result = self._convert_to_label_id(result) filename = self.img_infos[idx]['filename'] @@ -83,20 +84,16 @@ def results2img(self, results, indices, imgfile_prefix, to_label_id): output.putpalette(palette) output.save(png_filename) result_files.append(png_filename) + prog_bar.update() return result_files - def format_results(self, - results, - indices, - imgfile_prefix=None, - to_label_id=True): + def format_results(self, results, imgfile_prefix=None, to_label_id=True): """Format the results into dir (standard format for Cityscapes evaluation). Args: results (list): Testing results of the dataset. - indices (list[int]): Indices of input results. imgfile_prefix (str | None): The prefix of images files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. @@ -110,16 +107,17 @@ def format_results(self, for saving json/png files when img_prefix is not specified. """ - assert isinstance(results, list), 'results must be a list.' - assert isinstance(indices, list), 'indices must be a list.' + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: ' + f'{len(results)} != {len(self)}') if imgfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() imgfile_prefix = tmp_dir.name else: tmp_dir = None - result_files = self.results2img(results, indices, imgfile_prefix, - to_label_id) + result_files = self.results2img(results, imgfile_prefix, to_label_id) return result_files, tmp_dir @@ -127,7 +125,8 @@ def evaluate(self, results, metric='mIoU', logger=None, - imgfile_prefix=None): + imgfile_prefix=None, + efficient_test=False): """Evaluation in Cityscapes/default protocol. Args: @@ -158,11 +157,10 @@ def evaluate(self, if len(metrics) > 0: eval_results.update( super(CityscapesDataset, - self).evaluate(results, metrics, logger)) + self).evaluate(results, metrics, logger, efficient_test)) return eval_results - # TODO: Compat with new test function. 
def _evaluate_cityscapes(self, results, logger, imgfile_prefix): """Evaluation in Cityscapes protocol. diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py index 612d50e080..9c88235e39 100644 --- a/mmseg/datasets/custom.py +++ b/mmseg/datasets/custom.py @@ -1,5 +1,7 @@ +import os import os.path as osp from collections import OrderedDict +from functools import reduce import mmcv import numpy as np @@ -7,8 +9,7 @@ from prettytable import PrettyTable from torch.utils.data import Dataset -from mmseg.core.evaluation.metrics import (convert_pre_eval_results_metrics, - intersect_and_union) +from mmseg.core import eval_metrics from mmseg.utils import get_root_logger from .builder import DATASETS from .pipelines import Compose @@ -223,28 +224,21 @@ def prepare_test_img(self, idx): self.pre_pipeline(results) return self.pipeline(results) - def format_results(self, results, indices, **kwargs): + def format_results(self, results, **kwargs): """Place holder to format result to dataset specific output.""" - def pre_eval(self, preds, indices): - # In order to compat with batch inference - if not isinstance(indices, list): - indices = [indices] - if not isinstance(preds, list): - preds = [preds] - - pre_eval_results = [] - - for pred, index in zip(preds, indices): - seg_map = osp.join(self.ann_dir, - self.img_infos[index]['ann']['seg_map']) - seg_map = mmcv.imread(seg_map, flag='unchanged', backend='pillow') - pre_eval_results.append( - intersect_and_union(pred, seg_map, len(self.CLASSES), - self.ignore_index, self.label_map, - self.reduce_zero_label)) - - return pre_eval_results + def get_gt_seg_maps(self, efficient_test=False): + """Get ground truth segmentation maps for evaluation.""" + gt_seg_maps = [] + for img_info in self.img_infos: + seg_map = osp.join(self.ann_dir, img_info['ann']['seg_map']) + if efficient_test: + gt_seg_map = seg_map + else: + gt_seg_map = mmcv.imread( + seg_map, flag='unchanged', backend='pillow') + gt_seg_maps.append(gt_seg_map) + return gt_seg_maps def get_classes_and_palette(self, classes=None, palette=None): """Get class names of current dataset. @@ -309,12 +303,16 @@ def get_palette_for_custom_classes(self, class_names, palette=None): return palette - def evaluate(self, pre_eval_results, metric='mIoU', logger=None, **kwargs): + def evaluate(self, + results, + metric='mIoU', + logger=None, + efficient_test=False, + **kwargs): """Evaluate the dataset. Args: - pre_eval_results (tuple[torch.Tensor]): per image eval results for - computing evaluation metric + results (list): Testing results of the dataset. metric (str | list[str]): Metrics to be evaluated. 'mIoU', 'mDice' and 'mFscore' are supported. logger (logging.Logger | None | str): Logger used for printing @@ -323,20 +321,32 @@ def evaluate(self, pre_eval_results, metric='mIoU', logger=None, **kwargs): Returns: dict[str, float]: Default metrics. """ + if isinstance(metric, str): metric = [metric] allowed_metrics = ['mIoU', 'mDice', 'mFscore'] if not set(metric).issubset(set(allowed_metrics)): raise KeyError('metric {} is not supported'.format(metric)) - eval_results = {} - ret_metrics = convert_pre_eval_results_metrics(pre_eval_results, - metric) - - # Because dataset.CLASSES is required in single_gpu_test, - # multi_gpu_test, so it's necessary to keep - # dataset.CLASSES. 
- class_names = self.CLASSES + gt_seg_maps = self.get_gt_seg_maps(efficient_test) + if self.CLASSES is None: + num_classes = len( + reduce(np.union1d, [np.unique(_) for _ in gt_seg_maps])) + else: + num_classes = len(self.CLASSES) + ret_metrics = eval_metrics( + results, + gt_seg_maps, + num_classes, + self.ignore_index, + metric, + label_map=self.label_map, + reduce_zero_label=self.reduce_zero_label) + + if self.CLASSES is None: + class_names = tuple(range(num_classes)) + else: + class_names = self.CLASSES # summary table ret_metrics_summary = OrderedDict({ @@ -384,4 +394,7 @@ def evaluate(self, pre_eval_results, metric='mIoU', logger=None, **kwargs): for idx, name in enumerate(class_names) }) + if mmcv.is_list_of(results, str): + for file_name in results: + os.remove(file_name) return eval_results diff --git a/tools/test.py b/tools/test.py index 54ffb341de..ab2bd60175 100644 --- a/tools/test.py +++ b/tools/test.py @@ -20,6 +20,7 @@ def parse_args(): parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument( '--aug-test', action='store_true', help='Use Flip and Multi scale aug') + parser.add_argument('--out', help='output result file in pickle format') parser.add_argument( '--format-only', action='store_true', @@ -89,8 +90,6 @@ def main(): if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True if args.aug_test: - assert not (args.show or args.show_dir - ), 'when aug test, it is not supported to show result.' # hard code index cfg.data.test.pipeline[1].img_ratios = [ 0.5, 0.75, 1.0, 1.25, 1.5, 1.75 @@ -134,29 +133,32 @@ def main(): print('"PALETTE" not found in meta, use dataset.PALETTE instead') model.PALETTE = dataset.PALETTE - # clean gpu memory when starting a new evaluation. - torch.cuda.empty_cache() - eval_args = {} if args.eval_options is None else args.eval_options + efficient_test = False + if args.eval_options is not None: + efficient_test = args.eval_options.get('efficient_test', False) if not distributed: model = MMDataParallel(model, device_ids=[0]) - pre_eval_results = single_gpu_test(model, data_loader, - args.format_only, eval_args, - args.show, args.show_dir, - args.opacity) + outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, + efficient_test, args.opacity) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) - pre_eval_results = multi_gpu_test(model, data_loader, args.format_only, - eval_args, args.tmpdir, - args.gpu_collect) + outputs = multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect, efficient_test) rank, _ = get_dist_info() if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + mmcv.dump(outputs, args.out) + kwargs = {} if args.eval_options is None else args.eval_options + if args.format_only: + dataset.format_results(outputs, **kwargs) if args.eval: - dataset.evaluate(pre_eval_results, args.eval, **eval_args) + dataset.evaluate(outputs, args.eval, **kwargs) if __name__ == '__main__': From 9149191e38a71b1433ab34efc8a901e1c90a9304 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 30 Jul 2021 02:05:36 +0800 Subject: [PATCH 74/96] Clean code from pr #709 --- tests/test_eval_hook.py | 6 +++++- tools/deploy_test.py | 22 +++++++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/test_eval_hook.py b/tests/test_eval_hook.py index c83623de0c..394051b0ba 100644 --- a/tests/test_eval_hook.py +++ b/tests/test_eval_hook.py @@ -112,7 +112,11 @@ def 
test_epoch_eval_hook(): logger=runner.logger) -def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): +def multi_gpu_test(model, + data_loader, + tmpdir=None, + gpu_collect=False, + efficient_test=False): results = single_gpu_test(model, data_loader) return results diff --git a/tools/deploy_test.py b/tools/deploy_test.py index a3d93ca01a..51f16b4a2a 100644 --- a/tools/deploy_test.py +++ b/tools/deploy_test.py @@ -14,6 +14,7 @@ from mmseg.apis import single_gpu_test from mmseg.datasets import build_dataloader, build_dataset from mmseg.models.segmentors.base import BaseSegmentor +from mmseg.ops import resize class ONNXRuntimeSegmentor(BaseSegmentor): @@ -79,7 +80,7 @@ def simple_test(self, img: torch.Tensor, img_meta: Iterable, if not (ori_shape[0] == seg_pred.shape[-2] and ori_shape[1] == seg_pred.shape[-1]): seg_pred = torch.from_numpy(seg_pred).float() - seg_pred = torch.nn.functional.interpolate( + seg_pred = resize( seg_pred, size=tuple(ori_shape[:2]), mode='nearest') seg_pred = seg_pred.long().detach().cpu().numpy() seg_pred = seg_pred[0] @@ -127,7 +128,7 @@ def simple_test(self, img: torch.Tensor, img_meta: Iterable, if not (ori_shape[0] == seg_pred.shape[-2] and ori_shape[1] == seg_pred.shape[-1]): seg_pred = torch.from_numpy(seg_pred).float() - seg_pred = torch.nn.functional.interpolate( + seg_pred = resize( seg_pred, size=tuple(ori_shape[:2]), mode='nearest') seg_pred = seg_pred.long().detach().cpu().numpy() seg_pred = seg_pred[0] @@ -227,17 +228,24 @@ def main(): model.CLASSES = dataset.CLASSES model.PALETTE = dataset.PALETTE - eval_args = {} if args.eval_options is None else args.eval_options + efficient_test = False + if args.eval_options is not None: + efficient_test = args.eval_options.get('efficient_test', False) model = MMDataParallel(model, device_ids=[0]) - pre_eval_results = single_gpu_test(model, data_loader, args.format_only, - eval_args, args.show, args.show_dir, - args.opacity) + outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, + efficient_test, args.opacity) rank, _ = get_dist_info() if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + mmcv.dump(outputs, args.out) + kwargs = {} if args.eval_options is None else args.eval_options + if args.format_only: + dataset.format_results(outputs, **kwargs) if args.eval: - dataset.evaluate(pre_eval_results, args.eval, **eval_args) + dataset.evaluate(outputs, args.eval, **kwargs) if __name__ == '__main__': From 37b493d9cf6dce23a022a3112485231ae166c999 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 30 Jul 2021 02:12:02 +0800 Subject: [PATCH 75/96] Add 512x512 segformer_mit-b5. 
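The new config reuses the shared AdamW optimizer and poly learning-rate policy. For reference, a hedged sketch of the schedule these settings request (an approximation of mmcv's 'poly' policy with linear warmup, not the exact hook implementation):

```python
# Approximate lr curve: linear warmup for 1500 iters, then poly decay
# from base_lr to min_lr over the 160k-iter schedule.
def lr_at(it, base_lr=6e-5, max_iters=160000, power=1.0,
          min_lr=0.0, warmup_iters=1500, warmup_ratio=1e-6):
    regular = (base_lr - min_lr) * (1 - it / max_iters) ** power + min_lr
    if it < warmup_iters:
        k = (1 - it / warmup_iters) * (1 - warmup_ratio)
        return regular * (1 - k)
    return regular

assert abs(lr_at(0) - 6e-5 * 1e-6) < 1e-15  # starts near warmup_ratio * lr
assert lr_at(160000) == 0.0                 # fully decayed at the last iter
```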
--- configs/segformer/readme.md | 6 ++- .../segformer_mit-b5_512x512_160k_ade20k.py | 38 +++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md index a45f26a00c..d2c86faf5f 100644 --- a/configs/segformer/readme.md +++ b/configs/segformer/readme.md @@ -24,7 +24,8 @@ |Segformer | MIT-B2 | 512x512 | 160000 | - | - | 45.58 | 47.03 | [config]() | [model]() | [log]() | |Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | 48.81 | [config]() | [model]() | [log]() | |Segformer | MIT-B4 | 512x512 | 160000 | - | - | 48.46 | 49.76 | [config]() | [model]() | [log]() | -|Segformer | MIT-B5 | 640x640 | 160000 | - | - | 49.13 | 50.22 | [config]() | [model]() | [log]() | +|Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.13 | 50.22 | [config]() | [model]() | [log]() | +|Segformer | MIT-B5 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() | [log]() | Evaluation with AlignedResize: @@ -35,4 +36,5 @@ Evaluation with AlignedResize: |Segformer | MIT-B2 | 512x512 | 160000 | 46.53 | 47.49 | |Segformer | MIT-B3 | 512x512 | 160000 | 48.46 | 49.14 | |Segformer | MIT-B4 | 512x512 | 160000 | 49.34 | 50.29 | -|Segformer | MIT-B5 | 640x640 | 160000 | 50.08 | 50.72 | +|Segformer | MIT-B5 | 512x512 | 160000 | 50.08 | 50.72 | +|Segformer | MIT-B5 | 512x512 | 160000 | - | - | diff --git a/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py new file mode 100644 index 0000000000..e63dfef290 --- /dev/null +++ b/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +# model settings +model = dict( + pretrained='pretrain/mit_b5.pth', + backbone=dict( + embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 6, 40, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) + +# optimizer +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +# By default, models are trained on 8 GPUs with 2 images per GPU +data = dict(samples_per_gpu=2) From 5722818949e81bfcc9bc8b9704ff74642f271644 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 30 Jul 2021 02:24:17 +0800 Subject: [PATCH 76/96] Fix lint. 
--- configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py index 472263fb15..0ec383f052 100644 --- a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py @@ -7,9 +7,7 @@ model = dict( pretrained='pretrain/mit_b4.pth', backbone=dict( - embed_dims=64, - num_heads=[1, 2, 5, 8], - num_layers=[3, 8, 27, 3]), + embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 8, 27, 3]), decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) # optimizer From 85cae72e09525cd2597e4eb5e9c06318b4308b5f Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 30 Jul 2021 14:42:28 +0800 Subject: [PATCH 77/96] Fix some segformer head bugs. --- mmseg/models/decode_heads/segformer_head.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mmseg/models/decode_heads/segformer_head.py b/mmseg/models/decode_heads/segformer_head.py index 5c1f9a8733..0919d145ac 100644 --- a/mmseg/models/decode_heads/segformer_head.py +++ b/mmseg/models/decode_heads/segformer_head.py @@ -11,6 +11,9 @@ class SegformerHead(BaseDecodeHead): """The all mlp Head of segformer. + This head is the implementation of + `Segformer ` + Args: interpolate_mode: The interpolate mode of MLP head upsample operation. Default: 'bilinear'. @@ -20,9 +23,10 @@ def __init__(self, interpolate_mode='bilinear', **kwargs): super().__init__(input_transform='multiple_select', **kwargs) self.interpolate_mode = interpolate_mode - num_inputs = len(self.in_channels) + assert num_inputs == len(self.in_index) + self.convs = nn.ModuleList() for i in range(num_inputs): self.convs.append( @@ -44,7 +48,9 @@ def forward(self, inputs): # Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32 inputs = self._transform_inputs(inputs) outs = [] - for x, conv in zip(inputs, self.convs): + for idx in range(len(inputs)): + x = inputs[idx] + conv = self.convs[idx] outs.append( resize( input=conv(x), From a1fe725bd51a6823ecabe192b0df27e3ab7ea0b8 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 30 Jul 2021 14:43:25 +0800 Subject: [PATCH 78/96] Add segformer unit tests. --- .../test_heads/test_segformer_head.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 tests/test_models/test_heads/test_segformer_head.py diff --git a/tests/test_models/test_heads/test_segformer_head.py b/tests/test_models/test_heads/test_segformer_head.py new file mode 100644 index 0000000000..aa8dedb1a8 --- /dev/null +++ b/tests/test_models/test_heads/test_segformer_head.py @@ -0,0 +1,39 @@ +import pytest +import torch + +from mmseg.models.decode_heads import SegformerHead + + +def test_segformer_head(): + with pytest.raises(AssertionError): + # `in_channels` must have same length as `in_index` + SegformerHead( + in_channels=(1, 2, 3), in_index=(0, 1), channels=5, num_classes=2) + + H, W = (64, 64) + in_channels = (32, 64, 160, 256) + shapes = [(H // 2**(i + 2), W // 2**(i + 2)) + for i in range(len(in_channels))] + model = SegformerHead( + in_channels=in_channels, + in_index=[0, 1, 2, 3], + channels=256, + num_classes=19) + + with pytest.raises(IndexError): + # in_index must match the input feature maps. 
+ inputs = [ + torch.randn((1, in_channel, *shape)) + for in_channel, shape in zip(in_channels, shapes) + ][:3] + temp = model(inputs) + + # Normal Input + # ((1, 32, 16, 16), (1, 64, 8, 8), (1, 160, 4, 4), (1, 256, 2, 2) + inputs = [ + torch.randn((1, in_channel, *shape)) + for in_channel, shape in zip(in_channels, shapes) + ] + temp = model(inputs) + + assert temp.shape == (1, 19, H // 4, W // 4) From 5f8e7f8730694362dbff6fbbc0976805c346e8e6 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 30 Jul 2021 15:46:42 +0800 Subject: [PATCH 79/96] Replace AlignedResize to ResizeToMultiple. --- mmseg/datasets/pipelines/transforms.py | 234 ++++--------------------- 1 file changed, 36 insertions(+), 198 deletions(-) diff --git a/mmseg/datasets/pipelines/transforms.py b/mmseg/datasets/pipelines/transforms.py index 10c6ac0000..5f644130dd 100644 --- a/mmseg/datasets/pipelines/transforms.py +++ b/mmseg/datasets/pipelines/transforms.py @@ -7,230 +7,68 @@ @PIPELINES.register_module() -class AlignedResize(object): - """Resize images & seg. +class ResizeToMultiple(object): + """Resize images & seg to multiple of divisor. - Align + Args: + size_divisor (int): images and gt seg maps need to resize to multiple + of size_divisor. Default: 32. """ - def __init__(self, - img_scale=None, - multiscale_mode='range', - ratio_range=None, - keep_ratio=True, - size_divisor=32): - if img_scale is None: - self.img_scale = None - else: - if isinstance(img_scale, list): - self.img_scale = img_scale - else: - self.img_scale = [img_scale] - assert mmcv.is_list_of(self.img_scale, tuple) - - if ratio_range is not None: - # mode 1: given img_scale=None and a range of image ratio - # mode 2: given a scale and a range of image ratio - assert self.img_scale is None or len(self.img_scale) == 1 - else: - # mode 3 and 4: given multiple scales or a range of scales - assert multiscale_mode in ['value', 'range'] - - self.multiscale_mode = multiscale_mode - self.ratio_range = ratio_range - self.keep_ratio = keep_ratio + def __init__(self, size_divisor=32): self.size_divisor = size_divisor - @staticmethod - def random_select(img_scales): - """Randomly select an img_scale from given candidates. + def _align(self, img, interpolation=None): + """Resize input image to multiple of size divisor. Args: - img_scales (list[tuple]): Images scales for selection. + img (np.ndarray): the image which will be resize to multiple of + size divisor shapes like [H, W, C] or [H, W] + interpolation (str | optional): the interpolation mode of resize. + Default: None. Returns: - (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, - where ``img_scale`` is the selected image scale and - ``scale_idx`` is the selected index in the given candidates. + np.ndarry: image after aligned resize. """ - - assert mmcv.is_list_of(img_scales, tuple) - scale_idx = np.random.randint(len(img_scales)) - img_scale = img_scales[scale_idx] - return img_scale, scale_idx - - @staticmethod - def random_sample(img_scales): - """Randomly sample an img_scale when ``multiscale_mode=='range'``. - - Args: - img_scales (list[tuple]): Images scale range for sampling. - There must be two tuples in img_scales, which specify the lower - and uper bound of image scales. - - Returns: - (tuple, None): Returns a tuple ``(img_scale, None)``, where - ``img_scale`` is sampled scale and None is just a placeholder - to be consistent with :func:`random_select`. 
- """ - - assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 - img_scale_long = [max(s) for s in img_scales] - img_scale_short = [min(s) for s in img_scales] - long_edge = np.random.randint( - min(img_scale_long), - max(img_scale_long) + 1) - short_edge = np.random.randint( - min(img_scale_short), - max(img_scale_short) + 1) - img_scale = (long_edge, short_edge) - return img_scale, None - - @staticmethod - def random_sample_ratio(img_scale, ratio_range): - """Randomly sample an img_scale when ``ratio_range`` is specified. - - A ratio will be randomly sampled from the range specified by - ``ratio_range``. Then it would be multiplied with ``img_scale`` to - generate sampled scale. - - Args: - img_scale (tuple): Images scale base to multiply with ratio. - ratio_range (tuple[float]): The minimum and maximum ratio to scale - the ``img_scale``. - - Returns: - (tuple, None): Returns a tuple ``(scale, None)``, where - ``scale`` is sampled ratio multiplied with ``img_scale`` and - None is just a placeholder to be consistent with - :func:`random_select`. - """ - - assert isinstance(img_scale, tuple) and len(img_scale) == 2 - min_ratio, max_ratio = ratio_range - assert min_ratio <= max_ratio - ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio - scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) - return scale, None - - def _random_scale(self, results): - """Randomly sample an img_scale according to ``ratio_range`` and - ``multiscale_mode``. - - If ``ratio_range`` is specified, a ratio will be sampled and be - multiplied with ``img_scale``. - If multiple scales are specified by ``img_scale``, a scale will be - sampled according to ``multiscale_mode``. - Otherwise, single scale will be used. - - Args: - results (dict): Result dict from :obj:`dataset`. - - Returns: - dict: Two new keys 'scale` and 'scale_idx` are added into - ``results``, which would be used by subsequent pipelines. 
- """ - - if self.ratio_range is not None: - if self.img_scale is None: - h, w = results['img'].shape[:2] - scale, scale_idx = self.random_sample_ratio((w, h), - self.ratio_range) - else: - scale, scale_idx = self.random_sample_ratio( - self.img_scale[0], self.ratio_range) - elif len(self.img_scale) == 1: - scale, scale_idx = self.img_scale[0], 0 - elif self.multiscale_mode == 'range': - scale, scale_idx = self.random_sample(self.img_scale) - elif self.multiscale_mode == 'value': - scale, scale_idx = self.random_select(self.img_scale) - else: - raise NotImplementedError - - results['scale'] = scale - results['scale_idx'] = scale_idx - - def _align(self, img, size_divisor, interpolation=None): - align_h = int(np.ceil(img.shape[0] / size_divisor)) * size_divisor - align_w = int(np.ceil(img.shape[1] / size_divisor)) * size_divisor - if interpolation is None: - img = mmcv.imresize(img, (align_w, align_h)) - else: - img = mmcv.imresize( - img, (align_w, align_h), interpolation=interpolation) - return img - - def _resize_img(self, results): - """Resize images with ``results['scale']``.""" - if self.keep_ratio: - img, scale_factor = mmcv.imrescale( - results['img'], results['scale'], return_scale=True) - # align # - img = self._align(img, self.size_divisor) - # the w_scale and h_scale has minor difference - # a real fix should be done in the mmcv.imrescale in the future - new_h, new_w = img.shape[:2] - h, w = results['img'].shape[:2] - w_scale = new_w / w - h_scale = new_h / h - else: - img, w_scale, h_scale = mmcv.imresize( - results['img'], results['scale'], return_scale=True) - - h, w = img.shape[:2] - sd = self.size_divisor - assert h % sd == 0 and w % sd == 0, \ - 'img size not align. h:{} w:{}'.format(h, w) - scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], - dtype=np.float32) + align_h = int(np.ceil( + img.shape[0] / self.size_divisor)) * self.size_divisor + align_w = int(np.ceil( + img.shape[1] / self.size_divisor)) * self.size_divisor + return mmcv.imresize(img, (align_w, align_h), + interpolation if interpolation else 'bilinear') + + def _align_img(self, results): + """Align image to multiple of size divisor.""" + img = results['img'] + img = self._align(img) results['img'] = img results['img_shape'] = img.shape - results['pad_shape'] = img.shape # in case that there is no padding - results['scale_factor'] = scale_factor - results['keep_ratio'] = self.keep_ratio + results['pad_shape'] = img.shape + return img - def _resize_seg(self, results): - """Resize semantic segmentation map with ``results['scale']``.""" + def _align_seg(self, results): + """Align segmentation map to multiple of size divisor.""" for key in results.get('seg_fields', []): - if self.keep_ratio: - gt_seg = mmcv.imrescale( - results[key], results['scale'], interpolation='nearest') - gt_seg = self._align( - gt_seg, self.size_divisor, interpolation='nearest') - else: - gt_seg = mmcv.imresize( - results[key], results['scale'], interpolation='nearest') - h, w = gt_seg.shape[:2] - sd = self.size_divisor - assert h % sd == 0 and w % sd == 0, \ - 'gt_seg size not align. h:{} w:{}'.format(h, w) - results[key] = gt_seg + gt_seg = results[key] + gt_seg = self._align(gt_seg, 'nearest') def __call__(self, results): - """Call function to resize images, bounding boxes, masks, semantic - segmentation map. + """Call function to resize images, semantic segmentation map to + multiple of size divisor. Args: results (dict): Result dict from loading pipeline. 
Returns: - dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', - 'keep_ratio' keys are added into result dict. + dict: Resized results, 'img_shape', 'pad_shape' keys are updated. """ - - if 'scale' not in results: - self._random_scale(results) - self._resize_img(results) - self._resize_seg(results) + self._align_img(results) + self._align_seg(results) return results def __repr__(self): repr_str = self.__class__.__name__ - repr_str += (f'(img_scale={self.img_scale}, ' - f'multiscale_mode={self.multiscale_mode}, ' - f'ratio_range={self.ratio_range}, ' - f'keep_ratio={self.keep_ratio})') + repr_str += (f'(size_divisor={self.size_divisor})') return repr_str From 09c1796eb93565f078b4e69d631bcfcabf165a65 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 30 Jul 2021 15:50:05 +0800 Subject: [PATCH 80/96] Modify readme of segformer. --- configs/segformer/readme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md index d2c86faf5f..3ea9e8b46c 100644 --- a/configs/segformer/readme.md +++ b/configs/segformer/readme.md @@ -25,7 +25,7 @@ |Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | 48.81 | [config]() | [model]() | [log]() | |Segformer | MIT-B4 | 512x512 | 160000 | - | - | 48.46 | 49.76 | [config]() | [model]() | [log]() | |Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.13 | 50.22 | [config]() | [model]() | [log]() | -|Segformer | MIT-B5 | 512x512 | 160000 | - | - | - | - | [config]() | [model]() | [log]() | +|Segformer | MIT-B5 | 640x640 | 160000 | - | - | - | - | [config]() | [model]() | [log]() | Evaluation with AlignedResize: @@ -37,4 +37,4 @@ Evaluation with AlignedResize: |Segformer | MIT-B3 | 512x512 | 160000 | 48.46 | 49.14 | |Segformer | MIT-B4 | 512x512 | 160000 | 49.34 | 50.29 | |Segformer | MIT-B5 | 512x512 | 160000 | 50.08 | 50.72 | -|Segformer | MIT-B5 | 512x512 | 160000 | - | - | +|Segformer | MIT-B5 | 640x640 | 160000 | - | - | From b9af6cfe92b8605ceb2d9a2ab7574cfacacf4cfa Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Fri, 30 Jul 2021 16:35:55 +0800 Subject: [PATCH 81/96] Fix bug of ResizeToMultiple. --- configs/segformer/readme.md | 22 ++++++++++++++++++++++ mmseg/datasets/pipelines/transforms.py | 5 +++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md index 3ea9e8b46c..61de9af160 100644 --- a/configs/segformer/readme.md +++ b/configs/segformer/readme.md @@ -38,3 +38,25 @@ Evaluation with AlignedResize: |Segformer | MIT-B4 | 512x512 | 160000 | 49.34 | 50.29 | |Segformer | MIT-B5 | 512x512 | 160000 | 50.08 | 50.72 | |Segformer | MIT-B5 | 640x640 | 160000 | - | - | + +We replace `AlignedResize` to `Resize + ResizeToMultiple`. 
If you want to test by +using `AlignedResize`, you can change the dataset pipeline like this: + +```python +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 512), + # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='ResizeToMultiple', size_divisor=32), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +``` diff --git a/mmseg/datasets/pipelines/transforms.py b/mmseg/datasets/pipelines/transforms.py index 5f644130dd..f1bd17e679 100644 --- a/mmseg/datasets/pipelines/transforms.py +++ b/mmseg/datasets/pipelines/transforms.py @@ -34,8 +34,9 @@ def _align(self, img, interpolation=None): img.shape[0] / self.size_divisor)) * self.size_divisor align_w = int(np.ceil( img.shape[1] / self.size_divisor)) * self.size_divisor - return mmcv.imresize(img, (align_w, align_h), - interpolation if interpolation else 'bilinear') + return mmcv.imresize( + img, (align_w, align_h), + interpolation=interpolation if interpolation else 'bilinear') def _align_img(self, results): """Align image to multiple of size divisor.""" From 10c2d7ea48a56270f35c76cf4f851791ed842da4 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Sat, 31 Jul 2021 00:20:40 +0800 Subject: [PATCH 82/96] Add ResizeToMultiple unit tests. --- mmseg/datasets/pipelines/transforms.py | 1 + tests/test_data/test_transform.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/mmseg/datasets/pipelines/transforms.py b/mmseg/datasets/pipelines/transforms.py index f1bd17e679..91821f11a3 100644 --- a/mmseg/datasets/pipelines/transforms.py +++ b/mmseg/datasets/pipelines/transforms.py @@ -52,6 +52,7 @@ def _align_seg(self, results): for key in results.get('seg_fields', []): gt_seg = results[key] gt_seg = self._align(gt_seg, 'nearest') + results[key] = gt_seg def __call__(self, results): """Call function to resize images, semantic segmentation map to diff --git a/tests/test_data/test_transform.py b/tests/test_data/test_transform.py index a6417575c3..658606a822 100644 --- a/tests/test_data/test_transform.py +++ b/tests/test_data/test_transform.py @@ -10,6 +10,26 @@ from mmseg.datasets.builder import PIPELINES +def test_align(): + transform = dict(type='ResizeToMultiple', size_divisor=32) + transform = build_from_cfg(transform, PIPELINES) + + img = np.random.randn(213, 232, 3) + seg = np.random.randint(0, 19, (213, 232)) + results = dict() + results['img'] = img + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + results['img_shape'] = img.shape + results['pad_shape'] = img.shape + + results = transform(results) + assert results['img'].shape == (224, 256, 3) + assert results['gt_semantic_seg'].shape == (224, 256) + assert results['img_shape'] == (224, 256, 3) + assert results['pad_shape'] == (224, 256, 3) + + def test_resize(): # test assertion if img_scale is a list with pytest.raises(AssertionError): From cbbda184a45a1b5f7eccacac1e4ed6a5166e4d5f Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Wed, 4 Aug 2021 17:03:12 +0800 Subject: [PATCH 83/96] Resolve conflict. 
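The resolution keeps `ResizeToMultiple` in the 640x640 test pipeline right after the `Resize` step. A sketch of what the resulting pair does to one image, assumed to reproduce the old one-step `AlignedResize` behaviour when `keep_ratio=True`:

```python
import mmcv
import numpy as np

img = np.zeros((333, 512, 3), dtype=np.uint8)
# 'Resize' with keep_ratio=True: rescale so the image fits within (2048, 640).
img = mmcv.imrescale(img, (2048, 640))    # roughly (640, 984, 3)
# 'ResizeToMultiple': round both sides up to a multiple of 32.
img = mmcv.imresize_to_multiple(img, 32)  # roughly (640, 992, 3)
assert img.shape[0] % 32 == 0 and img.shape[1] % 32 == 0
```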
---
 configs/segformer/readme.md                               | 2 +-
 configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py | 8 +++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index 61de9af160..95225a6867 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -25,7 +25,7 @@
 |Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | 48.81 | [config]() | [model]() | [log]() |
 |Segformer | MIT-B4 | 512x512 | 160000 | - | - | 48.46 | 49.76 | [config]() | [model]() | [log]() |
 |Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.13 | 50.22 | [config]() | [model]() | [log]() |
-|Segformer | MIT-B5 | 640x640 | 160000 | - | - | - | - | [config]() | [model]() | [log]() |
+|Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.62 | - | [config]() | [model]() | [log]() |
 
 Evaluation with AlignedResize:
 
diff --git a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py
index 47183d4ea2..f2b6d22191 100644
--- a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py
+++ b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py
@@ -30,6 +30,7 @@
     flip=False,
     transforms=[
         dict(type='Resize', keep_ratio=True),
+        dict(type='ResizeToMultiple', size_divisor=32),
         dict(type='RandomFlip'),
         dict(type='Normalize', **img_norm_cfg),
         dict(type='ImageToTensor', keys=['img']),
@@ -37,8 +38,8 @@
     ])
 ]
 data = dict(
-    samples_per_gpu=4,
-    workers_per_gpu=4,
+    samples_per_gpu=2,
+    workers_per_gpu=2,
     train=dict(
         type=dataset_type,
         data_root=data_root,
@@ -88,6 +89,3 @@
     power=1.0,
     min_lr=0.0,
     by_epoch=False)
-
-# By default, models are trained on 8 GPUs with 2 images per GPU
-data = dict(samples_per_gpu=2)

From 28b35b669669346594620b5e8ec1f77697751f31 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Mon, 9 Aug 2021 11:14:38 +0800
Subject: [PATCH 84/96] Simplify the implementation of ResizeToMultiple.

---
 configs/segformer/readme.md            |  3 +-
 mmseg/datasets/pipelines/transforms.py | 57 ++++++++++----------------
 2 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index 95225a6867..0280da01f7 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -39,7 +39,7 @@ Evaluation with AlignedResize:
 |Segformer | MIT-B5 | 512x512 | 160000 | 50.08 | 50.72 |
 |Segformer | MIT-B5 | 640x640 | 160000 | - | - |
 
-We replace `AlignedResize` with `Resize + ResizeToMultiple`. If you want to test by
+We replace `AlignedResize` in the original implementation with `Resize + ResizeToMultiple`. If you want to test by
 using `AlignedResize`, you can change the dataset pipeline like this:
 
 ```python
@@ -52,6 +52,7 @@ test_pipeline = [
     flip=False,
     transforms=[
         dict(type='Resize', keep_ratio=True),
+        # resize image to multiple of 32, improve SegFormer by xx mIoU.
         dict(type='ResizeToMultiple', size_divisor=32),
         dict(type='RandomFlip'),
         dict(type='Normalize', **img_norm_cfg),
diff --git a/mmseg/datasets/pipelines/transforms.py b/mmseg/datasets/pipelines/transforms.py
index 91821f11a3..50b11ee6e7 100644
--- a/mmseg/datasets/pipelines/transforms.py
+++ b/mmseg/datasets/pipelines/transforms.py
@@ -13,64 +13,49 @@ class ResizeToMultiple(object):
     Args:
         size_divisor (int): images and gt seg maps need to resize to
             multiple of size_divisor. Default: 32.
+        interpolation (str, optional): The interpolation mode of image resize.
+            Default: None
     """
 
-    def __init__(self, size_divisor=32):
+    def __init__(self, size_divisor=32, interpolation=None):
         self.size_divisor = size_divisor
+        self.interpolation = interpolation
 
-    def _align(self, img, interpolation=None):
-        """Resize input image to multiple of size divisor.
+    def __call__(self, results):
+        """Call function to resize images, semantic segmentation map to
+        multiple of size divisor.
 
         Args:
-            img (np.ndarray): the image which will be resize to multiple of
-                size divisor shapes like [H, W, C] or [H, W]
-            interpolation (str | optional): the interpolation mode of resize.
-                Default: None.
+            results (dict): Result dict from loading pipeline.
 
         Returns:
-            np.ndarry: image after aligned resize.
+            dict: Resized results, 'img_shape', 'pad_shape' keys are updated.
         """
-        align_h = int(np.ceil(
-            img.shape[0] / self.size_divisor)) * self.size_divisor
-        align_w = int(np.ceil(
-            img.shape[1] / self.size_divisor)) * self.size_divisor
-        return mmcv.imresize(
-            img, (align_w, align_h),
-            interpolation=interpolation if interpolation else 'bilinear')
-
-    def _align_img(self, results):
-        """Align image to multiple of size divisor."""
+        # Align image to multiple of size divisor.
         img = results['img']
-        img = self._align(img)
+        img = mmcv.imresize_to_multiple(
+            img,
+            self.size_divisor,
+            interpolation=self.interpolation
+            if self.interpolation else 'bilinear')
+
+        results['img'] = img
         results['img_shape'] = img.shape
         results['pad_shape'] = img.shape
-        return img
 
-    def _align_seg(self, results):
-        """Align segmentation map to multiple of size divisor."""
+        # Align segmentation map to multiple of size divisor.
         for key in results.get('seg_fields', []):
             gt_seg = results[key]
-            gt_seg = self._align(gt_seg, 'nearest')
+            gt_seg = mmcv.imresize_to_multiple(
+                gt_seg, self.size_divisor, interpolation='nearest')
             results[key] = gt_seg
 
-    def __call__(self, results):
-        """Call function to resize images, semantic segmentation map to
-        multiple of size divisor.
-
-        Args:
-            results (dict): Result dict from loading pipeline.
-
-        Returns:
-            dict: Resized results, 'img_shape', 'pad_shape' keys are updated.
-        """
-        self._align_img(results)
-        self._align_seg(results)
         return results
 
     def __repr__(self):
         repr_str = self.__class__.__name__
-        repr_str += (f'(size_divisor={self.size_divisor})')
+        repr_str += (f'(size_divisor={self.size_divisor}, '
+                     f'interpolation={self.interpolation})')
         return repr_str

From 73e09d22129352f438fed75c861a5b3409af9270 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Mon, 9 Aug 2021 11:21:33 +0800
Subject: [PATCH 85/96] Update test results.

---
 configs/segformer/readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index 0280da01f7..22dcfdb0ea 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -37,7 +37,7 @@ Evaluation with AlignedResize:
 |Segformer | MIT-B3 | 512x512 | 160000 | 48.46 | 49.14 |
 |Segformer | MIT-B4 | 512x512 | 160000 | 49.34 | 50.29 |
 |Segformer | MIT-B5 | 512x512 | 160000 | 50.08 | 50.72 |
-|Segformer | MIT-B5 | 640x640 | 160000 | - | - |
+|Segformer | MIT-B5 | 640x640 | 160000 | 50.58 | - |
 
 We replace `AlignedResize` in the original implementation with `Resize + ResizeToMultiple`.
If you want to test by
using `AlignedResize`, you can change the dataset pipeline like this:

From e44ad02647033ffffe051e147807d3909cefcb2c Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Mon, 9 Aug 2021 15:23:15 +0800
Subject: [PATCH 86/96] Fix multi-scale test error when resize_ratio=1.75 and input size=640x640.

---
 mmseg/models/backbones/mit.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py
index cad0b43134..d073444664 100644
--- a/mmseg/models/backbones/mit.py
+++ b/mmseg/models/backbones/mit.py
@@ -159,7 +159,13 @@ def forward(self, x, hw_shape, identity=None):
 
         if identity is None:
             identity = x_q
-        out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
+        # With `need_weights=True`, nn.MultiheadAttention returns
+        # `attn_output, attn_output_weights.sum(dim=1) / num_heads`.
+        # The `attn_output_weights.sum(dim=1)` may cause a cuda error, so we
+        # set `need_weights=False` to skip `attn_output_weights.sum(dim=1)`.
+        # https://github.com/pytorch/pytorch/issues/37583 reports that
+        # large-scale tensor sum operations may cause cuda errors.
+        out = self.attn(query=x_q, key=x_kv, value=x_kv, need_weights=False)[0]
         return identity + self.dropout_layer(self.proj_drop(out))

From 029ceb46868a500b8d17f4eb29a1c5e8764ecf94 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Mon, 9 Aug 2021 17:00:00 +0800
Subject: [PATCH 87/96] Update segformer results.

---
 configs/segformer/readme.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index 22dcfdb0ea..faca0ab3ad 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -25,7 +25,7 @@
 |Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | 48.81 | [config]() | [model]() | [log]() |
 |Segformer | MIT-B4 | 512x512 | 160000 | - | - | 48.46 | 49.76 | [config]() | [model]() | [log]() |
 |Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.13 | 50.22 | [config]() | [model]() | [log]() |
-|Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.62 | - | [config]() | [model]() | [log]() |
+|Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.62 | 50.36 | [config]() | [model]() | [log]() |
 
 Evaluation with AlignedResize:
 
@@ -37,7 +37,7 @@ Evaluation with AlignedResize:
 |Segformer | MIT-B3 | 512x512 | 160000 | 48.46 | 49.14 |
 |Segformer | MIT-B4 | 512x512 | 160000 | 49.34 | 50.29 |
 |Segformer | MIT-B5 | 512x512 | 160000 | 50.08 | 50.72 |
-|Segformer | MIT-B5 | 640x640 | 160000 | 50.58 | - |
+|Segformer | MIT-B5 | 640x640 | 160000 | 50.58 | 50.80 |

From dba7167047f9e292528f60f853675bec6db1af6d Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Mon, 9 Aug 2021 19:42:48 +0800
Subject: [PATCH 88/96] Update Segformer results.
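For context on the `need_weights=False` change in the attention fix above: with the default `need_weights=True`, `nn.MultiheadAttention` also computes and returns the attention weights averaged over heads, and that reduction is the large tensor sum that triggers the CUDA error. A minimal sketch of the behavior difference (the sizes here are illustrative, not the SegFormer shapes):

```python
import torch
import torch.nn as nn

attn = nn.MultiheadAttention(embed_dim=64, num_heads=1)
x = torch.rand(4096, 2, 64)  # (seq_len, batch, embed_dim); long sequences
                             # are where the weight reduction gets expensive

# need_weights=False skips the averaged attention-weight output entirely.
out, weights = attn(x, x, x, need_weights=False)
assert weights is None  # the problematic sum over attention weights is skipped
```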
--- configs/segformer/readme.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md index faca0ab3ad..e990dac025 100644 --- a/configs/segformer/readme.md +++ b/configs/segformer/readme.md @@ -19,13 +19,13 @@ | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ------: | -------: | -------------- | ---: | ------------- | ------ | -------- | -|Segformer | MIT-B0 | 512x512 | 160000 | - | - | 37.41 | 38.34 | [config]() | [model]() | [log]() | -|Segformer | MIT-B1 | 512x512 | 160000 | - | - | 40.97 | 42.54 | [config]() | [model]() | [log]() | -|Segformer | MIT-B2 | 512x512 | 160000 | - | - | 45.58 | 47.03 | [config]() | [model]() | [log]() | -|Segformer | MIT-B3 | 512x512 | 160000 | - | - | 47.82 | 48.81 | [config]() | [model]() | [log]() | -|Segformer | MIT-B4 | 512x512 | 160000 | - | - | 48.46 | 49.76 | [config]() | [model]() | [log]() | -|Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.13 | 50.22 | [config]() | [model]() | [log]() | -|Segformer | MIT-B5 | 512x512 | 160000 | - | - | 49.62 | 50.36 | [config]() | [model]() | [log]() | +|Segformer | MIT-B0 | 512x512 | 160000 | 2.1 | 51.32 | 37.41 | 38.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20ksegformer_mit-b0_512x512_160k_ade20k_20210726_101530-8ffa8fda.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20ksegformer_mit-b0_512x512_160k_ade20k_20210726_101530.log.json) | +|Segformer | MIT-B1 | 512x512 | 160000 | 2.6 | 47.66 | 40.97 | 42.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20ksegformer_mit-b1_512x512_160k_ade20k_20210726_112106-d70e859d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20ksegformer_mit-b1_512x512_160k_ade20k_20210726_112106.log.json) | +|Segformer | MIT-B2 | 512x512 | 160000 | 3.6 | 30.88 | 45.58 | 47.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20ksegformer_mit-b2_512x512_160k_ade20k_20210726_112103-cbd414ac.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20ksegformer_mit-b2_512x512_160k_ade20k_20210726_112103.log.json) | +|Segformer | MIT-B3 | 512x512 | 160000 | 4.8 | 22.11 | 47.82 | 48.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20ksegformer_mit-b3_512x512_160k_ade20k_20210726_081410-962b98d2.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20ksegformer_mit-b3_512x512_160k_ade20k_20210726_081410.log.json) | +|Segformer | MIT-B4 | 512x512 | 160000 | 6.1 | 15.45 | 48.46 | 49.76 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20ksegformer_mit-b4_512x512_160k_ade20k_20210728_183055-7f509d7d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20ksegformer_mit-b4_512x512_160k_ade20k_20210728_183055.log.json) | +|Segformer | MIT-B5 | 512x512 | 160000 | 7.2 | 11.89 | 49.13 | 50.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20ksegformer_mit-b5_512x512_160k_ade20k_20210726_145235-94cedf59.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20ksegformer_mit-b5_512x512_160k_ade20k_20210726_145235.log.json) | +|Segformer | MIT-B5 | 640x640 | 160000 | 11.5 | 11.30 | 49.62 | 50.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20ksegformer_mit-b5_640x640_160k_ade20k_20210801_121243-41d2845b.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20ksegformer_mit-b5_640x640_160k_ade20k_20210801_121243.log.json) | Evaluation with AlignedResize: From 08df9539d4f391d9261934d69160efdf06e67939 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Tue, 10 Aug 2021 13:37:03 +0800 Subject: [PATCH 89/96] Fix some url bugs and pipelines bug. 
--- configs/segformer/readme.md | 14 +++++++------- mmseg/datasets/pipelines/transforms.py | 6 +++++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md index e990dac025..0bfd5097b5 100644 --- a/configs/segformer/readme.md +++ b/configs/segformer/readme.md @@ -19,13 +19,13 @@ | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ------: | -------: | -------------- | ---: | ------------- | ------ | -------- | -|Segformer | MIT-B0 | 512x512 | 160000 | 2.1 | 51.32 | 37.41 | 38.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20ksegformer_mit-b0_512x512_160k_ade20k_20210726_101530-8ffa8fda.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20ksegformer_mit-b0_512x512_160k_ade20k_20210726_101530.log.json) | -|Segformer | MIT-B1 | 512x512 | 160000 | 2.6 | 47.66 | 40.97 | 42.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20ksegformer_mit-b1_512x512_160k_ade20k_20210726_112106-d70e859d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20ksegformer_mit-b1_512x512_160k_ade20k_20210726_112106.log.json) | -|Segformer | MIT-B2 | 512x512 | 160000 | 3.6 | 30.88 | 45.58 | 47.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20ksegformer_mit-b2_512x512_160k_ade20k_20210726_112103-cbd414ac.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20ksegformer_mit-b2_512x512_160k_ade20k_20210726_112103.log.json) | -|Segformer | MIT-B3 | 512x512 | 160000 | 4.8 | 22.11 | 47.82 | 48.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20ksegformer_mit-b3_512x512_160k_ade20k_20210726_081410-962b98d2.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20ksegformer_mit-b3_512x512_160k_ade20k_20210726_081410.log.json) | -|Segformer | MIT-B4 | 512x512 | 160000 | 6.1 | 15.45 | 48.46 | 49.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20ksegformer_mit-b4_512x512_160k_ade20k_20210728_183055-7f509d7d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20ksegformer_mit-b4_512x512_160k_ade20k_20210728_183055.log.json) | -|Segformer | MIT-B5 | 512x512 | 160000 | 7.2 | 11.89 | 49.13 | 50.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20ksegformer_mit-b5_512x512_160k_ade20k_20210726_145235-94cedf59.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20ksegformer_mit-b5_512x512_160k_ade20k_20210726_145235.log.json) | -|Segformer | MIT-B5 | 640x640 | 160000 | 11.5 | 11.30 | 49.62 | 50.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20ksegformer_mit-b5_640x640_160k_ade20k_20210801_121243-41d2845b.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20ksegformer_mit-b5_640x640_160k_ade20k_20210801_121243.log.json) | +|Segformer | MIT-B0 | 512x512 | 160000 | 2.1 | 51.32 | 37.41 | 38.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20k/segformer_mit-b0_512x512_160k_ade20k_20210726_101530-8ffa8fda.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20k/segformer_mit-b0_512x512_160k_ade20k_20210726_101530.log.json) | +|Segformer | MIT-B1 | 512x512 | 160000 | 2.6 | 47.66 | 40.97 | 42.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20k/segformer_mit-b1_512x512_160k_ade20k_20210726_112106-d70e859d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20k/segformer_mit-b1_512x512_160k_ade20k_20210726_112106.log.json) | +|Segformer | MIT-B2 | 512x512 | 160000 | 3.6 | 30.88 | 45.58 | 47.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20k/segformer_mit-b2_512x512_160k_ade20k_20210726_112103-cbd414ac.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20k/segformer_mit-b2_512x512_160k_ade20k_20210726_112103.log.json) | +|Segformer | MIT-B3 | 512x512 | 160000 | 4.8 | 22.11 | 47.82 | 48.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20k/segformer_mit-b3_512x512_160k_ade20k_20210726_081410-962b98d2.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20k/segformer_mit-b3_512x512_160k_ade20k_20210726_081410.log.json) | +|Segformer | MIT-B4 | 512x512 | 160000 | 6.1 | 15.45 | 48.46 | 49.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20k/segformer_mit-b4_512x512_160k_ade20k_20210728_183055-7f509d7d.pth) | 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20k/segformer_mit-b4_512x512_160k_ade20k_20210728_183055.log.json) |
+|Segformer | MIT-B5 | 512x512 | 160000 | 7.2 | 11.89 | 49.13 | 50.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20k/segformer_mit-b5_512x512_160k_ade20k_20210726_145235-94cedf59.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20k/segformer_mit-b5_512x512_160k_ade20k_20210726_145235.log.json) |
+|Segformer | MIT-B5 | 640x640 | 160000 | 11.5 | 11.30 | 49.62 | 50.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20k/segformer_mit-b5_640x640_160k_ade20k_20210801_121243-41d2845b.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20k/segformer_mit-b5_640x640_160k_ade20k_20210801_121243.log.json) |
 
 Evaluation with AlignedResize:
 
diff --git a/mmseg/datasets/pipelines/transforms.py b/mmseg/datasets/pipelines/transforms.py
index 50b11ee6e7..c5e94a0f14 100644
--- a/mmseg/datasets/pipelines/transforms.py
+++ b/mmseg/datasets/pipelines/transforms.py
@@ -36,6 +36,7 @@ def __call__(self, results):
         img = mmcv.imresize_to_multiple(
             img,
             self.size_divisor,
+            scale_factor=1,
             interpolation=self.interpolation
             if self.interpolation else 'bilinear')
 
@@ -47,7 +48,10 @@ def __call__(self, results):
         for key in results.get('seg_fields', []):
             gt_seg = results[key]
             gt_seg = mmcv.imresize_to_multiple(
-                gt_seg, self.size_divisor, interpolation='nearest')
+                gt_seg,
+                self.size_divisor,
+                scale_factor=1,
+                interpolation='nearest')
             results[key] = gt_seg
 
         return results

From 827359420abd05ff9804ad4252c15a855a8217f4 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 11 Aug 2021 01:21:59 +0800
Subject: [PATCH 90/96] Move ckpt conversion to tools.

---
 configs/segformer/readme.md           |  4 +-
 mmseg/models/backbones/mit.py         |  8 +--
 mmseg/models/utils/__init__.py        |  4 +-
 mmseg/models/utils/ckpt_convert.py    | 49 ----------------
 tools/model_converters/mit_convert.py | 81 +++++++++++++++++++++++++++
 5 files changed, 87 insertions(+), 59 deletions(-)
 create mode 100644 tools/model_converters/mit_convert.py

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index 0bfd5097b5..7c124c3661 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -52,7 +52,7 @@ test_pipeline = [
     flip=False,
     transforms=[
         dict(type='Resize', keep_ratio=True),
-        # resize image to multiple of 32, improve SegFormer by xx mIoU.
+        # resize image to multiple of 32, improve SegFormer by 0.5-1.0 mIoU.
         dict(type='ResizeToMultiple', size_divisor=32),
         dict(type='RandomFlip'),
         dict(type='Normalize', **img_norm_cfg),
@@ -61,3 +61,5 @@ test_pipeline = [
     ])
 ]
 ```
+
+Because MixVisionTransformer is not supported by mmcls, we can only get backbone weights from the official repo. We provide a convert script, `tools/model_converters/mit_convert.py`, to convert the official weights. After converting, you need to modify `pretrained` of the model config.
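A quick illustration of the `scale_factor=1` fix in the transforms hunk above. This sketch assumes mmcv's `imresize_to_multiple` needs an explicit resize target; `scale_factor=1` keeps the input size as that target, so the only effect is rounding each side up to the divisor:

```python
import numpy as np
import mmcv

img = np.zeros((213, 232, 3), dtype=np.uint8)
# scale_factor=1 keeps the original size as the target; the output is then
# rounded up to the nearest multiple of the divisor on each side.
out = mmcv.imresize_to_multiple(
    img, divisor=32, scale_factor=1, interpolation='bilinear')
assert out.shape[:2] == (224, 256)
```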
diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index d073444664..6e89437722 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -11,7 +11,7 @@ from ...utils import get_root_logger from ..builder import BACKBONES -from ..utils import PatchEmbed, mit_convert, nchw_to_nlc, nlc_to_nchw +from ..utils import PatchEmbed, nchw_to_nlc, nlc_to_nchw class MixFFN(BaseModule): @@ -398,12 +398,6 @@ def init_weights(self): else: state_dict = checkpoint - if self.pretrain_style == 'official': - # Because segformer backbone is not support by mmcls, - # so we need to convert pretrain weights to match this - # implementation. - state_dict = mit_convert(state_dict) - self.load_state_dict(state_dict, False) def forward(self, x): diff --git a/mmseg/models/utils/__init__.py b/mmseg/models/utils/__init__.py index 32a953b834..6ef12bb9ba 100644 --- a/mmseg/models/utils/__init__.py +++ b/mmseg/models/utils/__init__.py @@ -1,4 +1,4 @@ -from .ckpt_convert import mit_convert, swin_convert, vit_convert +from .ckpt_convert import swin_convert, vit_convert from .embed import PatchEmbed from .inverted_residual import InvertedResidual, InvertedResidualV3 from .make_divisible import make_divisible @@ -11,5 +11,5 @@ __all__ = [ 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'vit_convert', - 'mit_convert', 'swin_convert', 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw' + 'swin_convert', 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw' ] diff --git a/mmseg/models/utils/ckpt_convert.py b/mmseg/models/utils/ckpt_convert.py index 26a1b96df9..0b1b27707d 100644 --- a/mmseg/models/utils/ckpt_convert.py +++ b/mmseg/models/utils/ckpt_convert.py @@ -1,7 +1,5 @@ from collections import OrderedDict -import torch - def swin_convert(ckpt): new_ckpt = OrderedDict() @@ -90,50 +88,3 @@ def vit_convert(ckpt): new_ckpt[new_k] = v return new_ckpt - - -def mit_convert(ckpt): - new_ckpt = OrderedDict() - # Process the concat between q linear weights and kv linear weights - for k, v in ckpt.items(): - if k.startswith('head'): - continue - elif k.startswith('patch_embed'): - stage_i = int(k.split('.')[0].replace('patch_embed', '')) - new_k = k.replace(f'patch_embed{stage_i}', f'layers.{stage_i-1}.0') - new_v = v - if 'proj.' in new_k: - new_k = new_k.replace('proj.', 'projection.') - elif k.startswith('block'): - stage_i = int(k.split('.')[0].replace('block', '')) - new_k = k.replace(f'block{stage_i}', f'layers.{stage_i-1}.1') - new_v = v - if 'attn.q.' in new_k: - sub_item_k = k.replace('q.', 'kv.') - new_k = new_k.replace('q.', 'attn.in_proj_') - new_v = torch.cat([v, ckpt[sub_item_k]], dim=0) - elif 'attn.kv.' in new_k: - continue - elif 'attn.proj.' in new_k: - new_k = new_k.replace('proj.', 'attn.out_proj.') - elif 'attn.sr.' in new_k: - new_k = new_k.replace('sr.', 'sr.') - elif 'mlp.' 
in new_k: - string = f'{new_k}-' - new_k = new_k.replace('mlp.', 'ffn.layers.') - if 'fc1.weight' in new_k or 'fc2.weight' in new_k: - new_v = v.reshape((*v.shape, 1, 1)) - new_k = new_k.replace('fc1.', '0.') - new_k = new_k.replace('dwconv.dwconv.', '1.') - new_k = new_k.replace('fc2.', '4.') - string += f'{new_k} {v.shape}-{new_v.shape}' - # print(string) - elif k.startswith('norm'): - stage_i = int(k.split('.')[0].replace('norm', '')) - new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i-1}.2') - new_v = v - else: - new_k = k - new_v = v - new_ckpt[new_k] = new_v - return new_ckpt diff --git a/tools/model_converters/mit_convert.py b/tools/model_converters/mit_convert.py new file mode 100644 index 0000000000..04a943313d --- /dev/null +++ b/tools/model_converters/mit_convert.py @@ -0,0 +1,81 @@ +import argparse +from collections import OrderedDict + +import torch + + +def mit_convert(ckpt): + new_ckpt = OrderedDict() + # Process the concat between q linear weights and kv linear weights + for k, v in ckpt.items(): + if k.startswith('head'): + continue + # patch embedding convertion + elif k.startswith('patch_embed'): + stage_i = int(k.split('.')[0].replace('patch_embed', '')) + new_k = k.replace(f'patch_embed{stage_i}', f'layers.{stage_i-1}.0') + new_v = v + if 'proj.' in new_k: + new_k = new_k.replace('proj.', 'projection.') + # transformer encoder layer convertion + elif k.startswith('block'): + stage_i = int(k.split('.')[0].replace('block', '')) + new_k = k.replace(f'block{stage_i}', f'layers.{stage_i-1}.1') + new_v = v + if 'attn.q.' in new_k: + sub_item_k = k.replace('q.', 'kv.') + new_k = new_k.replace('q.', 'attn.in_proj_') + new_v = torch.cat([v, ckpt[sub_item_k]], dim=0) + elif 'attn.kv.' in new_k: + continue + elif 'attn.proj.' in new_k: + new_k = new_k.replace('proj.', 'attn.out_proj.') + elif 'attn.sr.' in new_k: + new_k = new_k.replace('sr.', 'sr.') + elif 'mlp.' in new_k: + string = f'{new_k}-' + new_k = new_k.replace('mlp.', 'ffn.layers.') + if 'fc1.weight' in new_k or 'fc2.weight' in new_k: + new_v = v.reshape((*v.shape, 1, 1)) + new_k = new_k.replace('fc1.', '0.') + new_k = new_k.replace('dwconv.dwconv.', '1.') + new_k = new_k.replace('fc2.', '4.') + string += f'{new_k} {v.shape}-{new_v.shape}' + # norm layer convertion + elif k.startswith('norm'): + stage_i = int(k.split('.')[0].replace('norm', '')) + new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i-1}.2') + new_v = v + else: + new_k = k + new_v = v + new_ckpt[new_k] = new_v + return new_ckpt + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + 'src', help='Source path of official segformer backbone weights.') + parser.add_argument( + 'dst', + help='Destination path of converted segformer backbone weights.') + + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + src_path = args.src + dst_path = args.dst + + ckpt = torch.load(src_path) + if 'state_dict' in ckpt: + ckpt = ckpt['state_dict'] + elif 'model' in ckpt: + ckpt = ckpt['model'] + else: + ckpt = ckpt + + ckpt = mit_convert(ckpt) + torch.save(ckpt, dst_path) From a40205ef8b2ce2d1fc4c18a05e3ab92f9476a4d4 Mon Sep 17 00:00:00 2001 From: sennnnn <201730271412@mail.scut.edu.cn> Date: Wed, 11 Aug 2021 02:21:41 +0800 Subject: [PATCH 91/96] Add segformer official pretrain weights usage. 
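A hedged sketch of the intended workflow for the converter added above (the destination filename `mit_b0_converted.pth` is illustrative; the readme below converts in place):

```python
# Shell usage (paths are examples):
#   python tools/model_converters/mit_convert.py pretrain/mit_b0.pth \
#       pretrain/mit_b0_converted.pth
import torch

state_dict = torch.load('pretrain/mit_b0_converted.pth', map_location='cpu')
# After conversion the keys follow the mmseg naming scheme, e.g. the first
# patch embedding ('patch_embed1.proj.*' in the official weights) becomes
# 'layers.0.0.projection.weight'.
assert 'layers.0.0.projection.weight' in state_dict
```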
---
 configs/segformer/readme.md           | 10 +++++++++-
 mmseg/models/backbones/mit.py         |  2 --
 tools/model_converters/mit_convert.py |  2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/configs/segformer/readme.md b/configs/segformer/readme.md
index 7c124c3661..cf2fece512 100644
--- a/configs/segformer/readme.md
+++ b/configs/segformer/readme.md
@@ -62,4 +62,12 @@ test_pipeline = [
 ]
 ```
 
-Because MixVisionTransformer is not supported by mmcls, we can only get backbone weights from the official repo. We provide a convert script, `tools/model_converters/mit_convert.py`, to convert the official weights. After converting, you need to modify `pretrained` of the model config.
+## How to use segformer official pretrain weights
+
+We convert the backbone weights from the official repo (https://github.com/NVlabs/SegFormer) with `tools/model_converters/mit_convert.py`.
+
+You may follow the steps below to prepare segformer training:
+
+1. Download the segformer pretrain weights (we suggest putting them in `pretrain/`);
+2. Run the convert script to convert the official pretrain weights: `python tools/model_converters/mit_convert.py pretrain/mit_b0.pth pretrain/mit_b0.pth`;
+3. Modify `pretrained` of the segformer model config, for example, set `pretrained` of `segformer_mit-b0_512x512_160k_ade20k.py` to `pretrain/mit_b0.pth`.
diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py
index 6e89437722..9d41ea58c1 100644
--- a/mmseg/models/backbones/mit.py
+++ b/mmseg/models/backbones/mit.py
@@ -393,8 +393,6 @@ def init_weights(self):
                 self.pretrained, logger=logger, map_location='cpu')
             if 'state_dict' in checkpoint:
                 state_dict = checkpoint['state_dict']
-            elif 'model' in checkpoint:
-                state_dict = checkpoint['model']
             else:
                 state_dict = checkpoint
 
diff --git a/tools/model_converters/mit_convert.py b/tools/model_converters/mit_convert.py
index 04a943313d..39b84254cd 100644
--- a/tools/model_converters/mit_convert.py
+++ b/tools/model_converters/mit_convert.py
@@ -69,7 +69,7 @@ def parse_args():
     src_path = args.src
     dst_path = args.dst
 
-    ckpt = torch.load(src_path)
+    ckpt = torch.load(src_path, map_location='cpu')
     if 'state_dict' in ckpt:
         ckpt = ckpt['state_dict']
     elif 'model' in ckpt:

From d650222dcb7970b7122a2ecc0b50d23cb99b6cb1 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Wed, 11 Aug 2021 02:56:22 +0800
Subject: [PATCH 92/96] Clean redundant code.

---
 tools/model_converters/mit_convert.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tools/model_converters/mit_convert.py b/tools/model_converters/mit_convert.py
index 39b84254cd..bc41f387d8 100644
--- a/tools/model_converters/mit_convert.py
+++ b/tools/model_converters/mit_convert.py
@@ -70,12 +70,6 @@ def parse_args():
     dst_path = args.dst
 
     ckpt = torch.load(src_path, map_location='cpu')
-    if 'state_dict' in ckpt:
-        ckpt = ckpt['state_dict']
-    elif 'model' in ckpt:
-        ckpt = ckpt['model']
-    else:
-        ckpt = ckpt
 
     ckpt = mit_convert(ckpt)
     torch.save(ckpt, dst_path)

From 97a88b0b7abe7997e5f091154dc09fdfd530b7a7 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 12 Aug 2021 14:36:29 +0800
Subject: [PATCH 93/96] Remove redundant code.
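The deduplication in the diffs below relies on config inheritance: each per-model file lists the b0 config in `_base_` and only overrides the deltas (backbone depth/width and the pretrained path), so the shared optimizer and schedule settings are merged in at load time. A minimal sketch, run from the repo root (attribute names follow the configs in this series):

```python
from mmcv import Config

cfg = Config.fromfile(
    'configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py')
print(cfg.model['pretrained'])  # 'pretrain/mit_b1.pth', from the delta
print(cfg.optimizer['type'])    # 'AdamW', inherited from the b0 base config
```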
--- .../segformer_mit-b1_512x512_160k_ade20k.py | 33 +---------- .../segformer_mit-b2_512x512_160k_ade20k.py | 33 +---------- .../segformer_mit-b3_512x512_160k_ade20k.py | 33 +---------- .../segformer_mit-b4_512x512_160k_ade20k.py | 33 +---------- .../segformer_mit-b5_512x512_160k_ade20k.py | 34 +---------- .../segformer_mit-b5_640x640_160k_ade20k.py | 59 ++----------------- tests/test_data/test_transform.py | 2 +- 7 files changed, 17 insertions(+), 210 deletions(-) diff --git a/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py index 244af5ea54..5fce602144 100644 --- a/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py @@ -1,37 +1,8 @@ -_base_ = [ - '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] +_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] # model settings model = dict( pretrained='pretrain/mit_b1.pth', backbone=dict( embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[2, 2, 2, 2]), - decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) - })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py index 1f2c07453b..afb24b0170 100644 --- a/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py @@ -1,37 +1,8 @@ -_base_ = [ - '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] +_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] # model settings model = dict( pretrained='pretrain/mit_b2.pth', backbone=dict( embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 4, 6, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) 
- })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py index 70ece5eb13..52348f6fcc 100644 --- a/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py @@ -1,37 +1,8 @@ -_base_ = [ - '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] +_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] # model settings model = dict( pretrained='pretrain/mit_b3.pth', backbone=dict( embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 4, 18, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) - })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py index 0ec383f052..7b50b75608 100644 --- a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py @@ -1,37 +1,8 @@ -_base_ = [ - '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] +_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] # model settings model = dict( pretrained='pretrain/mit_b4.pth', backbone=dict( embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 8, 27, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) 
- })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -data = dict(samples_per_gpu=2) + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py index e63dfef290..5212fb1f6a 100644 --- a/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py @@ -1,38 +1,8 @@ -_base_ = [ - '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] +_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] # model settings model = dict( pretrained='pretrain/mit_b5.pth', backbone=dict( embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 6, 40, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) - -# optimizer -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) - })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -# By default, models are trained on 8 GPUs with 2 images per GPU -data = dict(samples_per_gpu=2) + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py index f2b6d22191..d21774c4d6 100644 --- a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py +++ b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py @@ -1,11 +1,6 @@ -_base_ = [ - '../_base_/models/segformer_mit-b0.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' -] +_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] # dataset settings -dataset_type = 'ADE20KDataset' -data_root = 'data/ade/ADEChallengeData2016' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) crop_size = (640, 640) @@ -26,11 +21,10 @@ dict( type='MultiScaleFlipAug', img_scale=(2048, 640), - # img_ratios=[0.5640.75, 1.0, 1.25, 1.5, 1.75], + # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), - dict(type='ResizeToMultiple', size_divisor=32), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), @@ -38,54 +32,13 @@ ]) ] data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/training', - ann_dir='annotations/training', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline), - test=dict( - type=dataset_type, - data_root=data_root, - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline)) + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) # model settings model = dict( pretrained='pretrain/mit_b5.pth', backbone=dict( embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 6, 40, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150)) - -# 
optimizer
-optimizer = dict(
-    _delete_=True,
-    type='AdamW',
-    lr=0.00006,
-    betas=(0.9, 0.999),
-    weight_decay=0.01,
-    paramwise_cfg=dict(
-        custom_keys={
-            'pos_block': dict(decay_mult=0.),
-            'norm': dict(decay_mult=0.),
-            'head': dict(lr_mult=10.)
-        }))
-
-lr_config = dict(
-    _delete_=True,
-    policy='poly',
-    warmup='linear',
-    warmup_iters=1500,
-    warmup_ratio=1e-6,
-    power=1.0,
-    min_lr=0.0,
-    by_epoch=False)
+    decode_head=dict(in_channels=[64, 128, 320, 512]))
diff --git a/tests/test_data/test_transform.py b/tests/test_data/test_transform.py
index 658606a822..33ed4ecb14 100644
--- a/tests/test_data/test_transform.py
+++ b/tests/test_data/test_transform.py
@@ -10,7 +10,7 @@
 from mmseg.datasets.builder import PIPELINES
 
 
-def test_align():
+def test_resize_to_multiple():
     transform = dict(type='ResizeToMultiple', size_divisor=32)
     transform = build_from_cfg(transform, PIPELINES)
 

From 4cbfb9fa760cb89cf7e2eb558480ed56b6f8d574 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 12 Aug 2021 14:39:47 +0800
Subject: [PATCH 94/96] Unified format.

---
 mmseg/models/decode_heads/segformer_head.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmseg/models/decode_heads/segformer_head.py b/mmseg/models/decode_heads/segformer_head.py
index 0919d145ac..9ae1ff69d8 100644
--- a/mmseg/models/decode_heads/segformer_head.py
+++ b/mmseg/models/decode_heads/segformer_head.py
@@ -12,7 +12,7 @@ class SegformerHead(BaseDecodeHead):
     """The all mlp Head of segformer.
 
     This head is the implementation of
-    `Segformer <https://arxiv.org/abs/2105.15203>`
+    `Segformer <https://arxiv.org/abs/2105.15203>`_.
 
     Args:
         interpolate_mode: The interpolate mode of MLP head upsample operation.

From d1814c168a37096e0e7e6ab873bb1ed47b10c343 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 12 Aug 2021 14:40:28 +0800
Subject: [PATCH 95/96] Add description for segformer converter.

---
 tools/model_converters/mit_convert.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/model_converters/mit_convert.py b/tools/model_converters/mit_convert.py
index bc41f387d8..c914c4edba 100644
--- a/tools/model_converters/mit_convert.py
+++ b/tools/model_converters/mit_convert.py
@@ -54,7 +54,8 @@ def mit_convert(ckpt):
 
 
 def parse_args():
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(
+        'Convert official segformer backbone weights to mmseg style.')
     parser.add_argument(
         'src', help='Source path of official segformer backbone weights.')
     parser.add_argument(

From 920feba1b81f8f3e3a668e1054d63c8cffb52d06 Mon Sep 17 00:00:00 2001
From: sennnnn <201730271412@mail.scut.edu.cn>
Date: Thu, 12 Aug 2021 15:33:07 +0800
Subject: [PATCH 96/96] Update workers.

---
 configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py
index d374a4f8a2..03065a7940 100644
--- a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py
+++ b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py
@@ -30,4 +30,4 @@
     min_lr=0.0,
     by_epoch=False)
 
-data = dict(samples_per_gpu=2)
+data = dict(samples_per_gpu=2, workers_per_gpu=2)
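As a closing note on the `workers_per_gpu` tweak above: in mmseg each GPU builds its own dataloader, so the two settings map roughly onto the standard torch arguments (the exact wiring lives in `mmseg.datasets.build_dataloader` and may differ between versions). A minimal stand-in sketch:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

samples_per_gpu, workers_per_gpu = 2, 2
dataset = TensorDataset(torch.zeros(8, 3, 512, 512))  # stand-in for ADE20K

# Per-GPU batch size and prefetch workers; with 8 GPUs the effective
# batch size is 8 * samples_per_gpu = 16.
loader = DataLoader(
    dataset, batch_size=samples_per_gpu, num_workers=workers_per_gpu)
```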