diff --git a/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py b/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py
new file mode 100644
index 0000000000..3dfa6726e8
--- /dev/null
+++ b/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py
@@ -0,0 +1,123 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dCSN',
+        pretrained2d=False,
+        pretrained=  # noqa: E251
+        'https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth',  # noqa: E501
+        depth=152,
+        with_pool2=False,
+        bottleneck_mode='ir',
+        norm_eval=True,
+        bn_frozen=True,
+        zero_init_residual=False),
+    cls_head=dict(
+        type='I3DHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        dropout_ratio=0.5,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
+    dict(type='FrameSelector'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=32,
+        frame_interval=2,
+        num_clips=1,
+        test_mode=True),
+    dict(type='FrameSelector'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=32,
+        frame_interval=2,
+        num_clips=10,
+        test_mode=True),
+    dict(type='FrameSelector'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=3,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.0005, momentum=0.9, weight_decay=0.0001)  # lr for 32 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    step=[32, 48],
+    warmup='linear',
+    warmup_ratio=0.1,
+    warmup_by_epoch=True,
+    warmup_iters=16)
+total_epochs = 58
+checkpoint_config = dict(interval=2)
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[dict(type='TextLoggerHook'),
+           dict(type='TensorboardLoggerHook')])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb'  # noqa: E501
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+find_unused_parameters = True
diff --git a/configs/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py b/configs/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py
new file mode 100644
index 0000000000..3eb832dd49
--- /dev/null
+++ b/configs/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py
@@ -0,0 +1,121 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dCSN',
+        pretrained2d=False,
+        pretrained=  # noqa: E251
+        'https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth',  # noqa: E501
+        depth=152,
+        with_pool2=False,
+        bottleneck_mode='ir',
+        norm_eval=False,
+        zero_init_residual=False),
+    cls_head=dict(
+        type='I3DHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        dropout_ratio=0.5,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
+    dict(type='FrameSelector'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=32,
+        frame_interval=2,
+        num_clips=1,
+        test_mode=True),
+    dict(type='FrameSelector'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=32,
+        frame_interval=2,
+        num_clips=10,
+        test_mode=True),
+    dict(type='FrameSelector'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=3,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.0005, momentum=0.9, weight_decay=0.0001)  # lr for 32 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    step=[32, 48],
+    warmup='linear',
+    warmup_ratio=0.1,
+    warmup_by_epoch=True,
+    warmup_iters=16)
+total_epochs = 58
+checkpoint_config = dict(interval=2)
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[dict(type='TextLoggerHook'),
+           dict(type='TensorboardLoggerHook')])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/mmaction/models/__init__.py b/mmaction/models/__init__.py
index 1b4be9e983..5726187f56 100644
--- a/mmaction/models/__init__.py
+++ b/mmaction/models/__init__.py
@@ -1,5 +1,5 @@
-from .backbones import (ResNet, ResNet2Plus1d, ResNet3d, ResNet3dSlowFast,
-                        ResNet3dSlowOnly, ResNetTSM)
+from .backbones import (ResNet, ResNet2Plus1d, ResNet3d, ResNet3dCSN,
+                        ResNet3dSlowFast, ResNet3dSlowOnly, ResNetTSM)
 from .builder import (build_backbone, build_head, build_localizer, build_model,
                       build_recognizer)
 from .common import Conv2plus1d
@@ -18,5 +18,5 @@
     'ResNet3dSlowFast', 'SlowFastHead', 'Conv2plus1d', 'ResNet3dSlowOnly',
     'BCELossWithLogits', 'LOCALIZERS', 'build_localizer', 'PEM', 'TEM',
     'BinaryLogisticRegressionLoss', 'BMN', 'BMNLoss', 'build_model',
-    'OHEMHingeLoss', 'SSNLoss'
+    'OHEMHingeLoss', 'SSNLoss', 'ResNet3dCSN'
 ]
diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py
index cf703d8dda..1e27f2ee6c 100644
--- a/mmaction/models/backbones/__init__.py
+++ b/mmaction/models/backbones/__init__.py
@@ -1,11 +1,12 @@
 from .resnet import ResNet
 from .resnet2plus1d import ResNet2Plus1d
 from .resnet3d import ResNet3d
+from .resnet3d_csn import ResNet3dCSN
 from .resnet3d_slowfast import ResNet3dSlowFast
 from .resnet3d_slowonly import ResNet3dSlowOnly
 from .resnet_tsm import ResNetTSM
 
 __all__ = [
     'ResNet', 'ResNet3d', 'ResNetTSM', 'ResNet2Plus1d', 'ResNet3dSlowFast',
-    'ResNet3dSlowOnly'
+    'ResNet3dSlowOnly', 'ResNet3dCSN'
 ]
diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py
index 682c140a77..1d95a95df1 100644
--- a/mmaction/models/backbones/resnet3d.py
+++ b/mmaction/models/backbones/resnet3d.py
@@ -368,8 +368,10 @@ class ResNet3d(nn.Module):
         non_local (Sequence[int]): Determine whether to apply non-local module
             in the corresponding block of each stages. Default: (0, 0, 0, 0).
         non_local_cfg (dict): Config for non-local module. Default: ``dict()``.
-        zero_init_residual (bool): Whether to use zero initialization for
-            residual block, Default: True.
+        zero_init_residual (bool):
+            Whether to use zero initialization for residual block,
+            Default: True.
+        kwargs (dict, optional): Keyword arguments for "make_res_layer".
""" arch_settings = { @@ -405,7 +407,8 @@ def __init__(self, with_cp=False, non_local=(0, 0, 0, 0), non_local_cfg=dict(), - zero_init_residual=True): + zero_init_residual=True, + **kwargs): super().__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') @@ -467,7 +470,8 @@ def __init__(self, non_local_cfg=self.non_local_cfg, inflate=self.stage_inflations[i], inflate_style=self.inflate_style, - with_cp=with_cp) + with_cp=with_cp, + **kwargs) self.inplanes = planes * self.block.expansion layer_name = f'layer{i + 1}' self.add_module(layer_name, res_layer) @@ -492,7 +496,8 @@ def make_res_layer(self, norm_cfg=None, act_cfg=None, conv_cfg=None, - with_cp=False): + with_cp=False, + **kwargs): """Build residual layer for ResNet3D. Args: @@ -565,7 +570,8 @@ def make_res_layer(self, norm_cfg=norm_cfg, conv_cfg=conv_cfg, act_cfg=act_cfg, - with_cp=with_cp)) + with_cp=with_cp, + **kwargs)) inplanes = planes * block.expansion for i in range(1, blocks): layers.append( @@ -583,7 +589,8 @@ def make_res_layer(self, norm_cfg=norm_cfg, conv_cfg=conv_cfg, act_cfg=act_cfg, - with_cp=with_cp)) + with_cp=with_cp, + **kwargs)) return nn.Sequential(*layers) diff --git a/mmaction/models/backbones/resnet3d_csn.py b/mmaction/models/backbones/resnet3d_csn.py new file mode 100644 index 0000000000..d34683f404 --- /dev/null +++ b/mmaction/models/backbones/resnet3d_csn.py @@ -0,0 +1,148 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.utils import _BatchNorm + +from ..registry import BACKBONES +from .resnet3d import Bottleneck3d, ResNet3d + + +class CSNBottleneck3d(Bottleneck3d): + """Channel-Separated Bottleneck Block. + + This module is proposed in + "Video Classification with Channel-Separated Convolutional Networks" + Link: https://arxiv.org/pdf/1711.11248.pdf + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + bottleneck_mode (str): Determine which ways to factorize a 3D + bottleneck block using channel-separated convolutional networks. + If set to 'ip', it will replace the 3x3x3 conv2 layer with a + 1x1x1 traditional convolution and a 3x3x3 depthwise + convolution, i.e., Interaction-preserved channel-separated + bottleneck block. + If set to 'ir', it will replace the 3x3x3 conv2 layer with a + 3x3x3 depthwise convolution, which is derived from preserved + bottleneck block by removing the extra 1x1x1 convolution, + i.e., Interaction-reduced channel-separated bottleneck block. + Default: 'ir'. + args (position arguments): Position arguments for Bottleneck. + kwargs (dict, optional): Keyword arguments for Bottleneck. 
+ """ + + def __init__(self, + inplanes, + planes, + *args, + bottleneck_mode='ir', + **kwargs): + super(CSNBottleneck3d, self).__init__(inplanes, planes, *args, + **kwargs) + self.bottleneck_mode = bottleneck_mode + conv2 = [] + if self.bottleneck_mode == 'ip': + conv2.append( + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False)) + conv2_kernel_size = self.conv2.conv.kernel_size + conv2_stride = self.conv2.conv.stride + conv2_padding = self.conv2.conv.padding + conv2_dilation = self.conv2.conv.dilation + conv2_bias = True if self.conv2.conv.bias else False + self.conv2 = ConvModule( + planes, + planes, + conv2_kernel_size, + stride=conv2_stride, + padding=conv2_padding, + dilation=conv2_dilation, + bias=conv2_bias, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + groups=planes) + conv2.append(self.conv2) + self.conv2 = nn.Sequential(*conv2) + + +@BACKBONES.register_module() +class ResNet3dCSN(ResNet3d): + """ResNet backbone for CSN. + + Args: + depth (int): Depth of ResNetCSN, from {18, 34, 50, 101, 152}. + pretrained (str | None): Name of pretrained model. + temporal_strides (tuple[int]): + Temporal strides of residual blocks of each stage. + Default: (1, 2, 2, 2). + conv1_kernel (tuple[int]): Kernel size of the first conv layer. + Default: (3, 7, 7). + conv1_stride_t (int): Temporal stride of the first conv layer. + Default: 1. + pool1_stride_t (int): Temporal stride of the first pooling layer. + Default: 1. + norm_cfg (dict): Config for norm layers. required keys are `type` and + `requires_grad`. + Default: dict(type='BN3d', requires_grad=True, eps=1e-3). + inflate_style (str): `3x1x1` or `1x1x1`. which determines the kernel + sizes and padding strides for conv1 and conv2 in each block. + Default: '3x3x3'. + bottleneck_mode (str): Determine which ways to factorize a 3D + bottleneck block using channel-separated convolutional networks. + If set to 'ip', it will replace the 3x3x3 conv2 layer with a + 1x1x1 traditional convolution and a 3x3x3 depthwise + convolution, i.e., Interaction-preserved channel-separated + bottleneck block. + If set to 'ir', it will replace the 3x3x3 conv2 layer with a + 3x3x3 depthwise convolution, which is derived from preserved + bottleneck block by removing the extra 1x1x1 convolution, + i.e., Interaction-reduced channel-separated bottleneck block. + Default: 'ip'. + kwargs (dict, optional): Key arguments for "make_res_layer". 
+ """ + + def __init__(self, + depth, + pretrained, + temporal_strides=(1, 2, 2, 2), + conv1_kernel=(3, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + norm_cfg=dict(type='BN3d', requires_grad=True, eps=1e-3), + inflate_style='3x3x3', + bottleneck_mode='ir', + bn_frozen=False, + **kwargs): + self.arch_settings = { + # 18: (BasicBlock3d, (2, 2, 2, 2)), + # 34: (BasicBlock3d, (3, 4, 6, 3)), + 50: (CSNBottleneck3d, (3, 4, 6, 3)), + 101: (CSNBottleneck3d, (3, 4, 23, 3)), + 152: (CSNBottleneck3d, (3, 8, 36, 3)) + } + self.bn_frozen = bn_frozen + if bottleneck_mode not in ['ip', 'ir']: + raise ValueError(f'Bottleneck mode must be "ip" or "ir",' + f'but got {bottleneck_mode}.') + super(ResNet3dCSN, self).__init__( + depth, + pretrained, + temporal_strides=temporal_strides, + conv1_kernel=conv1_kernel, + conv1_stride_t=conv1_stride_t, + pool1_stride_t=pool1_stride_t, + norm_cfg=norm_cfg, + inflate_style=inflate_style, + bottleneck_mode=bottleneck_mode, + **kwargs) + + def train(self, mode=True): + super(ResNet3d, self).train() + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + if self.bn_frozen: + for param in m.parameters(): + param.requires_grad = False diff --git a/tests/test_models/test_backbone.py b/tests/test_models/test_backbone.py index 79f835c06d..6946db23b9 100644 --- a/tests/test_models/test_backbone.py +++ b/tests/test_models/test_backbone.py @@ -6,8 +6,8 @@ import torch.nn as nn from mmcv.utils import _BatchNorm -from mmaction.models import (ResNet, ResNet2Plus1d, ResNet3d, ResNet3dSlowFast, - ResNet3dSlowOnly, ResNetTSM) +from mmaction.models import (ResNet, ResNet2Plus1d, ResNet3d, ResNet3dCSN, + ResNet3dSlowFast, ResNet3dSlowOnly, ResNetTSM) from mmaction.models.backbones.resnet_tsm import NL3DWrapper @@ -736,6 +736,68 @@ def test_slowonly_backbone(): assert feat.shape == torch.Size([1, 2048, 8, 2, 2]) +def test_resnet_csn_backbone(): + """Test resnet_csn backbone.""" + with pytest.raises(ValueError): + # Bottleneck mode must be "ip" or "ir" + ResNet3dCSN(152, None, bottleneck_mode='id') + + input_shape = (2, 3, 6, 64, 64) + imgs = _demo_inputs(input_shape) + + resnet3d_csn_frozen = ResNet3dCSN( + 152, None, bn_frozen=True, norm_eval=True) + resnet3d_csn_frozen.train() + for m in resnet3d_csn_frozen.modules(): + if isinstance(m, _BatchNorm): + for param in m.parameters(): + assert param.requires_grad is False + + # Interaction-preserved channel-separated bottleneck block + resnet3d_csn_ip = ResNet3dCSN(152, None, bottleneck_mode='ip') + resnet3d_csn_ip.init_weights() + resnet3d_csn_ip.train() + for i, layer_name in enumerate(resnet3d_csn_ip.res_layers): + layers = getattr(resnet3d_csn_ip, layer_name) + num_blocks = resnet3d_csn_ip.stage_blocks[i] + assert len(layers) == num_blocks + for layer in layers: + assert isinstance(layer.conv2, nn.Sequential) + assert len(layer.conv2) == 2 + assert layer.conv2[1].groups == layer.planes + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + resnet3d_csn_ip = resnet3d_csn_ip.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_csn_ip(imgs_gpu) + assert feat.shape == torch.Size([2, 2048, 1, 2, 2]) + else: + feat = resnet3d_csn_ip(imgs) + assert feat.shape == torch.Size([2, 2048, 1, 2, 2]) + + # Interaction-reduced channel-separated bottleneck block + resnet3d_csn_ir = ResNet3dCSN(152, None, bottleneck_mode='ir') + resnet3d_csn_ir.init_weights() + resnet3d_csn_ir.train() + for i, layer_name in enumerate(resnet3d_csn_ir.res_layers): + layers = 
getattr(resnet3d_csn_ir, layer_name) + num_blocks = resnet3d_csn_ir.stage_blocks[i] + assert len(layers) == num_blocks + for layer in layers: + assert isinstance(layer.conv2, nn.Sequential) + assert len(layer.conv2) == 1 + assert layer.conv2[0].groups == layer.planes + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + resnet3d_csn_ir = resnet3d_csn_ir.cuda() + imgs_gpu = imgs.cuda() + feat = resnet3d_csn_ir(imgs_gpu) + assert feat.shape == torch.Size([2, 2048, 1, 2, 2]) + else: + feat = resnet3d_csn_ir(imgs) + assert feat.shape == torch.Size([2, 2048, 1, 2, 2]) + + def _demo_inputs(input_shape=(1, 3, 64, 64)): """Create a superset of inputs needed to run backbone. diff --git a/tests/test_models/test_recognizers.py b/tests/test_models/test_recognizers.py index 96a730745e..8ce86d6ec3 100644 --- a/tests/test_models/test_recognizers.py +++ b/tests/test_models/test_recognizers.py @@ -242,6 +242,46 @@ def test_tsm(): recognizer(one_img, None, return_loss=False) +def test_csn(): + model, train_cfg, test_cfg = _get_recognizer_cfg( + 'csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py') + model['backbone']['pretrained2d'] = False + model['backbone']['pretrained'] = None + + recognizer = build_recognizer( + model, train_cfg=train_cfg, test_cfg=test_cfg) + + input_shape = (1, 3, 3, 8, 32, 32) + demo_inputs = generate_demo_inputs(input_shape, '3D') + + imgs = demo_inputs['imgs'] + gt_labels = demo_inputs['gt_labels'] + + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + recognizer = recognizer.cuda() + imgs = imgs.cuda() + gt_labels = gt_labels.cuda() + losses = recognizer(imgs, gt_labels) + assert isinstance(losses, dict) + + # Test forward test + with torch.no_grad(): + img_list = [img[None, :] for img in imgs] + for one_img in img_list: + recognizer(one_img, None, return_loss=False) + else: + losses = recognizer(imgs, gt_labels) + assert isinstance(losses, dict) + + # Test forward test + with torch.no_grad(): + img_list = [img[None, :] for img in imgs] + for one_img in img_list: + recognizer(one_img, None, return_loss=False) + + def generate_demo_inputs(input_shape=(1, 3, 3, 224, 224), model_type='2D'): """Create a superset of inputs needed to run test or train batches.
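Note on the `lr_config` shared by both configs: with `warmup_by_epoch=True`, `warmup_iters=16` counts epochs, so the rate ramps linearly from `warmup_ratio * lr` (5e-05) up to `lr` (0.0005) over the first 16 epochs, and `policy='step'` then decays it tenfold at epochs 32 and 48 of the 58 total. A minimal sketch of that curve, assuming mmcv's linear-warmup formula; the helper name `lr_at_epoch` is ours, for illustration only:

def lr_at_epoch(epoch,
                base_lr=0.0005,
                warmup_iters=16,
                warmup_ratio=0.1,
                steps=(32, 48),
                gamma=0.1):
    """Approximate per-epoch lr for the step policy with linear warmup."""
    # step decay: multiply by gamma once per milestone already passed
    regular_lr = base_lr * gamma**sum(epoch >= s for s in steps)
    if epoch >= warmup_iters:
        return regular_lr
    # mmcv linear warmup: lr * (1 - (1 - progress) * (1 - warmup_ratio))
    k = (1 - epoch / warmup_iters) * (1 - warmup_ratio)
    return regular_lr * (1 - k)

for e in (0, 8, 16, 32, 48, 57):
    print(e, lr_at_epoch(e))  # 5e-05 at epoch 0, 5e-06 from epoch 48 on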
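Note on `bottleneck_mode` in `CSNBottleneck3d` above: both variants swap the dense 3x3x3 `conv2` for a depthwise one (`groups == channels`); 'ip' keeps an extra 1x1x1 pointwise convolution in front of it to preserve channel interactions, while 'ir' drops it. A self-contained sketch of just this factorization in plain PyTorch; `make_csn_conv2` is a hypothetical helper for illustration, and the real block builds the depthwise part with mmcv's `ConvModule`, so it also carries BN and ReLU:

import torch
import torch.nn as nn

def make_csn_conv2(planes, mode='ir'):
    # depthwise 3x3x3: each channel is convolved independently
    depthwise = nn.Conv3d(
        planes, planes, kernel_size=3, padding=1, groups=planes, bias=False)
    if mode == 'ip':  # interaction-preserved: pointwise 1x1x1 + depthwise
        pointwise = nn.Conv3d(planes, planes, kernel_size=1, bias=False)
        return nn.Sequential(pointwise, depthwise)
    if mode == 'ir':  # interaction-reduced: depthwise only
        return nn.Sequential(depthwise)
    raise ValueError(f'Bottleneck mode must be "ip" or "ir", but got {mode}.')

x = torch.randn(1, 64, 4, 8, 8)  # [N, C, T, H, W]
for mode in ('ip', 'ir'):
    conv2 = make_csn_conv2(64, mode)
    n_params = sum(p.numel() for p in conv2.parameters())
    # a dense 3x3x3 conv would hold 64 * 64 * 27 = 110592 weights; the
    # depthwise conv has 64 * 27 = 1728, plus 64 * 64 = 4096 pointwise in 'ip'
    print(mode, tuple(conv2(x).shape), n_params)  # ip: 5824, ir: 1728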
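Finally, a hedged usage sketch mirroring what `test_resnet_csn_backbone` asserts, assuming this PR's mmaction is importable; depth 50 and a small input are chosen only to keep the forward pass light, whereas the configs above train depth 152 from the IG-65M checkpoint:

import torch
from mmcv.utils import _BatchNorm

from mmaction.models import ResNet3dCSN

backbone = ResNet3dCSN(
    depth=50,
    pretrained=None,  # the configs point at the IG-65M checkpoint URL instead
    bottleneck_mode='ir',
    norm_eval=True,
    bn_frozen=True,
    zero_init_residual=False)
backbone.init_weights()
# the overridden train() keeps BN in eval mode and, with bn_frozen=True,
# also sets requires_grad=False on every BN weight and bias
backbone.train()
assert all(not p.requires_grad
           for m in backbone.modules() if isinstance(m, _BatchNorm)
           for p in m.parameters())

feat = backbone(torch.randn(1, 3, 8, 64, 64))  # [N, C, T, H, W]
print(feat.shape)  # expect torch.Size([1, 2048, 1, 2, 2]) as in the unit test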