From 1271914870c978bc330d2edde59eef99aa851b26 Mon Sep 17 00:00:00 2001 From: leexinhao <1520491933@qq.com> Date: Sun, 12 Feb 2023 11:08:09 +0800 Subject: [PATCH 1/6] Add flip_label_map for Flip in some configs --- ...n-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py | 4 +++- .../tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py | 3 ++- .../tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py | 3 ++- .../tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py | 3 ++- .../tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py | 3 ++- 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py index d833687d6a..b614d725f7 100644 --- a/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py +++ b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py @@ -8,12 +8,14 @@ ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} train_pipeline = [ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), dict(type='RawFrameDecode'), dict(type='RandomResizedCrop'), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), dict(type='ColorJitter'), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py index 691e39c2b2..cf7f2ae6d0 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py @@ -4,6 +4,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), @@ -17,7 +18,7 @@ max_wh_scale_gap=1, num_fixed_crops=13), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py index ba9c393593..8cd51ded5c 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py @@ -11,6 +11,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), @@ -24,7 +25,7 @@ max_wh_scale_gap=1, num_fixed_crops=13), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map),
dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py index 5797a6f596..15fde3ba79 100644 --- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py @@ -2,6 +2,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), @@ -15,7 +16,7 @@ max_wh_scale_gap=1, num_fixed_crops=13), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py index 39113ba5b3..a94f7b3b22 100644 --- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py @@ -14,6 +14,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), @@ -26,7 +27,7 @@ random_crop=False, max_wh_scale_gap=1), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] From ada12652ea6cf1d180f5267b7be321f6bb5edcf1 Mon Sep 17 00:00:00 2001 From: lilin Date: Tue, 21 Feb 2023 14:46:21 +0800 Subject: [PATCH 2/6] update readme --- configs/recognition/tpn/README.md | 2 +- configs/recognition/tpn/metafile.yml | 2 +- configs/recognition/tsn/README.md | 6 +++--- configs/recognition/tsn/metafile.yml | 10 +++++----- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/configs/recognition/tpn/README.md b/configs/recognition/tpn/README.md index 972dbcbc7b..20a488ccb1 100644 --- a/configs/recognition/tpn/README.md +++ b/configs/recognition/tpn/README.md @@ -29,7 +29,7 @@ Visual tempo characterizes the dynamics and the temporal scale of an action. 
Mod | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | | :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----------------: | :--------------: | :---------------------: | :--------: | :---------------: | :-------------: | :------------: | -| 1x1x8 | height 100 | 8x6 | ResNet50 | TSM | 48.98 | 78.91 | x | x | 8 clips x 3 crop | x | 8828 | [config](/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20220913-d2f5c300.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log) | +| 1x1x8 | height 100 | 8x6 | ResNet50 | TSM | 51.87 | 79.67 | x | x | 8 clips x 3 crop | x | 8828 | [config](/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log) | :::{note} diff --git a/configs/recognition/tpn/metafile.yml b/configs/recognition/tpn/metafile.yml index 702da581e0..ce7d9ebcec 100644 --- a/configs/recognition/tpn/metafile.yml +++ b/configs/recognition/tpn/metafile.yml @@ -70,4 +70,4 @@ Models: Top 5 Accuracy: 78.91 Task: Action Recognition Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20220913-d2f5c300.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth diff --git a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md index d34d1ab433..1b6e34fdc1 100644 --- a/configs/recognition/tsn/README.md +++ b/configs/recognition/tsn/README.md @@ -32,8 +32,8 @@ Deep convolutional networks have achieved great success for visual recognition i | frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | | :---------------------: | :-------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :------------------------------: | -----------------------------: | -----------------------------: | -| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 34.85 | 66.37 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py) |
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20221122-ad2dbb37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log) | -| 1x1x16 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 36.55 | 68.00 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20221122-ee13c8e2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log) | +| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 35.51 | 67.09 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log) | +| 1x1x16 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 36.91 | 68.77 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log) | ### Using backbones from 3rd-party in TSN @@ -49,7 +49,7 @@ It's possible and convenient to use a 3rd-party backbone for TSN under the frame | 1x1x3 | MultiStep | 224x224 | 8 | DenseNet161 | ImageNet | 72.07 | 90.15 | 25 clips x 10 crop | 194.6G | 27.36M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb_20220906-5f4c0daf.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.log) | | 1x1x3 | MultiStep | 224x224 | 8 | Swin Transformer | ImageNet | 77.03 | 92.61 | 25 clips x 10 crop | 386.7G | 87.15M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb_20220906-65ed814e.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.log) | -1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details. +1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details. 2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. diff --git a/configs/recognition/tsn/metafile.yml b/configs/recognition/tsn/metafile.yml index e618ed71cc..a46bd2e785 100644 --- a/configs/recognition/tsn/metafile.yml +++ b/configs/recognition/tsn/metafile.yml @@ -210,10 +210,10 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 34.85 - Top 5 Accuracy: 66.37 + Top 1 Accuracy: 35.51 + Top 5 Accuracy: 67.09 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20221122-ad2dbb37.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth - Name: tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py @@ -233,7 +233,7 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 36.55 - Top 5 Accuracy: 68.00 + Top 1 Accuracy: 36.91 + Top 5 Accuracy: 68.77 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20221122-ee13c8e2.pth From e4dae33f13feaaf50044e75170288f4e39c9b628 Mon Sep 17 00:00:00 2001 From: lilin Date: Thu, 16 Mar 2023 14:39:20 +0800 Subject: [PATCH 3/6] fix num_crops in test_pipeline of TSM --- .../tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py | 2 +- .../tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git
a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py index cf7f2ae6d0..36b1eefcf0 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py @@ -47,7 +47,7 @@ test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), - dict(type='TenCrop', crop_size=224), + dict(type='ThreeCrop', crop_size=256), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py index 8cd51ded5c..8248bcb02b 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py @@ -54,7 +54,7 @@ twice_sample=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), - dict(type='TenCrop', crop_size=224), + dict(type='ThreeCrop', crop_size=256), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] From f8d6c35d2e618a6977b2788809054eba67f9a3b6 Mon Sep 17 00:00:00 2001 From: lilin Date: Fri, 17 Mar 2023 17:41:07 +0800 Subject: [PATCH 4/6] update tsm --- configs/recognition/tsm/README.md | 4 ++-- configs/recognition/tsm/metafile.yml | 28 ++++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md index ca490117c3..d1bf0f17c8 100644 --- a/configs/recognition/tsm/README.md +++ b/configs/recognition/tsm/README.md @@ -34,8 +34,8 @@ The explosive growth in video streaming gives rise to challenges on performing v | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | | :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 60.20 | 86.13 | 8 clips x 10 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20221122-446d261a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | -| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 62.46 | 87.75 | 16 clips x 10 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20221122-b1fb8264.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 62.72 | 87.70 | 8 clips x 10 crop | 32.88G | 
23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | +| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 64.16 | 88.61 | 16 clips x 10 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) | | 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 60.49 | 85.99 | 8 clips x 10 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20221122-cb2cc64e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 
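For context on the re-measured numbers above: patch 3 replaces the `TenCrop` test-time augmentation with a `ThreeCrop` at 256x256, so the SthV2 results in this patch are evaluated under the new protocol (patch 5 later updates the testing-protocol column accordingly). As a minimal sketch, the resulting `test_pipeline` of the 1x1x16 SthV2 TSM config would look roughly like the block below — only the `ThreeCrop` line is taken verbatim from patch 3; the `DecordInit`/`SampleFrames` arguments are assumptions mirroring the train pipeline in patch 1, since the unchanged lines of the file are not shown in the hunk.

```python
# Sketch of the post-change test_pipeline for
# tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py.
# DecordInit/SampleFrames arguments are assumed; ThreeCrop is from patch 3.
file_client_args = dict(io_backend='disk')

test_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=16,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    # Was TenCrop(crop_size=224): ten 224x224 views per clip.
    # Now three 256x256 crops, i.e. the "x 3 crop" testing protocol.
    dict(type='ThreeCrop', crop_size=256),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='PackActionInputs')
]
```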
diff --git a/configs/recognition/tsm/metafile.yml b/configs/recognition/tsm/metafile.yml index 5adafb069f..0c5e031c9d 100644 --- a/configs/recognition/tsm/metafile.yml +++ b/configs/recognition/tsm/metafile.yml @@ -178,17 +178,17 @@ Models: Parameters: 23.87M Pretrained: ImageNet Resolution: 224x224 - Training Data: Kinetics-400 + Training Data: SthV2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: Kinetics-400 + - Dataset: SthV2 Task: Action Recognition Metrics: - Top 1 Accuracy: 60.20 - Top 5 Accuracy: 86.13 + Top 1 Accuracy: 62.72 + Top 5 Accuracy: 87.70 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20221122-446d261a.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth - Name: tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py @@ -196,22 +196,22 @@ Models: Metadata: Architecture: ResNet50 Batch Size: 16 - Epochs: 100 + Epochs: 50 FLOPs: 65.75G Parameters: 23.87M Pretrained: ImageNet Resolution: 224x224 - Training Data: Kinetics-400 + Training Data: SthV2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: Kinetics-400 + - Dataset: SthV2 Task: Action Recognition Metrics: - Top 1 Accuracy: 62.46 - Top 5 Accuracy: 87.75 + Top 1 Accuracy: 64.16 + Top 5 Accuracy: 88.61 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20221122-b1fb8264.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth - Name: tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb Config: configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py @@ -219,16 +219,16 @@ Models: Metadata: Architecture: ResNet101 Batch Size: 16 - Epochs: 100 + Epochs: 50 FLOPs: 62.66G Parameters: 42.86M Pretrained: ImageNet Resolution: 224x224 - Training Data: Kinetics-400 + Training Data: SthV2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: Kinetics-400 + - Dataset: SthV2 Task: Action Recognition Metrics: Top 1 Accuracy: 60.49 From e9686669e8939f2337590a5e45ca98986fca20b2 Mon Sep 17 00:00:00 2001 From: lilin Date: Mon, 20 Mar 2023 11:44:52 +0800 Subject: [PATCH 5/6] update tsm-r101 --- configs/recognition/tsm/README.md | 10 +++++----- configs/recognition/tsm/metafile.yml | 6 +++--- ...agenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md index d1bf0f17c8..5e5162de83 100644 --- a/configs/recognition/tsm/README.md +++ b/configs/recognition/tsm/README.md @@ -32,11 +32,11 @@ The explosive growth in video streaming gives rise 
to challenges on performing v ### Something-something V2 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | -| :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 62.72 | 87.70 | 8 clips x 10 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | -| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 64.16 | 88.61 | 16 clips x 10 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) | -| 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 60.49 | 85.99 | 8 clips x 10 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20221122-cb2cc64e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 62.72 | 87.70 | 8 clips x 3 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | +| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 64.16 | 88.61 | 16 clips x 3 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 63.70 | 88.28 | 8 clips x 3 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. 
diff --git a/configs/recognition/tsm/metafile.yml b/configs/recognition/tsm/metafile.yml index 0c5e031c9d..64d37461d4 100644 --- a/configs/recognition/tsm/metafile.yml +++ b/configs/recognition/tsm/metafile.yml @@ -231,7 +231,7 @@ Models: - Dataset: SthV2 Task: Action Recognition Metrics: - Top 1 Accuracy: 60.49 - Top 5 Accuracy: 85.99 + Top 1 Accuracy: 63.70 + Top 5 Accuracy: 88.28 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20221122-cb2cc64e.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py index 9429730700..7cb4b48ac7 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py @@ -1,6 +1,6 @@ _base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py'] # model settings -r101_checkpoint = 'https://download.pytorch.org/models/resnet101-cd907fc2.pth' +r101_checkpoint = 'torchvision://resnet101' model = dict(backbone=dict(pretrained=r101_checkpoint, depth=101)) From bdce1104219e6561c7934204baa01b07036497a3 Mon Sep 17 00:00:00 2001 From: lilin Date: Mon, 20 Mar 2023 11:48:31 +0800 Subject: [PATCH 6/6] fix metafile --- configs/recognition/tpn/metafile.yml | 4 ++-- configs/recognition/tsn/metafile.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/recognition/tpn/metafile.yml b/configs/recognition/tpn/metafile.yml index ce7d9ebcec..ce953f2e89 100644 --- a/configs/recognition/tpn/metafile.yml +++ b/configs/recognition/tpn/metafile.yml @@ -66,8 +66,8 @@ Models: Results: - Dataset: SthV1 Metrics: - Top 1 Accuracy: 48.98 - Top 5 Accuracy: 78.91 + Top 1 Accuracy: 51.87 + Top 5 Accuracy: 79.67 Task: Action Recognition Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth diff --git a/configs/recognition/tsn/metafile.yml b/configs/recognition/tsn/metafile.yml index a46bd2e785..37943e673b 100644 --- a/configs/recognition/tsn/metafile.yml +++ b/configs/recognition/tsn/metafile.yml @@ -236,4 +236,4 @@ Models: Top 1 Accuracy: 36.91 Top 5 Accuracy: 68.77 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20221122-ee13c8e2.pth + Weights:
https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth
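Patch 1 is the substantive training change in this series: Something-Something contains direction-sensitive classes (the mirrored pairs listed in `sthv1_flip_label_map` / `sthv2_flip_label_map`), so a horizontal flip must also swap the label, which is what the `flip_label_map` argument of `Flip` provides. As a rough, self-contained sketch of that idea — not MMAction2's actual `Flip` implementation, and `random_flip` is a hypothetical helper — the remapping amounts to:

```python
import random

import numpy as np

# Mirrored class pairs for SthV2, copied from the configs in patch 1.
sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166}


def random_flip(frames: np.ndarray, label: int, flip_ratio: float,
                flip_label_map: dict):
    """Flip a (T, H, W, C) clip horizontally with probability ``flip_ratio``.

    When the flip actually happens, a direction-sensitive label is swapped
    for its mirrored counterpart; all other labels are left untouched.
    """
    if random.random() < flip_ratio:
        frames = frames[:, :, ::-1, :]             # flip the width axis
        label = flip_label_map.get(label, label)   # e.g. 86 <-> 87
    return frames, label


# Tiny usage example on a dummy 8-frame clip.
clip = np.zeros((8, 224, 224, 3), dtype=np.uint8)
flipped, new_label = random_flip(clip, label=86, flip_ratio=0.5,
                                 flip_label_map=sthv2_flip_label_map)
```

Classes not listed in the map keep their original label even when the clip is flipped, so random flipping remains a valid augmentation for the rest of the dataset.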