[Feature] Support Kinetics-710 dataset #2534

Merged: 3 commits, Jun 19, 2023

6 changes: 6 additions & 0 deletions configs/recognition/slowonly/README.md
@@ -37,6 +37,12 @@ We present SlowFast networks for video recognition. Our model involves (i) a Slo
| 4x16x1 | Linear+MultiStep | 224x224 | 8x2 | ResNet50 | ImageNet | 65.52 | 86.39 | 10 clips x 3 crop | 27.38G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb_20221013-98b1b0a7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.log) |
| 8x8x1 | Linear+MultiStep | 224x224 | 8x2 | ResNet50 | ImageNet | 67.67 | 87.80 | 10 clips x 3 crop | 54.75G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.log) |

### Kinetics-710

| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
| :---------------------: | :--------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :----------------------------: | :--------------------------: | :--------------------------: |
| 8x8x1 | Linear+MultiStep | 224x224 | 8x4 | ResNet50 | ImageNet | 72.39 | 90.60 | 10 clips x 3 crop | 54.75G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb_20230612-12ce977c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.log) |

1. The **gpus** column indicates the number of GPUs we used to obtain the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this flag scales the learning rate according to the ratio between the actual batch size and the original batch size (see the sketch after this list).
2. The Kinetics-400 validation set we used consists of 19,796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
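
A minimal sketch of the linear scaling rule behind `--auto-scale-lr` (illustrative only: the real hook lives in MMEngine, and the `base_lr`/`base_batch_size` values below are hypothetical, not taken from this config):

```python
# Hypothetical illustration of linear LR scaling; values are examples only.
def scale_lr(base_lr: float, base_batch_size: int,
             num_gpus: int, videos_per_gpu: int) -> float:
    """Scale the LR by the ratio of the actual to the original batch size."""
    actual_batch_size = num_gpus * videos_per_gpu
    return base_lr * actual_batch_size / base_batch_size

# Halving the total batch size halves the effective learning rate:
print(scale_lr(0.01, base_batch_size=256, num_gpus=16, videos_per_gpu=8))  # 0.005
```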

23 changes: 23 additions & 0 deletions configs/recognition/slowonly/metafile.yml
@@ -214,3 +214,26 @@ Models:
Top 5 Accuracy: 87.47
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20220901-4098e1eb.pth

- Name: slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb
Config: configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py
In Collection: SlowOnly
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 150
FLOPs: 54.75G
Parameters: 32.45M
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-710
Training Resources: 32 GPUs
Modality: RGB
Results:
- Dataset: Kinetics-710
Task: Action Recognition
Metrics:
Top 1 Accuracy: 72.39
Top 5 Accuracy: 90.60
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb_20230612-12ce977c.pth
138 changes: 138 additions & 0 deletions configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py
@@ -0,0 +1,138 @@
_base_ = [('slowonly_imagenet-pretrained-r50_16xb16-'
'4x16x1-steplr-150e_kinetics700-rgb.py')]

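# Kinetics-710 merges the label spaces of Kinetics-400/600/700 into 710 unique
# classes, so relative to the base config only the classifier head size changes.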
model = dict(cls_head=dict(num_classes=710))

file_client_args = dict(io_backend='disk')

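# 8x8x1 sampling: each training clip is 8 frames taken at a stride of 8,
# one clip per video.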
train_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]

val_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]

test_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]

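# Kinetics-710 reuses the raw videos of K400/K600/K700 as-is; only the
# annotation lists under data/kinetics710/ are new, with labels mapped into
# the shared 710-class space.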
k400_data_root = 'data/kinetics400/videos_train'
k600_data_root = 'data/kinetics600/videos'
k700_data_root = 'data/kinetics700/videos'
k400_data_root_val = 'data/kinetics400/videos_val'
k600_data_root_val = k600_data_root
k700_data_root_val = k700_data_root

k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt'
k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt'
k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt'

k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt'
k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt'
k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt'

k400_trainset = dict(
type='VideoDataset',
ann_file=k400_ann_file_train,
data_prefix=dict(video=k400_data_root),
pipeline=train_pipeline)
k600_trainset = dict(
type='VideoDataset',
ann_file=k600_ann_file_train,
data_prefix=dict(video=k600_data_root),
pipeline=train_pipeline)
k700_trainset = dict(
type='VideoDataset',
ann_file=k700_ann_file_train,
data_prefix=dict(video=k700_data_root),
pipeline=train_pipeline)

k400_valset = dict(
type='VideoDataset',
ann_file=k400_ann_file_val,
data_prefix=dict(video=k400_data_root_val),
pipeline=val_pipeline,
test_mode=True)
k600_valset = dict(
type='VideoDataset',
ann_file=k600_ann_file_val,
data_prefix=dict(video=k600_data_root_val),
pipeline=val_pipeline,
test_mode=True)
k700_valset = dict(
type='VideoDataset',
ann_file=k700_ann_file_val,
data_prefix=dict(video=k700_data_root_val),
pipeline=val_pipeline,
test_mode=True)

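# Test sets reuse the val annotations; a shallow copy suffices because only
# the top-level 'pipeline' key is replaced below.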
k400_testset = k400_valset.copy()
k600_testset = k600_valset.copy()
k700_testset = k700_valset.copy()
k400_testset['pipeline'] = test_pipeline
k600_testset['pipeline'] = test_pipeline
k700_testset['pipeline'] = test_pipeline

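# Concatenate the three sources into a single Kinetics-710 dataset.
# `_delete_=True` stops these dicts from being merged with the dataset
# settings inherited from the base config.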
k710_trainset = dict(
type='ConcatDataset',
datasets=[k400_trainset, k600_trainset, k700_trainset],
_delete_=True)
k710_valset = dict(
type='ConcatDataset',
datasets=[k400_valset, k600_valset, k700_valset],
_delete_=True)
k710_testset = dict(
type='ConcatDataset',
datasets=[k400_testset, k600_testset, k700_testset],
_delete_=True,
)

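# Per the metafile: 32 GPUs x 8 videos per GPU, i.e. a total training batch
# size of 256.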
train_dataloader = dict(
batch_size=8,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=k710_trainset)
val_dataloader = dict(
batch_size=8,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=k710_valset)
test_dataloader = dict(
batch_size=1,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=k710_testset)
8 changes: 7 additions & 1 deletion configs/recognition/swin/README.md
@@ -31,7 +31,13 @@ The vision community is witnessing a modeling shift from CNNs to Transformers, w

| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
| :---------------------: | :--------: | :--: | :------: | :----------: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :------------------------------: |
- | 32x2x1 | 224x224 | 16 | Swin-L | ImageNet-22k | 75.92 | 92.72 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py.log) |
+ | 32x2x1 | 224x224 | 16 | Swin-L | ImageNet-22k | 75.92 | 92.72 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.log) |

### Kinetics-710

| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
| :---------------------: | :--------: | :--: | :------: | :---------: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: |
| 32x2x1 | 224x224 | 32 | Swin-S | ImageNet-1k | 76.90 | 92.96 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb_20230612-8e082ff1.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.log) |

1. The **gpus** column indicates the number of GPUs we used to obtain the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this flag scales the learning rate according to the ratio between the actual batch size and the original batch size.
2. The values in the "reference" columns are results obtained by testing on our dataset with the checkpoints provided by the authors, using the same model settings. `*` means the numbers are copied from the paper.
23 changes: 23 additions & 0 deletions configs/recognition/swin/metafile.yml
@@ -120,3 +120,26 @@ Models:
Top 5 Accuracy: 92.72
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth

- Name: swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb
Config: configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py
In Collection: Swin
Metadata:
Architecture: Swin-S
Batch Size: 4
Epochs: 30
FLOPs: 604G
Parameters: 197M
Pretrained: ImageNet-1K
Resolution: 224x224
Training Data: Kinetics-710
Training Resources: 32 GPUs
Modality: RGB
Results:
- Dataset: Kinetics-710
Task: Action Recognition
Metrics:
Top 1 Accuracy: 76.90
Top 5 Accuracy: 92.96
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb_20230612-8e082ff1.pth
144 changes: 144 additions & 0 deletions configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py
@@ -0,0 +1,144 @@
_base_ = [
'swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py'
]

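# Same Kinetics-710 recipe as the SlowOnly config: widen the head to the merged
# 710-class label space and train on the concatenated K400/K600/K700 data.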
model = dict(cls_head=dict(num_classes=710))

file_client_args = dict(io_backend='disk')
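# 32x2x1 sampling: each training clip is 32 frames taken at a stride of 2,
# one clip per video.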
train_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]
val_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]
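# Testing uses 4 clips x 3 crops at 224x224, matching the protocol in the
# README table.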
test_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=4,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 224)),
dict(type='ThreeCrop', crop_size=224),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]

k400_data_root = 'data/kinetics400/videos_train'
k600_data_root = 'data/kinetics600/videos'
k700_data_root = 'data/kinetics700/videos'
k400_data_root_val = 'data/kinetics400/videos_val'
k600_data_root_val = k600_data_root
k700_data_root_val = k700_data_root

k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt'
k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt'
k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt'

k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt'
k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt'
k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt'

k400_trainset = dict(
type='VideoDataset',
ann_file=k400_ann_file_train,
data_prefix=dict(video=k400_data_root),
pipeline=train_pipeline)
k600_trainset = dict(
type='VideoDataset',
ann_file=k600_ann_file_train,
data_prefix=dict(video=k600_data_root),
pipeline=train_pipeline)
k700_trainset = dict(
type='VideoDataset',
ann_file=k700_ann_file_train,
data_prefix=dict(video=k700_data_root),
pipeline=train_pipeline)

k400_valset = dict(
type='VideoDataset',
ann_file=k400_ann_file_val,
data_prefix=dict(video=k400_data_root_val),
pipeline=val_pipeline,
test_mode=True)
k600_valset = dict(
type='VideoDataset',
ann_file=k600_ann_file_val,
data_prefix=dict(video=k600_data_root_val),
pipeline=val_pipeline,
test_mode=True)
k700_valset = dict(
type='VideoDataset',
ann_file=k700_ann_file_val,
data_prefix=dict(video=k700_data_root_val),
pipeline=val_pipeline,
test_mode=True)

k400_testset = k400_valset.copy()
k600_testset = k600_valset.copy()
k700_testset = k700_valset.copy()
k400_testset['pipeline'] = test_pipeline
k600_testset['pipeline'] = test_pipeline
k700_testset['pipeline'] = test_pipeline

k710_trainset = dict(
type='ConcatDataset',
datasets=[k400_trainset, k600_trainset, k700_trainset],
_delete_=True)
k710_valset = dict(
type='ConcatDataset',
datasets=[k400_valset, k600_valset, k700_valset],
_delete_=True)
k710_testset = dict(
type='ConcatDataset',
datasets=[k400_testset, k600_testset, k700_testset],
_delete_=True,
)

train_dataloader = dict(
batch_size=4,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=k710_trainset)
val_dataloader = dict(
batch_size=4,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=k710_valset)
test_dataloader = dict(
batch_size=1,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=k710_testset)

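# 32 GPUs x 4 videos per GPU gives a total batch size of 128, equal to
# `base_batch_size` below, so automatic LR scaling is left disabled and the
# learning rate is set explicitly instead.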
optim_wrapper = dict(optimizer=dict(lr=2e-3))

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (16 GPUs) x (8 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=128)