[Feature] Support DanceTrack dataset for MOT #543

Merged: 10 commits, May 12, 2022
74 changes: 74 additions & 0 deletions configs/_base_/datasets/dancetrack.py
@@ -0,0 +1,74 @@
# dataset settings
dataset_type = 'DanceTrackDataset'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadMultiImagesFromFile', to_float32=True),
dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
dict(
type='SeqResize',
img_scale=(1088, 1088),
share_params=True,
ratio_range=(0.8, 1.2),
keep_ratio=True,
bbox_clip_border=False),
dict(type='SeqPhotoMetricDistortion', share_params=True),
dict(
type='SeqRandomCrop',
share_params=False,
crop_size=(1088, 1088),
bbox_clip_border=False),
dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
dict(type='SeqNormalize', **img_norm_cfg),
dict(type='SeqPad', size_divisor=32),
dict(type='MatchInstances', skip_nomatch=True),
dict(
type='VideoCollect',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',
'gt_instance_ids'
]),
dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1088, 1088),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='VideoCollect', keys=['img'])
])
]
data_root = 'data/dancetrack/'
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
visibility_thr=-1,
ann_file=data_root + 'annotations/train_cocoformat.json',
img_prefix=data_root + 'train',
ref_img_sampler=dict(
num_ref_imgs=1,
frame_range=10,
filter_key_img=True,
method='uniform'),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/val_cocoformat.json',
img_prefix=data_root + 'val',
ref_img_sampler=None,
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/val_cocoformat.json',
img_prefix=data_root + 'val',
ref_img_sampler=None,
pipeline=test_pipeline))
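
For context, a dataset config like this is consumed through mmcv's `Config` loader and mmtracking's dataset builder. The following is a minimal sketch, not part of the PR, assuming an mmtracking 0.x environment and data already converted to the layout described in `docs/en/dataset.md`:

```python
# Minimal sketch: load the DanceTrack dataset config and build the
# training dataset (assumes mmcv 1.x / mmtracking 0.x and converted data).
from mmcv import Config
from mmtrack.datasets import build_dataset

cfg = Config.fromfile('configs/_base_/datasets/dancetrack.py')

# cfg.data.train bundles the annotation file, image prefix, the reference
# image sampler, and the training pipeline defined above.
train_dataset = build_dataset(cfg.data.train)
print(len(train_dataset))  # number of key frames available for training
```

The `ref_img_sampler` entry controls how reference frames are paired with each key frame: with `num_ref_imgs=1` and `frame_range=10`, one reference frame is drawn uniformly from within 10 frames of the key frame.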
81 changes: 81 additions & 0 deletions configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py
@@ -0,0 +1,81 @@
_base_ = [
'../../_base_/models/faster_rcnn_r50_fpn.py',
'../../_base_/default_runtime.py'
]
model = dict(
type='QDTrack',
detector=dict(
backbone=dict(
norm_cfg=dict(requires_grad=False),
style='caffe',
init_cfg=dict(
type='Pretrained', checkpoint='torchvision://resnet50')),
rpn_head=dict(bbox_coder=dict(clip_border=False)),
roi_head=dict(
bbox_head=dict(
loss_bbox=dict(type='L1Loss', loss_weight=1.0),
bbox_coder=dict(clip_border=False),
num_classes=1)),
init_cfg=dict(
type='Pretrained',
checkpoint= # noqa: E251
'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501
)),
track_head=dict(
type='QuasiDenseTrackHead',
roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
embed_head=dict(
type='QuasiDenseEmbedHead',
num_convs=4,
num_fcs=1,
embed_channels=256,
norm_cfg=dict(type='GN', num_groups=32),
loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
loss_track_aux=dict(
type='L2Loss',
neg_pos_ub=3,
pos_margin=0,
neg_margin=0.1,
hard_mining=True,
loss_weight=1.0)),
loss_bbox=dict(type='L1Loss', loss_weight=1.0),
train_cfg=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='CombinedSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=3,
add_gt_as_proposals=True,
pos_sampler=dict(type='InstanceBalancedPosSampler'),
neg_sampler=dict(type='RandomSampler')))),
tracker=dict(
type='QuasiDenseEmbedTracker',
init_score_thr=0.9,
obj_score_thr=0.5,
match_score_thr=0.5,
memo_tracklet_frames=30,
memo_backdrop_frames=1,
memo_momentum=0.8,
nms_conf_thr=0.5,
nms_backdrop_iou_thr=0.3,
nms_class_iou_thr=0.7,
with_cats=True,
match_metric='bisoftmax'))
# optimizer && learning policy
optimizer_config = dict(
_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(policy='step', step=[3])
# runtime settings
total_epochs = 4
evaluation = dict(metric=['bbox', 'track'], interval=1)
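
This base file exists so that dataset-specific configs only restate what differs. As a hypothetical illustration (not part of this PR) of how mmcv's `_base_` inheritance merges dicts recursively, a derived config can override a single tracker threshold while inheriting everything else:

```python
# Hypothetical derived config illustrating `_base_` inheritance: only the
# keys listed here override the base; the detector, track head, and
# schedule are inherited unchanged.
_base_ = ['./qdtrack_faster-rcnn_r50_fpn_4e_base.py']

# e.g. make track initialization stricter than the base's 0.9
model = dict(tracker=dict(init_score_thr=0.95))
```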
53 changes: 53 additions & 0 deletions configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py
@@ -0,0 +1,53 @@
_base_ = [
'./qdtrack_faster-rcnn_r50_fpn_4e_base.py',
'../../_base_/datasets/dancetrack.py',
]
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadMultiImagesFromFile', to_float32=True),
dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
dict(
type='SeqResize',
img_scale=(1088, 1088),
share_params=True,
ratio_range=(0.8, 1.2),
keep_ratio=True,
bbox_clip_border=False),
dict(type='SeqPhotoMetricDistortion', share_params=True),
dict(
type='SeqRandomCrop',
share_params=False,
crop_size=(1088, 1088),
bbox_clip_border=False),
dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
dict(type='SeqNormalize', **img_norm_cfg),
dict(type='SeqPad', size_divisor=32),
dict(type='MatchInstances', skip_nomatch=True),
dict(
type='VideoCollect',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',
'gt_instance_ids'
]),
dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1088, 1088),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='VideoCollect', keys=['img'])
])
]
data = dict(
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
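
Once a checkpoint is trained against this config, inference follows mmtracking's standard MOT API. A rough sketch, assuming an mmtracking 0.x install; the checkpoint and sequence paths below are placeholders:

```python
# Rough inference sketch for the DanceTrack QDTrack config.
import os
from mmtrack.apis import init_model, inference_mot

config_file = 'configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py'
checkpoint_file = 'work_dirs/qdtrack_dancetrack/latest.pth'  # hypothetical path

model = init_model(config_file, checkpoint_file, device='cuda:0')

img_dir = 'data/dancetrack/val/dancetrack0004/img1'  # hypothetical sequence
for frame_id, fname in enumerate(sorted(os.listdir(img_dir))):
    result = inference_mot(model, os.path.join(img_dir, fname),
                           frame_id=frame_id)
    # result is expected to carry per-frame detections and track boxes
    # (typically 'det_bboxes' and 'track_bboxes' in mmtracking 0.x).
```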
@@ -1,77 +1,7 @@
_base_ = [
'../../_base_/models/faster_rcnn_r50_fpn.py',
'../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py'
'./qdtrack_faster-rcnn_r50_fpn_4e_base.py',
'../../_base_/datasets/mot_challenge.py',
]
model = dict(
type='QDTrack',
detector=dict(
backbone=dict(
norm_cfg=dict(requires_grad=False),
style='caffe',
init_cfg=dict(
type='Pretrained', checkpoint='torchvision://resnet50')),
rpn_head=dict(bbox_coder=dict(clip_border=False)),
roi_head=dict(
bbox_head=dict(
loss_bbox=dict(type='L1Loss', loss_weight=1.0),
bbox_coder=dict(clip_border=False),
num_classes=1)),
init_cfg=dict(
type='Pretrained',
checkpoint= # noqa: E251
'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501
)),
track_head=dict(
type='QuasiDenseTrackHead',
roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
embed_head=dict(
type='QuasiDenseEmbedHead',
num_convs=4,
num_fcs=1,
embed_channels=256,
norm_cfg=dict(type='GN', num_groups=32),
loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
loss_track_aux=dict(
type='L2Loss',
neg_pos_ub=3,
pos_margin=0,
neg_margin=0.1,
hard_mining=True,
loss_weight=1.0)),
loss_bbox=dict(type='L1Loss', loss_weight=1.0),
train_cfg=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='CombinedSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=3,
add_gt_as_proposals=True,
pos_sampler=dict(type='InstanceBalancedPosSampler'),
neg_sampler=dict(type='RandomSampler')))),
tracker=dict(
type='QuasiDenseEmbedTracker',
init_score_thr=0.9,
obj_score_thr=0.5,
match_score_thr=0.5,
memo_tracklet_frames=30,
memo_backdrop_frames=1,
memo_momentum=0.8,
nms_conf_thr=0.5,
nms_backdrop_iou_thr=0.3,
nms_class_iou_thr=0.7,
with_cats=True,
match_metric='bisoftmax'))
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
@@ -121,10 +51,3 @@
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# optimizer && learning policy
optimizer_config = dict(
_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(policy='step', step=[3])
# runtime settings
total_epochs = 4
evaluation = dict(metric=['bbox', 'track'], interval=1)
17 changes: 16 additions & 1 deletion docs/en/dataset.md
@@ -9,6 +9,7 @@ This page provides the instructions for dataset preparation on existing benchmar
- [CrowdHuman](https://www.crowdhuman.org/)
- [LVIS](https://www.lvisdataset.org/)
- [TAO](https://taodataset.org/)
- [DanceTrack](https://dancetrack.github.io)
- Single Object Tracking
- [LaSOT](http://vision.cs.stonybrook.edu/~lasot/)
- [UAV123](https://cemse.kaust.edu.sa/ivul/uav123/)
@@ -31,7 +32,7 @@ Please download the datasets from the official websites. It is recommended to sy

#### 1.2 Multiple Object Tracking

- For the training and testing of multi object tracking task, one of the MOT Challenge datasets (e.g. MOT17) and TAO are needed, CrowdHuman and LVIS can be served as comlementary dataset.
- For training and testing of the multiple object tracking task, one of the MOT Challenge datasets (e.g. MOT17), TAO, or DanceTrack is needed; CrowdHuman and LVIS can serve as complementary datasets.

- The `annotations` under `tao` contains the official annotations from [here](https://github.com/TAO-Dataset/annotations).

@@ -98,6 +99,11 @@ mmtracking
│ │ ├── train
│ │ ├── test
│ │
│ ├── DanceTrack
│ │ ├── train
│ │ ├── val
│ │ ├── test
│ │
│ ├── crowdhuman
│ │ ├── annotation_train.odgt
│ │ ├── annotation_val.odgt
@@ -230,6 +236,9 @@ python ./tools/convert_datasets/ilsvrc/imagenet2coco_vid.py -i ./data/ILSVRC -o
python ./tools/convert_datasets/mot/mot2coco.py -i ./data/MOT17/ -o ./data/MOT17/annotations --split-train --convert-det
python ./tools/convert_datasets/mot/mot2reid.py -i ./data/MOT17/ -o ./data/MOT17/reid --val-split 0.2 --vis-threshold 0.3

# DanceTrack
python ./tools/convert_datasets/dancetrack/dancetrack2coco.py -i ./data/DanceTrack -o ./data/DanceTrack/annotations

# CrowdHuman
python ./tools/convert_datasets/mot/crowdhuman2coco.py -i ./data/crowdhuman -o ./data/crowdhuman/annotations

@@ -320,6 +329,12 @@ mmtracking
│ │ │ ├── imgs
│ │ │ ├── meta
│ │
│ ├── DanceTrack
│ │ ├── train
│ │ ├── val
│ │ ├── test
│ │ ├── annotations
│ │
│ ├── crowdhuman
│ │ ├── annotation_train.odgt
│ │ ├── annotation_val.odgt
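
After conversion, a quick sanity check on the generated JSON can catch path or format mistakes early. This is a sketch, not part of the PR: the top-level keys assume mmtracking's CocoVID convention, and the path follows the conversion command above (note the configs in this PR use a lower-case `data/dancetrack/` root, so adjust to your local layout):

```python
# Hypothetical sanity check of the converted DanceTrack annotations.
import json

ann_file = 'data/DanceTrack/annotations/train_cocoformat.json'
with open(ann_file) as f:
    ann = json.load(f)

# Top-level keys assumed per mmtracking's CocoVID-style annotation format.
for key in ('videos', 'images', 'annotations', 'categories'):
    assert key in ann, f'missing top-level key: {key}'

print(f"{len(ann['videos'])} videos, {len(ann['images'])} frames, "
      f"{len(ann['annotations'])} boxes")
```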