[Feature] Support DanceTrack dataset for MOT (#543)
* fix format

* support dancetrack dataset

* delete unused function; inherit dancetrack dataset class from MOTChallenge

* remove duplicated content in DancetrackDataset

* remove legacy code

* refactor DanceTrackDataset class

* fix error in dancetrack qdtrack config file

* refactor qdtrack config files.

* Update dancetrack_dataset.py

Co-authored-by: Tao Gong <gt950513@mail.ustc.edu.cn>
noahcao and GT9505 authored May 12, 2022
1 parent 6e411ff commit 202d7fe
Showing 10 changed files with 450 additions and 89 deletions.
74 changes: 74 additions & 0 deletions configs/_base_/datasets/dancetrack.py
@@ -0,0 +1,74 @@
# dataset settings
dataset_type = 'DanceTrackDataset'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadMultiImagesFromFile', to_float32=True),
    dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
    dict(
        type='SeqResize',
        img_scale=(1088, 1088),
        share_params=True,
        ratio_range=(0.8, 1.2),
        keep_ratio=True,
        bbox_clip_border=False),
    dict(type='SeqPhotoMetricDistortion', share_params=True),
    dict(
        type='SeqRandomCrop',
        share_params=False,
        crop_size=(1088, 1088),
        bbox_clip_border=False),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(type='SeqPad', size_divisor=32),
    dict(type='MatchInstances', skip_nomatch=True),
    dict(
        type='VideoCollect',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',
            'gt_instance_ids'
        ]),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1088, 1088),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='VideoCollect', keys=['img'])
        ])
]
data_root = 'data/dancetrack/'
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        visibility_thr=-1,
        ann_file=data_root + 'annotations/train_cocoformat.json',
        img_prefix=data_root + 'train',
        ref_img_sampler=dict(
            num_ref_imgs=1,
            frame_range=10,
            filter_key_img=True,
            method='uniform'),
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/val_cocoformat.json',
        img_prefix=data_root + 'val',
        ref_img_sampler=None,
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/val_cocoformat.json',
        img_prefix=data_root + 'val',
        ref_img_sampler=None,
        pipeline=test_pipeline))
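To sanity-check the dataset side in isolation, here is a minimal sketch (our illustration, not part of the commit) of loading this config and building the training set with mmtracking's standard builders; it assumes an mmtracking 0.x installation, the repository root as the working directory, and DanceTrack annotations already converted to COCO format as described in docs/en/dataset.md:

# Minimal sketch: build the DanceTrack training set from this config.
from mmcv import Config
from mmtrack.datasets import build_dataset

cfg = Config.fromfile('configs/_base_/datasets/dancetrack.py')
dataset = build_dataset(cfg.data.train)  # a DanceTrackDataset instance
print(len(dataset))  # number of key frames available for training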
81 changes: 81 additions & 0 deletions configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py
@@ -0,0 +1,81 @@
_base_ = [
    '../../_base_/models/faster_rcnn_r50_fpn.py',
    '../../_base_/default_runtime.py'
]
model = dict(
    type='QDTrack',
    detector=dict(
        backbone=dict(
            norm_cfg=dict(requires_grad=False),
            style='caffe',
            init_cfg=dict(
                type='Pretrained', checkpoint='torchvision://resnet50')),
        rpn_head=dict(bbox_coder=dict(clip_border=False)),
        roi_head=dict(
            bbox_head=dict(
                loss_bbox=dict(type='L1Loss', loss_weight=1.0),
                bbox_coder=dict(clip_border=False),
                num_classes=1)),
        init_cfg=dict(
            type='Pretrained',
            checkpoint=  # noqa: E251
            'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'  # noqa: E501
        )),
    track_head=dict(
        type='QuasiDenseTrackHead',
        roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        embed_head=dict(
            type='QuasiDenseEmbedHead',
            num_convs=4,
            num_fcs=1,
            embed_channels=256,
            norm_cfg=dict(type='GN', num_groups=32),
            loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
            loss_track_aux=dict(
                type='L2Loss',
                neg_pos_ub=3,
                pos_margin=0,
                neg_margin=0.1,
                hard_mining=True,
                loss_weight=1.0)),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0),
        train_cfg=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='CombinedSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=3,
                add_gt_as_proposals=True,
                pos_sampler=dict(type='InstanceBalancedPosSampler'),
                neg_sampler=dict(type='RandomSampler')))),
    tracker=dict(
        type='QuasiDenseEmbedTracker',
        init_score_thr=0.9,
        obj_score_thr=0.5,
        match_score_thr=0.5,
        memo_tracklet_frames=30,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=True,
        match_metric='bisoftmax'))
# optimizer && learning policy
optimizer_config = dict(
    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(policy='step', step=[3])
# runtime settings
total_epochs = 4
evaluation = dict(metric=['bbox', 'track'], interval=1)
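For orientation (our note, not from the commit): the dataset-specific files below inherit everything here via `_base_`, and mmcv merges dicts key-by-key unless `_delete_=True` forces wholesale replacement, which is why `optimizer_config` above sets it. A small sketch, assuming mmcv is installed and the repository root is the working directory:

# Sketch of mmcv's config inheritance on this base file.
from mmcv import Config

cfg = Config.fromfile(
    'configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py')
print(cfg.model.tracker.match_metric)  # 'bisoftmax', defined above
print(cfg.total_epochs)  # 4
# _delete_=True above replaces any inherited optimizer_config outright
# instead of merging grad_clip into whatever the base runtime defines.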
53 changes: 53 additions & 0 deletions configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py
@@ -0,0 +1,53 @@
_base_ = [
    './qdtrack_faster-rcnn_r50_fpn_4e_base.py',
    '../../_base_/datasets/dancetrack.py',
]
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
    dict(type='LoadMultiImagesFromFile', to_float32=True),
    dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
    dict(
        type='SeqResize',
        img_scale=(1088, 1088),
        share_params=True,
        ratio_range=(0.8, 1.2),
        keep_ratio=True,
        bbox_clip_border=False),
    dict(type='SeqPhotoMetricDistortion', share_params=True),
    dict(
        type='SeqRandomCrop',
        share_params=False,
        crop_size=(1088, 1088),
        bbox_clip_border=False),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(type='SeqPad', size_divisor=32),
    dict(type='MatchInstances', skip_nomatch=True),
    dict(
        type='VideoCollect',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',
            'gt_instance_ids'
        ]),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1088, 1088),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='VideoCollect', keys=['img'])
        ])
]
data = dict(
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
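A short usage sketch (the checkpoint path and demo frame are our assumptions, not from the commit) of running this config for inference with mmtracking's high-level API; training goes through tools/train.py in the usual way:

# Sketch: single-frame MOT inference with the DanceTrack QDTrack config.
from mmtrack.apis import inference_mot, init_model

config = 'configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py'
# 'checkpoints/qdtrack_dancetrack.pth' is a hypothetical local path.
model = init_model(config, 'checkpoints/qdtrack_dancetrack.pth',
                   device='cuda:0')
# frame_id ties consecutive calls together so the tracker can keep
# identities across the video.
result = inference_mot(model, 'demo/demo_frame.jpg', frame_id=0)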
configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17.py
@@ -1,77 +1,7 @@
_base_ = [
-     '../../_base_/models/faster_rcnn_r50_fpn.py',
-     '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py'
+     './qdtrack_faster-rcnn_r50_fpn_4e_base.py',
+     '../../_base_/datasets/mot_challenge.py',
]
- model = dict(
-     type='QDTrack',
-     detector=dict(
-         backbone=dict(
-             norm_cfg=dict(requires_grad=False),
-             style='caffe',
-             init_cfg=dict(
-                 type='Pretrained', checkpoint='torchvision://resnet50')),
-         rpn_head=dict(bbox_coder=dict(clip_border=False)),
-         roi_head=dict(
-             bbox_head=dict(
-                 loss_bbox=dict(type='L1Loss', loss_weight=1.0),
-                 bbox_coder=dict(clip_border=False),
-                 num_classes=1)),
-         init_cfg=dict(
-             type='Pretrained',
-             checkpoint=  # noqa: E251
-             'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'  # noqa: E501
-         )),
-     track_head=dict(
-         type='QuasiDenseTrackHead',
-         roi_extractor=dict(
-             type='SingleRoIExtractor',
-             roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
-             out_channels=256,
-             featmap_strides=[4, 8, 16, 32]),
-         embed_head=dict(
-             type='QuasiDenseEmbedHead',
-             num_convs=4,
-             num_fcs=1,
-             embed_channels=256,
-             norm_cfg=dict(type='GN', num_groups=32),
-             loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
-             loss_track_aux=dict(
-                 type='L2Loss',
-                 neg_pos_ub=3,
-                 pos_margin=0,
-                 neg_margin=0.1,
-                 hard_mining=True,
-                 loss_weight=1.0)),
-         loss_bbox=dict(type='L1Loss', loss_weight=1.0),
-         train_cfg=dict(
-             assigner=dict(
-                 type='MaxIoUAssigner',
-                 pos_iou_thr=0.7,
-                 neg_iou_thr=0.5,
-                 min_pos_iou=0.5,
-                 match_low_quality=False,
-                 ignore_iof_thr=-1),
-             sampler=dict(
-                 type='CombinedSampler',
-                 num=256,
-                 pos_fraction=0.5,
-                 neg_pos_ub=3,
-                 add_gt_as_proposals=True,
-                 pos_sampler=dict(type='InstanceBalancedPosSampler'),
-                 neg_sampler=dict(type='RandomSampler')))),
-     tracker=dict(
-         type='QuasiDenseEmbedTracker',
-         init_score_thr=0.9,
-         obj_score_thr=0.5,
-         match_score_thr=0.5,
-         memo_tracklet_frames=30,
-         memo_backdrop_frames=1,
-         memo_momentum=0.8,
-         nms_conf_thr=0.5,
-         nms_backdrop_iou_thr=0.3,
-         nms_class_iou_thr=0.7,
-         with_cats=True,
-         match_metric='bisoftmax'))
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
@@ -121,10 +51,3 @@
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
- # optimizer && learning policy
- optimizer_config = dict(
-     _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
- lr_config = dict(policy='step', step=[3])
- # runtime settings
- total_epochs = 4
- evaluation = dict(metric=['bbox', 'track'], interval=1)
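A quick check (hypothetical, not part of the commit) that the slimmed-down MOT17 file still resolves to the full merged configuration after this refactor:

# Sketch: the model block deleted above should now be inherited from the
# base file, while dataset settings come from mot_challenge.py.
from mmcv import Config

cfg = Config.fromfile(
    'configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17.py')
assert cfg.model.type == 'QDTrack'
assert cfg.data.train.type == 'MOTChallengeDataset'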
17 changes: 16 additions & 1 deletion docs/en/dataset.md
@@ -9,6 +9,7 @@ This page provides the instructions for dataset preparation on existing benchmarks
  - [CrowdHuman](https://www.crowdhuman.org/)
  - [LVIS](https://www.lvisdataset.org/)
  - [TAO](https://taodataset.org/)
  - [DanceTrack](https://dancetrack.github.io)
- Single Object Tracking
  - [LaSOT](http://vision.cs.stonybrook.edu/~lasot/)
  - [UAV123](https://cemse.kaust.edu.sa/ivul/uav123/)
@@ -31,7 +32,7 @@ Please download the datasets from the official websites. It is recommended to symlink the root of the datasets to $MMTRACKING/data.

#### 1.2 Multiple Object Tracking

- - For the training and testing of multi object tracking task, one of the MOT Challenge datasets (e.g. MOT17) and TAO are needed, CrowdHuman and LVIS can be served as comlementary dataset.
+ - For the training and testing of the multi-object tracking task, one of the MOT Challenge datasets (e.g. MOT17), TAO, or DanceTrack is needed; CrowdHuman and LVIS can serve as complementary datasets.

- The `annotations` under `tao` contains the official annotations from [here](https://github.com/TAO-Dataset/annotations).

@@ -98,6 +99,11 @@ mmtracking
|   |   ├── train
|   |   ├── test
│   │
|   ├── DanceTrack
|   |   ├── train
|   |   ├── val
|   |   ├── test
|   |
│   ├── crowdhuman
│   │   ├── annotation_train.odgt
│   │   ├── annotation_val.odgt
@@ -230,6 +236,9 @@ python ./tools/convert_datasets/ilsvrc/imagenet2coco_vid.py -i ./data/ILSVRC -o
python ./tools/convert_datasets/mot/mot2coco.py -i ./data/MOT17/ -o ./data/MOT17/annotations --split-train --convert-det
python ./tools/convert_datasets/mot/mot2reid.py -i ./data/MOT17/ -o ./data/MOT17/reid --val-split 0.2 --vis-threshold 0.3

# DanceTrack
python ./tools/convert_datasets/dancetrack/dancetrack2coco.py -i ./data/DanceTrack -o ./data/DanceTrack/annotations

# CrowdHuman
python ./tools/convert_datasets/mot/crowdhuman2coco.py -i ./data/crowdhuman -o ./data/crowdhuman/annotations

@@ -320,6 +329,12 @@ mmtracking
│   │   │   ├── imgs
│   │   │   ├── meta
│   │
│   ├── DanceTrack
│   │   ├── train
│   │   ├── val
│   │   ├── test
│   │   ├── annotations
│   │
│   ├── crowdhuman
│   │   ├── annotation_train.odgt
│   │   ├── annotation_val.odgt
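For reference, below is a sketch of the CocoVID-style layout that dancetrack2coco.py is expected to produce in the *_cocoformat.json files. The field names follow mmtracking's CocoVID convention; the concrete values and the 'pedestrian' category are illustrative assumptions, not output copied from the converter.

# Hypothetical CocoVID-style annotation snippet (values are made up).
# 'instance_id' is the track identity consumed as 'gt_instance_ids' by
# SeqLoadAnnotations; 'frame_id' orders images within a video.
ann_example = {
    'categories': [{'id': 1, 'name': 'pedestrian'}],
    'videos': [{'id': 1, 'name': 'dancetrack0001'}],
    'images': [{
        'id': 1,
        'video_id': 1,
        'frame_id': 0,  # 0-based position inside the video
        'file_name': 'dancetrack0001/img1/00000001.jpg',
        'width': 1920,
        'height': 1080,
    }],
    'annotations': [{
        'id': 1,
        'image_id': 1,
        'category_id': 1,
        'instance_id': 1,  # stays constant for one dancer across frames
        'bbox': [100.0, 200.0, 50.0, 120.0],  # x, y, w, h
        'area': 6000.0,
        'iscrowd': 0,
    }],
}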