Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add runner type #118

Merged
merged 20 commits into from
Oct 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/_base_/schedules/schedule_160k.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
total_iters = 160000
runner = dict(type='IterBasedRunner', max_iters=160000)
checkpoint_config = dict(by_epoch=False, interval=16000)
evaluation = dict(interval=16000, metric='mIoU')
2 changes: 1 addition & 1 deletion configs/_base_/schedules/schedule_20k.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
total_iters = 20000
runner = dict(type='IterBasedRunner', max_iters=20000)
checkpoint_config = dict(by_epoch=False, interval=2000)
evaluation = dict(interval=2000, metric='mIoU')
2 changes: 1 addition & 1 deletion configs/_base_/schedules/schedule_40k.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
total_iters = 40000
runner = dict(type='IterBasedRunner', max_iters=40000)
checkpoint_config = dict(by_epoch=False, interval=4000)
evaluation = dict(interval=4000, metric='mIoU')
2 changes: 1 addition & 1 deletion configs/_base_/schedules/schedule_80k.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
total_iters = 80000
runner = dict(type='IterBasedRunner', max_iters=80000)
checkpoint_config = dict(by_epoch=False, interval=8000)
evaluation = dict(interval=8000, metric='mIoU')
6 changes: 4 additions & 2 deletions docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ dist_params = dict(backend='nccl') # Parameters to setup distributed training,
log_level = 'INFO' # The level of logging.
load_from = None # Load a pre-trained model from the given path. This will not resume training.
resume_from = None # Resume from the checkpoint at the given path; training continues from the iteration at which the checkpoint was saved.
workflow = [('train', 1)] # Workflow for runner. [('train', 1)] means there is only one workflow and the workflow named 'train' is executed once. The workflow trains the model by 40000 iterations according to the total_iters.
workflow = [('train', 1)] # Workflow for runner. [('train', 1)] means there is only one workflow and the workflow named 'train' is executed once. The workflow trains the model for 40000 iterations according to `runner.max_iters`.
cudnn_benchmark = True # Whether use cudnn_benchmark to speed up, which is fast for fixed input size.
optimizer = dict( # Config used to build optimizer, support all the optimizers in PyTorch whose arguments are also the same as those in PyTorch
type='SGD', # Type of optimizers, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/optimizer/default_constructor.py#L13 for more details
Expand All @@ -238,7 +238,9 @@ lr_config = dict(
power=0.9, # The power of polynomial decay.
min_lr=0.0001, # The minimum learning rate, used to stabilize the training.
by_epoch=False) # Whether to count by epoch or not.
total_iters = 40000 # Total number of iterations.
runner = dict(
type='IterBasedRunner', # Type of runner to use (e.g. IterBasedRunner or EpochBasedRunner)
max_iters=40000) # Total number of iterations. For EpochBasedRunner use `max_epochs`
checkpoint_config = dict( # Config to set the checkpoint hook, Refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/checkpoint.py for implementation.
by_epoch=False, # Whether to count by epoch or not.
interval=4000) # The save interval.
Expand Down
2 changes: 1 addition & 1 deletion mmseg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from .version import __version__, version_info

MMCV_MIN = '1.1.2'
MMCV_MIN = '1.1.4'
MMCV_MAX = '1.2.0'


Expand Down
28 changes: 19 additions & 9 deletions mmseg/apis/train.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import random
import warnings

import numpy as np
import torch
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import IterBasedRunner, build_optimizer
from mmcv.runner import build_optimizer, build_runner

from mmseg.core import DistEvalHook, EvalHook
from mmseg.datasets import build_dataloader, build_dataset
Expand Down Expand Up @@ -70,13 +71,21 @@ def train_segmentor(model,
# build runner
optimizer = build_optimizer(model, cfg.optimizer)

runner = IterBasedRunner(
model=model,
batch_processor=None,
optimizer=optimizer,
work_dir=cfg.work_dir,
logger=logger,
meta=meta)
if cfg.get('runner') is None:
cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters}
warnings.warn(
'config is now expected to have a `runner` section, '
'please set `runner` in your config.', UserWarning)

runner = build_runner(
cfg.runner,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This breaks backward compatibility. It would be better to keep the original API and add a deprecation warning.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was not sure how to maintain backward compatibility, so I created a default runner section that reproduces the old behavior.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, if the user is using an old config file, there will be an error since max_iters is not specified.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could keep backward compatibility so that users can still use old config files.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the block added above:

    if cfg.get('runner') is None:
        cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters}
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', DeprecationWarning)

User should be able to use an old config file, right?

default_args=dict(
model=model,
batch_processor=None,
optimizer=optimizer,
work_dir=cfg.work_dir,
logger=logger,
meta=meta))

# register hooks
runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
Expand All @@ -96,11 +105,12 @@ def train_segmentor(model,
dist=distributed,
shuffle=False)
eval_cfg = cfg.get('evaluation', {})
eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
eval_hook = DistEvalHook if distributed else EvalHook
runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

if cfg.resume_from:
runner.resume(cfg.resume_from)
elif cfg.load_from:
runner.load_checkpoint(cfg.load_from)
runner.run(data_loaders, cfg.workflow, cfg.total_iters)
runner.run(data_loaders, cfg.workflow)
33 changes: 30 additions & 3 deletions mmseg/core/evaluation/eval_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,27 @@ class EvalHook(Hook):
interval (int): Evaluation interval (by epochs). Default: 1.
"""

def __init__(self, dataloader, interval=1, **eval_kwargs):
def __init__(self, dataloader, interval=1, by_epoch=False, **eval_kwargs):
if not isinstance(dataloader, DataLoader):
raise TypeError('dataloader must be a pytorch DataLoader, but got '
f'{type(dataloader)}')
self.dataloader = dataloader
self.interval = interval
self.by_epoch = by_epoch
self.eval_kwargs = eval_kwargs

def after_train_iter(self, runner):
"""After train iter hook."""
if not self.every_n_iters(runner, self.interval):
if self.by_epoch or not self.every_n_iters(runner, self.interval):
return
from mmseg.apis import single_gpu_test
runner.log_buffer.clear()
results = single_gpu_test(runner.model, self.dataloader, show=False)
self.evaluate(runner, results)

def after_train_epoch(self, runner):
"""After train epoch hook."""
if not self.by_epoch or not self.every_n_epochs(runner, self.interval):
return
from mmseg.apis import single_gpu_test
runner.log_buffer.clear()
Expand Down Expand Up @@ -54,6 +64,7 @@ def __init__(self,
dataloader,
interval=1,
gpu_collect=False,
by_epoch=False,
**eval_kwargs):
if not isinstance(dataloader, DataLoader):
raise TypeError(
Expand All @@ -62,11 +73,27 @@ def __init__(self,
self.dataloader = dataloader
self.interval = interval
self.gpu_collect = gpu_collect
self.by_epoch = by_epoch
self.eval_kwargs = eval_kwargs

def after_train_iter(self, runner):
"""After train iter hook."""
if not self.every_n_iters(runner, self.interval):
if self.by_epoch or not self.every_n_iters(runner, self.interval):
return
from mmseg.apis import multi_gpu_test
runner.log_buffer.clear()
results = multi_gpu_test(
runner.model,
self.dataloader,
tmpdir=osp.join(runner.work_dir, '.eval_hook'),
gpu_collect=self.gpu_collect)
if runner.rank == 0:
print('\n')
self.evaluate(runner, results)

def after_train_epoch(self, runner):
"""After train epoch hook."""
if not self.by_epoch or not self.every_n_epochs(runner, self.interval):
return
from mmseg.apis import multi_gpu_test
runner.log_buffer.clear()
Expand Down
77 changes: 76 additions & 1 deletion tests/test_eval_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def train_step(self, data_batch, optimizer):
return dict(loss=loss)


def test_eval_hook():
def test_iter_eval_hook():
with pytest.raises(TypeError):
test_dataset = ExampleModel()
data_loader = [
Expand Down Expand Up @@ -75,6 +75,43 @@ def test_eval_hook():
logger=runner.logger)


def test_epoch_eval_hook():
    """Check ``EvalHook`` when it is configured with ``by_epoch=True``.

    Two things are verified:

    1. Passing something that is not a ``DataLoader`` (here, a list that
       wraps one) is rejected with ``TypeError``.
    2. With ``interval=2`` and an ``EpochBasedRunner`` run whose third
       ``run`` argument is 2 (presumably ``max_epochs`` -- confirm against
       the installed mmcv), the dataset's ``evaluate`` is called exactly
       once.
    """
    # Build the invalid argument outside the ``raises`` block so that a
    # mistake in the DataLoader arguments cannot satisfy the expected
    # TypeError; only the hook constructor is allowed to raise it.
    invalid_loader = [
        DataLoader(
            ExampleDataset(),
            batch_size=1,
            sampler=None,
            num_workers=0,
            shuffle=False)
    ]
    with pytest.raises(TypeError):
        EvalHook(invalid_loader, by_epoch=True)

    test_dataset = ExampleDataset()
    # Replace evaluate with a mock so the call made by the hook can be
    # inspected afterwards.
    test_dataset.evaluate = MagicMock(return_value=dict(test='success'))
    loader = DataLoader(test_dataset, batch_size=1)
    model = ExampleModel()
    data_loader = DataLoader(
        test_dataset, batch_size=1, sampler=None, num_workers=0, shuffle=False)
    optim_cfg = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
    optimizer = obj_from_dict(optim_cfg, torch.optim,
                              dict(params=model.parameters()))

    # test EvalHook with interval
    with tempfile.TemporaryDirectory() as tmpdir:
        eval_hook = EvalHook(data_loader, by_epoch=True, interval=2)
        runner = mmcv.runner.EpochBasedRunner(
            model=model,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logging.getLogger())
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)
        test_dataset.evaluate.assert_called_once_with([torch.tensor([1])],
                                                      logger=runner.logger)


def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
results = single_gpu_test(model, data_loader)
return results
Expand Down Expand Up @@ -116,3 +153,41 @@ def test_dist_eval_hook():
runner.run([loader], [('train', 1)], 1)
test_dataset.evaluate.assert_called_with([torch.tensor([1])],
logger=runner.logger)


@patch('mmseg.apis.multi_gpu_test', multi_gpu_test)
def test_dist_eval_hook_epoch():
    """Check ``DistEvalHook`` when it is configured with ``by_epoch=True``.

    ``mmseg.apis.multi_gpu_test`` is patched with this module's
    ``multi_gpu_test`` stand-in for the duration of the test.

    Two things are verified:

    1. Passing something that is not a ``DataLoader`` (here, a list that
       wraps one) is rejected with ``TypeError``.
    2. After an ``EpochBasedRunner`` run with ``interval=2`` and a third
       ``run`` argument of 2 (presumably ``max_epochs`` -- confirm against
       the installed mmcv), the dataset's ``evaluate`` has been called with
       the expected results and the runner's logger.
    """
    # Build the invalid argument outside the ``raises`` block so that a
    # mistake in the DataLoader arguments cannot satisfy the expected
    # TypeError; only the hook constructor is allowed to raise it.
    invalid_loader = [
        DataLoader(
            ExampleDataset(),
            batch_size=1,
            sampler=None,
            num_workers=0,
            shuffle=False)
    ]
    with pytest.raises(TypeError):
        DistEvalHook(invalid_loader)

    test_dataset = ExampleDataset()
    # Replace evaluate with a mock so the call made by the hook can be
    # inspected afterwards.
    test_dataset.evaluate = MagicMock(return_value=dict(test='success'))
    loader = DataLoader(test_dataset, batch_size=1)
    model = ExampleModel()
    data_loader = DataLoader(
        test_dataset, batch_size=1, sampler=None, num_workers=0, shuffle=False)
    optim_cfg = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
    optimizer = obj_from_dict(optim_cfg, torch.optim,
                              dict(params=model.parameters()))

    # test DistEvalHook
    with tempfile.TemporaryDirectory() as tmpdir:
        eval_hook = DistEvalHook(data_loader, by_epoch=True, interval=2)
        runner = mmcv.runner.EpochBasedRunner(
            model=model,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logging.getLogger())
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)
        test_dataset.evaluate.assert_called_with([torch.tensor([1])],
                                                 logger=runner.logger)