Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix: using auto adapt batch size with IterBasedRunner #2182

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ All notable changes to this project will be documented in this file.
### Bug fixes

- Fix backward compatibility with OpenVINO SSD-like detection models from OTE 0.5 (<https://github.com/openvinotoolkit/training_extensions/pull/1970>)
- Fix a bug where automatic batch-size adaptation could not be used with IterBasedRunner (<https://github.com/openvinotoolkit/training_extensions/pull/2182>)
sungmanc marked this conversation as resolved.
Show resolved Hide resolved

### Known issues

Expand Down
26 changes: 15 additions & 11 deletions otx/algorithms/common/adapters/mmcv/utils/automatic_bs.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,9 @@ def adapt_batch_size(train_func: Callable, cfg, datasets: List, validate: bool =
def train_func_single_iter(batch_size):
copied_cfg = deepcopy(cfg)
_set_batch_size(copied_cfg, batch_size)
_set_max_epoch(copied_cfg, 1) # setup for training a single iter to reduce time

# setup for training a single iter to reduce time
if copied_cfg.runner.get("type") == "AccuracyAwareRunner": # nncf case
if "nncf_config" in copied_cfg.runner:
_set_value_at_dict_in_dict(
copied_cfg.runner["nncf_config"], "accuracy_aware_training.params.maximal_total_epochs", 1
)
else:
copied_cfg.runner["max_epochs"] = 1

# Remove some hooks due to reasons below
# Remove hooks due to reasons below
# OTXProgressHook => prevent progress bar from being 0 and 100 repeatably
# earlystoppinghook => if eval hook is excluded, this hook makes an error due to absence of score history
# CustomEvalHook => exclude validation in classification task
Expand Down Expand Up @@ -115,7 +107,7 @@ def train_func_single_iter(batch_size):
logger.info(f"Batch size is adapted : {default_bs} -> {new_batch_size}")
logger.info(f"learning rate is adapted : {origin_lr} -> {cfg.optimizer.lr}")
else:
logger.info("Adapting batch size is done. Current batch size is availble.")
logger.info("Adapting batch size is done. Batch size isn't changed.")


def _get_batch_size(cfg) -> int:
Expand All @@ -131,6 +123,18 @@ def _set_batch_size(cfg, batch_size: int):
cfg.data.train_dataloader["samples_per_gpu"] = batch_size


def _set_max_epoch(cfg, max_epoch: int):
if cfg.runner.get("type") == "AccuracyAwareRunner": # nncf case
if "nncf_config" in cfg.runner:
_set_value_at_dict_in_dict(
cfg.runner["nncf_config"], "accuracy_aware_training.params.maximal_total_epochs", max_epoch
)
elif "iterbased" in cfg.runner["type"].lower():
cfg.runner["max_iters"] = max_epoch
else:
cfg.runner["max_epochs"] = max_epoch


class SubDataset:
"""Wrapper class to make dataset pretend to have specified number of images.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,12 @@ def common_cfg(mocker):
return mock_cfg


def set_mock_cfg_not_action(common_cfg):
    """Shape *common_cfg* as a non-action task config.

    Non-action tasks keep the batch size under
    ``data.train_dataloader["samples_per_gpu"]``.
    """
    common_cfg.data.train_dataloader = {"samples_per_gpu": DEFAULT_BS}
    return common_cfg


def set_mock_cfg_action(common_cfg):
    """Shape *common_cfg* as an action-classification task config.

    Action tasks keep the batch size in ``data.videos_per_gpu`` rather
    than in the train dataloader dict.
    """
    common_cfg.domain = "ACTION_CLASSIFICATION"
    common_cfg.data.videos_per_gpu = DEFAULT_BS
    return common_cfg
Expand All @@ -65,65 +63,52 @@ def mock_dataset(mocker):


@pytest.mark.parametrize("not_increase", [True, False])
@pytest.mark.parametrize("is_action_task", [True, False])
@pytest.mark.parametrize("is_iter_based_runner", [True, False])
def test_adapt_batch_size(
    mocker, mock_adapt_algo_cls, common_cfg, mock_dataset, not_increase, is_action_task, is_iter_based_runner
):
    """adapt_batch_size applies the found batch size and scales lr accordingly,
    while the probe training runs are trimmed to a single epoch/iteration."""
    # prepare
    mock_train_func = mocker.MagicMock()
    new_bs = DEFAULT_BS // 2 if not_increase else DEFAULT_BS + 2

    # iteration-based runners store the schedule under "max_iters"
    max_eph_name = "max_epochs"
    if is_iter_based_runner:
        common_cfg.runner = {"type": "IterBasedRunnerWithCancel", "max_iters": 100}
        max_eph_name = "max_iters"

    mock_config = set_mock_cfg_action(common_cfg) if is_action_task else set_mock_cfg_not_action(common_cfg)

    # execute
    adapt_batch_size(mock_train_func, mock_config, mock_dataset, False, not_increase)

    # check adapted batch size is applied
    if is_action_task:
        assert mock_config.data.videos_per_gpu == new_bs
    else:
        assert mock_config.data.train_dataloader["samples_per_gpu"] == new_bs
    # check learning rate is updated depending on adapted batch size
    bs_change_ratio = new_bs / DEFAULT_BS
    assert mock_config.optimizer.lr == pytest.approx(DEFAULT_LR * sqrt(bs_change_ratio))
    # check adapt function gets proper arguments
    assert mock_adapt_algo_cls.call_args.kwargs["default_bs"] == DEFAULT_BS
    assert mock_adapt_algo_cls.call_args.kwargs["max_bs"] == TRAINSET_SIZE
    # check length of dataset is decreased to reduce time
    assert len(mock_train_func.call_args_list[0].kwargs["dataset"][0]) == DEFAULT_BS
    assert len(mock_train_func.call_args_list[1].kwargs["dataset"][0]) == new_bs
    # check max epoch (or max iters) is set as 1 to reduce time
    assert mock_train_func.call_args_list[0].kwargs["cfg"].runner[max_eph_name] == 1
    assert mock_train_func.call_args_list[1].kwargs["cfg"].runner[max_eph_name] == 1
    # check eval before run is disabled to reduce time
    assert not mock_train_func.call_args_list[0].kwargs["cfg"].custom_hooks[0]["enable_eval_before_run"]
    assert not mock_train_func.call_args_list[1].kwargs["cfg"].custom_hooks[0]["enable_eval_before_run"]
    # check OTXProgressHook is removed
    assert len(mock_train_func.call_args_list[0].kwargs["cfg"].custom_hooks) == 1


@pytest.mark.parametrize("not_increase", [True, False])
def test_adapt_batch_size_action_task(mocker, mock_adapt_algo_cls, mock_cfg_action, mock_dataset, not_increase):
    """adapt_batch_size on an action task updates videos_per_gpu and lr,
    and keeps eval-before-run enabled because validate is passed as True."""
    # prepare
    mock_train_func = mocker.MagicMock()
    new_bs = DEFAULT_BS // 2 if not_increase else DEFAULT_BS + 2

    # execute
    adapt_batch_size(mock_train_func, mock_cfg_action, mock_dataset, True, not_increase)

    # check adapted batch size is applied
    assert mock_cfg_action.data.videos_per_gpu == new_bs
    # check learning rate is updated depending on adapted batch size
    bs_change_ratio = new_bs / DEFAULT_BS
    assert mock_cfg_action.optimizer.lr == pytest.approx(DEFAULT_LR * sqrt(bs_change_ratio))
    # check adapt function gets proper arguments
    assert mock_adapt_algo_cls.call_args.kwargs["default_bs"] == DEFAULT_BS
    assert mock_adapt_algo_cls.call_args.kwargs["max_bs"] == TRAINSET_SIZE
    # check length of dataset is decreased to reduce time
    assert len(mock_train_func.call_args_list[0].kwargs["dataset"][0]) == DEFAULT_BS
    assert len(mock_train_func.call_args_list[1].kwargs["dataset"][0]) == new_bs
    # check max epoch is set as 1 to reduce time
    assert mock_train_func.call_args_list[0].kwargs["cfg"].runner["max_epochs"] == 1
    assert mock_train_func.call_args_list[1].kwargs["cfg"].runner["max_epochs"] == 1
    # check eval before run is enabled if validate is set as True
    assert mock_train_func.call_args_list[0].kwargs["cfg"].custom_hooks[0]["enable_eval_before_run"]
    assert mock_train_func.call_args_list[1].kwargs["cfg"].custom_hooks[0]["enable_eval_before_run"]
    # check OTXProgressHook is removed
    assert len(mock_train_func.call_args_list[0].kwargs["cfg"].custom_hooks) == 1


class TestSubDataset:
@pytest.fixture(autouse=True)
def set_up(self, mocker):
Expand Down