Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix: using auto adapt batch size with IterBasedRunner #2182

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ All notable changes to this project will be documented in this file.
### Bug fixes

- Fix backward compatibility with OpenVINO SSD-like detection models from OTE 0.5 (<https://github.com/openvinotoolkit/training_extensions/pull/1970>)
- Fix a bug where automatic batch-size adaptation could not be used with IterBasedRunner (<https://github.com/openvinotoolkit/training_extensions/pull/2182>)
sungmanc marked this conversation as resolved.
Show resolved Hide resolved

### Known issues

Expand Down
26 changes: 15 additions & 11 deletions otx/algorithms/common/adapters/mmcv/utils/automatic_bs.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,9 @@ def adapt_batch_size(train_func: Callable, cfg, datasets: List, validate: bool =
def train_func_single_iter(batch_size):
copied_cfg = deepcopy(cfg)
_set_batch_size(copied_cfg, batch_size)
_set_max_epoch(copied_cfg, 1) # setup for training a single iter to reduce time

# setup for training a single iter to reduce time
if copied_cfg.runner.get("type") == "AccuracyAwareRunner": # nncf case
if "nncf_config" in copied_cfg.runner:
_set_value_at_dict_in_dict(
copied_cfg.runner["nncf_config"], "accuracy_aware_training.params.maximal_total_epochs", 1
)
else:
copied_cfg.runner["max_epochs"] = 1

# Remove some hooks due to reasons below
# Remove hooks due to reasons below
# OTXProgressHook => prevent progress bar from being 0 and 100 repeatably
# earlystoppinghook => if eval hook is excluded, this hook makes an error due to absence of score history
# CustomEvalHook => exclude validation in classification task
Expand Down Expand Up @@ -115,7 +107,7 @@ def train_func_single_iter(batch_size):
logger.info(f"Batch size is adapted : {default_bs} -> {new_batch_size}")
logger.info(f"learning rate is adapted : {origin_lr} -> {cfg.optimizer.lr}")
else:
logger.info("Adapting batch size is done. Current batch size is availble.")
logger.info("Adapting batch size is done. Batch size isn't changed.")


def _get_batch_size(cfg) -> int:
Expand All @@ -131,6 +123,18 @@ def _set_batch_size(cfg, batch_size: int):
cfg.data.train_dataloader["samples_per_gpu"] = batch_size


def _set_max_epoch(cfg, max_epoch: int):
if cfg.runner.get("type") == "AccuracyAwareRunner": # nncf case
if "nncf_config" in cfg.runner:
_set_value_at_dict_in_dict(
cfg.runner["nncf_config"], "accuracy_aware_training.params.maximal_total_epochs", max_epoch
)
elif "iterbased" in cfg.runner["type"].lower():
cfg.runner["max_iters"] = max_epoch
else:
cfg.runner["max_epochs"] = max_epoch


class SubDataset:
"""Wrapper class to make dataset pretend to have specified number of images.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,12 @@ def common_cfg(mocker):
return mock_cfg


def set_mock_cfg_not_action(common_cfg):
    """Shape *common_cfg* as a non-action task config.

    Non-action tasks keep the batch size under
    ``data.train_dataloader["samples_per_gpu"]``.
    """
    common_cfg.data.train_dataloader = {"samples_per_gpu": DEFAULT_BS}
    return common_cfg


def set_mock_cfg_action(common_cfg):
    """Shape *common_cfg* as an action-classification task config.

    Action tasks keep the batch size in ``data.videos_per_gpu`` rather
    than in the train dataloader dict.
    """
    common_cfg.domain = "ACTION_CLASSIFICATION"
    common_cfg.data.videos_per_gpu = DEFAULT_BS
    return common_cfg
Expand All @@ -65,65 +63,52 @@ def mock_dataset(mocker):


@pytest.mark.parametrize("not_increase", [True, False])
@pytest.mark.parametrize("is_action_task", [True, False])
@pytest.mark.parametrize("is_iter_based_runner", [True, False])
def test_adapt_batch_size(
    mocker, mock_adapt_algo_cls, common_cfg, mock_dataset, not_increase, is_action_task, is_iter_based_runner
):
    """adapt_batch_size applies the found batch size and scales lr accordingly,
    while the probe training runs are trimmed to a single epoch/iteration."""
    # prepare
    mock_train_func = mocker.MagicMock()
    new_bs = DEFAULT_BS // 2 if not_increase else DEFAULT_BS + 2

    # iteration-based runners store the schedule under "max_iters"
    max_eph_name = "max_epochs"
    if is_iter_based_runner:
        common_cfg.runner = {"type": "IterBasedRunnerWithCancel", "max_iters": 100}
        max_eph_name = "max_iters"

    mock_config = set_mock_cfg_action(common_cfg) if is_action_task else set_mock_cfg_not_action(common_cfg)

    # execute
    adapt_batch_size(mock_train_func, mock_config, mock_dataset, False, not_increase)

    # check adapted batch size is applied
    if is_action_task:
        assert mock_config.data.videos_per_gpu == new_bs
    else:
        assert mock_config.data.train_dataloader["samples_per_gpu"] == new_bs
    # check learning rate is updated depending on adapted batch size
    bs_change_ratio = new_bs / DEFAULT_BS
    assert mock_config.optimizer.lr == pytest.approx(DEFAULT_LR * sqrt(bs_change_ratio))
    # check adapt function gets proper arguments
    assert mock_adapt_algo_cls.call_args.kwargs["default_bs"] == DEFAULT_BS
    assert mock_adapt_algo_cls.call_args.kwargs["max_bs"] == TRAINSET_SIZE
    # check length of dataset is decreased to reduce time
    assert len(mock_train_func.call_args_list[0].kwargs["dataset"][0]) == DEFAULT_BS
    assert len(mock_train_func.call_args_list[1].kwargs["dataset"][0]) == new_bs
    # check max epoch (or max iters) is set as 1 to reduce time
    assert mock_train_func.call_args_list[0].kwargs["cfg"].runner[max_eph_name] == 1
    assert mock_train_func.call_args_list[1].kwargs["cfg"].runner[max_eph_name] == 1
    # check eval before run is disabled to reduce time
    assert not mock_train_func.call_args_list[0].kwargs["cfg"].custom_hooks[0]["enable_eval_before_run"]
    assert not mock_train_func.call_args_list[1].kwargs["cfg"].custom_hooks[0]["enable_eval_before_run"]
    # check OTXProgressHook is removed
    assert len(mock_train_func.call_args_list[0].kwargs["cfg"].custom_hooks) == 1


@pytest.mark.parametrize("not_increase", [True, False])
def test_adapt_batch_size_action_task(mocker, mock_adapt_algo_cls, mock_cfg_action, mock_dataset, not_increase):
    """adapt_batch_size on an action task updates videos_per_gpu and lr,
    and keeps eval-before-run enabled because validate is passed as True."""
    # prepare
    mock_train_func = mocker.MagicMock()
    new_bs = DEFAULT_BS // 2 if not_increase else DEFAULT_BS + 2

    # execute
    adapt_batch_size(mock_train_func, mock_cfg_action, mock_dataset, True, not_increase)

    # check adapted batch size is applied
    assert mock_cfg_action.data.videos_per_gpu == new_bs
    # check learning rate is updated depending on adapted batch size
    bs_change_ratio = new_bs / DEFAULT_BS
    assert mock_cfg_action.optimizer.lr == pytest.approx(DEFAULT_LR * sqrt(bs_change_ratio))
    # check adapt function gets proper arguments
    assert mock_adapt_algo_cls.call_args.kwargs["default_bs"] == DEFAULT_BS
    assert mock_adapt_algo_cls.call_args.kwargs["max_bs"] == TRAINSET_SIZE
    # check length of dataset is decreased to reduce time
    assert len(mock_train_func.call_args_list[0].kwargs["dataset"][0]) == DEFAULT_BS
    assert len(mock_train_func.call_args_list[1].kwargs["dataset"][0]) == new_bs
    # check max epoch is set as 1 to reduce time
    assert mock_train_func.call_args_list[0].kwargs["cfg"].runner["max_epochs"] == 1
    assert mock_train_func.call_args_list[1].kwargs["cfg"].runner["max_epochs"] == 1
    # check eval before run is enabled if validate is set as True
    assert mock_train_func.call_args_list[0].kwargs["cfg"].custom_hooks[0]["enable_eval_before_run"]
    assert mock_train_func.call_args_list[1].kwargs["cfg"].custom_hooks[0]["enable_eval_before_run"]
    # check OTXProgressHook is removed
    assert len(mock_train_func.call_args_list[0].kwargs["cfg"].custom_hooks) == 1


class TestSubDataset:
@pytest.fixture(autouse=True)
def set_up(self, mocker):
Expand Down