Relax the memory usage criterion on batch size 2 during adaptive_bs (#4009)
* relax memory usage criterion on batch size 2 during adaptive_bs

* update unit test

* update unit test
eunwoosh authored Oct 10, 2024
1 parent 758ea97 commit 7744c89
Showing 2 changed files with 32 additions and 6 deletions.
20 changes: 16 additions & 4 deletions src/otx/engine/adaptive_bs/bs_search_algo.py
@@ -112,8 +112,14 @@ def auto_decrease_batch_size(self) -> int:
                 break
 
         if available_bs == 0:
-            msg = "Current device can't train model even with 2."
-            raise RuntimeError(msg)
+            if oom:
+                msg = "Current device can't train model even with 2."
+                raise RuntimeError(msg)
+            logger.warning(
+                "Even with a batch size of 2, most of the memory is used, "
+                "which could cause the training to fail midway.",
+            )
+            available_bs = 2
 
         return available_bs
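
To make the shape of the change easier to see outside the OTX codebase, here is a minimal, self-contained sketch of the new fallback. It is illustrative only: the halving loop, the `try_batch_size` probe signature, and the 0.85 memory bound are stand-ins, not the actual `BsSearchAlgo` internals.

import logging

logger = logging.getLogger(__name__)


def decrease_batch_size(try_batch_size, default_bs: int, mem_upper_ratio: float = 0.85) -> int:
    """Sketch: halve the batch size until one fits comfortably.

    `try_batch_size(bs)` returns `(oom, mem_ratio)`: whether training at `bs`
    ran out of memory, and the fraction of device memory it used.
    """
    available_bs = 0
    bs = default_bs
    while True:
        oom, mem_ratio = try_batch_size(bs)
        if not oom and mem_ratio <= mem_upper_ratio:
            available_bs = bs
            break
        if bs == 2:  # nothing smaller left to try; `oom` now reflects batch size 2
            break
        bs = max(bs // 2, 2)

    if available_bs == 0:
        if oom:
            # Batch size 2 itself ran out of memory: searching cannot help.
            msg = "Current device can't train model even with 2."
            raise RuntimeError(msg)
        # New behavior from this commit: batch size 2 fits but leaves little
        # headroom, so warn and use it anyway instead of aborting.
        logger.warning(
            "Even with a batch size of 2, most of the memory is used, "
            "which could cause the training to fail midway.",
        )
        available_bs = 2
    return available_bs

Previously both branches of the `available_bs == 0` case raised; the commit narrows the hard failure to the true OOM case.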

@@ -141,8 +147,14 @@ def find_big_enough_batch_size(self, drop_last: bool = False) -> int:
         if oom or bs_mem_usage > self._mem_upper_bound:
             self._default_bs -= 2
             if self._default_bs <= 0:
-                msg = "Current device can't train model even with 2."
-                raise RuntimeError(msg)
+                if oom:
+                    msg = "Current device can't train model even with 2."
+                    raise RuntimeError(msg)
+                logger.warning(
+                    "Even with a batch size of 2, most of the memory is used, "
+                    "which could cause the training to fail midway.",
+                )
+                return 2
 
             return self.auto_decrease_batch_size()
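
The second hunk applies the same relaxation to `find_big_enough_batch_size`: when repeatedly shrinking the default batch size bottoms out, it now returns 2 with a warning unless batch size 2 genuinely ran out of memory. Continuing the sketch above, the three possible outcomes can be exercised with fake probes (simplified stand-ins for the test suite's mocks, not real training code):

def probe_oom_at_two(bs):
    # Even batch size 2 runs out of memory.
    return True, 1.0


def probe_tight_at_two(bs):
    # Batch size 2 fits, but uses ~95% of memory.
    return bs > 2, 0.95


def probe_roomy(bs):
    # Plenty of headroom at any probed size.
    return False, 0.5


assert decrease_batch_size(probe_roomy, 128) == 128       # keeps the default
assert decrease_batch_size(probe_tight_at_two, 128) == 2  # warns, returns 2
try:
    decrease_batch_size(probe_oom_at_two, 128)
except RuntimeError:
    pass  # expected: the device can't train even with batch size 2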

18 changes: 16 additions & 2 deletions tests/unit/engine/adaptive_bs/test_bs_search_algo.py
@@ -99,12 +99,19 @@ def test_auto_decrease_batch_size(self):
         assert adapted_bs == 80
 
     def test_find_max_usable_bs_gpu_memory_too_small(self):
-        mock_train_func = self.get_mock_train_func(cuda_oom_bound=4, max_runnable_bs=1)
+        mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1)
 
         bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
         with pytest.raises(RuntimeError):
             bs_search_algo.auto_decrease_batch_size()
 
+    def test_auto_decrease_batch_size_bs2_not_oom_but_most_mem(self):
+        """Batch size 2 doesn't cause OOM but uses most of the memory."""
+        mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1)
+
+        bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
+        assert bs_search_algo.auto_decrease_batch_size() == 2
+
     @pytest.mark.parametrize(
         ("max_runnable_bs", "max_bs", "expected_bs"),
         [
@@ -126,12 +133,19 @@ def test_find_big_enough_batch_size(self, max_runnable_bs, max_bs, expected_bs):
         assert adapted_bs == expected_bs
 
     def test_find_big_enough_batch_size_gpu_memory_too_small(self):
-        mock_train_func = self.get_mock_train_func(cuda_oom_bound=4, max_runnable_bs=1)
+        mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1)
 
         bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
         with pytest.raises(RuntimeError):
             bs_search_algo.find_big_enough_batch_size()
 
+    def test_find_big_enough_batch_size_bs2_not_oom_but_most_mem(self):
+        """Batch size 2 doesn't cause OOM but uses most of the memory."""
+        mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1)
+
+        bs_search_algo = BsSearchAlgo(mock_train_func, 2, 1000)
+        assert bs_search_algo.find_big_enough_batch_size() == 2
+
     def test_find_big_enough_batch_size_gradient_zero(self):
         def mock_train_func(batch_size) -> int:
             if batch_size > 1000:
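
Both new tests lean on the suite's `get_mock_train_func` helper, which this diff does not touch. For orientation, a hypothetical reconstruction consistent with how the tests call it is shown below; the real helper may differ in its exact numbers and in how it reports memory to `BsSearchAlgo`:

def get_mock_train_func(cuda_oom_bound: int, max_runnable_bs: int):
    """Hypothetical stand-in for the test suite's helper (not part of this diff).

    Builds a fake train function where any batch size above `cuda_oom_bound`
    raises CUDA OOM, and memory usage crosses the comfortable upper bound for
    batch sizes above `max_runnable_bs`.
    """
    def mock_train_func(batch_size: int) -> int:
        if batch_size > cuda_oom_bound:
            raise RuntimeError("CUDA out of memory.")
        if batch_size > max_runnable_bs:
            return 9_000  # fits, but above the assumed upper bound of a 10_000 budget
        return int(8_000 * batch_size / max_runnable_bs)  # comfortable region
    return mock_train_func

With `cuda_oom_bound=1`, batch size 2 raises, so both `*_gpu_memory_too_small` tests still expect `RuntimeError`; with `cuda_oom_bound=2, max_runnable_bs=1`, batch size 2 runs but reports near-full memory, which is exactly the new warn-and-return-2 path.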
