From e4c99e72c02df0ee6884cff0403be782c869ed69 Mon Sep 17 00:00:00 2001
From: Virginia Adams
Date: Fri, 15 Jul 2022 19:51:57 +0000
Subject: [PATCH 1/5] Removed NLPDDPPlugin Import check

Signed-off-by: Virginia Adams
---
 nemo/core/classes/modelPT.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py
index ec29b954ced4..5711f9254cdc 100644
--- a/nemo/core/classes/modelPT.py
+++ b/nemo/core/classes/modelPT.py
@@ -37,13 +37,6 @@
 from nemo.utils.debug_hook import register_debug_hooks
 from nemo.utils.get_rank import get_rank, is_global_rank_zero
 
-try:
-    from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
-
-    HAVE_NLPPLUGIN = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_NLPPLUGIN = False
-
 __all__ = ['ModelPT']
 
 
@@ -492,13 +485,17 @@ def setup_optimization(self, optim_config: Optional[Union[DictConfig, Dict]] = N
                 raise ValueError("We do not currently support gradient acculumation that is not an integer.")
             if self._trainer.max_steps is None or self.trainer.max_steps < 0:
                 # Store information needed to calculate max_steps
+                app = AppState()
+                if app.data_parallel_size is not None:
+                    optim_config['sched']['t_num_workers'] = app.data_parallel_size
+                elif app.model_parallel_size is None:
+                    optim_config['sched']['t_num_workers'] = self._trainer.num_devices * self._trainer.num_nodes
+                else:
+                    optim_config['sched']['t_num_workers'] = (self._trainer.num_devices * self._trainer.num_nodes) / app.model_parallel_size
+
                 optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs
                 optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches
                 optim_config['sched']['t_limit_train_batches'] = self._trainer.limit_train_batches
-                optim_config['sched']['t_num_workers'] = self._trainer.num_devices * self._trainer.num_nodes
-                if HAVE_NLPPLUGIN and isinstance(self._trainer.accelerator.training_type_plugin, NLPDDPPlugin):
-                    app = AppState()
-                    optim_config['sched']['t_num_workers'] = app.data_parallel_size
             else:
                 optim_config['sched']['max_steps'] = self._trainer.max_steps
 

From 5a0fd5529a6274c0bfc9e9854d470d2e86210e1a Mon Sep 17 00:00:00 2001
From: Virginia Adams
Date: Fri, 15 Jul 2022 19:58:06 +0000
Subject: [PATCH 2/5] Python formatting fix

Signed-off-by: Virginia Adams
---
 nemo/core/classes/modelPT.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py
index 5711f9254cdc..29cdbe66291e 100644
--- a/nemo/core/classes/modelPT.py
+++ b/nemo/core/classes/modelPT.py
@@ -491,8 +491,10 @@ def setup_optimization(self, optim_config: Optional[Union[DictConfig, Dict]] = N
                 elif app.model_parallel_size is None:
                     optim_config['sched']['t_num_workers'] = self._trainer.num_devices * self._trainer.num_nodes
                 else:
-                    optim_config['sched']['t_num_workers'] = (self._trainer.num_devices * self._trainer.num_nodes) / app.model_parallel_size
-
+                    optim_config['sched']['t_num_workers'] = (
+                        self._trainer.num_devices * self._trainer.num_nodes
+                    ) / app.model_parallel_size
+
                 optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs
                 optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches
                 optim_config['sched']['t_limit_train_batches'] = self._trainer.limit_train_batches

From 2472a8ae026c15ad1146bfff444b8ab1e9f4099b Mon Sep 17 00:00:00 2001
From: Virginia Adams
Date: Fri, 15 Jul 2022 20:05:17 +0000
Subject: [PATCH 3/5] changed app to app_state

Signed-off-by: Virginia Adams
---
 nemo/core/classes/modelPT.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py
index 29cdbe66291e..3237982506cd 100644
--- a/nemo/core/classes/modelPT.py
+++ b/nemo/core/classes/modelPT.py
@@ -485,15 +485,15 @@ def setup_optimization(self, optim_config: Optional[Union[DictConfig, Dict]] = N
                 raise ValueError("We do not currently support gradient acculumation that is not an integer.")
             if self._trainer.max_steps is None or self.trainer.max_steps < 0:
                 # Store information needed to calculate max_steps
-                app = AppState()
-                if app.data_parallel_size is not None:
-                    optim_config['sched']['t_num_workers'] = app.data_parallel_size
-                elif app.model_parallel_size is None:
+                app_state = AppState()
+                if app_state.data_parallel_size is not None:
+                    optim_config['sched']['t_num_workers'] = app_state.data_parallel_size
+                elif app_state.model_parallel_size is None:
                     optim_config['sched']['t_num_workers'] = self._trainer.num_devices * self._trainer.num_nodes
                 else:
                     optim_config['sched']['t_num_workers'] = (
                         self._trainer.num_devices * self._trainer.num_nodes
-                    ) / app.model_parallel_size
+                    ) / app_state.model_parallel_size
 
                 optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs
                 optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches

From 6d3d8e24edeee8e93adedd972c800ebf95dceadc Mon Sep 17 00:00:00 2001
From: Virginia Adams
Date: Fri, 15 Jul 2022 20:07:47 +0000
Subject: [PATCH 4/5] moved num workers check back to bottom

Signed-off-by: Virginia Adams
---
 nemo/core/classes/modelPT.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py
index 3237982506cd..d0286e544dd2 100644
--- a/nemo/core/classes/modelPT.py
+++ b/nemo/core/classes/modelPT.py
@@ -485,6 +485,10 @@ def setup_optimization(self, optim_config: Optional[Union[DictConfig, Dict]] = N
                 raise ValueError("We do not currently support gradient acculumation that is not an integer.")
             if self._trainer.max_steps is None or self.trainer.max_steps < 0:
                 # Store information needed to calculate max_steps
+                optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs
+                optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches
+                optim_config['sched']['t_limit_train_batches'] = self._trainer.limit_train_batches
+
                 app_state = AppState()
                 if app_state.data_parallel_size is not None:
                     optim_config['sched']['t_num_workers'] = app_state.data_parallel_size
@@ -494,10 +498,6 @@ def setup_optimization(self, optim_config: Optional[Union[DictConfig, Dict]] = N
                     optim_config['sched']['t_num_workers'] = (
                         self._trainer.num_devices * self._trainer.num_nodes
                     ) / app_state.model_parallel_size
-
-                optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs
-                optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches
-                optim_config['sched']['t_limit_train_batches'] = self._trainer.limit_train_batches
             else:
                 optim_config['sched']['max_steps'] = self._trainer.max_steps

From cca3e01e2b3580b0495b226fff0c156ee93f82cd Mon Sep 17 00:00:00 2001
From: Virginia Adams
Date: Fri, 15 Jul 2022 21:04:27 +0000
Subject: [PATCH 5/5] Python code reformat

Signed-off-by: Virginia Adams
---
 nemo/core/classes/modelPT.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py
index d0286e544dd2..8bab7c573ac1 100644
--- a/nemo/core/classes/modelPT.py
+++ b/nemo/core/classes/modelPT.py
@@ -488,7 +488,7 @@ def setup_optimization(self, optim_config: Optional[Union[DictConfig, Dict]] = N
                 optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs
                 optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches
                 optim_config['sched']['t_limit_train_batches'] = self._trainer.limit_train_batches
-
+
                 app_state = AppState()
                 if app_state.data_parallel_size is not None:
                     optim_config['sched']['t_num_workers'] = app_state.data_parallel_size
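The net effect of patches 1-5 is that t_num_workers is derived from AppState instead of an NLPDDPPlugin check: use AppState's data_parallel_size when it is set, otherwise devices times nodes, divided by the model-parallel size when model parallelism is configured. A minimal standalone sketch of that selection logic follows; the function name compute_t_num_workers and its keyword arguments are illustrative stand-ins for the AppState and Trainer attributes, not part of the NeMo API.

from typing import Optional


def compute_t_num_workers(
    num_devices: int,
    num_nodes: int,
    data_parallel_size: Optional[int] = None,
    model_parallel_size: Optional[int] = None,
) -> float:
    # Mirror of the branch order in the patched setup_optimization():
    # 1) trust the data-parallel size when the application state provides one,
    # 2) otherwise treat every device as a data-parallel worker,
    # 3) otherwise divide the world size by the model-parallel size.
    if data_parallel_size is not None:
        return data_parallel_size
    if model_parallel_size is None:
        return num_devices * num_nodes
    return (num_devices * num_nodes) / model_parallel_size


if __name__ == "__main__":
    # 2 nodes x 8 GPUs with model_parallel_size=4 -> 4.0 data-parallel workers.
    print(compute_t_num_workers(num_devices=8, num_nodes=2, model_parallel_size=4))
    # No parallel state available -> fall back to devices * nodes = 16.
    print(compute_t_num_workers(num_devices=8, num_nodes=2))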