Skip to content

Commit

Permalink
Autodetect number of devices for torchrun executor (#11482)
Browse files — browse the repository at this point in the history
* Autodetect num devices for torchrun executor

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

* Add docstring

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

---------

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>
  • Loading branch information
janekl authored Jan 10, 2025
1 parent 482ac2f commit 8a596bf
Showing 1 changed file with 21 additions and 2 deletions.
23 changes: 21 additions & 2 deletions nemo/collections/llm/recipes/run/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional

import nemo_run as run
import torch


@run.cli.factory
def torchrun(devices: int = 8) -> run.Config[run.LocalExecutor]:
"""Local executor using torchrun."""
def torchrun(devices: Optional[int] = None) -> run.Config[run.LocalExecutor]:
"""
Local executor using torchrun.
Args:
devices (Optional[int]): Number of devices to use. If None, it will use all available CUDA devices.
Returns:
run.Config[run.LocalExecutor]: Configuration for the local executor using torchrun.
"""
env_vars = {
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
}

if devices is None:
if torch.cuda.is_available():
devices = torch.cuda.device_count()
else:
raise RuntimeError(
"Cannot infer the 'ntasks_per_node' parameter as CUDA is not available: please specify explicitely."
)

executor = run.Config(
run.LocalExecutor,
ntasks_per_node=devices,
Expand Down

0 comments on commit 8a596bf

Please sign in to comment.