Skip to content

Commit

Permalink
Increment port when taken
Browse files (browse the repository at this point in the history)
  • Loading branch information
ZhaofengWu authored and Borda committed Jun 14, 2020
1 parent c826a5f commit d945177
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions pytorch_lightning/core/lightning.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,7 +924,8 @@ def init_ddp_connection(
self,
global_rank: int,
world_size: int,
is_slurm_managing_tasks: bool = True
is_slurm_managing_tasks: bool = True,
retries: int = 20
) -> None:
"""
Override to define your custom way of setting up a distributed environment.
Expand Down Expand Up @@ -957,7 +958,16 @@ def init_ddp_connection(

torch_backend = "nccl" if self.trainer.on_gpu else "gloo"
log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}")
torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size)
while True:
try:
torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size)
break
except RuntimeError:
# presumably the port is taken (any RuntimeError lands here); increment MASTER_PORT and try again
if retries <= 0:
raise
retries -= 1
os.environ['MASTER_PORT'] = str(int(os.environ['MASTER_PORT']) + 1)

def configure_apex(
self,
Expand Down

0 comments on commit d945177

Please sign in to comment.