Greetings.
I'm getting the error below right at the start of training a U-Net on a custom dataset.
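For context, training is launched from a Jupyter notebook roughly like this (a minimal sketch reconstructed from the cell shown in the traceback below; `model`, `datasets`, and `cfg` are built earlier in the notebook with the usual mmseg APIs):

```python
import os.path as osp

import mmcv
from mmseg.apis import train_segmentor

# model, datasets and cfg are created earlier in the notebook
# (build_segmentor / build_dataset on my custom dataset config).
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
train_segmentor(model, datasets, cfg,
                distributed=False, validate=True,
                meta=dict())
```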
2021-08-23 00:02:42,352 - mmseg - INFO - Loaded 1408 images
2021-08-23 00:02:45,215 - mmseg - INFO - Loaded 245 images
2021-08-23 00:02:45,216 - mmseg - INFO - load checkpoint from ./mmsegmentation/deeplabv3_unet_s5-d16_256x256_40k_hrf_20201226_094047-3a1fdf85.pth
2021-08-23 00:02:45,217 - mmseg - INFO - Use load_from_local loader
2021-08-23 00:02:45,305 - mmseg - INFO - Start running, host: amourato@cslave, work_dir: /home/amourato/VM/UNET
2021-08-23 00:02:45,306 - mmseg - INFO - Hooks will be executed in the following order:
before_run:
(VERY_HIGH ) PolyLrUpdaterHook
(NORMAL    ) CheckpointHook
(LOW       ) EvalHook
(VERY_LOW  ) TextLoggerHook
--------------------
before_train_epoch:
(VERY_HIGH ) PolyLrUpdaterHook
(LOW       ) IterTimerHook
(LOW       ) EvalHook
(VERY_LOW  ) TextLoggerHook
--------------------
before_train_iter:
(VERY_HIGH ) PolyLrUpdaterHook
(LOW       ) IterTimerHook
(LOW       ) EvalHook
--------------------
after_train_iter:
(ABOVE_NORMAL) OptimizerHook
(NORMAL    ) CheckpointHook
(LOW       ) IterTimerHook
(LOW       ) EvalHook
(VERY_LOW  ) TextLoggerHook
--------------------
after_train_epoch:
(NORMAL    ) CheckpointHook
(LOW       ) EvalHook
(VERY_LOW  ) TextLoggerHook
--------------------
before_val_epoch:
(LOW       ) IterTimerHook
(VERY_LOW  ) TextLoggerHook
--------------------
before_val_iter:
(LOW       ) IterTimerHook
--------------------
after_val_iter:
(LOW       ) IterTimerHook
--------------------
after_val_epoch:
(VERY_LOW  ) TextLoggerHook
--------------------
2021-08-23 00:02:45,307 - mmseg - INFO - workflow: [('train', 1)], max: 500 iters

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-8-b622679dccf6> in <module>
     16 mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
     17 train_segmentor(model, datasets, cfg, distributed=False, validate=True,
---> 18     meta=dict())

~/VM/mmsegmentation/mmseg/apis/train.py in train_segmentor(model, dataset, cfg, distributed, validate, timestamp, meta)
    118     elif cfg.load_from:
    119         runner.load_checkpoint(cfg.load_from)
--> 120     runner.run(data_loaders, cfg.workflow)

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py in run(self, data_loaders, workflow, max_iters, **kwargs)
    131                 if mode == 'train' and self.iter >= self._max_iters:
    132                     break
--> 133                 iter_runner(iter_loaders[i], **kwargs)
    134
    135         time.sleep(1)  # wait for some hooks like loggers to finish

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py in train(self, data_loader, **kwargs)
     58         data_batch = next(data_loader)
     59         self.call_hook('before_train_iter')
---> 60         outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
     61         if not isinstance(outputs, dict):
     62             raise TypeError('model.train_step() must return a dict')

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/mmcv/parallel/data_parallel.py in train_step(self, *inputs, **kwargs)
     65
     66         inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
---> 67         return self.module.train_step(*inputs[0], **kwargs[0])
     68
     69     def val_step(self, *inputs, **kwargs):

~/VM/mmsegmentation/mmseg/models/segmentors/base.py in train_step(self, data_batch, optimizer, **kwargs)
    136             averaging the logs.
    137         """
--> 138         losses = self(**data_batch)
    139         loss, log_vars = self._parse_losses(losses)
    140

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py in new_func(*args, **kwargs)
     95                             'method of nn.Module')
     96         if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
---> 97             return old_func(*args, **kwargs)
     98
     99         # get the arg spec of the decorated method

~/VM/mmsegmentation/mmseg/models/segmentors/base.py in forward(self, img, img_metas, return_loss, **kwargs)
    106         """
    107         if return_loss:
--> 108             return self.forward_train(img, img_metas, **kwargs)
    109         else:
    110             return self.forward_test(img, img_metas, **kwargs)

~/VM/mmsegmentation/mmseg/models/segmentors/encoder_decoder.py in forward_train(self, img, img_metas, gt_semantic_seg)
    137         """
    138
--> 139         x = self.extract_feat(img)
    140
    141         losses = dict()

~/VM/mmsegmentation/mmseg/models/segmentors/encoder_decoder.py in extract_feat(self, img)
     63     def extract_feat(self, img):
     64         """Extract features from images."""
---> 65         x = self.backbone(img)
     66         if self.with_neck:
     67             x = self.neck(x)

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~/VM/mmsegmentation/mmseg/models/backbones/unet.py in forward(self, x)
    406         enc_outs = []
    407         for enc in self.encoder:
--> 408             x = enc(x)
    409             enc_outs.append(x)
    410         dec_outs = [x]

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input)
    137     def forward(self, input):
    138         for module in self:
--> 139             input = module(input)
    140         return input
    141

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~/VM/mmsegmentation/mmseg/models/backbones/unet.py in forward(self, x)
     83             out = cp.checkpoint(self.convs, x)
     84         else:
---> 85             out = self.convs(x)
     86         return out
     87

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input)
    137     def forward(self, input):
    138         for module in self:
--> 139             input = module(input)
    140         return input
    141

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/mmcv/cnn/bricks/conv_module.py in forward(self, x, activate, norm)
    198                 x = self.conv(x)
    199             elif layer == 'norm' and norm and self.with_norm:
--> 200                 x = self.norm(x)
    201             elif layer == 'act' and activate and self.with_activation:
    202                 x = self.activate(x)

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py in forward(self, input)
    729             if self.process_group:
    730                 process_group = self.process_group
--> 731             world_size = torch.distributed.get_world_size(process_group)
    732             need_sync = world_size > 1
    733

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py in get_world_size(group)
    746         return -1
    747
--> 748     return _get_group_size(group)
    749
    750

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py in _get_group_size(group)
    272     """
    273     if group is GroupMember.WORLD or group is None:
--> 274         default_pg = _get_default_group()
    275         return default_pg.size()
    276     if group not in _pg_group_ranks:

~/anaconda3/envs/openmmlab/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py in _get_default_group()
    356     """
    357     if not is_initialized():
--> 358         raise RuntimeError("Default process group has not been initialized, "
    359                            "please make sure to call init_process_group.")
    360     return GroupMember.WORLD

RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.
This is strange: somewhere the code is taking the distributed data-parallel path, even though I call train_segmentor with distributed=False and train on a single GPU (gpu_ids = range(1)).
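If I read the traceback correctly, the failing frame is SyncBatchNorm.forward calling torch.distributed.get_world_size, which would only happen if the model was built with SyncBN layers. Below is a rough sketch of what I suspect (the SyncBN norm_cfg inherited from the base UNet configs) and the override I would try in the notebook for single-GPU, non-distributed training. This is a guess on my part, not a verified fix:

```python
# The base deeplabv3_unet_* configs set SyncBN, which requires
# torch.distributed to be initialized:
#   norm_cfg = dict(type='SyncBN', requires_grad=True)

# Hypothetical workaround: before building the model, override the
# inherited SyncBN with plain BN for single-GPU training.
bn = dict(type='BN', requires_grad=True)
cfg.model.backbone.norm_cfg = bn
cfg.model.decode_head.norm_cfg = bn
cfg.model.auxiliary_head.norm_cfg = bn
```

Is this the intended way to run these configs without distributed training, or should train_segmentor handle it?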
settings:
config:
running:
Environment
Any solution?