diff --git a/docs/en/inference.md b/docs/en/inference.md
index 14d772c5ad..cd6eaf0c18 100644
--- a/docs/en/inference.md
+++ b/docs/en/inference.md
@@ -6,6 +6,7 @@ and also some high-level apis for easier integration to other projects.
 ### Test a dataset
 
 - single GPU
+- CPU
 - single node multiple GPU
 - multiple node
 
@@ -15,6 +16,10 @@ You can use the following commands to test a dataset.
 # single-gpu testing
 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show]
 
+# CPU: disable GPUs and run single-gpu testing script
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show]
+
 # multi-gpu testing
 ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}]
 ```
diff --git a/docs/en/train.md b/docs/en/train.md
index 673b996407..2c5dfb2b8d 100644
--- a/docs/en/train.md
+++ b/docs/en/train.md
@@ -33,6 +33,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]
 
 If you want to specify the working directory in the command, you can add an argument `--work-dir ${YOUR_WORK_DIR}`.
 
+### Train with CPU
+
+The process of training on the CPU is consistent with single GPU training. We just need to disable GPUs before the training process.
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+And then run the script [above](#train-with-a-single-gpu).
+
+```{warning}
+We do not recommend users to use CPU for training because it is too slow. We support this feature to allow users to debug on machines without GPU for convenience.
+```
+
 ### Train with multiple GPUs
 
 ```shell
diff --git a/docs/zh_cn/inference.md b/docs/zh_cn/inference.md
index 848fcb8b11..b681dca2cf 100644
--- a/docs/zh_cn/inference.md
+++ b/docs/zh_cn/inference.md
@@ -5,6 +5,7 @@
 ### 测试一个数据集
 
 - 单卡 GPU
+- CPU
 - 单节点多卡 GPU
 - 多节点
 
@@ -14,6 +15,10 @@
 # 单卡 GPU 测试
 python tools/test.py ${配置文件} ${检查点文件} [--out ${结果文件}] [--eval ${评估指标}] [--show]
 
+# CPU: 禁用 GPU 并运行单 GPU 测试脚本
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py ${配置文件} ${检查点文件} [--out ${结果文件}] [--eval ${评估指标}] [--show]
+
 # 多卡 GPU 测试
 ./tools/dist_test.sh ${配置文件} ${检查点文件} ${GPU数目} [--out ${结果文件}] [--eval ${评估指标}]
 ```
diff --git a/docs/zh_cn/train.md b/docs/zh_cn/train.md
index 520739dbe2..24737d84af 100644
--- a/docs/zh_cn/train.md
+++ b/docs/zh_cn/train.md
@@ -23,6 +23,20 @@ python tools/train.py ${配置文件} [可选参数]
 
 如果您想在命令里定义工作文件夹路径，您可以添加一个参数`--work-dir ${YOUR_WORK_DIR}`。
 
+### 使用 CPU 训练
+
+使用 CPU 训练的流程和使用单 GPU 训练的流程一致，我们仅需要在训练流程开始前禁用 GPU。
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+之后运行单 GPU 训练脚本即可。
+
+```{warning}
+我们不推荐用户使用 CPU 进行训练，这太过缓慢。我们支持这个功能是为了方便用户在没有 GPU 的机器上进行调试。
+```
+
 ### 使用多卡 GPU 训练
 
 ```shell
diff --git a/mmseg/apis/train.py b/mmseg/apis/train.py
index 5d5bb9c085..760701be62 100644
--- a/mmseg/apis/train.py
+++ b/mmseg/apis/train.py
@@ -2,6 +2,7 @@
 import random
 import warnings
 
+import mmcv
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -9,6 +10,7 @@
 from mmcv.runner import HOOKS, build_optimizer, build_runner, get_dist_info
 from mmcv.utils import build_from_cfg
 
+from mmseg import digit_version
 from mmseg.core import DistEvalHook, EvalHook
 from mmseg.datasets import build_dataloader, build_dataset
 from mmseg.utils import find_latest_checkpoint, get_root_logger
@@ -99,9 +101,10 @@ def train_segmentor(model,
             broadcast_buffers=False,
             find_unused_parameters=find_unused_parameters)
     else:
-        model = MMDataParallel(
-            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
-
+        if not torch.cuda.is_available():
+            assert digit_version(mmcv.__version__) >= digit_version('1.4.4'), \
+                'Please use MMCV >= 1.4.4 for CPU training!'
+        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
     # build runner
     optimizer = build_optimizer(model, cfg.optimizer)
 
diff --git a/tools/test.py b/tools/test.py
index 172c2a1aa2..d5dc0d5f67 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -8,11 +8,13 @@
 
 import mmcv
 import torch
+from mmcv.cnn.utils import revert_sync_batchnorm
 from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
 from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                          wrap_fp16_model)
 from mmcv.utils import DictAction
 
+from mmseg import digit_version
 from mmseg.apis import multi_gpu_test, single_gpu_test
 from mmseg.datasets import build_dataloader, build_dataset
 from mmseg.models import build_segmentor
@@ -147,11 +149,18 @@ def main():
     cfg.model.pretrained = None
     cfg.data.test.test_mode = True
 
-    cfg.gpu_ids = [args.gpu_id]
+    if args.gpu_id is not None:
+        cfg.gpu_ids = [args.gpu_id]
 
     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
+        cfg.gpu_ids = [args.gpu_id]
         distributed = False
+        if len(cfg.gpu_ids) > 1:
+            warnings.warn(f'The gpu-ids is reset from {cfg.gpu_ids} to '
+                          f'{cfg.gpu_ids[0:1]} to avoid potential errors in '
+                          'non-distributed testing.')
+            cfg.gpu_ids = cfg.gpu_ids[0:1]
     else:
         distributed = True
         init_dist(args.launcher, **cfg.dist_params)
@@ -236,7 +245,15 @@
         tmpdir = None
 
     if not distributed:
-        model = MMDataParallel(model, device_ids=[0])
+        warnings.warn(
+            'SyncBN is only supported with DDP. To be compatible with DP, '
+            'we convert SyncBN to BN. Please use dist_test.sh which can '
+            'avoid this error.')
+        if not torch.cuda.is_available():
+            assert digit_version(mmcv.__version__) >= digit_version('1.4.4'), \
+                'Please use MMCV >= 1.4.4 for CPU inference!'
+        model = revert_sync_batchnorm(model)
+        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
         results = single_gpu_test(
             model,
             data_loader,
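
The non-distributed branch of `tools/test.py` now follows one pattern: revert SyncBN to plain BN, require MMCV >= 1.4.4 when CUDA is unavailable, and wrap the model in `MMDataParallel` without calling `.cuda()`. Below is a minimal sketch of that pattern; the helper name `wrap_model_for_single_device` is hypothetical, not part of the patch, while the individual calls mirror the diff above.

```python
# Sketch of the CPU fallback introduced by this patch; the helper name is
# hypothetical, the calls mirror the non-distributed branch of tools/test.py.
import mmcv
import torch
from mmcv.cnn.utils import revert_sync_batchnorm
from mmcv.parallel import MMDataParallel

from mmseg import digit_version


def wrap_model_for_single_device(model, gpu_ids):
    """Wrap a segmentor for non-distributed testing on GPU or CPU."""
    # SyncBN only works under DDP, so fold it back to plain BN for DP/CPU.
    model = revert_sync_batchnorm(model)
    if not torch.cuda.is_available():
        # MMDataParallel only supports CPU execution from MMCV 1.4.4 onwards.
        assert digit_version(mmcv.__version__) >= digit_version('1.4.4'), \
            'Please use MMCV >= 1.4.4 for CPU inference!'
    return MMDataParallel(model, device_ids=gpu_ids)
```

With `export CUDA_VISIBLE_DEVICES=-1` set beforehand, `torch.cuda.is_available()` returns `False`, so the same training and testing scripts run the model on CPU with no further changes to the commands.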