From e3bb7e4e2fee333b1d1055e9ee6bdd10480cf15c Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 7 Aug 2019 12:51:34 -0700
Subject: [PATCH 1/7] prevent multi-gpu usage

---
 include/xgboost/generic_parameters.h        |   2 +-
 src/learner.cc                              |  10 +-
 tests/cpp/linear/test_linear.cu             |  43 ------
 tests/cpp/metric/test_elementwise_metric.cc |  29 ----
 tests/cpp/predictor/test_gpu_predictor.cu   | 152 +++++++++-----------
 tests/cpp/test_learner.cc                   |   4 +-
 tests/cpp/tree/test_gpu_hist.cu             |   8 --
 tests/python-gpu/test_gpu_linear.py         |  12 --
 tests/python-gpu/test_gpu_updaters.py       |  11 --
 tests/python-gpu/test_large_sizes.py        |  17 +--
 tests/python-gpu/test_pickling.py           |   2 -
 11 files changed, 83 insertions(+), 207 deletions(-)

diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index 83c98bed3f8c..9d28c97b57e4 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -40,7 +40,7 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
       .describe("The primary GPU device ordinal.");
   DMLC_DECLARE_FIELD(n_gpus)
       .set_default(0)
-      .set_lower_bound(-1)
+      .set_lower_bound(0)
       .describe("Deprecated, please use distributed training with one "
                 "process per GPU. "
                 "Number of GPUs to use for multi-gpu algorithms.");

diff --git a/src/learner.cc b/src/learner.cc
index c0fa70384e61..0fdaff7714fb 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -580,8 +580,14 @@ class LearnerImpl : public Learner {
     }
     gbm_->Configure(args);
 
-    if (this->gbm_->UseGPU() && cfg_.find("n_gpus") == cfg_.cend()) {
-      generic_param_.n_gpus = 1;
+    if (this->gbm_->UseGPU()) {
+      if (cfg_.find("n_gpus") == cfg_.cend()) {
+        generic_param_.n_gpus = 1;
+      }
+      if (generic_param_.n_gpus != 1) {
+        LOG(FATAL) << "Multi-GPU training is no longer supported. "
+                      "Please use distributed GPU training with one process per GPU.";
+      }
    }
  }
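Note: with this guard in place, any single-process configuration that requests more than one GPU now fails at configure time; in Python the LOG(FATAL) surfaces as an XGBoostError raised through the C API. A minimal sketch of the user-visible behavior, assuming a GPU-enabled build of the Python package (the data below is made up for illustration):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 10)
    y = np.random.randint(2, size=100)
    dtrain = xgb.DMatrix(X, label=y)

    # Still fine: for GPU algorithms the learner defaults n_gpus to 1.
    xgb.train({'tree_method': 'gpu_hist'}, dtrain, num_boost_round=5)

    # Now fatal: any n_gpus other than 1 with a GPU algorithm hits the
    # LOG(FATAL) added above and raises xgboost.core.XGBoostError.
    xgb.train({'tree_method': 'gpu_hist', 'n_gpus': 2}, dtrain,
              num_boost_round=5)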
diff --git a/tests/cpp/linear/test_linear.cu b/tests/cpp/linear/test_linear.cu
index 127ddc383412..9fba4735e2d3 100644
--- a/tests/cpp/linear/test_linear.cu
+++ b/tests/cpp/linear/test_linear.cu
@@ -24,47 +24,4 @@ TEST(Linear, GPUCoordinate) {
 
   delete mat;
 }
-
-#if defined(XGBOOST_USE_NCCL)
-TEST(Linear, MGPU_GPUCoordinate) {
-  {
-    auto mat = xgboost::CreateDMatrix(10, 10, 0);
-    auto lparam = CreateEmptyGenericParam(0, -1);
-    lparam.n_gpus = -1;
-    auto updater = std::unique_ptr<xgboost::LinearUpdater>(
-        xgboost::LinearUpdater::Create("gpu_coord_descent", &lparam));
-    updater->Configure({{"eta", "1."}});
-    xgboost::HostDeviceVector<xgboost::GradientPair> gpair(
-        (*mat)->Info().num_row_, xgboost::GradientPair(-5, 1.0));
-    xgboost::gbm::GBLinearModel model;
-    model.param.num_feature = (*mat)->Info().num_col_;
-    model.param.num_output_group = 1;
-    model.LazyInitModel();
-    updater->Update(&gpair, (*mat).get(), &model, gpair.Size());
-
-    ASSERT_EQ(model.bias()[0], 5.0f);
-    delete mat;
-  }
-
-  {
-    auto lparam = CreateEmptyGenericParam(1, -1);
-    lparam.n_gpus = -1;
-    auto mat = xgboost::CreateDMatrix(10, 10, 0);
-    auto updater = std::unique_ptr<xgboost::LinearUpdater>(
-        xgboost::LinearUpdater::Create("gpu_coord_descent", &lparam));
-    updater->Configure({{"eta", "1."}});
-    xgboost::HostDeviceVector<xgboost::GradientPair> gpair(
-        (*mat)->Info().num_row_, xgboost::GradientPair(-5, 1.0));
-    xgboost::gbm::GBLinearModel model;
-    model.param.num_feature = (*mat)->Info().num_col_;
-    model.param.num_output_group = 1;
-    model.LazyInitModel();
-    updater->Update(&gpair, (*mat).get(), &model, gpair.Size());
-
-    ASSERT_EQ(model.bias()[0], 5.0f);
-    delete mat;
-  }
-}
-#endif
-
 }  // namespace xgboost
\ No newline at end of file

diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc
index 071ab5c48eb6..c38b81a7e6fa 100644
--- a/tests/cpp/metric/test_elementwise_metric.cc
+++ b/tests/cpp/metric/test_elementwise_metric.cc
@@ -101,32 +101,3 @@ TEST(Metric, DeclareUnifiedTest(PoissionNegLogLik)) {
               1.1280f, 0.001f);
   delete metric;
 }
-
-#if defined(XGBOOST_USE_NCCL) && defined(__CUDACC__)
-TEST(Metric, MGPU_RMSE) {
-  {
-    auto lparam = xgboost::CreateEmptyGenericParam(0, -1);
-    xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam);
-    metric->Configure({});
-    ASSERT_STREQ(metric->Name(), "rmse");
-    EXPECT_NEAR(GetMetricEval(metric, {0}, {0}), 0, 1e-10);
-    EXPECT_NEAR(GetMetricEval(metric,
-                              {0.1f, 0.9f, 0.1f, 0.9f},
-                              {  0,    0,    1,    1}),
-                0.6403f, 0.001f);
-    delete metric;
-  }
-
-  {
-    auto lparam = xgboost::CreateEmptyGenericParam(1, -1);
-    xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam);
-    ASSERT_STREQ(metric->Name(), "rmse");
-    EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-    EXPECT_NEAR(GetMetricEval(metric,
-                              {0.1f, 0.9f, 0.1f, 0.9f},
-                              {  0,    0,    1,    1}),
-                0.6403f, 0.001f);
-    delete metric;
-  }
-}
-#endif
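Note: the 0.6403 constant retained in the surviving single-GPU RMSE assertions is just the root mean squared error of those four predictions; a quick arithmetic check (NumPy, outside the test suite):

    import numpy as np

    preds  = np.array([0.1, 0.9, 0.1, 0.9])
    labels = np.array([0.0, 0.0, 1.0, 1.0])

    # mean squared error = (0.01 + 0.81 + 0.81 + 0.01) / 4 = 0.41
    rmse = np.sqrt(np.mean((preds - labels) ** 2))
    print(rmse)  # 0.6403...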
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 9bca7be930d1..10a6837e67ea 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -68,6 +68,41 @@ TEST(gpu_predictor, Test) {
   delete dmat;
 }
 
+TEST(gpu_predictor, MoreTest) {
+  auto cpu_lparam = CreateEmptyGenericParam(0, 0);
+  auto gpu_lparam = CreateEmptyGenericParam(0, 1);
+
+  std::unique_ptr<Predictor> gpu_predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
+  std::unique_ptr<Predictor> cpu_predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &cpu_lparam));
+
+  cpu_predictor->Configure({}, {});
+
+  for (size_t i = 1; i < 33; i *= 2) {
+    int n_row = i, n_col = i;
+    auto dmat = CreateDMatrix(n_row, n_col, 0);
+
+    gbm::GBTreeModel model = CreateTestModel();
+    model.param.num_feature = n_col;
+
+    // Test predict batch
+    HostDeviceVector<bst_float> gpu_out_predictions;
+    HostDeviceVector<bst_float> cpu_out_predictions;
+
+    gpu_predictor->PredictBatch((*dmat).get(), &gpu_out_predictions, model, 0);
+    cpu_predictor->PredictBatch((*dmat).get(), &cpu_out_predictions, model, 0);
+
+    std::vector<float>& gpu_out_predictions_h = gpu_out_predictions.HostVector();
+    std::vector<float>& cpu_out_predictions_h = cpu_out_predictions.HostVector();
+    float abs_tolerance = 0.001;
+    for (int j = 0; j < gpu_out_predictions.Size(); j++) {
+      ASSERT_NEAR(gpu_out_predictions_h[j], cpu_out_predictions_h[j], abs_tolerance);
+    }
+    delete dmat;
+  }
+}
+
 TEST(gpu_predictor, ExternalMemoryTest) {
   auto lparam = CreateEmptyGenericParam(0, 1);
   std::unique_ptr<Predictor> gpu_predictor =
@@ -89,10 +124,43 @@ TEST(gpu_predictor, ExternalMemoryTest) {
   }
 }
 
-#if defined(XGBOOST_USE_NCCL)
+TEST(gpu_predictor, MoreExternalMemoryTest) {
+  auto gpu_lparam = CreateEmptyGenericParam(0, 1);
+
+  std::unique_ptr<Predictor> gpu_predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
+  gpu_predictor->Configure({}, {});
+
+  gbm::GBTreeModel model = CreateTestModel();
+  model.param.num_feature = 3;
+  const int n_classes = 3;
+  model.param.num_output_group = n_classes;
+  std::vector<std::unique_ptr<DMatrix>> dmats;
+  dmlc::TemporaryDirectory tmpdir;
+  std::string file0 = tmpdir.path + "/big_0.libsvm";
+  std::string file1 = tmpdir.path + "/big_1.libsvm";
+  std::string file2 = tmpdir.path + "/big_2.libsvm";
+  dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
+  dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
+  dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
+
+  for (const auto& dmat: dmats) {
+    // Test predict batch
+    HostDeviceVector<bst_float> out_predictions;
+    gpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
+    EXPECT_EQ(out_predictions.Size(), dmat->Info().num_row_ * n_classes);
+    const std::vector<float> &host_vector = out_predictions.ConstHostVector();
+    for (int i = 0; i < host_vector.size() / n_classes; i++) {
+      ASSERT_EQ(host_vector[i * n_classes], 1.5);
+      ASSERT_EQ(host_vector[i * n_classes + 1], 0.);
+      ASSERT_EQ(host_vector[i * n_classes + 2], 0.);
+    }
+  }
+}
+
 // Test whether pickling preserves predictor parameters
-TEST(gpu_predictor, MGPU_PicklingTest) {
-  int const ngpu = GPUSet::AllVisible().Size();
+TEST(gpu_predictor, PicklingTest) {
+  int const ngpu = 1;
 
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
@@ -153,12 +221,6 @@ TEST(gpu_predictor, PicklingTest) {
     ASSERT_EQ(kwargs.at("n_gpus"), std::to_string(ngpu).c_str());
   }
 
-  {  // Change n_gpus and query again
-    CheckCAPICall(XGBoosterSetParam(bst2, "n_gpus", "1"));
-    const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
-    ASSERT_EQ(kwargs.at("n_gpus"), "1");
-  }
-
   {  // Change predictor and query again
     CheckCAPICall(XGBoosterSetParam(bst2, "predictor", "cpu_predictor"));
     const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
@@ -167,77 +229,5 @@ TEST(gpu_predictor, PicklingTest) {
 
   CheckCAPICall(XGBoosterFree(bst2));
 }
-
-// multi-GPU predictor test
-TEST(gpu_predictor, MGPU_Test) {
-  auto cpu_lparam = CreateEmptyGenericParam(0, 0);
-  auto gpu_lparam = CreateEmptyGenericParam(0, -1);
-
-  std::unique_ptr<Predictor> gpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
-  std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &cpu_lparam));
-
-  cpu_predictor->Configure({}, {});
-
-  for (size_t i = 1; i < 33; i *= 2) {
-    int n_row = i, n_col = i;
-    auto dmat = CreateDMatrix(n_row, n_col, 0);
-
-    gbm::GBTreeModel model = CreateTestModel();
-    model.param.num_feature = n_col;
-
-    // Test predict batch
-    HostDeviceVector<bst_float> gpu_out_predictions;
-    HostDeviceVector<bst_float> cpu_out_predictions;
-
-    gpu_predictor->PredictBatch((*dmat).get(), &gpu_out_predictions, model, 0);
-    cpu_predictor->PredictBatch((*dmat).get(), &cpu_out_predictions, model, 0);
-
-    std::vector<float>& gpu_out_predictions_h = gpu_out_predictions.HostVector();
-    std::vector<float>& cpu_out_predictions_h = cpu_out_predictions.HostVector();
-    float abs_tolerance = 0.001;
-    for (int j = 0; j < gpu_out_predictions.Size(); j++) {
-      ASSERT_NEAR(gpu_out_predictions_h[j], cpu_out_predictions_h[j], abs_tolerance);
-    }
-    delete dmat;
-  }
-}
-
-// multi-GPU predictor external memory test
-TEST(gpu_predictor, MGPU_ExternalMemoryTest) {
-  auto gpu_lparam = CreateEmptyGenericParam(0, -1);
-
-  std::unique_ptr<Predictor> gpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
-  gpu_predictor->Configure({}, {});
-
-  gbm::GBTreeModel model = CreateTestModel();
-  model.param.num_feature = 3;
-  const int n_classes = 3;
-  model.param.num_output_group = n_classes;
-  std::vector<std::unique_ptr<DMatrix>> dmats;
-  dmlc::TemporaryDirectory tmpdir;
-  std::string file0 = tmpdir.path + "/big_0.libsvm";
-  std::string file1 = tmpdir.path + "/big_1.libsvm";
-  std::string file2 = tmpdir.path + "/big_2.libsvm";
-  dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
-  dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
-  dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
-
-  for (const auto& dmat: dmats) {
-    // Test predict batch
-    HostDeviceVector<bst_float> out_predictions;
-    gpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
-    EXPECT_EQ(out_predictions.Size(), dmat->Info().num_row_ * n_classes);
-    const std::vector<float> &host_vector = out_predictions.ConstHostVector();
-    for (int i = 0; i < host_vector.size() / n_classes; i++) {
-      ASSERT_EQ(host_vector[i * n_classes], 1.5);
-      ASSERT_EQ(host_vector[i * n_classes + 1], 0.);
-      ASSERT_EQ(host_vector[i * n_classes + 2], 0.);
-    }
-  }
-}
-#endif  // defined(XGBOOST_USE_NCCL)
 }  // namespace predictor
 }  // namespace xgboost
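Note: the consolidated tests keep the invariant the deleted MGPU variants checked — GPU and CPU predictors agreeing within a small tolerance — but on a single device. The same parity can be spot-checked from Python (a sketch, assuming a GPU build; the parameter names are the 0.90-era ones used throughout this series):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(32, 32)
    y = np.random.rand(32)
    dtrain = xgb.DMatrix(X, label=y)

    bst = xgb.train({'tree_method': 'gpu_hist', 'n_gpus': 1}, dtrain,
                    num_boost_round=10)

    bst.set_param({'predictor': 'gpu_predictor'})
    gpu_preds = bst.predict(dtrain)
    bst.set_param({'predictor': 'cpu_predictor'})
    cpu_preds = bst.predict(dtrain)

    # Mirrors the ASSERT_NEAR(..., 0.001) in the C++ test.
    np.testing.assert_allclose(gpu_preds, cpu_preds, atol=1e-3)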
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index fa2a21d4fa68..9261bb96f89c 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -168,10 +168,10 @@ TEST(Learner, IO) {
   std::unique_ptr<Learner> learner {Learner::Create(mat)};
   learner->SetParams({Arg{"tree_method", "auto"},
                       Arg{"predictor", "gpu_predictor"},
-                      Arg{"n_gpus", "-1"}});
+                      Arg{"n_gpus", "1"}});
   learner->UpdateOneIter(0, p_dmat.get());
   ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-  ASSERT_EQ(learner->GetGenericParameter().n_gpus, -1);
+  ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
 
   dmlc::TemporaryDirectory tempdir;
   const std::string fname = tempdir.path + "/model.bst";

diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 002a04301126..869859fee416 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -415,13 +415,5 @@ TEST(GpuHist, TestHistogramIndex) {
   TestHistogramIndexImpl(1);
 }
 
-#if defined(XGBOOST_USE_NCCL)
-TEST(GpuHist, MGPU_TestHistogramIndex) {
-  auto devices = GPUSet::AllVisible();
-  CHECK_GT(devices.Size(), 1);
-  TestHistogramIndexImpl(-1);
-}
-#endif
-
 }  // namespace tree
 }  // namespace xgboost

diff --git a/tests/python-gpu/test_gpu_linear.py b/tests/python-gpu/test_gpu_linear.py
index 47d6f92cf1a3..79e5919b6fe1 100644
--- a/tests/python-gpu/test_gpu_linear.py
+++ b/tests/python-gpu/test_gpu_linear.py
@@ -29,15 +29,3 @@ def test_gpu_coordinate(self):
                 param, 150, self.datasets, scale_features=True)
             test_linear.assert_regression_result(results, 1e-2)
             test_linear.assert_classification_result(results)
-
-    @pytest.mark.mgpu
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_gpu_coordinate_mgpu(self):
-        parameters = self.common_param.copy()
-        parameters['n_gpus'] = [-1]
-        parameters['gpu_id'] = [1]
-        for param in test_linear.parameter_combinations(parameters):
-            results = test_linear.run_suite(
-                param, 150, self.datasets, scale_features=True)
-            test_linear.assert_regression_result(results, 1e-2)
-            test_linear.assert_classification_result(results)

diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index 5039093b4b85..dbc7b10209fa 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -36,17 +36,6 @@ def test_gpu_hist(self):
             cpu_results = run_suite(param, select_datasets=datasets)
             assert_gpu_results(cpu_results, gpu_results)
 
-    @pytest.mark.mgpu
-    def test_gpu_hist_mgpu(self):
-        variable_param = {'n_gpus': [-1], 'max_depth': [2, 10],
-                          'max_leaves': [255, 4],
-                          'max_bin': [2, 256],
-                          'grow_policy': ['lossguide'], 'debug_synchronize': [True]}
-        for param in parameter_combinations(variable_param):
-            param['tree_method'] = 'gpu_hist'
-            gpu_results = run_suite(param, select_datasets=datasets)
-            assert_results_non_increasing(gpu_results, 1e-2)
-
     @pytest.mark.mgpu
     def test_specified_gpu_id_gpu_update(self):
         variable_param = {'n_gpus': [1],
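Note: the surviving test_specified_gpu_id_gpu_update covers the one configuration that remains legal besides the default — a single GPU that is not device 0. In user code that is simply (a sketch; assumes a machine with at least two visible GPUs):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(1000, 20)
    y = np.random.randint(2, size=1000)
    dtrain = xgb.DMatrix(X, label=y)

    params = {
        'tree_method': 'gpu_hist',
        'n_gpus': 1,   # the only value still allowed for GPU training
        'gpu_id': 1,   # but any visible device ordinal may be chosen
        'objective': 'binary:logistic',
    }
    bst = xgb.train(params, dtrain, num_boost_round=10)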
diff --git a/tests/python-gpu/test_large_sizes.py b/tests/python-gpu/test_large_sizes.py
index 9241587b151f..421964285fd6 100644
--- a/tests/python-gpu/test_large_sizes.py
+++ b/tests/python-gpu/test_large_sizes.py
@@ -25,7 +25,7 @@ def eprint(*args, **kwargs):
 # reduced to fit onto 1 gpu but still be large
 rows3 = 5000  # small
 rows2 = 4360032  # medium
-rows1 = 42360032  # large
+rows1 = 32360032  # large
 # rows1 = 152360032  # can do this for multi-gpu test (very large)
 rowslist = [rows1, rows2, rows3]
 
@@ -67,15 +67,6 @@ def test_large(self):
                      'objective': 'binary:logistic',
                      'max_bin': max_bin,
                      'eval_metric': 'auc'}
-        ag_param3 = {'max_depth': max_depth,
-                     'tree_method': 'gpu_hist',
-                     'nthread': 0,
-                     'eta': 1,
-                     'verbosity': 3,
-                     'n_gpus': -1,
-                     'objective': 'binary:logistic',
-                     'max_bin': max_bin,
-                     'eval_metric': 'auc'}
         ag_res = {}
         ag_resb = {}
         ag_res2 = {}
@@ -93,9 +84,3 @@ def test_large(self):
             xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train')],
                       evals_result=ag_res2)
             print("Time to Train: %s seconds" % (str(time.time() - tmp)))
-
-            tmp = time.time()
-            eprint("gpu_hist updater all gpus")
-            xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train')],
-                      evals_result=ag_res3)
-            print("Time to Train: %s seconds" % (str(time.time() - tmp)))

diff --git a/tests/python-gpu/test_pickling.py b/tests/python-gpu/test_pickling.py
index 691f4a56ba72..9c077e3155a9 100644
--- a/tests/python-gpu/test_pickling.py
+++ b/tests/python-gpu/test_pickling.py
@@ -35,8 +35,6 @@ def test_pickling(self):
         x, y = build_dataset()
         train_x = xgb.DMatrix(x, label=y)
         param = {'tree_method': 'gpu_hist',
-                 'gpu_id': 0,
-                 'n_gpus': -1,
                  'verbosity': 1}
         bst = xgb.train(param, train_x)
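Note: with 'gpu_id' and 'n_gpus' dropped from the parameter dict, test_pickling reduces to a plain round trip, the same pattern the C++ PicklingTest exercises through the C API. A sketch of that flow (the real test builds its data with a build_dataset helper; the inline random data here is an assumption):

    import pickle
    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 10)
    y = np.random.randint(2, size=100)
    dtrain = xgb.DMatrix(X, label=y)

    bst = xgb.train({'tree_method': 'gpu_hist', 'verbosity': 1}, dtrain)

    # Boosters are picklable; configuration such as the predictor choice
    # survives the round trip.
    restored = pickle.loads(pickle.dumps(bst))
    assert np.array_equal(restored.predict(dtrain), bst.predict(dtrain))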
From e1f462eda009008a41730a9900b1b286c556d05c Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 7 Aug 2019 15:59:01 -0700
Subject: [PATCH 2/7] fix build

---
 tests/cpp/predictor/test_gpu_predictor.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 10a6837e67ea..eecd55e75d96 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -12,7 +12,6 @@
 #include "gtest/gtest.h"
 #include "../helpers.h"
 
-#if defined(XGBOOST_USE_NCCL)
 namespace {
 
 inline void CheckCAPICall(int ret) {
@@ -20,7 +19,6 @@ inline void CheckCAPICall(int ret) {
 }
 
 }  // namespace anonymous
-#endif
 
 const std::map<std::string, std::string>&
 QueryBoosterConfigurationArguments(BoosterHandle handle) {

From a2ca8077a5d6105e407fb697db88bcfa40acdba6 Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 7 Aug 2019 16:57:14 -0700
Subject: [PATCH 3/7] fix distributed test

---
 tests/distributed/distributed_gpu.py | 26 --------------------------
 tests/distributed/runtests-gpu.sh    | 20 +-------------------
 2 files changed, 1 insertion(+), 45 deletions(-)

diff --git a/tests/distributed/distributed_gpu.py b/tests/distributed/distributed_gpu.py
index 0099051104b9..6665cd142d14 100644
--- a/tests/distributed/distributed_gpu.py
+++ b/tests/distributed/distributed_gpu.py
@@ -66,27 +66,6 @@ def params_basic_1x4(rank):
     }), 20
 
 
-def params_basic_2x2(rank):
-    return dict(base_params, **{
-        'n_gpus': 2,
-        'gpu_id': 2*rank,
-    }), 20
-
-
-def params_basic_4x1(rank):
-    return dict(base_params, **{
-        'n_gpus': 4,
-        'gpu_id': rank,
-    }), 20
-
-
-def params_basic_asym(rank):
-    return dict(base_params, **{
-        'n_gpus': 1 if rank == 0 else 3,
-        'gpu_id': rank,
-    }), 20
-
-
 rf_update_params = {
     'subsample': 0.5,
     'colsample_bynode': 0.5
@@ -103,11 +82,6 @@ def wrapped_params_fun(rank):
 
 params_rf_1x4 = wrap_rf(params_basic_1x4)
 
-params_rf_2x2 = wrap_rf(params_basic_2x2)
-
-params_rf_4x1 = wrap_rf(params_basic_4x1)
-
-params_rf_asym = wrap_rf(params_basic_asym)
 
 test_name = sys.argv[1]

diff --git a/tests/distributed/runtests-gpu.sh b/tests/distributed/runtests-gpu.sh
index 950704f9850b..e942efd25b80 100755
--- a/tests/distributed/runtests-gpu.sh
+++ b/tests/distributed/runtests-gpu.sh
@@ -8,23 +8,5 @@ submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit"
 echo -e "\n ====== 1. Basic distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
 $submit --num-workers=4 python distributed_gpu.py basic_1x4 || exit 1
 
-echo -e "\n ====== 2. Basic distributed-gpu test with Python: 2 workers; 2 GPUs per worker ====== \n"
-$submit --num-workers=2 python distributed_gpu.py basic_2x2 || exit 1
-
-echo -e "\n ====== 3. Basic distributed-gpu test with Python: 2 workers; Rank 0: 1 GPU, Rank 1: 3 GPUs ====== \n"
-$submit --num-workers=2 python distributed_gpu.py basic_asym || exit 1
-
-echo -e "\n ====== 4. Basic distributed-gpu test with Python: 1 worker; 4 GPUs per worker ====== \n"
-$submit --num-workers=1 python distributed_gpu.py basic_4x1 || exit 1
-
-echo -e "\n ====== 5. RF distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
+echo -e "\n ====== 2. RF distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n"
 $submit --num-workers=4 python distributed_gpu.py rf_1x4 || exit 1
-
-echo -e "\n ====== 6. RF distributed-gpu test with Python: 2 workers; 2 GPUs per worker ====== \n"
-$submit --num-workers=2 python distributed_gpu.py rf_2x2 || exit 1
-
-echo -e "\n ====== 7. RF distributed-gpu test with Python: 2 workers; Rank 0: 1 GPU, Rank 1: 3 GPUs ====== \n"
-$submit --num-workers=2 python distributed_gpu.py rf_asym || exit 1
-
-echo -e "\n ====== 8. RF distributed-gpu test with Python: 1 worker; 4 GPUs per worker ====== \n"
-$submit --num-workers=1 python distributed_gpu.py rf_4x1 || exit 1
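Note: after patch 3 every distributed configuration is strictly one process per GPU, so the surviving basic_1x4/rf_1x4 cases reduce to the worker pattern below. A sketch of what each worker in distributed_gpu.py effectively does (the tracker environment comes from dmlc-submit as in runtests-gpu.sh; the data path here is hypothetical):

    import xgboost as xgb

    xgb.rabit.init()
    rank = xgb.rabit.get_rank()

    dtrain = xgb.DMatrix('train.libsvm')  # hypothetical per-worker shard

    params = {
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic',
        'n_gpus': 1,     # exactly one GPU per process...
        'gpu_id': rank,  # ...bound to this worker's rank
    }
    bst = xgb.train(params, dtrain, num_boost_round=20)

    xgb.rabit.finalize()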
From 42c573663d30c7c6e1ef124055490e8e425e3b77 Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Thu, 8 Aug 2019 09:13:34 -0700
Subject: [PATCH 4/7] combine gpu predictor tests

---
 tests/cpp/predictor/test_gpu_predictor.cu | 56 -----------------------
 1 file changed, 56 deletions(-)

diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index eecd55e75d96..5cf5817de1e5 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -44,39 +44,6 @@ TEST(gpu_predictor, Test) {
   gpu_predictor->Configure({}, {});
   cpu_predictor->Configure({}, {});
 
-  int n_row = 5;
-  int n_col = 5;
-
-  gbm::GBTreeModel model = CreateTestModel();
-  model.param.num_feature = n_col;
-  auto dmat = CreateDMatrix(n_row, n_col, 0);
-
-  // Test predict batch
-  HostDeviceVector<bst_float> gpu_out_predictions;
-  HostDeviceVector<bst_float> cpu_out_predictions;
-  gpu_predictor->PredictBatch((*dmat).get(), &gpu_out_predictions, model, 0);
-  cpu_predictor->PredictBatch((*dmat).get(), &cpu_out_predictions, model, 0);
-  std::vector<float>& gpu_out_predictions_h = gpu_out_predictions.HostVector();
-  std::vector<float>& cpu_out_predictions_h = cpu_out_predictions.HostVector();
-  float abs_tolerance = 0.001;
-  for (int i = 0; i < gpu_out_predictions.Size(); i++) {
-    ASSERT_NEAR(gpu_out_predictions_h[i], cpu_out_predictions_h[i], abs_tolerance);
-  }
-
-  delete dmat;
-}
-
-TEST(gpu_predictor, MoreTest) {
-  auto cpu_lparam = CreateEmptyGenericParam(0, 0);
-  auto gpu_lparam = CreateEmptyGenericParam(0, 1);
-
-  std::unique_ptr<Predictor> gpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
-  std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &cpu_lparam));
-
-  cpu_predictor->Configure({}, {});
-
   for (size_t i = 1; i < 33; i *= 2) {
     int n_row = i, n_col = i;
     auto dmat = CreateDMatrix(n_row, n_col, 0);
@@ -106,29 +73,6 @@ TEST(gpu_predictor, ExternalMemoryTest) {
   std::unique_ptr<Predictor> gpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam));
   gpu_predictor->Configure({}, {});
-  gbm::GBTreeModel model = CreateTestModel();
-  int n_col = 3;
-  model.param.num_feature = n_col;
-  dmlc::TemporaryDirectory tmpdir;
-  std::string filename = tmpdir.path + "/big.libsvm";
-  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(32, 64, filename);
-
-  // Test predict batch
-  HostDeviceVector<bst_float> out_predictions;
-  gpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
-  EXPECT_EQ(out_predictions.Size(), dmat->Info().num_row_);
-  for (const auto& v : out_predictions.HostVector()) {
-    ASSERT_EQ(v, 1.5);
-  }
-}
-
-TEST(gpu_predictor, MoreExternalMemoryTest) {
-  auto gpu_lparam = CreateEmptyGenericParam(0, 1);
-
-  std::unique_ptr<Predictor> gpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
-  gpu_predictor->Configure({}, {});
-
   gbm::GBTreeModel model = CreateTestModel();
   model.param.num_feature = 3;
   const int n_classes = 3;

From 4db36c83a2d59ab8892eaf007071d788f804eb8b Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Thu, 8 Aug 2019 10:47:20 -0700
Subject: [PATCH 5/7] set upper bound on n_gpus

---
 include/xgboost/generic_parameters.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index 9d28c97b57e4..2ee5ed96f5cf 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -40,7 +40,7 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
       .describe("The primary GPU device ordinal.");
   DMLC_DECLARE_FIELD(n_gpus)
       .set_default(0)
-      .set_lower_bound(0)
+      .set_range(0, 1)
       .describe("Deprecated, please use distributed training with one "
                 "process per GPU. "
                 "Number of GPUs to use for multi-gpu algorithms.");
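Note: moving from set_lower_bound(0) to set_range(0, 1) shifts enforcement forward: out-of-range values are now rejected by dmlc-core parameter validation during configuration, before the learner's own n_gpus check even runs. From Python both paths surface as XGBoostError (a sketch; the exact message text comes from dmlc-core and is only paraphrased in the comment):

    import numpy as np
    import xgboost as xgb

    dtrain = xgb.DMatrix(np.random.rand(10, 4), label=np.random.rand(10))

    try:
        xgb.train({'tree_method': 'gpu_hist', 'n_gpus': 4}, dtrain,
                  num_boost_round=1)
    except xgb.core.XGBoostError as err:
        print(err)  # roughly: value 4 for parameter n_gpus is out of range [0, 1]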
From 31b8a004a2f6384d54e4f32e216b0fdd19db988e Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Thu, 8 Aug 2019 11:31:26 -0700
Subject: [PATCH 6/7] remove failed mgpu tests

---
 tests/cpp/common/test_gpu_hist_util.cu | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/tests/cpp/common/test_gpu_hist_util.cu b/tests/cpp/common/test_gpu_hist_util.cu
index 2081bd7617f3..5be7d8dd2202 100644
--- a/tests/cpp/common/test_gpu_hist_util.cu
+++ b/tests/cpp/common/test_gpu_hist_util.cu
@@ -88,19 +88,5 @@ TEST(gpu_hist_util, DeviceSketch_ExternalMemory) {
   TestDeviceSketch(GPUSet::Range(0, 1), true);
 }
 
-#if defined(XGBOOST_USE_NCCL)
-TEST(gpu_hist_util, MGPU_DeviceSketch) {
-  auto devices = GPUSet::AllVisible();
-  CHECK_GT(devices.Size(), 1);
-  TestDeviceSketch(devices, false);
-}
-
-TEST(gpu_hist_util, MGPU_DeviceSketch_ExternalMemory) {
-  auto devices = GPUSet::AllVisible();
-  CHECK_GT(devices.Size(), 1);
-  TestDeviceSketch(devices, true);
-}
-#endif
-
 }  // namespace common
 }  // namespace xgboost

From 4fe16ce0feab98a15008d2b6dbd78d90b0bf90de Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Sun, 11 Aug 2019 23:06:19 -0700
Subject: [PATCH 7/7] clarify error message

---
 include/xgboost/generic_parameters.h | 6 +++---
 src/learner.cc                       | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index 2ee5ed96f5cf..e4cef9cf094c 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -41,9 +41,9 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
   DMLC_DECLARE_FIELD(n_gpus)
       .set_default(0)
       .set_range(0, 1)
-      .describe("Deprecated, please use distributed training with one "
-                "process per GPU. "
-                "Number of GPUs to use for multi-gpu algorithms.");
+      .describe("Deprecated. Single process multi-GPU training is no longer supported. "
+                "Please switch to distributed training with one process per GPU. "
+                "This can be done using Dask or Spark.");
   }
 };
 }  // namespace xgboost

diff --git a/src/learner.cc b/src/learner.cc
index 0fdaff7714fb..8e694882db74 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -585,8 +585,9 @@ class LearnerImpl : public Learner {
         generic_param_.n_gpus = 1;
       }
       if (generic_param_.n_gpus != 1) {
-        LOG(FATAL) << "Multi-GPU training is no longer supported. "
-                      "Please use distributed GPU training with one process per GPU.";
+        LOG(FATAL) << "Single process multi-GPU training is no longer supported. "
+                      "Please switch to distributed GPU training with one process per GPU. "
+                      "This can be done using Dask or Spark.";
       }
     }
   }
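Note: the final wording points users at Dask or Spark for multi-GPU work. For reference, the one-process-per-GPU workflow it refers to looks roughly like this with the Dask integration that ships in later xgboost releases (a sketch; xgboost.dask, dask_cuda.LocalCUDACluster, and the surrounding names are assumptions relative to this 0.90-era series):

    import dask.array as da
    import xgboost as xgb
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    if __name__ == '__main__':
        # dask_cuda starts one worker process per visible GPU.
        with LocalCUDACluster() as cluster, Client(cluster) as client:
            X = da.random.random((100000, 20), chunks=(10000, 20))
            y = da.random.random(100000, chunks=10000)

            dtrain = xgb.dask.DaskDMatrix(client, X, y)
            output = xgb.dask.train(client, {'tree_method': 'gpu_hist'},
                                    dtrain, num_boost_round=20)
            booster = output['booster']  # trained model gathered on the client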