diff --git a/src/common/error_msg.cc b/src/common/error_msg.cc
index 06254979469b..8871c1a1d697 100644
--- a/src/common/error_msg.cc
+++ b/src/common/error_msg.cc
@@ -3,9 +3,11 @@
  */
 #include "error_msg.h"
 
+#include <mutex>    // for call_once, once_flag
 #include <sstream>  // for stringstream
 
 #include "../collective/communicator-inl.h"  // for GetRank
+#include "xgboost/context.h"                 // for Context
 #include "xgboost/logging.h"
 
 namespace xgboost::error {
@@ -26,34 +28,43 @@ void WarnDeprecatedGPUHist() {
 }
 
 void WarnManualUpdater() {
-  bool static thread_local logged{false};
-  if (logged) {
-    return;
-  }
-  LOG(WARNING)
-      << "You have manually specified the `updater` parameter. The `tree_method` parameter "
-         "will be ignored. Incorrect sequence of updaters will produce undefined "
-         "behavior. For common uses, we recommend using `tree_method` parameter instead.";
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag, [] {
+    LOG(WARNING)
+        << "You have manually specified the `updater` parameter. The `tree_method` parameter "
+           "will be ignored. Incorrect sequence of updaters will produce undefined "
+           "behavior. For common uses, we recommend using `tree_method` parameter instead.";
+  });
 }
 
 void WarnDeprecatedGPUId() {
-  static thread_local bool logged{false};
-  if (logged) {
-    return;
-  }
-  auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device");
-  msg += " E.g. device=cpu/cuda/cuda:0";
-  LOG(WARNING) << msg;
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag, [] {
+    auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device");
+    msg += " E.g. device=cpu/cuda/cuda:0";
+    LOG(WARNING) << msg;
+  });
 }
 
 void WarnEmptyDataset() {
-  static thread_local bool logged{false};
-  if (logged) {
-    return;
-  }
-  LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag,
+                 [] { LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); });
+}
+
+void MismatchedDevices(Context const* booster, Context const* data) {
+  static std::once_flag flag;
+  std::call_once(flag, [&] {
+    LOG(WARNING)
+        << "Falling back to prediction using DMatrix due to mismatched devices. This might "
+           "lead to higher memory usage and slower performance. XGBoost is running on: "
+        << booster->DeviceName() << ", while the input data is on: " << data->DeviceName() << ".\n"
+        << R"(Potential solutions:
+- Use a data structure that matches the device ordinal in the booster.
+- Set the device for booster before call to inplace_predict.
+
+This warning will only be shown once.
+)";
+  });
 }
 }  // namespace xgboost::error
diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 1af4b7c88063..94703fd15c83 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -10,7 +10,8 @@
 #include <limits>  // for numeric_limits
 #include <string>  // for string
 
-#include "xgboost/base.h"  // for bst_feature_t
+#include "xgboost/base.h"     // for bst_feature_t
+#include "xgboost/context.h"  // for Context
 #include "xgboost/logging.h"
 #include "xgboost/string_view.h"  // for StringView
 
@@ -94,5 +95,7 @@ constexpr StringView InvalidCUDAOrdinal() {
   return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "
          "available for using GPU.";
 }
+
+void MismatchedDevices(Context const* booster, Context const* data);
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
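
Note on the change above: swapping the `thread_local` flags for `std::call_once` means each warning is emitted once per process rather than once per thread. A minimal sketch of the pattern in plain standard C++ (the `WarnOnce` name is illustrative, not part of the patch):

```cpp
#include <iostream>
#include <mutex>   // for std::call_once, std::once_flag
#include <thread>  // for std::thread

void WarnOnce() {
  static std::once_flag flag;
  // Unlike a thread_local bool, the callable runs at most once per process,
  // no matter how many threads reach this point.
  std::call_once(flag, [] { std::cerr << "warning: shown only once\n"; });
}

int main() {
  std::thread t1{WarnOnce};
  std::thread t2{WarnOnce};
  t1.join();
  t2.join();
  WarnOnce();  // already fired; prints nothing
  return 0;
}
```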
diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc
index cb8e290c8ad3..e920ef50e7a9 100644
--- a/src/data/proxy_dmatrix.cc
+++ b/src/data/proxy_dmatrix.cc
@@ -55,6 +55,7 @@ std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
   }
 
   CHECK(p_fmat) << "Failed to fallback.";
+  p_fmat->Info() = proxy->Info().Copy();
   return p_fmat;
 }
 }  // namespace xgboost::data
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 50dfe926266c..438fd15e62cc 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -85,25 +85,6 @@ bool UpdatersMatched(std::vector<std::string> updater_seq,
                       return name == up->Name();
                     });
 }
-
-void MismatchedDevices(Context const* booster, Context const* data) {
-  bool thread_local static logged{false};
-  if (logged) {
-    return;
-  }
-  LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. This might "
-                  "lead to higher memory usage and slower performance. XGBoost is running on: "
-               << booster->DeviceName() << ", while the input data is on: " << data->DeviceName()
-               << ".\n"
-               << R"(Potential solutions:
-- Use a data structure that matches the device ordinal in the booster.
-- Set the device for booster before call to inplace_predict.
-
-This warning will only be shown once for each thread. Subsequent warnings made by the
-current thread will be suppressed.
-)";
-  logged = true;
-}
 }  // namespace
 
 void GBTree::Configure(Args const& cfg) {
@@ -557,7 +538,7 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
   auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
   CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
   if (p_m->Ctx()->Device() != this->ctx_->Device()) {
-    MismatchedDevices(this->ctx_, p_m->Ctx());
+    error::MismatchedDevices(this->ctx_, p_m->Ctx());
     CHECK_EQ(out_preds->version, 0);
     auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
     CHECK(proxy) << error::InplacePredictProxy();
@@ -810,7 +791,7 @@ class Dart : public GBTree {
     auto n_groups = model_.learner_model_param->num_output_group;
 
     if (ctx_->Device() != p_fmat->Ctx()->Device()) {
-      MismatchedDevices(ctx_, p_fmat->Ctx());
+      error::MismatchedDevices(ctx_, p_fmat->Ctx());
       auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
       CHECK(proxy) << error::InplacePredictProxy();
       auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
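
Why `CreateDMatrixFromProxy` now copies the proxy's `MetaInfo`: without the copy, metadata attached to the proxy (such as `base_margin`) is dropped when prediction falls back to a regular `DMatrix`, so the fallback result can differ from a normal `DMatrix` prediction. A simplified sketch of that idea, using hypothetical stand-in types rather than XGBoost's real classes:

```cpp
#include <cassert>
#include <vector>

// Hypothetical stand-ins; XGBoost's real MetaInfo/DMatrixProxy/DMatrix are richer.
struct MetaInfo {
  std::vector<float> base_margin;
  MetaInfo Copy() const { return *this; }  // the real class performs a deep copy
};
struct Proxy  { MetaInfo info; };
struct Matrix { MetaInfo info; };

Matrix CreateFromProxy(Proxy const& proxy) {
  Matrix fmat;                    // feature data would be materialized here
  fmat.info = proxy.info.Copy();  // mirrors the added line: carry metadata over to the fallback
  return fmat;
}

int main() {
  Proxy proxy;
  proxy.info.base_margin = {0.5f, 1.5f};
  Matrix fmat = CreateFromProxy(proxy);
  // Without the copy, the fallback matrix would predict from an empty base_margin.
  assert(fmat.info.base_margin == proxy.info.base_margin);
  return 0;
}
```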
diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu
index 03f6898222ff..801c935d6788 100644
--- a/tests/cpp/gbm/test_gbtree.cu
+++ b/tests/cpp/gbm/test_gbtree.cu
@@ -58,21 +58,6 @@ void TestInplaceFallback(Context const* ctx) {
   HostDeviceVector<float>* out_predt{nullptr};
   ConsoleLogger::Configure(Args{{"verbosity", "1"}});
   std::string output;
-  // test whether the warning is raised
-#if !defined(_WIN32)
-  // Windows has issue with CUDA and thread local storage. For some reason, on Windows a
-  // cudaInitializationError is raised during destruction of `HostDeviceVector`. This
-  // might be related to https://github.com/dmlc/xgboost/issues/5793
-  ::testing::internal::CaptureStderr();
-  std::thread{[&] {
-    // Launch a new thread to ensure a warning is raised as we prevent over-verbose
-    // warning by using thread-local flags.
-    learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
-                            &out_predt, 0, 0);
-  }}.join();
-  output = testing::internal::GetCapturedStderr();
-  ASSERT_NE(output.find("Falling back"), std::string::npos);
-#endif
 
   learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
                           &out_predt, 0, 0);
diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
index fb5f47c2b282..ec7c45ca248a 100644
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -191,14 +191,32 @@ def test_inplace_predict_device_type(self, device: str) -> None:
         np.testing.assert_allclose(predt_0, predt_3)
         np.testing.assert_allclose(predt_0, predt_4)
 
-    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
+    def run_inplace_base_margin(
+        self, device: int, booster: xgb.Booster, dtrain: xgb.DMatrix, X, base_margin
+    ) -> None:
         import cupy as cp
 
+        booster.set_param({"device": f"cuda:{device}"})
         dtrain.set_info(base_margin=base_margin)
         from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
         from_dmatrix = booster.predict(dtrain)
         cp.testing.assert_allclose(from_inplace, from_dmatrix)
 
+        booster = booster.copy()  # clear prediction cache.
+        booster.set_param({"device": "cpu"})
+        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
+        from_dmatrix = booster.predict(dtrain)
+        cp.testing.assert_allclose(from_inplace, from_dmatrix)
+
+        booster = booster.copy()  # clear prediction cache.
+        base_margin = cp.asnumpy(base_margin)
+        if hasattr(X, "values"):
+            X = cp.asnumpy(X.values)
+        booster.set_param({"device": f"cuda:{device}"})
+        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
+        from_dmatrix = booster.predict(dtrain)
+        cp.testing.assert_allclose(from_inplace, from_dmatrix, rtol=1e-6)
+
     def run_inplace_predict_cupy(self, device: int) -> None:
         import cupy as cp
 
@@ -244,7 +262,7 @@ def predict_dense(x):
         run_threaded_predict(X, rows, predict_dense)
 
         base_margin = cp_rng.randn(rows)
-        self.run_inplace_base_margin(booster, dtrain, X, base_margin)
+        self.run_inplace_base_margin(device, booster, dtrain, X, base_margin)
 
         # Create a wide dataset
         X = cp_rng.randn(100, 10000)
@@ -318,7 +336,7 @@ def predict_df(x):
         run_threaded_predict(X, rows, predict_df)
 
         base_margin = cudf.Series(rng.randn(rows))
-        self.run_inplace_base_margin(booster, dtrain, X, base_margin)
+        self.run_inplace_base_margin(0, booster, dtrain, X, base_margin)
 
     @given(
         strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy
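
The block removed from test_gbtree.cu no longer needs a separate thread, since the warning flag is now process-wide rather than per-thread. If an explicit check were still wanted, it could be asserted directly on the calling thread; a hedged GoogleTest sketch of that idea (the `MaybeWarnOnce` helper is a stand-in, not XGBoost's API):

```cpp
#include <gtest/gtest.h>

#include <iostream>
#include <mutex>
#include <string>

// Stand-in for the fallback warning path; emits the message once per process.
void MaybeWarnOnce() {
  static std::once_flag flag;
  std::call_once(flag, [] { std::cerr << "Falling back to prediction using DMatrix\n"; });
}

TEST(InplaceFallback, WarnsExactlyOnce) {
  ::testing::internal::CaptureStderr();
  MaybeWarnOnce();
  ASSERT_NE(::testing::internal::GetCapturedStderr().find("Falling back"), std::string::npos);

  ::testing::internal::CaptureStderr();
  MaybeWarnOnce();  // second call: the warning was already emitted, nothing is captured
  ASSERT_EQ(::testing::internal::GetCapturedStderr().find("Falling back"), std::string::npos);
}
```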