diff --git a/src/common/error_msg.cc b/src/common/error_msg.cc
index 06254979469b..8871c1a1d697 100644
--- a/src/common/error_msg.cc
+++ b/src/common/error_msg.cc
@@ -3,9 +3,11 @@
  */
 #include "error_msg.h"
 
+#include <mutex>    // for call_once, once_flag
 #include <sstream>  // for stringstream
 
 #include "../collective/communicator-inl.h"  // for GetRank
+#include "xgboost/context.h"                 // for Context
 #include "xgboost/logging.h"
 
 namespace xgboost::error {
@@ -26,34 +28,43 @@ void WarnDeprecatedGPUHist() {
 }
 
 void WarnManualUpdater() {
-  bool static thread_local logged{false};
-  if (logged) {
-    return;
-  }
-  LOG(WARNING)
-      << "You have manually specified the `updater` parameter. The `tree_method` parameter "
-         "will be ignored. Incorrect sequence of updaters will produce undefined "
-         "behavior. For common uses, we recommend using `tree_method` parameter instead.";
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag, [] {
+    LOG(WARNING)
+        << "You have manually specified the `updater` parameter. The `tree_method` parameter "
+           "will be ignored. Incorrect sequence of updaters will produce undefined "
+           "behavior. For common uses, we recommend using `tree_method` parameter instead.";
+  });
 }
 
 void WarnDeprecatedGPUId() {
-  static thread_local bool logged{false};
-  if (logged) {
-    return;
-  }
-  auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device");
-  msg += " E.g. device=cpu/cuda/cuda:0";
-  LOG(WARNING) << msg;
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag, [] {
+    auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device");
+    msg += " E.g. device=cpu/cuda/cuda:0";
+    LOG(WARNING) << msg;
+  });
 }
 
 void WarnEmptyDataset() {
-  static thread_local bool logged{false};
-  if (logged) {
-    return;
-  }
-  LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag,
+                 [] { LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); });
+}
+
+void MismatchedDevices(Context const* booster, Context const* data) {
+  static std::once_flag flag;
+  std::call_once(flag, [&] {
+    LOG(WARNING)
+        << "Falling back to prediction using DMatrix due to mismatched devices. This might "
+           "lead to higher memory usage and slower performance. XGBoost is running on: "
+        << booster->DeviceName() << ", while the input data is on: " << data->DeviceName() << ".\n"
+        << R"(Potential solutions:
+- Use a data structure that matches the device ordinal in the booster.
+- Set the device for booster before call to inplace_predict.
+
+This warning will only be shown once.
+)";
+  });
 }
 }  // namespace xgboost::error
diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 1af4b7c88063..94703fd15c83 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -10,7 +10,8 @@
 #include <limits>  // for numeric_limits
 #include <string>  // for string
 
-#include "xgboost/base.h"  // for bst_feature_t
+#include "xgboost/base.h"     // for bst_feature_t
+#include "xgboost/context.h"  // for Context
 #include "xgboost/logging.h"
 #include "xgboost/string_view.h"  // for StringView
 
@@ -94,5 +95,7 @@ constexpr StringView InvalidCUDAOrdinal() {
   return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "
          "available for using GPU.";
 }
+
+void MismatchedDevices(Context const* booster, Context const* data);
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
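
Note on the change above: swapping the `thread_local` flags for `std::call_once` means each warning is emitted once per process rather than once per thread. A minimal sketch of the pattern in plain standard C++ (the `WarnOnce` name is illustrative, not part of the patch):

```cpp
#include <iostream>
#include <mutex>   // for std::call_once, std::once_flag
#include <thread>  // for std::thread

void WarnOnce() {
  static std::once_flag flag;
  // Unlike a thread_local bool, the callable runs at most once per process,
  // no matter how many threads reach this point.
  std::call_once(flag, [] { std::cerr << "warning: shown only once\n"; });
}

int main() {
  std::thread t1{WarnOnce};
  std::thread t2{WarnOnce};
  t1.join();
  t2.join();
  WarnOnce();  // already fired; prints nothing
  return 0;
}
```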
diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc
index cb8e290c8ad3..e920ef50e7a9 100644
--- a/src/data/proxy_dmatrix.cc
+++ b/src/data/proxy_dmatrix.cc
@@ -55,6 +55,7 @@ std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
   }
 
   CHECK(p_fmat) << "Failed to fallback.";
+  p_fmat->Info() = proxy->Info().Copy();
   return p_fmat;
 }
 }  // namespace xgboost::data
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 50dfe926266c..438fd15e62cc 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -85,25 +85,6 @@ bool UpdatersMatched(std::vector<std::string> updater_seq,
                       return name == up->Name();
                     });
 }
-
-void MismatchedDevices(Context const* booster, Context const* data) {
-  bool thread_local static logged{false};
-  if (logged) {
-    return;
-  }
-  LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. This might "
-                  "lead to higher memory usage and slower performance. XGBoost is running on: "
-               << booster->DeviceName() << ", while the input data is on: " << data->DeviceName()
-               << ".\n"
-               << R"(Potential solutions:
-- Use a data structure that matches the device ordinal in the booster.
-- Set the device for booster before call to inplace_predict.
-
-This warning will only be shown once for each thread. Subsequent warnings made by the
-current thread will be suppressed.
-)";
-  logged = true;
-}
 }  // namespace
 
 void GBTree::Configure(Args const& cfg) {
@@ -557,7 +538,7 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
   auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
   CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
   if (p_m->Ctx()->Device() != this->ctx_->Device()) {
-    MismatchedDevices(this->ctx_, p_m->Ctx());
+    error::MismatchedDevices(this->ctx_, p_m->Ctx());
     CHECK_EQ(out_preds->version, 0);
     auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
     CHECK(proxy) << error::InplacePredictProxy();
@@ -810,7 +791,7 @@ class Dart : public GBTree {
     auto n_groups = model_.learner_model_param->num_output_group;
 
     if (ctx_->Device() != p_fmat->Ctx()->Device()) {
-      MismatchedDevices(ctx_, p_fmat->Ctx());
+      error::MismatchedDevices(ctx_, p_fmat->Ctx());
       auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
       CHECK(proxy) << error::InplacePredictProxy();
       auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
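
Why `CreateDMatrixFromProxy` now copies the proxy's `MetaInfo`: without the copy, metadata attached to the proxy (such as `base_margin`) is dropped when prediction falls back to a regular `DMatrix`, so the fallback result can differ from a normal `DMatrix` prediction. A simplified sketch of that idea, using hypothetical stand-in types rather than XGBoost's real classes:

```cpp
#include <cassert>
#include <vector>

// Hypothetical stand-ins; XGBoost's real MetaInfo/DMatrixProxy/DMatrix are richer.
struct MetaInfo {
  std::vector<float> base_margin;
  MetaInfo Copy() const { return *this; }  // the real class performs a deep copy
};
struct Proxy  { MetaInfo info; };
struct Matrix { MetaInfo info; };

Matrix CreateFromProxy(Proxy const& proxy) {
  Matrix fmat;                    // feature data would be materialized here
  fmat.info = proxy.info.Copy();  // mirrors the added line: carry metadata over to the fallback
  return fmat;
}

int main() {
  Proxy proxy;
  proxy.info.base_margin = {0.5f, 1.5f};
  Matrix fmat = CreateFromProxy(proxy);
  // Without the copy, the fallback matrix would predict from an empty base_margin.
  assert(fmat.info.base_margin == proxy.info.base_margin);
  return 0;
}
```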
diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu
index 03f6898222ff..801c935d6788 100644
--- a/tests/cpp/gbm/test_gbtree.cu
+++ b/tests/cpp/gbm/test_gbtree.cu
@@ -58,21 +58,6 @@ void TestInplaceFallback(Context const* ctx) {
   HostDeviceVector<float>* out_predt{nullptr};
   ConsoleLogger::Configure(Args{{"verbosity", "1"}});
   std::string output;
-  // test whether the warning is raised
-#if !defined(_WIN32)
-  // Windows has issue with CUDA and thread local storage. For some reason, on Windows a
-  // cudaInitializationError is raised during destruction of `HostDeviceVector`. This
-  // might be related to https://github.com/dmlc/xgboost/issues/5793
-  ::testing::internal::CaptureStderr();
-  std::thread{[&] {
-    // Launch a new thread to ensure a warning is raised as we prevent over-verbose
-    // warning by using thread-local flags.
-    learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
-                            &out_predt, 0, 0);
-  }}.join();
-  output = testing::internal::GetCapturedStderr();
-  ASSERT_NE(output.find("Falling back"), std::string::npos);
-#endif
 
   learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
                           &out_predt, 0, 0);
diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
index fb5f47c2b282..ec7c45ca248a 100644
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -191,14 +191,32 @@ def test_inplace_predict_device_type(self, device: str) -> None:
         np.testing.assert_allclose(predt_0, predt_3)
         np.testing.assert_allclose(predt_0, predt_4)
 
-    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
+    def run_inplace_base_margin(
+        self, device: int, booster: xgb.Booster, dtrain: xgb.DMatrix, X, base_margin
+    ) -> None:
         import cupy as cp
 
+        booster.set_param({"device": f"cuda:{device}"})
         dtrain.set_info(base_margin=base_margin)
         from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
         from_dmatrix = booster.predict(dtrain)
         cp.testing.assert_allclose(from_inplace, from_dmatrix)
 
+        booster = booster.copy()  # clear prediction cache.
+        booster.set_param({"device": "cpu"})
+        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
+        from_dmatrix = booster.predict(dtrain)
+        cp.testing.assert_allclose(from_inplace, from_dmatrix)
+
+        booster = booster.copy()  # clear prediction cache.
+        base_margin = cp.asnumpy(base_margin)
+        if hasattr(X, "values"):
+            X = cp.asnumpy(X.values)
+        booster.set_param({"device": f"cuda:{device}"})
+        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
+        from_dmatrix = booster.predict(dtrain)
+        cp.testing.assert_allclose(from_inplace, from_dmatrix, rtol=1e-6)
+
     def run_inplace_predict_cupy(self, device: int) -> None:
         import cupy as cp
 
@@ -244,7 +262,7 @@ def predict_dense(x):
         run_threaded_predict(X, rows, predict_dense)
 
         base_margin = cp_rng.randn(rows)
-        self.run_inplace_base_margin(booster, dtrain, X, base_margin)
+        self.run_inplace_base_margin(device, booster, dtrain, X, base_margin)
 
         # Create a wide dataset
         X = cp_rng.randn(100, 10000)
@@ -318,7 +336,7 @@ def predict_df(x):
         run_threaded_predict(X, rows, predict_df)
 
         base_margin = cudf.Series(rng.randn(rows))
-        self.run_inplace_base_margin(booster, dtrain, X, base_margin)
+        self.run_inplace_base_margin(0, booster, dtrain, X, base_margin)
 
     @given(
         strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy
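
The block removed from test_gbtree.cu no longer needs a separate thread, since the warning flag is now process-wide rather than per-thread. If an explicit check were still wanted, it could be asserted directly on the calling thread; a hedged GoogleTest sketch of that idea (the `MaybeWarnOnce` helper is a stand-in, not XGBoost's API):

```cpp
#include <gtest/gtest.h>

#include <iostream>
#include <mutex>
#include <string>

// Stand-in for the fallback warning path; emits the message once per process.
void MaybeWarnOnce() {
  static std::once_flag flag;
  std::call_once(flag, [] { std::cerr << "Falling back to prediction using DMatrix\n"; });
}

TEST(InplaceFallback, WarnsExactlyOnce) {
  ::testing::internal::CaptureStderr();
  MaybeWarnOnce();
  ASSERT_NE(::testing::internal::GetCapturedStderr().find("Falling back"), std::string::npos);

  ::testing::internal::CaptureStderr();
  MaybeWarnOnce();  // second call: the warning was already emitted, nothing is captured
  ASSERT_EQ(::testing::internal::GetCapturedStderr().find("Falling back"), std::string::npos);
}
```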