Multi-target support for L1 error. #8652

Merged: 6 commits, Jan 10, 2023
8 changes: 5 additions & 3 deletions include/xgboost/objective.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2014-2022 by Contributors
+/**
+ * Copyright 2014-2023 by XGBoost Contributors
  * \file objective.h
  * \brief interface of objective function used by xgboost.
  * \author Tianqi Chen, Kailong Chen
@@ -14,6 +14,7 @@
 #include <xgboost/model.h>
 #include <xgboost/task.h>
 
+#include <cstdint>  // std::int32_t
 #include <functional>
 #include <string>
 #include <utility>
@@ -111,12 +112,13 @@ class ObjFunction : public Configurable {
    * \param position The leaf index for each rows.
    * \param info MetaInfo providing labels and weights.
    * \param prediction Model prediction after transformation.
+   * \param group_idx The group index for this tree, 0 when it's not multi-target or multi-class.
    * \param p_tree Tree that needs to be updated.
    */
   virtual void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& /*position*/,
                               MetaInfo const& /*info*/,
                               HostDeviceVector<float> const& /*prediction*/,
-                              RegTree* /*p_tree*/) const {}
+                              std::int32_t /*group_idx*/, RegTree* /*p_tree*/) const {}
 
   /*!
    * \brief Create an objective function according to name.
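Note: the extended virtual above only threads a group index through the leaf-update hook. For multi-target or multi-class models each tree belongs to one output group, so the objective needs to know which column of the prediction matrix the tree was built against. The standalone sketch below illustrates that indexing only; the helper name and the row-major [n_samples, n_outputs] layout are assumptions made for the example, not XGBoost internals.

#include <cstddef>
#include <cstdint>
#include <vector>

// Pick out the predictions of a single output group from a flat, row-major
// [n_samples, n_outputs] buffer -- the kind of column selection that the new
// group_idx argument enables inside UpdateTreeLeaf.
std::vector<float> GroupColumn(std::vector<float> const& predt, std::size_t n_samples,
                               std::size_t n_outputs, std::int32_t group_idx) {
  std::vector<float> column(n_samples);
  for (std::size_t i = 0; i < n_samples; ++i) {
    column[i] = predt[i * n_outputs + static_cast<std::size_t>(group_idx)];
  }
  return column;
}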
14 changes: 10 additions & 4 deletions python-package/xgboost/testing/__init__.py
@@ -317,13 +317,13 @@ def get_dmat(self) -> xgb.DMatrix:
             enable_categorical=True,
         )
 
-    def get_device_dmat(self) -> xgb.DeviceQuantileDMatrix:
+    def get_device_dmat(self) -> xgb.QuantileDMatrix:
         import cupy as cp
 
         w = None if self.w is None else cp.array(self.w)
         X = cp.array(self.X, dtype=np.float32)
         y = cp.array(self.y, dtype=np.float32)
-        return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)
+        return xgb.QuantileDMatrix(X, y, weight=w, base_margin=self.margin)
 
     def get_external_dmat(self) -> xgb.DMatrix:
         n_samples = self.X.shape[0]
@@ -726,10 +726,16 @@ def random_csc(t_id: int) -> sparse.csc_matrix:
     TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
     TestDataset(
         "mtreg",
-        lambda: datasets.make_regression(n_samples=128, n_targets=3),
+        lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
         "reg:squarederror",
         "rmse",
     ),
+    TestDataset(
+        "mtreg-l1",
+        lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
+        "reg:absoluteerror",
+        "mae",
+    ),
     TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
     TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
     TestDataset(
@@ -753,7 +759,7 @@ def _dataset_weight_margin(draw: Callable) -> TestDataset:
     num_class = 1
     if data.objective == "multi:softmax":
         num_class = int(np.max(data.y) + 1)
-    elif data.name == "mtreg":
+    elif data.name.startswith("mtreg"):
        num_class = data.y.shape[1]
 
     data.margin = draw(
6 changes: 3 additions & 3 deletions src/collective/rabit_communicator.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2023 by XGBoost contributors
  */
 #pragma once
 #include <rabit/rabit.h>
@@ -119,7 +119,7 @@ class RabitCommunicator : public Communicator {
   }
 
   template <typename DType, std::enable_if_t<std::is_floating_point<DType>::value> * = nullptr>
-  void DoBitwiseAllReduce(void *send_receive_buffer, std::size_t count, Operation op) {
+  void DoBitwiseAllReduce(void *, std::size_t, Operation) {
     LOG(FATAL) << "Floating point types do not support bitwise operations.";
   }
 
40 changes: 21 additions & 19 deletions src/common/stats.cc
@@ -1,11 +1,13 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
  */
 #include "stats.h"
 
+#include <cstddef>  // std::size_t
 #include <numeric>  // std::accumulate
 
 #include "common.h"  // OptionalWeights
+#include "linalg_op.h"
 #include "threading_utils.h"  // ParallelFor, MemStackAllocator
 #include "transform_iterator.h"  // MakeIndexTransformIter
 #include "xgboost/context.h"  // Context
@@ -15,32 +17,32 @@
 
 namespace xgboost {
 namespace common {
-float Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
-             HostDeviceVector<float> const& weights) {
-  CHECK_LE(t.Shape(1), 1) << "Matrix is not yet supported.";
+void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
+            HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
   if (!ctx->IsCPU()) {
     weights.SetDevice(ctx->gpu_id);
     auto opt_weights = OptionalWeights(weights.ConstDeviceSpan());
     auto t_v = t.View(ctx->gpu_id);
-    return cuda_impl::Median(ctx, t_v, opt_weights);
+    cuda_impl::Median(ctx, t_v, opt_weights, out);
   }
 
   auto opt_weights = OptionalWeights(weights.ConstHostSpan());
   auto t_v = t.HostView();
-  auto iter = common::MakeIndexTransformIter(
-      [&](size_t i) { return linalg::detail::Apply(t_v, linalg::UnravelIndex(i, t_v.Shape())); });
-  float q{0};
-  if (opt_weights.Empty()) {
-    q = common::Quantile(0.5, iter, iter + t_v.Size());
-  } else {
-    CHECK_NE(t_v.Shape(1), 0);
-    auto w_it = common::MakeIndexTransformIter([&](size_t i) {
-      auto sample_idx = i / t_v.Shape(1);
-      return opt_weights[sample_idx];
-    });
-    q = common::WeightedQuantile(0.5, iter, iter + t_v.Size(), w_it);
+  out->Reshape(t.Shape(1));
+  auto h_out = out->HostView();
+  for (std::size_t i{0}; i < t.Shape(1); ++i) {
+    auto ti_v = t_v.Slice(linalg::All(), i);
+    auto iter = linalg::cbegin(ti_v);
+    float q{0};
+    if (opt_weights.Empty()) {
+      q = common::Quantile(0.5, iter, iter + ti_v.Size());
+    } else {
+      CHECK_NE(t_v.Shape(1), 0);
+      auto w_it = common::MakeIndexTransformIter([&](std::size_t i) { return opt_weights[i]; });
+      q = common::WeightedQuantile(0.5, iter, iter + ti_v.Size(), w_it);
+    }
+    h_out(i) = q;
   }
-  return q;
 }
 
 void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
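Note: the host path above now computes one median per target column by slicing the matrix and running Quantile / WeightedQuantile on each slice. As a standalone reference for what a 0.5 weighted quantile does, here is a minimal sketch; it is independent of the library's Quantile implementation, which handles interpolation and ties more carefully.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Weighted median of one column: sort by value, then walk the cumulative weight
// until half of the total weight is covered.
float WeightedMedian(std::vector<float> const& values, std::vector<float> const& weights) {
  if (values.empty()) {
    return 0.0f;
  }
  std::vector<std::size_t> idx(values.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::sort(idx.begin(), idx.end(),
            [&](std::size_t l, std::size_t r) { return values[l] < values[r]; });
  float total = std::accumulate(weights.cbegin(), weights.cend(), 0.0f);
  float cum = 0.0f;
  for (auto k : idx) {
    cum += weights[k];
    if (cum >= 0.5f * total) {
      return values[k];
    }
  }
  return values[idx.back()];  // defensive fallback; not reached for non-empty input
}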
45 changes: 26 additions & 19 deletions src/common/stats.cu
@@ -1,46 +1,52 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
  */
 
 #include <thrust/iterator/counting_iterator.h>  // thrust::make_counting_iterator
 
-#include "common.h"            // common::OptionalWeights
-#include "device_helpers.cuh"  // dh::MakeTransformIterator, tcbegin, tcend
-#include "stats.cuh"           // common::SegmentedQuantile, common::SegmentedWeightedQuantile
-#include "xgboost/context.h"   // Context
+#include <cstddef>  // size_t
+
+#include "common.h"          // common::OptionalWeights
+#include "cuda_context.cuh"  // CUDAContext
+#include "device_helpers.cuh"  // dh::MakeTransformIterator, tcbegin, tcend
+#include "stats.cuh"  // common::SegmentedQuantile, common::SegmentedWeightedQuantile
+#include "xgboost/base.h"  // XGBOOST_DEVICE
+#include "xgboost/context.h"  // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 #include "xgboost/linalg.h"  // linalg::TensorView, UnravelIndex, Apply
 
 namespace xgboost {
 namespace common {
 namespace cuda_impl {
-float Median(Context const* ctx, linalg::TensorView<float const, 2> t,
-             common::OptionalWeights weights) {
-  HostDeviceVector<size_t> segments{0, t.Size()};
+void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
+            common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
+  CHECK_GE(t.Shape(1), 1);
+  HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
   segments.SetDevice(ctx->gpu_id);
-  auto d_segments = segments.ConstDeviceSpan();
+  auto d_segments = segments.DeviceSpan();
+  dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
+              [=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
   auto val_it = dh::MakeTransformIterator<float>(
       thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) {
        return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
      });
 
-  HostDeviceVector<float> quantile{0};
-  quantile.SetDevice(ctx->gpu_id);
+  out->SetDevice(ctx->gpu_id);
+  out->Reshape(t.Shape(1));
   if (weights.Empty()) {
     common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
-                              val_it + t.Size(), &quantile);
+                              val_it + t.Size(), out->Data());
   } else {
     CHECK_NE(t.Shape(1), 0);
     auto w_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
-                                                 [=] XGBOOST_DEVICE(size_t i) {
+                                                 [=] XGBOOST_DEVICE(std::size_t i) {
                                                    auto sample_idx = i / t.Shape(1);
                                                    return weights[sample_idx];
                                                  });
     common::SegmentedWeightedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments),
-                                      val_it, val_it + t.Size(), w_it, w_it + t.Size(), &quantile);
+                                      val_it, val_it + t.Size(), w_it, w_it + t.Size(),
+                                      out->Data());
   }
-  CHECK_EQ(quantile.Size(), 1);
-  return quantile.HostVector().front();
 }
 
 void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out) {
@@ -49,9 +55,10 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out) {
       thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return v(i) / n; });
   std::size_t bytes;
   CHECK_EQ(out.Size(), 1);
-  cub::DeviceReduce::Sum(nullptr, bytes, it, out.Values().data(), v.Size());
+  auto s = ctx->CUDACtx()->Stream();
+  cub::DeviceReduce::Sum(nullptr, bytes, it, out.Values().data(), v.Size(), s);
   dh::TemporaryArray<char> temp{bytes};
-  cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size());
+  cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
 }
 }  // namespace cuda_impl
 }  // namespace common
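Note: on the GPU path the per-target medians come from a single SegmentedQuantile call; the dh::LaunchN lambda above writes segment boundaries at multiples of the number of rows, i.e. one segment of Shape(0) consecutive values per target in the flattened traversal. A tiny host-side illustration of that arithmetic (standalone, not CUDA):

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::size_t n_rows = 4;     // t.Shape(0)
  std::size_t n_targets = 3;  // t.Shape(1)
  std::vector<std::size_t> segments(n_targets + 1, 0);
  for (std::size_t i = 0; i < segments.size(); ++i) {
    segments[i] = n_rows * i;  // same arithmetic as the device lambda in the diff
  }
  for (auto s : segments) {
    std::cout << s << ' ';  // prints: 0 4 8 12
  }
  std::cout << '\n';
  return 0;
}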
19 changes: 12 additions & 7 deletions src/common/stats.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
  */
 #ifndef XGBOOST_COMMON_STATS_H_
 #define XGBOOST_COMMON_STATS_H_
@@ -95,22 +95,27 @@ float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
 }
 
 namespace cuda_impl {
-float Median(Context const* ctx, linalg::TensorView<float const, 2> t, OptionalWeights weights);
+void Median(Context const* ctx, linalg::TensorView<float const, 2> t, OptionalWeights weights,
+            linalg::Tensor<float, 1>* out);
+
 void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out);
 
 #if !defined(XGBOOST_USE_CUDA)
-inline float Median(Context const*, linalg::TensorView<float const, 2>, OptionalWeights) {
+inline void Median(Context const*, linalg::TensorView<float const, 2>, OptionalWeights,
+                   linalg::Tensor<float, 1>*) {
   common::AssertGPUSupport();
-  return 0;
 }
 inline void Mean(Context const*, linalg::VectorView<float const>, linalg::VectorView<float>) {
   common::AssertGPUSupport();
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace cuda_impl
 
-float Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
-             HostDeviceVector<float> const& weights);
+/**
+ * \brief Calculate medians for each column of the input matrix.
+ */
+void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
+            HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out);
 
 void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out);
 }  // namespace common
39 changes: 27 additions & 12 deletions src/gbm/gbtree.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2014-2022 by Contributors
+/**
+ * Copyright 2014-2023 by Contributors
  * \file gbtree.cc
  * \brief gradient boosted tree implementation.
  * \author Tianqi Chen
@@ -21,6 +21,7 @@
 #include "../common/threading_utils.h"
 #include "../common/timer.h"
 #include "gbtree_model.h"
+#include "xgboost/base.h"
 #include "xgboost/data.h"
 #include "xgboost/gbm.h"
 #include "xgboost/host_device_vector.h"
@@ -219,6 +220,8 @@ void CopyGradient(HostDeviceVector<GradientPair> const* in_gpair, int32_t n_threads
 
 void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const& predictions,
                             ObjFunction const* obj,
+                            std::int32_t group_idx,
+                            std::vector<HostDeviceVector<bst_node_t>> const& node_position,
                             std::vector<std::unique_ptr<RegTree>>* p_trees) {
   CHECK(!updaters_.empty());
   if (!updaters_.back()->HasNodePosition()) {
@@ -227,10 +230,14 @@
   if (!obj || !obj->Task().UpdateTreeLeaf()) {
     return;
   }
+
   auto& trees = *p_trees;
-  for (size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
-    auto const& position = this->node_position_.at(tree_idx);
-    obj->UpdateTreeLeaf(position, p_fmat->Info(), predictions, trees[tree_idx].get());
+  CHECK_EQ(model_.param.num_parallel_tree, trees.size());
+  CHECK_EQ(model_.param.num_parallel_tree, 1)
+      << "Boosting random forest is not supported for current objective.";
+  for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
+    auto const& position = node_position.at(tree_idx);
+    obj->UpdateTreeLeaf(position, p_fmat->Info(), predictions, group_idx, trees[tree_idx].get());
   }
 }
 
@@ -254,10 +261,14 @@
     LOG(FATAL) << "Current objective doesn't support external memory.";
   }
 
+  // The node position for each row, 1 HDV for each tree in the forest. Note that the
+  // position is negated if the row is sampled out.
+  std::vector<HostDeviceVector<bst_node_t>> node_position;
+
   if (ngroup == 1) {
     std::vector<std::unique_ptr<RegTree>> ret;
-    BoostNewTrees(in_gpair, p_fmat, 0, &ret);
-    UpdateTreeLeaf(p_fmat, predt->predictions, obj, &ret);
+    BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
+    UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
     const size_t num_new_trees = ret.size();
     new_trees.push_back(std::move(ret));
     auto v_predt = out.Slice(linalg::All(), 0);
@@ -271,10 +282,11 @@
                       in_gpair->DeviceIdx());
     bool update_predict = true;
     for (int gid = 0; gid < ngroup; ++gid) {
+      node_position.clear();
      CopyGradient(in_gpair, ctx_->Threads(), ngroup, gid, &tmp);
      std::vector<std::unique_ptr<RegTree>> ret;
-      BoostNewTrees(&tmp, p_fmat, gid, &ret);
-      UpdateTreeLeaf(p_fmat, predt->predictions, obj, &ret);
+      BoostNewTrees(&tmp, p_fmat, gid, &node_position, &ret);
+      UpdateTreeLeaf(p_fmat, predt->predictions, obj, gid, node_position, &ret);
      const size_t num_new_trees = ret.size();
      new_trees.push_back(std::move(ret));
      auto v_predt = out.Slice(linalg::All(), gid);
@@ -334,6 +346,7 @@ void GBTree::InitUpdater(Args const& cfg) {
 }
 
 void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
+                           std::vector<HostDeviceVector<bst_node_t>>* out_position,
                            std::vector<std::unique_ptr<RegTree>>* ret) {
   std::vector<RegTree*> new_trees;
   ret->clear();
@@ -367,14 +380,16 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
       ret->push_back(std::move(t));
     }
   }
+
   // update the trees
   CHECK_EQ(gpair->Size(), p_fmat->Info().num_row_)
       << "Mismatching size between number of rows from input data and size of "
          "gradient vector.";
-  node_position_.resize(new_trees.size());
+
+  CHECK(out_position);
+  out_position->resize(new_trees.size());
   for (auto& up : updaters_) {
-    up->Update(gpair, p_fmat, common::Span<HostDeviceVector<bst_node_t>>{node_position_},
-               new_trees);
+    up->Update(gpair, p_fmat, common::Span<HostDeviceVector<bst_node_t>>{*out_position}, new_trees);
   }
 }
 
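Note: node positions now flow from BoostNewTrees to the objective because reg:absoluteerror re-estimates each leaf value from the rows that end up in that leaf. The snippet below is only a rough standalone sketch of that kind of statistic (unweighted, ignoring the learning rate, and not the library's actual implementation): group residuals by leaf and take their median, treating rows with a negated position as sampled out, per the comment in the diff above.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

std::map<std::int32_t, float> LeafMedians(std::vector<std::int32_t> const& position,
                                          std::vector<float> const& label,
                                          std::vector<float> const& predt) {
  // Collect residuals per leaf, skipping sampled-out (negated) rows.
  std::map<std::int32_t, std::vector<float>> residuals;
  for (std::size_t i = 0; i < position.size(); ++i) {
    if (position[i] < 0) {
      continue;
    }
    residuals[position[i]].push_back(label[i] - predt[i]);
  }
  // Median of each leaf's residuals (upper median for even counts; fine for a sketch).
  std::map<std::int32_t, float> medians;
  for (auto& kv : residuals) {
    auto& r = kv.second;
    auto mid = r.begin() + static_cast<std::ptrdiff_t>(r.size() / 2);
    std::nth_element(r.begin(), mid, r.end());
    medians[kv.first] = *mid;
  }
  return medians;
}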