diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index e2d6f4d87d04..3f76ff86c13a 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -55,6 +55,7 @@ OBJECTS= \
     $(PKGROOT)/src/predictor/cpu_predictor.o \
     $(PKGROOT)/src/tree/constraints.o \
     $(PKGROOT)/src/tree/param.o \
+    $(PKGROOT)/src/tree/fit_stump.o \
     $(PKGROOT)/src/tree/tree_model.o \
     $(PKGROOT)/src/tree/tree_updater.o \
     $(PKGROOT)/src/tree/updater_approx.o \
@@ -85,6 +86,7 @@ OBJECTS= \
     $(PKGROOT)/src/common/pseudo_huber.o \
     $(PKGROOT)/src/common/quantile.o \
     $(PKGROOT)/src/common/random.o \
+    $(PKGROOT)/src/common/stats.o \
     $(PKGROOT)/src/common/survival_util.o \
     $(PKGROOT)/src/common/threading_utils.o \
     $(PKGROOT)/src/common/timer.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index a4c99203c641..316e0624942c 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -55,6 +55,7 @@ OBJECTS= \
     $(PKGROOT)/src/predictor/cpu_predictor.o \
     $(PKGROOT)/src/tree/constraints.o \
     $(PKGROOT)/src/tree/param.o \
+    $(PKGROOT)/src/tree/fit_stump.o \
     $(PKGROOT)/src/tree/tree_model.o \
     $(PKGROOT)/src/tree/tree_updater.o \
     $(PKGROOT)/src/tree/updater_approx.o \
@@ -85,6 +86,7 @@ OBJECTS= \
     $(PKGROOT)/src/common/pseudo_huber.o \
     $(PKGROOT)/src/common/quantile.o \
     $(PKGROOT)/src/common/random.o \
+    $(PKGROOT)/src/common/stats.o \
     $(PKGROOT)/src/common/survival_util.o \
     $(PKGROOT)/src/common/threading_utils.o \
     $(PKGROOT)/src/common/timer.o \
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index 05c13bed0363..34312223c0cb 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -134,6 +134,8 @@ using bst_row_t = std::size_t;   // NOLINT
 using bst_node_t = int32_t;      // NOLINT
 /*! \brief Type for ranking group index. */
 using bst_group_t = uint32_t;    // NOLINT
+/*! \brief Type for indexing target variables. */
+using bst_target_t = std::size_t;  // NOLINT
 
 namespace detail {
 /*! \brief Implementation of gradient statistics pair. Template specialisation
diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index d5b255b82b27..ca816bcdb7a4 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <cassert>
+#include <cstdint>  // std::int32_t
 #include <limits>
 #include <string>
 #include <type_traits>
@@ -388,9 +389,9 @@ class TensorView {
   /**
    * \brief Create a tensor with data, shape and strides. Don't use this constructor if
    * stride can be calculated from shape.
    */
-  template <typename I, int32_t D>
+  template <typename I, std::int32_t D>
   LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], I const (&stride)[D],
-                       int32_t device)
+                       std::int32_t device)
       : data_{data}, ptr_{data_.data()}, device_{device} {
     static_assert(D == kDim, "Invalid shape & stride.");
     detail::UnrollLoop<D>([&](auto i) {
@@ -833,6 +834,27 @@ class Tensor {
   int32_t DeviceIdx() const { return data_.DeviceIdx(); }
 };
 
+template <typename T>
+using Vector = Tensor<T, 1>;
+
+template <typename T, typename... Index>
+auto Constant(Context const *ctx, T v, Index &&...index) {
+  Tensor<T, sizeof...(Index)> t;
+  t.SetDevice(ctx->gpu_id);
+  t.Reshape(index...);
+  t.Data()->Fill(std::move(v));
+  return t;
+}
+
+
+/**
+ * \brief Like `np.zeros`, return a new array of given shape and type, filled with zeros.
+ */
+template <typename T, typename... Index>
+auto Zeros(Context const *ctx, Index &&...index) {
+  return Constant(ctx, static_cast<T>(0), index...);
+}
+
 // Only first axis is supported for now.
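A note on the new linalg helpers above: `Constant` allocates a tensor on the context's device, reshapes it to the given extents, and fills every element with `v`; `Zeros` is the zero-filled special case. A minimal usage sketch, assuming the reconstructed signatures above (the `BuildThreadLocalSums` name is illustrative, not part of the patch):

    #include "xgboost/base.h"  // GradientPairPrecise
    #include "xgboost/context.h"
    #include "xgboost/linalg.h"

    namespace xgboost {
    // One accumulator row per thread, one column per target, value-initialized
    // to an empty gradient pair.
    linalg::Tensor<GradientPairPrecise, 2> BuildThreadLocalSums(Context const *ctx,
                                                                bst_target_t n_targets) {
      return linalg::Constant(ctx, GradientPairPrecise{}, ctx->Threads(), n_targets);
    }
    }  // namespace xgboost

This is the same pattern `fit_stump.cc` later in this patch uses to allocate its per-thread accumulators.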
 template <typename T, int32_t D>
 void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h
index 9186ef710631..e5da2878bf91 100644
--- a/include/xgboost/objective.h
+++ b/include/xgboost/objective.h
@@ -93,7 +93,7 @@
    * \brief Return number of targets for input matrix. Right now XGBoost supports only
    * multi-target regression.
    */
-  virtual uint32_t Targets(MetaInfo const& info) const {
+  virtual bst_target_t Targets(MetaInfo const& info) const {
     if (info.labels.Shape(1) > 1) {
       LOG(FATAL) << "multioutput is not supported by current objective function";
     }
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index fc33317bd7a8..030070d9aecd 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -172,6 +172,7 @@ void HostDeviceVector<T>::SetDevice(int) const {}
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<double>;
 template class HostDeviceVector<GradientPair>;
+template class HostDeviceVector<GradientPairPrecise>;
 template class HostDeviceVector<int32_t>;   // bst_node_t
 template class HostDeviceVector<uint8_t>;
 template class HostDeviceVector<FeatureType>;
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 00f19230dc7e..a5c5dbf8fa1b 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -404,6 +404,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<double>;
 template class HostDeviceVector<GradientPair>;
+template class HostDeviceVector<GradientPairPrecise>;
 template class HostDeviceVector<int32_t>;   // bst_node_t
 template class HostDeviceVector<uint8_t>;
 template class HostDeviceVector<FeatureType>;
diff --git a/src/common/numeric.cc b/src/common/numeric.cc
index 959f6305d134..2a1ca4d44af0 100644
--- a/src/common/numeric.cc
+++ b/src/common/numeric.cc
@@ -3,10 +3,8 @@
  */
 #include "numeric.h"
 
-#include <numeric>      // std::accumulate
 #include <type_traits>  // std::is_same
 
-#include "threading_utils.h"  // MemStackAllocator, ParallelFor, DefaultMaxThreads
 #include "xgboost/context.h"             // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 
@@ -15,14 +13,11 @@ namespace common {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
   if (ctx->IsCPU()) {
     auto const& h_values = values.ConstHostVector();
-    MemStackAllocator<double, DefaultMaxThreads()> result_tloc(ctx->Threads(), 0);
-    ParallelFor(h_values.size(), ctx->Threads(),
-                [&](auto i) { result_tloc[omp_get_thread_num()] += h_values[i]; });
-    auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cend(), 0.0);
+    auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
     static_assert(std::is_same<decltype(result), double>::value, "");
     return result;
   }
-  return cuda::Reduce(ctx, values);
+  return cuda_impl::Reduce(ctx, values);
 }
 }  // namespace common
 }  // namespace xgboost
diff --git a/src/common/numeric.cu b/src/common/numeric.cu
index ad6c6b53c90f..b292edf1aa7f 100644
--- a/src/common/numeric.cu
+++ b/src/common/numeric.cu
@@ -2,24 +2,22 @@
  * Copyright 2022 by XGBoost Contributors
  */
 #include <thrust/execution_policy.h>
-#include <thrust/functional.h>  // thrust:plus
 
-#include "device_helpers.cuh"  // dh::Reduce, safe_cuda, dh::XGBCachingDeviceAllocator
+#include "device_helpers.cuh"  // dh::Reduce, dh::XGBCachingDeviceAllocator
 #include "numeric.h"
 #include "xgboost/context.h"             // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 
 namespace xgboost {
 namespace common {
-namespace cuda {
+namespace cuda_impl {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
   values.SetDevice(ctx->gpu_id);
   auto const d_values = values.ConstDeviceSpan();
   dh::XGBCachingDeviceAllocator<char> alloc;
-  auto res = dh::Reduce(thrust::cuda::par(alloc), d_values.data(),
-                        d_values.data() + d_values.size(), 0.0, thrust::plus<double>{});
-  return res;
+  return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
+                    thrust::plus<double>{});
 }
-}  // namespace cuda
+}  // namespace cuda_impl
 }  // namespace common
 }  // namespace xgboost
diff --git a/src/common/numeric.h b/src/common/numeric.h
index f3fad57b4645..7b52b7ba66b3 100644
--- a/src/common/numeric.h
+++ b/src/common/numeric.h
@@ -95,7 +95,7 @@ void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
   exc.Rethrow();
 }
 
-namespace cuda {
+namespace cuda_impl {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
 #if !defined(XGBOOST_USE_CUDA)
 inline double Reduce(Context const*, HostDeviceVector<float> const&) {
@@ -103,9 +103,25 @@ inline double Reduce(Context const*, HostDeviceVector<float> const&) {
   return 0;
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
-}  // namespace cuda
+}  // namespace cuda_impl
+
+/**
+ * \brief Reduction with iterator. init must be additive identity. (0 for primitive types)
+ */
+namespace cpu_impl {
+template <typename It, typename V = typename It::value_type>
+V Reduce(Context const* ctx, It first, It second, V const& init) {
+  size_t n = std::distance(first, second);
+  common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(ctx->Threads(), init);
+  common::ParallelFor(n, ctx->Threads(),
+                      [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
+  auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + ctx->Threads(), init);
+  return result;
+}
+}  // namespace cpu_impl
+
 /**
- * \brief Reduction with summation.
+ * \brief Reduction on host device vector.
  */
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values);
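The `cpu_impl::Reduce` template above factors out the thread-local accumulation that `numeric.cc` previously inlined: each OpenMP thread adds into its own `result_tloc` slot, and the active slots are then combined sequentially with `std::accumulate`. A hedged call-site sketch (the vector and its contents are made up for illustration):

    #include <vector>

    #include "../common/numeric.h"
    #include "xgboost/context.h"

    double SumGradients(xgboost::Context const *ctx, std::vector<float> const &grad) {
      // 0.0 is the additive identity required by the contract; passing a double
      // init also selects double precision for the accumulation.
      return xgboost::common::cpu_impl::Reduce(ctx, grad.cbegin(), grad.cend(), 0.0);
    }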
diff --git a/src/common/quantile.cu b/src/common/quantile.cu
index 06939e846891..8f89ed26f415 100644
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -641,7 +641,7 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
                        thrust::equal_to<bst_feature_t>{},
                        [] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
   dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values));
-  auto max_it = common::MakeIndexTransformIter([&](auto i) {
+  auto max_it = MakeIndexTransformIter([&](auto i) {
     if (IsCat(h_feature_types, i)) {
       return max_values[i].value;
     }
diff --git a/src/common/stats.cc b/src/common/stats.cc
new file mode 100644
index 000000000000..8d1b54f0ecd3
--- /dev/null
+++ b/src/common/stats.cc
@@ -0,0 +1,64 @@
+/*!
+ * Copyright 2022 by XGBoost Contributors
+ */
+#include "stats.h"
+
+#include <numeric>  // std::accumulate
+
+#include "common.h"                      // OptionalWeights
+#include "threading_utils.h"             // ParallelFor, MemStackAllocator
+#include "transform_iterator.h"          // MakeIndexTransformIter
+#include "xgboost/context.h"             // Context
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/linalg.h"              // Tensor, UnravelIndex, Apply
+#include "xgboost/logging.h"             // CHECK_EQ
+
+namespace xgboost {
+namespace common {
+float Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
+             HostDeviceVector<float> const& weights) {
+  CHECK_LE(t.Shape(1), 1) << "Matrix is not yet supported.";
+  if (!ctx->IsCPU()) {
+    weights.SetDevice(ctx->gpu_id);
+    auto opt_weights = OptionalWeights(weights.ConstDeviceSpan());
+    auto t_v = t.View(ctx->gpu_id);
+    return cuda_impl::Median(ctx, t_v, opt_weights);
+  }
+
+  auto opt_weights = OptionalWeights(weights.ConstHostSpan());
+  auto t_v = t.HostView();
+  auto iter = common::MakeIndexTransformIter(
+      [&](size_t i) { return linalg::detail::Apply(t_v, linalg::UnravelIndex(i, t_v.Shape())); });
+  float q{0};
+  if (opt_weights.Empty()) {
+    q = common::Quantile(0.5, iter, iter + t_v.Size());
+  } else {
+    CHECK_NE(t_v.Shape(1), 0);
+    auto w_it = common::MakeIndexTransformIter([&](size_t i) {
+      auto sample_idx = i / t_v.Shape(1);
+      return opt_weights[sample_idx];
+    });
+    q = common::WeightedQuantile(0.5, iter, iter + t_v.Size(), w_it);
+  }
+  return q;
+}
+
+void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
+  v.SetDevice(ctx->gpu_id);
+  out->SetDevice(ctx->gpu_id);
+  out->Reshape(1);
+
+  if (ctx->IsCPU()) {
+    auto h_v = v.HostView();
+    float n = v.Size();
+    MemStackAllocator<float, DefaultMaxThreads()> tloc(ctx->Threads(), 0.0f);
+    ParallelFor(v.Size(), ctx->Threads(),
+                [&](auto i) { tloc[omp_get_thread_num()] += h_v(i) / n; });
+    auto ret = std::accumulate(tloc.cbegin(), tloc.cend(), .0f);
+    out->HostView()(0) = ret;
+  } else {
+    cuda_impl::Mean(ctx, v.View(ctx->gpu_id), out->View(ctx->gpu_id));
+  }
+}
+}  // namespace common
+}  // namespace xgboost
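Note that the CPU `Mean` accumulates `h_v(i) / n` per thread rather than summing first and dividing once at the end; the running partial sums then stay on the order of the mean itself, which presumably guards against overflow on large inputs and mirrors what the CUDA version below does with its transform iterator. As a quick check against the unit test added later in this patch: for v = 0, 1, ..., 127 the accumulated value is (127 * 128 / 2) / 128 = 63.5, exactly the expected mean.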
diff --git a/src/common/stats.cu b/src/common/stats.cu
index 42a9bc6844bf..2e728a8bc333 100644
--- a/src/common/stats.cu
+++ b/src/common/stats.cu
@@ -13,7 +13,7 @@
 
 namespace xgboost {
 namespace common {
-namespace cuda {
+namespace cuda_impl {
 float Median(Context const* ctx, linalg::TensorView<float const, 2> t,
              common::OptionalWeights weights) {
   HostDeviceVector<size_t> segments{0, t.Size()};
@@ -42,6 +42,17 @@ float Median(Context const* ctx, linalg::TensorView<float const, 2> t,
   CHECK_EQ(quantile.Size(), 1);
   return quantile.HostVector().front();
 }
-}  // namespace cuda
+
+void Mean(Context const* ctx, linalg::VectorView<float> v, linalg::VectorView<float> out) {
+  float n = v.Size();
+  auto it = dh::MakeTransformIterator<float>(
+      thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return v(i) / n; });
+  std::size_t bytes;
+  CHECK_EQ(out.Size(), 1);
+  cub::DeviceReduce::Sum(nullptr, bytes, it, out.Values().data(), v.Size());
+  dh::TemporaryArray<char> temp{bytes};
+  cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size());
+}
+}  // namespace cuda_impl
 }  // namespace common
 }  // namespace xgboost
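The two consecutive `cub::DeviceReduce::Sum` calls follow CUB's standard two-phase convention: the first call, made with a null temporary-storage pointer, only writes the required scratch size into `bytes`; the second call performs the reduction. Stripped of the XGBoost wrappers, the pattern looks like this (a generic CUDA sketch, not code from this patch):

    #include <cstddef>
    #include <cub/cub.cuh>

    // Sum n floats from d_in into d_out[0] using CUB's two-phase API.
    void DeviceSum(float const *d_in, float *d_out, int n) {
      void *d_temp = nullptr;
      std::size_t bytes = 0;
      // Phase 1: d_temp == nullptr, so this call only computes `bytes`.
      cub::DeviceReduce::Sum(d_temp, bytes, d_in, d_out, n);
      cudaMalloc(&d_temp, bytes);
      // Phase 2: the actual reduction.
      cub::DeviceReduce::Sum(d_temp, bytes, d_in, d_out, n);
      cudaFree(d_temp);
    }

In the patch, `dh::TemporaryArray` plays the role of the raw `cudaMalloc`/`cudaFree` pair.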
diff --git a/src/common/stats.h b/src/common/stats.h
index bc82d4da8213..3572d2f0a02b 100644
--- a/src/common/stats.h
+++ b/src/common/stats.h
@@ -8,10 +8,11 @@
 #include <iterator>
 #include <vector>
 
-#include "common.h"  // AssertGPUSupport
+#include "common.h"              // AssertGPUSupport, OptionalWeights
 #include "transform_iterator.h"  // MakeIndexTransformIter
 #include "xgboost/context.h"     // Context
 #include "xgboost/linalg.h"
+#include "xgboost/logging.h"     // CHECK_GE
 
 namespace xgboost {
 namespace common {
@@ -93,43 +94,25 @@ float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
   return val(idx);
 }
 
-namespace cuda {
-float Median(Context const* ctx, linalg::TensorView<float const, 2> t,
-             common::OptionalWeights weights);
+namespace cuda_impl {
+float Median(Context const* ctx, linalg::TensorView<float const, 2> t, OptionalWeights weights);
+void Mean(Context const* ctx, linalg::VectorView<float> v, linalg::VectorView<float> out);
+
 #if !defined(XGBOOST_USE_CUDA)
-inline float Median(Context const*, linalg::TensorView<float const, 2>, common::OptionalWeights) {
-  AssertGPUSupport();
+inline float Median(Context const*, linalg::TensorView<float const, 2>, OptionalWeights) {
+  common::AssertGPUSupport();
   return 0;
 }
+inline void Mean(Context const*, linalg::VectorView<float>, linalg::VectorView<float>) {
+  common::AssertGPUSupport();
+}
 #endif  // !defined(XGBOOST_USE_CUDA)
-}  // namespace cuda
+}  // namespace cuda_impl
 
-inline float Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
-                    HostDeviceVector<float> const& weights) {
-  if (!ctx->IsCPU()) {
-    weights.SetDevice(ctx->gpu_id);
-    auto opt_weights = OptionalWeights(weights.ConstDeviceSpan());
-    auto t_v = t.View(ctx->gpu_id);
-    return cuda::Median(ctx, t_v, opt_weights);
-  }
+float Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
+             HostDeviceVector<float> const& weights);
 
-  auto opt_weights = OptionalWeights(weights.ConstHostSpan());
-  auto t_v = t.HostView();
-  auto iter = common::MakeIndexTransformIter(
-      [&](size_t i) { return linalg::detail::Apply(t_v, linalg::UnravelIndex(i, t_v.Shape())); });
-  float q{0};
-  if (opt_weights.Empty()) {
-    q = common::Quantile(0.5, iter, iter + t_v.Size());
-  } else {
-    CHECK_NE(t_v.Shape(1), 0);
-    auto w_it = common::MakeIndexTransformIter([&](size_t i) {
-      auto sample_idx = i / t_v.Shape(1);
-      return opt_weights[sample_idx];
-    });
-    q = common::WeightedQuantile(0.5, iter, iter + t_v.Size(), w_it);
-  }
-  return q;
-}
+void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out);
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_STATS_H_
diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc
index 6ddf39849949..4beb9d9c365e 100644
--- a/src/objective/adaptive.cc
+++ b/src/objective/adaptive.cc
@@ -10,6 +10,7 @@
 #include "../common/numeric.h"
 #include "../common/stats.h"
 #include "../common/threading_utils.h"
+#include "../common/transform_iterator.h"  // MakeIndexTransformIter
 #include "xgboost/tree_model.h"
 
 namespace xgboost {
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index ef2bcc33183b..f609745cfc31 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -81,7 +81,7 @@ class RegLossObj : public ObjFunction {
 
   ObjInfo Task() const override { return Loss::Info(); }
 
-  uint32_t Targets(MetaInfo const& info) const override {
+  bst_target_t Targets(MetaInfo const& info) const override {
    // Multi-target regression.
     return std::max(static_cast<size_t>(1), info.labels.Shape(1));
   }
@@ -220,7 +220,7 @@ class PseudoHuberRegression : public ObjFunction {
 public:
  void Configure(Args const& args) override { param_.UpdateAllowUnknown(args); }
  ObjInfo Task() const override { return ObjInfo::kRegression; }
-  uint32_t Targets(MetaInfo const& info) const override {
+  bst_target_t Targets(MetaInfo const& info) const override {
    return std::max(static_cast<size_t>(1), info.labels.Shape(1));
  }
diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc
new file mode 100644
index 000000000000..82efff2c77ac
--- /dev/null
+++ b/src/tree/fit_stump.cc
@@ -0,0 +1,82 @@
+/**
+ * Copyright 2022 by XGBoost Contributors
+ *
+ * \brief Utilities for estimating initial score.
+ */
+#include "fit_stump.h"
+
+#include <cstdint>  // std::int32_t
+#include <cstddef>  // std::size_t
+
+#include "../collective/communicator-inl.h"
+#include "../common/common.h"              // AssertGPUSupport
+#include "../common/numeric.h"             // cpu_impl::Reduce
+#include "../common/threading_utils.h"     // ParallelFor
+#include "../common/transform_iterator.h"  // MakeIndexTransformIter
+#include "xgboost/base.h"                  // bst_target_t, GradientPairPrecise
+#include "xgboost/context.h"               // Context
+#include "xgboost/linalg.h"                // TensorView, Tensor, Constant
+#include "xgboost/logging.h"               // CHECK_EQ
+
+namespace xgboost {
+namespace tree {
+namespace cpu_impl {
+void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
+              linalg::VectorView<float> out) {
+  auto n_targets = out.Size();
+  CHECK_EQ(n_targets, gpair.Shape(1));
+  linalg::Tensor<GradientPairPrecise, 2> sum_tloc =
+      linalg::Constant(ctx, GradientPairPrecise{}, ctx->Threads(), n_targets);
+  auto h_sum_tloc = sum_tloc.HostView();
+  // first dim for gpair is samples, second dim is target.
+  // Reduce by column, parallel by samples
+  common::ParallelFor(gpair.Shape(0), ctx->Threads(), [&](auto i) {
+    for (bst_target_t t = 0; t < n_targets; ++t) {
+      h_sum_tloc(omp_get_thread_num(), t) += GradientPairPrecise{gpair(i, t)};
+    }
+  });
+  // Aggregate to the first row.
+  auto h_sum = h_sum_tloc.Slice(0, linalg::All());
+  for (std::int32_t i = 1; i < ctx->Threads(); ++i) {
+    for (bst_target_t j = 0; j < n_targets; ++j) {
+      h_sum(j) += h_sum_tloc(i, j);
+    }
+  }
+  CHECK(h_sum.CContiguous());
+  collective::Allreduce<collective::Operation::kSum>(
+      reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
+
+  for (std::size_t i = 0; i < h_sum.Size(); ++i) {
+    out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess()));
+  }
+}
+}  // namespace cpu_impl
+
+namespace cuda_impl {
+void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
+              linalg::VectorView<float> out);
+
+#if !defined(XGBOOST_USE_CUDA)
+inline void FitStump(Context const*, linalg::TensorView<GradientPair const, 2>,
+                     linalg::VectorView<float>) {
+  common::AssertGPUSupport();
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+}  // namespace cuda_impl
+
+void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
+              bst_target_t n_targets, linalg::Vector<float>* out) {
+  out->SetDevice(ctx->gpu_id);
+  out->Reshape(n_targets);
+  auto n_samples = gpair.Size() / n_targets;
+
+  gpair.SetDevice(ctx->gpu_id);
+  linalg::TensorView<GradientPair const, 2> gpair_t{
+      ctx->IsCPU() ? gpair.ConstHostSpan() : gpair.ConstDeviceSpan(),
+      {n_samples, n_targets},
+      ctx->gpu_id};
+  ctx->IsCPU() ? cpu_impl::FitStump(ctx, gpair_t, out->HostView())
+               : cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id));
+}
+}  // namespace tree
+}  // namespace xgboost
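The weight written to `out` is the unregularized Newton step for each target: with gradient sum G_t and Hessian sum H_t, w_t = -G_t / max(H_t, kRtEps), which is `CalcUnregularizedWeight` from fit_stump.h. The unit test at the end of this patch exercises exactly this: 16 rows with gradients 0..15 and unit Hessians give G = 16 * 15 / 2 = 120 and H = 16, so every target's stump weight is -120 / 16 = -7.5.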
diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu
new file mode 100644
index 000000000000..58a1fae82987
--- /dev/null
+++ b/src/tree/fit_stump.cu
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2022 by XGBoost Contributors
+ *
+ * \brief Utilities for estimating initial score.
+ */
+#if !defined(NOMINMAX) && defined(_WIN32)
+#define NOMINMAX
+#endif  // !defined(NOMINMAX)
+#include <thrust/execution_policy.h>            // cuda::par
+#include <thrust/iterator/counting_iterator.h>  // thrust::make_counting_iterator
+
+#include <cstddef>  // std::size_t
+
+#include "../collective/device_communicator.cuh"  // DeviceCommunicator
+#include "../common/device_helpers.cuh"           // dh::MakeTransformIterator
+#include "fit_stump.h"
+#include "xgboost/base.h"     // GradientPairPrecise, GradientPair, XGBOOST_DEVICE
+#include "xgboost/context.h"  // Context
+#include "xgboost/linalg.h"   // TensorView, Tensor, Constant
+#include "xgboost/logging.h"  // CHECK_EQ
+#include "xgboost/span.h"     // span
+
+namespace xgboost {
+namespace tree {
+namespace cuda_impl {
+void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
+              linalg::VectorView<float> out) {
+  auto n_targets = out.Size();
+  CHECK_EQ(n_targets, gpair.Shape(1));
+  linalg::Vector<GradientPairPrecise> sum = linalg::Constant(ctx, GradientPairPrecise{}, n_targets);
+  CHECK(out.Contiguous());
+
+  // Reduce by column
+  auto key_it = dh::MakeTransformIterator<bst_target_t>(
+      thrust::make_counting_iterator(0ul),
+      [=] XGBOOST_DEVICE(std::size_t i) -> bst_target_t { return i / gpair.Shape(0); });
+  auto grad_it = dh::MakeTransformIterator<GradientPairPrecise>(
+      thrust::make_counting_iterator(0ul),
+      [=] XGBOOST_DEVICE(std::size_t i) -> GradientPairPrecise {
+        auto target = i / gpair.Shape(0);
+        auto sample = i % gpair.Shape(0);
+        return GradientPairPrecise{gpair(sample, target)};
+      });
+  auto d_sum = sum.View(ctx->gpu_id);
+  CHECK(d_sum.CContiguous());
+
+  dh::XGBCachingDeviceAllocator<char> alloc;
+  auto policy = thrust::cuda::par(alloc);
+  thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
+                        thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
+
+  collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(ctx->gpu_id);
+  communicator->AllReduceSum(reinterpret_cast<double*>(d_sum.Values().data()), d_sum.Size() * 2);
+
+  thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
+                     [=] XGBOOST_DEVICE(std::size_t i) mutable {
+                       out(i) = static_cast<float>(
+                           CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
+                     });
+}
+}  // namespace cuda_impl
+}  // namespace tree
+}  // namespace xgboost
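In the CUDA path, the linear index walks the gradient matrix column by column: `i / gpair.Shape(0)` is the target (the key) and `i % gpair.Shape(0)` the sample, so keys are equal for all elements of one column and `thrust::reduce_by_key` emits one `GradientPairPrecise` sum per target, with the keys themselves routed to a discard iterator. For example, with 4 samples and 2 targets the key sequence is 0,0,0,0,1,1,1,1, producing exactly two sums.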
diff --git a/src/tree/fit_stump.h b/src/tree/fit_stump.h
new file mode 100644
index 000000000000..1f5cd60b4928
--- /dev/null
+++ b/src/tree/fit_stump.h
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2022 by XGBoost Contributors
+ *
+ * \brief Utilities for estimating initial score.
+ */
+
+#ifndef XGBOOST_TREE_FIT_STUMP_H_
+#define XGBOOST_TREE_FIT_STUMP_H_
+
+#if !defined(NOMINMAX) && defined(_WIN32)
+#define NOMINMAX
+#endif  // !defined(NOMINMAX)
+
+#include <algorithm>  // std::max
+
+#include "../common/common.h"            // AssertGPUSupport
+#include "xgboost/base.h"                // GradientPair
+#include "xgboost/context.h"             // Context
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/linalg.h"              // TensorView
+
+namespace xgboost {
+namespace tree {
+
+template <typename T>
+XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) {
+  return -sum_grad / std::max(sum_hess, static_cast<T>(kRtEps));
+}
+
+/**
+ * @brief Fit a tree stump as an estimation of base_score.
+ */
+void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
+              bst_target_t n_targets, linalg::Vector<float>* out);
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_FIT_STUMP_H_
diff --git a/tests/cpp/common/test_stats.cc b/tests/cpp/common/test_stats.cc
index 99f4c7ee668b..03e50a9846e9 100644
--- a/tests/cpp/common/test_stats.cc
+++ b/tests/cpp/common/test_stats.cc
@@ -3,8 +3,10 @@
  */
 #include <gtest/gtest.h>
 #include <xgboost/host_device_vector.h>
+#include <xgboost/linalg.h>  // Tensor,Vector
 
 #include "../../../src/common/stats.h"
+#include "../../../src/common/transform_iterator.h"  // common::MakeIndexTransformIter
 
 namespace xgboost {
 namespace common {
@@ -69,5 +71,35 @@ TEST(Stats, Median) {
   ASSERT_EQ(m, .5f);
 #endif  // defined(XGBOOST_USE_CUDA)
 }
+
+namespace {
+void TestMean(Context const* ctx) {
+  std::size_t n{128};
+  linalg::Vector<float> data({n}, ctx->gpu_id);
+  auto h_v = data.HostView().Values();
+  std::iota(h_v.begin(), h_v.end(), .0f);
+
+  auto nf = static_cast<float>(n);
+  float mean = nf * (nf - 1) / 2 / n;
+
+  linalg::Vector<float> res{{1}, ctx->gpu_id};
+  Mean(ctx, data, &res);
+  auto h_res = res.HostView();
+  ASSERT_EQ(h_res.Size(), 1);
+  ASSERT_EQ(mean, h_res(0));
+}
+}  // anonymous namespace
+
+TEST(Stats, Mean) {
+  Context ctx;
+  TestMean(&ctx);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(Stats, GPUMean) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  TestMean(&ctx);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace common
 }  // namespace xgboost
diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu
index 1a56c68b702d..e71ec3efcb60 100644
--- a/tests/cpp/common/test_stats.cu
+++ b/tests/cpp/common/test_stats.cu
@@ -7,6 +7,7 @@
 #include <xgboost/linalg.h>
 
 #include "../../../src/common/stats.cuh"
+#include "../../../src/common/stats.h"
 #include "xgboost/base.h"
 #include "xgboost/context.h"
 #include "xgboost/host_device_vector.h"
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 69d4d1fbb3bd..292846c0ad92 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -66,7 +66,7 @@ TEST(Learner, CheckGroup) {
   std::shared_ptr<DMatrix> p_mat{
       RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()};
 
-  std::vector<bst_float> weight(kNumGroups);
+  std::vector<bst_float> weight(kNumGroups, 1);
   std::vector<bst_int> group(kNumGroups);
   group[0] = 2;
   group[1] = 3;
diff --git a/tests/cpp/tree/test_fit_stump.cc b/tests/cpp/tree/test_fit_stump.cc
new file mode 100644
index 000000000000..ef608e5757d9
--- /dev/null
+++ b/tests/cpp/tree/test_fit_stump.cc
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2022 by XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>
+
+#include "../../src/common/linalg_op.h"
+#include "../../src/tree/fit_stump.h"
+
+namespace xgboost {
+namespace tree {
+namespace {
+void TestFitStump(Context const *ctx) {
+  std::size_t constexpr kRows = 16, kTargets = 2;
+  HostDeviceVector<GradientPair> gpair;
+  auto &h_gpair = gpair.HostVector();
+  h_gpair.resize(kRows * kTargets);
+  for (std::size_t i = 0; i < kRows; ++i) {
+    for (std::size_t t = 0; t < kTargets; ++t) {
+      h_gpair.at(i * kTargets + t) = GradientPair{static_cast<float>(i), 1};
+    }
+  }
+  linalg::Vector<float> out;
+  FitStump(ctx, gpair, kTargets, &out);
+  auto h_out = out.HostView();
+  for (auto it = linalg::cbegin(h_out); it != linalg::cend(h_out); ++it) {
+    // sum_hess == kRows
+    auto n = static_cast<float>(kRows);
+    auto sum_grad = n * (n - 1) / 2;
+    ASSERT_EQ(static_cast<float>(-sum_grad / n), *it);
+  }
+}
+}  // anonymous namespace
+
+TEST(InitEstimation, FitStump) {
+  Context ctx;
+  TestFitStump(&ctx);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(InitEstimation, GPUFitStump) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  TestFitStump(&ctx);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
+}  // namespace tree
+}  // namespace xgboost