diff --git a/src/common/stats.cuh b/src/common/stats.cuh
index df544e180f0b..b95f6866ca5c 100644
--- a/src/common/stats.cuh
+++ b/src/common/stats.cuh
@@ -1,99 +1,220 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
  */
 #ifndef XGBOOST_COMMON_STATS_CUH_
 #define XGBOOST_COMMON_STATS_CUH_

-#include <thrust/binary_search.h>
-#include <thrust/scan.h>
+#include <thrust/binary_search.h>                  // lower_bound
+#include <thrust/for_each.h>                       // for_each_n
+#include <thrust/iterator/constant_iterator.h>     // make_constant_iterator
+#include <thrust/iterator/counting_iterator.h>     // make_counting_iterator
+#include <thrust/iterator/permutation_iterator.h>  // make_permutation_iterator
+#include <thrust/scan.h>                           // inclusive_scan_by_key

-#include <iterator>  // std::distance
+#include <algorithm>    // std::min
+#include <cstddef>      // std::size_t
+#include <iterator>     // std::distance
+#include <limits>       // std::numeric_limits
+#include <type_traits>  // std::is_floating_point,std::iterator_traits

+#include "cuda_context.cuh"  // CUDAContext
 #include "device_helpers.cuh"
-#include "linalg_op.cuh"
-#include "xgboost/context.h"
-#include "xgboost/linalg.h"
-#include "xgboost/tree_model.h"
+#include "xgboost/context.h"  // Context
+#include "xgboost/span.h"     // Span

 namespace xgboost {
 namespace common {
+namespace detail {
+// This should be a lambda function, but for some reason gcc-11 + nvcc-11.8 failed to
+// compile it. As a result, a functor is extracted instead.
+//
+// error: ‘__T288’ was not declared in this scope
+template <typename SegIt, typename ValIt, typename AlphaIt>
+struct QuantileSegmentOp {
+  SegIt seg_begin;
+  ValIt val;
+  AlphaIt alpha_it;
+  Span<float> d_results;
+
+  static_assert(std::is_floating_point<typename std::iterator_traits<ValIt>::value_type>::value,
+                "Invalid value for quantile.");
+  static_assert(std::is_floating_point<typename std::iterator_traits<AlphaIt>::value_type>::value,
+                "Invalid alpha.");
+
+  XGBOOST_DEVICE void operator()(std::size_t seg_idx) {
+    std::size_t begin = seg_begin[seg_idx];
+    auto n = static_cast<double>(seg_begin[seg_idx + 1] - begin);
+    double a = alpha_it[seg_idx];
+
+    if (n == 0) {
+      d_results[seg_idx] = std::numeric_limits<float>::quiet_NaN();
+      return;
+    }
+
+    if (a <= (1 / (n + 1))) {
+      d_results[seg_idx] = val[begin];
+      return;
+    }
+    if (a >= (n / (n + 1))) {
+      d_results[seg_idx] = val[common::LastOf(seg_idx, seg_begin)];
+      return;
+    }
+
+    double x = a * static_cast<double>(n + 1);
+    double k = std::floor(x) - 1;
+    double d = (x - 1) - k;
+
+    auto v0 = val[begin + static_cast<std::size_t>(k)];
+    auto v1 = val[begin + static_cast<std::size_t>(k) + 1];
+
+    d_results[seg_idx] = v0 + d * (v1 - v0);
+  }
+};
+
+template <typename SegIt, typename ValIt, typename AlphaIt>
+auto MakeQSegOp(SegIt seg_it, ValIt val_it, AlphaIt alpha_it, Span<float> d_results) {
+  return QuantileSegmentOp<SegIt, ValIt, AlphaIt>{seg_it, val_it, alpha_it, d_results};
+}
+
+template <typename SegIt>
+struct SegOp {
+  SegIt seg_beg;
+  SegIt seg_end;
+
+  XGBOOST_DEVICE std::size_t operator()(std::size_t i) {
+    return dh::SegmentId(seg_beg, seg_end, i);
+  }
+};
+
+template <typename WIter>
+struct WeightOp {
+  WIter w_begin;
+  Span<std::size_t const> d_sorted_idx;
+  XGBOOST_DEVICE float operator()(std::size_t i) { return w_begin[d_sorted_idx[i]]; }
+};
+
+template <typename SegIt, typename ValIt, typename AlphaIt>
+struct WeightedQuantileSegOp {
+  AlphaIt alpha_it;
+  SegIt seg_beg;
+  ValIt val_begin;
+  Span<float const> d_weight_cdf;
+  Span<std::size_t const> d_sorted_idx;
+  Span<float> d_results;
+  static_assert(std::is_floating_point<typename std::iterator_traits<AlphaIt>::value_type>::value,
+                "Invalid alpha.");
+  static_assert(std::is_floating_point<typename std::iterator_traits<ValIt>::value_type>::value,
+                "Invalid value for quantile.");
+
+  XGBOOST_DEVICE void operator()(std::size_t seg_idx) {
+    std::size_t begin = seg_beg[seg_idx];
+    auto n = static_cast<double>(seg_beg[seg_idx + 1] - begin);
+    if (n == 0) {
+      d_results[seg_idx] = std::numeric_limits<float>::quiet_NaN();
+      return;
+    }
+    auto seg_cdf = d_weight_cdf.subspan(begin, static_cast<std::size_t>(n));
+    auto seg_sorted_idx = d_sorted_idx.subspan(begin, static_cast<std::size_t>(n));
+    double a = alpha_it[seg_idx];
+    double thresh = seg_cdf.back() * a;
+
+    std::size_t idx =
+        thrust::lower_bound(thrust::seq, seg_cdf.data(), seg_cdf.data() + seg_cdf.size(), thresh) -
+        seg_cdf.data();
+    idx = std::min(idx, static_cast<std::size_t>(n - 1));
+    d_results[seg_idx] = val_begin[seg_sorted_idx[idx]];
+  }
+};
+
+template <typename SegIt, typename ValIt, typename AlphaIt>
+auto MakeWQSegOp(SegIt seg_it, ValIt val_it, AlphaIt alpha_it, Span<float const> d_weight_cdf,
+                 Span<std::size_t const> d_sorted_idx, Span<float> d_results) {
+  return WeightedQuantileSegOp<SegIt, ValIt, AlphaIt>{alpha_it,     seg_it,       val_it,
+                                                      d_weight_cdf, d_sorted_idx, d_results};
+}
+}  // namespace detail

 /**
- * \brief Compute segmented quantile on GPU.
+ * @brief Compute segmented quantile on GPU.
  *
- * \tparam SegIt Iterator for CSR style segments indptr
- * \tparam ValIt Iterator for values
+ * @tparam SegIt Iterator for CSR style segments indptr
+ * @tparam ValIt Iterator for values
+ * @tparam AlphaIt Iterator to alphas
  *
- * \param alpha The p^th quantile we want to compute
+ * @param alpha_it The p^th quantile we want to compute, one for each segment.
  *
- * std::distance(ptr_begin, ptr_end) should be equal to n_segments + 1
+ * std::distance(seg_begin, seg_end) should be equal to n_segments + 1
  */
-template <typename SegIt, typename ValIt>
-void SegmentedQuantile(Context const* ctx, double alpha, SegIt seg_begin, SegIt seg_end,
+template <typename SegIt, typename ValIt, typename AlphaIt,
+          typename std::enable_if_t<!std::is_floating_point<AlphaIt>::value>* = nullptr>
+void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, SegIt seg_end,
                        ValIt val_begin, ValIt val_end, HostDeviceVector<float>* quantiles) {
-  CHECK(alpha >= 0 && alpha <= 1);
-
-  dh::device_vector<size_t> sorted_idx;
-  using Tup = thrust::tuple<size_t, float>;
+  dh::device_vector<std::size_t> sorted_idx;
+  using Tup = thrust::tuple<std::size_t, float>;
   dh::SegmentedArgSort(seg_begin, seg_end, val_begin, val_end, &sorted_idx);
   auto n_segments = std::distance(seg_begin, seg_end) - 1;
   if (n_segments <= 0) {
     return;
   }

-  quantiles->SetDevice(ctx->gpu_id);
-  quantiles->Resize(n_segments);
-  auto d_results = quantiles->DeviceSpan();
   auto d_sorted_idx = dh::ToSpan(sorted_idx);
   auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));

-  dh::LaunchN(n_segments, [=] XGBOOST_DEVICE(size_t i) {
-    // each segment is the index of a leaf.
-    size_t seg_idx = i;
-    size_t begin = seg_begin[seg_idx];
-    auto n = static_cast<double>(seg_begin[seg_idx + 1] - begin);
-    if (n == 0) {
-      d_results[i] = std::numeric_limits<float>::quiet_NaN();
-      return;
-    }
-
-    if (alpha <= (1 / (n + 1))) {
-      d_results[i] = val[begin];
-      return;
-    }
-    if (alpha >= (n / (n + 1))) {
-      d_results[i] = val[common::LastOf(seg_idx, seg_begin)];
-      return;
-    }
+  quantiles->SetDevice(ctx->gpu_id);
+  quantiles->Resize(n_segments);
+  auto d_results = quantiles->DeviceSpan();

-    double x = alpha * static_cast<double>(n + 1);
-    double k = std::floor(x) - 1;
-    double d = (x - 1) - k;
+  dh::LaunchN(n_segments, ctx->CUDACtx()->Stream(),
+              detail::MakeQSegOp(seg_begin, val, alpha_it, d_results));
+}

-    auto v0 = val[begin + static_cast<size_t>(k)];
-    auto v1 = val[begin + static_cast<size_t>(k) + 1];
-    d_results[seg_idx] = v0 + d * (v1 - v0);
-  });
+/**
+ * @brief Compute segmented quantile on GPU.
+ *
+ * @tparam SegIt Iterator for CSR style segments indptr
+ * @tparam ValIt Iterator for values
+ *
+ * @param alpha The p^th quantile we want to compute
+ *
+ * std::distance(seg_begin, seg_end) should be equal to n_segments + 1
+ */
+template <typename SegIt, typename ValIt>
+void SegmentedQuantile(Context const* ctx, double alpha, SegIt seg_begin, SegIt seg_end,
+                       ValIt val_begin, ValIt val_end, HostDeviceVector<float>* quantiles) {
+  CHECK(alpha >= 0 && alpha <= 1);
+  auto alpha_it = thrust::make_constant_iterator(alpha);
+  return SegmentedQuantile(ctx, alpha_it, seg_begin, seg_end, val_begin, val_end, quantiles);
 }

-template <typename SegIt, typename ValIt, typename WIter>
-void SegmentedWeightedQuantile(Context const* ctx, double alpha, SegIt seg_beg, SegIt seg_end,
+/**
+ * @brief Compute segmented quantile on GPU with weighted inputs.
+ *
+ * @tparam SegIt Iterator for CSR style segments indptr
+ * @tparam ValIt Iterator for values
+ * @tparam WIter Iterator for weights
+ *
+ * @param alpha_it Iterator for the p^th quantile we want to compute, one per segment
+ * @param w_begin Iterator for the weight of each input element
+ */
+template <typename SegIt, typename ValIt, typename AlphaIt, typename WIter,
+          typename std::enable_if_t<
+              !std::is_same<typename std::iterator_traits<AlphaIt>::value_type, void>::value>* =
+              nullptr>
+void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_beg, SegIt seg_end,
                                ValIt val_begin, ValIt val_end, WIter w_begin, WIter w_end,
                                HostDeviceVector<float>* quantiles) {
-  CHECK(alpha >= 0 && alpha <= 1);
-  dh::device_vector<size_t> sorted_idx;
+  auto cuctx = ctx->CUDACtx();
+  dh::device_vector<std::size_t> sorted_idx;
   dh::SegmentedArgSort(seg_beg, seg_end, val_begin, val_end, &sorted_idx);
   auto d_sorted_idx = dh::ToSpan(sorted_idx);
-  size_t n_weights = std::distance(w_begin, w_end);
+  std::size_t n_weights = std::distance(w_begin, w_end);
   dh::device_vector<float> weights_cdf(n_weights);
+  std::size_t n_elems = std::distance(val_begin, val_end);
+  CHECK_EQ(n_weights, n_elems);

   dh::XGBCachingDeviceAllocator<char> caching;
-  auto scan_key = dh::MakeTransformIterator<size_t>(
-      thrust::make_counting_iterator(0ul),
-      [=] XGBOOST_DEVICE(size_t i) { return dh::SegmentId(seg_beg, seg_end, i); });
-  auto scan_val = dh::MakeTransformIterator<float>(
-      thrust::make_counting_iterator(0ul),
-      [=] XGBOOST_DEVICE(size_t i) { return w_begin[d_sorted_idx[i]]; });
+  auto scan_key = dh::MakeTransformIterator<std::size_t>(thrust::make_counting_iterator(0ul),
+                                                         detail::SegOp<SegIt>{seg_beg, seg_end});
+  auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
+                                                   detail::WeightOp<WIter>{w_begin, d_sorted_idx});
   thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
                                 scan_val, weights_cdf.begin());
@@ -103,24 +224,18 @@ void SegmentedWeightedQuantile(Context const* ctx, double alpha, SegIt seg_beg,
   auto d_results = quantiles->DeviceSpan();
   auto d_weight_cdf = dh::ToSpan(weights_cdf);

-  dh::LaunchN(n_segments, [=] XGBOOST_DEVICE(size_t i) {
-    size_t seg_idx = i;
-    size_t begin = seg_beg[seg_idx];
-    auto n = static_cast<double>(seg_beg[seg_idx + 1] - begin);
-    if (n == 0) {
-      d_results[i] = std::numeric_limits<float>::quiet_NaN();
-      return;
-    }
-    auto leaf_cdf = d_weight_cdf.subspan(begin, static_cast<size_t>(n));
-    auto leaf_sorted_idx = d_sorted_idx.subspan(begin, static_cast<size_t>(n));
-    float thresh = leaf_cdf.back() * alpha;
-
-    size_t idx = thrust::lower_bound(thrust::seq, leaf_cdf.data(),
-                                     leaf_cdf.data() + leaf_cdf.size(), thresh) -
-                 leaf_cdf.data();
-    idx = std::min(idx, static_cast<size_t>(n - 1));
-    d_results[i] = val_begin[leaf_sorted_idx[idx]];
-  });
+  thrust::for_each_n(
+      cuctx->CTP(), thrust::make_counting_iterator(0ul), n_segments,
+      detail::MakeWQSegOp(seg_beg, val_begin, alpha_it, d_weight_cdf, d_sorted_idx, d_results));
+}
+
+template <typename SegIt, typename ValIt, typename WIter>
+void SegmentedWeightedQuantile(Context const* ctx, double alpha, SegIt seg_beg, SegIt seg_end,
+                               ValIt val_begin, ValIt val_end, WIter w_begin, WIter w_end,
+                               HostDeviceVector<float>* quantiles) {
+  CHECK(alpha >= 0 && alpha <= 1);
+  return SegmentedWeightedQuantile(ctx, thrust::make_constant_iterator(alpha), seg_beg, seg_end,
+                                   val_begin, val_end, w_begin, w_end, quantiles);
 }
 }  // namespace common
 }  // namespace xgboost
diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu
index e71ec3efcb60..8643e75a721f 100644
--- a/tests/cpp/common/test_stats.cu
+++ b/tests/cpp/common/test_stats.cu
@@ -1,79 +1,155 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
  */
 #include <gtest/gtest.h>

-#include <utility>
-#include <vector>
+#include <cstddef>  // std::size_t
+#include <utility>  // std::pair
+#include <vector>   // std::vector

+#include "../../../src/common/linalg_op.cuh"  // ElementWiseTransformDevice
 #include "../../../src/common/stats.cuh"
-#include "../../../src/common/stats.h"
-#include "xgboost/base.h"
-#include "xgboost/context.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/linalg.h"
+#include "xgboost/base.h"                // XGBOOST_DEVICE
+#include "xgboost/context.h"             // Context
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/linalg.h"              // Tensor

 namespace xgboost {
 namespace common {
 namespace {
 class StatsGPU : public ::testing::Test {
  private:
-  linalg::Tensor<float, 1> arr_{
-      {1.f, 2.f, 3.f, 4.f, 5.f,
-       2.f, 4.f, 5.f, 3.f, 1.f},
-      {10}, 0};
-  linalg::Tensor<size_t, 1> indptr_{{0, 5, 10}, {3}, 0};
-  HostDeviceVector<float> resutls_;
+  linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0};
+  linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, 0};
+  HostDeviceVector<float> results_;
   using TestSet = std::vector<std::pair<float, float>>;
   Context ctx_;

   void Check(float expected) {
-    auto const& h_results = resutls_.HostVector();
+    auto const& h_results = results_.HostVector();
     ASSERT_EQ(h_results.size(), indptr_.Size() - 1);
     ASSERT_EQ(h_results.front(), expected);
-    EXPECT_EQ(h_results.back(), expected);
+    ASSERT_EQ(h_results.back(), expected);
   }

  public:
   void SetUp() override { ctx_.gpu_id = 0; }
+
+  void WeightedMulti() {
+    // data for one segment
+    std::vector<float> seg{1.f, 2.f, 3.f, 4.f, 5.f};
+    auto seg_size = seg.size();
+
+    // 3 segments
+    std::vector<float> data;
+    data.insert(data.cend(), seg.begin(), seg.end());
+    data.insert(data.cend(), seg.begin(), seg.end());
+    data.insert(data.cend(), seg.begin(), seg.end());
+    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
+    auto d_arr = arr.View(0);
+
+    auto key_it = dh::MakeTransformIterator<std::size_t>(
+        thrust::make_counting_iterator(0ul),
+        [=] XGBOOST_DEVICE(std::size_t i) { return i * seg_size; });
+    auto val_it =
+        dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
+                                         [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
+
+    // one alpha for each segment
+    HostDeviceVector<float> alphas{0.0f, 0.5f, 1.0f};
+    alphas.SetDevice(0);
+    auto d_alphas = alphas.ConstDeviceSpan();
+    auto w_it = thrust::make_constant_iterator(0.1f);
+    SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
+                              val_it + d_arr.Size(), w_it, w_it + d_arr.Size(), &results_);
+
+    auto const& h_results = results_.HostVector();
+    ASSERT_EQ(1.0f, h_results[0]);
+    ASSERT_EQ(3.0f, h_results[1]);
+    ASSERT_EQ(5.0f, h_results[2]);
+  }
+
   void Weighted() {
     auto d_arr = arr_.View(0);
     auto d_key = indptr_.View(0);

-    auto key_it = dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0ul),
-                                                    [=] __device__(size_t i) { return d_key(i); });
-    auto val_it = dh::MakeTransformIterator<float>(
-        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) { return d_arr(i); });
+    auto key_it = dh::MakeTransformIterator<std::size_t>(
+        thrust::make_counting_iterator(0ul),
+        [=] XGBOOST_DEVICE(std::size_t i) { return d_key(i); });
+    auto val_it =
+        dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
+                                         [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
     linalg::Tensor<float, 1> weights{{10}, 0};
     linalg::ElementWiseTransformDevice(weights.View(0),
-                                       [=] XGBOOST_DEVICE(size_t, float) { return 1.0; });
+                                       [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; });
     auto w_it = weights.Data()->ConstDevicePointer();
     for (auto const& pair : TestSet{{0.0f, 1.0f}, {0.5f, 3.0f}, {1.0f, 5.0f}}) {
       SegmentedWeightedQuantile(&ctx_, pair.first, key_it, key_it + indptr_.Size(), val_it,
-                                val_it + arr_.Size(), w_it, w_it + weights.Size(), &resutls_);
+                                val_it + arr_.Size(), w_it, w_it + weights.Size(), &results_);
       this->Check(pair.second);
     }
   }

+  void NonWeightedMulti() {
+    // data for one segment
+    std::vector<float> seg{20.f, 15.f, 50.f, 40.f, 35.f};
+    auto seg_size = seg.size();
+
+    // 3 segments
+    std::vector<float> data;
+    data.insert(data.cend(), seg.begin(), seg.end());
+    data.insert(data.cend(), seg.begin(), seg.end());
+    data.insert(data.cend(), seg.begin(), seg.end());
+    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
+    auto d_arr = arr.View(0);
+
+    auto key_it = dh::MakeTransformIterator<std::size_t>(
+        thrust::make_counting_iterator(0ul),
+        [=] XGBOOST_DEVICE(std::size_t i) { return i * seg_size; });
+    auto val_it =
+        dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
+                                         [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
+
+    // one alpha for each segment
+    HostDeviceVector<float> alphas{0.1f, 0.2f, 0.4f};
+    alphas.SetDevice(0);
+    auto d_alphas = alphas.ConstDeviceSpan();
+    SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
+                      val_it + d_arr.Size(), &results_);
+
+    auto const& h_results = results_.HostVector();
+    ASSERT_EQ(15.0f, h_results[0]);
+    ASSERT_EQ(16.0f, h_results[1]);
+    ASSERT_EQ(26.0f, h_results[2]);
+  }
+
   void NonWeighted() {
     auto d_arr = arr_.View(0);
     auto d_key = indptr_.View(0);

-    auto key_it = dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0ul),
-                                                    [=] __device__(size_t i) { return d_key(i); });
-    auto val_it = dh::MakeTransformIterator<float>(
-        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) { return d_arr(i); });
+    auto key_it = dh::MakeTransformIterator<std::size_t>(
+        thrust::make_counting_iterator(0ul), [=] __device__(std::size_t i) { return d_key(i); });
+    auto val_it =
+        dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
+                                         [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });

     for (auto const& pair : TestSet{{0.0f, 1.0f}, {0.5f, 3.0f}, {1.0f, 5.0f}}) {
       SegmentedQuantile(&ctx_, pair.first, key_it, key_it + indptr_.Size(), val_it,
-                        val_it + arr_.Size(), &resutls_);
+                        val_it + arr_.Size(), &results_);
       this->Check(pair.second);
     }
   }
 };
 }  // anonymous namespace

-TEST_F(StatsGPU, Quantile) { this->NonWeighted(); }
-TEST_F(StatsGPU, WeightedQuantile) { this->Weighted(); }
+TEST_F(StatsGPU, Quantile) {
+  this->NonWeighted();
+  this->NonWeightedMulti();
+}
+
+TEST_F(StatsGPU, WeightedQuantile) {
+  this->Weighted();
+  this->WeightedMulti();
+}
 }  // namespace common
 }  // namespace xgboost
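
Notes (reviewer sketches, appended for context; not part of the patch):

The unweighted path in QuantileSegmentOp uses the alpha * (n + 1) plotting position: alphas at or below the first position return the segment minimum, alphas at or above the last return the maximum, and anything in between interpolates linearly between the two neighbouring order statistics. A minimal host-side C++ sketch of the same rule, checked against the expectations in NonWeightedMulti (SegmentQuantileRef is a hypothetical name used for illustration):

#include <cassert>
#include <cmath>    // std::abs, std::floor
#include <cstddef>  // std::size_t
#include <vector>

// Quantile of one already-sorted segment, mirroring QuantileSegmentOp: the
// sample point is x = alpha * (n + 1); alphas below/above the first/last
// plotting position clamp to the segment ends, anything in between is a
// linear interpolation of the two neighbouring order statistics.
double SegmentQuantileRef(std::vector<double> const& sorted, double alpha) {
  auto n = static_cast<double>(sorted.size());
  if (alpha <= 1.0 / (n + 1.0)) {
    return sorted.front();
  }
  if (alpha >= n / (n + 1.0)) {
    return sorted.back();
  }
  double x = alpha * (n + 1.0);
  double k = std::floor(x) - 1.0;  // zero-based index of the lower neighbour
  double d = (x - 1.0) - k;        // fractional weight of the upper neighbour
  auto v0 = sorted[static_cast<std::size_t>(k)];
  auto v1 = sorted[static_cast<std::size_t>(k) + 1];
  return v0 + d * (v1 - v0);
}

int main() {
  // The segment from NonWeightedMulti, pre-sorted the way SegmentedArgSort
  // would order it: {20, 15, 50, 40, 35} -> {15, 20, 35, 40, 50}.
  std::vector<double> seg{15, 20, 35, 40, 50};
  assert(std::abs(SegmentQuantileRef(seg, 0.1) - 15.0) < 1e-9);
  assert(std::abs(SegmentQuantileRef(seg, 0.2) - 16.0) < 1e-9);
  assert(std::abs(SegmentQuantileRef(seg, 0.4) - 26.0) < 1e-9);
  return 0;
}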
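The weighted path replaces interpolation with a CDF threshold: inclusive_scan_by_key builds a per-segment weight CDF over the value-sorted order, and lower_bound picks the first position whose cumulative weight reaches alpha * total_weight. A host-side sketch of one segment under the same assumptions, reproducing the WeightedMulti expectations (SegmentWeightedQuantileRef is again a hypothetical name):

#include <algorithm>  // std::lower_bound, std::min
#include <cassert>
#include <cstddef>  // std::size_t
#include <numeric>  // std::partial_sum
#include <vector>

// Weighted quantile of one segment, mirroring WeightedQuantileSegOp: scan the
// weights in value-sorted order into an inclusive CDF, then take the value at
// the first position whose cumulative weight is >= alpha * total_weight.
float SegmentWeightedQuantileRef(std::vector<float> const& sorted_vals,
                                 std::vector<float> const& sorted_weights, double alpha) {
  std::vector<float> cdf(sorted_weights.size());
  std::partial_sum(sorted_weights.begin(), sorted_weights.end(), cdf.begin());
  double thresh = cdf.back() * alpha;
  std::size_t idx = std::lower_bound(cdf.begin(), cdf.end(), thresh) - cdf.begin();
  idx = std::min(idx, cdf.size() - 1);  // clamp, like the device functor
  return sorted_vals[idx];
}

int main() {
  // The setup from WeightedMulti: uniform 0.1 weights over a sorted segment.
  std::vector<float> vals{1.f, 2.f, 3.f, 4.f, 5.f};
  std::vector<float> weights(vals.size(), 0.1f);
  assert(SegmentWeightedQuantileRef(vals, weights, 0.0) == 1.f);
  assert(SegmentWeightedQuantileRef(vals, weights, 0.5) == 3.f);
  assert(SegmentWeightedQuantileRef(vals, weights, 1.0) == 5.f);
  return 0;
}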
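Both paths address elements through CSR-style indptr segments, and SegOp relies on dh::SegmentId to map a flat element index back to its owning segment for the keyed scan. Assuming SegmentId is the usual binary search over indptr (the helper itself is not shown in this patch), a host-side equivalent looks like:

#include <algorithm>  // std::upper_bound
#include <cassert>
#include <cstddef>  // std::size_t
#include <vector>

// CSR-style indptr: segment s owns the half-open range [indptr[s], indptr[s+1]).
// Host-side sketch of mapping a flat element index to its segment id.
std::size_t SegmentIdRef(std::vector<std::size_t> const& indptr, std::size_t i) {
  // The first entry strictly greater than i marks the segment after i's owner.
  return std::upper_bound(indptr.begin(), indptr.end(), i) - indptr.begin() - 1;
}

int main() {
  std::vector<std::size_t> indptr{0, 5, 10};  // two segments of five elements
  assert(SegmentIdRef(indptr, 0) == 0);
  assert(SegmentIdRef(indptr, 4) == 0);
  assert(SegmentIdRef(indptr, 5) == 1);
  assert(SegmentIdRef(indptr, 9) == 1);
  return 0;
}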