Skip to content

Commit

Permalink
minor performance tuning
Browse files Browse the repository at this point in the history
  • Loading branch information
seunghwak committed Oct 26, 2024
1 parent 8a78131 commit aa13925
Show file tree
Hide file tree
Showing 5 changed files with 435 additions and 343 deletions.
46 changes: 46 additions & 0 deletions cpp/include/cugraph/edge_partition_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return dcs_nzd_vertices_ ? thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand Down Expand Up @@ -257,6 +258,10 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
raft::device_span<size_t> count /* size = 1 */,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) {
RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
}

rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
size_t tmp_storage_bytes{0};

Expand Down Expand Up @@ -368,6 +373,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return dcs_nzd_vertices_ ? thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand Down Expand Up @@ -627,6 +633,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand All @@ -643,6 +650,44 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
thrust::plus<size_t>());
}

template <typename MajorIterator>
__host__ void compute_number_of_edges_async(MajorIterator major_first,
MajorIterator major_last,
raft::device_span<size_t> count /* size = 1 */,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) {
RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
}

rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
size_t tmp_storage_bytes{0};

auto local_degree_first = thrust::make_transform_iterator(
major_first,
detail::local_degree_op_t<vertex_t,
edge_t,
size_t /* no limit on majors.size(), so edge_t can overflow */,
multi_gpu,
false>{this->offsets_,
std::byte{0} /* dummy */,
std::byte{0} /* dummy */,
std::byte{0} /* dummy */});
cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
d_tmp_storage.resize(tmp_storage_bytes, stream);
cub::DeviceReduce::Sum(d_tmp_storage.data(),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
}

__host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
Expand Down Expand Up @@ -682,6 +727,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand Down
74 changes: 46 additions & 28 deletions cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -750,14 +750,18 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
}
auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx);
if (segment_offsets) {
key_segment_offsets = compute_key_segment_offsets(
frontier_key_first,
frontier_key_last,
raft::host_span<vertex_t const>((*segment_offsets).data(), (*segment_offsets).size()),
graph_view.local_vertex_partition_range_first(),
handle.get_stream());
(*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1);
frontier_key_last = frontier_key_first + (*key_segment_offsets).back();
if (thrust::distance(frontier_key_first, frontier_key_last) > 0) {
key_segment_offsets = compute_key_segment_offsets(
frontier_key_first,
frontier_key_last,
raft::host_span<vertex_t const>((*segment_offsets).data(), (*segment_offsets).size()),
graph_view.local_vertex_partition_range_first(),
handle.get_stream());
(*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1);
frontier_key_last = frontier_key_first + (*key_segment_offsets).back();
} else {
key_segment_offsets = std::vector<size_t>((*segment_offsets).size(), 0);
}
}
}

Expand Down Expand Up @@ -931,10 +935,16 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
(range_size > 0) ? (num_keys / static_cast<double>(range_size)) : double{0.0};
}
avg_fill_ratio /= static_cast<double>(minor_comm_size);

constexpr double threshold_ratio =
8.0 /* tuning parameter */ / static_cast<double>(sizeof(vertex_t) * 8);
if (avg_fill_ratio > threshold_ratio) {
auto avg_frontier_size =
std::reduce(local_frontier_sizes.begin(), local_frontier_sizes.end()) /
static_cast<vertex_t>(minor_comm_size);

if ((avg_fill_ratio > threshold_ratio) &&
(static_cast<size_t>(avg_frontier_size) >
packed_bools_per_word() *
32 /* tuning parameter, to consider additional kernel launch overhead */)) {
frontier_bitmap =
compute_vertex_list_bitmap_info(frontier_key_first,
frontier_key_last,
Expand Down Expand Up @@ -972,18 +982,21 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
auto max_tmp_buffer_size = static_cast<size_t>(
static_cast<double>(handle.get_device_properties().totalGlobalMem) * 0.2);

auto aggregate_major_range_size = host_scalar_allreduce(
comm,
static_cast<size_t>(thrust::distance(frontier_key_first, frontier_key_last)),
raft::comms::op_t::SUM,
handle.get_stream());
auto aggregate_max_pushes = host_scalar_allreduce(
comm,
local_max_pushes,
raft::comms::op_t::SUM,
handle.get_stream()); // this is approximate as we only consider local edges for
// [frontier_key_first, frontier_key_last), note that neighbor lists
// are partitioned if minor_comm_size > 1
size_t aggregate_major_range_size{};
size_t aggregate_max_pushes{}; // this is approximate as we only consider local edges for
// [frontier_key_first, frontier_key_last), note that neighbor
// lists are partitioned if minor_comm_size > 1
{
auto tmp = host_scalar_allreduce(
comm,
thrust::make_tuple(
static_cast<size_t>(thrust::distance(frontier_key_first, frontier_key_last)),
local_max_pushes),
raft::comms::op_t::SUM,
handle.get_stream());
aggregate_major_range_size = thrust::get<0>(tmp);
aggregate_max_pushes = thrust::get<1>(tmp);
}

size_t key_size{0};
if constexpr (std::is_arithmetic_v<key_t>) {
Expand Down Expand Up @@ -1290,6 +1303,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
}
}
}
#if EXTRACT_PERFORMANCE_MEASUREMENT
if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); }
auto subtime2 = std::chrono::steady_clock::now();
#endif

if (key_segment_offset_vectors) {
for (size_t j = 0; j < loop_count; ++j) {
Expand Down Expand Up @@ -1377,7 +1394,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
}
if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); }
#if EXTRACT_PERFORMANCE_MEASUREMENT
auto subtime2 = std::chrono::steady_clock::now();
auto subtime3 = std::chrono::steady_clock::now();
#endif

thrust::fill(
Expand Down Expand Up @@ -1501,14 +1518,14 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,

if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); }
#if EXTRACT_PERFORMANCE_MEASUREMENT
auto subtime3 = std::chrono::steady_clock::now();
auto subtime4 = std::chrono::steady_clock::now();
#endif

std::vector<size_t> h_counts(loop_count);
raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream());
handle.sync_stream();
#if EXTRACT_PERFORMANCE_MEASUREMENT
auto subtime4 = std::chrono::steady_clock::now();
auto subtime5 = std::chrono::steady_clock::now();
#endif

for (size_t j = 0; j < loop_count; ++j) {
Expand All @@ -1535,15 +1552,16 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
}
if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); }
#if EXTRACT_PERFORMANCE_MEASUREMENT
auto subtime5 = std::chrono::steady_clock::now();
auto subtime6 = std::chrono::steady_clock::now();
std::chrono::duration<double> subdur0 = subtime1 - subtime0;
std::chrono::duration<double> subdur1 = subtime2 - subtime1;
std::chrono::duration<double> subdur2 = subtime3 - subtime2;
std::chrono::duration<double> subdur3 = subtime4 - subtime3;
std::chrono::duration<double> subdur4 = subtime5 - subtime4;
std::chrono::duration<double> subdur5 = subtime6 - subtime5;
std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << ","
<< subdur2.count() << "," << subdur3.count() << "," << subdur4.count()
<< ") loop_count=" << loop_count << std::endl;
<< subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << ","
<< subdur5.count() << ") loop_count=" << loop_count << std::endl;
#endif
}
#if EXTRACT_PERFORMANCE_MEASUREMENT
Expand Down
10 changes: 8 additions & 2 deletions cpp/src/prims/detail/per_v_transform_reduce_e.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1944,11 +1944,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
(range_size > 0) ? (num_keys / static_cast<double>(range_size)) : double{0.0};
}
avg_fill_ratio /= static_cast<double>(minor_comm_size);

double threshold_ratio =
2.0 /* tuning parameter (consider that we need to reprodce vertex list from bitmap)*/ /
static_cast<double>((v_compressible ? sizeof(uint32_t) : sizeof(vertex_t)) * 8);
if (avg_fill_ratio > threshold_ratio) {
auto avg_key_list_size =
std::reduce(local_key_list_sizes.begin(), local_key_list_sizes.end()) /
static_cast<vertex_t>(minor_comm_size);

if ((avg_fill_ratio > threshold_ratio) &&
(static_cast<size_t>(avg_key_list_size) >
packed_bools_per_word() *
32 /* tuning parameter, to considerr additional kernel launch overhead */)) {
v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first,
sorted_unique_nzd_key_last,
local_v_list_range_firsts[minor_comm_rank],
Expand Down
Loading

0 comments on commit aa13925

Please sign in to comment.