Update primitive to compute weighted Jaccard, Sorensen and Overlap similarity (#3728)

This PR
- changes `per_v_pair_transform_dst_nbr_intersection` to support computing weighted intersection
- updates the implementation of `similarity`, `jaccard_coefficients`, `sorensen_coefficients`, and `overlap_coefficients` for weighted graphs

NOTE: the current implementation doesn't support computing similarity for multi-edge graphs.

closes #2748
closes #3477

Authors:
- Naim (https://github.com/naimnv)

Approvers:
- Chuck Hastings (https://github.com/ChuckHastings)
- Seunghwa Kang (https://github.com/seunghwak)
- Joseph Nke (https://github.com/jnke2016)

URL: #3728
rapidsai · Jul 31, 2023 · 14862c6 · 14862c6
1 parent be2a63e
commit 14862c6
Show file tree

Hide file tree

Showing 18 changed files with 2,202 additions and 305 deletions.
diff --git a/cpp/src/c_api/similarity.cpp b/cpp/src/c_api/similarity.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

diff --git a/cpp/src/link_prediction/jaccard_impl.cuh b/cpp/src/link_prediction/jaccard_impl.cuh
@@ -24,22 +24,15 @@ namespace cugraph {
 namespace detail {
 
 struct jaccard_functor_t {
-  template <typename weight_t>
-  weight_t __device__ compute_score(weight_t cardinality_a,
-                                    weight_t cardinality_b,
-                                    weight_t cardinality_a_intersect_b) const
-  {
-    return cardinality_a_intersect_b / (cardinality_a + cardinality_b - cardinality_a_intersect_b);
-  }
-};
-
-struct weighted_jaccard_functor_t {
   template <typename weight_t>
   weight_t __device__ compute_score(weight_t weight_a,
                                     weight_t weight_b,
-                                    weight_t min_weight_a_intersect_b) const
+                                    weight_t weight_a_intersect_b,
+                                    weight_t weight_a_union_b) const
   {
-    return min_weight_a_intersect_b / (weight_a + weight_b - min_weight_a_intersect_b);
+    return weight_a_union_b <= std::numeric_limits<weight_t>::min()
+             ? weight_t{0}
+             : weight_a_intersect_b / weight_a_union_b;
   }
 };
 
@@ -55,20 +48,12 @@ rmm::device_uvector<weight_t> jaccard_coefficients(
 {
   CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
-  if (!edge_weight_view)
-    return detail::similarity(handle,
-                              graph_view,
-                              edge_weight_view,
-                              vertex_pairs,
-                              detail::jaccard_functor_t{},
-                              do_expensive_check);
-  else
-    return detail::similarity(handle,
-                              graph_view,
-                              edge_weight_view,
-                              vertex_pairs,
-                              detail::weighted_jaccard_functor_t{},
-                              do_expensive_check);
+  return detail::similarity(handle,
+                            graph_view,
+                            edge_weight_view,
+                            vertex_pairs,
+                            detail::jaccard_functor_t{},
+                            do_expensive_check);
 }
 
 }  // namespace cugraph
diff --git a/cpp/src/link_prediction/overlap_impl.cuh b/cpp/src/link_prediction/overlap_impl.cuh
@@ -24,22 +24,15 @@ namespace cugraph {
 namespace detail {
 
 struct overlap_functor_t {
-  template <typename weight_t>
-  weight_t __device__ compute_score(weight_t cardinality_a,
-                                    weight_t cardinality_b,
-                                    weight_t cardinality_a_intersect_b) const
-  {
-    return cardinality_a_intersect_b / std::min(cardinality_a, cardinality_b);
-  }
-};
-
-struct weighted_overlap_functor_t {
   template <typename weight_t>
   weight_t __device__ compute_score(weight_t weight_a,
                                     weight_t weight_b,
-                                    weight_t min_weight_a_intersect_b) const
+                                    weight_t weight_a_intersect_b,
+                                    weight_t weight_a_union_b) const
   {
-    return min_weight_a_intersect_b / std::min(weight_a, weight_b);
+    return std::min(weight_a, weight_b) <= std::numeric_limits<weight_t>::min()
+             ? weight_t{0}
+             : weight_a_intersect_b / std::min(weight_a, weight_b);
   }
 };
 
@@ -55,20 +48,12 @@ rmm::device_uvector<weight_t> overlap_coefficients(
 {
   CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
-  if (!edge_weight_view)
-    return detail::similarity(handle,
-                              graph_view,
-                              edge_weight_view,
-                              vertex_pairs,
-                              detail::overlap_functor_t{},
-                              do_expensive_check);
-  else
-    return detail::similarity(handle,
-                              graph_view,
-                              edge_weight_view,
-                              vertex_pairs,
-                              detail::weighted_overlap_functor_t{},
-                              do_expensive_check);
+  return detail::similarity(handle,
+                            graph_view,
+                            edge_weight_view,
+                            vertex_pairs,
+                            detail::overlap_functor_t{},
+                            do_expensive_check);
 }
 
 }  // namespace cugraph
diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh
@@ -15,9 +15,11 @@
  */
 #pragma once
 
+#include <prims/count_if_e.cuh>
 #include <prims/per_v_pair_transform_dst_nbr_intersection.cuh>
 #include <prims/update_edge_src_dst_property.cuh>
 
+#include <cugraph/graph_functions.hpp>
 #include <cugraph/graph_view.hpp>
 
 #include <raft/core/device_span.hpp>
@@ -51,33 +53,106 @@ rmm::device_uvector<weight_t> similarity(
   auto vertex_pairs_begin =
     thrust::make_zip_iterator(std::get<0>(vertex_pairs).data(), std::get<1>(vertex_pairs).data());
 
+  if (do_expensive_check) {
+    auto num_invalids = detail::count_invalid_vertex_pairs(
+      handle, graph_view, vertex_pairs_begin, vertex_pairs_begin + num_vertex_pairs);
+    CUGRAPH_EXPECTS(num_invalids == 0,
+                    "Invalid input arguments: there are invalid input vertex pairs.");
+
+    if (edge_weight_view) {
+      auto num_negative_edge_weights =
+        count_if_e(handle,
+                   graph_view,
+                   edge_src_dummy_property_t{}.view(),
+                   edge_dst_dummy_property_t{}.view(),
+                   *edge_weight_view,
+                   [] __device__(vertex_t, vertex_t, auto, auto, weight_t w) { return w < 0.0; });
+      CUGRAPH_EXPECTS(
+        num_negative_edge_weights == 0,
+        "Invalid input argument: input edge weights should have non-negative values.");
+    }
+  }
+
   if (edge_weight_view) {
-    // FIXME: need implementation, similar to unweighted
-    //    Use compute_out_weight_sums instead of compute_out_degrees
-    //    Sum up for each common edge compute (u,a,v): min weight ((u,a), (a,v)) and
-    //        max weight((u,a), (a,v)).
-    //    Use these to compute weighted score
-    //
-    CUGRAPH_FAIL("weighted similarity computations are not supported in this release");
+    rmm::device_uvector<weight_t> similarity_score(num_vertex_pairs, handle.get_stream());
+    rmm::device_uvector<weight_t> weighted_out_degrees =
+      compute_out_weight_sums(handle, graph_view, *edge_weight_view);
+
+    per_v_pair_transform_dst_nbr_intersection(
+      handle,
+      graph_view,
+      *edge_weight_view,
+      vertex_pairs_begin,
+      vertex_pairs_begin + num_vertex_pairs,
+      weighted_out_degrees.begin(),
+      [functor] __device__(auto a,
+                           auto b,
+                           auto weight_a,
+                           auto weight_b,
+                           auto intersection,
+                           auto intersected_properties_a,
+                           auto intersected_properties_b) {
+        weight_t sum_of_min_weight_a_intersect_b = weight_t{0};
+        weight_t sum_of_max_weight_a_intersect_b = weight_t{0};
+        weight_t sum_of_intersected_a            = weight_t{0};
+        weight_t sum_of_intersected_b            = weight_t{0};
+
+        auto pair_first = thrust::make_zip_iterator(intersected_properties_a.data(),
+                                                    intersected_properties_b.data());
+        thrust::tie(sum_of_min_weight_a_intersect_b,
+                    sum_of_max_weight_a_intersect_b,
+                    sum_of_intersected_a,
+                    sum_of_intersected_b) =
+          thrust::transform_reduce(
+            thrust::seq,
+            pair_first,
+            pair_first + intersected_properties_a.size(),
+            [] __device__(auto property_pair) {
+              auto prop_a = thrust::get<0>(property_pair);
+              auto prop_b = thrust::get<1>(property_pair);
+              return thrust::make_tuple(min(prop_a, prop_b), max(prop_a, prop_b), prop_a, prop_b);
+            },
+            thrust::make_tuple(weight_t{0}, weight_t{0}, weight_t{0}, weight_t{0}),
+            [] __device__(auto lhs, auto rhs) {
+              return thrust::make_tuple(thrust::get<0>(lhs) + thrust::get<0>(rhs),
+                                        thrust::get<1>(lhs) + thrust::get<1>(rhs),
+                                        thrust::get<2>(lhs) + thrust::get<2>(rhs),
+                                        thrust::get<3>(lhs) + thrust::get<3>(rhs));
+            });
+
+        weight_t sum_of_uniq_a = weight_a - sum_of_intersected_a;
+        weight_t sum_of_uniq_b = weight_b - sum_of_intersected_b;
+
+        sum_of_max_weight_a_intersect_b += sum_of_uniq_a + sum_of_uniq_b;
+
+        return functor.compute_score(static_cast<weight_t>(weight_a),
+                                     static_cast<weight_t>(weight_b),
+                                     static_cast<weight_t>(sum_of_min_weight_a_intersect_b),
+                                     static_cast<weight_t>(sum_of_max_weight_a_intersect_b));
+      },
+      similarity_score.begin(),
+      do_expensive_check);
+
+    return similarity_score;
   } else {
     rmm::device_uvector<weight_t> similarity_score(num_vertex_pairs, handle.get_stream());
 
-    //
-    //  Compute vertex_degree for all vertices, then distribute to each GPU.
-    //  Need to use this instead of the dummy properties below
-    //
     auto out_degrees = graph_view.compute_out_degrees(handle);
 
     per_v_pair_transform_dst_nbr_intersection(
       handle,
       graph_view,
+      cugraph::edge_dummy_property_t{}.view(),
       vertex_pairs_begin,
       vertex_pairs_begin + num_vertex_pairs,
       out_degrees.begin(),
-      [functor] __device__(auto v1, auto v2, auto v1_degree, auto v2_degree, auto intersection) {
-        return functor.compute_score(static_cast<weight_t>(v1_degree),
-                                     static_cast<weight_t>(v2_degree),
-                                     static_cast<weight_t>(intersection.size()));
+      [functor] __device__(
+        auto v1, auto v2, auto v1_degree, auto v2_degree, auto intersection, auto, auto) {
+        return functor.compute_score(
+          static_cast<weight_t>(v1_degree),
+          static_cast<weight_t>(v2_degree),
+          static_cast<weight_t>(intersection.size()),
+          static_cast<weight_t>(v1_degree + v2_degree - intersection.size()));
       },
       similarity_score.begin(),
       do_expensive_check);

diff --git a/cpp/src/link_prediction/sorensen_impl.cuh b/cpp/src/link_prediction/sorensen_impl.cuh
@@ -24,22 +24,15 @@ namespace cugraph {
 namespace detail {
 
 struct sorensen_functor_t {
-  template <typename weight_t>
-  weight_t __device__ compute_score(weight_t cardinality_a,
-                                    weight_t cardinality_b,
-                                    weight_t cardinality_a_intersect_b) const
-  {
-    return (2 * cardinality_a_intersect_b) / (cardinality_a + cardinality_b);
-  }
-};
-
-struct weighted_sorensen_functor_t {
   template <typename weight_t>
   weight_t __device__ compute_score(weight_t weight_a,
                                     weight_t weight_b,
-                                    weight_t min_weight_a_intersect_b) const
+                                    weight_t weight_a_intersect_b,
+                                    weight_t weight_a_union_b) const
   {
-    return (2 * min_weight_a_intersect_b) / (weight_a + weight_b);
+    return (weight_a + weight_b) <= std::numeric_limits<weight_t>::min()
+             ? weight_t{0}
+             : (2 * weight_a_intersect_b) / (weight_a + weight_b);
   }
 };
 
@@ -55,20 +48,12 @@ rmm::device_uvector<weight_t> sorensen_coefficients(
 {
   CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
-  if (!edge_weight_view)
-    return detail::similarity(handle,
-                              graph_view,
-                              edge_weight_view,
-                              vertex_pairs,
-                              detail::sorensen_functor_t{},
-                              do_expensive_check);
-  else
-    return detail::similarity(handle,
-                              graph_view,
-                              edge_weight_view,
-                              vertex_pairs,
-                              detail::weighted_sorensen_functor_t{},
-                              do_expensive_check);
+  return detail::similarity(handle,
+                            graph_view,
+                            edge_weight_view,
+                            vertex_pairs,
+                            detail::sorensen_functor_t{},
+                            do_expensive_check);
 }
 
 }  // namespace cugraph
diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <prims/detail/optional_dataframe_buffer.hpp>
 #include <prims/property_op_utils.cuh>
 
 #include <cugraph/edge_partition_device_view.cuh>
@@ -60,83 +61,6 @@ namespace detail {
 
 int32_t constexpr extract_transform_v_frontier_e_kernel_block_size = 512;
 
-// we cannot use thrust::iterator_traits<Iterator>::value_type if Iterator is void* (reference to
-// void is not allowed)
-template <typename Iterator, typename Enable = void>
-struct optional_dataframe_buffer_value_type_t;
-
-template <typename Iterator>
-struct optional_dataframe_buffer_value_type_t<Iterator,
-                                              std::enable_if_t<!std::is_same_v<Iterator, void*>>> {
-  using value = typename thrust::iterator_traits<Iterator>::value_type;
-};
-
-template <typename Iterator>
-struct optional_dataframe_buffer_value_type_t<Iterator,
-                                              std::enable_if_t<std::is_same_v<Iterator, void*>>> {
-  using value = void;
-};
-
-template <typename T, std::enable_if_t<std::is_same_v<T, void>>* = nullptr>
-std::byte allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream)
-{
-  return std::byte{0};  // dummy
-}
-
-template <typename T, std::enable_if_t<!std::is_same_v<T, void>>* = nullptr>
-auto allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream)
-{
-  return allocate_dataframe_buffer<T>(size, stream);
-}
-
-template <typename T, std::enable_if_t<std::is_same_v<T, void>>* = nullptr>
-void* get_optional_dataframe_buffer_begin(std::byte& optional_dataframe_buffer)
-{
-  return static_cast<void*>(nullptr);
-}
-
-template <typename T, std::enable_if_t<!std::is_same_v<T, void>>* = nullptr>
-auto get_optional_dataframe_buffer_begin(
-  std::add_lvalue_reference_t<decltype(allocate_dataframe_buffer<T>(
-    size_t{0}, rmm::cuda_stream_view{}))> optional_dataframe_buffer)
-{
-  return get_dataframe_buffer_begin(optional_dataframe_buffer);
-}
-
-template <typename T, std::enable_if_t<std::is_same_v<T, void>>* = nullptr>
-void resize_optional_dataframe_buffer(std::byte& optional_dataframe_buffer,
-                                      size_t new_buffer_size,
-                                      rmm::cuda_stream_view stream_view)
-{
-  return;
-}
-
-template <typename T, std::enable_if_t<!std::is_same_v<T, void>>* = nullptr>
-void resize_optional_dataframe_buffer(
-  std::add_lvalue_reference_t<decltype(allocate_dataframe_buffer<T>(
-    size_t{0}, rmm::cuda_stream_view{}))> optional_dataframe_buffer,
-  size_t new_buffer_size,
-  rmm::cuda_stream_view stream_view)
-{
-  return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view);
-}
-
-template <typename T, std::enable_if_t<std::is_same_v<T, void>>* = nullptr>
-void shrink_to_fit_optional_dataframe_buffer(std::byte& optional_dataframe_buffer,
-                                             rmm::cuda_stream_view stream_view)
-{
-  return;
-}
-
-template <typename T, std::enable_if_t<!std::is_same_v<T, void>>* = nullptr>
-void shrink_to_fit_optional_dataframe_buffer(
-  std::add_lvalue_reference_t<decltype(allocate_dataframe_buffer<T>(
-    size_t{0}, rmm::cuda_stream_view{}))> optional_dataframe_buffer,
-  rmm::cuda_stream_view stream_view)
-{
-  return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view);
-}
-
 template <typename e_op_result_t,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator>