rapidsai · rapids-bot · Aug 21, 2023 · Aug 1, 2023 · Aug 1, 2023 · Aug 3, 2023
@@ -30,6 +30,7 @@ static void bench_minhash(nvbench::state& state)
   auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
   auto const hash_width = static_cast<cudf::size_type>(state.get_int64("hash_width"));
   auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
+  auto const base64     = state.get_int64("hash_type") == 64;
 
   if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
       static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
@@ -44,9 +45,9 @@ static void bench_minhash(nvbench::state& state)
 
   data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution(
     cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, row_width);
-  auto const seeds_table = create_random_table(
-    {cudf::type_to_id<cudf::hash_value_type>()}, row_count{seed_count}, seeds_profile);
-  auto seeds = seeds_table->get_column(0);
+  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
+  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
+  auto seeds             = seeds_table->get_column(0);
   seeds.set_null_mask(rmm::device_buffer{}, 0);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -56,13 +57,15 @@ static void bench_minhash(nvbench::state& state)
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = nvtext::minhash(input, seeds.view(), hash_width);
+    auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width)
+                         : nvtext::minhash(input, seeds.view(), hash_width);
   });
 }
 
 NVBENCH_BENCH(bench_minhash)
   .set_name("minhash")
-  .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
+  .add_int64_axis("num_rows", {1024, 8192, 16364, 131072})
   .add_int64_axis("row_width", {128, 512, 2048})
-  .add_int64_axis("hash_width", {5, 10, 25})
-  .add_int64_axis("seed_count", {2, 26});
+  .add_int64_axis("hash_width", {5, 10})
+  .add_int64_axis("seed_count", {2, 26})
+  .add_int64_axis("hash_type", {32, 64});
@@ -36,24 +36,22 @@ namespace nvtext {
  *
  * Any null row entries result in corresponding null output rows.
  *
+ * This function uses MurmurHash3_x86_32 for the hash algorithm.
+ *
  * @throw std::invalid_argument if the width < 2
- * @throw std::invalid_argument if hash_function is not HASH_MURMUR3
  *
  * @param input Strings column to compute minhash
- * @param seed  Seed value used for the MurmurHash3_x86_32 algorithm
+ * @param seed  Seed value used for the hash algorithm
  * @param width The character width used for apply substrings;
  *              Default is 4 characters.
- * @param hash_function Hash algorithm to use;
- *                      Only HASH_MURMUR3 is currently supported.
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return Minhash values for each string in input
  */
 std::unique_ptr<cudf::column> minhash(
   cudf::strings_column_view const& input,
-  cudf::numeric_scalar<cudf::hash_value_type> seed = cudf::numeric_scalar(cudf::DEFAULT_HASH_SEED),
-  cudf::size_type width                            = 4,
-  cudf::hash_id hash_function                      = cudf::hash_id::HASH_MURMUR3,
-  rmm::mr::device_memory_resource* mr              = rmm::mr::get_current_device_resource());
+  cudf::numeric_scalar<uint32_t> seed = 0,
+  cudf::size_type width               = 4,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the minhash values for each string per seed
@@ -64,28 +62,84 @@ std::unique_ptr<cudf::column> minhash(
  * string. The order of the elements in each row match the order of
  * the seeds provided in the `seeds` parameter.
  *
+ * This function uses MurmurHash3_x86_32 for the hash algorithm.
+ *
  * Any null row entries result in corresponding null output rows.
  *
  * @throw std::invalid_argument if the width < 2
- * @throw std::invalid_argument if hash_function is not HASH_MURMUR3
  * @throw std::invalid_argument if seeds is empty
  * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
- * @param seeds Seed values used for the MurmurHash3_x86_32 algorithm
+ * @param seeds Seed values used for the hash algorithm
  * @param width The character width used for apply substrings;
  *              Default is 4 characters.
- * @param hash_function Hash algorithm to use;
- *                      Only HASH_MURMUR3 is currently supported.
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
- *         or a hash_value_type column if only a single seed is specified
+ *         or a UINT32 type column if only a single seed is specified
  */
 std::unique_ptr<cudf::column> minhash(
   cudf::strings_column_view const& input,
-  cudf::device_span<cudf::hash_value_type const> seeds,
+  cudf::device_span<uint32_t const> seeds,
+  cudf::size_type width               = 4,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns the minhash value for each string
+ *
+ * Hash values are computed from substrings of each string and the
+ * minimum hash value is returned for each string.
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ * The hash function returns 2 uint64 values but only the first value
+ * is used with the minhash calculation.
+ *
+ * @throw std::invalid_argument if the width < 2
+ *
+ * @param input Strings column to compute minhash
+ * @param seed  Seed value used for the hash algorithm
+ * @param width The character width of the substrings to hash;
+ *              Default is 4 characters.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Minhash values as UINT64 for each string in input
+ */
+std::unique_ptr<cudf::column> minhash64(
+  cudf::strings_column_view const& input,
+  cudf::numeric_scalar<uint64_t> seed = 0,
+  cudf::size_type width               = 4,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns the minhash values for each string per seed
+ *
+ * Hash values are computed from substrings of each string and the
+ * minimum hash value is returned for each string for each seed.
+ * Each row of the list column holds the seed results for the corresponding
+ * string. The order of the elements in each row matches the order of
+ * the seeds provided in the `seeds` parameter.
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if seeds is empty
+ * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seeds Seed values used for the hash algorithm
+ * @param width The character width of the substrings to hash;
+ *              Default is 4 characters.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ *         or a UINT64 type column if only a single seed is specified
+ */
+std::unique_ptr<cudf::column> minhash64(
+  cudf::strings_column_view const& input,
+  cudf::device_span<uint64_t const> seeds,
   cudf::size_type width               = 4,
-  cudf::hash_id hash_function         = cudf::hash_id::HASH_MURMUR3,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group