Skip to content

Commit

Permalink
Add minhash support for MurmurHash3_x64_128 (#13796)
Browse files Browse the repository at this point in the history
Adds `nvtext::minhash64` to libcudf and the Cython/Python changes to call it.
The `MurmurHash3_x64_128` is called and only the first `uint64` value is used.

The libcudf API was changed to remove the `hash_id` parameter since it was incompatible with the seed types.

Authors:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #13796
  • Loading branch information
davidwendt authored Aug 21, 2023
1 parent c2f2167 commit 261bcb2
Show file tree
Hide file tree
Showing 9 changed files with 434 additions and 156 deletions.
17 changes: 10 additions & 7 deletions cpp/benchmarks/text/minhash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ static void bench_minhash(nvbench::state& state)
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const hash_width = static_cast<cudf::size_type>(state.get_int64("hash_width"));
auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
auto const base64 = state.get_int64("hash_type") == 64;

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
Expand All @@ -44,9 +45,9 @@ static void bench_minhash(nvbench::state& state)

data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution(
cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, row_width);
auto const seeds_table = create_random_table(
{cudf::type_to_id<cudf::hash_value_type>()}, row_count{seed_count}, seeds_profile);
auto seeds = seeds_table->get_column(0);
auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
auto seeds = seeds_table->get_column(0);
seeds.set_null_mask(rmm::device_buffer{}, 0);

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
Expand All @@ -56,13 +57,15 @@ static void bench_minhash(nvbench::state& state)
state.add_global_memory_writes<nvbench::int32_t>(num_rows); // output are hashes

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::minhash(input, seeds.view(), hash_width);
auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width)
: nvtext::minhash(input, seeds.view(), hash_width);
});
}

NVBENCH_BENCH(bench_minhash)
.set_name("minhash")
.add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
.add_int64_axis("num_rows", {1024, 8192, 16364, 131072})
.add_int64_axis("row_width", {128, 512, 2048})
.add_int64_axis("hash_width", {5, 10, 25})
.add_int64_axis("seed_count", {2, 26});
.add_int64_axis("hash_width", {5, 10})
.add_int64_axis("seed_count", {2, 26})
.add_int64_axis("hash_type", {32, 64});
90 changes: 75 additions & 15 deletions cpp/include/nvtext/minhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,24 @@ namespace nvtext {
*
* Any null row entries result in corresponding null output rows.
*
* This function uses MurmurHash3_x86_32 for the hash algorithm.
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if hash_function is not HASH_MURMUR3
*
* @param input Strings column to compute minhash
* @param seed Seed value used for the MurmurHash3_x86_32 algorithm
* @param seed Seed value used for the hash algorithm
* @param width The character width used to form the substrings;
* Default is 4 characters.
* @param hash_function Hash algorithm to use;
* Only HASH_MURMUR3 is currently supported.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Minhash values for each string in input
*/
std::unique_ptr<cudf::column> minhash(
cudf::strings_column_view const& input,
cudf::numeric_scalar<cudf::hash_value_type> seed = cudf::numeric_scalar(cudf::DEFAULT_HASH_SEED),
cudf::size_type width = 4,
cudf::hash_id hash_function = cudf::hash_id::HASH_MURMUR3,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
cudf::numeric_scalar<uint32_t> seed = 0,
cudf::size_type width = 4,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns the minhash values for each string per seed
Expand All @@ -64,28 +64,88 @@ std::unique_ptr<cudf::column> minhash(
* string. The order of the elements in each row matches the order of
* the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x86_32 for the hash algorithm.
*
* Any null row entries result in corresponding null output rows.
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if hash_function is not HASH_MURMUR3
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the MurmurHash3_x86_32 algorithm
* @param seeds Seed values used for the hash algorithm
* @param width The character width used to form the substrings;
* Default is 4 characters.
* @param hash_function Hash algorithm to use;
* Only HASH_MURMUR3 is currently supported.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each string per seed
* or a hash_value_type column if only a single seed is specified
*/
std::unique_ptr<cudf::column> minhash(
cudf::strings_column_view const& input,
cudf::device_span<cudf::hash_value_type const> seeds,
cudf::device_span<uint32_t const> seeds,
cudf::size_type width = 4,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns the minhash value for each string
*
* Hash values are computed from substrings of each string and the
* minimum hash value is returned for each string.
*
* Any null row entry results in a corresponding null output row.
*
* This function uses MurmurHash3_x64_128 for the hash algorithm.
* The hash function returns 2 uint64 values but only the first value
* is used in the minhash calculation.
*
* @throw std::invalid_argument if the width < 2
*
* @param input Strings column to compute minhash
* @param seed Seed value used for the hash algorithm;
* Default is 0.
* @param width The character width of the substrings that are hashed;
* Default is 4 characters.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Minhash values as UINT64 for each string in input
*/
std::unique_ptr<cudf::column> minhash64(
cudf::strings_column_view const& input,
cudf::numeric_scalar<uint64_t> seed = 0,
cudf::size_type width = 4,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns the minhash values for each string per seed
*
* Hash values are computed from substrings of each string and the
* minimum hash value is returned for each string for each seed.
* Each row of the list column holds the seed results for the corresponding
* string. The order of the elements in each row matches the order of
* the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x64_128 for the hash algorithm.
*
* Any null row entry results in a corresponding null output row.
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the hash algorithm
* @param width The character width of the substrings that are hashed;
* Default is 4 characters.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each string per seed
*
* NOTE(review): the declaration below also lists a `hash_function` parameter
* that has no `@param` entry here -- confirm whether that parameter is intended.
*/
std::unique_ptr<cudf::column> minhash64(
cudf::strings_column_view const& input,
cudf::device_span<uint64_t const> seeds,
cudf::size_type width = 4,
cudf::hash_id hash_function = cudf::hash_id::HASH_MURMUR3,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
Loading

0 comments on commit 261bcb2

Please sign in to comment.