Skip to content

Commit 4483b87

Browse files
authored
Performance improvement for nvtext::minhash (#13333)
Improves performance of `nvtext::minhash` by minimizing character counting in the internal logic. The MinHash strings are expected to be very long (`> 1KB`). The improvement is measured to be up to 2x. Authors: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Mark Harris (https://github.com/harrism) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: #13333
1 parent 89feac7 commit 4483b87

File tree

1 file changed

+13
-13
lines changed

1 file changed

+13
-13
lines changed

cpp/src/text/minhash.cu

+13-13
Original file line numberDiff line numberDiff line change
@@ -74,20 +74,20 @@ struct minhash_fn {
7474
}
7575
__syncwarp();
7676

77-
auto const begin = d_str.begin() + lane_idx;
78-
auto const end = [d_str, width = width] {
79-
auto const length = d_str.length();
80-
if (length > width) { return (d_str.end() - (width - 1)); }
81-
return d_str.begin() + static_cast<cudf::size_type>(length > 0);
82-
}();
83-
84-
// each lane hashes substrings of the given width
85-
for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
86-
auto const offset = itr.byte_offset();
87-
auto const hash_str =
88-
cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset);
77+
auto const begin = d_str.data() + lane_idx;
78+
auto const end = d_str.data() + d_str.size_bytes();
8979

90-
// hashing each seed on the same section of string is 10x faster than
80+
// each lane hashes 'width' substrings of d_str
81+
for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
82+
if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
83+
auto const check_str = // used for counting 'width' characters
84+
cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
85+
auto const [bytes, left] =
86+
cudf::strings::detail::bytes_to_character_position(check_str, width);
87+
if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string
88+
89+
auto const hash_str = cudf::string_view(itr, bytes);
90+
// hashing with each seed on the same section of the string is 10x faster than
9191
// computing the substrings for each seed
9292
for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
9393
auto const hasher = cudf::detail::MurmurHash3_32<cudf::string_view>{seeds[seed_idx]};

0 commit comments

Comments
 (0)