Performance improvement for nvtext::minhash (#13333)

davidwendt · web-flow · commit 4483b879047e · 2023-05-16T15:30:29.000Z
Improves performance of `nvtext::minhash` by minimizing character counting in the internal logic. The MinHash strings are expected to be very long (`> 1KB`). The improvement is measured to be up to 2x. Authors: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Mark Harris (https://github.com/harrism) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: #13333
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
@@ -74,20 +74,20 @@ struct minhash_fn {
     }
     __syncwarp();
 
-    auto const begin = d_str.begin() + lane_idx;
-    auto const end   = [d_str, width = width] {
-      auto const length = d_str.length();
-      if (length > width) { return (d_str.end() - (width - 1)); }
-      return d_str.begin() + static_cast<cudf::size_type>(length > 0);
-    }();
-
-    // each lane hashes substrings of the given width
-    for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
-      auto const offset = itr.byte_offset();
-      auto const hash_str =
-        cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset);
+    auto const begin = d_str.data() + lane_idx;
+    auto const end   = d_str.data() + d_str.size_bytes();
 
-      // hashing each seed on the same section of string is 10x faster than
+    // each lane hashes 'width'  substrings of d_str
+    for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
+      if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
+      auto const check_str =  // used for counting 'width' characters
+        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+      auto const [bytes, left] =
+        cudf::strings::detail::bytes_to_character_position(check_str, width);
+      if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
+
+      auto const hash_str = cudf::string_view(itr, bytes);
+      // hashing with each seed on the same section of the string is 10x faster than
       // computing the substrings for each seed
       for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
         auto const hasher = cudf::detail::MurmurHash3_32<cudf::string_view>{seeds[seed_idx]};