@@ -74,20 +74,20 @@ struct minhash_fn {
74
74
}
75
75
__syncwarp ();
76
76
77
- auto const begin = d_str.begin () + lane_idx;
78
- auto const end = [d_str, width = width] {
79
- auto const length = d_str.length ();
80
- if (length > width) { return (d_str.end () - (width - 1 )); }
81
- return d_str.begin () + static_cast <cudf::size_type>(length > 0 );
82
- }();
83
-
84
- // each lane hashes substrings of the given width
85
- for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
86
- auto const offset = itr.byte_offset ();
87
- auto const hash_str =
88
- cudf::string_view (d_str.data () + offset, (itr + width).byte_offset () - offset);
77
+ auto const begin = d_str.data () + lane_idx;
78
+ auto const end = d_str.data () + d_str.size_bytes ();
89
79
90
- // hashing each seed on the same section of string is 10x faster than
80
+ // each lane hashes 'width' substrings of d_str
81
+ for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
82
+ if (cudf::strings::detail::is_utf8_continuation_char (*itr)) { continue ; }
83
+ auto const check_str = // used for counting 'width' characters
84
+ cudf::string_view (itr, static_cast <cudf::size_type>(thrust::distance (itr, end)));
85
+ auto const [bytes, left] =
86
+ cudf::strings::detail::bytes_to_character_position (check_str, width);
87
+ if ((itr != d_str.data ()) && (left > 0 )) { continue ; } // true if past the end of the string
88
+
89
+ auto const hash_str = cudf::string_view (itr, bytes);
90
+ // hashing with each seed on the same section of the string is 10x faster than
91
91
// computing the substrings for each seed
92
92
for (std::size_t seed_idx = 0 ; seed_idx < seeds.size (); ++seed_idx) {
93
93
auto const hasher = cudf::detail::MurmurHash3_32<cudf::string_view>{seeds[seed_idx]};
0 commit comments