rapidsai · rapids-bot · Sep 22, 2023 · Sep 7, 2023 · Sep 10, 2023 · Sep 12, 2023
@@ -47,18 +47,20 @@ namespace nvtext {
  * @throw cudf::logic_error if `separator` is invalid
  * @throw cudf::logic_error if there are not enough strings to generate any ngrams
  *
- * @param strings Strings column to tokenize and produce ngrams from.
+ * @param input Strings column to tokenize and produce ngrams from
  * @param ngrams The ngram number to generate.
  *               Default is 2 = bigram.
  * @param separator The string to use for separating ngram tokens.
  *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of tokens
  */
 std::unique_ptr<cudf::column> generate_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams               = 2,
-  cudf::string_scalar const& separator = cudf::string_scalar{"_"},
+  cudf::string_scalar const& separator = cudf::string_scalar{"_", true, cudf::get_default_stream()},
+  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
 /**
@@ -79,15 +81,17 @@ std::unique_ptr<cudf::column> generate_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate.
  *               Default is 2 = bigram.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of tokens
  */
 std::unique_ptr<cudf::column> generate_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 2,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -113,14 +117,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate. Default is 5.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return A lists column of hash values
  */
 std::unique_ptr<cudf::column> hash_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 5,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,21 +66,23 @@ namespace nvtext {
  *
  * All null row entries are ignored and the output contains all valid rows.
  *
- * @param strings Strings column to tokenize and produce ngrams from.
+ * @param input Strings column to tokenize and produce ngrams from
  * @param ngrams The ngram number to generate.
  *               Default is 2 = bigram.
  * @param delimiter UTF-8 characters used to separate each string into tokens.
  *                  The default of empty string will separate tokens using whitespace.
  * @param separator The string to use for separating ngram tokens.
  *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of tokens
  */
 std::unique_ptr<cudf::column> ngrams_tokenize(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams               = 2,
-  cudf::string_scalar const& delimiter = cudf::string_scalar{""},
-  cudf::string_scalar const& separator = cudf::string_scalar{"_"},
+  cudf::string_scalar const& delimiter = cudf::string_scalar{"", true, cudf::get_default_stream()},
+  cudf::string_scalar const& separator = cudf::string_scalar{"_", true, cudf::get_default_stream()},
+  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group

@@ -150,10 +150,11 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
 std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& strings,
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr);
+  return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
 }
 
 namespace detail {
@@ -317,18 +318,20 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
 
 std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
                                                         cudf::size_type ngrams,
+                                                        rmm::cuda_stream_view stream,
                                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::generate_character_ngrams(strings, ngrams, stream, mr);
 }
 
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                     cudf::size_type ngrams,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::hash_character_ngrams(strings, ngrams, stream, mr);
 }
 
 }  // namespace nvtext
@@ -107,7 +107,7 @@ rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view con
  *
  * This is called with a warp per row
  */
-struct sorted_interset_fn {
+struct sorted_intersect_fn {
   cudf::column_device_view const d_input1;
   cudf::column_device_view const d_input2;
   cudf::size_type* d_results;
@@ -151,7 +151,7 @@ rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view
   auto const d_input1 = cudf::column_device_view::create(input1, stream);
   auto const d_input2 = cudf::column_device_view::create(input2, stream);
   auto d_results      = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
-  sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()};
+  sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::counting_iterator<cudf::size_type>(0),
                      input1.size() * cudf::detail::warp_size,

@@ -265,11 +265,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& delimiter,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::ngrams_tokenize(
-    strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr);
+  return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
 }
 
 }  // namespace nvtext
@@ -627,6 +627,7 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
 
 # ##################################################################################################
 # Install tests ####################################################################################

@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <nvtext/generate_ngrams.hpp>
+#include <nvtext/ngrams_tokenize.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+class TextNGramsTest : public cudf::test::BaseFixture {};
+
+TEST_F(TextNGramsTest, GenerateNgrams)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
+  auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
+  nvtext::generate_ngrams(
+    cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, GenerateCharacterNgrams)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
+  nvtext::generate_character_ngrams(
+    cudf::strings_column_view(input), 3, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, HashCharacterNgrams)
+{
+  auto input =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+  nvtext::hash_character_ngrams(
+    cudf::strings_column_view(input), 5, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, NgramsTokenize)
+{
+  auto input =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+  auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
+  auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
+  nvtext::ngrams_tokenize(
+    cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream());
+}