Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose stream parameter in public nvtext ngram APIs #14061

Merged
merged 15 commits into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions cpp/include/nvtext/generate_ngrams.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,20 @@ namespace nvtext {
* @throw cudf::logic_error if `separator` is invalid
* @throw cudf::logic_error if there are not enough strings to generate any ngrams
*
* @param strings Strings column to tokenize and produce ngrams from.
* @param input Strings column to tokenize and produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param separator The string to use for separating ngram tokens.
* Default is "_" character.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> generate_ngrams(
  cudf::strings_column_view const& input,
  cudf::size_type ngrams = 2,
  // The default separator scalar passes cudf::get_default_stream() to its own
  // constructor; it does not use the `stream` argument declared below.
  cudf::string_scalar const& separator = cudf::string_scalar{"_", true, cudf::get_default_stream()},
  // Stream and memory resource come last and are both defaulted, so callers
  // that pass only (input, ngrams, separator) keep compiling.
  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -79,15 +81,17 @@ std::unique_ptr<cudf::column> generate_ngrams(
* @throw cudf::logic_error if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param strings Strings column to produce ngrams from.
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> generate_character_ngrams(
  cudf::strings_column_view const& input,
  cudf::size_type ngrams = 2,
  // Stream and memory resource are trailing, defaulted parameters so existing
  // two-argument call sites remain valid.
  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -113,14 +117,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
* @throw cudf::logic_error if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param strings Strings column to produce ngrams from.
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate. Default is 5.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A lists column of hash values
*/
std::unique_ptr<cudf::column> hash_character_ngrams(
  cudf::strings_column_view const& input,
  cudf::size_type ngrams = 5,
  // Stream and memory resource are trailing, defaulted parameters so existing
  // two-argument call sites remain valid.
  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
16 changes: 9 additions & 7 deletions cpp/include/nvtext/ngrams_tokenize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -66,21 +66,23 @@ namespace nvtext {
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column to tokenize and produce ngrams from.
* @param input Strings column to tokenize and produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param delimiter UTF-8 characters used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param separator The string to use for separating ngram tokens.
* Default is "_" character.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> ngrams_tokenize(
  cudf::strings_column_view const& input,
  cudf::size_type ngrams = 2,
  // Both default scalars pass cudf::get_default_stream() to their own
  // constructors; they do not use the `stream` argument declared below.
  cudf::string_scalar const& delimiter = cudf::string_scalar{"", true, cudf::get_default_stream()},
  cudf::string_scalar const& separator = cudf::string_scalar{"_", true, cudf::get_default_stream()},
  // Stream and memory resource are trailing, defaulted parameters so existing
  // call sites that stop at `separator` remain valid.
  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,11 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
// Public nvtext::generate_ngrams entry point.
// Forwards every argument, including the caller-supplied `stream`, to
// detail::generate_ngrams and returns its result unchanged.
std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& strings,
                                              cudf::size_type ngrams,
                                              cudf::string_scalar const& separator,
                                              rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  // Pass `stream` through rather than cudf::get_default_stream(), so the work
  // is issued on the stream the caller asked for.
  return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
}

namespace detail {
Expand Down Expand Up @@ -317,18 +318,20 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co

// Public nvtext::generate_character_ngrams entry point.
// Forwards every argument, including the caller-supplied `stream`, to
// detail::generate_character_ngrams and returns its result unchanged.
std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
                                                        cudf::size_type ngrams,
                                                        rmm::cuda_stream_view stream,
                                                        rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  // Pass `stream` through rather than cudf::get_default_stream(), so the work
  // is issued on the stream the caller asked for.
  return detail::generate_character_ngrams(strings, ngrams, stream, mr);
}

// Public nvtext::hash_character_ngrams entry point.
// Forwards every argument, including the caller-supplied `stream`, to
// detail::hash_character_ngrams and returns its result unchanged.
std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                    cudf::size_type ngrams,
                                                    rmm::cuda_stream_view stream,
                                                    rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  // Pass `stream` through rather than cudf::get_default_stream(), so the work
  // is issued on the stream the caller asked for.
  return detail::hash_character_ngrams(strings, ngrams, stream, mr);
}

} // namespace nvtext
4 changes: 2 additions & 2 deletions cpp/src/text/jaccard.cu
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view con
*
* This is called with a warp per row
*/
struct sorted_interset_fn {
struct sorted_intersect_fn {
cudf::column_device_view const d_input1;
cudf::column_device_view const d_input2;
cudf::size_type* d_results;
Expand Down Expand Up @@ -151,7 +151,7 @@ rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view
auto const d_input1 = cudf::column_device_view::create(input1, stream);
auto const d_input2 = cudf::column_device_view::create(input2, stream);
auto d_results = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()};
sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
input1.size() * cudf::detail::warp_size,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
cudf::size_type ngrams,
cudf::string_scalar const& delimiter,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::ngrams_tokenize(
strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr);
return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
}

} // namespace nvtext
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,7 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t
ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)

# ##################################################################################################
# Install tests ####################################################################################
Expand Down
59 changes: 59 additions & 0 deletions cpp/tests/streams/text/ngrams_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <nvtext/generate_ngrams.hpp>
#include <nvtext/ngrams_tokenize.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

class TextNGramsTest : public cudf::test::BaseFixture {};

// Calls nvtext::generate_ngrams (ngrams = 3) with an explicitly supplied test
// stream. The returned column is intentionally discarded; the test only
// exercises the call.
TEST_F(TextNGramsTest, GenerateNgrams)
{
  cudf::test::strings_column_wrapper const words(
    {"the", "fox", "jumped", "over", "thé", "dog"});
  // The separator scalar is built on the test stream as well.
  cudf::string_scalar const sep{"_", true, cudf::test::get_default_stream()};
  auto const words_view = cudf::strings_column_view(words);
  nvtext::generate_ngrams(words_view, 3, sep, cudf::test::get_default_stream());
}

// Calls nvtext::generate_character_ngrams (ngrams = 3) with an explicitly
// supplied test stream. The returned column is intentionally discarded.
TEST_F(TextNGramsTest, GenerateCharacterNgrams)
{
  cudf::test::strings_column_wrapper const words(
    {"the", "fox", "jumped", "over", "thé", "dog"});
  auto const words_view = cudf::strings_column_view(words);
  nvtext::generate_character_ngrams(words_view, 3, cudf::test::get_default_stream());
}

// Calls nvtext::hash_character_ngrams (ngrams = 5) with an explicitly
// supplied test stream. The returned column is intentionally discarded.
TEST_F(TextNGramsTest, HashCharacterNgrams)
{
  cudf::test::strings_column_wrapper sentences(
    {"the quick brown fox", "jumped over the lazy dog."});
  auto const sentences_view = cudf::strings_column_view(sentences);
  nvtext::hash_character_ngrams(sentences_view, 5, cudf::test::get_default_stream());
}

// Calls nvtext::ngrams_tokenize (ngrams = 2) with an explicitly supplied test
// stream. The returned column is intentionally discarded.
TEST_F(TextNGramsTest, NgramsTokenize)
{
  cudf::test::strings_column_wrapper sentences(
    {"the quick brown fox", "jumped over the lazy dog."});
  // Both scalars are built on the test stream, like the call itself.
  cudf::string_scalar const token_delim{" ", true, cudf::test::get_default_stream()};
  cudf::string_scalar const ngram_sep{"_", true, cudf::test::get_default_stream()};
  auto const sentences_view = cudf::strings_column_view(sentences);
  nvtext::ngrams_tokenize(
    sentences_view, 2, token_delim, ngram_sep, cudf::test::get_default_stream());
}