Skip to content

Commit 4063f4c

Browse files
authored
Merge pull request #4922 from davidwendt/perf-strings-split-to-many-columns
[REVIEW] Fix cudf::strings::split logic for many columns
2 parents 20bde03 + 5bc4ede commit 4063f4c

File tree

9 files changed

+1503
-804
lines changed

9 files changed

+1503
-804
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@
192192
- PR #4899 Fix series inplace handling
193193
- PR #4940 Fix boolean mask issue with large sized Dataframe
194194
- PR #4889 Fix multi-index merging
195+
- PR #4922 Fix cudf::strings::split logic for many columns
195196
- PR #4949 Fix scatter, gather benchmark constructor call
196197
- PR #4965 Raise Error when there are duplicate columns sent to `cudf.concat`
197198
- PR #4996 Parquet writer: fix potentially zero-sized string dictionary

cpp/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,7 @@ add_library(cudf
632632
src/strings/sorting/sorting.cu
633633
src/strings/split/partition.cu
634634
src/strings/split/split.cu
635+
src/strings/split/split_record.cu
635636
src/strings/strings_column_factories.cu
636637
src/strings/strings_column_view.cu
637638
src/strings/strings_scalar_factories.cpp
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
* Copyright (c) 2020, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <cudf/column/column.hpp>
18+
#include <cudf/column/column_factories.hpp>
19+
#include <cudf/detail/nvtx/ranges.hpp>
20+
#include <cudf/detail/valid_if.cuh>
21+
#include <cudf/strings/detail/utilities.hpp>
22+
#include <cudf/utilities/error.hpp>
23+
#include <strings/utilities.cuh>
24+
25+
#include <rmm/thrust_rmm_allocator.h>
26+
#include <thrust/for_each.h>
27+
#include <thrust/transform_reduce.h>
28+
29+
// clang-format off
30+
namespace cudf {
31+
namespace strings {
32+
namespace detail {
33+
34+
/**
 * @brief Create a strings-type column from a range of pointer/size pairs.
 *
 * Every element of `[begin, end)` is read as a
 * `thrust::pair<const char*, size_type>`. An element whose pointer is
 * `nullptr` is treated as a null row: it contributes zero bytes to the
 * chars child and its validity bit is cleared. All other elements have
 * `second` bytes copied from `first` into the new chars child column.
 *
 * @tparam IndexPairIterator Iterator type; must be usable in device code and
 *         indexable with `begin[idx]`.
 * @param begin  First pointer/size pair.
 * @param end    One past the last pointer/size pair.
 * @param mr     Memory resource forwarded to the child-column and null-mask
 *               factories.
 * @param stream CUDA stream on which the Thrust algorithms are issued.
 * @return The new strings column; an empty strings column when the range is
 *         empty.
 *
 * The `CUDF_EXPECTS` check fails when the summed byte count of all non-null
 * entries is not below `std::numeric_limits<size_type>::max()`.
 */
template <typename IndexPairIterator>
std::unique_ptr<column> make_strings_column(
  IndexPairIterator begin, IndexPairIterator end,
  rmm::mr::device_memory_resource* mr,
  cudaStream_t stream)
{
  CUDF_FUNC_RANGE();
  size_type strings_count = thrust::distance(begin, end);
  if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream);

  using string_index_pair = thrust::pair<const char*, size_type>;

  auto execpol = rmm::exec_policy(stream);
  // Check that the total size is not too large for a cudf column.
  // The init value fixes the accumulator/result type of transform_reduce, so
  // it must be size_t: an `int` init (plain `0`) would narrow the running sum
  // and let an oversized total slip past the check below.
  size_t bytes = thrust::transform_reduce(
    execpol->on(stream), begin, end,
    [] __device__(string_index_pair const& item) {
      return (item.first != nullptr) ? static_cast<size_t>(item.second) : size_t{0};
    },
    size_t{0},
    thrust::plus<size_t>());
  CUDF_EXPECTS(bytes < std::numeric_limits<size_type>::max(),
               "total size of strings is too large for cudf column");

  // Build the offsets column from the string sizes (null entries count as 0).
  auto offsets_transformer = [begin] __device__(size_type idx) {
    string_index_pair const item = begin[idx];
    return (item.first != nullptr ? static_cast<int32_t>(item.second) : 0);
  };
  auto offsets_transformer_itr = thrust::make_transform_iterator(
    thrust::make_counting_iterator<size_type>(0), offsets_transformer);
  auto offsets_column = strings::detail::make_offsets_child_column(
    offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream);
  auto d_offsets = offsets_column->view().template data<int32_t>();

  // Create the null mask: a row is valid when its pointer is non-null.
  auto new_nulls = experimental::detail::valid_if(
    begin, end,
    [] __device__(string_index_pair const item) { return item.first != nullptr; },
    stream,
    mr);
  auto null_count = new_nulls.second;
  // Keep the mask only when there is at least one null; otherwise pass an
  // empty buffer to the column factory.
  rmm::device_buffer null_mask;
  if (null_count > 0) null_mask = std::move(new_nulls.first);

  // Build the chars column and copy each non-null string to its offset.
  auto chars_column =
    strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream);
  auto d_chars = chars_column->mutable_view().template data<char>();
  thrust::for_each_n(execpol->on(stream),
                     thrust::make_counting_iterator<size_type>(0),
                     strings_count,
                     [begin, d_offsets, d_chars] __device__(size_type idx) {
                       string_index_pair const item = begin[idx];
                       if (item.first != nullptr)
                         memcpy(d_chars + d_offsets[idx], item.first, item.second);
                     });

  return make_strings_column(strings_count,
                             std::move(offsets_column),
                             std::move(chars_column),
                             null_count,
                             std::move(null_mask),
                             stream,
                             mr);
}
100+
101+
} // namespace detail
102+
} // namespace strings
103+
} // namespace cudf
104+
// clang-format on TODO fix

cpp/include/cudf/strings/split/split.hpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ namespace strings {
3535
*
3636
* Any null string entries return corresponding null output columns.
3737
*
38-
* @param strings Strings instance for this operation.
38+
* @param strings_column Strings instance for this operation.
3939
* @param delimiter UTF-8 encoded string identifying the split points in each string.
4040
* Default of empty string indicates split on whitespace.
4141
* @param maxsplit Maximum number of splits to perform.
@@ -44,7 +44,7 @@ namespace strings {
4444
* @return New table of strings columns.
4545
*/
4646
std::unique_ptr<experimental::table> split(
47-
strings_column_view const& strings,
47+
strings_column_view const& strings_column,
4848
string_scalar const& delimiter = string_scalar(""),
4949
size_type maxsplit = -1,
5050
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
@@ -63,7 +63,7 @@ std::unique_ptr<experimental::table> split(
6363
*
6464
* Any null string entries return corresponding null output columns.
6565
*
66-
* @param strings Strings instance for this operation.
66+
* @param strings_column Strings instance for this operation.
6767
* @param delimiter UTF-8 encoded string identifying the split points in each string.
6868
* Default of empty string indicates split on whitespace.
6969
* @param maxsplit Maximum number of splits to perform.
@@ -72,7 +72,7 @@ std::unique_ptr<experimental::table> split(
7272
* @return New strings columns.
7373
*/
7474
std::unique_ptr<experimental::table> rsplit(
75-
strings_column_view const& strings,
75+
strings_column_view const& strings_column,
7676
string_scalar const& delimiter = string_scalar(""),
7777
size_type maxsplit = -1,
7878
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

cpp/src/rolling/rolling.cu

+1-2
Original file line numberDiff line numberDiff line change
@@ -446,8 +446,7 @@ struct rolling_window_launcher {
446446
// and that's why nullify_out_of_bounds/ignore_out_of_bounds is true.
447447
auto output_table =
448448
detail::gather(table_view{{input}}, output->view(), false, true, false, mr, stream);
449-
return std::make_unique<cudf::column>(std::move(output_table->get_column(0)));
450-
;
449+
output = std::make_unique<cudf::column>(std::move(output_table->get_column(0)));
451450
}
452451

453452
return output;

0 commit comments

Comments
 (0)