From d81f2a21bdbee8cdb59077b362228041130b5f1c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 6 Jun 2024 18:09:19 +0000 Subject: [PATCH 01/13] return LargeStringArray if large strings are enabled --- cpp/src/interop/to_arrow.cu | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index e871e656c48..529a5fc2828 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -296,8 +297,13 @@ std::shared_ptr dispatch_to_arrow::operator()( auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); tmp_offset_buffer->mutable_data()[0] = 0; - return std::make_shared( - 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); + if (cudf::strings::detail::is_large_strings_enabled()) { + return std::make_shared( + 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); + } else { + return std::make_shared( + 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); + } } auto offset_buffer = child_arrays[strings_column_view::offsets_column_index]->data()->buffers[1]; auto const sview = strings_column_view{input_view}; @@ -306,11 +312,19 @@ std::shared_ptr dispatch_to_arrow::operator()( static_cast(sview.chars_size(stream))}, ar_mr, stream); - return std::make_shared(static_cast(input_view.size()), - offset_buffer, - data_buffer, - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); + if (cudf::strings::detail::is_large_strings_enabled()) { + return std::make_shared(static_cast(input_view.size()), + offset_buffer, + data_buffer, + fetch_mask_buffer(input_view, ar_mr, stream), + static_cast(input_view.null_count())); + } else { + return std::make_shared(static_cast(input_view.size()), + offset_buffer, + data_buffer, + fetch_mask_buffer(input_view, ar_mr, stream), + static_cast(input_view.null_count())); + } } template <> From d33bb09c348fe271c6283650ae0aa2c259c440bf Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 7 Jun 2024 08:35:02 -0500 Subject: [PATCH 02/13] Update cpp/src/interop/to_arrow.cu Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/interop/to_arrow.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 529a5fc2828..8908f05f947 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -312,7 +312,7 @@ std::shared_ptr dispatch_to_arrow::operator()( static_cast(sview.chars_size(stream))}, ar_mr, stream); - if (cudf::strings::detail::is_large_strings_enabled()) { + if (sview.offsets().type().id()==cudf::type_id::INT64) { return std::make_shared(static_cast(input_view.size()), offset_buffer, data_buffer, From 0ee3f7843edf453351135ce5cf178e7e7ca0358a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 7 Jun 2024 13:40:08 +0000 Subject: [PATCH 03/13] revert --- cpp/src/interop/to_arrow.cu | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 8908f05f947..9271b6aee43 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -297,13 +297,8 @@ std::shared_ptr dispatch_to_arrow::operator()( auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); tmp_offset_buffer->mutable_data()[0] = 0; - if (cudf::strings::detail::is_large_strings_enabled()) { - return std::make_shared( - 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); - } else { - return std::make_shared( - 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); - } + return std::make_shared( + 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); } auto offset_buffer = child_arrays[strings_column_view::offsets_column_index]->data()->buffers[1]; auto const sview = strings_column_view{input_view}; @@ -312,7 +307,7 @@ std::shared_ptr dispatch_to_arrow::operator()( static_cast(sview.chars_size(stream))}, ar_mr, stream); - if (sview.offsets().type().id()==cudf::type_id::INT64) { + if (sview.offsets().type().id() == cudf::type_id::INT64) { return std::make_shared(static_cast(input_view.size()), offset_buffer, data_buffer, From 7c586932f40da1b0f5a2d531b6080fd6184dd373 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 10 Jun 2024 14:28:20 +0000 Subject: [PATCH 04/13] Add from_arrow support --- cpp/src/interop/from_arrow.cu | 32 +++++++++++++++++++------- python/cudf/cudf/core/column/column.py | 6 ----- python/cudf/cudf/tests/test_series.py | 11 +++++---- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index f100ca0cc2b..ce76ea6e3dc 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -78,6 +78,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) } } case arrow::Type::STRING: return data_type(type_id::STRING); + case arrow::Type::LARGE_STRING: return data_type(type_id::STRING); case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32); case arrow::Type::LIST: return data_type(type_id::LIST); case arrow::Type::DECIMAL: { @@ -276,14 +277,30 @@ std::unique_ptr dispatch_to_cudf_column::operator()( rmm::device_async_resource_ref mr) { if (array.length() == 0) { return make_empty_column(type_id::STRING); } - auto str_array = static_cast(&array); - auto offset_array = std::make_unique( - str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); - auto char_array = std::make_unique( - str_array->value_data()->size(), str_array->value_data(), nullptr); - auto offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, stream, mr); + std::unique_ptr offsets_column; + std::unique_ptr char_array; + + if (array.type_id() == arrow::Type::LARGE_STRING) { + auto str_array = static_cast(&array); + auto offset_array = std::make_unique( + str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr); + offsets_column = dispatch_to_cudf_column{}.operator()( + *offset_array, data_type(type_id::INT64), true, stream, mr); + char_array = std::make_unique( + str_array->value_data()->size(), str_array->value_data(), nullptr); + } else if (array.type_id() == arrow::Type::STRING) { + auto str_array = static_cast(&array); + auto offset_array = std::make_unique( + str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); + offsets_column = dispatch_to_cudf_column{}.operator()( + *offset_array, data_type(type_id::INT32), true, stream, mr); + char_array = std::make_unique( + str_array->value_data()->size(), str_array->value_data(), nullptr); + } else { + throw std::runtime_error("Unsupported array type"); + } + auto chars_column = dispatch_to_cudf_column{}.operator()( *char_array, data_type(type_id::INT8), true, stream, mr); @@ -304,7 +321,6 @@ std::unique_ptr dispatch_to_cudf_column::operator()( stream, mr); } - template <> std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 475d52d0fbb..8faad72ab76 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -338,12 +338,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ) elif isinstance(array.type, ArrowIntervalType): return cudf.core.column.IntervalColumn.from_arrow(array) - elif pa.types.is_large_string(array.type): - # Pandas-2.2+: Pandas defaults to `large_string` type - # instead of `string` without data-introspection. - # Temporary workaround until cudf has native - # support for `LARGE_STRING` i.e., 64 bit offsets - array = array.cast(pa.string()) data = pa.table([array], [None]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f47c42d9a1d..30189e1ac8a 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2737,13 +2737,16 @@ def test_series_dtype_astypes(data): assert_eq(result, expected) -def test_series_from_large_string(): - pa_large_string_array = pa.array(["a", "b", "c"]).cast(pa.large_string()) - got = cudf.Series(pa_large_string_array) - expected = pd.Series(pa_large_string_array) +@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string]) +def test_series_from_large_string(pa_type): + pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type()) + got = cudf.Series(pa_string_array) + expected = pd.Series(pa_string_array) assert_eq(expected, got) + assert pa_string_array.equals(got.to_arrow()) + @pytest.mark.parametrize( "scalar", From 9788deebe9ed4980493a1d11d649e0602a678611 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 10 Jun 2024 15:05:27 +0000 Subject: [PATCH 05/13] no overflow error --- python/cudf/cudf/tests/test_column.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index a8a297c155f..7f441d4ca8d 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -521,8 +521,10 @@ def test_concatenate_large_column_strings(): s_1 = cudf.Series(["very long string " * string_scale_f] * num_strings) s_2 = cudf.Series(["very long string " * string_scale_f] * num_strings) - with pytest.raises(OverflowError): - cudf.concat([s_1, s_2]) + actual = cudf.concat([s_1, s_2]) + expected = pd.concat([s_1.to_pandas(), s_2.to_pandas()]) + + assert_eq(actual, expected) @pytest.mark.parametrize( From b02a43b814a1adc512212a13b110b1046ac7969d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 10 Jun 2024 19:33:42 +0000 Subject: [PATCH 06/13] Add from arrow tests --- cpp/src/interop/from_arrow.cu | 11 +++++++--- cpp/tests/interop/arrow_utils.hpp | 16 +++++++++++++++ cpp/tests/interop/from_arrow_test.cpp | 29 ++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index ce76ea6e3dc..463d29c8e1d 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -301,13 +301,18 @@ std::unique_ptr dispatch_to_cudf_column::operator()( throw std::runtime_error("Unsupported array type"); } - auto chars_column = dispatch_to_cudf_column{}.operator()( - *char_array, data_type(type_id::INT8), true, stream, mr); + rmm::device_uvector chars(char_array->length(), stream, mr); + auto data_buffer = char_array->data()->buffers[1]; + CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), + reinterpret_cast(data_buffer->address()), + chars.size(), + cudaMemcpyDefault, + stream.value())); auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), array.null_count(), std::move(*get_mask_buffer(array, stream, mr))); diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index 1fdf02e02f1..b4f36eb5e40 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -105,6 +105,22 @@ get_arrow_array(std::vector const& data, std::vector const return string_array; } +template +std::enable_if_t, std::shared_ptr> +get_arrow_large_string_array(std::vector const& data, + std::vector const& mask = {}) +{ + std::shared_ptr large_string_array; + arrow::LargeStringBuilder large_string_builder; + + CUDF_EXPECTS(large_string_builder.AppendValues(data, mask.data()).ok(), + "Failed to append values to string builder"); + CUDF_EXPECTS(large_string_builder.Finish(&large_string_array).ok(), + "Failed to create arrow string array"); + + return large_string_array; +} + template std::enable_if_t, std::shared_ptr> get_arrow_array(std::initializer_list elements, diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 94b0c75f184..a307169c2f2 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -50,6 +50,16 @@ std::unique_ptr get_cudf_table() columns.emplace_back( cudf::test::fixed_width_column_wrapper({true, false, true, false, true}, {1, 0, 1, 1, 0}) .release()); + columns.emplace_back(cudf::test::strings_column_wrapper( + { + "", + "abc", + "def", + "1", + "2", + }, + {0, 1, 1, 1, 1}) + .release()); // columns.emplace_back(cudf::test::lists_column_wrapper({{1, 2}, {3, 4}, {}, {6}, {7, 8, // 9}}).release()); return std::make_unique(std::move(columns)); @@ -289,6 +299,15 @@ TEST_F(FromArrowTest, ChunkedArray) "ccc", }, {0, 1}); + auto large_string_array_1 = get_arrow_large_string_array( + { + "", + "abc", + "def", + "1", + "2", + }, + {0, 1, 1, 1, 1}); auto dict_array1 = get_arrow_dict_array({1, 2, 5, 7}, {0, 1, 2}, {1, 0, 1}); auto dict_array2 = get_arrow_dict_array({1, 2, 5, 7}, {1, 3}); @@ -300,14 +319,17 @@ TEST_F(FromArrowTest, ChunkedArray) auto dict_chunked_array = std::make_shared( std::vector>{dict_array1, dict_array2}); auto boolean_array = get_arrow_array({true, false, true, false, true}, {1, 0, 1, 1, 0}); - auto boolean_chunked_array = std::make_shared(boolean_array); + auto boolean_chunked_array = std::make_shared(boolean_array); + auto large_string_chunked_array = std::make_shared( + std::vector>{large_string_array_1}); std::vector> schema_vector( {arrow::field("a", int32_chunked_array->type()), arrow::field("b", int64array->type()), arrow::field("c", string_array_1->type()), arrow::field("d", dict_chunked_array->type()), - arrow::field("e", boolean_chunked_array->type())}); + arrow::field("e", boolean_chunked_array->type()), + arrow::field("f", large_string_array_1->type())}); auto schema = std::make_shared(schema_vector); auto arrow_table = arrow::Table::Make(schema, @@ -315,7 +337,8 @@ TEST_F(FromArrowTest, ChunkedArray) int64_chunked_array, string_chunked_array, dict_chunked_array, - boolean_chunked_array}); + boolean_chunked_array, + large_string_chunked_array}); auto expected_cudf_table = get_cudf_table(); From bf13c9dbf7801290d1c09f73f6baa900d2088ec6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 10 Jun 2024 20:10:18 +0000 Subject: [PATCH 07/13] revert pytest --- python/cudf/cudf/tests/test_column.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 7f441d4ca8d..a8a297c155f 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -521,10 +521,8 @@ def test_concatenate_large_column_strings(): s_1 = cudf.Series(["very long string " * string_scale_f] * num_strings) s_2 = cudf.Series(["very long string " * string_scale_f] * num_strings) - actual = cudf.concat([s_1, s_2]) - expected = pd.concat([s_1.to_pandas(), s_2.to_pandas()]) - - assert_eq(actual, expected) + with pytest.raises(OverflowError): + cudf.concat([s_1, s_2]) @pytest.mark.parametrize( From 89c17e96c5976053dcddad66678d5277941e0bb9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 11 Jun 2024 15:08:01 +0000 Subject: [PATCH 08/13] Use device_buffer --- cpp/src/interop/from_arrow.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 463d29c8e1d..579820cbae3 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -301,7 +301,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( throw std::runtime_error("Unsupported array type"); } - rmm::device_uvector chars(char_array->length(), stream, mr); + rmm::device_buffer chars(char_array->length(), stream, mr); auto data_buffer = char_array->data()->buffers[1]; CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), reinterpret_cast(data_buffer->address()), @@ -312,7 +312,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - chars.release(), + std::move(chars), array.null_count(), std::move(*get_mask_buffer(array, stream, mr))); @@ -326,6 +326,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( stream, mr); } + template <> std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, From d113c30965b7e833de2c1452874a3a92666971d8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 11 Jun 2024 10:08:49 -0500 Subject: [PATCH 09/13] Update cpp/src/interop/to_arrow.cu Co-authored-by: Lawrence Mitchell --- cpp/src/interop/to_arrow.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 9271b6aee43..47aee982c32 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include From cea571410fe874fe88305b3289e2e771861d1d3f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 11 Jun 2024 10:19:02 -0500 Subject: [PATCH 10/13] Update cpp/tests/interop/arrow_utils.hpp Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/tests/interop/arrow_utils.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index b4f36eb5e40..ce8a634f05f 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -105,9 +105,7 @@ get_arrow_array(std::vector const& data, std::vector const return string_array; } -template -std::enable_if_t, std::shared_ptr> -get_arrow_large_string_array(std::vector const& data, +std::shared_ptr get_arrow_large_string_array(std::vector const& data, std::vector const& mask = {}) { std::shared_ptr large_string_array; From 0f3b3a0de845d4aeebda0da5dd17ad46bc034596 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 11 Jun 2024 15:25:31 +0000 Subject: [PATCH 11/13] style --- cpp/tests/interop/arrow_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index ce8a634f05f..aa6efbee185 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -106,7 +106,7 @@ get_arrow_array(std::vector const& data, std::vector const } std::shared_ptr get_arrow_large_string_array(std::vector const& data, - std::vector const& mask = {}) + std::vector const& mask = {}) { std::shared_ptr large_string_array; arrow::LargeStringBuilder large_string_builder; From 6a89e20fd0d1a9218cca89f63a13cf15024c37e2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 11 Jun 2024 16:59:34 -0500 Subject: [PATCH 12/13] Update cpp/tests/interop/from_arrow_test.cpp Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/tests/interop/from_arrow_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index a307169c2f2..610da7e1237 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -299,7 +299,7 @@ TEST_F(FromArrowTest, ChunkedArray) "ccc", }, {0, 1}); - auto large_string_array_1 = get_arrow_large_string_array( + auto large_string_array_1 = get_arrow_large_string_array( { "", "abc", From a60de7d00efe250c59000f7563de41ef7edffe3a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 11 Jun 2024 23:04:25 +0000 Subject: [PATCH 13/13] move get_arrow_large_string_array to cpp --- cpp/tests/interop/arrow_utils.hpp | 14 -------------- cpp/tests/interop/from_arrow_test.cpp | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index aa6efbee185..1fdf02e02f1 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -105,20 +105,6 @@ get_arrow_array(std::vector const& data, std::vector const return string_array; } -std::shared_ptr get_arrow_large_string_array(std::vector const& data, - std::vector const& mask = {}) -{ - std::shared_ptr large_string_array; - arrow::LargeStringBuilder large_string_builder; - - CUDF_EXPECTS(large_string_builder.AppendValues(data, mask.data()).ok(), - "Failed to append values to string builder"); - CUDF_EXPECTS(large_string_builder.Finish(&large_string_array).ok(), - "Failed to create arrow string array"); - - return large_string_array; -} - template std::enable_if_t, std::shared_ptr> get_arrow_array(std::initializer_list elements, diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 610da7e1237..86daf4e02e5 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -65,6 +65,20 @@ std::unique_ptr get_cudf_table() return std::make_unique(std::move(columns)); } +std::shared_ptr get_arrow_large_string_array( + std::vector const& data, std::vector const& mask = {}) +{ + std::shared_ptr large_string_array; + arrow::LargeStringBuilder large_string_builder; + + CUDF_EXPECTS(large_string_builder.AppendValues(data, mask.data()).ok(), + "Failed to append values to string builder"); + CUDF_EXPECTS(large_string_builder.Finish(&large_string_array).ok(), + "Failed to create arrow string array"); + + return large_string_array; +} + struct FromArrowTest : public cudf::test::BaseFixture {}; template