From b09cfa01baf1edf8d9cfbe90913d3e305036a5c2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 12 Aug 2024 03:19:07 +0000 Subject: [PATCH 01/30] Works. Need to figure out equality_fn --- cpp/src/io/parquet/chunk_dict.cu | 132 ++++++++++------------------- cpp/src/io/parquet/parquet_gpu.cuh | 72 ++++++++++++++-- cpp/src/io/parquet/parquet_gpu.hpp | 43 +--------- cpp/src/io/parquet/writer_impl.cu | 68 ++++++++++++--- 4 files changed, 165 insertions(+), 150 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index a43c6d4cbb6..147b1f3057b 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -26,28 +26,15 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } -template -CUDF_KERNEL void __launch_bounds__(block_size) - initialize_chunk_hash_maps_kernel(device_span chunks) -{ - auto const chunk = chunks[blockIdx.x]; - auto const t = threadIdx.x; - // fut: Now that per-chunk dict is same size as ck.num_values, try to not use one block per chunk - for (thread_index_type i = 0; i < chunk.dict_map_size; i += block_size) { - if (t + i < chunk.dict_map_size) { - new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; - new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; - } - } -} - -template struct equality_functor { column_device_view const& col; + template __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) { // We don't call this for nulls so this is fine @@ -56,50 +43,23 @@ struct equality_functor { } }; -template struct hash_functor { column_device_view const& col; - __device__ auto operator()(size_type idx) const - { - return cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); - } -}; - -struct map_insert_fn { - map_type::device_mutable_view& map; - - template - __device__ bool operator()(column_device_view const& col, size_type i) - { - if constexpr (column_device_view::has_element_accessor()) { - auto hash_fn = hash_functor{col}; - auto equality_fn = equality_functor{col}; - return map.insert(std::pair(i, i), hash_fn, equality_fn); - } else { - CUDF_UNREACHABLE("Unsupported type to insert in map"); - } - } -}; - -struct map_find_fn { - map_type::device_view& map; - template - __device__ map_type::device_view::iterator operator()(column_device_view const& col, size_type i) + __device__ auto operator()(size_type idx) const { if constexpr (column_device_view::has_element_accessor()) { - auto hash_fn = hash_functor{col}; - auto equality_fn = equality_functor{col}; - return map.find(i, hash_fn, equality_fn); + return cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); } else { - CUDF_UNREACHABLE("Unsupported type to find in map"); + return static_cast(KEY_SENTINEL); } } }; template CUDF_KERNEL void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(map_ref_type* map_refs, + cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -123,10 +83,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) column_device_view const& data_col = *col->leaf_column; // Make a view of the hash map - auto hash_map_mutable = map_type::device_mutable_view(chunk->dict_map_slots, - chunk->dict_map_size, - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); + auto hash_map_ref = *(map_refs + chunk->dict_map_idx); __shared__ size_type total_num_dict_entries; thread_index_type val_idx = s_start_value_idx + t; @@ -138,8 +95,14 @@ CUDF_KERNEL void __launch_bounds__(block_size) size_type is_unique = 0; size_type uniq_elem_size = 0; if (is_valid) { - is_unique = - type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); + auto const val = type_dispatcher(data_col.type(), hash_functor{data_col}, val_idx); + if (val != KEY_SENTINEL) { + auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); + is_unique = map_insert_ref.insert(cuco::pair{val, val_idx}); + } else { + is_unique = false; + } + // if (!t) printf("inserted val = %u, val_idx = %ld\n", val, val_idx); uniq_elem_size = [&]() -> size_type { if (not is_unique) { return 0; } switch (col->physical_type) { @@ -170,7 +133,6 @@ CUDF_KERNEL void __launch_bounds__(block_size) } }(); } - auto num_unique = block_reduce(reduce_storage).Sum(is_unique); __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); @@ -190,24 +152,20 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - collect_map_entries_kernel(device_span chunks) + collect_map_entries_kernel(map_ref_type* map_refs, device_span chunks) { auto& chunk = chunks[blockIdx.x]; if (not chunk.use_dictionary) { return; } auto t = threadIdx.x; - auto map = map_type::device_view(chunk.dict_map_slots, - chunk.dict_map_size, - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - + auto map = *(map_refs + chunk.dict_map_idx); __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { if (t + i < chunk.dict_map_size) { - auto* slot = reinterpret_cast(map.begin_slot() + t + i); + auto* slot = reinterpret_cast(&map + t + i); auto key = slot->first; if (key != KEY_SENTINEL) { auto loc = counter.fetch_add(1, memory_order_relaxed); @@ -224,7 +182,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(map_ref_type* map_refs, + cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -245,21 +204,22 @@ CUDF_KERNEL void __launch_bounds__(block_size) column_device_view const& data_col = *col->leaf_column; - auto map = map_type::device_view(chunk->dict_map_slots, - chunk->dict_map_size, - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); + auto map = *(map_refs + chunk->dict_map_idx); thread_index_type val_idx = s_start_value_idx + t; while (val_idx < end_value_idx) { if (data_col.is_valid(val_idx)) { - auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx); - cudf_assert(found_slot != map.end() && - "Unable to find value in map in dictionary index construction"); - if (found_slot != map.end()) { - // No need for atomic as this is not going to be modified by any other thread - auto* val_ptr = reinterpret_cast(&found_slot->second); - chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; + auto val = type_dispatcher(data_col.type(), hash_functor{data_col}, val_idx); + if (val != static_cast(KEY_SENTINEL)) { + auto map_find_ref = map.with_operators(cuco::find); + auto found_slot = map_find_ref.find(val); + cudf_assert(found_slot != map.end() && + "Unable to find value in map in dictionary index construction"); + if (found_slot != map.end()) { + // No need for atomic as this is not going to be modified by any other thread + auto* val_ptr = reinterpret_cast(&found_slot->second); + chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; + } } } @@ -267,32 +227,30 @@ CUDF_KERNEL void __launch_bounds__(block_size) } } -void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream) -{ - constexpr int block_size = 1024; - initialize_chunk_hash_maps_kernel - <<>>(chunks); -} - -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(map_ref_type* map_refs, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); populate_chunk_hash_maps_kernel - <<>>(frags); + <<>>(map_refs, frags); } -void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream) +void collect_map_entries(map_ref_type* map_refs, + device_span chunks, + rmm::cuda_stream_view stream) { constexpr int block_size = 1024; - collect_map_entries_kernel<<>>(chunks); + collect_map_entries_kernel + <<>>(map_refs, chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(map_ref_type* map_refs, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel - <<>>(frags); + <<>>(map_refs, frags); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index e3c44c78898..e8d8eeb8dbf 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -21,22 +21,80 @@ #include #include +#include #include +#include +#include +#include + +#include namespace cudf::io::parquet::detail { -auto constexpr KEY_SENTINEL = size_type{-1}; -auto constexpr VALUE_SENTINEL = size_type{-1}; +using key_type = uint32_t; +using mapped_type = uint32_t; + +auto constexpr cg_size = 1; ///< A CUDA Cooperative Group of 8 threads to handle each subset +auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread + +auto constexpr KEY_SENTINEL = key_type{std::numeric_limits::max()}; +auto constexpr VALUE_SENTINEL = mapped_type{std::numeric_limits::max()}; +auto constexpr SCOPE = cuda::thread_scope_device; + +using slot_type = cuco::pair; + +using storage_type = cuco::aow_storage; +using storage_ref_type = typename storage_type::ref_type; -using map_type = cuco::legacy::static_map; +template +struct my_hasher { + __device__ auto operator()(T index) const { return index; } +}; + +using probing_scheme_type = cuco::linear_probing>; + +using map_ref_type = cuco::static_map_ref, + probing_scheme_type, + storage_ref_type>; ///< Map ref type /** - * @brief The alias of `map_type::pair_atomic_type` class. + * @brief Insert chunk values into their respective hash maps + * + * @param frags Column fragments + * @param stream CUDA stream to use + */ +void populate_chunk_hash_maps(map_ref_type* map_refs, + cudf::detail::device_2dspan frags, + rmm::cuda_stream_view stream); + +/** + * @brief Compact dictionary hash map entries into chunk.dict_data + * + * @param chunks Flat span of chunks to compact hash maps for + * @param stream CUDA stream to use + */ +void collect_map_entries(map_ref_type* map_refs, + device_span chunks, + rmm::cuda_stream_view stream); + +/** + * @brief Get the Dictionary Indices for each row + * + * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise + * stored in input column [row]. Stores these indices into chunk.dict_index. + * + * Since dict_data itself contains indices into the original cudf column, this means that + * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] * - * Declare this struct by trivial subclassing instead of type aliasing so we can have forward - * declaration of this struct somewhere else. + * @param frags Column fragments + * @param stream CUDA stream to use */ -struct slot_type : public map_type::pair_atomic_type {}; +void get_dictionary_indices(map_ref_type* map_refs, + cudf::detail::device_2dspan frags, + rmm::cuda_stream_view stream); /** * @brief Return the byte length of parquet dtypes that are physically represented by INT32 diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index efc1f5ebab1..7044b38f529 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -514,7 +514,6 @@ constexpr unsigned int kDictHashBits = 16; constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); struct EncPage; -struct slot_type; // convert Encoding to a mask value constexpr uint32_t encoding_to_mask(Encoding encoding) @@ -560,7 +559,7 @@ struct EncColumnChunk { uint8_t is_compressed; //!< Nonzero if the chunk uses compression uint32_t dictionary_size; //!< Size of dictionary page including header uint32_t ck_stat_size; //!< Size of chunk-level statistics (included in 1st page header) - slot_type* dict_map_slots; //!< Hash map storage for calculating dict encoding for this chunk + int32_t dict_map_idx; //!< Hash map storage for calculating dict encoding for this chunk size_type dict_map_size; //!< Size of dict_map_slots size_type num_dict_entries; //!< Total number of entries in dictionary size_type @@ -1001,46 +1000,6 @@ void InitFragmentStatistics(device_span groups, device_span fragments, rmm::cuda_stream_view stream); -/** - * @brief Initialize per-chunk hash maps used for dictionary with sentinel values - * - * @param chunks Flat span of chunks to initialize hash maps for - * @param stream CUDA stream to use - */ -void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream); - -/** - * @brief Insert chunk values into their respective hash maps - * - * @param frags Column fragments - * @param stream CUDA stream to use - */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, - rmm::cuda_stream_view stream); - -/** - * @brief Compact dictionary hash map entries into chunk.dict_data - * - * @param chunks Flat span of chunks to compact hash maps for - * @param stream CUDA stream to use - */ -void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream); - -/** - * @brief Get the Dictionary Indices for each row - * - * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise - * stored in input column [row]. Stores these indices into chunk.dict_index. - * - * Since dict_data itself contains indices into the original cudf column, this means that - * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] - * - * @param frags Column fragments - * @param stream CUDA stream to use - */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, - rmm::cuda_stream_view stream); - /** * @brief Launches kernel for initializing encoder data pages * diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 36a1d8377bf..7aee7f6dc1f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1285,10 +1285,10 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, return std::pair(std::move(dict_data), std::move(dict_index)); } - // Allocate slots for each chunk - std::vector> hash_maps_storage; - hash_maps_storage.reserve(h_chunks.size()); - for (auto& chunk : h_chunks) { + std::vector valid_chunk_sizes; + valid_chunk_sizes.reserve(h_chunks.size()); + + std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) { auto const& chunk_col_desc = col_desc[chunk.col_desc_id]; auto const is_requested_non_dict = chunk_col_desc.requested_encoding != column_encoding::USE_DEFAULT && @@ -1298,20 +1298,60 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, if (is_type_non_dict || is_requested_non_dict) { chunk.use_dictionary = false; + valid_chunk_sizes.emplace_back(static_cast(0)); } else { chunk.use_dictionary = true; - // cuCollections suggests using a hash map of size N * (1/0.7) = num_values * 1.43 - // https://github.com/NVIDIA/cuCollections/blob/3a49fc71/include/cuco/static_map.cuh#L190-L193 - auto& inserted_map = hash_maps_storage.emplace_back(chunk.num_values * 1.43, stream); - chunk.dict_map_slots = inserted_map.data(); - chunk.dict_map_size = inserted_map.size(); + valid_chunk_sizes.emplace_back(static_cast( + cuco::make_window_extent(static_cast(chunk.num_values)))); + chunk.dict_map_size = valid_chunk_sizes.back(); } - } + }); + + std::vector map_offsets(valid_chunk_sizes.size() + 1, 0); + std::exclusive_scan(valid_chunk_sizes.begin(), + valid_chunk_sizes.end(), + map_offsets.begin(), + static_cast(0)); + map_offsets.back() = map_offsets[valid_chunk_sizes.size() - 1] + valid_chunk_sizes.back(); + + // Create a single bulk storage used by all subsets + auto map_storage = storage_type{map_offsets.back()}; + // Initializes the storage with the given sentinel + map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL}, + cuda::stream_ref{stream.value()}); + + std::vector h_map_refs; + h_map_refs.reserve(h_chunks.size()); + + std::for_each( + thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(h_chunks.size()), + [&](auto const idx) { + auto& chunk = h_chunks[idx]; + if (chunk.use_dictionary) { + storage_ref_type storage_ref{valid_chunk_sizes[idx], map_storage.data() + map_offsets[idx]}; + + h_map_refs.emplace_back(map_ref_type{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + {}, + {}, + {}, + storage_ref}); + + chunk.dict_map_idx = h_map_refs.size() - 1; + } + }); + + rmm::device_uvector d_map_refs(h_map_refs.size(), stream); + CUDF_CUDA_TRY(cudaMemcpyAsync(d_map_refs.data(), + h_map_refs.data(), + sizeof(map_ref_type) * h_map_refs.size(), + cudaMemcpyDefault, + stream.value())); chunks.host_to_device_async(stream); - initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - populate_chunk_hash_maps(frags, stream); + populate_chunk_hash_maps(d_map_refs.data(), frags, stream); chunks.device_to_host_sync(stream); @@ -1372,8 +1412,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - collect_map_entries(chunks.device_view().flat_view(), stream); - get_dictionary_indices(frags, stream); + collect_map_entries(d_map_refs.data(), chunks.device_view().flat_view(), stream); + get_dictionary_indices(d_map_refs.data(), frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } From 089f9095aadaff5771d1c9eb5f3a6a9ec43e0571 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 13 Aug 2024 01:46:03 +0000 Subject: [PATCH 02/30] Almost works. Need to do cg split work. --- cpp/src/io/parquet/chunk_dict.cu | 202 +++++++++++++++++------- cpp/src/io/parquet/parquet_gpu.cuh | 28 +--- cpp/src/io/parquet/parquet_gpu.hpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 40 ++--- cpp/tests/io/parquet_writer_test.cpp | 41 ++--- cpp/tests/utilities/column_utilities.cu | 1 + 6 files changed, 184 insertions(+), 130 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 147b1f3057b..bd5f27ce9f6 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -22,6 +22,7 @@ #include +#include #include namespace cudf::io::parquet::detail { @@ -32,47 +33,142 @@ namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } +template struct equality_functor { column_device_view const& col; - template - __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) + __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const { - // We don't call this for nulls so this is fine - auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; - return equal(col.element(lhs_idx), col.element(rhs_idx)); + // We don't call this for nulls so this is fine + auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; + auto const result = equal(col.element(lhs_idx), col.element(rhs_idx)); + printf("col_type_id:%d, equality idx1:%d, idx2:%d, eq:%d\n", + col.type().id(), + lhs_idx, + rhs_idx, + result); + return result; } }; +template struct hash_functor { column_device_view const& col; + __device__ auto operator()(key_type idx) const + { + auto const hashed = cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); + printf("hashing idx: %d = %d\n", idx, hashed); + return hashed; // cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); + } +}; + +struct map_insert_fn { + storage_ref_type const& storage_ref; + template - __device__ auto operator()(size_type idx) const + __device__ bool operator()(column_device_view const& col, key_type i) { if constexpr (column_device_view::has_element_accessor()) { - return cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); + using equality_fn_type = equality_functor; + using hash_fn_type = hash_functor; + using probing_scheme_type = cuco::linear_probing; + + // Instantiate hash and equality functors. + auto hash_fn = hash_fn_type{col}; + auto equal_fn = equality_fn_type{col}; + + // Make a view of the hash map + cuco::static_map_ref + hash_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + {equal_fn}, + {hash_fn}, + {}, + storage_ref}; + + // Create another map with insert operator + auto map_insert_ref = hash_map_ref.with_operators(cuco::insert_and_find); + // Insert + auto [iter, found] = map_insert_ref.insert_and_find(cuco::pair{i, i}); + printf("Inserted k=%d, v=%d, unique=%d\n", iter->first, iter->second, found); + return found; } else { - return static_cast(KEY_SENTINEL); + CUDF_UNREACHABLE("Unsupported type to insert in map"); + } + } +}; // namespace cudf::io::parquet::detail + +struct map_find_fn { + storage_ref_type const& storage_ref; + + template + __device__ cuco::pair operator()(column_device_view const& col, key_type i) + { + if constexpr (column_device_view::has_element_accessor()) { + using equality_fn_type = equality_functor; + using hash_fn_type = hash_functor; + using probing_scheme_type = cuco::linear_probing; + + // Instantiate hash and equality functors. + auto hash_fn = hash_fn_type{col}; + auto equal_fn = equality_fn_type{col}; + + // Make a view of the hash map + cuco::static_map_ref + hash_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + {equal_fn}, + {hash_fn}, + {}, + storage_ref}; + + // Create another map with find operator + auto map_find_ref = hash_map_ref.with_operators(cuco::find); + + // Find the key = i + auto found_slot = map_find_ref.find(i); + + // Check if we found the previously inserted key. + cudf_assert(found_slot != map_find_ref.end() && + "Unable to find value in map in dictionary index construction"); + + // Return a pair of the found key and value. + printf("Find=%d, Found slot: k=%d, v=%d\n", i, found_slot->first, found_slot->second); + return {found_slot->first, found_slot->second}; + } else { + CUDF_UNREACHABLE("Unsupported type to find in map"); } } }; template CUDF_KERNEL void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(map_ref_type* map_refs, + populate_chunk_hash_maps_kernel(storage_type::window_type* map_storage, cudf::detail::device_2dspan frags) { - auto col_idx = blockIdx.y; - auto block_x = blockIdx.x; - auto t = threadIdx.x; - auto frag = frags[col_idx][block_x]; - auto chunk = frag.chunk; - auto col = chunk->col_desc; + auto const col_idx = blockIdx.y; + auto const block_x = blockIdx.x; + auto const frag = frags[col_idx][block_x]; + auto chunk = frag.chunk; + auto col = chunk->col_desc; if (not chunk->use_dictionary) { return; } using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage reduce_storage; + [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const t = cg::this_thread_block().thread_rank(); + size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; @@ -81,12 +177,11 @@ CUDF_KERNEL void __launch_bounds__(block_size) size_type const end_value_idx = row_to_value_idx(end_row, *col); column_device_view const& data_col = *col->leaf_column; - - // Make a view of the hash map - auto hash_map_ref = *(map_refs + chunk->dict_map_idx); + storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset}; __shared__ size_type total_num_dict_entries; thread_index_type val_idx = s_start_value_idx + t; + while (val_idx - block_size < end_value_idx) { auto const is_valid = val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); @@ -95,14 +190,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) size_type is_unique = 0; size_type uniq_elem_size = 0; if (is_valid) { - auto const val = type_dispatcher(data_col.type(), hash_functor{data_col}, val_idx); - if (val != KEY_SENTINEL) { - auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); - is_unique = map_insert_ref.insert(cuco::pair{val, val_idx}); - } else { - is_unique = false; - } - // if (!t) printf("inserted val = %u, val_idx = %ld\n", val, val_idx); + auto const is_unique = + type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, val_idx); uniq_elem_size = [&]() -> size_type { if (not is_unique) { return 0; } switch (col->physical_type) { @@ -152,24 +241,30 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - collect_map_entries_kernel(map_ref_type* map_refs, device_span chunks) + collect_map_entries_kernel(storage_type::window_type* map_storage, + device_span chunks) { auto& chunk = chunks[blockIdx.x]; if (not chunk.use_dictionary) { return; } - auto t = threadIdx.x; - auto map = *(map_refs + chunk.dict_map_idx); + [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const t = cg::this_thread_block().thread_rank(); + + storage_ref_type const storage_ref{chunk.dict_map_size, map_storage + chunk.dict_map_offset}; + __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); + for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { if (t + i < chunk.dict_map_size) { - auto* slot = reinterpret_cast(&map + t + i); + auto* slot = reinterpret_cast(storage_ref.data() + chunk.dict_map_offset + t + i); auto key = slot->first; if (key != KEY_SENTINEL) { auto loc = counter.fetch_add(1, memory_order_relaxed); cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); + printf("Writing %d at loc: %d\n", key, loc); chunk.dict_data[loc] = key; // If sorting dict page ever becomes a hard requirement, enable the following statement and // add a dict sorting step before storing into the slot's second field. @@ -182,18 +277,20 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - get_dictionary_indices_kernel(map_ref_type* map_refs, + get_dictionary_indices_kernel(storage_type::window_type* map_storage, cudf::detail::device_2dspan frags) { - auto col_idx = blockIdx.y; - auto block_x = blockIdx.x; - auto t = threadIdx.x; - auto frag = frags[col_idx][block_x]; - auto chunk = frag.chunk; - auto col = chunk->col_desc; + auto const col_idx = blockIdx.y; + auto const block_x = blockIdx.x; + auto const frag = frags[col_idx][block_x]; + auto chunk = frag.chunk; + auto const col = chunk->col_desc; if (not chunk->use_dictionary) { return; } + [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const t = cg::this_thread_block().thread_rank(); + size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; @@ -203,54 +300,45 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto const end_value_idx = row_to_value_idx(end_row, *col); column_device_view const& data_col = *col->leaf_column; - - auto map = *(map_refs + chunk->dict_map_idx); + storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset}; thread_index_type val_idx = s_start_value_idx + t; while (val_idx < end_value_idx) { if (data_col.is_valid(val_idx)) { - auto val = type_dispatcher(data_col.type(), hash_functor{data_col}, val_idx); - if (val != static_cast(KEY_SENTINEL)) { - auto map_find_ref = map.with_operators(cuco::find); - auto found_slot = map_find_ref.find(val); - cudf_assert(found_slot != map.end() && - "Unable to find value in map in dictionary index construction"); - if (found_slot != map.end()) { - // No need for atomic as this is not going to be modified by any other thread - auto* val_ptr = reinterpret_cast(&found_slot->second); - chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; - } - } + auto [found_key, found_value] = + type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, val_idx); + // No need for atomic as this is not going to be modified by any other thread + chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value; } val_idx += block_size; } } -void populate_chunk_hash_maps(map_ref_type* map_refs, +void populate_chunk_hash_maps(storage_type::window_type* map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); populate_chunk_hash_maps_kernel - <<>>(map_refs, frags); + <<>>(map_storage, frags); } -void collect_map_entries(map_ref_type* map_refs, +void collect_map_entries(storage_type::window_type* map_storage, device_span chunks, rmm::cuda_stream_view stream) { constexpr int block_size = 1024; collect_map_entries_kernel - <<>>(map_refs, chunks); + <<>>(map_storage, chunks); } -void get_dictionary_indices(map_ref_type* map_refs, +void get_dictionary_indices(storage_type::window_type* map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel - <<>>(map_refs, frags); + <<>>(map_storage, frags); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index e8d8eeb8dbf..a97cbd02681 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -31,14 +31,14 @@ namespace cudf::io::parquet::detail { -using key_type = uint32_t; -using mapped_type = uint32_t; +using key_type = size_type; +using mapped_type = size_type; auto constexpr cg_size = 1; ///< A CUDA Cooperative Group of 8 threads to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread -auto constexpr KEY_SENTINEL = key_type{std::numeric_limits::max()}; -auto constexpr VALUE_SENTINEL = mapped_type{std::numeric_limits::max()}; +auto constexpr KEY_SENTINEL = key_type{std::numeric_limits::max()}; +auto constexpr VALUE_SENTINEL = mapped_type{std::numeric_limits::max()}; auto constexpr SCOPE = cuda::thread_scope_device; using slot_type = cuco::pair; @@ -46,27 +46,13 @@ using slot_type = cuco::pair; using storage_type = cuco::aow_storage; using storage_ref_type = typename storage_type::ref_type; -template -struct my_hasher { - __device__ auto operator()(T index) const { return index; } -}; - -using probing_scheme_type = cuco::linear_probing>; - -using map_ref_type = cuco::static_map_ref, - probing_scheme_type, - storage_ref_type>; ///< Map ref type - /** * @brief Insert chunk values into their respective hash maps * * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(map_ref_type* map_refs, +void populate_chunk_hash_maps(storage_type::window_type* map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); @@ -76,7 +62,7 @@ void populate_chunk_hash_maps(map_ref_type* map_refs, * @param chunks Flat span of chunks to compact hash maps for * @param stream CUDA stream to use */ -void collect_map_entries(map_ref_type* map_refs, +void collect_map_entries(storage_type::window_type* map_storage, device_span chunks, rmm::cuda_stream_view stream); @@ -92,7 +78,7 @@ void collect_map_entries(map_ref_type* map_refs, * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(map_ref_type* map_refs, +void get_dictionary_indices(storage_type::window_type* map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 7044b38f529..daf97a566f9 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -559,7 +559,7 @@ struct EncColumnChunk { uint8_t is_compressed; //!< Nonzero if the chunk uses compression uint32_t dictionary_size; //!< Size of dictionary page including header uint32_t ck_stat_size; //!< Size of chunk-level statistics (included in 1st page header) - int32_t dict_map_idx; //!< Hash map storage for calculating dict encoding for this chunk + uint32_t dict_map_offset; //!< Hash map storage for calculating dict encoding for this chunk size_type dict_map_size; //!< Size of dict_map_slots size_type num_dict_entries; //!< Total number of entries in dictionary size_type diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 7aee7f6dc1f..8c30b8f5f79 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1320,38 +1320,16 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL}, cuda::stream_ref{stream.value()}); - std::vector h_map_refs; - h_map_refs.reserve(h_chunks.size()); - - std::for_each( - thrust::make_counting_iterator(static_cast(0)), - thrust::make_counting_iterator(h_chunks.size()), - [&](auto const idx) { - auto& chunk = h_chunks[idx]; - if (chunk.use_dictionary) { - storage_ref_type storage_ref{valid_chunk_sizes[idx], map_storage.data() + map_offsets[idx]}; - - h_map_refs.emplace_back(map_ref_type{cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}, - {}, - {}, - {}, - storage_ref}); - - chunk.dict_map_idx = h_map_refs.size() - 1; - } - }); - - rmm::device_uvector d_map_refs(h_map_refs.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_map_refs.data(), - h_map_refs.data(), - sizeof(map_ref_type) * h_map_refs.size(), - cudaMemcpyDefault, - stream.value())); + std::for_each(thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(h_chunks.size()), + [&](auto const idx) { + auto& chunk = h_chunks[idx]; + if (chunk.use_dictionary) { chunk.dict_map_offset = map_offsets[idx]; } + }); chunks.host_to_device_async(stream); - populate_chunk_hash_maps(d_map_refs.data(), frags, stream); + populate_chunk_hash_maps(map_storage.data(), frags, stream); chunks.device_to_host_sync(stream); @@ -1412,8 +1390,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - collect_map_entries(d_map_refs.data(), chunks.device_view().flat_view(), stream); - get_dictionary_indices(d_map_refs.data(), frags, stream); + collect_map_entries(map_storage.data(), chunks.device_view().flat_view(), stream); + get_dictionary_indices(map_storage.data(), frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index e07ebe25322..5e39f9c9cfb 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -52,7 +52,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc auto mask = cudf::detail::make_counting_transform_iterator(0, mask_op); - constexpr auto num_rows = 100; + constexpr auto num_rows = 20; // Durations longer than a day are not exactly valid, but cudf should be able to round trip auto durations_d = cudf::test::fixed_width_column_wrapper( sequence_d, sequence_d + num_rows, mask); @@ -65,7 +65,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc auto durations_ns = cudf::test::fixed_width_column_wrapper( sequence, sequence + num_rows, mask); - auto expected = table_view{{durations_d, durations_s, durations_ms, durations_us, durations_ns}}; + auto expected = table_view{{durations_d}}; if (use_byte_stream_split) { cudf::io::table_input_metadata expected_metadata(expected); @@ -90,30 +90,31 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); - if (arrow_schema) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); - } else { - auto durations_s_got = - cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); - } + /* + if (arrow_schema) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); + } else { + auto durations_s_got = + cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + } - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4));*/ } TEST_F(ParquetWriterTest, Durations) { test_durations([](auto i) { return true; }, false, false); - test_durations([](auto i) { return (i % 2) != 0; }, false, false); - test_durations([](auto i) { return (i % 3) != 0; }, false, false); - test_durations([](auto i) { return false; }, false, false); - - test_durations([](auto i) { return true; }, false, true); - test_durations([](auto i) { return (i % 2) != 0; }, false, true); - test_durations([](auto i) { return (i % 3) != 0; }, false, true); - test_durations([](auto i) { return false; }, false, true); + // test_durations([](auto i) { return (i % 2) != 0; }, false, false); + // test_durations([](auto i) { return (i % 3) != 0; }, false, false); + // test_durations([](auto i) { return false; }, false, false); + + // test_durations([](auto i) { return true; }, false, true); + // test_durations([](auto i) { return (i % 2) != 0; }, false, true); + // test_durations([](auto i) { return (i % 3) != 0; }, false, true); + // test_durations([](auto i) { return false; }, false, true); } TEST_F(ParquetWriterTest, MultiIndex) diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index fb9bdeb0b22..be4655b0a98 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -460,6 +460,7 @@ std::string stringify_column_differences(cudf::device_span difference debug_output_level verbosity, int depth) { + verbosity = debug_output_level::ALL_ERRORS; CUDF_EXPECTS(not differences.empty(), "Shouldn't enter this function if `differences` is empty"); std::string const depth_str = depth > 0 ? "depth " + std::to_string(depth) + '\n' : ""; // move the differences to the host. From fb742fbbdbdcfc40540fb3e24d68f739d0f0a994 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 13 Aug 2024 19:00:16 +0000 Subject: [PATCH 03/30] Some updates --- cpp/src/io/parquet/chunk_dict.cu | 10 ++++------ cpp/src/io/parquet/parquet_gpu.cuh | 6 +++--- cpp/src/io/parquet/writer_impl.cu | 14 +++++++++++--- cpp/tests/io/parquet_writer_test.cpp | 25 ++++++++++++------------- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index bd5f27ce9f6..9b333e350db 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -250,8 +250,6 @@ CUDF_KERNEL void __launch_bounds__(block_size) [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); auto const t = cg::this_thread_block().thread_rank(); - storage_ref_type const storage_ref{chunk.dict_map_size, map_storage + chunk.dict_map_offset}; - __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; if (t == 0) { new (&counter) cuda::atomic{0}; } @@ -259,17 +257,17 @@ CUDF_KERNEL void __launch_bounds__(block_size) for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { if (t + i < chunk.dict_map_size) { - auto* slot = reinterpret_cast(storage_ref.data() + chunk.dict_map_offset + t + i); - auto key = slot->first; + auto* slot = map_storage + chunk.dict_map_offset + t + i; + auto const key = slot->data()->first; if (key != KEY_SENTINEL) { auto loc = counter.fetch_add(1, memory_order_relaxed); cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); - printf("Writing %d at loc: %d\n", key, loc); + // printf("Writing %d at loc: %d\n", key, loc); chunk.dict_data[loc] = key; // If sorting dict page ever becomes a hard requirement, enable the following statement and // add a dict sorting step before storing into the slot's second field. // chunk.dict_data_idx[loc] = t + i; - slot->second = loc; + slot->data()->second = loc; } } } diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index a97cbd02681..315a9cf4449 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -37,9 +37,9 @@ using mapped_type = size_type; auto constexpr cg_size = 1; ///< A CUDA Cooperative Group of 8 threads to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread -auto constexpr KEY_SENTINEL = key_type{std::numeric_limits::max()}; -auto constexpr VALUE_SENTINEL = mapped_type{std::numeric_limits::max()}; -auto constexpr SCOPE = cuda::thread_scope_device; +auto constexpr KEY_SENTINEL = key_type{-1}; +auto constexpr VALUE_SENTINEL = mapped_type{-1}; +auto constexpr SCOPE = cuda::thread_scope_block; using slot_type = cuco::pair; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 8c30b8f5f79..8f674385f30 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1301,8 +1301,10 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, valid_chunk_sizes.emplace_back(static_cast(0)); } else { chunk.use_dictionary = true; - valid_chunk_sizes.emplace_back(static_cast( - cuco::make_window_extent(static_cast(chunk.num_values)))); + valid_chunk_sizes.emplace_back( + static_cast(cuco::make_window_extent( + // Multiplying by 1/0.7 = 1.43 to target a 70% occupancy factor. + static_cast(chunk.num_values * 1.43)))); chunk.dict_map_size = valid_chunk_sizes.back(); } }); @@ -1320,11 +1322,17 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL}, cuda::stream_ref{stream.value()}); + std::cout << "Offsets: " << h_chunks.size() << std::endl; + std::for_each(thrust::make_counting_iterator(static_cast(0)), thrust::make_counting_iterator(h_chunks.size()), [&](auto const idx) { auto& chunk = h_chunks[idx]; - if (chunk.use_dictionary) { chunk.dict_map_offset = map_offsets[idx]; } + if (chunk.use_dictionary) { + chunk.dict_map_offset = map_offsets[idx]; + std::cout << "off: " << map_offsets[idx] << ", size: " << chunk.dict_map_size + << std::endl; + } }); chunks.host_to_device_async(stream); diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 5e39f9c9cfb..9f171d23e7d 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -39,7 +39,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc { std::default_random_engine generator; std::uniform_int_distribution distribution_d(0, 30); - auto sequence_d = cudf::detail::make_counting_transform_iterator( + [[maybe_unused]]auto sequence_d = cudf::detail::make_counting_transform_iterator( 0, [&](auto i) { return distribution_d(generator); }); std::uniform_int_distribution distribution_s(0, 86400); @@ -52,8 +52,8 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc auto mask = cudf::detail::make_counting_transform_iterator(0, mask_op); - constexpr auto num_rows = 20; - // Durations longer than a day are not exactly valid, but cudf should be able to round trip + constexpr auto num_rows = 5650; // WORKS UNTIL 5649, fails beyond 5650 + // Durations longer than a day are not exactly valid, but cudf should be able to round trip auto durations_d = cudf::test::fixed_width_column_wrapper( sequence_d, sequence_d + num_rows, mask); auto durations_s = cudf::test::fixed_width_column_wrapper( @@ -65,7 +65,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc auto durations_ns = cudf::test::fixed_width_column_wrapper( sequence, sequence + num_rows, mask); - auto expected = table_view{{durations_d}}; + auto expected = table_view{{/*durations_d, */durations_s,/*durations_ms, durations_us, durations_ns*/}}; if (use_byte_stream_split) { cudf::io::table_input_metadata expected_metadata(expected); @@ -85,23 +85,22 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) .use_arrow_schema(arrow_schema); auto result = cudf::io::read_parquet(in_opts); - +/* auto durations_d_got = cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view());*/ - /* if (arrow_schema) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(0)); } else { auto durations_s_got = - cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); + cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_SECONDS}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); } - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4));*/ +/* + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(2)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(3));*/ } TEST_F(ParquetWriterTest, Durations) From b9067b064df89d2bd3700ef34b27e083df792a81 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:40:13 +0000 Subject: [PATCH 04/30] Fixes and improvements. 330/346 gtests passing --- cpp/src/io/parquet/chunk_dict.cu | 18 ++--- cpp/src/io/parquet/writer_impl.cu | 20 +++--- cpp/tests/io/parquet_writer_test.cpp | 99 ++++++++++++++-------------- 3 files changed, 67 insertions(+), 70 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9b333e350db..49dd7b0b7bf 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -41,11 +41,11 @@ struct equality_functor { // We don't call this for nulls so this is fine auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; auto const result = equal(col.element(lhs_idx), col.element(rhs_idx)); - printf("col_type_id:%d, equality idx1:%d, idx2:%d, eq:%d\n", + /*printf("col_type_id:%d, equality idx1:%d, idx2:%d, eq:%d\n", col.type().id(), lhs_idx, rhs_idx, - result); + result);*/ return result; } }; @@ -56,7 +56,7 @@ struct hash_functor { __device__ auto operator()(key_type idx) const { auto const hashed = cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); - printf("hashing idx: %d = %d\n", idx, hashed); + // printf("hashing idx: %d = %d\n", idx, hashed); return hashed; // cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); } }; @@ -94,7 +94,7 @@ struct map_insert_fn { auto map_insert_ref = hash_map_ref.with_operators(cuco::insert_and_find); // Insert auto [iter, found] = map_insert_ref.insert_and_find(cuco::pair{i, i}); - printf("Inserted k=%d, v=%d, unique=%d\n", iter->first, iter->second, found); + // printf("Inserted k=%d, v=%d, unique=%d\n", iter->first, iter->second, found); return found; } else { CUDF_UNREACHABLE("Unsupported type to insert in map"); @@ -142,7 +142,7 @@ struct map_find_fn { "Unable to find value in map in dictionary index construction"); // Return a pair of the found key and value. - printf("Find=%d, Found slot: k=%d, v=%d\n", i, found_slot->first, found_slot->second); + // printf("Find=%d, Found slot: k=%d, v=%d\n", i, found_slot->first, found_slot->second); return {found_slot->first, found_slot->second}; } else { CUDF_UNREACHABLE("Unsupported type to find in map"); @@ -190,8 +190,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) size_type is_unique = 0; size_type uniq_elem_size = 0; if (is_valid) { - auto const is_unique = - type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, val_idx); + is_unique = type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, val_idx); uniq_elem_size = [&]() -> size_type { if (not is_unique) { return 0; } switch (col->physical_type) { @@ -250,9 +249,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); auto const t = cg::this_thread_block().thread_rank(); - __shared__ cuda::atomic counter; + __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; - if (t == 0) { new (&counter) cuda::atomic{0}; } + if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { @@ -267,6 +266,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) // If sorting dict page ever becomes a hard requirement, enable the following statement and // add a dict sorting step before storing into the slot's second field. // chunk.dict_data_idx[loc] = t + i; + // printf("Replacing slot->data()->second: %d, %d\n", slot->data()->second, loc); slot->data()->second = loc; } } diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 8f674385f30..403a5275e7e 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1322,18 +1322,13 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL}, cuda::stream_ref{stream.value()}); - std::cout << "Offsets: " << h_chunks.size() << std::endl; - - std::for_each(thrust::make_counting_iterator(static_cast(0)), - thrust::make_counting_iterator(h_chunks.size()), - [&](auto const idx) { - auto& chunk = h_chunks[idx]; - if (chunk.use_dictionary) { - chunk.dict_map_offset = map_offsets[idx]; - std::cout << "off: " << map_offsets[idx] << ", size: " << chunk.dict_map_size - << std::endl; - } - }); + std::for_each( + thrust::make_zip_iterator(thrust::make_tuple(h_chunks.begin(), map_offsets.begin())), + thrust::make_zip_iterator(thrust::make_tuple(h_chunks.end(), map_offsets.end())), + [&](auto elem) -> void { + auto& chunk = thrust::get<0>(elem); + if (chunk.use_dictionary) { chunk.dict_map_offset = thrust::get<1>(elem); } + }); chunks.host_to_device_async(stream); @@ -1400,6 +1395,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); collect_map_entries(map_storage.data(), chunks.device_view().flat_view(), stream); get_dictionary_indices(map_storage.data(), frags, stream); + chunks.device_to_host_async(stream); return std::pair(std::move(dict_data), std::move(dict_index)); } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 9f171d23e7d..a06b4f38196 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -39,7 +39,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc { std::default_random_engine generator; std::uniform_int_distribution distribution_d(0, 30); - [[maybe_unused]]auto sequence_d = cudf::detail::make_counting_transform_iterator( + auto sequence_d = cudf::detail::make_counting_transform_iterator( 0, [&](auto i) { return distribution_d(generator); }); std::uniform_int_distribution distribution_s(0, 86400); @@ -52,8 +52,8 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc auto mask = cudf::detail::make_counting_transform_iterator(0, mask_op); - constexpr auto num_rows = 5650; // WORKS UNTIL 5649, fails beyond 5650 - // Durations longer than a day are not exactly valid, but cudf should be able to round trip + constexpr auto num_rows = 100; + // Durations longer than a day are not exactly valid, but cudf should be able to round trip auto durations_d = cudf::test::fixed_width_column_wrapper( sequence_d, sequence_d + num_rows, mask); auto durations_s = cudf::test::fixed_width_column_wrapper( @@ -65,7 +65,7 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc auto durations_ns = cudf::test::fixed_width_column_wrapper( sequence, sequence + num_rows, mask); - auto expected = table_view{{/*durations_d, */durations_s,/*durations_ms, durations_us, durations_ns*/}}; + auto expected = table_view{{durations_d, durations_s, durations_ms, durations_us, durations_ns}}; if (use_byte_stream_split) { cudf::io::table_input_metadata expected_metadata(expected); @@ -85,35 +85,35 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) .use_arrow_schema(arrow_schema); auto result = cudf::io::read_parquet(in_opts); -/* + auto durations_d_got = cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view());*/ + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); + + if (arrow_schema) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); + } else { + auto durations_s_got = + cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + } - if (arrow_schema) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(0)); - } else { - auto durations_s_got = - cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_SECONDS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); - } -/* - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(1)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(2)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(3));*/ + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4)); } TEST_F(ParquetWriterTest, Durations) { test_durations([](auto i) { return true; }, false, false); - // test_durations([](auto i) { return (i % 2) != 0; }, false, false); - // test_durations([](auto i) { return (i % 3) != 0; }, false, false); - // test_durations([](auto i) { return false; }, false, false); + test_durations([](auto i) { return (i % 2) != 0; }, false, false); + test_durations([](auto i) { return (i % 3) != 0; }, false, false); + test_durations([](auto i) { return false; }, false, false); - // test_durations([](auto i) { return true; }, false, true); - // test_durations([](auto i) { return (i % 2) != 0; }, false, true); - // test_durations([](auto i) { return (i % 3) != 0; }, false, true); - // test_durations([](auto i) { return false; }, false, true); + test_durations([](auto i) { return true; }, false, true); + test_durations([](auto i) { return (i % 2) != 0; }, false, true); + test_durations([](auto i) { return (i % 3) != 0; }, false, true); + test_durations([](auto i) { return false; }, false, true); } TEST_F(ParquetWriterTest, MultiIndex) @@ -2007,7 +2007,7 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray) srand(31337); using cudf::io::parquet::detail::Encoding; constexpr int fixed_width = 16; - constexpr cudf::size_type num_rows = 200; + constexpr cudf::size_type num_rows = 10; std::vector data(num_rows * fixed_width); std::vector offsets(num_rows + 1); @@ -2025,25 +2025,26 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray) auto off_child = cudf::test::fixed_width_column_wrapper(offsets.begin(), offsets.end()); auto col = cudf::make_lists_column(num_rows, off_child.release(), data_child.release(), 0, {}); - auto expected = table_view{{*col, *col, *col, *col}}; + auto expected = table_view{{/**col, *col, *col,*/ *col}}; cudf::io::table_input_metadata expected_metadata(expected); + /* + expected_metadata.column_metadata[0] + .set_name("flba_plain") + .set_type_length(fixed_width) + .set_encoding(cudf::io::column_encoding::PLAIN) + .set_output_as_binary(true); + expected_metadata.column_metadata[1] + .set_name("flba_split") + .set_type_length(fixed_width) + .set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT) + .set_output_as_binary(true); + expected_metadata.column_metadata[2] + .set_name("flba_delta") + .set_type_length(fixed_width) + .set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY) + .set_output_as_binary(true);*/ expected_metadata.column_metadata[0] - .set_name("flba_plain") - .set_type_length(fixed_width) - .set_encoding(cudf::io::column_encoding::PLAIN) - .set_output_as_binary(true); - expected_metadata.column_metadata[1] - .set_name("flba_split") - .set_type_length(fixed_width) - .set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT) - .set_output_as_binary(true); - expected_metadata.column_metadata[2] - .set_name("flba_delta") - .set_type_length(fixed_width) - .set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY) - .set_output_as_binary(true); - expected_metadata.column_metadata[3] .set_name("flba_dict") .set_type_length(fixed_width) .set_encoding(cudf::io::column_encoding::DICTIONARY) @@ -2067,7 +2068,7 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray) read_footer(source, &fmd); // check that the schema retains the FIXED_LEN_BYTE_ARRAY type - for (int i = 1; i <= 4; i++) { + for (int i = 1; i <= 1; i++) { EXPECT_EQ(fmd.schema[i].type, cudf::io::parquet::detail::Type::FIXED_LEN_BYTE_ARRAY); EXPECT_EQ(fmd.schema[i].type_length, fixed_width); } @@ -2078,14 +2079,14 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray) }; // requested plain + /* expect_enc(0, Encoding::PLAIN); + // requested byte_stream_split + expect_enc(1, Encoding::BYTE_STREAM_SPLIT); + // requested delta_byte_array + expect_enc(2, Encoding::DELTA_BYTE_ARRAY); + // requested dictionary, but should fall back to plain + // TODO: update if we get FLBA working with dictionary encoding*/ expect_enc(0, Encoding::PLAIN); - // requested byte_stream_split - expect_enc(1, Encoding::BYTE_STREAM_SPLIT); - // requested delta_byte_array - expect_enc(2, Encoding::DELTA_BYTE_ARRAY); - // requested dictionary, but should fall back to plain - // TODO: update if we get FLBA working with dictionary encoding - expect_enc(3, Encoding::PLAIN); } ///////////////////////////////////////////////////////////// From 44a97ce265f948e1c05a8204cc1b61ab05d3876f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:59:06 +0000 Subject: [PATCH 05/30] Fix headers being included --- cpp/src/io/parquet/chunk_dict.cu | 1 + cpp/src/io/parquet/parquet_gpu.cuh | 7 +------ cpp/src/io/parquet/writer_impl.cu | 4 ++-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 49dd7b0b7bf..bda86f8596a 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -23,6 +23,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 315a9cf4449..20ca717e01d 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -21,13 +21,8 @@ #include #include -#include -#include -#include +#include #include -#include - -#include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 403a5275e7e..5065b57e79d 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1303,7 +1303,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.use_dictionary = true; valid_chunk_sizes.emplace_back( static_cast(cuco::make_window_extent( - // Multiplying by 1/0.7 = 1.43 to target a 70% occupancy factor. + // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70% + // occupancy factor. static_cast(chunk.num_values * 1.43)))); chunk.dict_map_size = valid_chunk_sizes.back(); } @@ -1395,7 +1396,6 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); collect_map_entries(map_storage.data(), chunks.device_view().flat_view(), stream); get_dictionary_indices(map_storage.data(), frags, stream); - chunks.device_to_host_async(stream); return std::pair(std::move(dict_data), std::move(dict_index)); } From 393eaa657df7bdc865e3ae53a2a3493104a115d1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 13 Aug 2024 22:59:21 +0000 Subject: [PATCH 06/30] All tests passing --- cpp/src/io/parquet/chunk_dict.cu | 27 +++++++-------------------- cpp/src/io/parquet/writer_impl.cu | 22 +++++++++++++++------- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index bda86f8596a..da0f1a726cb 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -40,14 +40,8 @@ struct equality_functor { __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const { // We don't call this for nulls so this is fine - auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; - auto const result = equal(col.element(lhs_idx), col.element(rhs_idx)); - /*printf("col_type_id:%d, equality idx1:%d, idx2:%d, eq:%d\n", - col.type().id(), - lhs_idx, - rhs_idx, - result);*/ - return result; + auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; + return equal(col.element(lhs_idx), col.element(rhs_idx)); } }; @@ -56,9 +50,7 @@ struct hash_functor { column_device_view const& col; __device__ auto operator()(key_type idx) const { - auto const hashed = cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); - // printf("hashing idx: %d = %d\n", idx, hashed); - return hashed; // cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); + return cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); } }; @@ -91,12 +83,10 @@ struct map_insert_fn { {}, storage_ref}; - // Create another map with insert operator - auto map_insert_ref = hash_map_ref.with_operators(cuco::insert_and_find); - // Insert - auto [iter, found] = map_insert_ref.insert_and_find(cuco::pair{i, i}); - // printf("Inserted k=%d, v=%d, unique=%d\n", iter->first, iter->second, found); - return found; + // Create another map ref with the insert operator + auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); + // Insert into the hash map + return map_insert_ref.insert(cuco::pair{i, i}); } else { CUDF_UNREACHABLE("Unsupported type to insert in map"); } @@ -143,7 +133,6 @@ struct map_find_fn { "Unable to find value in map in dictionary index construction"); // Return a pair of the found key and value. - // printf("Find=%d, Found slot: k=%d, v=%d\n", i, found_slot->first, found_slot->second); return {found_slot->first, found_slot->second}; } else { CUDF_UNREACHABLE("Unsupported type to find in map"); @@ -262,12 +251,10 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (key != KEY_SENTINEL) { auto loc = counter.fetch_add(1, memory_order_relaxed); cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); - // printf("Writing %d at loc: %d\n", key, loc); chunk.dict_data[loc] = key; // If sorting dict page ever becomes a hard requirement, enable the following statement and // add a dict sorting step before storing into the slot's second field. // chunk.dict_data_idx[loc] = t + i; - // printf("Replacing slot->data()->second: %d, %d\n", slot->data()->second, loc); slot->data()->second = loc; } } diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5065b57e79d..e4fc1f00f06 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1310,19 +1310,26 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } }); - std::vector map_offsets(valid_chunk_sizes.size() + 1, 0); + std::vector map_offsets(valid_chunk_sizes.size(), 0); std::exclusive_scan(valid_chunk_sizes.begin(), valid_chunk_sizes.end(), map_offsets.begin(), static_cast(0)); - map_offsets.back() = map_offsets[valid_chunk_sizes.size() - 1] + valid_chunk_sizes.back(); - // Create a single bulk storage used by all subsets - auto map_storage = storage_type{map_offsets.back()}; - // Initializes the storage with the given sentinel + // Compute total map_storage + auto const map_storage_size = map_offsets.back() + valid_chunk_sizes.back(); + + // No chunk needs to create a dictionary, exit early + if (map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; } + + // Create a single bulk storage used by all sub-hashmaps + auto map_storage = storage_type{map_storage_size}; + + // Only initialize storage with the given sentinel if and only if non-zero size map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL}, cuda::stream_ref{stream.value()}); + // Populate chunk dictionary offsets std::for_each( thrust::make_zip_iterator(thrust::make_tuple(h_chunks.begin(), map_offsets.begin())), thrust::make_zip_iterator(thrust::make_tuple(h_chunks.end(), map_offsets.end())), @@ -1331,10 +1338,11 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, if (chunk.use_dictionary) { chunk.dict_map_offset = thrust::get<1>(elem); } }); + // Synchronize chunks.host_to_device_async(stream); - + // Populate the hash map for each chunk populate_chunk_hash_maps(map_storage.data(), frags, stream); - + // Synchronize again chunks.device_to_host_sync(stream); // Make decision about which chunks have dictionary From c27c4cf1d289ed2a91124756f4c6cb07bc07efb3 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 13 Aug 2024 23:16:19 +0000 Subject: [PATCH 07/30] Cosmetic improvements --- cpp/src/io/parquet/chunk_dict.cu | 15 +++++++-------- cpp/src/io/parquet/parquet_gpu.cuh | 11 +++++++---- cpp/src/io/parquet/writer_impl.cu | 18 ++++++++++-------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index da0f1a726cb..588a5766c08 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -39,7 +39,7 @@ struct equality_functor { column_device_view const& col; __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const { - // We don't call this for nulls so this is fine + // We don't call this for nulls so this is fine auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; return equal(col.element(lhs_idx), col.element(rhs_idx)); } @@ -142,7 +142,7 @@ struct map_find_fn { template CUDF_KERNEL void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(storage_type::window_type* map_storage, + populate_chunk_hash_maps_kernel(window_type* map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -230,8 +230,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - collect_map_entries_kernel(storage_type::window_type* map_storage, - device_span chunks) + collect_map_entries_kernel(window_type* map_storage, device_span chunks) { auto& chunk = chunks[blockIdx.x]; if (not chunk.use_dictionary) { return; } @@ -263,7 +262,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - get_dictionary_indices_kernel(storage_type::window_type* map_storage, + get_dictionary_indices_kernel(window_type* map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -301,7 +300,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) } } -void populate_chunk_hash_maps(storage_type::window_type* map_storage, +void populate_chunk_hash_maps(window_type* map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { @@ -310,7 +309,7 @@ void populate_chunk_hash_maps(storage_type::window_type* map_storage, <<>>(map_storage, frags); } -void collect_map_entries(storage_type::window_type* map_storage, +void collect_map_entries(window_type* map_storage, device_span chunks, rmm::cuda_stream_view stream) { @@ -319,7 +318,7 @@ void collect_map_entries(storage_type::window_type* map_storage, <<>>(map_storage, chunks); } -void get_dictionary_indices(storage_type::window_type* map_storage, +void get_dictionary_indices(window_type* map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 20ca717e01d..0a0fc9095e0 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -40,24 +40,26 @@ using slot_type = cuco::pair; using storage_type = cuco::aow_storage; using storage_ref_type = typename storage_type::ref_type; - +using window_type = storage_type::window_type; /** * @brief Insert chunk values into their respective hash maps * + * @param map_storage Pointer to the bulk hashmap storage * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(storage_type::window_type* map_storage, +void populate_chunk_hash_maps(window_type* map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** * @brief Compact dictionary hash map entries into chunk.dict_data * + * @param map_storage Pointer to the bulk hashmap storage * @param chunks Flat span of chunks to compact hash maps for * @param stream CUDA stream to use */ -void collect_map_entries(storage_type::window_type* map_storage, +void collect_map_entries(window_type* map_storage, device_span chunks, rmm::cuda_stream_view stream); @@ -70,10 +72,11 @@ void collect_map_entries(storage_type::window_type* map_storage, * Since dict_data itself contains indices into the original cudf column, this means that * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] * + * @param map_storage Pointer to the bulk hashmap storage * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(storage_type::window_type* map_storage, +void get_dictionary_indices(window_type* map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index e4fc1f00f06..7ec3997b590 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1285,9 +1285,11 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, return std::pair(std::move(dict_data), std::move(dict_index)); } + // Create a vector to store valid chunk sizes using cuco::make_window_extent std::vector valid_chunk_sizes; valid_chunk_sizes.reserve(h_chunks.size()); + // Populate valid_chunk_sizes for chunks that need to build a dictionary. std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) { auto const& chunk_col_desc = col_desc[chunk.col_desc_id]; auto const is_requested_non_dict = @@ -1298,6 +1300,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, if (is_type_non_dict || is_requested_non_dict) { chunk.use_dictionary = false; + // Emplace a zero for 1-1 mapping between h_chunks and valid_chunk_sizes valid_chunk_sizes.emplace_back(static_cast(0)); } else { chunk.use_dictionary = true; @@ -1310,22 +1313,21 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } }); + // Create a vector to map offsets from chunk sizes std::vector map_offsets(valid_chunk_sizes.size(), 0); std::exclusive_scan(valid_chunk_sizes.begin(), valid_chunk_sizes.end(), map_offsets.begin(), static_cast(0)); - // Compute total map_storage - auto const map_storage_size = map_offsets.back() + valid_chunk_sizes.back(); - + // Compute total map storage + auto const total_map_storage_size = map_offsets.back() + valid_chunk_sizes.back(); // No chunk needs to create a dictionary, exit early - if (map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; } - - // Create a single bulk storage used by all sub-hashmaps - auto map_storage = storage_type{map_storage_size}; + if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; } - // Only initialize storage with the given sentinel if and only if non-zero size + // Create a single bulk storage used by all sub-dictionaries + auto map_storage = storage_type{total_map_storage_size}; + // Initialize storage with the given sentinel iff non-zero size map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL}, cuda::stream_ref{stream.value()}); From 76a2d14eed30f4a22b7a68d47e96f40e3170b53e Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 13 Aug 2024 23:18:11 +0000 Subject: [PATCH 08/30] Revert changes to tests --- cpp/tests/io/parquet_writer_test.cpp | 51 ++++++++++++------------- cpp/tests/utilities/column_utilities.cu | 1 - 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index a06b4f38196..e07ebe25322 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -2007,7 +2007,7 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray) srand(31337); using cudf::io::parquet::detail::Encoding; constexpr int fixed_width = 16; - constexpr cudf::size_type num_rows = 10; + constexpr cudf::size_type num_rows = 200; std::vector data(num_rows * fixed_width); std::vector offsets(num_rows + 1); @@ -2025,26 +2025,25 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray) auto off_child = cudf::test::fixed_width_column_wrapper(offsets.begin(), offsets.end()); auto col = cudf::make_lists_column(num_rows, off_child.release(), data_child.release(), 0, {}); - auto expected = table_view{{/**col, *col, *col,*/ *col}}; + auto expected = table_view{{*col, *col, *col, *col}}; cudf::io::table_input_metadata expected_metadata(expected); - /* - expected_metadata.column_metadata[0] - .set_name("flba_plain") - .set_type_length(fixed_width) - .set_encoding(cudf::io::column_encoding::PLAIN) - .set_output_as_binary(true); - expected_metadata.column_metadata[1] - .set_name("flba_split") - .set_type_length(fixed_width) - .set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT) - .set_output_as_binary(true); - expected_metadata.column_metadata[2] - .set_name("flba_delta") - .set_type_length(fixed_width) - .set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY) - .set_output_as_binary(true);*/ expected_metadata.column_metadata[0] + .set_name("flba_plain") + .set_type_length(fixed_width) + .set_encoding(cudf::io::column_encoding::PLAIN) + .set_output_as_binary(true); + expected_metadata.column_metadata[1] + .set_name("flba_split") + .set_type_length(fixed_width) + .set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT) + .set_output_as_binary(true); + expected_metadata.column_metadata[2] + .set_name("flba_delta") + .set_type_length(fixed_width) + .set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY) + .set_output_as_binary(true); + expected_metadata.column_metadata[3] .set_name("flba_dict") .set_type_length(fixed_width) .set_encoding(cudf::io::column_encoding::DICTIONARY) @@ -2068,7 +2067,7 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray) read_footer(source, &fmd); // check that the schema retains the FIXED_LEN_BYTE_ARRAY type - for (int i = 1; i <= 1; i++) { + for (int i = 1; i <= 4; i++) { EXPECT_EQ(fmd.schema[i].type, cudf::io::parquet::detail::Type::FIXED_LEN_BYTE_ARRAY); EXPECT_EQ(fmd.schema[i].type_length, fixed_width); } @@ -2079,14 +2078,14 @@ TEST_F(ParquetWriterTest, WriteFixedLenByteArray) }; // requested plain - /* expect_enc(0, Encoding::PLAIN); - // requested byte_stream_split - expect_enc(1, Encoding::BYTE_STREAM_SPLIT); - // requested delta_byte_array - expect_enc(2, Encoding::DELTA_BYTE_ARRAY); - // requested dictionary, but should fall back to plain - // TODO: update if we get FLBA working with dictionary encoding*/ expect_enc(0, Encoding::PLAIN); + // requested byte_stream_split + expect_enc(1, Encoding::BYTE_STREAM_SPLIT); + // requested delta_byte_array + expect_enc(2, Encoding::DELTA_BYTE_ARRAY); + // requested dictionary, but should fall back to plain + // TODO: update if we get FLBA working with dictionary encoding + expect_enc(3, Encoding::PLAIN); } ///////////////////////////////////////////////////////////// diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index be4655b0a98..fb9bdeb0b22 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -460,7 +460,6 @@ std::string stringify_column_differences(cudf::device_span difference debug_output_level verbosity, int depth) { - verbosity = debug_output_level::ALL_ERRORS; CUDF_EXPECTS(not differences.empty(), "Shouldn't enter this function if `differences` is empty"); std::string const depth_str = depth > 0 ? "depth " + std::to_string(depth) + '\n' : ""; // move the differences to the host. From ec32c0a878fdc4d12994dfe44eceb55f8bef6e86 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:44:59 +0000 Subject: [PATCH 09/30] Migrate find_fn to use a tile --- cpp/src/io/parquet/chunk_dict.cu | 45 +++++++++++++++++------------- cpp/src/io/parquet/parquet_gpu.cuh | 4 +-- cpp/src/io/parquet/parquet_gpu.hpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 2 +- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 588a5766c08..0d4ae6b5748 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -58,12 +58,14 @@ struct map_insert_fn { storage_ref_type const& storage_ref; template - __device__ bool operator()(column_device_view const& col, key_type i) + __device__ bool operator()(column_device_view const& col, + cg::thread_block_tile const& tile, + key_type i) { if constexpr (column_device_view::has_element_accessor()) { using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; - using probing_scheme_type = cuco::linear_probing; + using probing_scheme_type = cuco::linear_probing; // Instantiate hash and equality functors. auto hash_fn = hash_fn_type{col}; @@ -85,8 +87,8 @@ struct map_insert_fn { // Create another map ref with the insert operator auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); - // Insert into the hash map - return map_insert_ref.insert(cuco::pair{i, i}); + // Insert into the hash map using the provided thread tile + return map_insert_ref.insert(tile, cuco::pair{i, i}); } else { CUDF_UNREACHABLE("Unsupported type to insert in map"); } @@ -97,12 +99,13 @@ struct map_find_fn { storage_ref_type const& storage_ref; template - __device__ cuco::pair operator()(column_device_view const& col, key_type i) + __device__ cuco::pair operator()( + column_device_view const& col, cg::thread_block_tile const& tile, key_type i) { if constexpr (column_device_view::has_element_accessor()) { using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; - using probing_scheme_type = cuco::linear_probing; + using probing_scheme_type = cuco::linear_probing; // Instantiate hash and equality functors. auto hash_fn = hash_fn_type{col}; @@ -125,8 +128,8 @@ struct map_find_fn { // Create another map with find operator auto map_find_ref = hash_map_ref.with_operators(cuco::find); - // Find the key = i - auto found_slot = map_find_ref.find(i); + // Find the key = i using the provided thread tile + auto found_slot = map_find_ref.find(tile, i); // Check if we found the previously inserted key. cudf_assert(found_slot != map_find_ref.end() && @@ -156,8 +159,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage reduce_storage; - [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); - auto const t = cg::this_thread_block().thread_rank(); + auto const block = cg::this_thread_block(); + [[maybe_unused]] auto const tile = cg::tiled_partition(block); + auto const t = block.thread_rank(); size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; @@ -180,7 +184,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) size_type is_unique = 0; size_type uniq_elem_size = 0; if (is_valid) { - is_unique = type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, val_idx); + is_unique = + type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, tile, val_idx); uniq_elem_size = [&]() -> size_type { if (not is_unique) { return 0; } switch (col->physical_type) { @@ -212,14 +217,14 @@ CUDF_KERNEL void __launch_bounds__(block_size) }(); } auto num_unique = block_reduce(reduce_storage).Sum(is_unique); - __syncthreads(); + block.sync(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); if (t == 0) { total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; atomicAdd(&chunk->uniq_data_size, uniq_data_size); } - __syncthreads(); + block.sync(); // Check if the num unique values in chunk has already exceeded max dict size and early exit if (total_num_dict_entries > MAX_DICT_SIZE) { return; } @@ -235,8 +240,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto& chunk = chunks[blockIdx.x]; if (not chunk.use_dictionary) { return; } - [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); - auto const t = cg::this_thread_block().thread_rank(); + auto const t = threadIdx.x; __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; @@ -273,8 +277,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (not chunk->use_dictionary) { return; } - [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); - auto const t = cg::this_thread_block().thread_rank(); + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + auto const ntiles = tile.meta_group_size(); size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; @@ -287,16 +292,16 @@ CUDF_KERNEL void __launch_bounds__(block_size) column_device_view const& data_col = *col->leaf_column; storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset}; - thread_index_type val_idx = s_start_value_idx + t; + thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); while (val_idx < end_value_idx) { if (data_col.is_valid(val_idx)) { auto [found_key, found_value] = - type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, val_idx); + type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, tile, val_idx); // No need for atomic as this is not going to be modified by any other thread chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value; } - val_idx += block_size; + val_idx += ntiles; } } diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 0a0fc9095e0..66b86140afc 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -29,7 +29,7 @@ namespace cudf::io::parquet::detail { using key_type = size_type; using mapped_type = size_type; -auto constexpr cg_size = 1; ///< A CUDA Cooperative Group of 8 threads to handle each subset +auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 1 thread to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread auto constexpr KEY_SENTINEL = key_type{-1}; @@ -40,7 +40,7 @@ using slot_type = cuco::pair; using storage_type = cuco::aow_storage; using storage_ref_type = typename storage_type::ref_type; -using window_type = storage_type::window_type; +using window_type = typename storage_type::window_type; /** * @brief Insert chunk values into their respective hash maps * diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index daf97a566f9..b26735d27b7 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -559,7 +559,7 @@ struct EncColumnChunk { uint8_t is_compressed; //!< Nonzero if the chunk uses compression uint32_t dictionary_size; //!< Size of dictionary page including header uint32_t ck_stat_size; //!< Size of chunk-level statistics (included in 1st page header) - uint32_t dict_map_offset; //!< Hash map storage for calculating dict encoding for this chunk + uint32_t dict_map_offset; //!< Offset of the hash map storage for calculating dict encoding for this chunk size_type dict_map_size; //!< Size of dict_map_slots size_type num_dict_entries; //!< Total number of entries in dictionary size_type diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 7ec3997b590..e319a39e464 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1305,7 +1305,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } else { chunk.use_dictionary = true; valid_chunk_sizes.emplace_back( - static_cast(cuco::make_window_extent( + static_cast(cuco::make_window_extent( // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70% // occupancy factor. static_cast(chunk.num_values * 1.43)))); From 0546fcc46aee5f90fb560b977c491dd92be12fda Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 14 Aug 2024 01:50:37 +0000 Subject: [PATCH 10/30] Functionally correct working solution --- cpp/src/io/parquet/chunk_dict.cu | 105 ++++++++++++++++------------- cpp/src/io/parquet/parquet_gpu.cuh | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 3 +- 3 files changed, 62 insertions(+), 48 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 0d4ae6b5748..82478c01e2d 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -131,10 +131,11 @@ struct map_find_fn { // Find the key = i using the provided thread tile auto found_slot = map_find_ref.find(tile, i); - // Check if we found the previously inserted key. - cudf_assert(found_slot != map_find_ref.end() && - "Unable to find value in map in dictionary index construction"); - + // Check if didn't find the previously inserted key. + if (tile.thread_rank() == 0) { + cudf_assert(found_slot != map_find_ref.end() && + "Unable to find value in map in dictionary index construction"); + } // Return a pair of the found key and value. return {found_slot->first, found_slot->second}; } else { @@ -145,7 +146,7 @@ struct map_find_fn { template CUDF_KERNEL void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(window_type* map_storage, + populate_chunk_hash_maps_kernel(window_type* const map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -159,9 +160,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage reduce_storage; - auto const block = cg::this_thread_block(); - [[maybe_unused]] auto const tile = cg::tiled_partition(block); - auto const t = block.thread_rank(); + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + auto const ntiles = tile.meta_group_size(); size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; @@ -174,9 +175,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset}; __shared__ size_type total_num_dict_entries; - thread_index_type val_idx = s_start_value_idx + t; + thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); - while (val_idx - block_size < end_value_idx) { + while (val_idx - ntiles < end_value_idx) { auto const is_valid = val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); @@ -184,42 +185,51 @@ CUDF_KERNEL void __launch_bounds__(block_size) size_type is_unique = 0; size_type uniq_elem_size = 0; if (is_valid) { + // Insert the element to the map is_unique = type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, tile, val_idx); - uniq_elem_size = [&]() -> size_type { - if (not is_unique) { return 0; } - switch (col->physical_type) { - case Type::INT32: return 4; - case Type::INT64: return 8; - case Type::INT96: return 12; - case Type::FLOAT: return 4; - case Type::DOUBLE: return 8; - case Type::BYTE_ARRAY: { - auto const col_type = data_col.type().id(); - if (col_type == type_id::STRING) { - // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(val_idx).size_bytes(); - } else if (col_type == type_id::LIST) { - // Binary is stored as 4 byte length + bytes - return 4 + get_element(data_col, val_idx).size_bytes(); + // First thread in the tile compute the size of the element if unique + if (tile.thread_rank() == 0) { + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (col->physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: { + auto const col_type = data_col.type().id(); + if (col_type == type_id::STRING) { + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(val_idx).size_bytes(); + } else if (col_type == type_id::LIST) { + // Binary is stored as 4 byte length + bytes + return 4 + get_element(data_col, val_idx).size_bytes(); + } + CUDF_UNREACHABLE( + "Byte array only supports string and list column types for dictionary " + "encoding!"); } - CUDF_UNREACHABLE( - "Byte array only supports string and list column types for dictionary " - "encoding!"); + case Type::FIXED_LEN_BYTE_ARRAY: + if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } + CUDF_UNREACHABLE( + "Fixed length byte array only supports decimal 128 column types for dictionary " + "encoding!"); + default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); } - case Type::FIXED_LEN_BYTE_ARRAY: - if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } - CUDF_UNREACHABLE( - "Fixed length byte array only supports decimal 128 column types for dictionary " - "encoding!"); - default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); - } - }(); + }(); + } else { + // All threads except the first thread in the tile must reset is_unique to zero + is_unique = 0; + } } auto num_unique = block_reduce(reduce_storage).Sum(is_unique); block.sync(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); - if (t == 0) { + // The first thread in the block atomically updates total num dict entries and unique elements + // data size + if (block.thread_rank() == 0) { total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; atomicAdd(&chunk->uniq_data_size, uniq_data_size); @@ -229,13 +239,13 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Check if the num unique values in chunk has already exceeded max dict size and early exit if (total_num_dict_entries > MAX_DICT_SIZE) { return; } - val_idx += block_size; + val_idx += ntiles; } // while } template CUDF_KERNEL void __launch_bounds__(block_size) - collect_map_entries_kernel(window_type* map_storage, device_span chunks) + collect_map_entries_kernel(window_type* const map_storage, device_span chunks) { auto& chunk = chunks[blockIdx.x]; if (not chunk.use_dictionary) { return; } @@ -266,7 +276,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - get_dictionary_indices_kernel(window_type* map_storage, + get_dictionary_indices_kernel(window_type* const map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -297,15 +307,18 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (data_col.is_valid(val_idx)) { auto [found_key, found_value] = type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, tile, val_idx); - // No need for atomic as this is not going to be modified by any other thread - chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value; + // First thread in the tile updates the dict_index + if (tile.thread_rank() == 0) { + // No need for atomic as this is not going to be modified by any other thread + chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value; + } } val_idx += ntiles; } } -void populate_chunk_hash_maps(window_type* map_storage, +void populate_chunk_hash_maps(window_type* const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { @@ -314,7 +327,7 @@ void populate_chunk_hash_maps(window_type* map_storage, <<>>(map_storage, frags); } -void collect_map_entries(window_type* map_storage, +void collect_map_entries(window_type* const map_storage, device_span chunks, rmm::cuda_stream_view stream) { @@ -323,7 +336,7 @@ void collect_map_entries(window_type* map_storage, <<>>(map_storage, chunks); } -void get_dictionary_indices(window_type* map_storage, +void get_dictionary_indices(window_type* const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 66b86140afc..bfaab7cfcd2 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -29,7 +29,7 @@ namespace cudf::io::parquet::detail { using key_type = size_type; using mapped_type = size_type; -auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 1 thread to handle each subset +auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 1 thread to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread auto constexpr KEY_SENTINEL = key_type{-1}; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index b26735d27b7..b2eb20708b1 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -559,7 +559,8 @@ struct EncColumnChunk { uint8_t is_compressed; //!< Nonzero if the chunk uses compression uint32_t dictionary_size; //!< Size of dictionary page including header uint32_t ck_stat_size; //!< Size of chunk-level statistics (included in 1st page header) - uint32_t dict_map_offset; //!< Offset of the hash map storage for calculating dict encoding for this chunk + uint32_t dict_map_offset; //!< Offset of the hash map storage for calculating dict encoding for + //!< this chunk size_type dict_map_size; //!< Size of dict_map_slots size_type num_dict_entries; //!< Total number of entries in dictionary size_type From a14007d5b0c58e0db230e3de6f83965a7bd7c044 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 14 Aug 2024 03:30:45 +0000 Subject: [PATCH 11/30] Updated insert with tiles --- cpp/src/io/parquet/chunk_dict.cu | 121 +++++++++++++++-------------- cpp/src/io/parquet/parquet_gpu.cuh | 79 +++++++++---------- 2 files changed, 104 insertions(+), 96 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 82478c01e2d..c3998c36079 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -39,7 +39,7 @@ struct equality_functor { column_device_view const& col; __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const { - // We don't call this for nulls so this is fine + // We don't call this for nulls so this is fine. auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; return equal(col.element(lhs_idx), col.element(rhs_idx)); } @@ -71,7 +71,7 @@ struct map_insert_fn { auto hash_fn = hash_fn_type{col}; auto equal_fn = equality_fn_type{col}; - // Make a view of the hash map + // Make a view of the hash map. cuco::static_map_refdict_map_size, map_storage + chunk->dict_map_offset}; __shared__ size_type total_num_dict_entries; - thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); - while (val_idx - ntiles < end_value_idx) { - auto const is_valid = - val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); + for (thread_index_type val_idx = s_start_value_idx + block.thread_rank(); + val_idx - block_size < end_value_idx; + val_idx += block_size) { + // Compute the index to the start of the tile. + auto const tile_val_idx = + val_idx - block.thread_rank() + (tile.meta_group_rank() * tile.num_threads()); - // insert element at val_idx to hash map and count successful insertions size_type is_unique = 0; size_type uniq_elem_size = 0; - if (is_valid) { - // Insert the element to the map - is_unique = - type_dispatcher(data_col.type(), map_insert_fn{storage_ref}, data_col, tile, val_idx); - // First thread in the tile compute the size of the element if unique - if (tile.thread_rank() == 0) { - uniq_elem_size = [&]() -> size_type { - if (not is_unique) { return 0; } - switch (col->physical_type) { - case Type::INT32: return 4; - case Type::INT64: return 8; - case Type::INT96: return 12; - case Type::FLOAT: return 4; - case Type::DOUBLE: return 8; - case Type::BYTE_ARRAY: { - auto const col_type = data_col.type().id(); - if (col_type == type_id::STRING) { - // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(val_idx).size_bytes(); - } else if (col_type == type_id::LIST) { - // Binary is stored as 4 byte length + bytes - return 4 + get_element(data_col, val_idx).size_bytes(); + + // Insert all elements within each tile. + for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) { + // Compute the index to the element being inserted within the tile. + auto const tile_val_idx_plus_offset = tile_val_idx + tile_offset; + // Insert element at val_idx to hash map and count successful insertions. + auto const is_valid = tile_val_idx_plus_offset < end_value_idx and + tile_val_idx_plus_offset < data_col.size() and + data_col.is_valid(tile_val_idx_plus_offset); + if (is_valid) { + // Insert the element to the map using the entire tile. + auto const tile_is_unique = type_dispatcher( + data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx_plus_offset); + // Only the tile_offset thread rank computes the size of the element if unique. + if (tile.thread_rank() == tile_offset) { + is_unique = tile_is_unique; + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (col->physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: { + auto const col_type = data_col.type().id(); + if (col_type == type_id::STRING) { + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(val_idx).size_bytes(); + } else if (col_type == type_id::LIST) { + // Binary is stored as 4 byte length + bytes + return 4 + + get_element(data_col, val_idx).size_bytes(); + } + CUDF_UNREACHABLE( + "Byte array only supports string and list column types for dictionary " + "encoding!"); } - CUDF_UNREACHABLE( - "Byte array only supports string and list column types for dictionary " - "encoding!"); + case Type::FIXED_LEN_BYTE_ARRAY: + if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } + CUDF_UNREACHABLE( + "Fixed length byte array only supports decimal 128 column types for dictionary " + "encoding!"); + default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); } - case Type::FIXED_LEN_BYTE_ARRAY: - if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } - CUDF_UNREACHABLE( - "Fixed length byte array only supports decimal 128 column types for dictionary " - "encoding!"); - default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); - } - }(); - } else { - // All threads except the first thread in the tile must reset is_unique to zero - is_unique = 0; + }(); + } } } + // All elements in all tiles inserted, reduce num_unique and uniq_data_size across the block. auto num_unique = block_reduce(reduce_storage).Sum(is_unique); block.sync(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); - // The first thread in the block atomically updates total num dict entries and unique elements - // data size + // The first thread in the block atomically updates total num_unique and uniq_data_size if (block.thread_rank() == 0) { total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; @@ -238,9 +248,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Check if the num unique values in chunk has already exceeded max dict size and early exit if (total_num_dict_entries > MAX_DICT_SIZE) { return; } - - val_idx += ntiles; - } // while + } // for loop } template @@ -302,8 +310,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) column_device_view const& data_col = *col->leaf_column; storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset}; - thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); - while (val_idx < end_value_idx) { + for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); + val_idx < end_value_idx; + val_idx += ntiles) { if (data_col.is_valid(val_idx)) { auto [found_key, found_value] = type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, tile, val_idx); @@ -313,8 +322,6 @@ CUDF_KERNEL void __launch_bounds__(block_size) chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value; } } - - val_idx += ntiles; } } diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index bfaab7cfcd2..5bfb531a1c0 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -29,7 +29,7 @@ namespace cudf::io::parquet::detail { using key_type = size_type; using mapped_type = size_type; -auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 1 thread to handle each subset +auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 4 thread to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread auto constexpr KEY_SENTINEL = key_type{-1}; @@ -41,44 +41,6 @@ using slot_type = cuco::pair; using storage_type = cuco::aow_storage; using storage_ref_type = typename storage_type::ref_type; using window_type = typename storage_type::window_type; -/** - * @brief Insert chunk values into their respective hash maps - * - * @param map_storage Pointer to the bulk hashmap storage - * @param frags Column fragments - * @param stream CUDA stream to use - */ -void populate_chunk_hash_maps(window_type* map_storage, - cudf::detail::device_2dspan frags, - rmm::cuda_stream_view stream); - -/** - * @brief Compact dictionary hash map entries into chunk.dict_data - * - * @param map_storage Pointer to the bulk hashmap storage - * @param chunks Flat span of chunks to compact hash maps for - * @param stream CUDA stream to use - */ -void collect_map_entries(window_type* map_storage, - device_span chunks, - rmm::cuda_stream_view stream); - -/** - * @brief Get the Dictionary Indices for each row - * - * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise - * stored in input column [row]. Stores these indices into chunk.dict_index. - * - * Since dict_data itself contains indices into the original cudf column, this means that - * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] - * - * @param map_storage Pointer to the bulk hashmap storage - * @param frags Column fragments - * @param stream CUDA stream to use - */ -void get_dictionary_indices(window_type* map_storage, - cudf::detail::device_2dspan frags, - rmm::cuda_stream_view stream); /** * @brief Return the byte length of parquet dtypes that are physically represented by INT32 @@ -123,4 +85,43 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } +/** + * @brief Insert chunk values into their respective hash maps + * + * @param map_storage Pointer to the bulk hashmap storage + * @param frags Column fragments + * @param stream CUDA stream to use + */ +void populate_chunk_hash_maps(window_type* map_storage, + cudf::detail::device_2dspan frags, + rmm::cuda_stream_view stream); + +/** + * @brief Compact dictionary hash map entries into chunk.dict_data + * + * @param map_storage Pointer to the bulk hashmap storage + * @param chunks Flat span of chunks to compact hash maps for + * @param stream CUDA stream to use + */ +void collect_map_entries(window_type* map_storage, + device_span chunks, + rmm::cuda_stream_view stream); + +/** + * @brief Get the Dictionary Indices for each row + * + * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise + * stored in input column [row]. Stores these indices into chunk.dict_index. + * + * Since dict_data itself contains indices into the original cudf column, this means that + * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] + * + * @param map_storage Pointer to the bulk hashmap storage + * @param frags Column fragments + * @param stream CUDA stream to use + */ +void get_dictionary_indices(window_type* map_storage, + cudf::detail::device_2dspan frags, + rmm::cuda_stream_view stream); + } // namespace cudf::io::parquet::detail From 4935c661f185677e0ed386dd5f15193f91537bcc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 14 Aug 2024 05:15:22 +0000 Subject: [PATCH 12/30] Cosmetic updates --- cpp/src/io/parquet/chunk_dict.cu | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index c3998c36079..e6b31fce764 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -176,11 +176,12 @@ CUDF_KERNEL void __launch_bounds__(block_size) __shared__ size_type total_num_dict_entries; + // Insert all column chunk elements to the hash map to build the dict. for (thread_index_type val_idx = s_start_value_idx + block.thread_rank(); val_idx - block_size < end_value_idx; val_idx += block_size) { // Compute the index to the start of the tile. - auto const tile_val_idx = + auto const val_idx_base = val_idx - block.thread_rank() + (tile.meta_group_rank() * tile.num_threads()); size_type is_unique = 0; @@ -189,16 +190,19 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Insert all elements within each tile. for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) { // Compute the index to the element being inserted within the tile. - auto const tile_val_idx_plus_offset = tile_val_idx + tile_offset; - // Insert element at val_idx to hash map and count successful insertions. - auto const is_valid = tile_val_idx_plus_offset < end_value_idx and - tile_val_idx_plus_offset < data_col.size() and - data_col.is_valid(tile_val_idx_plus_offset); + auto const tile_val_idx = val_idx_base + tile_offset; + + // Check if this index is valid. + auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and + data_col.is_valid(tile_val_idx); + + // Insert tile_val_idx to hash map and count successful insertions. if (is_valid) { // Insert the element to the map using the entire tile. auto const tile_is_unique = type_dispatcher( - data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx_plus_offset); - // Only the tile_offset thread rank computes the size of the element if unique. + data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx); + + // tile_offset'th thread updates its number and size of unique element. if (tile.thread_rank() == tile_offset) { is_unique = tile_is_unique; uniq_elem_size = [&]() -> size_type { @@ -213,11 +217,11 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto const col_type = data_col.type().id(); if (col_type == type_id::STRING) { // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(val_idx).size_bytes(); + return 4 + data_col.element(tile_val_idx).size_bytes(); } else if (col_type == type_id::LIST) { // Binary is stored as 4 byte length + bytes - return 4 + - get_element(data_col, val_idx).size_bytes(); + return 4 + get_element(data_col, tile_val_idx) + .size_bytes(); } CUDF_UNREACHABLE( "Byte array only supports string and list column types for dictionary " From 63dab9de55400594a3a02c518877fd4da8bebb18 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 14 Aug 2024 18:13:19 +0000 Subject: [PATCH 13/30] Minor improvements --- cpp/src/io/parquet/chunk_dict.cu | 17 +++++++---------- cpp/src/io/parquet/parquet_gpu.cuh | 3 +-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index e6b31fce764..f92d0eb8337 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -160,9 +160,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage reduce_storage; - auto const block = cg::this_thread_block(); - auto const tile = cg::tiled_partition(block); - auto const ntiles = tile.meta_group_size(); + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; @@ -177,20 +176,18 @@ CUDF_KERNEL void __launch_bounds__(block_size) __shared__ size_type total_num_dict_entries; // Insert all column chunk elements to the hash map to build the dict. - for (thread_index_type val_idx = s_start_value_idx + block.thread_rank(); - val_idx - block_size < end_value_idx; + for (thread_index_type val_idx = s_start_value_idx; + val_idx + block.thread_rank() - block_size < end_value_idx; val_idx += block_size) { // Compute the index to the start of the tile. - auto const val_idx_base = - val_idx - block.thread_rank() + (tile.meta_group_rank() * tile.num_threads()); - size_type is_unique = 0; size_type uniq_elem_size = 0; // Insert all elements within each tile. for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) { // Compute the index to the element being inserted within the tile. - auto const tile_val_idx = val_idx_base + tile_offset; + auto const tile_val_idx = + val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads()); // Check if this index is valid. auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and @@ -202,7 +199,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto const tile_is_unique = type_dispatcher( data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx); - // tile_offset'th thread updates its number and size of unique element. + // tile_offset'th thread in the tile updates its number and size of unique element. if (tile.thread_rank() == tile_offset) { is_unique = tile_is_unique; uniq_elem_size = [&]() -> size_type { diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 5bfb531a1c0..95d4471fac9 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -28,6 +28,7 @@ namespace cudf::io::parquet::detail { using key_type = size_type; using mapped_type = size_type; +using slot_type = cuco::pair; auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 4 thread to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread @@ -36,8 +37,6 @@ auto constexpr KEY_SENTINEL = key_type{-1}; auto constexpr VALUE_SENTINEL = mapped_type{-1}; auto constexpr SCOPE = cuda::thread_scope_block; -using slot_type = cuco::pair; - using storage_type = cuco::aow_storage; using storage_ref_type = typename storage_type::ref_type; using window_type = typename storage_type::window_type; From 40a2f395ecb338ec1c36772dec6c1adfe321fb8d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 14 Aug 2024 18:51:08 +0000 Subject: [PATCH 14/30] Use span instead of raw pointers. --- cpp/src/io/parquet/chunk_dict.cu | 21 ++++++++++++--------- cpp/src/io/parquet/parquet_gpu.cuh | 14 +++++++------- cpp/src/io/parquet/writer_impl.cu | 8 +++++--- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index f92d0eb8337..62ba593f20f 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -146,7 +146,7 @@ struct map_find_fn { template CUDF_KERNEL void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(window_type* const map_storage, + populate_chunk_hash_maps_kernel(device_span const map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -171,7 +171,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) size_type const end_value_idx = row_to_value_idx(end_row, *col); column_device_view const& data_col = *col->leaf_column; - storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset}; + storage_ref_type const storage_ref{chunk->dict_map_size, + map_storage.data() + chunk->dict_map_offset}; __shared__ size_type total_num_dict_entries; @@ -254,7 +255,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - collect_map_entries_kernel(window_type* const map_storage, device_span chunks) + collect_map_entries_kernel(device_span const map_storage, + device_span chunks) { auto& chunk = chunks[blockIdx.x]; if (not chunk.use_dictionary) { return; } @@ -268,7 +270,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { if (t + i < chunk.dict_map_size) { - auto* slot = map_storage + chunk.dict_map_offset + t + i; + auto* slot = map_storage.data() + chunk.dict_map_offset + t + i; auto const key = slot->data()->first; if (key != KEY_SENTINEL) { auto loc = counter.fetch_add(1, memory_order_relaxed); @@ -285,7 +287,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - get_dictionary_indices_kernel(window_type* const map_storage, + get_dictionary_indices_kernel(device_span const map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -309,7 +311,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto const end_value_idx = row_to_value_idx(end_row, *col); column_device_view const& data_col = *col->leaf_column; - storage_ref_type const storage_ref{chunk->dict_map_size, map_storage + chunk->dict_map_offset}; + storage_ref_type const storage_ref{chunk->dict_map_size, + map_storage.data() + chunk->dict_map_offset}; for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); val_idx < end_value_idx; @@ -326,7 +329,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) } } -void populate_chunk_hash_maps(window_type* const map_storage, +void populate_chunk_hash_maps(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { @@ -335,7 +338,7 @@ void populate_chunk_hash_maps(window_type* const map_storage, <<>>(map_storage, frags); } -void collect_map_entries(window_type* const map_storage, +void collect_map_entries(device_span const map_storage, device_span chunks, rmm::cuda_stream_view stream) { @@ -344,7 +347,7 @@ void collect_map_entries(window_type* const map_storage, <<>>(map_storage, chunks); } -void get_dictionary_indices(window_type* const map_storage, +void get_dictionary_indices(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 95d4471fac9..34dc74d1880 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -30,7 +30,7 @@ using key_type = size_type; using mapped_type = size_type; using slot_type = cuco::pair; -auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 4 thread to handle each subset +auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 2 threads to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread auto constexpr KEY_SENTINEL = key_type{-1}; @@ -87,22 +87,22 @@ inline size_type __device__ row_to_value_idx(size_type idx, /** * @brief Insert chunk values into their respective hash maps * - * @param map_storage Pointer to the bulk hashmap storage + * @param map_storage Bulk hashmap storage * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(window_type* map_storage, +void populate_chunk_hash_maps(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** * @brief Compact dictionary hash map entries into chunk.dict_data * - * @param map_storage Pointer to the bulk hashmap storage + * @param map_storage Bulk hashmap storage * @param chunks Flat span of chunks to compact hash maps for * @param stream CUDA stream to use */ -void collect_map_entries(window_type* map_storage, +void collect_map_entries(device_span const map_storage, device_span chunks, rmm::cuda_stream_view stream); @@ -115,11 +115,11 @@ void collect_map_entries(window_type* map_storage, * Since dict_data itself contains indices into the original cudf column, this means that * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] * - * @param map_storage Pointer to the bulk hashmap storage + * @param map_storage Bulk hashmap storage * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(window_type* map_storage, +void get_dictionary_indices(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index e319a39e464..f026cd047e8 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1330,6 +1330,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, // Initialize storage with the given sentinel iff non-zero size map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL}, cuda::stream_ref{stream.value()}); + // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer. + device_span const map_storage_data{map_storage.data(), total_map_storage_size}; // Populate chunk dictionary offsets std::for_each( @@ -1343,7 +1345,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, // Synchronize chunks.host_to_device_async(stream); // Populate the hash map for each chunk - populate_chunk_hash_maps(map_storage.data(), frags, stream); + populate_chunk_hash_maps(map_storage_data, frags, stream); // Synchronize again chunks.device_to_host_sync(stream); @@ -1404,8 +1406,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - collect_map_entries(map_storage.data(), chunks.device_view().flat_view(), stream); - get_dictionary_indices(map_storage.data(), frags, stream); + collect_map_entries(map_storage_data, chunks.device_view().flat_view(), stream); + get_dictionary_indices(map_storage_data, frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } From 8f2e65091b23871e0ace6b0613e3112487f463dc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 14 Aug 2024 23:05:43 +0000 Subject: [PATCH 15/30] Minor improvements --- cpp/src/io/parquet/parquet_gpu.cuh | 2 +- cpp/src/io/parquet/writer_impl.cu | 42 +++++++++--------------------- 2 files changed, 13 insertions(+), 31 deletions(-) diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 34dc74d1880..0f3cfe1343b 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -30,7 +30,7 @@ using key_type = size_type; using mapped_type = size_type; using slot_type = cuco::pair; -auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 2 threads to handle each subset +auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 2 threads to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread auto constexpr KEY_SENTINEL = key_type{-1}; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f026cd047e8..e9efc8f7dde 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1285,11 +1285,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, return std::pair(std::move(dict_data), std::move(dict_index)); } - // Create a vector to store valid chunk sizes using cuco::make_window_extent - std::vector valid_chunk_sizes; - valid_chunk_sizes.reserve(h_chunks.size()); - - // Populate valid_chunk_sizes for chunks that need to build a dictionary. + // Variable to keep track of the current offset + uint32_t curr_offset = 0; + // Populate dict offsets and sizes for each chunk that need to build a dictionary. std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) { auto const& chunk_col_desc = col_desc[chunk.col_desc_id]; auto const is_requested_non_dict = @@ -1300,28 +1298,21 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, if (is_type_non_dict || is_requested_non_dict) { chunk.use_dictionary = false; - // Emplace a zero for 1-1 mapping between h_chunks and valid_chunk_sizes - valid_chunk_sizes.emplace_back(static_cast(0)); } else { chunk.use_dictionary = true; - valid_chunk_sizes.emplace_back( - static_cast(cuco::make_window_extent( - // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70% - // occupancy factor. - static_cast(chunk.num_values * 1.43)))); - chunk.dict_map_size = valid_chunk_sizes.back(); + // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70% + // occupancy factor. + chunk.dict_map_size = + static_cast(cuco::make_window_extent( + static_cast(1.43 * chunk.num_values))); + chunk.dict_map_offset = curr_offset; + curr_offset += chunk.dict_map_size; } }); - // Create a vector to map offsets from chunk sizes - std::vector map_offsets(valid_chunk_sizes.size(), 0); - std::exclusive_scan(valid_chunk_sizes.begin(), - valid_chunk_sizes.end(), - map_offsets.begin(), - static_cast(0)); - // Compute total map storage - auto const total_map_storage_size = map_offsets.back() + valid_chunk_sizes.back(); + auto const total_map_storage_size = static_cast(curr_offset); + // No chunk needs to create a dictionary, exit early if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; } @@ -1333,15 +1324,6 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer. device_span const map_storage_data{map_storage.data(), total_map_storage_size}; - // Populate chunk dictionary offsets - std::for_each( - thrust::make_zip_iterator(thrust::make_tuple(h_chunks.begin(), map_offsets.begin())), - thrust::make_zip_iterator(thrust::make_tuple(h_chunks.end(), map_offsets.end())), - [&](auto elem) -> void { - auto& chunk = thrust::get<0>(elem); - if (chunk.use_dictionary) { chunk.dict_map_offset = thrust::get<1>(elem); } - }); - // Synchronize chunks.host_to_device_async(stream); // Populate the hash map for each chunk From 7bcca4529373808b55c55d1c3c914951dcf05286 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 14 Aug 2024 23:30:24 +0000 Subject: [PATCH 16/30] Minor improvements --- cpp/src/io/parquet/chunk_dict.cu | 45 +++++++++++------------------- cpp/src/io/parquet/parquet_gpu.cuh | 2 +- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 62ba593f20f..6aa434eb59d 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -56,39 +56,32 @@ struct hash_functor { struct map_insert_fn { storage_ref_type const& storage_ref; - + column_device_view const& col; template - __device__ bool operator()(column_device_view const& col, - cg::thread_block_tile const& tile, - key_type i) + __device__ bool operator()(cg::thread_block_tile const& tile, key_type i) { if constexpr (column_device_view::has_element_accessor()) { using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; using probing_scheme_type = cuco::linear_probing; - // Instantiate hash and equality functors. - auto hash_fn = hash_fn_type{col}; - auto equal_fn = equality_fn_type{col}; - // Make a view of the hash map. cuco::static_map_ref + storage_ref_type, + cuco::op::insert_tag> hash_map_ref{cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}, - {equal_fn}, - {hash_fn}, + equality_fn_type{col}, + hash_fn_type{col}, {}, storage_ref}; - // Create another map ref with the insert operator. - auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); // Insert into the hash map using the provided thread tile. - return map_insert_ref.insert(tile, cuco::pair{i, i}); + return hash_map_ref.insert(tile, cuco::pair{i, i}); } else { CUDF_UNREACHABLE("Unsupported type to insert in map"); } @@ -97,39 +90,33 @@ struct map_insert_fn { struct map_find_fn { storage_ref_type const& storage_ref; - + column_device_view const& col; template __device__ cuco::pair operator()( - column_device_view const& col, cg::thread_block_tile const& tile, key_type i) + cg::thread_block_tile const& tile, key_type i) { if constexpr (column_device_view::has_element_accessor()) { using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; using probing_scheme_type = cuco::linear_probing; - // Instantiate hash and equality functors. - auto hash_fn = hash_fn_type{col}; - auto equal_fn = equality_fn_type{col}; - // Make a view of the hash map cuco::static_map_ref + storage_ref_type, + cuco::op::find_tag> hash_map_ref{cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}, - {equal_fn}, - {hash_fn}, + equality_fn_type{col}, + hash_fn_type{col}, {}, storage_ref}; - // Create another map with find operator. - auto map_find_ref = hash_map_ref.with_operators(cuco::find); - // Find the key = i using the provided thread tile. - auto found_slot = map_find_ref.find(tile, i); + auto found_slot = hash_map_ref.find(tile, i); // Check if didn't find the previously inserted key. if (tile.thread_rank() == 0) { @@ -198,7 +185,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (is_valid) { // Insert the element to the map using the entire tile. auto const tile_is_unique = type_dispatcher( - data_col.type(), map_insert_fn{storage_ref}, data_col, tile, tile_val_idx); + data_col.type(), map_insert_fn{storage_ref, data_col}, tile, tile_val_idx); // tile_offset'th thread in the tile updates its number and size of unique element. if (tile.thread_rank() == tile_offset) { @@ -319,7 +306,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) val_idx += ntiles) { if (data_col.is_valid(val_idx)) { auto [found_key, found_value] = - type_dispatcher(data_col.type(), map_find_fn{storage_ref}, data_col, tile, val_idx); + type_dispatcher(data_col.type(), map_find_fn{storage_ref, data_col}, tile, val_idx); // First thread in the tile updates the dict_index if (tile.thread_rank() == 0) { // No need for atomic as this is not going to be modified by any other thread diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 0f3cfe1343b..34dc74d1880 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -30,7 +30,7 @@ using key_type = size_type; using mapped_type = size_type; using slot_type = cuco::pair; -auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 2 threads to handle each subset +auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 2 threads to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread auto constexpr KEY_SENTINEL = key_type{-1}; From b0e482ed0657cd840c57dbebc741944b6a0ce19f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 15 Aug 2024 02:12:43 +0000 Subject: [PATCH 17/30] Perf improvements --- cpp/src/io/parquet/chunk_dict.cu | 125 ++++++++++++++----------------- 1 file changed, 56 insertions(+), 69 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 6aa434eb59d..20773ab2e27 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -164,66 +164,57 @@ CUDF_KERNEL void __launch_bounds__(block_size) __shared__ size_type total_num_dict_entries; // Insert all column chunk elements to the hash map to build the dict. - for (thread_index_type val_idx = s_start_value_idx; - val_idx + block.thread_rank() - block_size < end_value_idx; - val_idx += block_size) { - // Compute the index to the start of the tile. + for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); + val_idx - tile.meta_group_size() < end_value_idx; + val_idx += tile.meta_group_size()) { size_type is_unique = 0; size_type uniq_elem_size = 0; - // Insert all elements within each tile. - for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) { - // Compute the index to the element being inserted within the tile. - auto const tile_val_idx = - val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads()); - - // Check if this index is valid. - auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and - data_col.is_valid(tile_val_idx); - - // Insert tile_val_idx to hash map and count successful insertions. - if (is_valid) { - // Insert the element to the map using the entire tile. - auto const tile_is_unique = type_dispatcher( - data_col.type(), map_insert_fn{storage_ref, data_col}, tile, tile_val_idx); - - // tile_offset'th thread in the tile updates its number and size of unique element. - if (tile.thread_rank() == tile_offset) { - is_unique = tile_is_unique; - uniq_elem_size = [&]() -> size_type { - if (not is_unique) { return 0; } - switch (col->physical_type) { - case Type::INT32: return 4; - case Type::INT64: return 8; - case Type::INT96: return 12; - case Type::FLOAT: return 4; - case Type::DOUBLE: return 8; - case Type::BYTE_ARRAY: { - auto const col_type = data_col.type().id(); - if (col_type == type_id::STRING) { - // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(tile_val_idx).size_bytes(); - } else if (col_type == type_id::LIST) { - // Binary is stored as 4 byte length + bytes - return 4 + get_element(data_col, tile_val_idx) - .size_bytes(); - } - CUDF_UNREACHABLE( - "Byte array only supports string and list column types for dictionary " - "encoding!"); + // Check if this index is valid. + auto const is_valid = + val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); + + // Insert tile_val_idx to hash map and count successful insertions. + if (is_valid) { + // Insert the element to the map using the entire tile. + auto const tile_is_unique = + type_dispatcher(data_col.type(), map_insert_fn{storage_ref, data_col}, tile, val_idx); + + // First thread in the tile updates its number and size of unique element. + if (tile.thread_rank() == 0) { + is_unique = tile_is_unique; + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (col->physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: { + auto const col_type = data_col.type().id(); + if (col_type == type_id::STRING) { + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(val_idx).size_bytes(); + } else if (col_type == type_id::LIST) { + // Binary is stored as 4 byte length + bytes + return 4 + get_element(data_col, val_idx).size_bytes(); } - case Type::FIXED_LEN_BYTE_ARRAY: - if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } - CUDF_UNREACHABLE( - "Fixed length byte array only supports decimal 128 column types for dictionary " - "encoding!"); - default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); + CUDF_UNREACHABLE( + "Byte array only supports string and list column types for dictionary " + "encoding!"); } - }(); - } + case Type::FIXED_LEN_BYTE_ARRAY: + if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } + CUDF_UNREACHABLE( + "Fixed length byte array only supports decimal 128 column types for dictionary " + "encoding!"); + default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); + } + }(); } } - // All elements in all tiles inserted, reduce num_unique and uniq_data_size across the block. + // Reduce num_unique and uniq_data_size from all tiles. auto num_unique = block_reduce(reduce_storage).Sum(is_unique); block.sync(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); @@ -248,26 +239,22 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto& chunk = chunks[blockIdx.x]; if (not chunk.use_dictionary) { return; } - auto const t = threadIdx.x; - __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; - if (t == 0) { new (&counter) cuda::atomic{0}; } + if (threadIdx.x == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); - for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { - if (t + i < chunk.dict_map_size) { - auto* slot = map_storage.data() + chunk.dict_map_offset + t + i; - auto const key = slot->data()->first; - if (key != KEY_SENTINEL) { - auto loc = counter.fetch_add(1, memory_order_relaxed); - cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); - chunk.dict_data[loc] = key; - // If sorting dict page ever becomes a hard requirement, enable the following statement and - // add a dict sorting step before storing into the slot's second field. - // chunk.dict_data_idx[loc] = t + i; - slot->data()->second = loc; - } + for (size_type idx = threadIdx.x; idx < chunk.dict_map_size; idx += block_size) { + auto* slot = map_storage.data() + chunk.dict_map_offset + idx; + auto const key = slot->data()->first; + if (key != KEY_SENTINEL) { + auto loc = counter.fetch_add(1, memory_order_relaxed); + cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); + chunk.dict_data[loc] = key; + // If sorting dict page ever becomes a hard requirement, enable the following statement and + // add a dict sorting step before storing into the slot's second field. + // chunk.dict_data_idx[loc] = idx; + slot->data()->second = loc; } } } From 4f51253e57aaf1eed02d119004defae3da89f197 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 15 Aug 2024 03:01:59 +0000 Subject: [PATCH 18/30] Cosmetic improvements --- cpp/src/io/parquet/chunk_dict.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 20773ab2e27..46f7acb2a37 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -86,7 +86,7 @@ struct map_insert_fn { CUDF_UNREACHABLE("Unsupported type to insert in map"); } } -}; // namespace cudf::io::parquet::detail +}; struct map_find_fn { storage_ref_type const& storage_ref; @@ -116,13 +116,12 @@ struct map_find_fn { storage_ref}; // Find the key = i using the provided thread tile. - auto found_slot = hash_map_ref.find(tile, i); + auto const found_slot = hash_map_ref.find(tile, i); // Check if didn't find the previously inserted key. - if (tile.thread_rank() == 0) { - cudf_assert(found_slot != map_find_ref.end() && - "Unable to find value in map in dictionary index construction"); - } + cudf_assert(found_slot != hash_map_ref.end() && + "Unable to find value in map in dictionary index construction"); + // Return a pair of the found key and value. return {found_slot->first, found_slot->second}; } else { From 16fa57ead03d4f3dd5d716e8627f17b8337806b6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 15 Aug 2024 03:10:01 +0000 Subject: [PATCH 19/30] Change cg_size to 1 for best perf so far --- cpp/src/io/parquet/parquet_gpu.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 34dc74d1880..0f3cfe1343b 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -30,7 +30,7 @@ using key_type = size_type; using mapped_type = size_type; using slot_type = cuco::pair; -auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 2 threads to handle each subset +auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 2 threads to handle each subset auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread auto constexpr KEY_SENTINEL = key_type{-1}; From e394a7108caec5be54323c6623336353bddae38a Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 15 Aug 2024 19:01:12 +0000 Subject: [PATCH 20/30] Incorporate for window_size --- cpp/src/io/parquet/chunk_dict.cu | 22 ++++++++++++---------- cpp/src/io/parquet/parquet_gpu.cuh | 4 ++-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 46f7acb2a37..70ddb8a230f 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -244,16 +244,18 @@ CUDF_KERNEL void __launch_bounds__(block_size) __syncthreads(); for (size_type idx = threadIdx.x; idx < chunk.dict_map_size; idx += block_size) { - auto* slot = map_storage.data() + chunk.dict_map_offset + idx; - auto const key = slot->data()->first; - if (key != KEY_SENTINEL) { - auto loc = counter.fetch_add(1, memory_order_relaxed); - cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); - chunk.dict_data[loc] = key; - // If sorting dict page ever becomes a hard requirement, enable the following statement and - // add a dict sorting step before storing into the slot's second field. - // chunk.dict_data_idx[loc] = idx; - slot->data()->second = loc; + auto* window = map_storage.data() + chunk.dict_map_offset + idx; + for (auto& slot : *window) { + auto const key = slot.first; + if (key != KEY_SENTINEL) { + auto loc = counter.fetch_add(1, memory_order_relaxed); + cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); + chunk.dict_data[loc] = key; + // If sorting dict page ever becomes a hard requirement, enable the following statement and + // add a dict sorting step before storing into the slot's second field. + // chunk.dict_data_idx[loc] = idx; + slot.second = loc; + } } } } diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 0f3cfe1343b..1904b98a87f 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -30,8 +30,8 @@ using key_type = size_type; using mapped_type = size_type; using slot_type = cuco::pair; -auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 2 threads to handle each subset -auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread +auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 2 threads to handle each subset +auto constexpr window_size = 2; ///< Number of concurrent slots handled by each thread auto constexpr KEY_SENTINEL = key_type{-1}; auto constexpr VALUE_SENTINEL = mapped_type{-1}; From 3e6a0b7e6551ea3ccface05847b01a52ecd8fffd Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 15 Aug 2024 23:41:21 +0000 Subject: [PATCH 21/30] Use double hashing --- cpp/src/io/parquet/chunk_dict.cu | 117 ++++++++++++++++------------- cpp/src/io/parquet/parquet_gpu.cuh | 6 +- cpp/src/io/parquet/writer_impl.cu | 9 +-- 3 files changed, 70 insertions(+), 62 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 70ddb8a230f..72592399575 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -48,9 +48,10 @@ struct equality_functor { template struct hash_functor { column_device_view const& col; + uint32_t seed = 0; __device__ auto operator()(key_type idx) const { - return cudf::hashing::detail::MurmurHash3_x86_32{}(col.element(idx)); + return cudf::hashing::detail::MurmurHash3_x86_32{seed}(col.element(idx)); } }; @@ -58,12 +59,12 @@ struct map_insert_fn { storage_ref_type const& storage_ref; column_device_view const& col; template - __device__ bool operator()(cg::thread_block_tile const& tile, key_type i) + __device__ bool operator()(key_type i, cg::thread_block_tile const& tile) { if constexpr (column_device_view::has_element_accessor()) { using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; - using probing_scheme_type = cuco::linear_probing; + using probing_scheme_type = cuco::double_hashing; // Make a view of the hash map. cuco::static_map_ref hash_map_ref{cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}, - equality_fn_type{col}, - hash_fn_type{col}, + {col}, + {{col}, {col, 1}}, {}, storage_ref}; @@ -93,14 +94,14 @@ struct map_find_fn { column_device_view const& col; template __device__ cuco::pair operator()( - cg::thread_block_tile const& tile, key_type i) + key_type i, cg::thread_block_tile const& tile) { if constexpr (column_device_view::has_element_accessor()) { using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; - using probing_scheme_type = cuco::linear_probing; + using probing_scheme_type = cuco::double_hashing; - // Make a view of the hash map + // Make a view of the hash map. cuco::static_map_ref hash_map_ref{cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}, - equality_fn_type{col}, - hash_fn_type{col}, + {col}, + {{col}, {col, 1}}, {}, storage_ref}; @@ -163,54 +164,62 @@ CUDF_KERNEL void __launch_bounds__(block_size) __shared__ size_type total_num_dict_entries; // Insert all column chunk elements to the hash map to build the dict. - for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); - val_idx - tile.meta_group_size() < end_value_idx; - val_idx += tile.meta_group_size()) { + for (thread_index_type val_idx = s_start_value_idx; + val_idx + block.thread_rank() - block_size < end_value_idx; + val_idx += block_size) { size_type is_unique = 0; size_type uniq_elem_size = 0; - // Check if this index is valid. - auto const is_valid = - val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); - - // Insert tile_val_idx to hash map and count successful insertions. - if (is_valid) { - // Insert the element to the map using the entire tile. - auto const tile_is_unique = - type_dispatcher(data_col.type(), map_insert_fn{storage_ref, data_col}, tile, val_idx); - - // First thread in the tile updates its number and size of unique element. - if (tile.thread_rank() == 0) { - is_unique = tile_is_unique; - uniq_elem_size = [&]() -> size_type { - if (not is_unique) { return 0; } - switch (col->physical_type) { - case Type::INT32: return 4; - case Type::INT64: return 8; - case Type::INT96: return 12; - case Type::FLOAT: return 4; - case Type::DOUBLE: return 8; - case Type::BYTE_ARRAY: { - auto const col_type = data_col.type().id(); - if (col_type == type_id::STRING) { - // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(val_idx).size_bytes(); - } else if (col_type == type_id::LIST) { - // Binary is stored as 4 byte length + bytes - return 4 + get_element(data_col, val_idx).size_bytes(); + // Each tile inserts a portion of the elements to the hash map. + for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) { + // Compute the index of the value currently being inserted by this tile. + auto const tile_val_idx = + val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads()); + + // Check if this index is valid. + auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and + data_col.is_valid(tile_val_idx); + + // Insert tile_val_idx to hash map and count successful insertions. + if (is_valid) { + // Insert the element to the map using the entire tile. + auto const tile_is_unique = type_dispatcher( + data_col.type(), map_insert_fn{storage_ref, data_col}, tile_val_idx, tile); + + // tile_offset'th thread in the tile updates its number and size of unique element. + if (tile.thread_rank() == tile_offset) { + is_unique = tile_is_unique; + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (col->physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: { + auto const col_type = data_col.type().id(); + if (col_type == type_id::STRING) { + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(tile_val_idx).size_bytes(); + } else if (col_type == type_id::LIST) { + // Binary is stored as 4 byte length + bytes + return 4 + get_element(data_col, tile_val_idx) + .size_bytes(); + } + CUDF_UNREACHABLE( + "Byte array only supports string and list column types for dictionary " + "encoding!"); } - CUDF_UNREACHABLE( - "Byte array only supports string and list column types for dictionary " - "encoding!"); + case Type::FIXED_LEN_BYTE_ARRAY: + if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } + CUDF_UNREACHABLE( + "Fixed length byte array only supports decimal 128 column types for dictionary " + "encoding!"); + default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); } - case Type::FIXED_LEN_BYTE_ARRAY: - if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } - CUDF_UNREACHABLE( - "Fixed length byte array only supports decimal 128 column types for dictionary " - "encoding!"); - default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); - } - }(); + }(); + } } } // Reduce num_unique and uniq_data_size from all tiles. @@ -294,7 +303,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) val_idx += ntiles) { if (data_col.is_valid(val_idx)) { auto [found_key, found_value] = - type_dispatcher(data_col.type(), map_find_fn{storage_ref, data_col}, tile, val_idx); + type_dispatcher(data_col.type(), map_find_fn{storage_ref, data_col}, val_idx, tile); // First thread in the tile updates the dict_index if (tile.thread_rank() == 0) { // No need for atomic as this is not going to be modified by any other thread diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 1904b98a87f..2c5c41d642d 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -30,8 +30,10 @@ using key_type = size_type; using mapped_type = size_type; using slot_type = cuco::pair; -auto constexpr map_cg_size = 2; ///< A CUDA Cooperative Group of 2 threads to handle each subset -auto constexpr window_size = 2; ///< Number of concurrent slots handled by each thread +auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 1 thread(s) to handle each subset +auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread +auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size N * + ///< (1/0.7) = 1.43 to target a 70% occupancy factor. auto constexpr KEY_SENTINEL = key_type{-1}; auto constexpr VALUE_SENTINEL = mapped_type{-1}; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index e9efc8f7dde..ad1b39e0fe7 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1300,11 +1300,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.use_dictionary = false; } else { chunk.use_dictionary = true; - // cuCollections suggests using a hash map of size N * (1/0.7) = 1.43 to target a 70% - // occupancy factor. chunk.dict_map_size = static_cast(cuco::make_window_extent( - static_cast(1.43 * chunk.num_values))); + static_cast(occupancy_factor * chunk.num_values))); chunk.dict_map_offset = curr_offset; curr_offset += chunk.dict_map_size; } @@ -1318,14 +1316,13 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, // Create a single bulk storage used by all sub-dictionaries auto map_storage = storage_type{total_map_storage_size}; - // Initialize storage with the given sentinel iff non-zero size - map_storage.initialize(cuco::pair{KEY_SENTINEL, VALUE_SENTINEL}, - cuda::stream_ref{stream.value()}); // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer. device_span const map_storage_data{map_storage.data(), total_map_storage_size}; // Synchronize chunks.host_to_device_async(stream); + // Initialize storage with the given sentinel + map_storage.initialize({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()}); // Populate the hash map for each chunk populate_chunk_hash_maps(map_storage_data, frags, stream); // Synchronize again From 27bee058782a2e28d2b1e160eb31429d3765c3f3 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 16 Aug 2024 07:28:01 +0000 Subject: [PATCH 22/30] Avoid reconstructing hash_map_ref again and again. --- cpp/src/io/parquet/chunk_dict.cu | 257 ++++++++++++++++--------------- 1 file changed, 135 insertions(+), 122 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 72592399575..0a5769d0dd8 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -55,13 +55,21 @@ struct hash_functor { } }; +template struct map_insert_fn { storage_ref_type const& storage_ref; - column_device_view const& col; + EncColumnChunk* const& chunk; + template - __device__ bool operator()(key_type i, cg::thread_block_tile const& tile) + __device__ void operator()(size_type const s_start_value_idx, size_type const end_value_idx) { if constexpr (column_device_view::has_element_accessor()) { + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage reduce_storage; + + auto const col = chunk->col_desc; + column_device_view const& data_col = *col->leaf_column; + using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; using probing_scheme_type = cuco::double_hashing; @@ -76,13 +84,91 @@ struct map_insert_fn { cuco::op::insert_tag> hash_map_ref{cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}, - {col}, - {{col}, {col, 1}}, + {data_col}, + {{data_col}, {data_col, 1}}, {}, storage_ref}; - // Insert into the hash map using the provided thread tile. - return hash_map_ref.insert(tile, cuco::pair{i, i}); + __shared__ size_type total_num_dict_entries; + + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + + // Insert all column chunk elements to the hash map to build the dict. + for (thread_index_type val_idx = s_start_value_idx; + val_idx + block.thread_rank() - block_size < end_value_idx; + val_idx += block_size) { + size_type is_unique = 0; + size_type uniq_elem_size = 0; + + // Each tile inserts a portion of the elements to the hash map. + for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) { + // Compute the index of the value currently being inserted by this tile. + auto const tile_val_idx = + val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads()); + // Check if this index is valid. + auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and + data_col.is_valid(tile_val_idx); + + // Insert tile_val_idx to hash map and count successful insertions. + if (is_valid) { + // Insert the element to the map using the entire tile. + auto const tile_is_unique = + // Insert into the hash map using the provided thread tile. + hash_map_ref.insert(tile, cuco::pair{tile_val_idx, tile_val_idx}); + // tile_offset'th thread in the tile updates its number and size of unique element. + if (tile.thread_rank() == tile_offset) { + is_unique = tile_is_unique; + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (col->physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: { + auto const col_type = data_col.type().id(); + if (col_type == type_id::STRING) { + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(tile_val_idx).size_bytes(); + } else if (col_type == type_id::LIST) { + // Binary is stored as 4 byte length + bytes + return 4 + get_element(data_col, tile_val_idx) + .size_bytes(); + } + CUDF_UNREACHABLE( + "Byte array only supports string and list column types for dictionary " + "encoding!"); + } + case Type::FIXED_LEN_BYTE_ARRAY: + if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } + CUDF_UNREACHABLE( + "Fixed length byte array only supports decimal 128 column types for " + "dictionary " + "encoding!"); + default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); + } + }(); + } + } + } + // Reduce num_unique and uniq_data_size from all tiles. + auto num_unique = block_reduce(reduce_storage).Sum(is_unique); + block.sync(); + auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); + // The first thread in the block atomically updates total num_unique and uniq_data_size + if (block.thread_rank() == 0) { + total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); + total_num_dict_entries += num_unique; + atomicAdd(&chunk->uniq_data_size, uniq_data_size); + } + block.sync(); + + // Check if the num unique values in chunk has already exceeded max dict size and early + // exit + if (total_num_dict_entries > MAX_DICT_SIZE) { return; } + } // for loop } else { CUDF_UNREACHABLE("Unsupported type to insert in map"); } @@ -91,12 +177,16 @@ struct map_insert_fn { struct map_find_fn { storage_ref_type const& storage_ref; - column_device_view const& col; + EncColumnChunk* const& chunk; template - __device__ cuco::pair operator()( - key_type i, cg::thread_block_tile const& tile) + __device__ void operator()(size_type const s_start_value_idx, + size_type const end_value_idx, + size_type const s_ck_start_val_idx) { if constexpr (column_device_view::has_element_accessor()) { + auto const col = chunk->col_desc; + column_device_view const& data_col = *col->leaf_column; + using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; using probing_scheme_type = cuco::double_hashing; @@ -111,20 +201,34 @@ struct map_find_fn { cuco::op::find_tag> hash_map_ref{cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}, - {col}, - {{col}, {col, 1}}, + {data_col}, + {{data_col}, {data_col, 1}}, {}, storage_ref}; // Find the key = i using the provided thread tile. - auto const found_slot = hash_map_ref.find(tile, i); - - // Check if didn't find the previously inserted key. - cudf_assert(found_slot != hash_map_ref.end() && - "Unable to find value in map in dictionary index construction"); - - // Return a pair of the found key and value. - return {found_slot->first, found_slot->second}; + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const ntiles = tile.meta_group_size(); + + for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); + val_idx < end_value_idx; + val_idx += tile.meta_group_size()) { + if (data_col.is_valid(val_idx)) { + // No need for atomic as this is not going to be modified by any other thread + auto const value = [&]() { + auto const found_slot = hash_map_ref.find(tile, val_idx); + + // Check if didn't find the previously inserted key. + cudf_assert(found_slot != hash_map_ref.end() && + "Unable to find value in map in dictionary index construction"); + + // Return a pair of the found key and value. + return found_slot->second; + }(); + // First thread in the tile updates the dict_index + if (tile.thread_rank() == 0) { chunk->dict_index[val_idx - s_ck_start_val_idx] = value; } + } + } } else { CUDF_UNREACHABLE("Unsupported type to find in map"); } @@ -144,12 +248,6 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (not chunk->use_dictionary) { return; } - using block_reduce = cub::BlockReduce; - __shared__ typename block_reduce::TempStorage reduce_storage; - - auto const block = cg::this_thread_block(); - auto const tile = cg::tiled_partition(block); - size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; @@ -160,83 +258,10 @@ CUDF_KERNEL void __launch_bounds__(block_size) column_device_view const& data_col = *col->leaf_column; storage_ref_type const storage_ref{chunk->dict_map_size, map_storage.data() + chunk->dict_map_offset}; - - __shared__ size_type total_num_dict_entries; - - // Insert all column chunk elements to the hash map to build the dict. - for (thread_index_type val_idx = s_start_value_idx; - val_idx + block.thread_rank() - block_size < end_value_idx; - val_idx += block_size) { - size_type is_unique = 0; - size_type uniq_elem_size = 0; - - // Each tile inserts a portion of the elements to the hash map. - for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) { - // Compute the index of the value currently being inserted by this tile. - auto const tile_val_idx = - val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads()); - - // Check if this index is valid. - auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and - data_col.is_valid(tile_val_idx); - - // Insert tile_val_idx to hash map and count successful insertions. - if (is_valid) { - // Insert the element to the map using the entire tile. - auto const tile_is_unique = type_dispatcher( - data_col.type(), map_insert_fn{storage_ref, data_col}, tile_val_idx, tile); - - // tile_offset'th thread in the tile updates its number and size of unique element. - if (tile.thread_rank() == tile_offset) { - is_unique = tile_is_unique; - uniq_elem_size = [&]() -> size_type { - if (not is_unique) { return 0; } - switch (col->physical_type) { - case Type::INT32: return 4; - case Type::INT64: return 8; - case Type::INT96: return 12; - case Type::FLOAT: return 4; - case Type::DOUBLE: return 8; - case Type::BYTE_ARRAY: { - auto const col_type = data_col.type().id(); - if (col_type == type_id::STRING) { - // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(tile_val_idx).size_bytes(); - } else if (col_type == type_id::LIST) { - // Binary is stored as 4 byte length + bytes - return 4 + get_element(data_col, tile_val_idx) - .size_bytes(); - } - CUDF_UNREACHABLE( - "Byte array only supports string and list column types for dictionary " - "encoding!"); - } - case Type::FIXED_LEN_BYTE_ARRAY: - if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } - CUDF_UNREACHABLE( - "Fixed length byte array only supports decimal 128 column types for dictionary " - "encoding!"); - default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); - } - }(); - } - } - } - // Reduce num_unique and uniq_data_size from all tiles. - auto num_unique = block_reduce(reduce_storage).Sum(is_unique); - block.sync(); - auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); - // The first thread in the block atomically updates total num_unique and uniq_data_size - if (block.thread_rank() == 0) { - total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); - total_num_dict_entries += num_unique; - atomicAdd(&chunk->uniq_data_size, uniq_data_size); - } - block.sync(); - - // Check if the num unique values in chunk has already exceeded max dict size and early exit - if (total_num_dict_entries > MAX_DICT_SIZE) { return; } - } // for loop + type_dispatcher(data_col.type(), + map_insert_fn{storage_ref, chunk}, + s_start_value_idx, + end_value_idx); } template @@ -260,8 +285,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto loc = counter.fetch_add(1, memory_order_relaxed); cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); chunk.dict_data[loc] = key; - // If sorting dict page ever becomes a hard requirement, enable the following statement and - // add a dict sorting step before storing into the slot's second field. + // If sorting dict page ever becomes a hard requirement, enable the following statement + // and add a dict sorting step before storing into the slot's second field. // chunk.dict_data_idx[loc] = idx; slot.second = loc; } @@ -278,17 +303,13 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto const block_x = blockIdx.x; auto const frag = frags[col_idx][block_x]; auto chunk = frag.chunk; - auto const col = chunk->col_desc; if (not chunk->use_dictionary) { return; } - auto const block = cg::this_thread_block(); - auto const tile = cg::tiled_partition(block); - auto const ntiles = tile.meta_group_size(); - size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; + auto const col = chunk->col_desc; // Find the bounds of values in leaf column to be searched in the map for current chunk auto const s_start_value_idx = row_to_value_idx(start_row, *col); auto const s_ck_start_val_idx = row_to_value_idx(chunk->start_row, *col); @@ -298,19 +319,11 @@ CUDF_KERNEL void __launch_bounds__(block_size) storage_ref_type const storage_ref{chunk->dict_map_size, map_storage.data() + chunk->dict_map_offset}; - for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); - val_idx < end_value_idx; - val_idx += ntiles) { - if (data_col.is_valid(val_idx)) { - auto [found_key, found_value] = - type_dispatcher(data_col.type(), map_find_fn{storage_ref, data_col}, val_idx, tile); - // First thread in the tile updates the dict_index - if (tile.thread_rank() == 0) { - // No need for atomic as this is not going to be modified by any other thread - chunk->dict_index[val_idx - s_ck_start_val_idx] = found_value; - } - } - } + type_dispatcher(data_col.type(), + map_find_fn{storage_ref, chunk}, + s_start_value_idx, + end_value_idx, + s_ck_start_val_idx); } void populate_chunk_hash_maps(device_span const map_storage, From fd2cb7f768a86ddee36b3f397ab0794d685e085c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 16 Aug 2024 23:11:30 +0000 Subject: [PATCH 23/30] Add allocator to map's aow_storage --- cpp/src/io/parquet/parquet_gpu.cuh | 6 +++++- cpp/src/io/parquet/writer_impl.cu | 6 ++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 2c5c41d642d..35d9c7846aa 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -18,6 +18,7 @@ #include "parquet_gpu.hpp" +#include #include #include @@ -39,7 +40,10 @@ auto constexpr KEY_SENTINEL = key_type{-1}; auto constexpr VALUE_SENTINEL = mapped_type{-1}; auto constexpr SCOPE = cuda::thread_scope_block; -using storage_type = cuco::aow_storage; +using storage_type = cuco::aow_storage, + cudf::detail::cuco_allocator>; using storage_ref_type = typename storage_type::ref_type; using window_type = typename storage_type::window_type; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ad1b39e0fe7..14504a4ef1c 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1315,14 +1315,16 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; } // Create a single bulk storage used by all sub-dictionaries - auto map_storage = storage_type{total_map_storage_size}; + auto map_storage = storage_type{ + total_map_storage_size, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}; // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer. device_span const map_storage_data{map_storage.data(), total_map_storage_size}; // Synchronize chunks.host_to_device_async(stream); // Initialize storage with the given sentinel - map_storage.initialize({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()}); + map_storage.initialize_async({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()}); // Populate the hash map for each chunk populate_chunk_hash_maps(map_storage_data, frags, stream); // Synchronize again From 39c2a3514d448e11086f184453c378ed2b827ccd Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Sat, 17 Aug 2024 00:55:47 +0000 Subject: [PATCH 24/30] Perf optimization --- cpp/src/io/parquet/chunk_dict.cu | 168 +++++++++++++---------------- cpp/src/io/parquet/parquet_gpu.cuh | 11 +- 2 files changed, 82 insertions(+), 97 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 0a5769d0dd8..653b6ddef6b 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -22,14 +22,11 @@ #include -#include #include #include namespace cudf::io::parquet::detail { -namespace cg = cooperative_groups; - namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -40,7 +37,8 @@ struct equality_functor { __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const { // We don't call this for nulls so this is fine. - auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; + auto constexpr equal = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator{}; return equal(col.element(lhs_idx), col.element(rhs_idx)); } }; @@ -48,7 +46,7 @@ struct equality_functor { template struct hash_functor { column_device_view const& col; - uint32_t seed = 0; + uint32_t const seed = 0; __device__ auto operator()(key_type idx) const { return cudf::hashing::detail::MurmurHash3_x86_32{seed}(col.element(idx)); @@ -69,10 +67,11 @@ struct map_insert_fn { auto const col = chunk->col_desc; column_device_view const& data_col = *col->leaf_column; + __shared__ size_type total_num_dict_entries; using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; - using probing_scheme_type = cuco::double_hashing; + using probing_scheme_type = cuco::linear_probing; // Make a view of the hash map. cuco::static_map_ref + storage_ref_type> hash_map_ref{cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}, {data_col}, - {{data_col}, {data_col, 1}}, + hash_fn_type{data_col}, {}, storage_ref}; - __shared__ size_type total_num_dict_entries; - - auto const block = cg::this_thread_block(); - auto const tile = cg::tiled_partition(block); + // Create a map ref with `cuco::insert` operator + auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); + auto const t = threadIdx.x; - // Insert all column chunk elements to the hash map to build the dict. - for (thread_index_type val_idx = s_start_value_idx; - val_idx + block.thread_rank() - block_size < end_value_idx; + // Note: Adjust the following loop to use `cg::tile` if needed in the future. + for (thread_index_type val_idx = s_start_value_idx + t; val_idx - block_size < end_value_idx; val_idx += block_size) { size_type is_unique = 0; size_type uniq_elem_size = 0; - // Each tile inserts a portion of the elements to the hash map. - for (auto tile_offset = 0; tile_offset < tile.num_threads(); tile_offset++) { - // Compute the index of the value currently being inserted by this tile. - auto const tile_val_idx = - val_idx + tile_offset + (tile.meta_group_rank() * tile.num_threads()); - // Check if this index is valid. - auto const is_valid = tile_val_idx < end_value_idx and tile_val_idx < data_col.size() and - data_col.is_valid(tile_val_idx); - - // Insert tile_val_idx to hash map and count successful insertions. - if (is_valid) { - // Insert the element to the map using the entire tile. - auto const tile_is_unique = - // Insert into the hash map using the provided thread tile. - hash_map_ref.insert(tile, cuco::pair{tile_val_idx, tile_val_idx}); - // tile_offset'th thread in the tile updates its number and size of unique element. - if (tile.thread_rank() == tile_offset) { - is_unique = tile_is_unique; - uniq_elem_size = [&]() -> size_type { - if (not is_unique) { return 0; } - switch (col->physical_type) { - case Type::INT32: return 4; - case Type::INT64: return 8; - case Type::INT96: return 12; - case Type::FLOAT: return 4; - case Type::DOUBLE: return 8; - case Type::BYTE_ARRAY: { - auto const col_type = data_col.type().id(); - if (col_type == type_id::STRING) { - // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(tile_val_idx).size_bytes(); - } else if (col_type == type_id::LIST) { - // Binary is stored as 4 byte length + bytes - return 4 + get_element(data_col, tile_val_idx) - .size_bytes(); - } - CUDF_UNREACHABLE( - "Byte array only supports string and list column types for dictionary " - "encoding!"); - } - case Type::FIXED_LEN_BYTE_ARRAY: - if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } - CUDF_UNREACHABLE( - "Fixed length byte array only supports decimal 128 column types for " - "dictionary " - "encoding!"); - default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); + // Check if this index is valid. + auto const is_valid = + val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); + + // Insert tile_val_idx to hash map and count successful insertions. + if (is_valid) { + // Insert the keys using a single thread for best performance for now. + is_unique = map_insert_ref.insert(cuco::pair{val_idx, val_idx}); + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (col->physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: { + auto const col_type = data_col.type().id(); + if (col_type == type_id::STRING) { + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(val_idx).size_bytes(); + } else if (col_type == type_id::LIST) { + // Binary is stored as 4 byte length + bytes + return 4 + + get_element(data_col, val_idx).size_bytes(); } - }(); + CUDF_UNREACHABLE( + "Byte array only supports string and list column types for dictionary " + "encoding!"); + } + case Type::FIXED_LEN_BYTE_ARRAY: + if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } + CUDF_UNREACHABLE( + "Fixed length byte array only supports decimal 128 column types for dictionary " + "encoding!"); + default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); } - } + }(); } // Reduce num_unique and uniq_data_size from all tiles. auto num_unique = block_reduce(reduce_storage).Sum(is_unique); - block.sync(); + __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); // The first thread in the block atomically updates total num_unique and uniq_data_size - if (block.thread_rank() == 0) { + if (t == 0) { total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; atomicAdd(&chunk->uniq_data_size, uniq_data_size); } - block.sync(); + __syncthreads(); - // Check if the num unique values in chunk has already exceeded max dict size and early - // exit + // Check if the num unique values in chunk has already exceeded max dict size and early exit if (total_num_dict_entries > MAX_DICT_SIZE) { return; } } // for loop } else { @@ -175,6 +157,7 @@ struct map_insert_fn { } }; +template struct map_find_fn { storage_ref_type const& storage_ref; EncColumnChunk* const& chunk; @@ -189,7 +172,7 @@ struct map_find_fn { using equality_fn_type = equality_functor; using hash_fn_type = hash_functor; - using probing_scheme_type = cuco::double_hashing; + using probing_scheme_type = cuco::linear_probing; // Make a view of the hash map. cuco::static_map_ref + storage_ref_type> hash_map_ref{cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}, {data_col}, - {{data_col}, {data_col, 1}}, + hash_fn_type{data_col}, {}, storage_ref}; - // Find the key = i using the provided thread tile. - auto const tile = cg::tiled_partition(cg::this_thread_block()); - auto const ntiles = tile.meta_group_size(); + // Create a map ref with `cuco::find` operator + auto const map_find_ref = hash_map_ref.with_operators(cuco::find); + auto const t = threadIdx.x; - for (thread_index_type val_idx = s_start_value_idx + tile.meta_group_rank(); - val_idx < end_value_idx; - val_idx += tile.meta_group_size()) { + // Note: Adjust the following loop to use `cg::tiles` if needed in the future. + for (thread_index_type val_idx = s_start_value_idx + t; val_idx < end_value_idx; + val_idx += block_size) { + // Find the key using a single thread for best performance for now. if (data_col.is_valid(val_idx)) { - // No need for atomic as this is not going to be modified by any other thread - auto const value = [&]() { - auto const found_slot = hash_map_ref.find(tile, val_idx); + // No need for atomic as this is not going to be modified by any other thread. + chunk->dict_index[val_idx - s_ck_start_val_idx] = [&]() { + auto const found_slot = map_find_ref.find(val_idx); - // Check if didn't find the previously inserted key. - cudf_assert(found_slot != hash_map_ref.end() && + // Fail if we didn't find the previously inserted key. + cudf_assert(found_slot != map_find_ref.end() && "Unable to find value in map in dictionary index construction"); - // Return a pair of the found key and value. + // Return the found value. return found_slot->second; }(); - // First thread in the tile updates the dict_index - if (tile.thread_rank() == 0) { chunk->dict_index[val_idx - s_ck_start_val_idx] = value; } } } } else { @@ -272,17 +253,18 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto& chunk = chunks[blockIdx.x]; if (not chunk.use_dictionary) { return; } + auto t = threadIdx.x; __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; - if (threadIdx.x == 0) { new (&counter) cuda::atomic{0}; } + if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); - for (size_type idx = threadIdx.x; idx < chunk.dict_map_size; idx += block_size) { - auto* window = map_storage.data() + chunk.dict_map_offset + idx; + for (; t < chunk.dict_map_size; t += block_size) { + auto* window = map_storage.data() + chunk.dict_map_offset + t; for (auto& slot : *window) { auto const key = slot.first; if (key != KEY_SENTINEL) { - auto loc = counter.fetch_add(1, memory_order_relaxed); + auto const loc = counter.fetch_add(1, memory_order_relaxed); cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); chunk.dict_data[loc] = key; // If sorting dict page ever becomes a hard requirement, enable the following statement @@ -320,7 +302,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) map_storage.data() + chunk->dict_map_offset}; type_dispatcher(data_col.type(), - map_find_fn{storage_ref, chunk}, + map_find_fn{storage_ref, chunk}, s_start_value_idx, end_value_idx, s_ck_start_val_idx); diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 35d9c7846aa..7c09764da2d 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -31,10 +31,13 @@ using key_type = size_type; using mapped_type = size_type; using slot_type = cuco::pair; -auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 1 thread(s) to handle each subset -auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread -auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size N * - ///< (1/0.7) = 1.43 to target a 70% occupancy factor. +auto constexpr map_cg_size = + 1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset. + ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this. +auto constexpr window_size = + 1; ///< Number of concurrent slots (set for best performance) handled by each thread. +auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size + ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor. auto constexpr KEY_SENTINEL = key_type{-1}; auto constexpr VALUE_SENTINEL = mapped_type{-1}; From 1b6454072440bdf6adc310fa59fe4b3a8c8554d3 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Sat, 17 Aug 2024 01:20:03 +0000 Subject: [PATCH 25/30] Update comments --- cpp/src/io/parquet/chunk_dict.cu | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 653b6ddef6b..eb51714b8de 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -69,8 +69,10 @@ struct map_insert_fn { column_device_view const& data_col = *col->leaf_column; __shared__ size_type total_num_dict_entries; - using equality_fn_type = equality_functor; - using hash_fn_type = hash_functor; + using equality_fn_type = equality_functor; + using hash_fn_type = hash_functor; + // Choosing `linear_probing` over `double_hashing` for slighhhtly better performance seen in + // benchmarks. using probing_scheme_type = cuco::linear_probing; // Make a view of the hash map. @@ -170,8 +172,10 @@ struct map_find_fn { auto const col = chunk->col_desc; column_device_view const& data_col = *col->leaf_column; - using equality_fn_type = equality_functor; - using hash_fn_type = hash_functor; + using equality_fn_type = equality_functor; + using hash_fn_type = hash_functor; + // Choosing `linear_probing` over `double_hashing` for slighhhtly better performance seen in + // benchmarks. using probing_scheme_type = cuco::linear_probing; // Make a view of the hash map. From a0f5aaba9b6e153bf0a0d3dffe5b1fc863456922 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 19 Aug 2024 21:09:17 +0000 Subject: [PATCH 26/30] Minor improvement --- cpp/src/io/parquet/chunk_dict.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index eb51714b8de..98c1d5ae206 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -263,8 +263,10 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); + // Iterate over all windows in the map. for (; t < chunk.dict_map_size; t += block_size) { - auto* window = map_storage.data() + chunk.dict_map_offset + t; + auto window = map_storage.data() + chunk.dict_map_offset + t; + // Collect all slots from each window. for (auto& slot : *window) { auto const key = slot.first; if (key != KEY_SENTINEL) { From 9a16e305ba0e579ddc70962e537312d76e2faea0 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 19 Aug 2024 18:42:49 -0700 Subject: [PATCH 27/30] Minor updates --- cpp/src/io/parquet/writer_impl.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 14504a4ef1c..fb4cad0713f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1286,7 +1286,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Variable to keep track of the current offset - uint32_t curr_offset = 0; + size_t total_map_storage_size = 0; // Populate dict offsets and sizes for each chunk that need to build a dictionary. std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) { auto const& chunk_col_desc = col_desc[chunk.col_desc_id]; @@ -1303,13 +1303,11 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_map_size = static_cast(cuco::make_window_extent( static_cast(occupancy_factor * chunk.num_values))); - chunk.dict_map_offset = curr_offset; - curr_offset += chunk.dict_map_size; + chunk.dict_map_offset = total_map_storage_size; + total_map_storage_size += chunk.dict_map_size; } }); - // Compute total map storage - auto const total_map_storage_size = static_cast(curr_offset); // No chunk needs to create a dictionary, exit early if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; } From 24d16eb256b1251035890ba4248a66c0ffc9d8c5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 20 Aug 2024 01:46:51 +0000 Subject: [PATCH 28/30] Apply clang-format --- cpp/src/io/parquet/writer_impl.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index fb4cad0713f..9df53bc9b7e 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1285,7 +1285,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, return std::pair(std::move(dict_data), std::move(dict_index)); } - // Variable to keep track of the current offset + // Variable to keep track of the current total map storage size size_t total_map_storage_size = 0; // Populate dict offsets and sizes for each chunk that need to build a dictionary. std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) { @@ -1308,7 +1308,6 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } }); - // No chunk needs to create a dictionary, exit early if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; } From f18c1f77a064a2ceff3e292ad4dcde05286ec33c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 21 Aug 2024 23:15:32 +0000 Subject: [PATCH 29/30] Address reviewer comments --- cpp/src/io/parquet/chunk_dict.cu | 47 ++++++++++++++------------------ 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 98c1d5ae206..7a6d470a027 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -34,7 +34,7 @@ constexpr int DEFAULT_BLOCK_SIZE = 256; template struct equality_functor { column_device_view const& col; - __device__ bool operator()(key_type const lhs_idx, key_type const rhs_idx) const + __device__ bool operator()(key_type lhs_idx, key_type rhs_idx) const { // We don't call this for nulls so this is fine. auto constexpr equal = @@ -76,23 +76,21 @@ struct map_insert_fn { using probing_scheme_type = cuco::linear_probing; // Make a view of the hash map. - cuco::static_map_ref - hash_map_ref{cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}, - {data_col}, - hash_fn_type{data_col}, - {}, - storage_ref}; + auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn_type{data_col}, + probing_scheme_type{hash_fn_type{data_col}}, + cuco::thread_scope_block, + storage_ref}; // Create a map ref with `cuco::insert` operator auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); auto const t = threadIdx.x; + // Create atomic refs to the current chunk's num_dict_entries and uniq_data_size + cuda::atomic_ref const chunk_num_dict_entries{chunk->num_dict_entries}; + cuda::atomic_ref const chunk_uniq_data_size{chunk->uniq_data_size}; + // Note: Adjust the following loop to use `cg::tile` if needed in the future. for (thread_index_type val_idx = s_start_value_idx + t; val_idx - block_size < end_value_idx; val_idx += block_size) { @@ -144,9 +142,10 @@ struct map_insert_fn { auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); // The first thread in the block atomically updates total num_unique and uniq_data_size if (t == 0) { - total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); + total_num_dict_entries = + chunk_num_dict_entries.fetch_add(num_unique, cuda::std::memory_order_relaxed); total_num_dict_entries += num_unique; - atomicAdd(&chunk->uniq_data_size, uniq_data_size); + chunk_uniq_data_size.fetch_add(uniq_data_size, cuda::std::memory_order_relaxed); } __syncthreads(); @@ -179,18 +178,12 @@ struct map_find_fn { using probing_scheme_type = cuco::linear_probing; // Make a view of the hash map. - cuco::static_map_ref - hash_map_ref{cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}, - {data_col}, - hash_fn_type{data_col}, - {}, - storage_ref}; + auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn_type{data_col}, + probing_scheme_type{hash_fn_type{data_col}}, + cuco::thread_scope_block, + storage_ref}; // Create a map ref with `cuco::find` operator auto const map_find_ref = hash_map_ref.with_operators(cuco::find); From 8f94acab58dd65198237f2e228e5b13cde3ab0fb Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 29 Aug 2024 14:08:07 -0700 Subject: [PATCH 30/30] Apply suggestion Co-authored-by: Yunsong Wang --- cpp/src/io/parquet/chunk_dict.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 7a6d470a027..17ccb73c0a8 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -92,7 +92,7 @@ struct map_insert_fn { cuda::atomic_ref const chunk_uniq_data_size{chunk->uniq_data_size}; // Note: Adjust the following loop to use `cg::tile` if needed in the future. - for (thread_index_type val_idx = s_start_value_idx + t; val_idx - block_size < end_value_idx; + for (thread_index_type val_idx = s_start_value_idx + t; val_idx - t < end_value_idx; val_idx += block_size) { size_type is_unique = 0; size_type uniq_elem_size = 0;